In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
import math
from random import shuffle

In [8]:
from mosestokenizer import *
from pprint import pprint

In [9]:
SOS_token = 0
EOS_token = 1
UNK = 2

In [10]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

In [12]:
# 파일을 열어서 언어별로 분리 후 train, valid, test 데이터셋 만들기

NUM_TRAIN = 10000
NUM_VALID = 1000
NUM_TEST = 1000

with open('eng-fra.txt','r') as f:
    sentence_pairs = []
    for i in range(20000): #데이터셋 초반에 단순한 문장들 제거
        f.readline()
    for i in range(NUM_TRAIN+NUM_VALID+NUM_TEST): #TRAIN, VALID, TEST 셋에 들어갈 문장들 Extraction
        line = f.readline().strip('\n')
        sentence_pairs.append(line)
    shuffle(sentence_pairs) #Random shuffle
    f.close()

train_pairs = sentence_pairs[:NUM_TRAIN]
valid_pairs = sentence_pairs[NUM_TRAIN:NUM_TRAIN+NUM_VALID]
test_pairs = sentence_pairs[NUM_TRAIN+NUM_VALID:NUM_TRAIN+NUM_VALID+NUM_TEST]

print("The number of training set is {}.".format(len(train_pairs)))
print("The number of valid set is {}.".format(len(valid_pairs)))
print("The number of test set is {}.".format(len(test_pairs)))

The number of training set is 10000.
The number of valid set is 1000.
The number of test set is 1000.


In [16]:
# 만들어진 데이터셋을 언어별로 나눈 후 소문자화 및 tokenizing

tokenize_en = MosesTokenizer('en')
tokenize_fr = MosesTokenizer('fr')

def preprocessor(pairs):
    english_sentences = []
    french_sentences = []
    for line in pairs:
        pair = line.lower().split('\t')
        english_sentences.append(tokenize_en(pair[0]))
        french_sentences.append(tokenize_fr(pair[1]))
    return english_sentences, french_sentences

train_en, train_fr = preprocessor(train_pairs)
valid_en, valid_fr = preprocessor(valid_pairs)
test_en, test_fr = preprocessor(test_pairs)

pprint(train_en)
pprint(valid_en)


[['is', 'that', 'all', 'of', 'them', '?'],
 ['we', 'better', 'be', 'going', '.'],
 ['i', 'knew', 'we', '&apos;d', 'find', 'you', '.'],
 ['how', 'is', 'this', 'possible', '?'],
 ['it', 'is', 'to', 'be', 'expected', '.'],
 ['are', 'you', 'almost', 'ready', '?'],
 ['times', 'have', 'changed', '.'],
 ['these', 'shoes', 'are', 'hers', '.'],
 ['i', '&apos;m', 'not', 'presentable', '.'],
 ['i', 'liked', 'your', 'friends', '.'],
 ['that', 'was', 'years', 'ago', '.'],
 ['you', 'party', 'too', 'much', '.'],
 ['clean', 'up', 'the', 'kitchen', '.'],
 ['what', 'would', 'he', 'know', '?'],
 ['just', 'follow', 'my', 'lead', '.'],
 ['what', 'part', 'is', 'wrong', '?'],
 ['bring', 'your', 'student', 'id', '!'],
 ['tom', 'knows', 'who', 'we', 'are', '.'],
 ['that', 'comes', 'in', 'handy', '.'],
 ['i', '&apos;d', 'say', 'you', 'did', 'well', '.'],
 ['don', '&apos;t', 'try', 'to', 'fool', 'me', '.'],
 ['she', 'failed', 'to', 'appear', '.'],
 ['we', '&apos;re', 'truly', 'worried', '.'],
 ['you', '&apos;ve'

 ['someday', ',', 'we', '&apos;ll', 'know', '.'],
 ['love', 'is', 'blinding', 'you', '.'],
 ['we', 'had', 'fun', 'with', 'it', '.'],
 ['are', 'you', 'ready', 'to', 'go', '?'],
 ['he', 'looked', 'surprised', '.'],
 ['i', 'applied', 'for', 'a', 'visa', '.'],
 ['we', '&apos;re', 'going', 'hunting', '.'],
 ['stand', 'on', 'the', 'scales', '.'],
 ['tom', 'has', 'only', 'one', 'leg', '.'],
 ['it', 'was', 'kind', 'of', 'weird', '.'],
 ['i', 'want', 'him', 'to', 'leave', '.'],
 ['i', 'want', 'to', 'start', 'over', '.'],
 ['no', 'one', 'will', 'survive', '.'],
 ['i', '&apos;m', 'not', 'at', 'all', 'busy', '.'],
 ['he', 'lives', 'in', 'nagasaki', '.'],
 ['paper', 'burns', 'easily', '.'],
 ['how', 'can', 'you', 'not', 'know', '?'],
 ['do', 'you', 'take', 'plastic', '?'],
 ['that', 'was', 'very', 'good', '.'],
 ['can', 'i', 'take', 'a', 'day', 'off', '?'],
 ['she', 'fixed', 'us', 'a', 'snack', '.'],
 ['you', '&apos;re', 'very', 'talented', '.'],
 ['this', 'one', 'is', 'bigger', '.'],
 ['this', 'is

 ['can', 'i', 'get', 'you', 'a', 'drink', '?'],
 ['i', 'am', 'a', 'stranger', 'here', '.'],
 ['she', 'cannot', 'stop', 'us', '.'],
 ['are', 'you', 'still', 'at', 'work', '?'],
 ['i', 'want', 'a', 'new', 'kitchen', '.'],
 ['so', 'what', 'does', 'it', 'mean', '?'],
 ['i', '&apos;m', 'not', 'good', 'at', 'it', '.'],
 ['you', 'won', '&apos;t', 'be', 'fired', '.'],
 ['is', 'everything', 'okay', '?'],
 ['it', '&apos;s', 'not', 'always', 'easy', '.'],
 ['i', 'sprang', 'out', 'of', 'bed', '.'],
 ['it', 'was', 'a', 'lot', 'of', 'fun', '.'],
 ['is', 'anybody', 'listening', '?'],
 ['i', '&apos;m', 'out', 'of', 'practice', '.'],
 ['you', '&apos;re', 'the', 'teacher', '.'],
 ['i', '&apos;m', 'a', 'beginner', ',', 'too', '.'],
 ['did', 'you', 'see', 'that', 'car', '?'],
 ['he', '&apos;s', 'in', 'his', 'fifties', '.'],
 ['i', 'know', 'all', 'about', 'it', '.'],
 ['i', 'think', 'you', '&apos;re', 'cute', '.'],
 ['don', '&apos;t', 'play', 'with', 'fire', '.'],
 ['tom', 'has', 'a', 'big', 'nose', '.'],


 ['why', 'don', '&apos;t', 'you', 'start', '?'],
 ['why', 'are', 'you', 'bleeding', '?'],
 ['he', 'knows', 'the', 'secret', '.'],
 ['he', 'is', 'in', 'the', 'kitchen', '.'],
 ['stay', 'a', 'little', 'longer', '.'],
 ['he', 'devoured', 'his', 'meal', '.'],
 ['this', 'is', 'my', 'father', '&apos;s', '.'],
 ['i', 'knew', 'it', 'was', 'right', '.'],
 ['who', 'did', 'you', 'talk', 'to', '?'],
 ['what', 'did', 'the', 'boy', 'say', '?'],
 ['write', 'down', 'each', 'word', '.'],
 ['i', 'have', 'bottled', 'water', '.'],
 ['who', 'gave', 'the', 'command', '?'],
 ['i', '&apos;d', 'like', 'to', 'do', 'more', '.'],
 ['maybe', 'tom', 'is', 'stupid', '.'],
 ['please', 'let', 'us', 'know', '.'],
 ['speak', 'slowly', ',', 'please', '.'],
 ['ignore', 'tom', '&apos;s', 'request', '.'],
 ['he', 'removed', 'his', 'shirt', '.'],
 ['the', 'dog', 'is', 'bleeding', '.'],
 ['tom', 'hasn', '&apos;t', 'met', 'mary', '.'],
 ['i', 'don', '&apos;t', 'have', 'a', 'badge', '.'],
 ['maybe', 'i', '&apos;ll', 'call', 'yo

 ['who', 'is', 'your', 'friend', '?'],
 ['the', 'shotgun', 'went', 'off', '.'],
 ['you', 'see', 'what', 'i', 'mean', '?'],
 ['she', 'went', 'on', 'working', '.'],
 ['i', '&apos;m', 'getting', 'hungry', '.'],
 ['what', 'might', 'that', 'be', '?'],
 ['i', 'know', 'your', 'roommate', '.'],
 ['she', 'did', 'a', 'good', 'job', '.'],
 ['i', 'was', 'younger', 'then', '.'],
 ['he', 'has', 'to', 'study', 'hard', '.'],
 ['now', '&apos;s', 'not', 'the', 'time', '.'],
 ['no', 'one', 'will', 'miss', 'me', '.'],
 ['the', 'class', 'went', 'wild', '.'],
 ['we', '&apos;re', 'just', 'nervous', '.'],
 ['may', 'i', 'use', 'your', 'phone', '?'],
 ['i', '&apos;m', 'shorter', 'than', 'him', '.'],
 ['how', 'did', 'that', 'happen', '?'],
 ['we', 'all', 'want', 'you', 'back', '.'],
 ['i', '&apos;m', 'better', 'than', 'him', '.'],
 ['give', 'it', 'back', 'to', 'her', '.'],
 ['i', '&apos;m', 'good', 'at', 'skiing', '.'],
 ['tom', 'is', 'getting', 'bored', '.'],
 ['i', 'did', 'see', 'something', '.'],
 ['who', 'is

In [17]:
# Vocabulary를 만들고 각 단어에 indexing

class vocab:
    
    def __init__(self, sets, size):
        self.sets = sets
        self.size = size - 3
        self.word2count = {}
        self.word2index = {"SOS": 0, "EOS": 1, "UNK": 2}
        self.n_words = 3  # SOS, EOS, UNK 포함
    
    def count_words(self): # 데이터셋의 포함된 단어의 갯수를 counting하여 dictionary로 return
        for line in self.sets:
            for token in line:
                if token not in self.word2count:
                    self.word2count[token] = 1
                else:
                    self.word2count[token] += 1
        return sorted(self.word2count.items(), key=(lambda x:x[1]), reverse=True)
                    
    def build_vocab(self): # Counting된 dictory에서 최빈 단어 순서로 vocabulary 생성
        vocab_list = self.count_words()[:self.size]
        for word in vocab_list:
            self.word2index[word[0]] = self.n_words
            self.n_words += 1
        return self.word2index
    
    def index2word(self): # Decoding에서 생성된 결과값을 단어로 바꾸어 주는 dictionary
         return {value:key for key, value in self.word2index.items()}

src_vocab_size = 1000
tgt_vocab_size = 1000

vocab_cls_en = vocab(train_en, src_vocab_size)
vocab_cls_fr = vocab(train_fr, tgt_vocab_size)

vocab_en = vocab_cls_en.build_vocab()
vocab_fr = vocab_cls_fr.build_vocab()

print("We build English vocabulary with trimming to {} words from {} words in training set.".format(src_vocab_size, len(vocab_cls_en.count_words())))
print("We build French vocabulary with trimming to {} words from {} words in training set.".format(tgt_vocab_size, len(vocab_cls_fr.count_words())))

We build English vocabulary with trimming to 1000 words from 2959 words in training set.
We build French vocabulary with trimming to 1000 words from 4822 words in training set.


In [18]:
# Data들의 token을 indexing하고 pair를 맞춤

def sentence2index(sent, vocab): #string으로 된 문장을 tensor로 표현, EOS_token을 붙임
    indexing_sent = [SOS_token]
    for token in sent:
        if token in vocab:
            indexing_sent.append(vocab[token])
        else:
            indexing_sent.append(UNK)
    indexing_sent.append(EOS_token)
    return torch.tensor(indexing_sent, dtype=torch.long).view(-1,1).to(device)

def prepare_indexed_sent(data_en, data_fr, vocab_en, vocab_fr): #두 언어쌍을 하나의 pair로 묶는 함수
    datasets = []
    for en_sent, fr_sent in zip(data_en, data_fr):
        en_tensor= sentence2index(en_sent, vocab_en)
        fr_tensor= sentence2index(fr_sent, vocab_fr)
        datasets.append((en_tensor, fr_tensor))
    return datasets

indexing_train_sets = prepare_indexed_sent(train_en, train_fr, vocab_en, vocab_fr)
indexing_valid_sets = prepare_indexed_sent(valid_en, valid_fr, vocab_en, vocab_fr)
indexing_test_sets = prepare_indexed_sent(test_en, test_fr, vocab_en, vocab_fr)

In [21]:
# Encoder와 Decoder building

class EncoderRNN(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim):
        super(EncoderRNN, self).__init__()
        self.hidden_dim = hidden_dim #hidden state 수 정의

        self.embedding = nn.Embedding(input_dim, emb_dim) #단어를 word vector로 임베딩하는 함수 선언
        self.gru = nn.GRU(emb_dim, hidden_dim) #활용할 network 선언, 네트워크에 따라 다양한 변수 활용 가능 ex) layer 수, attention

    def forward(self, src):
        embedded = self.embedding(src)
        output, hidden = self.gru(embedded)
        return output, hidden
    
class DecoderRNN(nn.Module):
    def __init__(self, hidden_dim, emb_dim, output_dim):
        super(DecoderRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.gru = nn.GRU(emb_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden): #input = previous token
        output = self.embedding(input).unsqueeze(0) #Batch를 활용하지 않아 input type을 맞추기 위해서 dimension 추가
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden
    

In [22]:
# Model building

class Model(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        
        self.encoder = encoder.to(device)
        self.decoder = decoder.to(device)
                
    def forward(self, src, trg):
                
        output, hidden = self.encoder(src)
        
        outputs = torch.zeros(trg.size(0), self.decoder.output_dim)
        input = trg[SOS_token,:]
        
        for t in range(1, trg.size(0)):
            output, hidden = self.decoder(input, hidden)
            outputs[t] = output
            input = output.argmax(1)
        
        return outputs.to(device)

In [23]:
# Model 생성
INPUT_DIM = src_vocab_size
OUTPUT_DIM = tgt_vocab_size
HIDDEN_DIM = 32
EMB_DIM = 128 #활용할 변수 선언

encoder = EncoderRNN(INPUT_DIM, EMB_DIM, HIDDEN_DIM)
decoder = DecoderRNN(HIDDEN_DIM, EMB_DIM, OUTPUT_DIM)

model = Model(encoder, decoder)

def init_weights(m): #hidden state 초기화
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

def count_parameters(model): #전체 변수의 수 counting
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

print(model)
print('The model has {} trainable parameters'.format(count_parameters(model)))

Model(
  (encoder): EncoderRNN(
    (embedding): Embedding(1000, 128)
    (gru): GRU(128, 32)
  )
  (decoder): DecoderRNN(
    (embedding): Embedding(1000, 128)
    (gru): GRU(128, 32)
    (out): Linear(in_features=32, out_features=1000, bias=True)
    (softmax): LogSoftmax()
  )
)
The model has 320104 trainable parameters


In [24]:
# Trainer와 Evaluator 생성

def train(model, indexing_train_sets, optimizer, criterion, clip): #모델을 학습하는 데 활용
    
    model.train()
    epoch_loss = 0
    
    for (src, trg) in indexing_train_sets:
        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        loss = criterion(output, trg)
        
        with torch.autograd.set_detect_anomaly(True):
            loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(indexing_train_sets)

def evaluate(model, indexing_valid_sets, criterion): #모델을 검증하는 데 활용
    
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
        for (src, trg) in indexing_valid_sets:
            
            output = model(src, trg)
            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(indexing_valid_sets)

In [25]:
# 학습 진행

N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    train_loss = train(model, indexing_train_sets, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, indexing_valid_sets, criterion)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_model.pt')
        
    print('Epoch: {}'.format(epoch))
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 0
	Train Loss: 3.743 | Train PPL:  42.241
	 Val. Loss: 3.280 |  Val. PPL:  26.568
Epoch: 1
	Train Loss: 3.192 | Train PPL:  24.327
	 Val. Loss: 2.999 |  Val. PPL:  20.067
Epoch: 2
	Train Loss: 2.954 | Train PPL:  19.175
	 Val. Loss: 2.887 |  Val. PPL:  17.939
Epoch: 3
	Train Loss: 2.795 | Train PPL:  16.368
	 Val. Loss: 2.819 |  Val. PPL:  16.761
Epoch: 4
	Train Loss: 2.675 | Train PPL:  14.515
	 Val. Loss: 2.751 |  Val. PPL:  15.657
Epoch: 5
	Train Loss: 2.576 | Train PPL:  13.139
	 Val. Loss: 2.713 |  Val. PPL:  15.069
Epoch: 6
	Train Loss: 2.495 | Train PPL:  12.119
	 Val. Loss: 2.668 |  Val. PPL:  14.416
Epoch: 7
	Train Loss: 2.429 | Train PPL:  11.349
	 Val. Loss: 2.667 |  Val. PPL:  14.392
Epoch: 8
	Train Loss: 2.373 | Train PPL:  10.730
	 Val. Loss: 2.631 |  Val. PPL:  13.883
Epoch: 9
	Train Loss: 2.327 | Train PPL:  10.252
	 Val. Loss: 2.641 |  Val. PPL:  14.021


In [26]:
# 번역하는 함수 생성
def translate(model, src, vocab_en, vocab_fr_index2word, max_length):
    enc_output, enc_hidden = model.encoder(src)

    dec_hidden = enc_hidden

    decoded_word = []
    input = torch.tensor([SOS_token]).to(device)

    for i in range(max_length):
        dec_output, dec_hidden = model.decoder(input, dec_hidden)
        input = dec_output.argmax(1)
        if input.item() == EOS_token:
            decoded_word.append('<EOS>')
            break
        else:
            decoded_word.append(vocab_fr_index2word[input.item()])
            
    return decoded_word

In [27]:
def evaltest(model, indexing_test_sets, vocab_en, vocab_fr, vocab_fr_index2word, max_length):
    correct = 0
    for (src, trg) in indexing_test_sets:
        translated_sent = translate(model, src, vocab_en, vocab_fr_index2word, max_length)
        if translated_sent is not None:
            translated_sent_tensor = sentence2index(translated_sent, vocab_fr)
            if trg == translated_sent_tensor:
                correct += 1
        
    return correct/len(indexing_test_sets)

"""
model = Model(encoder, decoder)
model.load_state_dict(torch.load('best_model.pt')
"""

result = evaltest(model, indexing_test_sets, vocab_en, vocab_fr, vocab_cls_fr.index2word(), 10)

print("Model Accuracy : {}".format(result))

RuntimeError: bool value of Tensor with more than one value is ambiguous

In [None]:
model = Model(encoder, decoder)
model.load_state_dict(torch.load('best_model.pt')
                      
sent = "i am a boy ."
src = sentence2index(sent, vocab_en)
result = teanslate(model, src, vocab_en, vocab_cls_fr.index2word)