In [1]:
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
    
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    lines = open('../data/%s-%s.txt' % (lang1, lang2),
                encoding='utf-8').read().strip().split('\n')

    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs



def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
print(random.choice(pairs))

def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)


Reading lines...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words...
Counted words:
fra 4345
eng 2803
['je suis contre le travail du dimanche .', 'i am against working on sundays .']


In [5]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size) -> None:
        super(Encoder, self).__init__()
        
        # vocab size
        self.input_size = input_size
        
        # hidden neurons in the RNNs
        self.hidden_size = hidden_size 
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, source_input, source_hidden):
        encoder_embedded = self.embedding(source_input).view(1, 1, -1)
        encoder_output, encoder_hidden = self.gru(encoder_embedded, source_hidden)
        return encoder_output, encoder_hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device = DEVICE)
        
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size) -> None:
        super(Decoder, self).__init__()
        
        # encoder hidden unit size
        self.hidden_size = hidden_size
        
        # output lang vocab size
        self.output_size = output_size
        
        self.embeddings = nn.Embedding(self.output_size, self.hidden_size)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.linear = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, target_input, target_hidden):
        # print(target_input.size())
        decoder_embedded = self.embeddings(target_input).view(1, 1, -1)
        decoder_embedded = F.relu(decoder_embedded)
        # print(embedded.size())
        decoder_output, decoder_hidden = self.gru(decoder_embedded, target_hidden)
        linear_output = self.linear(decoder_output)[0]
        # print(linear_output.size())
        decoder_output = self.softmax(linear_output)
        return decoder_output, decoder_hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device = DEVICE)     
        

In [54]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        # print(f'1 input {input.size()} hidden {hidden.size()} encoder_outputs {encoder_outputs.size()}')
        embedded = self.embedding(input).view(1, 1, -1)
        # print(f'2 embedded {embedded.size()}')
        embedded = self.dropout(embedded)

        # print(f'3 embedded {embedded.size()}')
        concat = torch.cat((embedded[0], hidden[0]), 1)
        # print(f'4 concat {concat.size()}')
        attn = self.attn(concat)
        # print(f'5 attn {attn.size()}')
        attn_weights = F.softmax(attn, dim=1)
        attn_weights_unsq = attn_weights.unsqueeze(0)
        encoder_outputs_unsq = encoder_outputs.unsqueeze(0)
        # print(f'6 attn_weights {attn_weights.size()} attn_weights_unsq {attn_weights_unsq.size()} encoder_outputs_unsq {encoder_outputs_unsq.size()}')
        attn_applied = torch.bmm(attn_weights_unsq, encoder_outputs_unsq)
        # print(f'7 attn_applied {attn_applied.size()}')
        
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        # print(f'8 output {output.size()}')
        output = self.attn_combine(output).unsqueeze(0)
        # print(f'9 output {output.size()}')

        output = F.relu(output)
        # print(f'10 output {output.size()}')
        output, hidden = self.gru(output, hidden)
        # print(f'11 output {output.size()} hidden {hidden.size()}')

        output = F.log_softmax(self.out(output[0]), dim=1)
        # print(f'12 output {output.size()}')
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [55]:
n_iters = 1
training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(n_iters)]
input_tensor, target_tensor = training_pairs[0][0][0], training_pairs[0][1][0]
len(training_pairs), input_tensor.size(), target_tensor.size()

(1, torch.Size([1]), torch.Size([1]))

In [56]:
hidden_size = 256
encoder_gru = Encoder(input_lang.n_words, hidden_size=hidden_size).to(DEVICE)
hidden_units = encoder_gru.initHidden()
encoder_output, encoder_hidden = encoder_gru(input_tensor, hidden_units)
encoder_output.size(), encoder_hidden.size()
encoder_outputs = torch.zeros(MAX_LENGTH, encoder_gru.hidden_size,
                              device=DEVICE)
encoder_outputs[0] = encoder_output[0, 0]

In [57]:
attn_decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(DEVICE)
attn_decoder_output, attn_decoder_hidden, attn_decoder_weights = attn_decoder(target_tensor,
                                                                              encoder_hidden,
                                                                              encoder_outputs) 

In [58]:
decoder_gru = Decoder(hidden_size=hidden_size, 
                         output_size=output_lang.n_words,
                         ).to(DEVICE)
decoder_output, decoder_hidden = decoder_gru(target_tensor, encoder_hidden)
decoder_output.size(), decoder_hidden.size()

(torch.Size([1, 2803]), torch.Size([1, 1, 256]))

In [59]:
target_tensor.type()

'torch.cuda.LongTensor'

In [60]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, max_length=MAX_LENGTH):
    
      encoder_hidden = encoder.initHidden()

      encoder_optimizer.zero_grad()
      decoder_optimizer.zero_grad()

      input_length = input_tensor.size(0)
      target_length = target_tensor.size(0)

      encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=DEVICE)

      loss = 0

      for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], 
                                                      encoder_hidden)
            encoder_outputs[ei] = encoder_output[0, 0]
      
      decoder_input = torch.tensor([[SOS_token]], dtype=torch.int64,
                                   device=DEVICE)
      decoder_hidden = encoder_hidden
      for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input,
                                                                        decoder_hidden,
                                                                        encoder_outputs)
            # decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]
            
      loss.backward()
      encoder_optimizer.step()
      decoder_optimizer.step()
      
      return loss.item() / target_length

def train_iters(encoder, decoder, n_iters, print_every=1000, plot_every=100, 
                learning_rate=0.01):
      encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
      decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
      training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(n_iters)]
      crierion = nn.NLLLoss()
      for iter in range(1, n_iters+1):
            training_pair = training_pairs[iter-1]
            input_tensor = training_pair[0]
            target_tensor = training_pair[1]
            
            loss = train(input_tensor=input_tensor, target_tensor=target_tensor, 
                         encoder=encoder, decoder=decoder, 
                         encoder_optimizer=encoder_optimizer, 
                         decoder_optimizer=decoder_optimizer,
                         criterion=crierion,)
            if iter % print_every ==0:
                  print(loss)

In [61]:
hidden_size = 256
encoder = Encoder(input_lang.n_words, hidden_size=hidden_size).to(device=DEVICE)
# decoder = Decoder(hidden_size, output_lang.n_words).to(device=DEVICE)
decoder_attn = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device=DEVICE)
train_iters(encoder, decoder_attn, 75000, print_every=5000)

3.140512466430664
1.6582053899765015
2.3729005541120256
0.40833795070648193
1.546718915303548
0.46567126115163165
0.5524654388427734
0.4465926170349121
0.028758640090624493
0.07189585481371198
0.7609364986419678
0.42893415027194554
0.00702615703145663
1.5226192474365234
0.16236746311187744


In [26]:
# inp_temp = tensorFromSentence(input_lang, pairs[0][0])
# temp_hidden = encoder.initHidden()
# encoder_output_temp, encoder_hidden_temp = encoder(inp_temp[0], temp_hidden)
# decoder_inp_temp = torch.tensor([[SOS_token]], device=DEVICE)
# decoder_output_temp, decoder_hidden_temp = decoder(decoder_inp_temp,
#                                                    encoder_hidden_temp)
# # decoder_output_temp.size(), decoder_hidden_temp.size()
# topv, topi = decoder_output_temp.data.topk(1)
# topv, topi, topi.squeeze(), topi.squeeze().detach()

In [62]:
decoder_input = torch.tensor([[SOS_token]], device=DEVICE)
decoder_input

tensor([[0]], device='cuda:0')

In [64]:
def predict(encoder, decoder, sentence, max_length = MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=DEVICE)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] = encoder_output[0, 0]
        decoder_input = torch.tensor([[SOS_token]], device=DEVICE)
        decoder_hidden = encoder_hidden
        
        decoded_words = []
        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, 
                                                                        decoder_hidden,
                                                                        encoder_outputs)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[topi.item()])
            decoder_input = topi
        return decoded_words

In [72]:
pair = random.choice(pairs)
test_sentence, correct_sentence  = pair[0], pair[1]
decoded_words_final = predict(encoder, decoder_attn, test_sentence)
test_sentence, correct_sentence, decoded_words_final, 

('elle est inquiete pour votre securite .',
 'she s worried about your safety .',
 ['she', 's', 'worried', 'about', 'your', 'safety', '.', '<EOS>'])