In [1]:
import torch
from torch import nn,optim
import matplotlib.pyplot as plt
from torch.nn import functional as F
import torchvision
import torchvision.transforms as transforms
import numpy as np
import pandas as pd
import re
import glob
import os
import unicodedata,string
import time
import math,random
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm

In [2]:
# Report CUDA availability and choose the device used for tensors and models.
cuda_available = torch.cuda.is_available()
print(cuda_available)
print(torch.cuda.device_count())
if cuda_available:
    # get_device_name(0) raises when no CUDA device is usable, so it is only
    # queried when CUDA is actually available.
    print(torch.cuda.get_device_name(0))
device = torch.device("cuda:0" if cuda_available else "cpu")
print(device)

True
1
GeForce 940MX
cuda:0


In [3]:
def findFiles(path):
    """Return the list of paths matching the glob pattern `path`."""
    matches = glob.glob(path)
    return matches


print(findFiles('data/*.txt'))

['data/eng-fra.txt']


In [4]:
# Load the tab-separated sentence pairs. The file has no header row, so the
# columns are labelled 0 and 1 (English and French text respectively).
data = pd.read_csv('data/eng-fra.txt', sep='\t', header=None)

In [5]:
# Lower-case both text columns so the vocabularies are case-insensitive.
for column in (0, 1):
    data[column] = data[column].str.lower()

In [6]:
# Print the number of rows, then show the first rows as the cell output.
print(data.shape[0])
data.head()

135842


Unnamed: 0,0,1
0,go.,va !
1,run!,cours !
2,run!,courez !
3,wow!,ça alors !
4,fire!,au feu !


In [7]:
# English text (column 0) of the row labelled 1.
data.loc[1, 0]

'run!'

In [8]:
# Reserved vocabulary ids shared by every language.
UNK, SOS, EOS = 0, 1, 2


class Lang:
    """Vocabulary of one language: word -> id (`w2i`) and id -> word (`i2w`)."""

    def __init__(self, name):
        # `name` is also passed to word_tokenize as its `language` argument.
        self.name = name
        self.w2i = {}
        # The reserved ids are only present in the id -> word table.
        self.i2w = {UNK: 'UNK', SOS: 'SOS', EOS: 'EOS'}
        # Next free id; regular words start right after the reserved ones.
        self.n_words = len(self.i2w)

    def addSentence(self, sentence, add=True):
        """Tokenize `sentence` and return its tokens.

        When `add` is true every token is also registered in the vocabulary.
        """
        tokens = word_tokenize(sentence, language=self.name)
        if add:
            for token in tokens:
                self.addWord(token)
        return tokens

    def addWord(self, word):
        """Assign the next free id to `word` unless it is already known."""
        if word in self.w2i:
            return
        index = self.n_words
        self.w2i[word] = index
        self.i2w[index] = word
        self.n_words = index + 1

In [9]:
# Number of leading corpus rows whose words are added to the vocabularies.
N_train = 100000
# One vocabulary per language; the name selects the tokenizer language.
eng = Lang('english')
fre = Lang('french')

In [10]:
# Sentence prefixes for an optional corpus filter (not applied by the code below).
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

# Rows [0, N_train): tokenize and register every token in both vocabularies.
pairs = []
for i in tqdm(range(N_train)):  # optional filter: if data[0][i].startswith(eng_prefixes)
    french_tokens = fre.addSentence(data[1][i])
    english_tokens = eng.addSentence(data[0][i])
    pairs.append([french_tokens, english_tokens])

# Remaining rows: tokenize only, the vocabularies are left untouched.
for i in tqdm(range(N_train, len(data))):
    french_tokens = fre.addSentence(data[1][i], False)
    english_tokens = eng.addSentence(data[0][i], False)
    pairs.append([french_tokens, english_tokens])

print(random.choice(pairs))

  0%|          | 0/100000 [00:00<?, ?it/s]

  0%|          | 0/35842 [00:00<?, ?it/s]

[["j'en", 'accepte', 'le', 'risque', '.'], ['i', 'accept', 'the', 'risk', '.']]


In [11]:
class EncoderRNN(nn.Module):
    """GRU encoder that consumes one token id per forward call.

    Args:
        input_size: number of rows in the embedding table (source vocabulary size).
        hidden_size: width of both the embedding vectors and the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed `input`, run one GRU step from `hidden`; return (output, new hidden)."""
        # Reshape to (seq_len=1, batch=1, hidden_size) as expected by nn.GRU.
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size).

        The state is allocated on the same device as the module's parameters
        rather than on a notebook-level global, so the module works wherever
        it has been moved with `.to(...)`.
        """
        return torch.zeros(1, 1, self.hidden_size, device=self.embedding.weight.device)

In [12]:
class DecoderRNN(nn.Module):
    """GRU decoder that emits log-probabilities over the output vocabulary.

    Args:
        hidden_size: width of both the embedding vectors and the GRU state.
        output_size: number of output classes (target vocabulary size).
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        Returns a tuple (log-probabilities of shape (1, output_size), new hidden).
        """
        # Reshape to (seq_len=1, batch=1, hidden_size) as expected by nn.GRU.
        embedded = self.embedding(input).view(1, 1, -1)
        activated = F.relu(embedded)
        output, hidden = self.gru(activated, hidden)
        # output[0] drops the sequence dimension before the projection.
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size).

        The state is allocated on the same device as the module's parameters
        rather than on a notebook-level global, so the module works wherever
        it has been moved with `.to(...)`.
        """
        return torch.zeros(1, 1, self.hidden_size, device=self.embedding.weight.device)

In [13]:
def indexesFromLine(lang, line):
    """Map each token of `line` to its id in `lang.w2i`.

    Tokens missing from the vocabulary map to 0, the id reserved for UNK.
    """
    lookup = lang.w2i
    return [lookup.get(token, 0) for token in line]


def tensorFromLine(lang, line):
    """Encode `line` as a column tensor of token ids terminated by EOS.

    The result has dtype long, shape (len(line) + 1, 1), and lives on `device`.
    """
    token_ids = indexesFromLine(lang, line) + [EOS]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Convert a [french_tokens, english_tokens] pair into (input, target) tensors."""
    source_tokens, target_tokens = pair[0], pair[1]
    return (tensorFromLine(fre, source_tokens), tensorFromLine(eng, target_tokens))

In [14]:
# Select matplotlib's non-interactive 'agg' backend.
# NOTE(review): with 'agg', figures may not render inline in the notebook —
# confirm this is intended.
plt.switch_backend('agg')

def showPlot(points):
    """Plot `points` as a line chart with y-axis major ticks every 0.2."""
    # plt.subplots() creates its own figure; a preceding plt.figure() call
    # would only leave an extra, empty figure open on every invocation.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=0.2))
    ax.plot(points)

In [15]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    """Run one optimisation step on a single (input, target) sentence pair.

    The encoder reads `input_tensor` one token at a time; the decoder then
    starts from SOS and is fed its own top-1 prediction at each step. Decoding
    stops after `target_tensor.size(0)` steps or as soon as EOS is predicted.

    Returns:
        The summed criterion loss divided by the number of decoder steps that
        were actually scored (0.0 when the target is empty).
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    for ei in range(input_length):
        _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

    decoder_input = torch.tensor([[SOS]], device=device)
    # Seed the decoder with the encoder's final hidden state, exactly as
    # evaluate() does, instead of the last per-step encoder output.
    decoder_hidden = encoder_hidden

    loss = 0
    scored_steps = 0
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        loss += criterion(decoder_output, target_tensor[di])
        scored_steps += 1

        # Feed the decoder's own best guess back in (no teacher forcing).
        _, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()
        if decoder_input.item() == EOS:
            break

    if scored_steps == 0:
        # Empty target: nothing to back-propagate.
        return 0.0

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    # Average over the steps that contributed to the loss; dividing by
    # target_length would under-report it whenever decoding stopped early.
    return loss.item() / scored_steps

In [16]:
def evaluate(encoder, decoder, input_tensor, max_length=20):
    """Greedily decode `input_tensor` and return the output as one string.

    Args:
        encoder: module called as encoder(token, hidden) -> (output, hidden).
        decoder: module called as decoder(token, hidden) -> (scores, hidden).
        input_tensor: source token ids, iterated along dimension 0.
        max_length: maximum number of decoding steps (defaults to 20, the
            limit that was previously hard-coded).

    Returns:
        The decoded words joined by single spaces; '<EOS>' is appended when
        the decoder predicts EOS within `max_length` steps.
    """
    with torch.no_grad():
        encoder_hidden = encoder.initHidden()
        for ei in range(input_tensor.size(0)):
            _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)

        decoder_input = torch.tensor([[SOS]], device=device)
        decoder_hidden = encoder_hidden

        decoded_words = []
        for _ in range(max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            _, topi = decoder_output.topk(1)
            token_id = topi.item()

            if token_id == EOS:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(eng.i2w[token_id])

            # Feed the predicted token back in as the next decoder input.
            decoder_input = topi.squeeze().detach()

        return ' '.join(decoded_words)

In [17]:
def trainIters(encoder, decoder, n_iters, plot_every=100, learning_rate=0.01):
    """Train `encoder`/`decoder` with SGD on `n_iters` randomly drawn pairs.

    Args:
        encoder, decoder: the modules to optimise.
        n_iters: number of single-pair training steps.
        plot_every: number of steps averaged into each plotted loss point.
        learning_rate: SGD learning rate used for both optimisers.

    The averaged losses are passed to showPlot() once training finishes.
    """
    plot_losses = []
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    # NOTE(review): `pairs` appears to hold every corpus row, including rows
    # tokenized without extending the vocabularies — confirm that training on
    # them is intended.
    for step in tqdm(range(1, n_iters + 1)):
        # Build each example on demand instead of materialising n_iters
        # tensor pairs on the device before the first step.
        input_tensor, target_tensor = tensorsFromPair(random.choice(pairs))

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        plot_loss_total += loss

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

In [18]:
# Width shared by the embedding and GRU layers of both networks.
hidden_size = 256
encoder1 = EncoderRNN(input_size=fre.n_words, hidden_size=hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size=hidden_size, output_size=eng.n_words).to(device)

In [19]:
# TODO: the number of iterations should be increased.
trainIters(encoder1, decoder1, n_iters=75000)

  0%|          | 0/75000 [00:00<?, ?it/s]

In [20]:
def evaluateRandom(encoder, decoder, n=10):
    """Translate `n` randomly chosen pairs and print source, reference and output."""
    for _ in range(n):
        pair = random.choice(pairs)
        source_tokens, reference_tokens = pair[0], pair[1]
        source_tensor = tensorsFromPair(pair)[0]

        translation = evaluate(encoder, decoder, source_tensor)

        print('>', source_tokens)
        print('=', reference_tokens)
        print('<', translation)
        print(' ')

In [21]:
# Print a handful of sample translations from the trained model.
evaluateRandom(encoder=encoder1, decoder=decoder1)

> ['le', 'seau', 'à', 'charbon', 'est', 'plein', '.']
= ['the', 'coal', 'bin', 'is', 'full', '.']
< the is is in . . . <EOS>
 
> ['il', 'me', 'critiqua', 'pour', 'avoir', 'négligé', 'mon', 'devoir', '.']
= ['he', 'criticized', 'me', 'for', 'neglecting', 'my', 'duty', '.']
< he went to to my to my . . <EOS>
 
> ['nous', 'ferions', 'mieux', 'de', 'faire', 'quelque', 'chose', '.']
= ['we', "'d", 'better', 'do', 'something', '.']
< we 'd better better better . . <EOS>
 
> ['il', 'est', 'allé', 'faire', 'des', 'courses', '.']
= ['he', 'went', 'shopping', '.']
< he went to . . <EOS>
 
> ["c'est", 'mon', 'tour', '.']
= ['it', "'s", 'my', 'turn', '.']
< this 's my . . <EOS>
 
> ['aidons-le', 'afin', "qu'il", 'réussisse', '.']
= ['let', "'s", 'help', 'him', 'so', 'that', 'he', 'will', 'succeed', '.']
< UNK UNK UNK he he he . . <EOS>
 
> ['mon', 'médecin', 'm', "'", 'a', 'conseillé', 'de', "m'abstenir", 'de', 'consommer', 'de', "l'alcool", 'pendant', 'un', 'certain', 'temps', '.']
= ['my', 'phys