# 2019-10-22_fundamentallearning_pytorchWordRNN_translation

### TODO
- follow this tutorial https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
- use reloading on training loop https://github.com/julvo/reloading

In [1]:
%load_ext autoreload
%autoreload 2

In [51]:
import os
import re
import random
import requests
import unicodedata
import torch.nn as nn
import torch
import torch.optim as optim
import torch.nn.functional as F
import itertools
import pdb
from time import time
from datetime import datetime

import pytorch_tutorial.word_utils as word_utils
from pytorch_tutorial.word_utils import (
    prepare_data, 
    pair2tensors, 
    sentence2tensor,
    timedelta_string,
    SOS_token, 
    EOS_token, 
    MAX_LENGTH, 
    
)




In [3]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device is : {DEVICE}")

device is : cpu


Get french english translations from https://www.manythings.org/anki/fra-eng.zip and put them in a folder `pytorch_tutorial/data/fra-eng/fra.txt`


## Load and process data

filter for short sentences that start with 'i am', 'he is' etc

In [19]:
fra_lang, eng_lang, sentence_pairs = prepare_data('fra', 'eng')

Loading data
example loaded pair  ['va !', 'go .']
total sentance pairs in data : 175623
filtering data
Filtered sentence pairs in data : 13019
fra : 4790
eng : 3083


In [20]:
sentence_pairs[0]

['j ai ans .', 'i m .']

## Model 

In [21]:
class EncoderRNN(nn.Module):
    def __init__(self, inout_dim, hidden_dim=256):
        super(EncoderRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(inout_dim, hidden_dim)
        self.encoder = nn.GRU(hidden_dim, hidden_dim)
        
    def forward(self,x,hidden):
        embedding = self.embedding(x).view(1,1,-1)
        output, hidden = self.encoder(embedding,hidden)
        return output,hidden
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_dim, device=DEVICE)
    
class DecoderRNN(nn.Module):
    def __init__(self, inout_dim, hidden_dim=256):
        super(DecoderRNN,self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(inout_dim, hidden_dim)
        self.encoder = nn.GRU(hidden_dim, hidden_dim)
        self.hidden2out = nn.Linear(hidden_dim, inout_dim)
        self.softmax = nn.LogSoftmax(dim =1)
        
    def forward(self, x, hidden):
        embedding = self.embedding(x).view(1,1,-1)
        output, hidden = self.encoder(embedding, hidden)
        output = self.softmax(self.hidden2out(output[0]))
        
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_dim, device=DEVICE)
    
    
        

In [22]:
sentence_pairs[0]

['j ai ans .', 'i m .']

### Dummy pass into model

In [54]:
def random_infer(dset):
    dummy_pair = random.choice(dset)
    dummy_in = sentence2tensor(fra_lang, dummy_pair[0], DEVICE)
    dummy_out = sentence2tensor(eng_lang, dummy_pair[1], DEVICE)
    enc_h0 = enc.init_hidden()
    h = enc_h0

    for i in range(dummy_in.size(0)):
        _, h = enc(dummy_in[i],h)
    enc_h = h

    outs = []
    hidden = enc_h
    output = torch.tensor([[SOS_token]], device=DEVICE)
    for i in range(dummy_out.size(0)):
        output, h = dec(output,hidden)

        top_value, top_index = output.topk(1)  
        output = top_index.squeeze().detach()
        outs.append(output)


    [eng_lang.index2word[o.item()] for o in outs]
    print(f"x : {dummy_pair[0]}")
    print(f"y : {dummy_pair[1]}")
    print(f"ŷ : {' '.join([eng_lang.index2word[o.item()] for o in outs])}")



In [39]:
random_infer()

x : ce n est pas mon cousin .
y : he isn t my cousin .
ŷ : exactly agree freaking spoiling guilty remodeling launch


## training model

In [25]:
def eval_example(input_tensor, target_tensor, enc, dec, optimizer, criterion, teacher_forcing=0.5):
    encoder_hidden = enc.init_hidden()
    optimizer.zero_grad()
    loss = 0
    
    target_length = input_tensor.size(0)
    
    for ei in range(target_length):
        _, encoder_hidden = enc(input_tensor[ei], encoder_hidden)
        
    decoder_input = torch.tensor([[SOS_token]],device=DEVICE)
    decoder_hidden = encoder_hidden
    
    for di in range(target_tensor.size(0)):
        decoder_input, decoder_hidden = dec(decoder_input, decoder_hidden)
        # for nll output = [batchsize, number of classes (words)] and target = correct class
        loss += criterion(decoder_input,target_tensor[di])
        if random.random()<teacher_forcing:
            decoder_input = target_tensor[di] #teacher forcing
        else:
            topv, topi = decoder_input.topk(1)
            decoder_input = topi.squeeze().detach()
    return loss

def train_step(input_tensor, target_tensor, enc, dec, optimizer, criterion):
    loss = eval_example(input_tensor, target_tensor, enc, dec, optimizer, criterion, teacher_forcing=True)
    loss.backward()
    optimizer.step()
    return loss.item()/ target_tensor.size(0)

def test_evaluate(test_set):
    
    for i, pair in enumerate(test_set):
        input_tensor = pair[0]
        target_tensor = pair[1]
        loss_total = 0
        with torch.no_grad():
            loss = eval_example(input_tensor, target_tensor, enc, dec, optimizer, criterion)
            loss_total += loss.item()/ target_tensor.size(0)
        return loss_total

def train(training_set, test_set, epochs, print_every):
    set_size = len(training_set)
    start_time = datetime.now()
    train_losses = []
    eval_losses = []
    for e in range(epochs):
        for i, pair in enumerate(training_set):
            n_example = ((e)*set_size + (i+1))
            input_tensor = pair[0]
            target_tensor = pair[1]
            loss_total =0
            loss = train_step(input_tensor, target_tensor, enc, dec, optimizer, criterion)
            train_losses.append((n_example,loss))
            loss_total += loss

            if ((e)*set_size + (i+1))%print_every ==0:
                timestamp = f"trainging time : {timedelta_string(datetime.now() - start_time)}"
                example_count = f"examples {n_example:8}"
                eval_loss = test_evaluate(test_set)
                eval_losses.append((n_example,loss))
                print (f'epoch {e:3} | {example_count} | loss:{loss_total:5.2f} | test loss:{eval_loss:5.2f} | {timestamp}')
                loss_total = 0
    return train_losses, eval_losses

    
    

In [56]:
print(f"total examples {len(sentence_pairs)}")
def train_test_split(total_examples, training_size = 1000,test_size = 100):
    total_idx = [ i for i in range(len(total_examples))]
    training_idx = [random.choice(total_idx) for _ in range(training_size)]
    remaining_idx = list(set(total_idx) - set(training_idx))
    test_idx = [random.choice(remaining_idx) for _ in range(test_size)]
    assert set(test_idx).intersection(training_idx) == set([])
    
    training_examples = [total_examples[i] for i in training_idx]
    test_examples = [total_examples[i] for i in test_idx]
    
    
    training_set = [ pair2tensors(fra_lang, eng_lang, e , DEVICE) for e in training_examples]
    test_set = [pair2tensors(fra_lang, eng_lang, e, DEVICE) for e in test_examples]
    return training_set, test_set, training_examples, test_examples

total examples 13019


No attention: results

    epoch   2 | examples    36000 | loss: 1.06 | test loss: 4.52 | trainging time : 0:00:09:44.56
    
With many of the random examples looking like:
    
    ['ils ne sont pas prepares a ca .', 'they re not prepared for this .']
    ['they', 're', 'not', 'they', 're', 'not', 'they', 're']

## Attention models

In [27]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_dim, output_dim, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        self.embedding = nn.Embedding(self.output_dim, self.hidden_dim)
        self.dropout = nn.Dropout(self.dropout_p)
        
        self.attn = nn.Linear(self.hidden_dim*2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_dim * 2 , self.hidden_dim)
        self.gru = nn.GRU(self.hidden_dim, self.hidden_dim)
        self.out = nn.Linear(self.hidden_dim, self.output_dim)
        self.softmax = nn.Softmax(dim=1)
        self.log_softmax = nn.LogSoftmax(dim=1)
        self.relu = nn.ReLU()
        
    def forward(self, x, hidden, encoder_outputs):
        embedded = self.dropout(self.embedding(x).view(1, 1, -1)) # (1, 1, h_dim)
        
        # (1,2 * h_dim) -> (1, max_l)
        attn_weights = self.attn(torch.cat((hidden[0], embedded[0]), 1))
        attn_weights = self.softmax(attn_weights) 
        
        # (1,1,max_l) , (1,max_l,h_dim) - > (1, 1, h_dim)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
        
        
        output = torch.cat((embedded[0], attn_applied[0]), 1) # (1,1, 2 * h_dim)
        output = self.relu(self.attn_combine(output).unsqueeze(0)) # (1,1, h_dim)
        
        output, hidden = self.gru(output, hidden)
        output = self.log_softmax(self.out(output[0]))
        return output, hidden, attn_weights
        
        
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_dim, device=DEVICE)
    
    


In [28]:
def attn_test_evaluate(test_set):
    
    for i, pair in enumerate(test_set):
        input_tensor = pair[0]
        target_tensor = pair[1]
        loss_total = 0
        with torch.no_grad():
            loss = attn_eval_example(input_tensor, target_tensor, attn_enc, attn_dec, attn_optimizer, criterion)
            loss_total += loss.item()/ target_tensor.size(0)
        return loss_total
    
def attn_eval_example(input_tensor, target_tensor, attn_enc, attn_dec, attn_optimizer, criterion, teacher_forcing=0.5):
    encoder_hidden = attn_enc.init_hidden()
    attn_optimizer.zero_grad()
    loss = 0
    
    input_length = input_tensor.size(0)
    encoder_outputs = torch.zeros(MAX_LENGTH, attn_enc.hidden_dim, device=DEVICE)
    
    for ei in range(input_length):
        encoder_output, encoder_hidden = attn_enc(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0,0]
        
    dec_input = torch.tensor([[SOS_token]],device=DEVICE)
    dec_hidden = encoder_hidden
    
    
    
    for di in range(target_tensor.size(0)):
        dec_input, dec_hidden, dec_attention = attn_dec(dec_input, dec_hidden, encoder_outputs)
        # for nll output = [batchsize, number of classes (words)] and target = correct class
        loss += criterion(dec_input,target_tensor[di])
        
        if random.random()<teacher_forcing:
            dec_input = target_tensor[di] #teacher forcing
        else:
            topv, topi = dec_input.topk(1)
            dec_input = topi.squeeze().detach()
            if dec_input.item() == EOS_token:
                break
    return loss

def attn_train_step(input_tensor, target_tensor, attn_enc, attn_dec, attn_optimizer, criterion):
    loss = attn_eval_example(input_tensor, target_tensor, attn_enc, attn_dec, attn_optimizer, criterion, teacher_forcing=True)
    loss.backward()
    attn_optimizer.step()
    return loss.item()/ target_tensor.size(0)



def attn_train(training_set, test_set, epochs, print_every):
    set_size = len(training_set)
    start_time = datetime.now()
    
    train_losses = []
    eval_losses = []
    for e in range(epochs):
        for i, pair in enumerate(training_set):
            n_example = ((e)*set_size + (i+1))
            input_tensor = pair[0]
            target_tensor = pair[1]
            loss_total =0
            loss = attn_train_step(input_tensor, target_tensor, attn_enc, attn_dec, attn_optimizer, criterion)
            train_losses.append((n_example,loss))
            loss_total += loss

            if ((e)*set_size + (i+1))%print_every ==0:
                timestamp = f"trainging time : {timedelta_string(datetime.now() - start_time)}"
                example_count = f"examples {n_example:8}"
                eval_loss = attn_test_evaluate(test_set)
                eval_losses.append((n_example,eval_loss))
                print (f'epoch {e:3} | {example_count} | loss:{loss_total:5.2f} | test loss:{eval_loss:5.2f} | {timestamp}')
                loss_total = 0
    return train_losses, eval_losses

    
    

In [53]:
def attn_random_infer(dset):
    dummy_pair = random.choice(dset)
    dummy_in = sentence2tensor(fra_lang, dummy_pair[0], DEVICE)
    dummy_out = sentence2tensor(eng_lang, dummy_pair[1], DEVICE)
    enc_h0 = attn_enc.init_hidden()
    h = enc_h0
    
    enc_outputs = torch.zeros(MAX_LENGTH, attn_enc.hidden_dim, device=DEVICE)
    for i in range(dummy_in.size(0)):
        o, h = attn_enc(dummy_in[i], h)
        enc_outputs[i] = o[0,0]
    enc_h = h

    outs = []
    hidden = enc_h
    output = torch.tensor([[SOS_token]], device=DEVICE)
    for i in range(dummy_out.size(0)*2):
        output, hidden, attention_weights = attn_dec(output, hidden, enc_outputs)

        top_value, top_index = output.topk(1)  
        output = top_index.squeeze().detach()
        outs.append(output)
        if output.item()==EOS_token:
            break

    print(f"x : {dummy_pair[0]}")
    print(f"y : {dummy_pair[1]}")
    print(f"ŷ : {' '.join([eng_lang.index2word[o.item()] for o in outs])}")

## Training comparison

In [59]:
training_set, test_set, training_examples, test_examples = train_test_split(sentence_pairs, 13000, 1000)

### normal rnn

In [60]:
enc = EncoderRNN(fra_lang.n_words).to(DEVICE)
dec = DecoderRNN(eng_lang.n_words).to(DEVICE)
criterion = nn.NLLLoss()
optimizer = optim.SGD(itertools.chain(enc.parameters(),dec.parameters()),lr=0.01)

In [67]:
random_infer(training_examples)

x : je suis tres reconnaissant pour votre aide .
y : i m very grateful for your help .
ŷ : i m not i m not i m not


In [68]:
train_losses, test_losses = train(training_set, test_set, 3, 5000)

In [69]:
random_infer(training_examples)

x : tu es connu .
y : you re famous .
ŷ : you re just going to


In [70]:
random_infer(test_examples)

x : je suis sur que ca changera bientot .
y : i m sure that ll change soon .
ŷ : i m sick of the moment i m sick


### rnn with attention

In [71]:
attn_dec = AttnDecoderRNN(hidden_dim=256, output_dim=eng_lang.n_words).to(DEVICE)
attn_enc = EncoderRNN(fra_lang.n_words, hidden_dim=256).to(DEVICE)
attn_optimizer = optim.SGD(itertools.chain(attn_enc.parameters(),attn_dec.parameters()),lr=0.01)

In [72]:
attn_random_infer(training_examples)

x : je n ai pas faim non plus .
y : i m not hungry either .
ŷ : flexible flexible documents bragging bragging bragging mop freak fat simple for behind armed japan


In [73]:
attn_train_losses, attn_eval_losses = attn_train(training_set, test_set, 3, 5000)

In [74]:
attn_random_infer(training_examples)

x : vous m attirez beaucoup .
y : i m very drawn to you .
ŷ : i m of the the . EOS


In [75]:
random_infer(test_examples)

x : c est vraiment une fille adorable .
y : she is indeed a lovely girl .
ŷ : i m not afraid of the moment you
