# 2019-10-22_fundamentallearning_pytorchWordRNN_translation

### TODO
- follow this tutorial https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
- use reloading on training loop https://github.com/julvo/reloading

In [1]:
import os
import re
import random
import requests
import unicodedata
import torch.nn as nn
import torch
import torch.optim as optim
import torch.nn.functional as F
import itertools
import pdb
from time import time
from datetime import datetime

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device is : {device}")

device is : cuda


Get french english translations from https://www.manythings.org/anki/fra-eng.zip and put them in a folder `data/pytorch_tutorial/fra-eng/fra.txt`


In [2]:

def timedelta_string(delta_time):
    days = delta_time.days
    hours = delta_time.seconds // 3600
    minutes = delta_time.seconds % 3600 // 60
    seconds = delta_time.seconds % 60 + delta_time.microseconds / 1e6
    return f"{days:3>}:{hours:02}:{minutes:02}:{seconds:05.2f}"




### Language Processing Code

In [3]:
# start and end tokens
SOS_token = 0
EOS_token = 1

class Lang:
    """Helper class to manage index <=> word translation"""
    def __init__(self,name):
        self.name = name
        self.word2index = {}
        self.index2word = {SOS_token:'SOS', EOS_token:'EOS'}
        self.word2count = {} # for rare words
        self.n_words = 2
        
    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.word2count[word] = 1
            self.n_words += 1
        else:
            self.word2count[word] += 1
    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            self.add_word(word)

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD',s)
        if unicodedata.category(c) != 'Mn'
    
    )

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    # adds a space before end of sentance punctuation
    s = re.sub(r"([.!?])", r" \1", s) 
    #replaces all other punctuation or unusual characters with spaces
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)     
    return s
    
def sentence2indexlist(lang, s):
    return [lang.word2index[w] for w in s.split(' ')]

def sentence2tensor(lang,s):
    indexes = sentence2indexlist(lang, s)
    indexes.append(EOS_token)
    return torch.tensor(indexes ,dtype=torch.long, device=device).view(-1,1)

def pair2tensors(l1,l2,pair):
    input_tensor = sentence2tensor(l1,pair[0])
    target_tensor = sentence2tensor(l2,pair[1])
    return (input_tensor, target_tensor)

## Load and process data

filter for short sentences that start with 'i am', 'he is' etc

In [4]:
MAX_LENGTH = 10
ENG_PREFIXES = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def check_pair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and p[0].startswith(ENG_PREFIXES)
def filter_pairs(pairs):
    # file is english french, so must reverse when filtering
    return [p[::-1] for p in pairs if check_pair(p)]
    
    

In [5]:
def prepare_data(lang1, lang2, simple_sentences = True):
    print('Loading data')
    with open(f'data/pytorch_tutorial/{lang1}-{lang2}/fra.txt') as f:
        pairs = [[ normalizeString(s) for s in line.split('\t')[:2]] for line in f]
    print(f"total sentance pairs in data : {len(pairs)}")
    print('filtering data')
    if simple_sentences:
        pairs = filter_pairs(pairs)
        print(f"Filtered sentance pairs in data : {len(pairs)}")
    print(random.choice(pairs))
    l1 = Lang(lang1)
    l2 = Lang(lang2)
    
    for p in pairs:
        l1.add_sentence(p[0])
        l2.add_sentence(p[1])
    print(f'{l1.name} : {l1.n_words}')
    print(f'{l2.name} : {l2.n_words}')
    return l1, l2, pairs

In [6]:
fra_lang, eng_lang, sentence_pairs = prepare_data('fra', 'eng')

Loading data
total sentance pairs in data : 175623
filtering data
Filtered sentance pairs in data : 14213
['je serai en retard .', 'i m going to be late .']
fra : 5037
eng : 3262


In [7]:
print(sentence_pairs[0])
pair2tensors(fra_lang, eng_lang, sentence_pairs[0])

['j ai ans .', 'i m .']


(tensor([[2],
         [3],
         [4],
         [5],
         [1]], device='cuda:0'), tensor([[2],
         [3],
         [4],
         [1]], device='cuda:0'))

In [8]:
test = "I'm a cat"
sentence2tensor(eng_lang,normalizeString(test))
# sentence2indexlist(eng_lang,normalizeString(test))

tensor([[  2],
        [  3],
        [ 50],
        [792],
        [  1]], device='cuda:0')

## Model 

In [9]:
class EncoderRNN(nn.Module):
    def __init__(self, inout_dim, hidden_dim=256):
        super(EncoderRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(inout_dim, hidden_dim)
        self.encoder = nn.GRU(hidden_dim, hidden_dim)
        
    def forward(self,x,hidden):
        embedding = self.embedding(x).view(1,1,-1)
        output, hidden = self.encoder(embedding,hidden)
        return output,hidden
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_dim, device=device)
    
class DecoderRNN(nn.Module):
    def __init__(self, inout_dim, hidden_dim=256):
        super(DecoderRNN,self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(inout_dim, hidden_dim)
        self.encoder = nn.GRU(hidden_dim, hidden_dim)
        self.hidden2out = nn.Linear(hidden_dim, inout_dim)
        self.softmax = nn.LogSoftmax(dim =1)
        
    def forward(self, x, hidden):
        embedding = self.embedding(x).view(1,1,-1)
        output, hidden = self.encoder(embedding, hidden)
        output = self.softmax(self.hidden2out(output[0]))
        
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_dim, device=device)
    
    
        

In [10]:
enc = EncoderRNN(fra_lang.n_words).to(device)
dec = DecoderRNN(eng_lang.n_words).to(device)
criterion = nn.NLLLoss()
optimizer = optim.SGD(itertools.chain(enc.parameters(),dec.parameters()),lr=0.01)

### Dummy pass into model

In [11]:
dummy_pair = random.choice(sentence_pairs)
dummy_in = sentence2tensor(fra_lang, dummy_pair[0])
dummy_out = sentence2tensor(eng_lang, dummy_pair[1])
enc_h0 = enc.init_hidden()
h = enc_h0

print(dummy_pair)
for i in range(dummy_in.size(0)):
    _, h = enc(dummy_in[i],h)
enc_h = h

outs = []
hidden = enc_h
output = torch.tensor([[SOS_token]], device=device)
for i in range(dummy_out.size(0)):
    output, h = dec(output,hidden)

    top_value, top_index = output.topk(1)  
    output = top_index.squeeze().detach()
    outs.append(output)
    

[eng_lang.index2word[o.item()] for o in outs]

['je suis employe de banque .', 'i m a bank clerk .']


['honest',
 'carpenter',
 'swamped',
 'pressure',
 'community',
 'involved',
 'considered']

## training model

In [90]:
def eval_example(input_tensor, target_tensor, enc, dec, optimizer, criterion, teacher_forcing=0.5):
    encoder_hidden = enc.init_hidden()
    optimizer.zero_grad()
    loss = 0
    
    target_length = input_tensor.size(0)
    
    for ei in range(target_length):
        _, encoder_hidden = enc(input_tensor[ei], encoder_hidden)
        
    decoder_input = torch.tensor([[SOS_token]],device=device)
    decoder_hidden = encoder_hidden
    
    for di in range(target_tensor.size(0)):
        decoder_input, decoder_hidden = dec(decoder_input, decoder_hidden)
        # for nll output = [batchsize, number of classes (words)] and target = correct class
        loss += criterion(decoder_input,target_tensor[di])
        if random.random()<teacher_forcing:
            decoder_input = target_tensor[di] #teacher forcing
        else:
            topv, topi = decoder_input.topk(1)
            decoder_input = topi.squeeze().detach()
    return loss

def train_step(input_tensor, target_tensor, enc, dec, optimizer, criterion):
    loss = eval_example(input_tensor, target_tensor, enc, dec, optimizer, criterion, teacher_forcing=True)
    loss.backward()
    optimizer.step()
    return loss.item()/ target_tensor.size(0)

def test_evaluate(test_set):
    
    for i, pair in enumerate(test_set):
        input_tensor = pair[0]
        target_tensor = pair[1]
        loss_total = 0
        with torch.no_grad():
            loss = eval_example(input_tensor, target_tensor, enc, dec, optimizer, criterion)
            loss_total += loss.item()/ target_tensor.size(0)
        return loss_total

def train(training_set, test_set, epochs, print_every):
    set_size = len(training_set)
    start_time = datetime.now()

    for e in range(epochs):
        for i, pair in enumerate(training_set):
            input_tensor = pair[0]
            target_tensor = pair[1]
            loss_total =0
            loss = train_step(input_tensor, target_tensor, enc, dec, optimizer, criterion)
            loss_total += loss

            if ((e)*set_size + (i+1))%print_every ==0:
                timestamp = f"trainging time : {timedelta_string(datetime.now() - start_time)}"
                example_count = f"examples {((e)*set_size + (i+1)):8}"
                eval_loss = test_evaluate(test_set)
                print (f'epoch {e:3} | {example_count} | loss:{loss_total:5.2f} | test loss:{eval_loss:5.2f} | {timestamp}')
                loss_total = 0

    
    

In [22]:
print(f"total examples {len(sentence_pairs)}")
def train_test_split(total_examples, training_size = 1000,test_size = 100):
    total_idx = [ i for i in range(len(total_examples))]
    training_idx = [random.choice(total_idx) for _ in range(training_size)]
    remaining_idx = list(set(total_idx) - set(training_idx))
    test_idx = [random.choice(remaining_idx) for _ in range(test_size)]
    assert set(test_idx).intersection(training_idx) == set([])
    training_set = [ pair2tensors(fra_lang, eng_lang, total_examples[i]) for i in training_idx]
    test_set = [pair2tensors(fra_lang, eng_lang, total_examples[i]) for i in test_idx]
    return training_set, test_set

total examples 14213


In [23]:
training_set, test_set = train_test_split(sentence_pairs, 13000, 1000)

In [111]:
train(training_set, test_set, 10, 5000)

KeyboardInterrupt: 

No attention: results

    epoch   2 | examples    36000 | loss: 1.06 | test loss: 4.52 | trainging time : 0:00:09:44.56
    
With many of the random examples looking like:
    
    ['ils ne sont pas prepares a ca .', 'they re not prepared for this .']
    ['they', 're', 'not', 'they', 're', 'not', 'they', 're']

## Attention models

In [12]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_dim, output_dim, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        self.embedding = nn.Embedding(self.output_dim, self.hidden_dim)
        self.dropout = nn.Dropout(self.dropout_p)
        
        self.attn = nn.Linear(self.hidden_dim*2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_dim * 2 , self.hidden_dim)
        self.gru = nn.GRU(self.hidden_dim, self.hidden_dim)
        self.out = nn.Linear(self.hidden_dim, self.output_dim)
        self.softmax = nn.Softmax(dim=1)
        self.relu = nn.ReLU()
        
    def forward(self, x, hidden, encoder_outputs):
        embedded = self.dropout(self.embedding(x).view(1, 1, -1)) # (1, 1, h_dim)
        
        # (1,2 * h_dim) -> (1, max_l)
        attn_weights = self.attn(torch.cat((hidden[0], embedded[0]), 1))
        attn_weights = self.softmax(attn_weights) 
        
        # (1,1,max_l) , (1,max_l,h_dim) - > (1, 1, h_dim)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
        
        
        output = torch.cat((embedded[0], attn_applied[0]), 1) # (1,1, 2 * h_dim)
        output = self.relu(self.attn_combine(output).unsqueeze(0)) # (1,1, h_dim)
        
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden, attn_weights
        
        
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_dim, device=device)
    
    


In [14]:
attn_dec = AttnDecoderRNN(hidden_dim=256, output_dim=eng_lang.n_words).to(device)
attn_enc = EncoderRNN(fra_lang.n_words, hidden_dim=256).to(device)
attn_optimizer = optim.SGD(itertools.chain(attn_enc.parameters(),attn_dec.parameters()),lr=0.01)

In [35]:
def attn_eval_example(input_tensor, target_tensor, attn_enc, attn_dec, attn_optimizer, criterion, teacher_forcing=0.5):
    encoder_hidden = attn_enc.init_hidden()
    attn_optimizer.zero_grad()
    loss = 0
    
    input_length = input_tensor.size(0)
    encoder_outputs = torch.zeros(MAX_LENGTH, attn_enc.hidden_dim, device=device)
    
    for ei in range(input_length):
        encoder_output, encoder_hidden = attn_enc(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0,0]
        
    dec_input = torch.tensor([[SOS_token]],device=device)
    dec_hidden = encoder_hidden
    
    
    
    for di in range(target_tensor.size(0)):
        dec_input, dec_hidden, dec_attention = attn_dec(dec_input, dec_hidden, encoder_outputs)
        # for nll output = [batchsize, number of classes (words)] and target = correct class
        loss += criterion(dec_input,target_tensor[di])
        
        if random.random()<teacher_forcing:
            dec_input = target_tensor[di] #teacher forcing
        else:
            topv, topi = dec_input.topk(1)
            dec_input = topi.squeeze().detach()
            if dec_input.item() == EOS_token:
                break
    return loss

def attn_train_step(input_tensor, target_tensor, attn_enc, attn_dec, attn_optimizer, criterion):
    loss = attn_eval_example(input_tensor, target_tensor, attn_enc, attn_dec, attn_optimizer, criterion, teacher_forcing=True)
    loss.backward()
    attn_optimizer.step()
    return loss.item()/ target_tensor.size(0)

def attn_test_evaluate(test_set):
    
    for i, pair in enumerate(test_set):
        input_tensor = pair[0]
        target_tensor = pair[1]
        loss_total = 0
        with torch.no_grad():
            loss = attn_eval_example(input_tensor, target_tensor, attn_enc, attn_dec, attn_optimizer, criterion)
            loss_total += loss.item()/ target_tensor.size(0)
        return loss_total

def attn_train(training_set, test_set, epochs, print_every):
    set_size = len(training_set)
    start_time = datetime.now()

    for e in range(epochs):
        for i, pair in enumerate(training_set):
            input_tensor = pair[0]
            target_tensor = pair[1]
            loss_total =0
            loss = attn_train_step(input_tensor, target_tensor, attn_enc, attn_dec, attn_optimizer, criterion)
            loss_total += loss

            if ((e)*set_size + (i+1))%print_every ==0:
                timestamp = f"trainging time : {timedelta_string(datetime.now() - start_time)}"
                example_count = f"examples {((e)*set_size + (i+1)):8}"
                eval_loss = attn_test_evaluate(test_set)
                print (f'epoch {e:3} | {example_count} | loss:{loss_total:5.2f} | test loss:{eval_loss:5.2f} | {timestamp}')
                loss_total = 0

    
    

In [30]:
def attn_random_infer():
    dummy_pair = random.choice(sentence_pairs)
    dummy_in = sentence2tensor(fra_lang, dummy_pair[0])
    dummy_out = sentence2tensor(eng_lang, dummy_pair[1])
    enc_h0 = attn_enc.init_hidden()
    h = enc_h0
    
    enc_outputs = torch.zeros(MAX_LENGTH, attn_enc.hidden_dim, device=device)
    for i in range(dummy_in.size(0)):
        o, h = attn_enc(dummy_in[i], h)
        enc_outputs[i] = o[0,0]
    enc_h = h

    outs = []
    hidden = enc_h
    output = torch.tensor([[SOS_token]], device=device)
    for i in range(dummy_out.size(0)*2):
        output, hidden, attention_weights = attn_dec(output, hidden, enc_outputs)

        top_value, top_index = output.topk(1)  
        output = top_index.squeeze().detach()
        outs.append(output)
        if output.item()==EOS_token:
            break

    print(dummy_pair[0])
    print(dummy_pair[1])
    print(' '.join([eng_lang.index2word[o.item()] for o in outs]))

In [31]:
attn_random_infer()

je suis toujours prudente .
i m always careful .
definitely superior stuck rugby rugby rugby faith nearsighted japanese mentally willed refugees


In [36]:
attn_train(training_set, test_set, 10, 1000)

> <ipython-input-35-30662a3e9a48>(9)attn_eval_example()
-> for ei in range(input_length):


(Pdb)  input_length


8


(Pdb)  encoder_outputs.size()


torch.Size([10, 256])
--KeyboardInterrupt--


(Pdb)  q
(Pdb)  q


BdbQuit: 