# 2019-10-22_fundamentallearning_pytorchWordRNN_translation

### TODO
- follow this tutorial https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
- use reloading on training loop https://github.com/julvo/reloading

In [1]:
import os
import re
import random
import requests
import unicodedata
import torch.nn as nn
import torch
import torch.optim as optim
import itertools

Get french english translations from https://www.manythings.org/anki/fra-eng.zip and put them in a folder `data/pytorch_tutorial/fra-eng/fra.txt`


In [2]:
# start and end tokens
SOS_token = 0
EOS_token = 1

class Lang:
    """Helper class to manage index <=> word translation"""
    def __init__(self,name):
        self.name = name
        self.word2index = {}
        self.index2word = {SOS_token:'SOS', EOS_token:'EOS'}
        self.word2count = {} # for rare words
        self.n_words = 2
        
    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.word2count[word] = 1
            self.n_words += 1
        else:
            self.word2count[word] += 1
    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            self.add_word(word)

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD',s)
        if unicodedata.category(c) != 'Mn'
    
    )

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    # adds a space before end of sentance punctuation
    s = re.sub(r"([.!?])", r" \1", s) 
    #replaces all other punctuation or unusual characters with spaces
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)     
    return s
    
def sentence2indexlist(lang, s):
    return [lang.word2index[w] for w in s.split(' ')]

def sentence2tensor(lang,s):
    indexes = sentence2indexlist(lang, s)
    indexes.append(EOS_token)
    return torch.tensor(indexes ,dtype=torch.long).view(-1,1)

def pair2tensors(l1,l2,pair):
    input_tensor = sentence2tensor(l1,pair[0])
    target_tensor = sentence2tensor(l2,pair[1])
    return (input_tensor, target_tensor)

## Load and process data

filter for short sentences that start with 'i am', 'he is' etc

In [3]:
MAX_ENG_LENGTH = 10
ENG_PREFIXES = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def check_pair(p):
    return len(p[0].split(' ')) < MAX_ENG_LENGTH and p[0].startswith(ENG_PREFIXES)
def filter_pairs(pairs):
    # file is english french, so must reverse when filtering
    return [p[::-1] for p in pairs if check_pair(p)]
    
    

In [4]:
def prepare_data(lang1, lang2):
    print('Loading data')
    with open(f'data/pytorch_tutorial/{lang1}-{lang2}/fra.txt') as f:
        pairs = [[ normalizeString(s) for s in line.split('\t')[:2]] for line in f]
    print('filtering data')
    pairs = filter_pairs(pairs)
    print(random.choice(pairs))
    l1 = Lang(lang1)
    l2 = Lang(lang2)
    
    for p in pairs:
        l1.add_sentence(p[0])
        l2.add_sentence(p[1])
    print(f'{l1.name} : {l1.n_words}')
    print(f'{l2.name} : {l2.n_words}')
    return l1, l2, pairs

In [5]:
fra_lang, eng_lang, sentence_pairs = prepare_data('fra', 'eng')

Loading data
filtering data
['si j ai offusque quelqu un je m en excuse .', 'i m sorry if i offended anyone .']
fra : 4982
eng : 3231


In [6]:
print(sentence_pairs[0])
pair2tensors(fra_lang, eng_lang, sentence_pairs[0])

['j ai ans .', 'i m .']


(tensor([[2],
         [3],
         [4],
         [5],
         [1]]), tensor([[2],
         [3],
         [4],
         [1]]))

In [7]:
test = "I'm a cat"
sentence2tensor(eng_lang,normalizeString(test))
# sentence2indexlist(eng_lang,normalizeString(test))

tensor([[  2],
        [  3],
        [ 49],
        [786],
        [  1]])

## Model 

In [8]:
class EncoderRNN(nn.Module):
    def __init__(self, inout_dim, hidden_dim=128):
        super(EncoderRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(inout_dim, hidden_dim)
        self.encoder = nn.GRU(hidden_dim, hidden_dim)
        
    def forward(self,x,hidden):
        embedding = self.embedding(x).view(1,1,-1)
        output, hidden = self.encoder(embedding,hidden)
        return output,hidden
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_dim)
    
class DecoderRNN(nn.Module):
    def __init__(self, inout_dim, hidden_dim=128):
        super(DecoderRNN,self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(inout_dim, hidden_dim)
        self.encoder = nn.GRU(hidden_dim, hidden_dim)
        self.hidden2out = nn.Linear(hidden_dim, inout_dim)
        self.softmax = nn.LogSoftmax(dim =1)
        
    def forward(self, x, hidden):
        embedding = self.embedding(x).view(1,1,-1)
        output, hidden = self.encoder(embedding, hidden)
        output = self.softmax(self.hidden2out(output[0]))
        
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_dim)
    
    
        

In [9]:
enc = EncoderRNN(fra_lang.n_words)
dec = DecoderRNN(eng_lang.n_words)

### Dummy pass into model

In [10]:
dummy_pair = random.choice(sentence_pairs)
dummy_in = sentence2tensor(fra_lang, dummy_pair[0])
dummy_out = sentence2tensor(eng_lang, dummy_pair[1])
enc_h0 = enc.init_hidden()
h = enc_h0

In [11]:
dummy_pair


['nous sommes realistes .', 'we re realistic .']

In [12]:
for i in range(dummy_in.size(0)):
    _, h = enc(dummy_in[i],h)
enc_h = h

In [13]:
outs = []
hidden = enc_h
output = torch.tensor([[SOS_token]])
for i in range(dummy_out.size(0)):
    output, h = dec(output,hidden)

    top_value, top_index = output.topk(1)  
    output = top_index.squeeze().detach()
    outs.append(output)

In [14]:
outs

[tensor(2141), tensor(2097), tensor(1097), tensor(732), tensor(1238)]

In [None]:
sequence

## training model

In [15]:
def train_step(input_tensor, target_tensor, enc, dec, optimizer, criterion):
    encoder_hidden = enc.init_hidden()
    optimizer.zero_grad()
    loss = 0
    
    target_length = input_tensor.size(0)
    
    for ei in range(target_length):
        _, encoder_hidden = enc(input_tensor[ei], encoder_hidden)
        
    decoder_input = torch.tensor([[SOS_token]])
    decoder_hidden = encoder_hidden
    
    for di in range(target_tensor.size(0)):
        decoder_input, decoder_hidden = dec(decoder_input, decoder_hidden)
        
        # for nll output = [batchsize, number of classes (words)] and target = correct class
        loss += criterion(decoder_input,target_tensor[di])
        decoder_input = target_tensor[di] #teacher forcing
    loss.backward()
    optimizer.step()
    return loss.item()/ target_length


In [16]:
criterion = nn.NLLLoss()
optimizer = optim.Adam(itertools.chain(enc.parameters(),dec.parameters()))

In [30]:
def train(epochs, n_iters, print_every):
    training_set = [pair2tensors(fra_lang,eng_lang, random.choice(sentence_pairs)) for i in range(n_iters)]

    for e in range(epochs):
        for i, pair in enumerate(training_set):
            input_tensor = pair[0]
            target_tensor = pair[1]
            loss_total =0
            loss_total += train_step(input_tensor, target_tensor, enc, dec, optimizer, criterion)

        if e%print_every ==0:
            print (f'epoch{e:3} loss:{loss_total:5.2f}')
            loss_total = 0

    
    

In [31]:
train(10, 100,1)

epoch  0 loss: 2.75
epoch  1 loss: 1.85
epoch  2 loss: 1.43
epoch  3 loss: 1.11
epoch  4 loss: 0.88
epoch  5 loss: 0.73
epoch  6 loss: 0.63
epoch  7 loss: 0.56
epoch  8 loss: 0.50
epoch  9 loss: 0.45
