In [20]:
!python -m spacy download fr

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter

Collecting fr_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.2.5/fr_core_news_sm-2.2.5.tar.gz (14.7MB)
[K     |████████████████████████████████| 14.7MB 23.1MB/s 
Building wheels for collected packages: fr-core-news-sm
  Building wheel for fr-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for fr-core-news-sm: filename=fr_core_news_sm-2.2.5-cp36-none-any.whl size=14727027 sha256=afbcfd6250fde6e6c574cdb3646fbec9a87643de83d3f36a1f8e3ab216ef82fd
  Stored in directory: /tmp/pip-ephem-wheel-cache-35_su425/wheels/46/1b/e6/29b020e3f9420a24c3f463343afe5136aaaf955dbc9e46dfc5
Successfully built fr-core-news-sm
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/fr_core_news_sm -->
/usr/local

In [30]:
!ls .data/multi30k


mmt_task1_test2016.tar.gz  test2016.fr	training.tar.gz  validation.tar.gz
test2016.de		   train.de	val.de
test2016.en		   train.en	val.en


In [35]:
nlp_de = spacy.load('de')
nlp_eng = spacy.load('en')

def tokenizer_de(text):
    return [tok.text for tok in nlp_de.tokenizer(text)]


def tokenizer_eng(text):
    return [tok.text for tok in nlp_eng.tokenizer(text)]


german = Field(tokenize = tokenizer_de, lower=True, init_token ='<sos>', eos_token = '<eos>')

english = Field(tokenize = tokenizer_eng, lower=True, init_token ='<sos>', eos_token = '<eos>')

train_data, validation_data, test_data = Multi30k.splits(exts = ('.de', '.en'), fields = (german, english))

german.build_vocab(train_data, max_size = 10000, min_freq = 2) 
english.build_vocab(train_data, max_size = 10000, min_freq = 2) 

In [11]:
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size = 3,
    sort_within_batch = True,
    sort_key = lambda x: len(x.src))

In [16]:
x = next(iter(train_iterator))
print(x.src)

tensor([[   2,    2,    2],
        [ 105,   18,   18],
        [  41,   30,   41],
        [  52,    7,   53],
        [  27,  486,   10],
        [   6, 3534,  307],
        [3487,   59,  127],
        [  10,    6,  317],
        [ 928,    0, 3093],
        [  75,  292, 3542],
        [   4,    4,    4],
        [   3,    3,    3]])


In [17]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout = p)

    def forward(self, x):
        # c_shape (seq_length, batch_size, )
        embedding = self.dropout(self.embedding(x))
        # embedding (seq_length, batch_size, embedding_size)
        outputs, (hidden, cell) = self.rnn(embedding)

        return hidden, cell



class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc = nn.Linear(hidden_size, output_size)

    
    def forward(self, x, hidden, cell):
        
        # shape of x (batch_size) but we want (1,batch_size) 
        x = x.unsqueeze(0)
        embedding = self.dropout(self.embedding(x))
        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # shape of outputs (1, N, hidden_size)

        predictions = self.fc(outputs)
        # shape 1, N, length of vocab

        predictions = predictions.squeeze(0)

        return predictions, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder


    def forward(self, source, target, teacher_force_ratio = 0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = len(english.vocab)

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        hidden, cell = self.encoder(source)
        
    
        #grab start toker
        x = target[0]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)

            outputs[t] = output

            # (N, english_vocab_size)
            best_guess = output.argmax(1)
            x = target[t] if random.random() < teacher_force_ratio else best_guess


        return outputs


In [62]:

### Training

# training hyperparameters

num_epochs = 5
learning_rate = 0.001
batch_size = 32

# Model hyperparameters
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 1
enc_dropout = 0.5
dec_dropout = 0.5

#Tensor board
writer = SummaryWriter()
step = 0

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size = batch_size,
    sort_within_batch = True,
    sort_key = lambda x: len(x.src),
    device = device
)

encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout).to(device)

decoder_net = Decoder(input_size_decoder, decoder_embedding_size, hidden_size, output_size, num_layers, dec_dropout).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)

pad_idx = english.vocab.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)


  "num_layers={}".format(dropout, num_layers))


In [63]:
import tensorflow as tf

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# saving stuff

v_inp_data, v_tgt_data = next(iter(valid_iterator)).src,  next(iter(valid_iterator)).trg

i = 0

for epoch in range(100):
    print(f'Epoch {epoch} / {num_epochs}')

    checkpoint = {'state_dict':model.state_dict(), 'optimizer':optimizer.state_dict()}

    # save_checkpoint(checkpoint)
    a = translate_sentence(model, 'Der Himmel ist heute klar. Die Sonne scheint.', german, english, device)
    print(a)
    for batch_idx, batch in enumerate(train_iterator):
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        output = model(inp_data, target)
        #output shape (tgt_len ,b_size, output_dim)

        if i == 0:
            writer.add_graph(model.encoder, inp_data)
            i += 1

        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1)
        optimizer.step()

        writer.add_scalar('/Training Loss/Tr_Loss', loss, global_step = step)

        #val loss
        v_output = model(v_inp_data, v_tgt_data)
        #output shape (tgt_len ,b_size, output_dim)

        v_output = v_output[1:].reshape(-1, v_output.shape[2])
        v_target = v_tgt_data[1:].reshape(-1)

        val_loss = criterion(v_output, v_target)
        writer.add_scalar('/Validation Loss/Val_loss', val_loss, global_step = step)

        step += 1

writer.close()

Epoch 0 / 5
['attaching', 'attaching', 'spilled', 'taking', 'fixing', '100', 'attaching', 'shelves', 'sprinkles', 'completing', 'smoking', 'competition', 'balding', 'someones', 'showgirl', 'image', 'arranged', 'satchel', 'much', 'wheat', 'sing', 'organized', 'kind', 'ballerinas', 'ballerinas', 'bloom', 'repairing', 'waits', 'swans', 'taste', 'studies', 'lollipop', 'observed', 'caricature', 'cream', 'ribs', 'ribs', 'tattoos', 'operated', 'posed', 'close', 'lots', 'cue', 'weave', 'dirty', 'milk', 'monitor', 'carpenter', 'products', 'gesture']
Epoch 1 / 5
['the', '<unk>', 'is', '<unk>', 'the', 'the', 'the', '<unk>', '.', '<eos>']
Epoch 2 / 5
['the', '<unk>', '<unk>', 'is', 'the', 'the', 'the', 'the', '.', '.', '<eos>']
Epoch 3 / 5
['the', '<unk>', 'is', 'the', 'the', 'the', 'the', 'the', '.', '.', '<eos>']
Epoch 4 / 5
['the', '<unk>', "'s", '<unk>', 'is', 'the', 'the', 'the', 'the', '.', '.', '<eos>']
Epoch 5 / 5
['the', '<unk>', 'is', 'is', 'the', 'the', 'of', 'the', 'red', 'and', 'white

KeyboardInterrupt: ignored

In [43]:
def translate_sentence(model, sentence, german, english, device, max_length=50):
    # Load german tokenizer
    spacy_ger = spacy.load("de")

    # Create tokens using spacy and everything in lower case (which is what our vocab is)
    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # print(tokens)

    # sys.exit()
    # Add <SOS> and <EOS> in beginning and end respectively
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # Go through each german token and convert to an index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Convert to Tensor
    sentence_tensor = torch.tensor(text_to_indices).unsqueeze(1).to(device)

    # Build encoder hidden, cell state
    with torch.no_grad():
        hidden, cell = model.encoder(sentence_tensor)

    outputs = [english.vocab.stoi["<sos>"]]

    for _ in range(max_length):
        previous_word = torch.tensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if output.argmax(1).item() == english.vocab.stoi["<eos>"]:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]

    # remove start token
    return translated_sentence[1:]

In [71]:
translate_sentence(model, 'Germany', german, english, device)

['school', 'children', 'going', 'on', 'a', 'ground', '.', '<eos>']

In [None]:
%load_ext tensorboard
%tensorboard --logdir runs