In [1]:
!python -m spacy download de
!pip install torchtext==0.6.0

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter

Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 7.8MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907057 sha256=16329ebaaa42fe864e9b05f347df2f0b1a3dea0beffc3a21eca8b4af65f683e8
  Stored in directory: /tmp/pip-ephem-wheel-cache-zmmahwf6/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/

In [None]:
# List every entry (hidden ones included) of the multi30k/ directory.
!ls -a multi30k/


.			   test2016.de	train.de	 val.de
..			   test2016.en	train.en	 val.en
mmt_task1_test2016.tar.gz  test2016.fr	training.tar.gz  validation.tar.gz


In [2]:
# spaCy pipelines for German and English, loaded in that order; their
# tokenizers back tokenizer_de / tokenizer_eng.
nlp_de, nlp_eng = (spacy.load(lang_code) for lang_code in ('de', 'en'))

def tokenizer_de(text):
    """Split ``text`` with the German spaCy tokenizer and return the token strings."""
    pieces = []
    for token in nlp_de.tokenizer(text):
        pieces.append(token.text)
    return pieces


def tokenizer_eng(text):
    """Split ``text`` with the English spaCy tokenizer and return the token strings."""
    pieces = []
    for token in nlp_eng.tokenizer(text):
        pieces.append(token.text)
    return pieces


# Both fields lowercase their text and wrap every sequence in <sos> ... <eos>.
_field_options = dict(lower=True, init_token='<sos>', eos_token='<eos>')
german = Field(tokenize=tokenizer_de, **_field_options)
english = Field(tokenize=tokenizer_eng, **_field_options)

# The '.de' files are bound to the `german` field and the '.en' files to `english`.
train_data, validation_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(german, english),
)

# Vocabularies are built from the training split only, German first.
for _field in (german, english):
    _field.build_vocab(train_data, max_size=10000, min_freq=2)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:02<00:00, 508kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 175kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 167kB/s]


In [None]:
# Peek at the source tensor of one training batch.
# NOTE(review): train_iterator is created further down (BucketIterator.splits),
# so that cell must already have been executed for this one to run.
x = next(iter(train_iterator))
print(x.src)

tensor([[   2,    2,    2],
        [ 105,   18,   18],
        [  41,   30,   41],
        [  52,    7,   53],
        [  27,  486,   10],
        [   6, 3534,  307],
        [3487,   59,  127],
        [  10,    6,  317],
        [ 928,    0, 3093],
        [  75,  292, 3542],
        [   4,    4,    4],
        [   3,    3,    3]])


In [3]:
class Encoder(nn.Module):
    """Embeds a source sequence and runs it through a multi-layer LSTM.

    Only the final LSTM state is returned; the per-step outputs are dropped.

    Args:
        input_size: number of rows of the embedding table (source vocabulary size).
        embedding_size: width of each embedding vector.
        hidden_size: LSTM hidden width.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, applied to the embeddings and between LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=embedding_size,
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )

    def forward(self, x):
        """Encode ``x`` and return the LSTM's final ``(hidden, cell)`` pair.

        ``x`` holds token indices laid out as (seq_length, batch_size), since the
        LSTM is built without ``batch_first``.
        """
        # (seq_length, batch_size) -> (seq_length, batch_size, embedding_size)
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        _, (hidden, cell) = self.rnn(embedded)
        return hidden, cell



class Decoder(nn.Module):
    """Single-step LSTM decoder: one token per sequence in, vocabulary scores out.

    Args:
        input_size: number of rows of the embedding table (target vocabulary size).
        embedding_size: width of each embedding vector.
        hidden_size: LSTM hidden width.
        output_size: number of scores produced per sequence.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, applied to the embeddings and between LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)

        self.embedding = nn.Embedding(
            num_embeddings=input_size,
            embedding_dim=embedding_size,
        )
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden, cell):
        """Advance the decoder by one time step.

        Args:
            x: token indices of shape (batch_size,), one per sequence.
            hidden, cell: LSTM state carried over from the previous step.

        Returns:
            ``(predictions, hidden, cell)`` where ``predictions`` has shape
            (batch_size, output_size) and ``hidden``/``cell`` are the updated state.
        """
        # The LSTM wants a leading time axis: (batch_size,) -> (1, batch_size).
        step_input = x.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_input))

        # rnn_out: (1, batch_size, hidden_size)
        rnn_out, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # (1, batch_size, output_size) -> (batch_size, output_size)
        predictions = self.fc(rnn_out).squeeze(0)
        return predictions, hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing.

    Args:
        encoder: module mapping a source batch to an initial ``(hidden, cell)`` state.
        decoder: module called as ``decoder(x, hidden, cell)`` and returning
            ``(scores, hidden, cell)``; it must expose its output projection as
            ``decoder.fc`` (an ``nn.Linear``), which is used to size the output buffer.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio = 0.5):
        """Run the encoder once, then decode ``target.shape[0] - 1`` steps.

        Args:
            source: source batch, batch on dimension 1.
            target: target batch of shape (target_len, batch_size); row 0 is used
                as the first decoder input.
            teacher_force_ratio: probability, drawn once per step, of feeding the
                ground-truth token instead of the model's own best guess.

        Returns:
            Tensor of shape (target_len, batch_size, vocab_size) holding the decoder
            scores for steps 1..target_len-1; row 0 is left as zeros.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Take the vocabulary size and device from the model and its inputs rather
        # than from notebook globals, so the module works wherever it is used.
        target_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )

        hidden, cell = self.encoder(source)

        # Grab the start token of every sequence.
        x = target[0]

        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output

            # output: (batch_size, vocab_size) -> most likely token per sequence
            best_guess = output.argmax(1)
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs


In [4]:
### Training
# Reproducibility: seed every RNG used below so that weight initialisation,
# dropout and the teacher-forcing draws (random.random) start from a known state.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# training hyperparameters

num_epochs = 100
learning_rate = 0.0001
batch_size = 64

# Model hyperparameters
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 3
enc_dropout = 0.5
dec_dropout = 0.5

# TensorBoard: `step` is the global step used for every scalar that gets logged.
writer = SummaryWriter()
step = 0

# Batches are sorted internally by source length (sort_key / sort_within_batch).
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size = batch_size,
    sort_within_batch = True,
    sort_key = lambda x: len(x.src),
    device = device
)

encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout).to(device)

decoder_net = Decoder(input_size_decoder, decoder_embedding_size, hidden_size, output_size, num_layers, dec_dropout).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)

# Padding positions in the target do not contribute to the loss.
pad_idx = english.vocab.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)


In [5]:
def translate_sentence(model, sentence, german, english, device, max_length=50):
    """Greedily translate one German sentence with ``model``.

    Decoding runs in evaluation mode (dropout disabled) and without gradient
    tracking; the model's previous train/eval mode is restored before returning.

    Args:
        model: object exposing ``encoder`` and ``decoder`` sub-modules.
        sentence: a raw string (tokenised with the spaCy German model) or an
            already tokenised iterable of strings. Tokens are lowercased either way.
        german: source field; provides ``init_token``, ``eos_token`` and ``vocab.stoi``.
        english: target field; provides ``vocab.stoi`` and ``vocab.itos``.
        device: device the input tensors are moved to.
        max_length: maximum number of decoding steps.

    Returns:
        List of predicted target tokens without the leading ``<sos>``. The trailing
        ``<eos>`` is included when the model produced it within ``max_length`` steps.
    """
    if isinstance(sentence, str):
        # The spaCy model is only needed (and only loaded) for raw strings.
        spacy_ger = spacy.load("de")
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <SOS> and <EOS> at the beginning and end respectively.
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # Map every source token to its vocabulary index.
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Shape (seq_len, 1): a batch holding a single sentence.
    sentence_tensor = torch.tensor(text_to_indices).unsqueeze(1).to(device)

    eos_idx = english.vocab.stoi["<eos>"]
    outputs = [english.vocab.stoi["<sos>"]]

    # Inference must not run with dropout active; remember the current mode so
    # calling this in the middle of training does not leave the model in eval mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            hidden, cell = model.encoder(sentence_tensor)

            for _ in range(max_length):
                previous_word = torch.tensor([outputs[-1]]).to(device)
                output, hidden, cell = model.decoder(previous_word, hidden, cell)
                best_guess = output.argmax(1).item()
                outputs.append(best_guess)

                # Model predicts it's the end of the sentence.
                if best_guess == eos_idx:
                    break
    finally:
        model.train(was_training)

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]

    # Remove the start token.
    return translated_sentence[1:]

In [None]:
import tensorflow as tf  # NOTE(review): not referenced in this cell; kept in case other cells rely on it

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# One fixed validation batch, reused at every step so the validation curve is
# comparable across steps. Source and target are taken from the SAME batch.
v_batch = next(iter(valid_iterator))
v_inp_data, v_tgt_data = v_batch.src, v_batch.trg

i = 0  # becomes 1 once the encoder graph has been written to TensorBoard

for epoch in range(num_epochs):
    print(f'Epoch {epoch} / {num_epochs}')

    checkpoint = {'state_dict':model.state_dict(), 'optimizer':optimizer.state_dict()}

    # save_checkpoint(checkpoint)

    # Sample translation as a qualitative progress check; run it without dropout.
    model.eval()
    a = translate_sentence(model, 'Der Himmel ist heute klar. Die Sonne scheint.', german, english, device)
    print(a)

    for batch_idx, batch in enumerate(train_iterator):
        # The validation pass at the end of the previous step leaves the model in
        # eval mode, so switch back before every training step.
        model.train()

        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        output = model(inp_data, target)
        # output shape (tgt_len, b_size, output_dim)

        if i == 0:
            writer.add_graph(model.encoder, inp_data)
            i += 1

        # Drop position 0 (the <sos> slot, never predicted) and flatten time and
        # batch so CrossEntropyLoss sees (N, vocab) scores against (N,) targets.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        loss.backward()

        # Clip the gradient norm before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1)
        optimizer.step()

        writer.add_scalar('/Training Loss/Tr_Loss', loss.item(), global_step = step)

        # Validation loss on the fixed batch: dropout off and no autograd graph.
        # The forward pass still uses the model's default teacher_force_ratio.
        model.eval()
        with torch.no_grad():
            v_output = model(v_inp_data, v_tgt_data)
            # v_output shape (tgt_len, b_size, output_dim)

            v_output = v_output[1:].reshape(-1, v_output.shape[2])
            v_target = v_tgt_data[1:].reshape(-1)

            val_loss = criterion(v_output, v_target)
        writer.add_scalar('/Validation Loss/Val_loss', val_loss.item(), global_step = step)

        step += 1

writer.close()

Epoch 0 / 100
['cow', 'port', 'formation', 'skim', 'skim', 'companion', 'companion', 'skim', 'skim', 'grilling', 'grilling', 'grilling', 'grilling', 'skiing', 'grilling', 'grilling', 'interviewed', 'grilling', 'grilling', 'grilling', 'grilling', 'aquarium', 'aquarium', 'aquarium', 'aquarium', 'aquarium', 'fedora', 'booths', 'slowly', 'skim', 'skim', 'skim', 'aquarium', 'skiing', 'skiing', 'cattle', 'grilling', 'tricks', 'entertainment', 'employees', 'aquarium', 'grinds', 'avoid', 'tricks', 'tricks', 'tricks', 'tricks', 'after', 'structure', 'after']
Epoch 1 / 100
['a', 'group', 'of', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
Epoch 2 / 100
['the', 'dog', 'is', 'a', 'a', 'a', 'a', 'a', '.', '.', '<eos>']
Epoch 3 / 100
['the', 'is', 'is', 'a', 'the', 'the', 'of', 'the', 'the', '.', '.', '<eos>']
Epoch 4 / 100
['the', 'person', 'is', 'is', 'the', 'the', '<unk>', 'is', '.', 'the', '.', '.', '<eos>']
Epoch 5 / 100
['the', '<unk>', 'is', 'the', 'the', 'the', '<unk>', '<unk>', '<unk>', '.', '<eos

In [None]:
def bleu(data, model, german, english, device):
    """Compute the corpus BLEU score of ``model`` over ``data``.

    Every example's ``src`` is translated with ``translate_sentence`` and compared
    against its ``trg`` as the single reference. Translation runs with the model in
    evaluation mode; the previous train/eval mode is restored afterwards.

    Args:
        data: iterable of examples exposing ``src`` and ``trg`` token lists.
        model: the translation model passed through to ``translate_sentence``.
        german: source field, passed through to ``translate_sentence``.
        english: target field; its ``eos_token`` is stripped from predictions.
        device: device passed through to ``translate_sentence``.

    Returns:
        The value returned by ``bleu_score(candidates, references)``.
    """
    references = []
    candidates = []

    was_training = model.training
    model.eval()
    try:
        for example in data:
            prediction = translate_sentence(model, example.src, german, english, device)

            # Only strip a real <eos>: a translation cut off at max_length does
            # not end with one, and its last token is genuine output.
            if prediction and prediction[-1] == english.eos_token:
                prediction = prediction[:-1]

            references.append([example.trg])
            candidates.append(prediction)
    finally:
        model.train(was_training)

    return bleu_score(candidates, references)

In [None]:
# Load the TensorBoard notebook extension and open it on the `runs` directory.
# NOTE(review): SummaryWriter() is created without a log_dir in this notebook;
# presumably its default output lands under ./runs — confirm.
%load_ext tensorboard
%tensorboard --logdir runs