In [None]:
# Install the dependencies used by this notebook: torchtext pinned to 0.6
# (the imports below use Field / BucketIterator / TabularDataset from
# torchtext.data), spaCy, and the Italian spaCy model.
# NOTE(review): spaCy is not pinned here; the short model name `it` is
# presumably only accepted by spaCy 2.x -- confirm against the installed version.
!pip install torchtext==0.6
!pip install spacy
!python -m spacy download it


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext.data import Field, BucketIterator, TabularDataset
import numpy as np
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
import random
from torch.utils.tensorboard import SummaryWriter
from torchtext.data.metrics import bleu_score
import sys
from spacy.tokenizer import Tokenizer
from spacy.lang.ca import Catalan


In [None]:
print("downloading data...")
!wget -O ca-it.txt.zip https://opus.nlpl.eu/download.php?f=GlobalVoices/v2018q4/moses/ca-it.txt.zip
!unzip ca-it.txt.zip

print("splitting data...")
src_data = []
tgt_data = []

with open("GlobalVoices.ca-it.ca") as f:
    src_data = f.readlines()

with open("GlobalVoices.ca-it.it") as f:
    tgt_data = f.readlines()

raw_data = {'Catalan': [line for line in src_data[:]],'Italian': [line for line in tgt_data[:]]}
df = pd.DataFrame(raw_data, columns = ['Catalan', 'Italian'])


train, test = train_test_split(df, test_size = 0.2)
train.to_json('train.json', orient='records', lines=True)
test.to_json('test.json', orient='records', lines=True)

train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)



In [None]:


# Only the rule-based tokenizers are used below, so a blank Italian pipeline is
# enough: it needs no downloaded model and does not depend on the legacy `it`
# shortcut link. This mirrors how the Catalan tokenizer is obtained.
spacy_it = spacy.blank('it')

cat = Catalan()
spacy_cat = cat.tokenizer

def tokenizer_cat(text):
    """Tokenize Catalan ``text`` with the module-level ``spacy_cat`` tokenizer.

    Returns a list with the text of each token, in order.
    """
    doc = spacy_cat(text)
    return [word.text for word in doc]

def tokenizer_it(text):
    """Tokenize Italian ``text`` with the tokenizer of the module-level ``spacy_it``.

    Returns a list with the text of each token, in order.
    """
    doc = spacy_it.tokenizer(text)
    return [word.text for word in doc]

# Both fields add explicit <sos>/<eos> markers to every sequence. The rest of
# the notebook relies on them: Seq2Seq.forward feeds target[0] to the decoder as
# the start symbol, and translate_sentence looks up "<sos>"/"<eos>" in the
# Italian vocabulary. Without these arguments the tokens never enter the
# vocabulary and every lookup silently falls back to <unk>.
catalan = Field(sequential=True, use_vocab=True, tokenize=tokenizer_cat, lower=True,
                init_token='<sos>', eos_token='<eos>')
italian = Field(sequential=True, use_vocab=True, tokenize=tokenizer_it, lower=True,
                init_token='<sos>', eos_token='<eos>')

# Map the JSON keys to the attribute names used on each example/batch
# (`.cat` for the Catalan source, `.it` for the Italian target).
fields = {'Catalan': ('cat', catalan), 'Italian': ('it', italian)}

train_data, test_data = TabularDataset.splits(
    path='',
    train='train.json',
    test='test.json',
    format='json',
    fields=fields
)

# Vocabularies are built from the training split only.
catalan.build_vocab(train_data, max_size=10000, min_freq=2)
italian.build_vocab(train_data, max_size=10000, min_freq=2)

In [None]:
# Quick sanity check: number of training examples and the tokenized content of
# the first one. vars() exposes the example's fields; printing the example
# object itself only shows its default object repr.
print(len(train_data))
print(vars(train_data[0]))

In [None]:
class Encoder(nn.Module):
    """LSTM encoder that summarizes a source sequence into its final states.

    Args:
        input_size: number of rows of the embedding table (source vocabulary size).
        embedding_size: dimensionality of the token embeddings.
        hidden_size: number of features in the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, applied to the embeddings and between LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Attribute names and creation order are kept so state_dict keys and
        # seeded parameter initialization are unchanged.
        self.dropout = nn.Dropout(p=p)
        self.embedding = nn.Embedding(num_embeddings=input_size,
                                      embedding_dim=embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )

    def forward(self, x):
        """Encode ``x`` and return the LSTM's final ``(hidden, cell)`` states.

        ``x`` holds token indices in the LSTM's default (sequence-first)
        layout. The per-step outputs of the LSTM are discarded.
        """
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state
        return hidden, cell

class Decoder(nn.Module):
    """Single-step LSTM decoder that predicts the next target token.

    Args:
        input_size: number of rows of the embedding table (target vocabulary size).
        embedding_size: dimensionality of the token embeddings.
        hidden_size: number of features in the LSTM hidden state.
        output_size: number of scores produced per step by the final linear layer.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, applied to the embeddings and between LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Attribute names and creation order are kept so state_dict keys and
        # seeded parameter initialization are unchanged.
        self.dropout = nn.Dropout(p=p)
        self.embedding = nn.Embedding(num_embeddings=input_size,
                                      embedding_dim=embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden, cell):
        """Run one decoding step.

        Args:
            x: token indices of shape (N,), one previous token per batch element.
            hidden, cell: LSTM states carried over from the previous step
                (or from the encoder for the first step).

        Returns:
            ``(predictions, hidden, cell)`` where predictions has shape
            (N, output_size) and hidden/cell are the updated LSTM states.
        """
        # Add a length-1 sequence dimension: (N,) -> (1, N).
        step_input = x.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_input))

        # step_output: (1, N, hidden_size)
        step_output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # (1, N, output_size) -> (N, output_size)
        scores = self.fc(step_output)
        predictions = scores.squeeze(0)

        return predictions, hidden, cell

In [None]:

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: module mapping a source tensor to ``(hidden, cell)``.
        decoder: module mapping ``(token, hidden, cell)`` to
            ``(predictions, hidden, cell)``; its final linear layer must be
            available as ``decoder.fc`` (used to size the output tensor).
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Return per-step vocabulary scores for ``target``.

        Args:
            source: source token indices, shape (source_len, N).
            target: target token indices, shape (target_len, N); row 0 is fed
                to the decoder as the first input.
            teacher_force_ratio: probability, drawn once per step, of feeding
                the ground-truth token instead of the model's own prediction.

        Returns:
            Tensor of shape (target_len, N, vocab_size). Row 0 is left as
            zeros because no prediction is made for the first target position.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Take the vocabulary size from the decoder and the device from the
        # input, so the module does not depend on notebook-level globals.
        target_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(target_len, batch_size, target_vocab_size,
                              device=source.device)

        hidden, cell = self.encoder(source)

        # First decoder input is the first target row (the start token).
        x = target[0]
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            best_guess = output.argmax(1)

            # Teacher forcing: pick the ground truth or the prediction as next input.
            x = target[t] if random.random() < teacher_force_ratio else best_guess
        return outputs



In [None]:
def save_checkpoint(state, filename='my_check.ptr.tar'):
    """Announce the save on stdout, then serialize ``state`` with ``torch.save``.

    Args:
        state: object to persist (the notebook passes a dict holding the model
            and optimizer state dicts).
        filename: destination passed straight through to ``torch.save``.
    """
    print("saving checkpointz")
    torch.save(obj=state, f=filename)

def load_checkpoint(checkpoint, model=None, optimizer=None):
    """Restore model and optimizer state from a checkpoint dict.

    Args:
        checkpoint: dict with the keys ``'state_dict'`` and ``'optimizer'``.
        model: module to restore; when omitted, the notebook-level ``model``
            is used (the previous one-argument behavior).
        optimizer: optimizer to restore; when omitted, the notebook-level
            ``optimizer`` is used.

    Raises:
        KeyError: if an argument is omitted and no global of that name exists,
            or if ``checkpoint`` lacks one of the expected keys.
    """
    print("loading checkpoint...")
    # The parameters shadow the globals of the same name, so the fallback has
    # to go through globals() explicitly.
    if model is None:
        model = globals()["model"]
    if optimizer is None:
        optimizer = globals()["optimizer"]
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])

In [None]:
def translate_sentence(model, sentence, catalan, italian, device, max_length=50):
    """Greedily translate one Catalan sentence into Italian tokens.

    Args:
        model: object exposing ``encoder(tensor) -> (hidden, cell)`` and
            ``decoder(token, hidden, cell) -> (scores, hidden, cell)``.
        sentence: either a raw string (tokenized here with the Catalan spaCy
            tokenizer) or an already tokenized list of strings.
        catalan: source field; its ``init_token``, ``eos_token`` and
            ``vocab.stoi`` are used to build the input indices.
        italian: target field; its ``vocab.stoi`` / ``vocab.itos`` are used to
            seed decoding with "<sos>", detect "<eos>" and map indices back.
        device: device the input tensors are moved to.
        max_length: maximum number of decoding steps.

    Returns:
        List of predicted target tokens without the start token. The final
        element is "<eos>" when the model emitted it before ``max_length``.
    """
    if isinstance(sentence, str):
        # The tokenizer is only built when a raw string actually needs it;
        # pre-tokenized input (as passed by bleu) skips this entirely.
        tokenizer = Catalan().tokenizer
        tokens = [token.text.lower() for token in tokenizer(sentence)]
    else:
        # Lower-case to match the vocabulary, which was built with lower=True.
        tokens = [token.lower() for token in sentence]

    # Wrap with the source field's start/end markers and map to indices.
    tokens = [catalan.init_token] + tokens + [catalan.eos_token]
    text_to_indices = [catalan.vocab.stoi[token] for token in tokens]

    # Shape (source_len, 1): a batch containing this single sentence.
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_index = italian.vocab.stoi["<eos>"]
    outputs = [italian.vocab.stoi["<sos>"]]

    with torch.no_grad():
        hidden, cell = model.encoder(sentence_tensor)

        for _ in range(max_length):
            previous_word = torch.LongTensor([outputs[-1]]).to(device)
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()
            outputs.append(best_guess)

            # Model predicts it's the end of the sentence.
            if best_guess == eos_index:
                break

    # Skip the leading start token when mapping indices back to strings.
    return [italian.vocab.itos[idx] for idx in outputs[1:]]


def bleu(data, model, catalan, italian, device):
    """Compute the corpus BLEU score of ``model`` over ``data``.

    Args:
        data: iterable of examples exposing tokenized ``cat`` (source) and
            ``it`` (reference) attributes.
        model: translation model forwarded to ``translate_sentence``.
        catalan, italian: source and target fields forwarded to
            ``translate_sentence``.
        device: device forwarded to ``translate_sentence``.

    Returns:
        The value returned by ``bleu_score`` for all predictions against one
        reference each.
    """
    eos_token = italian.eos_token or "<eos>"
    targets = []
    outputs = []

    for example in data:
        example_fields = vars(example)
        source_tokens = example_fields["cat"]
        reference_tokens = example_fields["it"]

        prediction = translate_sentence(model, source_tokens, catalan, italian, device)
        # Strip the end marker only when it is actually there: if decoding hit
        # max_length without emitting it, the last token is a real word and
        # must be kept.
        if prediction and prediction[-1] == eos_token:
            prediction = prediction[:-1]

        # bleu_score expects a list of references per candidate.
        targets.append([reference_tokens])
        outputs.append(prediction)

    return bleu_score(outputs, targets)

In [None]:
# --- Training hyperparameters -------------------------------------------------
num_epochs = 20
learning_rate = 0.001
batch_size = 32

# --- Model hyperparameters ----------------------------------------------------
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_encoder = len(catalan.vocab)
input_size_decoder = len(italian.vocab)
# NOTE: this is a plain Python variable and has no effect on CUDA; to get
# synchronous kernel launches the CUDA_LAUNCH_BLOCKING environment variable has
# to be set before CUDA is initialized.
CUDA_LAUNCH_BLOCKING=1
output_size = len(italian.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# --- Logging and data iterators -----------------------------------------------
writer = SummaryWriter('runs/Loss_plot')
step = 0
# sort_key lets BucketIterator group sentences of similar source length, which
# keeps padding small; it is also required for the (sorted) test iterator.
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda example: len(example.cat),
    device=device
)

# --- Model, loss and optimizer ------------------------------------------------
encoder_net = Encoder(input_size_encoder, encoder_embedding_size,
                      hidden_size, num_layers, enc_dropout).to(device)

# The decoder embeds *target* tokens, so its embedding table must be sized with
# the Italian vocabulary; sizing it with the Catalan vocabulary causes
# out-of-range index errors whenever the Italian vocabulary is larger.
decoder_net = Decoder(input_size_decoder, decoder_embedding_size,
                      hidden_size, output_size, num_layers, dec_dropout).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
# Padding positions in the target do not contribute to the loss.
pad_idx = italian.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

if load_model:
    # Same file name save_checkpoint writes by default; load_checkpoint restores
    # into the notebook-level `model` and `optimizer`.
    load_checkpoint(torch.load('my_check.ptr.tar', map_location=device))

# Catalan sample sentence translated at the start of every epoch as a quick
# qualitative check (the model's source language is Catalan).
sentence = "La llibertat d'expressió és un dret fonamental."

# Main training loop: each epoch saves a checkpoint, prints a sample
# translation, then runs one pass over the training iterator.
for epoch in range(num_epochs):
    print(f'Epoch [{epoch}/{num_epochs}]')

    checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
    save_checkpoint(checkpoint)

    # Dropout must be disabled while producing the sample translation.
    model.eval()
    translated_sentence = translate_sentence(model, sentence, catalan, italian, device, max_length=50)
    print(f'translated sentence \n {translated_sentence}')

    model.train()

    for batch in train_iterator:
        inp_data = batch.cat.to(device)
        target = batch.it.to(device)

        # output: (target_len, N, vocab_size)
        output = model(inp_data, target)

        # Drop position 0 (the start token, for which no prediction is made)
        # and flatten so CrossEntropyLoss sees (target_len - 1) * N rows.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)
        loss.backward()

        # Clip gradients to avoid exploding gradients; the in-place
        # clip_grad_norm_ replaces the deprecated clip_grad_norm.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()

        # Log a plain float rather than the loss tensor.
        writer.add_scalar('Training Loss', loss.item(), global_step=step)
        step += 1




