In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import unicodedata
import re
import matplotlib.pyplot as plt

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torchtext.data.metrics import bleu_score

from sklearn.model_selection import train_test_split

import numpy as np

import random
import math
import time
import pickle

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def _tokenize_english(line):
    """Turn one raw English line into a reversed token list wrapped in <sos>/<eos>."""
    line = re.sub('[()]', '', line)
    # Surround punctuation with spaces so each mark becomes its own token.
    line = re.sub(r"([?.!,¿;।])", r" \1 ", line)
    line = re.sub(r'[" "]+', " ", line)
    # Everything that is not a Latin letter or kept punctuation collapses to one space.
    line = re.sub(r"[^a-zA-Z?.!,¿']+", " ", line)
    tokens = line.lower().strip().split(' ')
    # The source sentence is stored in reverse word order.
    tokens.reverse()
    return ['<sos>'] + tokens + ['<eos>']


def _tokenize_hindi(line):
    """Turn one raw Hindi line into a token list wrapped in <sos>/<eos>."""
    line = re.sub('[()]', '', line)
    line = re.sub(r"([?.!,¿;।])", r" \1 ", line)
    line = re.sub(r'[" "]+', " ", line)
    line = line.strip()
    # Drop one leading dash (and the blank after it, if any). `startswith`
    # is safe on an empty string, unlike indexing `line[0]`.
    if line.startswith('-'):
        line = line[1:].strip()
    return ('<sos> ' + line + ' <eos>').split(' ')


def extract_data(en_path='english_sampled.txt', hi_path='hindi_sampled.txt'):
    """Read and tokenise the parallel English/Hindi corpus.

    Both files are read as UTF-8, one sentence per line. Line ``i`` of the
    English file is paired with line ``i`` of the Hindi file, so no line is
    ever skipped: a blank line yields ``['<sos>', '', '<eos>']`` on either
    side, which keeps the two lists aligned.

    Args:
        en_path: path of the English (source) file.
        hi_path: path of the Hindi (target) file.

    Returns:
        ``(en_sentences, hi_sentences, maxen_seqlen, maxhi_seqlen)`` where the
        first two are lists of token lists and the last two are the longest
        token-list lengths (including ``<sos>``/``<eos>``), or 0 for an
        empty file.
    """
    # `with` guarantees the handles are closed even if tokenising raises.
    with open(en_path, encoding='utf-8') as en_file:
        en_sentences = [_tokenize_english(line) for line in en_file]

    with open(hi_path, encoding='utf-8') as hi_file:
        hi_sentences = [_tokenize_hindi(line) for line in hi_file]

    maxen_seqlen = max((len(tokens) for tokens in en_sentences), default=0)
    maxhi_seqlen = max((len(tokens) for tokens in hi_sentences), default=0)

    return en_sentences , hi_sentences , maxen_seqlen , maxhi_seqlen

In [None]:
# Load and tokenise the parallel corpus; the two maxima are the padded
# sequence lengths used when encoding.
src_sen, trg_sen, maxsrc_seqlen, maxtrg_seqlen =  extract_data()
print(maxsrc_seqlen, maxtrg_seqlen)

# Fixed seed so the train/dev/test partition (and therefore the vocabularies
# built from the training part) is identical on every run.
SPLIT_SEED = 42

# 70% train; the held-out 30% is then split 1/3 dev and 2/3 test.
src_train, src_test, trg_train, trg_test = train_test_split(src_sen, trg_sen, test_size=0.30, random_state=SPLIT_SEED)
src_dev, src_test, trg_dev, trg_test = train_test_split(src_test, trg_test, test_size=2/3, random_state=SPLIT_SEED)

In [None]:
def build_vocab(sentences, min_freq):
    """Build token<->id lookup tables from tokenised sentences.

    Ids 0-3 are reserved for ``<pad>``, ``<sos>``, ``<eos>`` and ``<unk>``.
    Every other token whose count is strictly greater than ``min_freq``
    receives the next free id, in order of first appearance.

    Args:
        sentences: iterable of token lists.
        min_freq: a token is kept only if it occurs MORE than this many times.

    Returns:
        ``(word2id, id2word)`` dictionaries that are inverses of each other.
    """
    # Count occurrences; dict insertion order records first appearance.
    counts = {}
    for tokens in sentences:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1

    reserved = ["<pad>", "<sos>", "<eos>", "<unk>"]
    id2word = {index: token for index, token in enumerate(reserved)}
    word2id = {token: index for index, token in enumerate(reserved)}

    next_id = len(reserved)
    for token, count in counts.items():
        # Reserved tokens seen in the data keep their fixed ids.
        if count > min_freq and token not in word2id:
            word2id[token] = next_id
            id2word[next_id] = token
            next_id += 1

    return word2id, id2word

In [None]:
def encode_sentences(sentences, max_len, word2id):
    """Map token lists to id lists, right-padded with 0 up to ``max_len``.

    Tokens missing from ``word2id`` are encoded as 3 (the ``<unk>`` id).
    Sentences already longer than ``max_len`` are returned unpadded and
    untruncated.

    Args:
        sentences: iterable of token lists.
        max_len: target length after padding.
        word2id: token -> id dictionary.

    Returns:
        List of id lists, one per input sentence.
    """
    unk_id = 3  # index corresponding to <unk>
    pad_id = 0  # index corresponding to <pad>

    result = []
    for tokens in sentences:
        ids = [word2id.get(token, unk_id) for token in tokens]
        # A negative multiplier yields an empty list, so long sentences are left alone.
        ids.extend([pad_id] * (max_len - len(ids)))
        result.append(ids)

    return result

In [None]:
def decode_sentences(sentence, id2word):
    """Convert a list of token ids back to words.

    Ids 0, 1 and 2 (``<pad>``, ``<sos>``, ``<eos>``) are dropped; every other
    id is looked up in ``id2word``.
    """
    skipped = (0, 1, 2)
    return [id2word[token_id] for token_id in sentence if token_id not in skipped]

In [None]:
# Vocabularies are built from the training split only. With min_freq=1,
# build_vocab keeps tokens whose count is strictly greater than 1, so tokens
# seen exactly once are left out of the vocabulary.
src_word2id, src_id2word = build_vocab(src_train, 1)
trg_word2id, trg_id2word = build_vocab(trg_train, 1)

In [None]:
# Encode every source split with the source vocabulary, padding to the
# longest source sentence found in the whole corpus.
enc_src_train = encode_sentences(src_train, maxsrc_seqlen, src_word2id)
enc_src_test = encode_sentences(src_test, maxsrc_seqlen, src_word2id)
enc_src_dev = encode_sentences(src_dev, maxsrc_seqlen, src_word2id)

In [None]:
# Encode every target split with the target vocabulary, padding to the
# longest target sentence found in the whole corpus.
enc_trg_train = encode_sentences(trg_train, maxtrg_seqlen, trg_word2id)
enc_trg_test = encode_sentences(trg_test, maxtrg_seqlen, trg_word2id)
enc_trg_dev = encode_sentences(trg_dev, maxtrg_seqlen, trg_word2id)

In [None]:
# decode_sentences(enc_src_train[4] ,src_id2word)
# decode_sentences(enc_trg_train[4] ,trg_id2word)

In [None]:
BATCH_SIZE = 32


def create_data_loader(data):
    """Wrap equal-length id lists in a ``DataLoader`` of ``BATCH_SIZE`` rows.

    Args:
        data: nested list of ints; every row must have the same length so it
            can be converted to a single tensor.

    Returns:
        A ``DataLoader`` over a one-tensor ``TensorDataset``; each batch is a
        1-tuple whose only element is the batch tensor. Iteration order
        follows ``data`` (no shuffling is requested).
    """
    dataset = TensorDataset(torch.tensor(data))
    return DataLoader(dataset, batch_size=BATCH_SIZE)

In [None]:
# One loader per source split. The loaders do not shuffle, so batch i here
# lines up with batch i of the matching target loader.
enc_src_train_dl = create_data_loader(enc_src_train)
enc_src_test_dl = create_data_loader(enc_src_test)
enc_src_dev_dl = create_data_loader(enc_src_dev)

In [None]:
# One loader per target split, iterated in lockstep with the source loaders.
enc_trg_train_dl = create_data_loader(enc_trg_train)
enc_trg_test_dl = create_data_loader(enc_trg_test)
enc_trg_dev_dl = create_data_loader(enc_trg_dev)

In [None]:
# Model and training hyper-parameters.
# Vocabulary sizes (including the four reserved tokens) set the embedding
# table sizes and the width of the decoder's output layer.
INPUT_DIM = len(src_word2id)
OUTPUT_DIM = len(trg_word2id)
# Embedding widths for encoder and decoder.
ENC_EMB_DIM = 250
DEC_EMB_DIM = 250
# LSTM hidden size and depth; shared by encoder and decoder because the
# encoder's final state is used as the decoder's initial state.
HID_DIM = 256
NUM_LAYERS = 1
# Dropout probabilities handed to the encoder and decoder.
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
# Number of passes over the training set.
N_EPOCHS = 25

In [None]:
class Encoder(nn.Module):
    """LSTM encoder: embeds source token ids and returns the final LSTM state.

    Args:
        input_dim: size of the source vocabulary.
        emb_dim: embedding width.
        hid_dim: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        p: dropout probability applied to the embeddings and, when there is
            more than one layer, between LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, num_layers, p):
        super().__init__()

        self.enc_dropout = nn.Dropout(p)
        self.enc_embedding = nn.Embedding(input_dim, emb_dim)
        # nn.LSTM only applies dropout between stacked layers; with a single
        # layer a non-zero value has no effect and just emits a warning.
        lstm_dropout = p if num_layers > 1 else 0.0
        self.enc_LSTM = nn.LSTM(emb_dim, hid_dim, num_layers, dropout = lstm_dropout)

        self.reset_parameters()

    def reset_parameters(self):
        """Draw every weight and bias from U(-0.08, 0.08)."""
        nn.init.uniform_(self.enc_embedding.weight, -0.08, 0.08)
        # Walk all LSTM parameters so layers beyond the first are covered too.
        for param in self.enc_LSTM.parameters():
            nn.init.uniform_(param, -0.08, 0.08)

    def forward(self, src):
        """Encode a batch of source sequences.

        Args:
            src: integer tensor of token ids. The LSTM is built with the
                default ``batch_first=False``, so it is read as
                (seq_len, batch).

        Returns:
            ``(h, c)``: the final hidden and cell states of the LSTM.
        """
        embedded = self.enc_dropout(self.enc_embedding(src))
        _, (h, c) = self.enc_LSTM(embedded)

        return h, c

In [None]:
class Decoder(nn.Module):
    """LSTM decoder: predicts vocabulary scores for the next target token.

    Args:
        output_dim: size of the target vocabulary (width of the output layer).
        emb_dim: embedding width.
        hid_dim: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        p: dropout probability applied to the embeddings and, when there is
            more than one layer, between LSTM layers.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, num_layers, p):
        super().__init__()

        self.dec_dropout = nn.Dropout(p)
        self.dec_embedding = nn.Embedding(output_dim, emb_dim)
        # nn.LSTM only applies dropout between stacked layers; with a single
        # layer a non-zero value has no effect and just emits a warning.
        lstm_dropout = p if num_layers > 1 else 0.0
        self.dec_LSTM = nn.LSTM(emb_dim, hid_dim, num_layers, dropout = lstm_dropout)
        self.fc_out = nn.Linear(hid_dim, output_dim)

        self.reset_parameters()

    def reset_parameters(self):
        """Draw every weight and bias from U(-0.08, 0.08)."""
        nn.init.uniform_(self.dec_embedding.weight, -0.08, 0.08)
        # Walk all LSTM parameters so layers beyond the first are covered too.
        for param in self.dec_LSTM.parameters():
            nn.init.uniform_(param, -0.08, 0.08)
        nn.init.uniform_(self.fc_out.weight, -0.08, 0.08)
        nn.init.uniform_(self.fc_out.bias, -0.08, 0.08)

    def forward(self, input, hidden, cell):
        """Run one decoding step.

        Args:
            input: integer tensor of token ids with a leading length-1
                sequence axis (the caller in this file passes
                ``ids.unsqueeze(0)``).
            hidden: LSTM hidden state from the previous step or the encoder.
            cell: LSTM cell state from the previous step or the encoder.

        Returns:
            ``(prediction, h, c)``: vocabulary scores from the output layer
            and the updated LSTM state.
        """
        embedded = self.dec_dropout(self.dec_embedding(input))
        o, (h, c) = self.dec_LSTM(embedded, (hidden, cell))
        # Remove the length-1 sequence axis before the output projection.
        prediction = self.fc_out(o.squeeze(0))

        return prediction, h, c

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one target position at a time.

    Args:
        encoder: module whose ``forward(src)`` returns ``(hidden, cell)``.
        decoder: module exposing an ``fc_out`` linear layer and whose
            ``forward(input, hidden, cell)`` returns
            ``(prediction, hidden, cell)``.
        device: device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src_batch, trg_batch, teacher_forcing_ratio = 0.5):
        """Decode ``trg_len - 1`` steps conditioned on ``src_batch``.

        Args:
            src_batch: source token ids, passed straight to the encoder.
            trg_batch: target token ids of shape (trg_len, batch); row 0 is
                used as the first decoder input.
            teacher_forcing_ratio: probability, drawn once per step, of
                feeding the gold token instead of the model's own prediction
                as the next input. Use 0 for evaluation.

        Returns:
            ``(outputs, answer)``: ``outputs`` is a
            (trg_len, batch, vocab) tensor of scores whose row 0 stays zero;
            ``answer`` is a list with one entry per decoded step, each a list
            of the predicted token id for every batch element.
        """
        trg_len, batch_size = trg_batch.shape
        # Take the vocabulary size from the decoder itself rather than from a
        # notebook-level global, so the module works with any vocabulary.
        trg_vocab_size = self.decoder.fc_out.out_features

        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=self.device)
        hidden, cell = self.encoder(src_batch)

        # First decoder input is row 0 of the target.
        decoder_input = trg_batch[0, :]

        answer = list()

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(decoder_input.unsqueeze(0), hidden, cell)
            outputs[t] = output

            best_guess = output.argmax(1)
            answer.append(best_guess.tolist())

            if random.random() < teacher_forcing_ratio:
                decoder_input = trg_batch[t]
            else:
                decoder_input = best_guess

        return outputs , answer

In [None]:
# Encoder and decoder share HID_DIM and NUM_LAYERS so the encoder's final
# state can be handed directly to the decoder.
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, NUM_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, NUM_LAYERS, DEC_DROPOUT)

# Move all parameters to the selected device.
model = Seq2Seq(enc, dec, device).to(device)

In [None]:
# Adam with its default settings.
optimizer = optim.Adam(model.parameters())
''' Ignoring Padding index '''
# Target positions whose id is 0 (<pad>) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = 0) 

In [None]:
def train(model, src_iterator, trg_iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: callable as ``model(src_batch, trg_batch)`` returning
            ``(scores, predictions)``.
        src_iterator: loader yielding 1-tuples holding a source batch.
        trg_iterator: loader yielding 1-tuples holding the matching target
            batch; iterated in lockstep with ``src_iterator``.
        optimizer: optimizer stepped once per batch.
        criterion: loss applied to flattened scores and flattened targets.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the batch losses divided by ``len(src_iterator)``.
    """
    model.train()

    running_loss = 0

    for src_items, trg_items in zip(src_iterator, trg_iterator):

        # Swap the first two axes of each batch before handing it to the model.
        src_batch = torch.transpose(src_items[0].to(device), 0, 1)
        trg_batch = torch.transpose(trg_items[0].to(device), 0, 1)

        optimizer.zero_grad()

        scores, _ = model(src_batch, trg_batch)

        # Row 0 of the scores is never filled by the model, so it is dropped
        # together with row 0 of the target before flattening.
        scores = scores[1:].view(-1, scores.shape[-1])
        gold = trg_batch.contiguous()[1:].view(-1)

        loss = criterion(scores, gold)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(src_iterator)

In [None]:
def evaluate(model, src_iterator, trg_iterator, criterion):
    """Score the model on a data split without updating it.

    Teacher forcing is switched off, so the decoder only ever sees its own
    predictions; gold target tokens are used solely to compute the loss and
    are returned for later comparison.

    Args:
        model: callable as
            ``model(src_batch, trg_batch, teacher_forcing_ratio=...)``
            returning ``(scores, predictions)``.
        src_iterator: loader yielding 1-tuples holding a source batch.
        trg_iterator: loader yielding 1-tuples holding the matching target
            batch; iterated in lockstep with ``src_iterator``.
        criterion: loss applied to flattened scores and flattened targets.

    Returns:
        ``(mean_loss, targets, answers)``: the summed batch losses divided by
        ``len(src_iterator)``, the target batches as nested lists (after the
        axis swap), and the per-batch predictions returned by the model.
    """
    model.eval()

    epoch_loss = 0

    targets = []
    answers = []

    with torch.no_grad():

        for batch_src, batch_trg in zip(src_iterator, trg_iterator):

            # Swap the first two axes of each batch before handing it to the model.
            src_batch = torch.transpose(batch_src[0].to(device), 0, 1)
            trg_batch = torch.transpose(batch_trg[0].to(device), 0, 1)

            targets.append(trg_batch.tolist())

            # teacher_forcing_ratio=0: never feed gold tokens to the decoder,
            # otherwise the loss and the predictions are contaminated by the
            # reference translation.
            output, answer = model(src_batch, trg_batch, teacher_forcing_ratio=0)
            answers.append(answer)

            # Row 0 of the scores is never filled by the model; drop it and
            # the matching target row before flattening.
            output = output[1:].view(-1, output.shape[-1])
            trg_flat = trg_batch.contiguous()[1:].view(-1)

            loss = criterion(output, trg_flat)

            epoch_loss += loss.item()

    return epoch_loss / len(src_iterator), targets, answers

In [None]:
# Lowest validation loss seen so far; starts at +inf so the first epoch
# always writes a checkpoint.
best_valid_loss = float('inf')

# Per-epoch loss history.
train_losses = list()
val_losses = list()

for epoch in range(N_EPOCHS):

    # The final argument of train() is the gradient-norm clipping threshold.
    train_loss = train(model, enc_src_train_dl, enc_trg_train_dl, optimizer, criterion, 1)
    valid_loss, targets, answers = evaluate(model, enc_src_dev_dl, enc_trg_dev_dl , criterion)
    
    val_losses.append(valid_loss)
    train_losses.append(train_loss)
    
    # Keep only the weights from the epoch with the best validation loss.
    if valid_loss < best_valid_loss:
        torch.save(model.state_dict(), 'best-model-1.pt')
        best_valid_loss = valid_loss
        
    
    print('Epoch no.:', epoch+1)
    print('Train Loss:',round(train_loss,3))
    print('Val. Loss:', round(valid_loss,3))
    print()

In [None]:
def transpose(l1):
    """Return the transpose of a list of rows as a new list of lists.

    The number of output rows equals the length of the first input row.
    An empty input yields an empty list.

    Args:
        l1: list of equally long sequences.

    Returns:
        A list whose i-th element collects item ``i`` of every row of ``l1``.
    """
    # Nothing to transpose; also avoids indexing l1[0] on an empty list.
    if not l1:
        return []

    return [[row[col] for row in l1] for col in range(len(l1[0]))]

In [None]:
def calculate_blue_scores(targets, predicted):
    """Decode predictions and targets, print BLEU@1..4 and return both corpora.

    Args:
        targets: list of target batches as nested lists, each laid out as
            (sequence position, batch element).
        predicted: list of prediction batches in the same layout.

    Returns:
        ``(candidate_corpus, reference_corpus)``: two parallel lists of token
        lists, one entry per sentence. ``reference_corpus`` is the flat list
        (one reference per sentence), not the nested form given to
        ``bleu_score``.
    """
    eos_id = 2  # id that build_vocab assigns to <eos>

    candidate_corpus = list()
    reference_corpus = list()

    for batch in predicted:
        for encoded_sentence in transpose(batch):
            # Decoding always runs for the full target length, so anything
            # produced after the first <eos> is not part of the translation.
            if eos_id in encoded_sentence:
                encoded_sentence = encoded_sentence[:encoded_sentence.index(eos_id)]
            candidate_corpus.append(decode_sentences(encoded_sentence, trg_id2word))

    for batch in targets:
        for encoded_sentence in transpose(batch):
            reference_corpus.append(decode_sentences(encoded_sentence, trg_id2word))

    # bleu_score expects, for every candidate, a LIST of reference
    # translations. Passing the flat token list would make it treat each
    # word as a separate reference and compare character n-grams.
    nested_references = [[reference] for reference in reference_corpus]

    for n in range(1, 5):
        # Uniform weights that sum to exactly 1 for every n-gram order.
        weights = [1.0 / n] * n
        score = bleu_score(candidate_corpus, nested_references, max_n=n, weights=weights)
        print("BLUE@%d Score:" % n, round(score, 3))

    return candidate_corpus , reference_corpus

In [None]:
def generate_graphs():
    """Plot the per-epoch training and validation loss curves.

    Reads the notebook-level ``train_losses`` and ``val_losses`` lists filled
    by the training loop; epochs are numbered from 1.
    """
    train_epochs = list(range(1, len(train_losses) + 1))
    val_epochs = list(range(1, len(val_losses) + 1))

    plt.plot(train_epochs, train_losses, label='Train loss')
    plt.plot(val_epochs, val_losses, label='Validation loss')

    # Label the figure so it can be read without the surrounding code.
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Loss per epoch')
    plt.legend()
    plt.show()

In [None]:
def print_translated_sentences(candidate_corpus , reference_corpus, output_path="translated_sent1.txt"):
    """Print each prediction next to its reference and save the pairs to a file.

    Args:
        candidate_corpus: list of predicted token lists.
        reference_corpus: list of reference token lists; must have at least
            as many entries as ``candidate_corpus``.
        output_path: file to (over)write, one line per sentence pair.
    """
    # Write UTF-8 explicitly: the text is Hindi and the platform default
    # encoding may not be able to represent it. `with` closes the file even
    # if a write fails.
    with open(output_path, "w", encoding="utf-8") as f:
        for i in range(len(candidate_corpus)):
            predicted_text = " ".join(candidate_corpus[i])
            actual_text = " ".join(reference_corpus[i])

            print("Predicted: ", predicted_text, ", Actual:", actual_text)

            f.write("Predicted: " + predicted_text + "    Actual: " + actual_text + '\n')

In [None]:
# Restore the checkpoint with the best validation loss. map_location lets a
# checkpoint saved on a GPU be loaded on a CPU-only machine (and vice versa).
model.load_state_dict(torch.load('best-model-1.pt', map_location=device))

test_loss, targets, predicted = evaluate(model, enc_src_test_dl, enc_trg_test_dl, criterion)

print('Test Loss:',round(test_loss,3))

# BLEU on the test split, loss curves, and a dump of the translations.
candidate_corpus , reference_corpus = calculate_blue_scores(targets, predicted)
generate_graphs()
print_translated_sentences(candidate_corpus , reference_corpus)