<a href="https://colab.research.google.com/github/johncoder-30/NLP-translation-model/blob/main/seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Seq2Seq model for language translation from Tamil to English
Seq2Seq is an encoder-decoder approach to machine translation that maps an input sequence to an output sequence.


## Downloading the datasets

In [None]:
# Alternative data source (disabled): the en-ta archive from the samanantar-public bucket.
# Uncomment these lines to download, unpack and read it into english_raw / tamil_raw.
# !wget https://storage.googleapis.com/samanantar-public/V0.2/data/en2indic/en-ta.zip
# !unzip "/content/en-ta.zip" -d "/content/data/"
# english_raw = open('/content/data/en-ta/train.en', 'r',encoding='utf8').read().split('\n')
# tamil_raw = open('/content/data/en-ta/train.ta', 'r', encoding='utf8').read().split('\n')

In [None]:
!git clone https://github.com/Ishikahooda/Tamil-English-Dataset.git
english_raw =open('/content/Tamil-English-Dataset/Dataset/data.en1', 'r',encoding='utf8').read().split('\n')
tamil_raw =  open('/content/Tamil-English-Dataset/Dataset/data.ta1', 'r',encoding='utf8').read().split('\n')
print(len(english_raw),len(tamil_raw))

Cloning into 'Tamil-English-Dataset'...
remote: Enumerating objects: 39, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 39 (delta 11), reused 36 (delta 11), pack-reused 0[K
Unpacking objects: 100% (39/39), done.
50001 50001


## Importing libraries

In [None]:
import torchtext
from torchtext.legacy.data import Field, BucketIterator, TabularDataset
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import torch
import torch.nn as nn
import torch.optim as optim
import random
import os

# Preprocessing the dataset
Converting the data into CSV format: each row holds an English sentence and its Tamil translation.

In [None]:

# Only the first 1000 sentence pairs are used.
raw_data = {
    'English': list(english_raw[:1000]),
    'Tamil': list(tamil_raw[:1000]),
}
df = pd.DataFrame(raw_data, columns=['English', 'Tamil'])

# shuffle=False keeps file order: the last 5% of the rows form the test split.
train, test = train_test_split(df, test_size=0.05, shuffle=False)

# Write both splits to disk as CSV files without the index column.
for split_frame, csv_name in ((train, 'train.csv'), (test, 'test.csv')):
    split_frame.to_csv(csv_name, index=False)
df.head()

Unnamed: 0,English,Tamil
0,"moreover all the vessels , which king ahaz in ...",ராஜாவாகிய ஆகாஸ் அரசாளும்போது தம்முடைய பாதகத்தி...
1,similar conditions will be imposed if the sri ...,சர்வதேச நாணய நிதியம் இலங்கைக்கு கடன் வழங்கினால...
2,now kornelius argues the opposite instead of e...,தற்போது அதற்கு எதிராக வாதாடுகிறார் சர்வதேச சட...
3,chrysler the third largest us automaker filed ...,அமெரிக்காவின் மூன்றாம் பெரிய கார் தயாரிப்பு நி...
4,"moreover , khan has been in exile in iran for ...",மேலும் இனைவிட்டு தலிபானால் வெளியேற்றப்பட்ட 199...


In [None]:
# spacy_eng = spacy.load('en_core_web_sm')
def tokenize_eng(sentence):
    """Split an English sentence into lower-case word tokens.

    Newline characters are deleted, the text is lower-cased, and every
    character that is not a word character, whitespace or an apostrophe is
    removed. The remainder is split on whitespace.

    Args:
        sentence: the raw English text.

    Returns:
        A list of token strings (empty for an empty sentence).
    """
    flattened = re.sub(r'\n', '', sentence)
    stripped = re.sub(r'[^\w\s\']', '', flattened.lower())
    return stripped.split()


# Quick check of the tokenizer on a sample sentence.
print(tokenize_eng('She says she knows what is going on, but can do nothing about it.'))

def tokenize_tam(sentence):
    """Split a Tamil sentence into whitespace-separated tokens.

    Newline characters and parenthesised spans are removed first, then every
    apostrophe, double quote and full stop is deleted so that this punctuation
    does not stay attached to the neighbouring word.

    Args:
        sentence: the raw Tamil text.

    Returns:
        A list of token strings (empty for an empty sentence).
    """
    sentence = re.sub(r'\n', '', sentence)
    sentence = re.sub(r'\([^)]*\)', '', sentence)
    # A character class is needed here: the pattern r'\'\"\.' matches only the
    # literal three-character sequence '". and left single marks in place.
    sentence = re.sub(r'[\'\".]', '', sentence)
    return sentence.split()


# Quick check of the tokenizer on a sample sentence.
print(tokenize_tam('என்ன நடக்கிறது என்பது தமக்கு தெரியும் என்றும் ஆனால், தம்மால் எதுவும் செய்யமுடியாது என்றும் கடிதம் எழுதியிருந்தார்.'))

['she', 'says', 'she', 'knows', 'what', 'is', 'going', 'on', 'but', 'can', 'do', 'nothing', 'about', 'it']
['என்ன', 'நடக்கிறது', 'என்பது', 'தமக்கு', 'தெரியும்', 'என்றும்', 'ஆனால்,', 'தம்மால்', 'எதுவும்', 'செய்யமுடியாது', 'என்றும்', 'கடிதம்', 'எழுதியிருந்தார்.']


## Using the torchtext library to
> 1. tokenize sentences,
> 2. build the vocabularies,
> 3. split the data into batches for training on the GPU.

In [None]:
# One Field per language: each wraps its token list in <sos>/<eos> and uses the
# matching tokenizer. batch_first=True yields tensors shaped (batch, seq_len).
english = Field(init_token='<sos>', eos_token='<eos>', tokenize=tokenize_eng, lower=True, batch_first=True)
tamil = Field(init_token='<sos>', eos_token='<eos>', tokenize=tokenize_tam, lower=False, batch_first=True)

# CSV column name -> (attribute name on each example/batch, Field that processes it)
fields = {'English': ('eng', english), 'Tamil': ('tam', tamil)}
train_data, test_data = TabularDataset.splits(path='', train='train.csv', test='test.csv', format='csv', fields=fields)

# Vocabularies are built from the training split only.
english.build_vocab(train_data, max_size=10000, min_freq=2)
tamil.build_vocab(train_data, max_size=10000, min_freq=2)

# Use the GPU when one is available and fall back to the CPU otherwise, rather
# than failing on a hard-coded 'cuda'.
iterator_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data),
    batch_size=32,
    device=iterator_device,
    sort_key=lambda x: len(x.tam),  # sort on Tamil sentence length
    sort_within_batch=True,
)

# Encoder

In [None]:
class Encoder(nn.Module):  # input->Tamil sentences
    """Embeds a batch of token ids and runs it through a multi-layer LSTM.

    Only the final hidden and cell states are returned; the per-step LSTM
    outputs are discarded.

    Args:
        input_size: number of rows in the embedding table (source vocabulary size).
        hidden_dim: LSTM hidden size.
        embedding_size: width of each embedding vector.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, applied to the embeddings and between LSTM layers.
    """

    def __init__(self, input_size, hidden_dim, embedding_size, num_layers, p):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dropout = nn.Dropout(p)
        self.lstm = nn.LSTM(embedding_size, hidden_dim, num_layers=num_layers, dropout=p, batch_first=True)

    def forward(self, x):
        """Encode a batch of token ids.

        Args:
            x: integer tensor of shape (N, seq_len).

        Returns:
            (hidden, cell), each of shape (num_layers, N, hidden_dim).
        """
        # (N, seq_len) -> (N, seq_len, embedding_size)
        embedded = self.dropout(self.embedding(x))
        batch_size = x.shape[0]
        # Zero initial states allocated with the embeddings' device and dtype,
        # so the module runs wherever its input lives instead of only on 'cuda'.
        hid = embedded.new_zeros(self.num_layers, batch_size, self.hidden_size)
        cel = embedded.new_zeros(self.num_layers, batch_size, self.hidden_size)
        # The per-step output (N, seq_len, hidden_dim) is not needed.
        _, (hidden, cell) = self.lstm(embedded, (hid, cel))
        return hidden, cell


# Decoder

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder: one token id per sequence in, vocabulary scores out.

    Args:
        input_size: number of rows in the embedding table (target vocabulary size).
        embedding_size: width of each embedding vector.
        hidden_dim: LSTM hidden size.
        output_size: number of scores produced per step.
        num_layers: number of stacked LSTM layers.
        p: dropout probability, applied to the embeddings and between LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_dim, output_size, num_layers, p):
        super().__init__()
        self.hidden_size, self.num_layers = hidden_dim, num_layers

        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=p,
            batch_first=True,
        )
        self.linear = nn.Linear(hidden_dim, output_size)

    def forward(self, x, hidden, cell):
        """Advance the decoder by one time step.

        Args:
            x: token ids, one per sequence in the batch.
            hidden: LSTM hidden state carried over from the previous step.
            cell: LSTM cell state carried over from the previous step.

        Returns:
            (prediction, hidden, cell), where prediction holds the scores over
            the output vocabulary for this step.
        """
        # Insert a length-one sequence axis for the batch_first LSTM.
        step_input = x.unsqueeze(1)
        embedded = self.dropout(self.embedding(step_input))
        lstm_out, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        # Drop the length-one sequence axis again before the projection.
        scores = self.linear(lstm_out.squeeze(1))
        return scores, hidden, cell


# Seq2Seq

In [None]:
class Seq2seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: module whose forward(src) returns (hidden, cell).
        decoder: module whose forward(x, hidden, cell) returns
            (prediction, hidden, cell) and which exposes its output projection
            as `decoder.linear`.
        device: device on which the output tensor is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super(Seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_force_ratio=0.6):
        """Compute decoder scores for every target position.

        Args:
            src: source token ids, shape (N, src_len).
            trg: target token ids, shape (N, trg_len); trg[:, 0] is the first
                decoder input.
            teacher_force_ratio: probability, drawn independently per step, of
                feeding the ground-truth token instead of the model's own
                argmax prediction as the next input.

        Returns:
            Tensor of shape (N, trg_len, vocab_size). Position 0 is never
            written and stays all zeros.
        """
        batch_size = src.shape[0]
        target_len = trg.shape[1]
        # Take the output width from the decoder's own projection layer rather
        # than from the module-level `english` vocabulary, so the model does
        # not depend on notebook globals.
        target_vocab_size = self.decoder.linear.out_features
        outputs = torch.zeros(batch_size, target_len, target_vocab_size, device=self.device)
        hidden, cell = self.encoder(src)
        x = trg[:, 0]
        for i in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[:, i, :] = output
            best_guess = output.argmax(1)
            x = trg[:, i] if random.random() < teacher_force_ratio else best_guess
        return outputs

# Hyperparameters for the model

In [None]:
# training
num_epochs = 175
learning_rate = 0.003
# Use the GPU when one is available; otherwise run on the CPU instead of failing.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# model dimensions: the encoder reads Tamil, the decoder reads and emits English
input_size_encoder = len(tamil.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 256
decoder_embedding_size = 256
hidden_size = 512
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5
batch_size = 32
print(input_size_encoder,input_size_decoder)

1665 1845


In [None]:
# Put every module on the configured `device` rather than on a literal 'cuda',
# so the model follows the device chosen in the hyperparameter cell.
encoder_net = Encoder(input_size=input_size_encoder, embedding_size=encoder_embedding_size, hidden_dim=hidden_size,
                      num_layers=num_layers, p=enc_dropout).to(device)
decoder_net = Decoder(input_size=input_size_decoder, embedding_size=decoder_embedding_size, hidden_dim=hidden_size,
                      output_size=output_size, num_layers=num_layers, p=dec_dropout).to(device)
model = Seq2seq(encoder_net, decoder_net, device).to(device)

# Target positions holding the padding index are excluded from the loss.
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Test function to check the model's output during training

In [None]:
def test_model(
    tam_sentence='என்ன நடக்கிறது என்பது தமக்கு தெரியும் என்றும் ஆனால், தம்மால் எதுவும் செய்யமுடியாது என்றும் கடிதம் எழுதியிருந்தார்.',
    eng_sentence='She says she knows what is going on, but can do nothing about it.',
):
    """Print the model's translation of one Tamil sentence.

    The model is switched to eval mode and run with teacher forcing disabled.
    The English reference therefore only supplies the first decoder input and
    the number of decoding steps.

    Args:
        tam_sentence: raw Tamil source sentence, without <sos>/<eos> markers.
        eng_sentence: raw English reference sentence, without <sos>/<eos> markers.
    """
    model.eval()
    # Build the inputs on whatever device the model's parameters live on.
    model_device = next(model.parameters()).device

    def encode(field, tokens):
        # Add the start/end markers after tokenizing. tokenize_eng strips '<'
        # and '>', so markers written into the raw text would be looked up as
        # 'sos'/'eos' instead of the special tokens.
        wrapped = [field.init_token] + tokens + [field.eos_token]
        ids = [field.vocab.stoi[token] for token in wrapped]
        return torch.tensor(ids, dtype=torch.long, device=model_device).reshape(1, -1)

    tam_sen = encode(tamil, tokenize_tam(tam_sentence))
    eng_sen = encode(english, tokenize_eng(eng_sentence))

    with torch.no_grad():
        out = model(tam_sen, eng_sen, 0)
    # Skip position 0: the model never fills it, so its argmax is always
    # index 0 and would print a spurious leading token.
    predicted_ids = out[0, 1:].argmax(1)
    for idx in predicted_ids:
        if int(idx) != pad_idx:
            print(english.vocab.itos[int(idx)], end=' ')
    print('\n')
# test_model()

# Training the Seq2seq model

In [None]:
for epoch in range(1, num_epochs + 1):
    model.train()
    for batch in train_iterator:
        src = batch.tam.to(device)
        tar = batch.eng.to(device)

        logits = model(src, tar)
        vocab_size = logits.shape[2]
        # Drop position 0 on both sides (the model never predicts it), then
        # flatten batch and time into one axis for the loss.
        flat_logits = logits[:, 1:, :].reshape(-1, vocab_size)
        flat_targets = tar[:, 1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        optimizer.step()

    if epoch % 5 != 0:
        continue
    # Every fifth epoch: report the loss of the last batch and print a sample translation.
    print('epoch:', epoch, 'loss:', loss.item())
    test_model()

epoch: 5 loss: 5.149782180786133
<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <eos> <eos> <eos> 

epoch: 10 loss: 3.724816083908081
<unk> the <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <eos> <eos> <eos> <eos> 

epoch: 15 loss: 4.5774030685424805
<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <eos> <unk> <eos> <eos> 

epoch: 20 loss: 3.644441604614258
<unk> the <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <eos> <eos> <eos> 

epoch: 25 loss: 4.279374122619629
<unk> i 'm <unk> <unk> <unk> <unk> <unk> and <unk> and <unk> <unk> <eos> <eos> <eos> 

epoch: 30 loss: 4.296356678009033
<unk> i have done judgment and justice leave the <unk> of the lord there was none 

epoch: 35 loss: 3.81123685836792
<unk> i 'm looking for the sources of information and the <unk> because he might <unk> 

epoch: 40 loss: 3.011389970779419
<unk> i fell to pay for the area and <unk> <unk> <unk> to <unk> <eos> <eos> 

epo

# Saving the trained model to Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
!ls '/content/gdrive/My Drive/pytorch_models'

#save model
model_save_name = 'seq2seq_model.pt'
path = F"/content/gdrive/My Drive/pytorch_models/{model_save_name}" 
# torch.save(model.state_dict(), path)
torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss.item(),
                }, path)

Mounted at /content/gdrive
seq2seq_model.pt


# Loading the saved model from Google Drive

In [None]:
#load model from g_drive
# NOTE: encoder_net and decoder_net are the module objects created earlier, so
# the model rebuilt here shares their parameters, which load_state_dict overwrites.
model = Seq2seq(encoder_net, decoder_net, device).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

model_save_name = 'seq2seq_model.pt'
path = F"/content/gdrive/My Drive/pytorch_models/{model_save_name}"
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can also be restored on a CPU-only runtime.
checkpoint = torch.load(path, map_location=device)
# The vocabularies built in this session must have the same sizes as when the
# checkpoint was written; otherwise load_state_dict raises a size mismatch.
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']  # stored as a plain float, not a tensor

model.eval()
# - or -
# model.train()

Seq2seq(
  (encoder): Encoder(
    (embedding): Embedding(1648, 256)
    (dropout): Dropout(p=0.5, inplace=False)
    (lstm): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.5)
  )
  (decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(1849, 256)
    (lstm): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.5)
    (linear): Linear(in_features=512, out_features=1849, bias=True)
  )
)