In [30]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import string
import random as r
from utils import load_data

In [9]:
# Read the parallel corpus; the result is unpacked into the English and Telugu sides.
path = 'english_telugu_data.txt'
english_sentences, telugu_sentences = load_data(path)
# Show how many sentences each side holds, one count per line.
print(len(english_sentences), len(telugu_sentences), sep='\n')

155798
155798


In [10]:
# Peek at the first two sentences of each side.
print(english_sentences[:2])
print(telugu_sentences[:2])

['His legs are long.', 'Who taught Tom how to speak French?']
['అతని కాళ్ళు పొడవుగా ఉన్నాయి.', 'టామ్ ఫ్రెంచ్ మాట్లాడటం ఎలా నేర్పించారు?']


In [11]:
# Work with the first 30,000 English sentences only.
english_sentences = english_sentences[0:30_000]
len(english_sentences)

30000

In [12]:
# Keep the same 30,000-sentence prefix on the Telugu side.
telugu_sentences = telugu_sentences[:30_000]
len(telugu_sentences)

30000

In [13]:
def tokenize(sentences):
    """Split sentences into whitespace-delimited words and collect the vocabulary.

    ASCII punctuation (``string.punctuation``) is deleted from each sentence
    before splitting; punctuation outside that set is left in place.

    Args:
        sentences: iterable of strings.

    Returns:
        A tuple ``(tokenized_sentences, vocab)`` where ``tokenized_sentences``
        is a list with one list of words per input sentence and ``vocab`` is
        the list of distinct words in sorted order.
    """
    # Build the deletion table once instead of once per sentence.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    tokenized_sentences = [
        sentence.translate(strip_punctuation).split() for sentence in sentences
    ]
    vocab = {word for words in tokenized_sentences for word in words}

    # Sorting makes the word -> id assignment reproducible; iterating a set of
    # strings can yield a different order on every interpreter start.
    return tokenized_sentences, sorted(vocab)

english_sentences_tokenized, english_vocab = tokenize(english_sentences)
telugu_sentences_tokenized, telugu_vocab = tokenize(telugu_sentences)

# Remove the zero-width non-joiner (U+200C) from the Telugu tokens themselves,
# not only from the vocabulary. Otherwise a word that contains it no longer
# matches its cleaned vocabulary entry and is later encoded as <unk>.
telugu_sentences_tokenized = [
    [word.replace('\u200c', '') for word in sentence]
    for sentence in telugu_sentences_tokenized
]

# Rebuild the Telugu vocabulary from the cleaned tokens. Both vocabularies are
# sorted so that word ids are identical across kernel restarts.
telugu_vocab = sorted({word for sentence in telugu_sentences_tokenized for word in sentence})
english_vocab = sorted(english_vocab)

# Special tokens take ids 0-3 in both vocabularies.
telugu_vocab  = ['<pad>','<unk>','<sos>','<eos>'] + telugu_vocab
english_vocab = ['<pad>','<unk>','<sos>','<eos>'] + english_vocab

print(f' len(english_vocab): {len(english_vocab)}')
print(f' len(telugu_vocab): {len(telugu_vocab)}')

 len(english_vocab): 8141
 len(telugu_vocab): 17372


In [14]:
# Index <-> word lookup tables; ids follow the position of each word in its vocab list.
english_itow = dict(enumerate(english_vocab))
english_wtoi = {word: idx for idx, word in enumerate(english_vocab)}

telugu_itow = dict(enumerate(telugu_vocab))
telugu_wtoi = {word: idx for idx, word in enumerate(telugu_vocab)}

# All four tables of a language should report the same size.
print("english_itow = ", len(english_itow))
print("telugu_itow = ", len(telugu_itow))
print("english_wtoi = ", len(english_wtoi))
print("telugu_wtoi = ", len(telugu_wtoi))





english_itow =  8141
telugu_itow =  17372
english_wtoi =  8141
telugu_wtoi =  17372


In [15]:
def vectorize_input(data, wtoi):
    """Encode tokenized source sentences as lists of token ids.

    Each sentence becomes ``[<sos>, ids..., <eos>]``; words missing from
    ``wtoi`` are encoded with the ``<unk>`` id.

    Args:
        data: iterable of sentences, each an iterable of word strings.
        wtoi: mapping from word to integer id, including the special tokens.

    Returns:
        List of id lists, one per sentence.
    """
    def encode(tokens):
        ids = [wtoi['<sos>']]
        ids.extend(wtoi[tok] if tok in wtoi else wtoi['<unk>'] for tok in tokens)
        ids.append(wtoi['<eos>'])
        return ids

    return [encode(tokens) for tokens in data]

def vectorize_target(data, wtoi):
    """Encode tokenized target sentences in two aligned forms.

    Words missing from ``wtoi`` are encoded with the ``<unk>`` id.

    Args:
        data: iterable of sentences, each an iterable of word strings.
        wtoi: mapping from word to integer id, including the special tokens.

    Returns:
        A tuple ``(targets, shifted)``: ``targets`` holds ``ids + [<eos>]`` and
        ``shifted`` holds ``[<sos>] + ids`` for every sentence, so both lists
        of a pair have the same length.
    """
    targets, shifted = [], []
    for tokens in data:
        ids = [wtoi[tok] if tok in wtoi else wtoi['<unk>'] for tok in tokens]
        targets.append([*ids, wtoi['<eos>']])
        shifted.append([wtoi['<sos>'], *ids])
    return targets, shifted

In [16]:
# Encode both sides of the corpus as token ids; Y_shifted is the <sos>-prefixed decoder input.
X = vectorize_input(english_sentences_tokenized, english_wtoi)
Y, Y_shifted = vectorize_target(telugu_sentences_tokenized, telugu_wtoi)
print(f'{len(X)} {len(Y)}')

30000 30000


In [17]:
# Size of the Telugu id -> word table.
len(telugu_itow)

17372

In [18]:
# Size of the Telugu word -> id table.
len(telugu_wtoi)

17372

In [42]:
def data_generator(X, Y, Y_shifted, batch_size=None, shuffle=True, x_pad_id=None, y_pad_id=None):
    """Build one padded batch of (input, target, shifted target) tensors.

    Despite the name this is a plain function: every call returns a single
    batch. With ``shuffle=False`` the batch is always the first ``batch_size``
    examples.

    Args:
        X: list of source id lists.
        Y: list of target id lists.
        Y_shifted: list of decoder-input id lists, aligned with ``Y``.
        batch_size: number of examples to return; ``None`` means all of ``X``.
        shuffle: when true, examples are drawn in random order.
        x_pad_id: padding id for ``X``; defaults to ``english_wtoi['<pad>']``.
        y_pad_id: padding id for ``Y`` and ``Y_shifted``; defaults to
            ``telugu_wtoi['<pad>']``.

    Returns:
        Tuple of three tensors ``(x, y, y_shifted)``. ``x`` is padded to the
        longest source sequence in the batch; ``y`` and ``y_shifted`` are
        padded to one common length.

    Raises:
        ValueError: if ``batch_size`` is larger than ``len(X)``.
    """
    if batch_size is None:
        batch_size = len(X)
    elif batch_size > len(X):
        raise ValueError("batch size should not exceed the length of X")

    # Fall back to the notebook-level vocabularies only when no id is supplied.
    if x_pad_id is None:
        x_pad_id = english_wtoi['<pad>']
    if y_pad_id is None:
        y_pad_id = telugu_wtoi['<pad>']

    indexes = list(range(len(X)))
    if shuffle:
        r.shuffle(indexes)
    chosen = indexes[:batch_size]

    batch_x = [X[i] for i in chosen]
    batch_y = [Y[i] for i in chosen]
    batch_y_shifted = [Y_shifted[i] for i in chosen]

    max_length_x = max(map(len, batch_x), default=0)
    # One shared length for both target variants keeps their shapes identical.
    max_length_y = max(map(len, batch_y + batch_y_shifted), default=0)

    def pad(sequences, length, pad_id):
        """Right-pad every sequence with pad_id up to length."""
        return [list(seq) + [pad_id] * (length - len(seq)) for seq in sequences]

    buffer_x = torch.tensor(pad(batch_x, max_length_x, x_pad_id))
    buffer_y = torch.tensor(pad(batch_y, max_length_y, y_pad_id))
    buffer_y_shifted = torch.tensor(pad(batch_y_shifted, max_length_y, y_pad_id))

    return buffer_x, buffer_y, buffer_y_shifted

In [20]:
# Draw one shuffled batch of 64 padded examples; later cells feed these tensors to the test modules.
# NOTE(review): `input` shadows the Python builtin of the same name.
input, target, target_shifted = data_generator(X, Y, Y_shifted, batch_size = 64, shuffle = True)

torch.Size([64, 14]) torch.Size([64, 10]) torch.Size([64, 10])


In [21]:
class Encoder(nn.Module):
    """Embed source token ids and run them through a single batch-first LSTM."""

    def __init__(self, input_vocab_size, embedding_dim, hidden_dim):
        """Create the embedding table and the LSTM.

        Args:
            input_vocab_size: number of rows in the embedding table.
            embedding_dim: size of each token embedding (LSTM input size).
            hidden_dim: LSTM hidden size.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_vocab_size, embedding_dim=embedding_dim)
        self.LSTM = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)

    def forward(self, x):
        """Return the LSTM output for every position of ``x``.

        Args:
            x: integer tensor of token ids, batch dimension first.

        Returns:
            Tensor of shape (batch_size, sequence_length, hidden_dim).
        """
        embedded = self.embedding(x)
        # The LSTM also returns its final (h, c) state, which is not used here.
        outputs, _ = self.LSTM(embedded)
        return outputs

# Smoke-test the encoder on the sample batch.
test_encoder = Encoder(len(english_wtoi), embedding_dim=10, hidden_dim=1)
# Call the module itself rather than .forward() so registered hooks are run.
encoder_output = test_encoder(input)  # (batch_size, input_sequence_length, hidden_dim)
encoder_output.shape

torch.Size([64, 14, 1])

In [22]:
class PreAttentionDecoder(nn.Module):
    """Embed target token ids and run them through a single batch-first LSTM."""

    def __init__(self, target_vocab_size, embedding_dim, hidden_dim):
        """Create the embedding table and the LSTM.

        Args:
            target_vocab_size: number of rows in the embedding table.
            embedding_dim: size of each token embedding (LSTM input size).
            hidden_dim: LSTM hidden size.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=target_vocab_size, embedding_dim=embedding_dim)
        self.LSTM = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)

    def forward(self, y):
        """Return the LSTM output for every position of ``y``.

        Args:
            y: integer tensor of token ids, batch dimension first.

        Returns:
            Tensor of shape (batch_size, sequence_length, hidden_dim).
        """
        embedded = self.embedding(y)
        # Only the per-step outputs are kept; the final (h, c) state is dropped.
        outputs, _ = self.LSTM(embedded)
        return outputs

# Smoke-test the pre-attention decoder on the sample batch.
test_decoder = PreAttentionDecoder(len(telugu_wtoi), embedding_dim = 10, hidden_dim =1)
# Call the module itself rather than .forward() so registered hooks are run.
decoder_output = test_decoder(target_shifted)  # (batch_size, target_shifted_length, hidden_dim)
decoder_output.shape

torch.Size([64, 10, 1])

In [23]:
def prepare_attention_input(encoder_output, decoder_output, input, pad_id=0):
    """Assemble queries, keys, values and the padding mask for cross-attention.

    Args:
        encoder_output: encoder activations; used as both keys and values.
        decoder_output: pre-attention decoder activations; used as queries.
            Its second dimension is taken as the target sequence length.
        input: 2-D tensor of source token ids,
            (batch_size, input_sequence_length).
        pad_id: token id that marks padding in ``input`` (default 0).

    Returns:
        Tuple ``(queries, keys, values, mask)``. ``mask`` is a boolean tensor
        of shape (batch_size * num_heads, target_shifted_length,
        input_sequence_length) for num_heads == 1. ``True`` marks a padded
        source position that must NOT be attended to, which is the meaning
        ``nn.MultiheadAttention`` gives to a boolean ``attn_mask``.
    """
    keys = encoder_output
    values = encoder_output
    queries = decoder_output

    # A float mask would be *added* to the attention scores instead of hiding
    # the padded positions, so the mask is kept boolean. It is derived from
    # `input`, so it lives on the same device as the batch.
    padding = input == pad_id                      # (batch, src_len)
    target_length = decoder_output.shape[1]
    mask = padding.unsqueeze(1).repeat(1, target_length, 1)

    return queries, keys, values, mask

In [24]:
class Translator(nn.Module):
    """Sequence-to-sequence model: encoder, pre-attention decoder, attention, LSTM, linear head."""

    def __init__(self, input_vocab_size, embedding_dim, hidden_dim, target_vocab_size):
        """Build all sub-modules.

        Args:
            input_vocab_size: size of the source vocabulary.
            embedding_dim: embedding size used by both encoder and pre-decoder.
            hidden_dim: hidden size shared by the LSTMs and the attention layer.
            target_vocab_size: size of the target vocabulary (width of the logits).
        """
        super().__init__()
        self.encoder = Encoder(input_vocab_size, embedding_dim, hidden_dim)
        self.predecoder = PreAttentionDecoder(target_vocab_size, embedding_dim, hidden_dim)
        self.attention = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=1, batch_first=True)
        self.decoder = nn.LSTM(input_size=hidden_dim, hidden_size=hidden_dim, num_layers=1, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_dim, out_features=target_vocab_size)

    def forward(self, input, target_shifted):
        """Compute target-vocabulary logits for every target position.

        Args:
            input: source token ids, batch dimension first.
            target_shifted: decoder-input token ids, batch dimension first.

        Returns:
            Logits produced by the final linear layer, one vector of size
            target_vocab_size per target position.
        """
        encoder_states = self.encoder(input)
        decoder_states = self.predecoder(target_shifted)
        queries, keys, values, mask = prepare_attention_input(encoder_states, decoder_states, input)
        # The attention weights (second return value) are not needed.
        context, _ = self.attention(queries, keys, values, attn_mask=mask)
        decoded, _ = self.decoder(context)
        return self.linear(decoded)
        

torch.Size([64, 10, 17372])

In [43]:
def model_train(X, Y, Y_shifted,batch_size, epochs, learning_rate, NNmodel):
    """Train ``NNmodel`` with Adam on randomly drawn batches.

    Each "epoch" here is one optimisation step on a single batch obtained
    from ``data_generator``, not a full pass over the data.

    Args:
        X: list of source id lists.
        Y: list of target id lists.
        Y_shifted: list of decoder-input id lists.
        batch_size: number of examples per step.
        epochs: number of optimisation steps to run.
        learning_rate: Adam learning rate.
        NNmodel: module called as ``NNmodel(input, target_shifted)`` that
            returns logits whose last dimension is the target vocabulary.

    Returns:
        ``(loss, logits)`` from the last step, with logits flattened to
        (batch_size * target_length, vocab_size). Both are ``None`` when
        ``epochs`` is 0.
    """
    # Log roughly ten times per run; the floor of 1 avoids a modulo-by-zero
    # when fewer than 10 steps are requested.
    interval = max(1, epochs // 10)
    optimizer = optim.Adam(NNmodel.parameters(), lr = learning_rate)
    NNmodel.train()

    loss, logits = None, None

    for epoch in range(epochs):
        batch_input, target, target_shifted = data_generator(X, Y, Y_shifted,batch_size=batch_size)
        # Call the module (not .forward) so registered hooks are run.
        logits = NNmodel(batch_input, target_shifted)
        # Flatten using the model's own output width rather than a global vocab size.
        logits = logits.reshape(-1, logits.size(-1))
        target = target.reshape(-1)
        # Padded target positions do not contribute to the loss.
        loss = F.cross_entropy(logits, target, ignore_index=telugu_wtoi['<pad>'])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % interval == 0:
            print(f'Loss at epoch {epoch} ====> {loss}')

    return loss, logits

In [55]:
# Model hyper-parameters; vocabulary sizes come from the word -> id tables.
input_vocab_size, target_vocab_size = len(english_wtoi), len(telugu_wtoi)
embedding_dim = hidden_dim = 32
model = Translator(
    input_vocab_size=input_vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    target_vocab_size=target_vocab_size,
)

# Number of trainable parameters (displayed as the cell output).
n_parameters = sum(param.numel() for param in model.parameters() if param.requires_grad)
n_parameters

1419260

In [62]:
# Run 1000 single-batch training steps with batch size 64 and learning rate 0.005.
loss, logits = model_train(
    X, Y, Y_shifted,
    batch_size=64, epochs=1000, learning_rate=0.005, NNmodel=model,
)

Loss at epoch 0 ====> 4.943141937255859
Loss at epoch 100 ====> 5.008384704589844
Loss at epoch 200 ====> 4.970757007598877
Loss at epoch 300 ====> 5.181593894958496
Loss at epoch 400 ====> 5.051990509033203
Loss at epoch 500 ====> 4.703856945037842
Loss at epoch 600 ====> 5.1451640129089355
Loss at epoch 700 ====> 4.93436861038208
Loss at epoch 800 ====> 4.663934230804443
Loss at epoch 900 ====> 4.633925437927246
