In [1]:
import torch 
import pandas as pd
import numpy as np 
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
import random
from collections import Counter
from typing import List
import torch.nn.functional as F
from torchsummary import summary

# Random Seeding 

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
# Sentance of Equal Lenth Problem 
# After eos nothing should contribute into Loss
# How Prepare Data

# Data Preprocessing

In [4]:
# Reading the text 
df = pd.read_csv('hin.txt',sep='\t')
df = df.drop("2",axis=1)

df.columns = ["English","Hindi"]
df.head()

Unnamed: 0,English,Hindi
0,Wow!,वाह!
1,Duck!,झुको!
2,Duck!,बतख़!
3,Help!,बचाओ!
4,Jump.,उछलो.


In [5]:
df["English"].str.lower()

0                                                    wow!
1                                                   duck!
2                                                   duck!
3                                                   help!
4                                                   jump.
                              ...                        
2910    if you go to that supermarket, you can buy mos...
2911    the passengers who were injured in the acciden...
2912    democracy is the worst form of government, exc...
2913    if my boy had not been killed in the traffic a...
2914    when i was a kid, touching bugs didn't bother ...
Name: English, Length: 2915, dtype: object

In [6]:
def lower_casing(df):
    for i in df.columns:
        df[i] = df[i].str.lower()
    return df

def apply_start_end_tokens(df):
    for i in df.columns:
        df[i] = df[i].apply(lambda x: "<sos> " + x + " <eos>")
        
    return df

In [7]:
df = lower_casing(df)
df = apply_start_end_tokens(df)
df.head()

Unnamed: 0,English,Hindi
0,<sos> wow! <eos>,<sos> वाह! <eos>
1,<sos> duck! <eos>,<sos> झुको! <eos>
2,<sos> duck! <eos>,<sos> बतख़! <eos>
3,<sos> help! <eos>,<sos> बचाओ! <eos>
4,<sos> jump. <eos>,<sos> उछलो. <eos>


In [8]:
train_df = df['English'].str.cat(sep=' ')
train_df.split(" ")[-24:]

['<sos>',
 'when',
 'i',
 'was',
 'a',
 'kid,',
 'touching',
 'bugs',
 "didn't",
 'bother',
 'me',
 'a',
 'bit.',
 'now',
 'i',
 'can',
 'hardly',
 'stand',
 'looking',
 'at',
 'pictures',
 'of',
 'them.',
 '<eos>']

In [9]:
df["English"].values[-1]

"<sos> when i was a kid, touching bugs didn't bother me a bit. now i can hardly stand looking at pictures of them. <eos>"

# Preparation

In [10]:
class Language:
    def __init__(self,data=List[list],lang=str):
        self._data = data
        self._lang = lang
        self.words = self._load_words()
        
        self.uniq_words = self._get_unique_words()
        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}
        
    def _load_words(self):
        text = self._data[self._lang].str.cat(sep=" ")
        return text.split(" ")
    
    def _get_unique_words(self):
        words = Counter(self.words)
        return sorted(words, key=words.get, reverse=True)

In [11]:
english = Language(df,"English")
hindi = Language(df,"Hindi")

In [12]:
class Pairs(Dataset):
    def __init__(self,data):
        self._data = data
        self.hindi = Language(df,"Hindi")
        self.english = Language(df,"English")
        
    def get_data(self):
        holder = []
        for i in self._data.values:
            input_ = [self.english.word_to_index[word] for word in i[0].split(' ')]
            output_ = [self.hindi.word_to_index[word] for word in i[1].split(' ')]
            
            holder.append((torch.tensor(input_,dtype=torch.long).view(-1,1),
                          torch.tensor(output_,dtype=torch.long).view(-1,1)))
        return holder

In [13]:
pair = Pairs(df)
pairs = pair.get_data()

In [14]:
pairs[:2]

[(tensor([[   0],
          [1548],
          [   1]]),
  tensor([[   0],
          [1506],
          [   1]])),
 (tensor([[  0],
          [954],
          [  1]]),
  tensor([[   0],
          [1507],
          [   1]]))]

# Encoder

In [15]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

# Decoder

In [16]:
class Decoder(nn.Module):
    def __init__(self, output_size,hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.vocab = output_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input_, hidden):
        output = self.embedding(input_).view(1,1, -1)
        output = F.relu(output,inplace=False)
        output, hidden = self.gru(output, hidden)
        prediction = self.out(output[0].clone())
        return prediction, hidden

# Seq2seq

In [17]:
# class Seq2Seq(nn.Module):
#     def __init__(self, encoder, decoder):
#         super().__init__()
#         self.encoder = encoder
#         self.decoder = decoder
#         self.encoder_hidden = encoder.initHidden()
     
#     def forward(self, source, target):
#         #get the input length (number of words in sentence)
#         input_length = source.size(0)
#         target_length = target.shape[0]
        
#         batch_size = target.shape[1] 
#         vocab_size = self.decoder.vocab
        
#         outputs = torch.zeros(target_length, batch_size, vocab_size)
        
#         for i in range(input_length):
#             encoder_output, encoder_hidden = self.encoder(source[i],self.encoder_hidden)
            
#         decoder_input = target[0].clone()
        
#         for t in range(target_length):
#             decoder_output, decoder_hidden = self.decoder(decoder_input, encoder_hidden)
#             outputs[t] = decoder_output.detach()
#             topv, topi = decoder_output.topk(1)
#             decoder_input = topi.squeeze().detach()
            
#         return outputs        
        

# Custom Training 

In [None]:
epochs = range(0,2)
input_size = len(english.index_to_word)
output_size = len(hindi.index_to_word)
hidden_size = 128
batch_size = 1 

#create encoder-decoder model
encoder = Encoder(input_size, hidden_size)
#print("Encoder : \n",encoder)
decoder = Decoder(output_size, hidden_size)
#print("Decoder : \n",decoder)

In [19]:

en_optimizer = optim.Adam(encoder.parameters(),lr=0.001)
de_optimizer = optim.Adam(decoder.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch in epochs:
    for sen, (source, target) in enumerate(pairs):
        loss = 0
        # set gradient zero
        en_optimizer.zero_grad()
        de_optimizer.zero_grad()
        
        input_length = source.size(0)
        target_length = target.shape[0]
        
        batch_size = target.shape[1] 
        vocab_size = decoder.vocab
        
        encoder_hidden = encoder.initHidden()
        outputs = torch.zeros(target_length, batch_size, vocab_size)

        # encoder 
        for i in range(input_length):
            encoder_output, encoder_hidden = encoder(source[i],encoder_hidden)

        # decoder
        decoder_input = target[0].clone()
        decoder_hidden = encoder_hidden

        for t in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            outputs[t] = decoder_output
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()

        # loss
        loss +=criterion(outputs.transpose(2,1),target)
    
        # Backpropagation
        loss.backward(retain_graph=True)


        de_optimizer.step()
        en_optimizer.step()

    print({"Epoch ": epoch, "Sentence ": sen, "Loss ": loss.item()})

{'Epoch ': 0, 'Sentence ': 2914, 'Loss ': 6.351917743682861}
{'Epoch ': 1, 'Sentence ': 2914, 'Loss ': 6.005764484405518}


In [24]:
for sen, (source, target) in enumerate(pairs):
    if sen > 0:
        break

In [25]:
source

tensor([[  0],
        [954],
        [  1]])

In [26]:
target

tensor([[   0],
        [1507],
        [   1]])