In [59]:
pip install indic-nlp-library



In [60]:
import codecs
import torch
import random
import torch.nn as nn
import math
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [0]:
SEED = 21

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [62]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
with codecs.open('/content/drive/My Drive/Colab Notebooks/traing_english.txt', encoding='utf-8') as f:
  training_english = f.read().split('\n')

In [0]:
with codecs.open('/content/drive/My Drive/Colab Notebooks/traing_hindi.txt', encoding='utf-8') as f:
  training_hindi = f.read().split('\n')

In [0]:

with codecs.open('/content/drive/My Drive/Colab Notebooks/testing_english.txt', encoding='utf-8') as f:
  testing_english = f.read().split('\n')

In [0]:
with codecs.open('/content/drive/My Drive/Colab Notebooks/testing_hindi.txt', encoding='utf-8') as f:
  testing_hindi = f.read().split('\n')

In [0]:
from indicnlp.tokenize import indic_tokenize  
from nltk.tokenize import RegexpTokenizer

In [0]:
def tokenizer(hindi, english, flag):
  data = []
  corpus_hindi = []
  corpus_english = []
  tokenizer_english = RegexpTokenizer('\w+|\$[\.]+|\S+')
  for sentence_hindi, sentence_english in zip(hindi, english):
    dic = {}
    h = indic_tokenize.trivial_tokenize(sentence_hindi)
    e = tokenizer_english.tokenize(sentence_english)

    if flag:
      corpus_hindi += h
      corpus_english += e

    dic['trg'] = e
    dic['src'] = h
    data.append(dic)
  return data, corpus_hindi, corpus_english

In [69]:
training_data, corpus_hindi, corpus_english = tokenizer(training_hindi,training_english, True)
training_data[0]

{'src': ['द्वारका', 'चार', 'धामों', 'में', 'एक', 'धाम', 'भी', 'है'],
 'trg': ['dwarka',
  'is',
  'also',
  'a',
  'dhaam',
  'among',
  'the',
  'chaar',
  'dhaams']}

In [70]:
training_data[50]

{'src': ['मेले',
  'के',
  'दौरान',
  'तट',
  'पर',
  'तंबुओं',
  'का',
  'एक',
  'पूरा',
  'नगर',
  'बस',
  'जाता',
  'है'],
 'trg': ['one',
  'whole',
  'city',
  'of',
  'tents',
  'gets',
  'settled',
  'on',
  'the',
  'banks',
  'during',
  'the',
  'fair']}

In [71]:
valid_test_data, _, _ = tokenizer(testing_hindi, testing_english, False)
valid_test_data[0]

{'src': ['बैदाडीह', 'स्थान', 'बड़े', 'कुएँ', 'के', 'लिए', 'प्रसिद्ध', 'है'],
 'trg': ['the',
  'place',
  'baidadih',
  'is',
  'famous',
  'for',
  'the',
  'big',
  'well']}

In [72]:
valid_data = valid_test_data[:len(valid_test_data)//2]
testing_data = valid_test_data[len(valid_test_data)//2:]
print(len(valid_data))

9980


In [0]:
from collections import Counter
counter_english = Counter(corpus_english)
counter_hindi = Counter(corpus_hindi)

In [0]:
vocab_hindi = {'<sos>': 0,'<eos>': 1, '<unk>': 2, '<pad>': 3}
count = 4
for i,word in enumerate(corpus_hindi):
  if word not in vocab_hindi and counter_hindi[word] > 1:
    vocab_hindi[word] = count
    count += 1


In [0]:
vocab_english = {'<sos>': 0,'<eos>': 1, '<unk>': 2, '<pad>': 3}
count = 4
for i,word in enumerate(corpus_english):
  if word not in vocab_english and counter_english[word] > 1:
    vocab_english[word] = count
    count += 1


In [76]:
print(vocab_hindi['<sos>'])
vocab_english['<sos>']

0


0

In [0]:
reverse_vocab_hindi = dict((i,word) for word, i in vocab_hindi.items())
reverse_vocab_english = dict((i,word) for word, i in vocab_english.items())

In [78]:
reverse_vocab_english[1125]

'independent'

In [79]:
print(len(vocab_english), len(vocab_hindi))

13445 16917


In [80]:
training_data[5:10]

[{'src': ['गढ़मुक्तेश्वर', 'हिंदुओं', 'का', 'पावन', 'तीर्थ', 'है'],
  'trg': ['garhmukteshwar',
   'is',
   'a',
   'sacred',
   'pilgrimage',
   'of',
   'the',
   'hindus']},
 {'src': ['चलें', 'गढ़', 'मुक्तेश्वर', 'का', 'गंगा', 'मेला'],
  'trg': ['let',
   'us',
   'go',
   'to',
   'the',
   'ganges',
   'fair',
   'of',
   'garh',
   'mukteshwar']},
 {'src': ['गाजियाबाद',
   'जिले',
   'के',
   'गढ़',
   'मुक्तेश्वर',
   'में',
   'पतित',
   'पावनी',
   'गंगा',
   'के',
   'तट',
   'पर',
   'हर',
   'साल',
   'कार्तिक',
   'पूर्णिमा',
   'के',
   'अवसर',
   'पर',
   'लगने',
   'वाले',
   'उत्तर',
   'भारत',
   'के',
   'प्रसिद्ध',
   'और',
   'प्राचीन',
   'धार्मिक',
   'मेले',
   'का',
   'इतिहास',
   'लगभग',
   'पाँच',
   'हजार',
   'वर्ष',
   'पुराना',
   'है'],
  'trg': ['the',
   'history',
   'of',
   'north',
   'india',
   's',
   'famous',
   'and',
   'ancient',
   'religious',
   'fair',
   'held',
   'every',
   'year',
   'on',
   'the',
   'occasion',
   'of',
   'kar

In [0]:
def vectorization(data):
  for item in data:
    wordvec_english = []
    wordvec_hindi = []
    for word in item['src']:
      if word not in vocab_hindi:
        word = '<unk>'
      wordvec_hindi.append(vocab_hindi[word])
    item['src'] = wordvec_hindi[::-1]
    
    for word in item['trg']:
      if word not in vocab_english:
        word = '<unk>'
      wordvec_english.append(vocab_english[word])
    item['trg'] = wordvec_english
  return data


In [0]:
vectorized_data = vectorization(training_data)

In [83]:
vectorized_data[:2]

[{'src': [11, 10, 9, 8, 7, 6, 5, 4], 'trg': [4, 5, 6, 7, 8, 9, 10, 11, 12]},
 {'src': [23, 22, 21, 20, 19, 7, 18, 17, 16, 15, 14, 13, 12],
  'trg': [13, 14, 15, 16, 17, 18, 19, 10, 20]}]

In [0]:
valid_vectorized_data = vectorization(valid_data)
testing_vectorized_data = vectorization(testing_data)

In [0]:
batch_size = 64

In [0]:
def batch_iterator(data,batch_size):
  iterator = []
  data.sort(key = lambda x: len(x['src']))
  for i in range(0,len(data),batch_size):
    iterator.append(data[i:i+batch_size])
  return iterator

In [0]:
training_iterator = batch_iterator(vectorized_data, batch_size)

In [0]:
valid_iterator = batch_iterator(valid_vectorized_data, batch_size)
testing_iterator = batch_iterator(testing_vectorized_data, batch_size)

In [89]:
print(len(training_iterator[0]), len(valid_iterator), len(testing_iterator))

64 156 156


In [0]:
def padding(batch_data):
  max_length = 0
  for item in batch_data:
    max_length = max(max_length, len(item['src']), len(item['trg']))
  
  for item in batch_data:
    extra_src = max_length - len(item['src'])
    extra_trg = max_length - len(item['trg'])
    item['src'] = [0] + item['src'] + [3]*extra_src + [1]
    item['trg'] = [0] + item['trg'] + [3]*extra_trg + [1]
  return batch_data
      

In [0]:
batched_iterator = []
for batch in training_iterator:
  batched_iterator.append(padding(batch))

In [0]:
valid_padded_iterator = []
for batch in valid_iterator:
  valid_padded_iterator.append(padding(batch))

In [0]:
testing_padded_iterator = []
for batch in testing_iterator:
  testing_padded_iterator.append(padding(batch))

In [0]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.hid_dim = hid_dim #dimension of hidden and cell states
        self.n_layers = n_layers #layers in the RNN
        
        self.embedding = nn.Embedding(input_dim, emb_dim) #nn.Embedding is to convert the one-hot encodings into dense vector
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout) #Embeddings of sentence X is passed into RNN with dropout applied to them.
        
        self.dropout = nn.Dropout(dropout) #Dropout to be applied between the layers of a multi-layer RNN, i.e., btw hidden states output
        
    def forward(self, src):
        
        #src = [src len, batch size]
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src len, batch size, emb dim]
        
        outputs, (hidden, cell) = self.rnn(embedded)
        
        #outputs = [src len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer
        
        return hidden, cell

In [0]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        

        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        #print("Decoder Input/",input)
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
                
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #seq len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        prediction = self.fc_out(output.squeeze(0))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden, cell

In [0]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
             "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
             "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]

        trg_vocab_size = self.decoder.output_dim
        # print(batch_size, trg_len, trg_vocab_size)
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1
        
        return outputs

In [0]:
INPUT_DIM = len(vocab_hindi)
OUTPUT_DIM = len(vocab_english)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 128
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

In [98]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(16917, 256)
    (rnn): LSTM(256, 128, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(13445, 256)
    (rnn): LSTM(256, 128, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=128, out_features=13445, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [99]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 10,166,533 trainable parameters


In [0]:
import torch.optim as optim
learning_rate=0.005
optimizer = optim.Adam(model.parameters(), weight_decay=0.01, lr=learning_rate)

In [0]:
criterion = nn.CrossEntropyLoss(ignore_index = 3).to(device)

In [0]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        #print(i)
        src = torch.t(torch.LongTensor([item['src'] for item in batch])).to(device)
        
        trg = torch.t(torch.LongTensor([item['trg'] for item in batch])).to(device)
        # print(src.shape, trg.shape)
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].contiguous().view(-1, output_dim)
        trg = trg[1:].contiguous().view(-1)
        
        # trg = [(trg_len - 1) * batch size]
        # output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = torch.LongTensor([item['src'] for item in batch]).to(device)
            trg = torch.LongTensor([item['trg'] for item in batch]).to(device)

            # src.to(device)
            # trg.to(device)
            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator) , output, trg

In [0]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [106]:
import time
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, batched_iterator, optimizer, criterion, CLIP)
    valid_loss, _, _ = evaluate(model, valid_padded_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 52s
	Train Loss: 6.585 | Train PPL: 724.460
	 Val. Loss: 7.143 |  Val. PPL: 1265.037
Epoch: 02 | Time: 0m 52s
	Train Loss: 6.603 | Train PPL: 737.632
	 Val. Loss: 7.190 |  Val. PPL: 1325.855
Epoch: 03 | Time: 0m 52s
	Train Loss: 6.608 | Train PPL: 741.163
	 Val. Loss: 7.175 |  Val. PPL: 1306.270
Epoch: 04 | Time: 0m 52s
	Train Loss: 6.608 | Train PPL: 741.179
	 Val. Loss: 7.149 |  Val. PPL: 1272.649
Epoch: 05 | Time: 0m 52s
	Train Loss: 6.606 | Train PPL: 739.807
	 Val. Loss: 7.190 |  Val. PPL: 1326.704
Epoch: 06 | Time: 0m 52s
	Train Loss: 6.611 | Train PPL: 743.588
	 Val. Loss: 7.078 |  Val. PPL: 1185.409
Epoch: 07 | Time: 0m 52s
	Train Loss: 6.617 | Train PPL: 747.377
	 Val. Loss: 7.095 |  Val. PPL: 1206.319
Epoch: 08 | Time: 0m 52s
	Train Loss: 6.613 | Train PPL: 744.931
	 Val. Loss: 7.195 |  Val. PPL: 1333.137
Epoch: 09 | Time: 0m 52s
	Train Loss: 6.609 | Train PPL: 741.945
	 Val. Loss: 7.162 |  Val. PPL: 1289.276
Epoch: 10 | Time: 0m 52s
	Train Loss: 6.612 | 

In [107]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss, output, trg = evaluate(model, batched_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 7.195 | Test PPL: 1332.466 |


In [0]:
len(testing_padded_iterator[0])

64

In [0]:
output

tensor([[-5.1434, -0.6571, -5.1204,  ..., -3.9024, -4.5438, -2.8950],
        [-4.7056, -0.5464, -4.5724,  ..., -3.3813, -4.0453, -2.4985],
        [-4.6637, -0.5369, -4.5335,  ..., -3.3454, -4.0013, -2.4879],
        ...,
        [-6.4607,  0.3734, -6.4324,  ..., -5.1611, -5.7032, -4.3896],
        [-6.4597,  0.3733, -6.4311,  ..., -5.1595, -5.7012, -4.3888],
        [-6.4865,  0.3765, -6.4671,  ..., -5.2031, -5.7533, -4.4107]],
       device='cuda:0')

In [0]:
trg

tensor([  0, 173,  30,  ...,   3,   3,   1], device='cuda:0')

In [0]:
def translate_sentence(sentence, model, device):
    
    model.eval()
    word2vec = [0]    
    for word in sentence:
      if word not in vocab_hindi:
        word = '<unk>'
      word2vec.append(vocab_hindi[word])
    word2vec.append(1)
   
    print(word2vec)
    src_tensor = torch.LongTensor(word2vec).unsqueeze(1).to(device) 
    print(src_tensor)  

    with torch.no_grad():
        hidden, cell = model.encoder(src_tensor) 
    trg_indexes = [0] 
    print("src_tensor", src_tensor)
    print("HIdden", hidden.shape, cell.shape)
    for i in range(50):

      trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
      print(trg_tensor)
      with torch.no_grad():
          output, h, c = model.decoder(trg_tensor, hidden, cell)
      print("output:" ,output)

      pred_token = output.argmax(1).item()
      print(pred_token)

      trg_indexes.append(pred_token)
      if pred_token == 2:
          break
   
    output = [reverse_vocab_english[i] for i in trg_indexes]
    print(output)

In [120]:
s = ['छाती', 'में', 'तेज', 'दर्द', 'अथवा', 'सांस','लेने', 'में', 'तकलीफ', 'हो']
translate_sentence(s, model, device)

['participants', 'of', '<unk>', 'who', 'Allah', 'breaking', 'Bharatha', 'to', '<unk>', 'of', 'predominant', 'Bhagat']
