In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import unicodedata
import string
import re
import random

from __future__ import unicode_literals, print_function, division
from io import open


import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

DEBUG=False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
def file_to_read(path1,path2):
    file_train_x=open(path1)
    file_train_y=open(path2)
    return file_train_x,file_train_y

In [0]:
file_train_x,file_train_y= file_to_read("gdrive/My Drive/dataset/train.hi","gdrive/My Drive/dataset/train.en")
file_dev_x,file_dev_y= file_to_read("gdrive/My Drive/dataset/dev.hi","gdrive/My Drive/dataset/dev.en")
file_test_x,file_test_y= file_to_read("gdrive/My Drive/dataset/test.hi","gdrive/My Drive/dataset/test.en")


In [0]:
hindi_sent=file_train_x.read().split('\n')[:7000]
eng_sent=file_train_y.read().split('\n')[:7000]
if DEBUG:
  print("-------------------")
  print(eng_sent[0])
  print(len(eng_sent))
  print(hindi_sent[0])
  print(len(hindi_sent))

In [0]:
file1=open("gdrive/My Drive/dataset/eng-hindi.txt","w")
for i in range(len(hindi_sent)):
  if DEBUG:
    print("--Writing into file-----")
  file1.write(hindi_sent[i])
  file1.write("\t")
  file1.write(eng_sent[i])
  file1.write("\n")

file1.flush()
file1.close()


In [0]:
SOS_token=0
EOS_token=1
UNK_token=2

In [0]:
class Language:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count ={}
       
        self.index2word = {0: "SOS", 1: "EOS" ,2:"UNK"}
        self.n_words = 3

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words = self.n_words+1
        else:
            self.word2count[word] =self.word2count[word]+1

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)


In [0]:
def readLangs(lang1, lang2, reverse=False):
    
    lines = open('gdrive/My Drive/dataset/%s-%s.txt' % (lang1, lang2), encoding='utf-8').read().strip().split('\n')
    
    pairs = [[s for s in l.split('\t')] for l in lines]
    if DEBUG:
      print("---------------------------------")
      print("Inside ReadLines functions: ")
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Language(lang2)
        output_lang = Language(lang1)
    else:
        input_lang = Language(lang1)
        output_lang = Language(lang2)

    if DEBUG:
      print("--------------------------")
      print("About to return: ")
    return input_lang, output_lang, pairs

In [0]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)

    if DEBUG:
      print("lenght of pair", len(pairs))
      
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    
    if DEBUG:
       print("name ",input_lang.name,"words", input_lang.n_words)
       print("name" ,output_lang.name,"words", output_lang.n_words)
    return input_lang, output_lang, pairs


In [0]:
input_lang, output_lang, pairs = prepareData('eng', 'hindi',True)
MAX_LENGTH=1000
if DEBUG:
  print(random.choice(pairs))


In [0]:
class EncoderRNN(nn.Module):

    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()

        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        if DEBUG:
          print("%------------EncodeRNN class ---------------")
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        if DEBUG:
          print("%------------EncodeRNN class ---------------")
        output, hidden = self.gru(embedded, hidden)
        return output,hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [0]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        if DEBUG:
          print("%------------DecoderRNN class ---------------")
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        if DEBUG:
          print("%------------DecoderRNN class ---------------")
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [0]:
def indexesFromSentence(lang, sentence):
  if DEBUG:
    print("About to return functions:1 ")
  return [lang.word2index[word] if word in lang.word2index else UNK_token for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
  if DEBUG:
    print("About to return functions:2 ")
  indexes = indexesFromSentence(lang, sentence)
  indexes.append(EOS_token)
  return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
  if DEBUG:
    print("About to return functions:3 ")
  input_tensor = tensorFromSentence(input_lang, pair[0])
  target_tensor = tensorFromSentence(output_lang, pair[1])
  return (input_tensor, target_tensor)

In [0]:
teacher_forcing_ratio = 0.4

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    if DEBUG:
      print(input_length)
    target_length = target_tensor.size(0)
    if DEBUG:
      print(input_length)
      print(target_length)
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    if DEBUG:
      print(encoder_outputs)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = False
    if random.random() < teacher_forcing_ratio:
      use_teacher_forcing=True
    else:
      use_teacher_forcing=False

    if use_teacher_forcing:
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            loss =loss+ criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di] 

    else:
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()

            loss =loss+ criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [0]:
def trainIters(encoder1, decoder1, n_iters, printEvery, plot_every=100, learning_rate=0.02):
    start = time.time()
    plot_losses = []
    print_loss_total = 0 
    if DEBUG:
      print("Inside TrainIters")
      print(print_loss_total)
    plot_loss_total = 0  

    encoder_optimizer = optim.SGD(encoder1.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder1.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder1,decoder1, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total = print_loss_total+ loss
        plot_loss_total =plot_loss_total+ loss
        if DEBUG:
          print("Total loss",plot_loss_total)
          
        if iter % printEvery == 0:
            print_loss_avg = print_loss_total / printEvery
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),iter, iter / n_iters * 100, print_loss_avg))
            if DEBUG:
              print("Inside print")
              print(print_loss_total)

        if iter % printEvery == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0


In [0]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],encoder_hidden)
            encoder_outputs[ei] =encoder_outputs[ei]+ encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [0]:
hidden_size = 256
iterations=7000
learningRate=0.01

In [0]:

encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words).to(device)

trainIters(encoder1, decoder1,iterations, print_every=5000)

2m 20s (- 30m 25s) (5000 7%) 5.0291
4m 40s (- 28m 4s) (10000 14%) 4.8924
7m 12s (- 26m 24s) (15000 21%) 4.8761
9m 43s (- 24m 19s) (20000 28%) 4.6614
12m 15s (- 22m 4s) (25000 35%) 4.4775
14m 44s (- 19m 39s) (30000 42%) 4.2838
17m 15s (- 17m 15s) (35000 50%) 4.0949
19m 49s (- 14m 51s) (40000 57%) 3.9609
22m 21s (- 12m 25s) (45000 64%) 3.8227
24m 56s (- 9m 58s) (50000 71%) 3.7260
27m 32s (- 7m 30s) (55000 78%) 3.6089
30m 2s (- 5m 0s) (60000 85%) 3.4792
32m 37s (- 2m 30s) (65000 92%) 3.4479
35m 13s (- 0m 0s) (70000 100%) 3.3908


SAve the model

In [0]:
torch.save(encoder1.state_dict(), "gdrive/My Drive/dataset/encoder1")
device = torch.device('cpu')
torch.save(decoder1.state_dict(), "gdrive/My Drive/dataset/decoder1")


Load the model

In [19]:

device = torch.device('cpu')
encoder_model = EncoderRNN(input_lang.n_words, hidden_size).to(device)
encoder_model.load_state_dict(torch.load("gdrive/My Drive/dataset/encoder1", map_location=device))


decoder_model = DecoderRNN(hidden_size, output_lang.n_words).to(device)
decoder_model.load_state_dict(torch.load("gdrive/My Drive/dataset/decoder1", map_location=device))

<All keys matched successfully>

In [0]:
Immediately contact the doctor on feeling any abnormality .

In [0]:
hindi_dev_sent=file_dev_x.read().split('\n')
eng_dev_sent=file_dev_y.read().split('\n')
hindi_test_sent=file_test_x.read().split('\n')
eng_test_sent=file_test_y.read().split('\n')
if DEBUG:
    print(eng_dev_sent[0])
    print(len(eng_dev_sent))
    print(hindi_dev_sent[0])
    print(len(hindi_dev_sent))
    print(eng_test_sent[0])
    print(len(eng_test_sent))
    print(hindi_test_sent[0])
    print(len(hindi_test_sent))

In [21]:
output_words= evaluate(encoder_model, decoder_model, "Put two spoons of sugar and a pinch of salt in a glass of water and boil it")
print(output_words[0])
if DEBUG:
  print(output_words)

['उसे', 'चाय', 'जल', 'में', 'गर्म', 'पानी', 'का', 'गर्म', 'लें', 'और', 'पानी', 'में', 'मिलाकर', 'लें', '।', '।', 'लें', '।', '<EOS>']


In [25]:
from nltk.translate.bleu_score import sentence_bleu

for i in eng_test_sent[15:20]:
  print(i)
  output_words= evaluate(encoder_model, decoder_model,i)
  print(output_words[0])
  score = sentence_bleu(i, output_words[0])
  print("bleu Score : ",score)



Saliva is formed by chewing the chewing gum .
['एक', 'विशेष', 'तक', 'जीवाणु', 'की', 'एक', '-', 'एक', 'विशेष', '।', '<EOS>']
bleu Score :  0.135
Chewing gum helps in keeping the teeth clean .
['दाँत', 'में', 'में', 'साफ', 'दाँत', 'साफ', '।', '।', '<EOS>']
bleu Score :  0.138
Sugared chewing gum is not supposed to be good for health .
['अच्छी', 'स्वास्थ्य', 'का', 'है', 'लिए', 'है', 'है', 'आवश्यक', 'है', 'है', '।', 'आवश्यक', 'है', '।', '<EOS>']
bleu Score :  0.177
That is why dentists do not suggest chewing sugared chewing gum .
['अब', 'चिकित्सा', 'की', 'सर्जरी', 'गए', 'ना', 'रह', 'जाती', 'है', '।', '<EOS>']
bleu Score :  0.195
Get the teeth checked-up regularly .
['दाँतों', 'के', 'साफ', 'सफाई', 'साफ', 'करें', '।', '<EOS>']
bleu Score :  0.108
