<a href="https://colab.research.google.com/github/karans17s/Practical_Implementation_Of_Deep_learning/blob/main/PHASE_8_ENGLISH_TO_GUJARATI_NMT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **STEP:1 - IMPORT LIBS..**

In [None]:
import torch
import numpy as np
import torch.nn as nn
import math

# **STEP:2 - DATASET**

In [None]:
# English half of the parallel corpus. The absolute /content path assumes the
# notebook is running on Colab with the file uploaded there.
english = "/content/train.en"

In [None]:
# Gujarati half of the parallel corpus. The absolute /content path assumes the
# notebook is running on Colab with the file uploaded there.
gujarati = "/content/train.gu"

# **STEP:3 DEFINE THE VOCAB**

In [None]:
# Special markers appended to both vocabularies. The literals for padding and
# end-of-sentence used to be swapped ('</s>' was the padding token and '<pad>'
# the end token). Their positions in the vocabularies are unchanged, so every
# index stays the same; only the strings now match the names.
START_TOKEN = '<s>'
PADDING_TOKEN = '<pad>'
END_TOKEN = '</s>'

# Character-level Gujarati vocabulary: START, ASCII punctuation, Gujarati digits,
# independent vowels, consonants, dependent signs, then PADDING and END.
# 'ૠ' and 'ૡ' used to be listed twice (in the vowel row and again after 'ૐ'),
# which left indices 40 and 42 unreachable through gujarati_to_index. The second
# pair is replaced by the two dependent vowel signs that follow in Unicode order
# (U+0AE2, U+0AE3), so the list length and every other index are preserved.
# NOTE(review): ASCII digits, ';' and 'ળ' (U+0AB3) are not in this list, so any
# sentence containing them fails the vocabulary check -- confirm that is intended.
gujarati_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                       '૦', '૧', '૨', '૩', '૪', '૫', '૬', '૭', '૮', '૯', ':', '<', '=', '>', '?', '@',
                       'અ', 'આ', 'ઇ', 'ઈ', 'ઉ', 'ઊ', 'ઋ', 'ૠ', 'ઌ', 'ૡ', 'ઍ', 'એ', 'ઐ', 'ઑ', 'ઓ', 'ઔ',
                       'ક', 'ખ', 'ગ', 'ઘ', 'ઙ',
                       'ચ', 'છ', 'જ', 'ઝ', 'ઞ',
                       'ટ', 'ઠ', 'ડ', 'ઢ', 'ણ',
                       'ત', 'થ', 'દ', 'ધ', 'ન',
                       'પ', 'ફ', 'બ', 'ભ', 'મ',
                       'ય', 'ર', 'લ', 'વ', 'શ', 'ષ', 'સ', 'હ', '઼', 'ા', 'િ', 'ી', 'ુ', 'ૂ', 'ૃ', 'ૄ', 'ૅ', 'ે', 'ૈ', 'ૉ', 'ો', 'ૌ', '્', 'ૐ', '\u0ae2', '\u0ae3', 'ં', 'ઃ',
                       PADDING_TOKEN, END_TOKEN]


# Character-level English vocabulary in ASCII order, framed by the same special
# tokens. The entry between '[' and ']' is the backslash; it was previously a
# second "/" that shadowed the real '/' at index 16 in english_to_index.
# NOTE(review): ';' is not in this list either.
english_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                        ':', '<', '=', '>', '?', '@',
                        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                        'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                        'Y', 'Z',
                        "[", "\\", "]", "^", "_", "`",
                        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                        'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                        'y', 'z',
                        '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN]

In [None]:
# Two lookup tables per language: position -> symbol and symbol -> position.
# If a symbol occurs more than once in a vocabulary, its last position wins in
# the symbol -> position table.
index_to_gujarati = dict(enumerate(gujarati_vocabulary))
gujarati_to_index = {symbol: position for position, symbol in enumerate(gujarati_vocabulary)}
index_to_english = dict(enumerate(english_vocabulary))
english_to_index = {symbol: position for position, symbol in enumerate(english_vocabulary)}

In [None]:
# Display the Gujarati symbol -> index table for a visual sanity check.
gujarati_to_index

{'<s>': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 '#': 4,
 '$': 5,
 '%': 6,
 '&': 7,
 "'": 8,
 '(': 9,
 ')': 10,
 '*': 11,
 '+': 12,
 ',': 13,
 '-': 14,
 '.': 15,
 '/': 16,
 '૦': 17,
 '૧': 18,
 '૨': 19,
 '૩': 20,
 '૪': 21,
 '૫': 22,
 '૬': 23,
 '૭': 24,
 '૮': 25,
 '૯': 26,
 ':': 27,
 '<': 28,
 '=': 29,
 '>': 30,
 '?': 31,
 '@': 32,
 'અ': 33,
 'આ': 34,
 'ઇ': 35,
 'ઈ': 36,
 'ઉ': 37,
 'ઊ': 38,
 'ઋ': 39,
 'ૠ': 98,
 'ઌ': 41,
 'ૡ': 99,
 'ઍ': 43,
 'એ': 44,
 'ઐ': 45,
 'ઑ': 46,
 'ઓ': 47,
 'ઔ': 48,
 'ક': 49,
 'ખ': 50,
 'ગ': 51,
 'ઘ': 52,
 'ઙ': 53,
 'ચ': 54,
 'છ': 55,
 'જ': 56,
 'ઝ': 57,
 'ઞ': 58,
 'ટ': 59,
 'ઠ': 60,
 'ડ': 61,
 'ઢ': 62,
 'ણ': 63,
 'ત': 64,
 'થ': 65,
 'દ': 66,
 'ધ': 67,
 'ન': 68,
 'પ': 69,
 'ફ': 70,
 'બ': 71,
 'ભ': 72,
 'મ': 73,
 'ય': 74,
 'ર': 75,
 'લ': 76,
 'વ': 77,
 'શ': 78,
 'ષ': 79,
 'સ': 80,
 'હ': 81,
 '઼': 82,
 'ા': 83,
 'િ': 84,
 'ી': 85,
 'ુ': 86,
 'ૂ': 87,
 'ૃ': 88,
 'ૄ': 89,
 'ૅ': 90,
 'ે': 91,
 'ૈ': 92,
 'ૉ': 93,
 'ો': 94,
 'ૌ': 95,
 '્': 96,
 'ૐ': 97,
 'ં': 100,
 'ઃ': 101,
 '</s>

# **STEP:4 LOAD AND CLEANING THE DATA**

In [None]:
# Read both sides of the parallel corpus; readlines() yields one list entry per
# line, each still carrying its trailing newline.
# The encoding is pinned to UTF-8 so the Gujarati text decodes the same way on
# every machine instead of depending on the platform's default locale encoding.
with open(english, 'r', encoding='utf-8') as file:
    english_sentences = file.readlines()
with open(gujarati, 'r', encoding='utf-8') as file:
    gujarati_sentences = file.readlines()

In [None]:
# Work with at most the first TOTAL_SENTENCES lines of each file and drop the
# trailing newline(s) that readlines() leaves on every entry.
TOTAL_SENTENCES = 200000
english_sentences = [line.rstrip('\n') for line in english_sentences[:TOTAL_SENTENCES]]
gujarati_sentences = [line.rstrip('\n') for line in gujarati_sentences[:TOTAL_SENTENCES]]

In [None]:
# Preview the first ten English sentences.
english_sentences[:10]

['Are you doing online transactions?',
 'Kunwar explains:',
 'A passenger train is sitting at a station.',
 'heavy snow shower',
 'It was plain that their intensive study of the Scriptures over their five months of training had reached their heart and motivated them to share with others what they had learned.',
 'Jesus Christ is overseeing the greatest preaching campaign in history',
 'He had gained victory by a margin of 67,000 votes.',
 'The Moskals immediately included the reading of the Harp book in their regular Bible - reading sessions.',
 'Gas lasers.',
 'Effective December 2 midnight, petrol, diesel and gas outlets will be removed from the exempt category for receipt of old Rs 500 notes']

In [None]:
# Preview the first ten Gujarati sentences (same positions as the English preview).
gujarati_sentences[:10]

['ઓનલાઈન ટ્રાન્ઝેક્શન કરી શકાય?',
 'કુરાન તે વર્ણવે છે:',
 'એક પેસેન્જર ટ્રેન સ્ટેશન પર બેઠેલું છે.',
 'ભારે બરફના ટૂકડાweather forecast',
 'પાંચ મહિનાના કોર્સમાં પોતે જે કંઈ શીખ્યો, એ એક વિદ્યાર્થીએ પોતાના નાના ભાઈને જણાવ્યું.',
 'આજે પૃથ્વી પર થઈ રહેલા મહાન પ્રચાર કાર્યની ઈસુ દેખરેખ રાખે છે',
 'આમ, તેઓ 67,000થી વધુ મતથી જીતી ગયા છે.',
 'મૉસ્કેલ કુટુંબે બાઇબલ સાથે સાથે એ પુસ્તક પણ વાંચવાનું શરૂ કરી દીધું.',
 'ગેસ લેસર્સ.',
 '10 ડિસેમ્બરથી 500 રુપિયની જૂની નોટ રેલવે, મેટ્રો અને બસમાં ચાલવાનું બંધ થઇ જશે']

# **STEP:5 SENTENCE LENGTH ANALYSIS**

In [None]:
# Longest sentence in characters, shown as (Gujarati, English).
max(map(len, gujarati_sentences)), max(map(len, english_sentences))

(1182, 1004)

In [None]:
# Character length below which PERCENTILE percent of the sentences fall, per
# language. Each line is labelled so the two numbers cannot be confused.
PERCENTILE = 99
print( f"{PERCENTILE}th percentile length Gujarati: {np.percentile([len(x) for x in gujarati_sentences], PERCENTILE)}" )
print( f"{PERCENTILE}th percentile length English: {np.percentile([len(x) for x in english_sentences], PERCENTILE)}" )

99th percentile length Gujarati: 227.0
99th percentile length English: 245.0


# **STEP:6 FILTERING VALID SENTENCES**

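Set the maximum sequence length to 300 characters, which is above the 99th-percentile sentence lengths measured in Step 5 (227 for Gujarati, 245 for English).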

In [None]:
# Fixed length of every tokenized sentence: the length filter compares against
# it and tokenize() pads each sequence up to it.
max_sequence_length = 300

In [None]:
def is_valid_tokens(sentence, vocab):
    """Tell whether ``sentence`` uses only tokens contained in ``vocab``.

    Args:
        sentence: Iterable of hashable tokens (a string yields its characters).
        vocab: Container supporting ``in`` for those tokens.

    Returns:
        True when every distinct token is in ``vocab`` (so an empty sentence
        is valid), False as soon as one is missing.
    """
    return all(token in vocab for token in set(sentence))

def is_valid_length(sentence, max_sequence_length):
    """Tell whether ``sentence`` is short enough for ``max_sequence_length``.

    The sentence must be strictly shorter than ``max_sequence_length - 1``
    tokens, i.e. at least two positions of the sequence remain free.

    Args:
        sentence: Iterable of tokens (a string yields its characters).
        max_sequence_length: Total number of positions available.

    Returns:
        True if the token count is below ``max_sequence_length - 1``.
    """
    token_count = len(list(sentence))
    return token_count + 1 < max_sequence_length

# Collect the positions of sentence pairs that can be tokenized safely:
# both sides must fit in max_sequence_length and both sides must consist only
# of characters from their own vocabulary. The English vocabulary check was
# previously missing, so an English sentence with an unlisted character passed
# the filter and later raised KeyError inside tokenize().
valid_sentence_indicies = []
for index in range(len(gujarati_sentences)):
    gujarati_sentence, english_sentence = gujarati_sentences[index], english_sentences[index]
    if is_valid_length(gujarati_sentence, max_sequence_length) \
      and is_valid_length(english_sentence, max_sequence_length) \
      and is_valid_tokens(gujarati_sentence, gujarati_vocabulary) \
      and is_valid_tokens(english_sentence, english_vocabulary):
        valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(gujarati_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

Number of sentences: 200000
Number of valid sentences: 139624


In [None]:
# Keep only the sentence pairs whose positions were collected in
# valid_sentence_indicies.
# NOTE: this overwrites both lists in place, so the cell is not idempotent --
# re-run the loading and filtering cells before running it a second time.
gujarati_sentences = list(map(gujarati_sentences.__getitem__, valid_sentence_indicies))
english_sentences = list(map(english_sentences.__getitem__, valid_sentence_indicies))

# **STEP:7 DATASET CREATION**

In [None]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Dataset pairing the English and Gujarati sentences at the same position."""

    def __init__(self, english_sentences, gujarati_sentences):
        """Store the two sentence sequences as given (no copy, no validation)."""
        self.gujarati_sentences = gujarati_sentences
        self.english_sentences = english_sentences

    def __len__(self):
        """Return the number of pairs, taken from the English side."""
        return len(self.english_sentences)

    def __getitem__(self, idx):
        """Return the ``(english, gujarati)`` pair stored at ``idx``."""
        english_text = self.english_sentences[idx]
        gujarati_text = self.gujarati_sentences[idx]
        return english_text, gujarati_text

In [None]:
# Wrap the filtered sentence lists so they can be fed to a DataLoader.
dataset = TextDataset(
    english_sentences=english_sentences,
    gujarati_sentences=gujarati_sentences,
)

In [None]:
# Spot-check one (English, Gujarati) pair.
dataset[3]

('It was plain that their intensive study of the Scriptures over their five months of training had reached their heart and motivated them to share with others what they had learned.',
 'પાંચ મહિનાના કોર્સમાં પોતે જે કંઈ શીખ્યો, એ એક વિદ્યાર્થીએ પોતાના નાના ભાઈને જણાવ્યું.')

# **STEP:8 DATALOADER AND BATCH PROCESSING**

In [None]:
batch_size = 10
train_loader = DataLoader(dataset, batch_size=batch_size)
iterator = iter(train_loader)

# Print the first five batches (batch_num 0 through 4) and stop.
# `batch` keeps the last printed batch after the loop; the tokenization cell
# further down reads it.
for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num >= 4:
        break

[('Are you doing online transactions?', 'Kunwar explains:', 'A passenger train is sitting at a station.', 'It was plain that their intensive study of the Scriptures over their five months of training had reached their heart and motivated them to share with others what they had learned.', 'Jesus Christ is overseeing the greatest preaching campaign in history', 'The Moskals immediately included the reading of the Harp book in their regular Bible - reading sessions.', 'Gas lasers.', 'Then the job.', 'Australia announce ODI squad for India series', 'This was another topping.'), ('ઓનલાઈન ટ્રાન્ઝેક્શન કરી શકાય?', 'કુરાન તે વર્ણવે છે:', 'એક પેસેન્જર ટ્રેન સ્ટેશન પર બેઠેલું છે.', 'પાંચ મહિનાના કોર્સમાં પોતે જે કંઈ શીખ્યો, એ એક વિદ્યાર્થીએ પોતાના નાના ભાઈને જણાવ્યું.', 'આજે પૃથ્વી પર થઈ રહેલા મહાન પ્રચાર કાર્યની ઈસુ દેખરેખ રાખે છે', 'મૉસ્કેલ કુટુંબે બાઇબલ સાથે સાથે એ પુસ્તક પણ વાંચવાનું શરૂ કરી દીધું.', 'ગેસ લેસર્સ.', 'પછી તો કામ જ કામ છે.', 'ઓસ્ટ્રેલિયા સામેની વનડે શ્રેણી માટે ટીમ ઈન્ડિયાની જા

# **STEP:9 TOKENIZATION**

In [None]:
def tokenize(sentence, language_to_index, start_token=True, end_token=True):
    """Convert a sentence into a padded tensor of vocabulary indices.

    Args:
        sentence: Iterable of tokens (a string yields its characters).
        language_to_index: Mapping from token to integer index; it must also
            contain any special token that ends up being used.
        start_token: Prepend the index of START_TOKEN when true.
        end_token: Append the index of END_TOKEN when true.

    Returns:
        A 1-D ``torch.Tensor`` of indices, right-padded with the index of
        PADDING_TOKEN up to ``max_sequence_length``. Longer inputs are not
        truncated and come back at their full length.

    Raises:
        KeyError: If a token is missing from ``language_to_index``.
    """
    token_ids = [language_to_index[symbol] for symbol in sentence]
    if start_token:
        token_ids = [language_to_index[START_TOKEN]] + token_ids
    if end_token:
        token_ids = token_ids + [language_to_index[END_TOKEN]]
    # Fill whatever is left of the fixed-length window with padding.
    shortfall = max_sequence_length - len(token_ids)
    if shortfall > 0:
        token_ids.extend([language_to_index[PADDING_TOKEN]] * shortfall)
    return torch.tensor(token_ids)

In [None]:
# Tokenize the sentence pairs held in `batch` (the last batch produced by the
# DataLoader preview loop). English gets no start/end markers; Gujarati gets
# both. The loop walks the pairs actually present in the batch rather than
# range(batch_size), so a final batch with fewer than batch_size items no
# longer raises IndexError.
eng_tokenized, gu_tokenized = [], []
for eng_sentence, gu_sentence in zip(batch[0], batch[1]):
    eng_tokenized.append( tokenize(eng_sentence, english_to_index, start_token=False, end_token=False) )
    gu_tokenized.append( tokenize(gu_sentence, gujarati_to_index, start_token=True, end_token=True) )
# Stack the per-sentence tensors into one tensor per language, one row per sentence.
eng_tokenized = torch.stack(eng_tokenized)
gu_tokenized = torch.stack(gu_tokenized)

In [None]:
# Display the stacked Gujarati index tensor.
gu_tokenized

tensor([[  0,  74,  94,  ..., 102, 102, 102],
        [  0,  34,  80,  ..., 102, 102, 102],
        [  0,  64,  91,  ..., 102, 102, 102],
        ...,
        [  0,  69,  96,  ..., 102, 102, 102],
        [  0,  49,  91,  ..., 102, 102, 102],
        [  0,  44,  77,  ..., 102, 102, 102]])