## Import Libraries

In [2]:
import torch
import torch.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import unicodedata
import re
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'torch.__version__: {torch.__version__}')
print(f'torch device: {device}')

torch.__version__: 1.7.0+cu101
torch device: cuda


In [50]:
print(torch.__version__)

1.7.0+cu101


## Import Data from Google Drive

In [4]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [5]:
f = open('/gdrive/My Drive/Data Science/Capstone Project 2/fra.txt', encoding='UTF-8').read().strip().split('\n')

In [6]:
lines = f

In [7]:
# sample size (try with smaller sample size to reduce computation)
num_examples = 30000 

# creates lists containing each pair
original_word_pairs = [[w for w in l.split('\t')] for l in lines[:num_examples]]

In [8]:
data = pd.DataFrame(original_word_pairs, columns=["eng", "fr", "info"])
data = data.drop(columns="info", axis=1)

In [9]:
data.head(5)

Unnamed: 0,eng,fr
0,Go.,Va !
1,Hi.,Salut !
2,Hi.,Salut.
3,Run!,Cours !
4,Run!,Courez !


In [10]:
# Converts the unicode file to ascii
def unicode_to_ascii(s):
    """
    Normalizes latin chars with accent to their canonical decomposition
    """
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ." 
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    
    w = w.rstrip().strip()
    
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

## Data Exploration

In [11]:
# Now we do the preprocessing using pandas and lambdas
data["eng"] = data.eng.apply(lambda w: preprocess_sentence(w))
data["fr"] = data.fr.apply(lambda w: preprocess_sentence(w))
data.sample(10)

Unnamed: 0,eng,fr
29203,<start> he s got a headache . <end>,<start> il a mal a la tete . <end>
17395,<start> what s your name ? <end>,<start> comment tu t appelles ? <end>
21100,<start> the bath is ready . <end>,<start> le bain est pret . <end>
9899,<start> you re awesome . <end>,<start> tu es geniale . <end>
18581,<start> he can t help you . <end>,<start> il ne peut pas vous aider . <end>
25269,<start> is this seat empty ? <end>,<start> cette place est elle libre ? <end>
21848,<start> we have some time . <end>,<start> nous avons un peu de temps . <end>
25332,<start> it started to snow . <end>,<start> il commenca a neiger . <end>
16146,<start> please don t ask . <end>,"<start> ne demande pas , je te prie ! <end>"
22656,<start> you re contagious . <end>,<start> tu es contagieuse . <end>


## Building Vocabulary Index

In [12]:
# This class creates a word -> index mapping (e.g,. "dad" -> 5) and vice-versa 
# (e.g., 5 -> "dad") for each language,
class LanguageIndex():
    def __init__(self, lang):
        """ lang are the list of phrases from each language"""
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()
        
        self.create_index()
        
    def create_index(self):
        for phrase in self.lang:
            # update with individual tokens
            self.vocab.update(phrase.split(' '))
            
        # sort the vocab
        self.vocab = sorted(self.vocab)

        # add a padding token with index 0
        self.word2idx['<pad>'] = 0
        
        # word to index mapping
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 1 # +1 because of pad token
        
        # index to word mapping
        for word, index in self.word2idx.items():
            self.idx2word[index] = word        

In [13]:
# index language using the class above
inp_lang = LanguageIndex(data["fr"].values.tolist())
targ_lang = LanguageIndex(data["eng"].values.tolist())
# Vectorize the input and target languages
input_tensor = [[inp_lang.word2idx[s] for s in fr.split(' ')]  for fr in data["fr"].values.tolist()]
target_tensor = [[targ_lang.word2idx[s] for s in eng.split(' ')]  for eng in data["eng"].values.tolist()]
input_tensor[:10]

[[5, 7313, 1, 4],
 [5, 6391, 1, 4],
 [5, 6391, 3, 4],
 [5, 1674, 1, 4],
 [5, 1667, 1, 4],
 [5, 5768, 6, 4],
 [5, 986, 275, 1, 4],
 [5, 589, 3088, 1, 4],
 [5, 7, 4065, 176, 1, 4],
 [5, 6427, 3, 4]]

In [14]:
target_tensor[:10]

[[5, 1629, 3, 4],
 [5, 1821, 3, 4],
 [5, 1821, 3, 4],
 [5, 3221, 1, 4],
 [5, 3221, 1, 4],
 [5, 4259, 6, 4],
 [5, 4331, 1, 4],
 [5, 1419, 1, 4],
 [5, 1806, 1, 4],
 [5, 2099, 3, 4]]

In [15]:
list(inp_lang.idx2word.values())[:10]

['<pad>',
 '!',
 ',',
 '.',
 '<end>',
 '<start>',
 '?',
 'a',
 'abandonna',
 'abandonne']

In [16]:
list(inp_lang.word2idx.keys())[:10]

['<pad>',
 '!',
 ',',
 '.',
 '<end>',
 '<start>',
 '?',
 'a',
 'abandonna',
 'abandonne']

In [17]:
list(inp_lang.word2idx.keys())[1350:1450]

['cocufiee',
 'coffre',
 'cogna',
 'cogne',
 'cogner',
 'coiffe',
 'coiffee',
 'coin',
 'coince',
 'coincee',
 'coincees',
 'coinces',
 'coincidence',
 'colere',
 'collant',
 'collation',
 'colle',
 'collectif',
 'collectionne',
 'collegue',
 'coller',
 'colocataire',
 'coma',
 'combat',
 'combattre',
 'combattrons',
 'combien',
 'comedien',
 'comediens',
 'comedies',
 'commande',
 'commanderai',
 'commandes',
 'comme',
 'commenca',
 'commencames',
 'commence',
 'commencement',
 'commencer',
 'commencerai',
 'commencerons',
 'commences',
 'commencez',
 'commencons',
 'comment',
 'commentaire',
 'commere',
 'commettons',
 'commis',
 'commises',
 'compagnie',
 'compagnon',
 'compare',
 'compassion',
 'compatis',
 'competitif',
 'completement',
 'complexe',
 'compliment',
 'complique',
 'complot',
 'comporte',
 'comportez',
 'composer',
 'comprehensible',
 'comprenait',
 'comprend',
 'comprendra',
 'comprendre',
 'comprendront',
 'comprends',
 'comprenez',
 'comprenons',
 'compris',
 'com

In [18]:
def max_length(tensor):
    return max(len(t) for t in tensor)

In [19]:
# calculate the max_length of input and output tensor
max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)

In [20]:
def pad_sequences(x, max_len):
    padded = np.zeros((max_len), dtype=np.int64)
    if len(x) > max_len: 
        padded[:] = x[:max_len]
    else: 
        padded[:len(x)] = x
    return padded

In [21]:
# inplace padding
input_tensor = [pad_sequences(x, max_length_inp) for x in input_tensor]
target_tensor = [pad_sequences(x, max_length_tar) for x in target_tensor]
len(target_tensor)

30000

In [22]:
# Creating training and validation sets using an 80-20 split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

# Show length
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

(24000, 24000, 6000, 6000)

## Load Data into DataLoader for Batching

In [23]:
from torch.utils.data import Dataset, DataLoader

In [24]:
# convert the data to tensors and pass to the Dataloader 
# to create an batch iterator

class MyData(Dataset):
    def __init__(self, X, y):
        self.data = X
        self.target = y
        # TODO: convert this into torch code is possible
        self.length = [ np.sum(1 - np.equal(x, 0)) for x in X]
        
    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        x_len = self.length[index]
        return x,y,x_len
    
    def __len__(self):
        return len(self.data)

## Parameters

Defining hyperparameters and other things we'll need to train NMT model

In [25]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
N_BATCH = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word2idx)
vocab_tar_size = len(targ_lang.word2idx)

train_dataset = MyData(input_tensor_train, target_tensor_train)
val_dataset = MyData(input_tensor_val, target_tensor_val)

dataset = DataLoader(train_dataset, batch_size = BATCH_SIZE, 
                     drop_last=True,
                     shuffle=True)

In [26]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.GRU(self.embedding_dim, self.enc_units)
        
    def forward(self, x, lens, device):
        # x: batch_size, max_length 
        
        # x: batch_size, max_length, embedding_dim
        x = self.embedding(x) 
                
        # x transformed = max_len X batch_size X embedding_dim
        # x = x.permute(1,0,2)
        x = pack_padded_sequence(x, lens) # unpad
    
        self.hidden = self.initialize_hidden_state(device)
        
        # output: max_length, batch_size, enc_units
        # self.hidden: 1, batch_size, enc_units
        output, self.hidden = self.gru(x, self.hidden) # gru returns hidden state of all timesteps as well as hidden state at last timestep
        
        # pad the sequence to the max length in the batch
        output, _ = pad_packed_sequence(output)
        
        return output, self.hidden

    def initialize_hidden_state(self, device):
        return torch.zeros((1, self.batch_sz, self.enc_units)).to(device)

In [27]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.GRU(self.embedding_dim, self.enc_units)
        
    def forward(self, x, lens, device):
        # x: batch_size, max_length 
        
        # x: batch_size, max_length, embedding_dim
        x = self.embedding(x) 
                
        # x transformed = max_len X batch_size X embedding_dim
        # x = x.permute(1,0,2)
        x = pack_padded_sequence(x, lens) # unpad
    
        self.hidden = self.initialize_hidden_state(device)
        
        # output: max_length, batch_size, enc_units
        # self.hidden: 1, batch_size, enc_units
        output, self.hidden = self.gru(x, self.hidden) # gru returns hidden state of all timesteps as well as hidden state at last timestep
        
        # pad the sequence to the max length in the batch
        output, _ = pad_packed_sequence(output)
        
        return output, self.hidden

    def initialize_hidden_state(self, device):
        return torch.zeros((1, self.batch_sz, self.enc_units)).to(device)

In [28]:
### sort batch function to be able to use with pad_packed_sequence
def sort_batch(X, y, lengths):
    lengths, indx = lengths.sort(dim=0, descending=True)
    X = X[indx]
    y = y[indx]
    return X.transpose(0,1), y, lengths # transpose (batch x seq) to (seq x batch)

## Testing the Encoder

In [29]:
### Testing Encoder part
# TODO: put whether GPU is available or not
# Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

encoder.to(device)
# obtain one sample from the data iterator
it = iter(dataset)
x, y, x_len = next(it)

# sort the batch first to be able to use with pac_pack_sequence
xs, ys, lens = sort_batch(x, y, x_len)

enc_output, enc_hidden = encoder(xs.to(device), lens, device)

print(enc_output.size()) # max_length, batch_size, enc_units

torch.Size([10, 64, 1024])


In [30]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, dec_units, enc_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.GRU(self.embedding_dim + self.enc_units, 
                          self.dec_units,
                          batch_first=True)
        self.fc = nn.Linear(self.enc_units, self.vocab_size)
        
        # used for attention
        self.W1 = nn.Linear(self.enc_units, self.dec_units)
        self.W2 = nn.Linear(self.enc_units, self.dec_units)
        self.V = nn.Linear(self.enc_units, 1)
    
    def forward(self, x, hidden, enc_output):
        # enc_output original: (max_length, batch_size, enc_units)
        # enc_output converted == (batch_size, max_length, hidden_size)
        enc_output = enc_output.permute(1,0,2)
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        hidden_with_time_axis = hidden.permute(1, 0, 2)
        
        # score: (batch_size, max_length, hidden_size) # Bahdanaus's
        # we get 1 at the last axis because we are applying tanh(FC(EO) + FC(H)) to self.V
        # It doesn't matter which FC we pick for each of the inputs
        score = torch.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis))
        
        #score = torch.tanh(self.W2(hidden_with_time_axis) + self.W1(enc_output))
          
        # attention_weights shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        attention_weights = torch.softmax(self.V(score), dim=1)
        
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * enc_output
        context_vector = torch.sum(context_vector, dim=1)
        
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        # takes case of the right portion of the model above (illustrated in red)
        x = self.embedding(x)
        
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        #x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        # ? Looks like attention vector in diagram of source
        x = torch.cat((context_vector.unsqueeze(1), x), -1)
        
        # passing the concatenated vector to the GRU
        # output: (batch_size, 1, hidden_size)
        output, state = self.gru(x)
        
        
        # output shape == (batch_size * 1, hidden_size)
        output =  output.view(-1, output.size(2))
        
        # output shape == (batch_size * 1, vocab)
        x = self.fc(output)
        
        return x, state, attention_weights
    
    def initialize_hidden_state(self):
        return torch.zeros((1, self.batch_sz, self.dec_units))

## Testing the Decoder

In [31]:
# Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

encoder.to(device)
# obtain one sample from the data iterator
it = iter(dataset)
x, y, x_len = next(it)

print("Input: ", x.shape)
print("Output: ", y.shape)

# sort the batch first to be able to use with pac_pack_sequence
xs, ys, lens = sort_batch(x, y, x_len)

enc_output, enc_hidden = encoder(xs.to(device), lens, device)
print("Encoder Output: ", enc_output.shape) # batch_size X max_length X enc_units
print("Encoder Hidden: ", enc_hidden.shape) # batch_size X enc_units (corresponds to the last state)

decoder = Decoder(vocab_tar_size, embedding_dim, units, units, BATCH_SIZE)
decoder = decoder.to(device)

#print(enc_hidden.squeeze(0).shape)

dec_hidden = enc_hidden#.squeeze(0)
dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * BATCH_SIZE)
print("Decoder Input: ", dec_input.shape)
print("--------")

for t in range(1, y.size(1)):
    # enc_hidden: 1, batch_size, enc_units
    # output: max_length, batch_size, enc_units
    predictions, dec_hidden, _ = decoder(dec_input.to(device), 
                                         dec_hidden.to(device), 
                                         enc_output.to(device))
    
    print("Prediction: ", predictions.shape)
    print("Decoder Hidden: ", dec_hidden.shape)
    
    #loss += loss_function(y[:, t].to(device), predictions.to(device))
    
    dec_input = y[:, t].unsqueeze(1)
    print(dec_input.shape)
    break

Input:  torch.Size([64, 17])
Output:  torch.Size([64, 10])
Encoder Output:  torch.Size([11, 64, 1024])
Encoder Hidden:  torch.Size([1, 64, 1024])
Decoder Input:  torch.Size([64, 1])
--------
Prediction:  torch.Size([64, 4367])
Decoder Hidden:  torch.Size([1, 64, 1024])
torch.Size([64, 1])


In [32]:
lens.shape

torch.Size([64])

In [33]:
xs.shape

torch.Size([17, 64])

In [34]:
ys.shape

torch.Size([64, 10])

In [35]:
criterion = nn.CrossEntropyLoss()

def loss_function(real, pred):
    """ Only consider non-zero inputs in the loss; mask needed """
    #mask = 1 - np.equal(real, 0) # assign 0 to all above 0 and 1 to all 0s
    #print(mask)
    mask = real.ge(1).type(torch.cuda.FloatTensor)
    
    loss_ = criterion(pred, real) * mask 
    return torch.mean(loss_)

In [36]:
# Device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## TODO: Combine the encoder and decoder into one class
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, units, BATCH_SIZE)

encoder.to(device)
decoder.to(device)

optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), 
                       lr=0.001)

## Training

In [37]:
EPOCHS = 2

for epoch in range(EPOCHS):
    start = time.time()
    
    encoder.train()
    decoder.train()
    
    total_loss = 0
    
    for (batch, (inp, targ, inp_len)) in enumerate(dataset):
        loss = 0
        
        xs, ys, lens = sort_batch(inp, targ, inp_len)
        enc_output, enc_hidden = encoder(xs.to(device), lens, device)
        dec_hidden = enc_hidden
        
        # use teacher forcing - feeding the target as the next input (via dec_input)
        dec_input = torch.tensor([[targ_lang.word2idx['<start>']]] * BATCH_SIZE)
        
        # run code below for every timestep in the ys batch
        for t in range(1, ys.size(1)):
            predictions, dec_hidden, _ = decoder(dec_input.to(device), 
                                         dec_hidden.to(device), 
                                         enc_output.to(device))
            loss += loss_function(ys[:, t].to(device), predictions.to(device))
            #loss += loss_
            dec_input = ys[:, t].unsqueeze(1)
            
        
        batch_loss = (loss / int(ys.size(1)))
        total_loss += batch_loss
        
        optimizer.zero_grad()
        
        loss.backward()

        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.detach().item()))
        
        
    ### TODO: Save checkpoint for model
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / N_BATCH))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.8987
Epoch 1 Batch 100 Loss 1.5722
Epoch 1 Batch 200 Loss 1.1926
Epoch 1 Batch 300 Loss 1.0822
Epoch 1 Loss 1.4488
Time taken for 1 epoch 19.529693841934204 sec

Epoch 2 Batch 0 Loss 0.7467
Epoch 2 Batch 100 Loss 0.7385
Epoch 2 Batch 200 Loss 0.5799
Epoch 2 Batch 300 Loss 0.5989
Epoch 2 Loss 0.6919
Time taken for 1 epoch 19.766449213027954 sec



In [39]:
print(' '.join([targ_lang.idx2word[i] for i in target_tensor[10000]]))
print(' '.join([inp_lang.idx2word[i] for i in input_tensor[10000]]))

<start> you re welcome . <end> <pad> <pad> <pad> <pad>
<start> de rien ! <end> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [40]:
for (inp, targ, inp_len) in dataset:
    break

In [41]:
print(inp)
print(targ)
print(inp_len)
#xs, ys, lens = sort_batch(inp, targ, inp_len)
#enc_output, enc_hidden = encoder(xs.to(device), lens, device=device)
#dec_hidden = enc_hidden

tensor([[   5, 3973, 4302,  ...,    0,    0,    0],
        [   5,  985, 2803,  ...,    0,    0,    0],
        [   5, 7264, 4237,  ...,    0,    0,    0],
        ...,
        [   5,  985, 2803,  ...,    0,    0,    0],
        [   5, 7084, 4237,  ...,    0,    0,    0],
        [   5, 7264,  494,  ...,    0,    0,    0]])
tensor([[   5, 1922, 1202,    7, 2281,    3,    4,    0,    0,    0],
        [   5, 3887, 2044, 1894,    3,    4,    0,    0,    0,    0],
        [   5, 4357,  130, 2353,    3,    4,    0,    0,    0,    0],
        [   5, 1922, 2458, 4357,    3,    4,    0,    0,    0,    0],
        [   5, 1922, 3042, 1909,    3,    4,    0,    0,    0,    0],
        [   5, 3940, 2044, 3236,    3,    4,    0,    0,    0,    0],
        [   5, 1922, 2518,    7, 2814,    3,    4,    0,    0,    0],
        [   5, 2190, 3865,  109,    3,    4,    0,    0,    0,    0],
        [   5, 1892,   14, 3942,    6,    4,    0,    0,    0,    0],
        [   5, 4210, 3273, 3545,    3,    4,

In [48]:
batch_size = 9
def french_to_english(french_sentence):
    french_tensor = [inp_lang.word2idx[w] for w in french_sentence.split()]
    
    lens = torch.tensor([len(french_tensor)] * batch_size)
    lens = lens.to(device)

    x = torch.tensor([list(french_tensor)] * batch_size)
    x = x.to(device)

    # xs, ys, lens = sort_batch(inp, targ, inp_len)
    enc_output, enc_hidden = encoder(x=x, lens=lens, device=device)
    
    # encode(french_sentence)
    return enc_output

In [49]:
french_to_english('<start> il ne peut pas vous aider . <end>')

RuntimeError: ignored