In [1]:
# IPython candies...
from IPython.display import Image
from IPython.core.display import HTML

In [2]:
# Imports we need.
import random
import pickle

import numpy as np
from tqdm import tqdm

from gensim.corpora import Dictionary

import torch
from torch import nn, optim, tensor, autograd
from torch.autograd import Variable
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

device = 'cuda' if torch.cuda.is_available else 'cpu'

In [3]:
try: # Use the default NLTK tokenizer.
    from nltk import word_tokenize, sent_tokenize 
    # Testing whether it works. 
    # Sometimes it doesn't work on some machines because of setup issues.
    word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])
except: # Use a naive sentence tokenizer and toktok.
    import re
    from nltk.tokenize import ToktokTokenizer
    # See https://stackoverflow.com/a/25736515/610569
    sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
    # Use the toktok tokenizer that requires no dependencies.
    toktok = ToktokTokenizer()
    word_tokenize = word_tokenize = toktok.tokenize

In [9]:
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from nltk import word_tokenize

# Reads the tab-delimited data using Pandas.
kopitiam = pd.read_csv('kopitiam.csv')
kopitiam.head()

Unnamed: 0,Local Terms,Meaning,Source
0,Kopi O,Black Coffee with Sugar,https://daneshd.com/2010/02/28/a-rough-guide-t...
1,Kopi,Black Coffee with Condensed Milk,https://daneshd.com/2010/02/28/a-rough-guide-t...
2,Kopi C,Black Coffee with Evaporated Milk,https://daneshd.com/2010/02/28/a-rough-guide-t...
3,Kopi Kosong,Black Coffee without sugar or milk,https://daneshd.com/2010/02/28/a-rough-guide-t...
4,Kopi Gah Dai,Black Coffee with extra condensed milk,https://daneshd.com/2010/02/28/a-rough-guide-t...


In [10]:
# Use a unique string to indicate START and END of a sentence.
# Assign a unique index to them.
START, START_IDX = '<s>',  0
END, END_IDX = '</s>', 1

# We use this idiom to tokenize our sentences in the dataframe column:
# >>> DataFrame['column'].apply(str.lower).apply(word_tokenize)

# Also we added the START and the END symbol to the sentences. 
singlish_sents = [START] + kopitiam['Local Terms'].apply(str.lower).apply(word_tokenize) + [END]
english_sents = [START] + kopitiam['Meaning'].apply(str.lower).apply(word_tokenize) + [END]


In [11]:
# We're sort of getting into the data into the shape we want. 
# But now it's still too humanly readable and redundant.
## Cut-away: Computers like it to be simpler, more concise. -_-|||
print('First Singlish sentence:\t', singlish_sents[0])
print('First English sentence:\t\t', english_sents[0])

First Singlish sentence:	 ['<s>', 'kopi', 'o', '</s>']
First English sentence:		 ['<s>', 'black', 'coffee', 'with', 'sugar', '</s>']


In [12]:
class KopitiamDataset(Dataset):
    def __init__(self, src_sents, trg_sents, max_len=-1):
        assert len(src_sents) == len(trg_sents), "There should be the same no. of sentence for both source and target."
        self.src_sents = src_sents
        self.trg_sents = trg_sents

        # Create the vocabulary for both the source and target.
        self.src_vocab = Dictionary(src_sents)
        self.trg_vocab = Dictionary(trg_sents)
        
        # Patch the vocabularies and add the <pad> and <unk> symbols.
        special_tokens = {'<pad>': 0, '<unk>':1, '<s>':2, '</s>':3}
        self.src_vocab.patch_with_special_tokens(special_tokens)
        self.trg_vocab.patch_with_special_tokens(special_tokens)
        
        # Keep track of how many data points.
        self._len = len(src_sents)
        
        if max_len < 0:
            # If it's not set, find the longest text in the data.
            max_src_len = max(len(sent) for sent in src_sents)
            max_trg_len = max(len(sent) for sent in trg_sents)
            self.max_len = max(max_src_len, max_trg_len)
        else:
            self.max_len = max_len
        
    def pad_sequence(self, vectorized_sent, max_len):
        # To pad the sentence:
        # Pad left = 0; Pad right = max_len - len of sent.
        pad_dim = (0, max_len - len(vectorized_sent))
        return F.pad(vectorized_sent, pad_dim, 'constant')
        
    def __getitem__(self, index):
        vectorized_src = self.vectorize(self.src_vocab, self.src_sents[index])
        vectorized_trg = self.vectorize(self.trg_vocab, self.trg_sents[index])
        
        return {'x':self.pad_sequence(vectorized_src, self.max_len), 
                'y':self.pad_sequence(vectorized_trg, self.max_len), 
                'x_len':len(vectorized_src), 
                'y_len':len(vectorized_trg)}
    
    def __len__(self):
        return self._len
    
    def vectorize(self, vocab, tokens):
        """
        :param tokens: Tokens that should be vectorized. 
        :type tokens: list(str)
        """
        # See https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2idx 
        # Lets just cast list of indices into torch tensors directly =)
        return torch.tensor(vocab.doc2idx(tokens))
    
    def unvectorize(self, vocab, indices):
        """
        :param indices: Converts the indices back to tokens.
        :type tokens: list(int)
        """
        return [vocab[i] for i in indices]

In [13]:

kopi_data = KopitiamDataset(singlish_sents, english_sents)

In [14]:
batch_size = 3
dataloader = DataLoader(dataset=kopi_data, 
                        batch_size=batch_size, 
                        shuffle=True)

def sort_batch_by_len(data_dict):
    # Sort indices of data in batch by lengths.
    sorted_indices = np.array(data_dict['x_len']).argsort()[::-1].tolist()
    data_batch = {name:_tensor[sorted_indices]
                  for name, _tensor in data_dict.items()}
    return data_batch

for data_dict in dataloader:
    data_batch = sort_batch_by_len(data_dict)
    print(data_batch)
    break

{'x': tensor([[ 2, 68, 67,  9,  8,  6,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0],
        [ 2, 49, 46,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0],
        [ 2, 68, 50,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0]]), 'y': tensor([[  2,  24, 117, 116,   5,  25, 116,  27,  23,  25,   4,   3,   0,   0,
           0,   0,   0,   0,   0],
        [  2,  56,   5,  55,  54,   3,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0],
        [  2,  82,  26,  83,  84,  61, 116,   5,  59,  19,  57,  81,  80,  76,
          74,  70,  67,   3,   0]]), 'x_len': tensor([7, 4, 4]), 'y_len': tensor([12,  6, 18])}


In [15]:
# For example, we have tensors with variable lengths.
t1 = torch.tensor([2, 13, 5, 8, 3]).float()
t2 = torch.tensor([2, 10, 3]).float()
t3 = torch.tensor([2, 10, 1, 3]).float()

In [17]:
# If the dimensions are different, you can't even stack them into a matrix.
torch.stack([t1, t2, t3])

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 5 and 3 in dimension 1 at c:\a\w\1\s\tmp_conda_3.6_091443\conda\conda-bld\pytorch_1544087948354\work\aten\src\th\generic/THTensorMoreMath.cpp:1333

In [18]:
# So we have to pad them, e.g. 
_max_len = max(len(t1), len(t2), len(t3))

t1 = F.pad(t1, (0, _max_len-len(t1)), 'constant')
t2 = F.pad(t2, (0, _max_len-len(t2)), 'constant')
t3 = F.pad(t3, (0, _max_len-len(t3)), 'constant')

torch.stack([t1, t2, t3])

tensor([[ 2., 13.,  5.,  8.,  3.],
        [ 2., 10.,  3.,  0.,  0.],
        [ 2., 10.,  1.,  3.,  0.]])

In [19]:
batch_size = 3
dataloader = DataLoader(dataset=kopi_data, batch_size=batch_size, 
                        shuffle=False)

# Hack to make dataloader give us the first batch.
data_batch = next(iter(dataloader)) 

data_batch['x']

tensor([[ 2, 68, 67,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0],
        [ 2, 68,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0],
        [ 2, 68,  4,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0]])

In [20]:
data_batch['x_len']

tensor([4, 3, 4])

In [21]:
packed_tensor = pack_padded_sequence(data_batch['x'],
                                     data_batch['x_len'], 
                                     batch_first=True)
packed_tensor

RuntimeError: 'lengths' array has to be sorted in decreasing order

In [22]:
batch_size = 3
dataloader = DataLoader(dataset=kopi_data, batch_size=batch_size, shuffle=True)

def sort_batch_by_len(data_batch):
    # Sort indices of data in batch by lengths.
    sorted_indices = np.array(data_dict['x_len']).argsort()[::-1].tolist()
    data_batch = {name:_tensor[sorted_indices]
                  for name, _tensor in data_dict.items()}
    return data_batch


# Hack to make dataloader give us the first batch.
data_batch = next(iter(dataloader)) 

# Apply the `pack_padded_sequence` to the batch.
data_batch = sort_batch_by_len(data_batch)
packed_tensor = pack_padded_sequence(data_batch['x'], data_batch['x_len'], batch_first=True)
packed_tensor

PackedSequence(data=tensor([ 2,  2,  2, 68, 49, 68, 67, 46, 50,  9,  3,  3,  8,  6,  3]), batch_sizes=tensor([3, 3, 3, 3, 1, 1, 1]))

In [23]:
_padded_x_tensors = tensor(
    [[ 2, 13,  6,  3,  0,  0,  0,  0,  0],
     [ 2, 68,  3,  0,  0,  0,  0,  0,  0],
     [ 2, 68, 67,  5,  3,  0,  0,  0,  0]
    ])

_tensor_lens = [4, 3, 5]

In [24]:
# First, to perform any operations within the tensors, 
# it's easiest to cast them into numpy arrays
np.array(_tensor_lens)

array([4, 3, 5])

In [25]:
# Using the `.argsort()` function in the numpy array
# would return an array of the indices sorted by their
# values in ascending order.
np.array(_tensor_lens).argsort()

array([1, 0, 2], dtype=int64)

In [26]:
# But for the `pack_padded_sequence`, we want our 
# tensor lengths to be sorted in a descending order, 
# so we do a reverse.

# Normally you can use the native python `reversed()` 
# function and the idiom looks as below, but that will
# lose the np.array object
list(reversed(np.array(_tensor_lens).argsort()))

[2, 0, 1]

In [27]:
# To keep the np.array object, 
# we can use the [::-1] slice notion to reverse the array.
# See https://stackoverflow.com/a/31633656/610569 
np.array(_tensor_lens).argsort()[::-1]

array([2, 0, 1], dtype=int64)

In [28]:
# Sort the indices by descending order.
sorted_indices = np.array(_tensor_lens).argsort()[::-1].tolist()
# Use the slice notation on the tensor to reorder the tensor.
print(_tensor_lens)
print(_padded_x_tensors)
print()
print(sorted(_tensor_lens, reverse=True))
print(_padded_x_tensors[sorted_indices])

[4, 3, 5]
tensor([[ 2, 13,  6,  3,  0,  0,  0,  0,  0],
        [ 2, 68,  3,  0,  0,  0,  0,  0,  0],
        [ 2, 68, 67,  5,  3,  0,  0,  0,  0]])

[5, 4, 3]
tensor([[ 2, 68, 67,  5,  3,  0,  0,  0,  0],
        [ 2, 13,  6,  3,  0,  0,  0,  0,  0],
        [ 2, 68,  3,  0,  0,  0,  0,  0,  0]])


In [29]:
# If you didn't specify the batch_first argument, you see that our tensors got transposed.
pad_packed_sequence(packed_tensor, batch_first=False)

(tensor([[ 2,  2,  2],
         [68, 49, 68],
         [67, 46, 50],
         [ 9,  3,  3],
         [ 8,  0,  0],
         [ 6,  0,  0],
         [ 3,  0,  0]]), tensor([7, 4, 4]))

In [30]:
# To get back the original tensor before packing, we set batch_first=True.
unpacked_tensor, unpacked_tensor_len = pad_packed_sequence(packed_tensor, batch_first=True)
unpacked_tensor

tensor([[ 2, 68, 67,  9,  8,  6,  3],
        [ 2, 49, 46,  3,  0,  0,  0],
        [ 2, 68, 50,  3,  0,  0,  0]])

In [31]:
unpacked_tensor_len

tensor([7, 4, 4])

In [32]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        # Set the no. of nodes for the hidden layer.
        self.hidden_size = hidden_size
        # Initialize the embedding layer with the 
        # - size of input (i.e. no. of words in input vocab)
        # - no. of hidden nodes in the embedding layer
        self.embedding = nn.Embedding(input_size, hidden_size,
                                      padding_idx=0)
        # Initialize the GRU with the 
        # - size of the hidden layer from the previous state
        # - size of the hidden layer from the current state
        self.gru = nn.GRU(hidden_size, hidden_size)

 
    def forward(self, inputs, inputs_lengths):
        # Feed the input into the embedding layer.
        embedded = self.embedding(inputs)
        # Create PackedSequence
        lengths = inputs_lengths.detach().cpu().numpy()
        embedded_packed = pack_padded_sequence(embedded, lengths, 
                                               batch_first=True)
        # Feed the embedded layer with the hidden layer to the GRU.
        # Update the output and hidden layer.
        output, hidden = self.gru(embedded_packed)
        return output, hidden

In [33]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        # Set the no. of nodes for the hidden layer.
        self.hidden_size = hidden_size
        # Initialize the embedding layer with the 
        # - size of output (i.e. no. of words in output vocab)
        # - no. of hidden nodes in the embedding layer
        self.embedding = nn.Embedding(output_size, hidden_size,
                                     padding_idx=0)
        # Initialize the GRU with the 
        # - size of the hidden layer from the previous state
        # - size of the hidden layer from the current state
        self.gru = nn.GRU(hidden_size, hidden_size)
        # Initialize the "classifier" linear layer.
        self.out = nn.Linear(hidden_size, output_size)
        # Set the output layer to output a specific symbol 
        # from the output vocabulary
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        # Get the embedding of the current input word (last output word)
        batch_size = input.size(0)
        embedded = self.embedding(input)
        embedded = embedded.permute(1, 0, 2) # S=1 x B x N
        # Transform the embedded output with a relu function. 
        output = F.relu(embedded)
        # Get current hidden state from input word and last hidden state
        output, hidden = self.gru(output, hidden)
        # Take the updated output and find the most appropriate
        # output symbol. 
        output = self.softmax(self.out(output).squeeze(0))
        return output, hidden

In [34]:
# Initialize the data.
batch_size = 3
kopi_data = KopitiamDataset(singlish_sents, english_sents)
dataloader = DataLoader(dataset=kopi_data, batch_size=batch_size, shuffle=True)

# Initialize the network for encoder and decoder.
hidden_size = 7
_encoder = EncoderRNN(len(kopi_data.src_vocab), hidden_size)
_decoder = DecoderRNN(hidden_size, len(kopi_data.trg_vocab))

In [35]:
_data_batch = sort_batch_by_len(next(iter(dataloader)))

In [36]:
_data_batch['x'], _data_batch['x_len']

(tensor([[ 2, 68, 67,  9,  8,  6,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0],
         [ 2, 49, 46,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0],
         [ 2, 68, 50,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0]]), tensor([7, 4, 4]))

In [37]:
# The input is of shape:
# batch_size * max_len
_data_batch['x'].shape

torch.Size([3, 19])

In [38]:
_encoder(_data_batch['x'], _data_batch['x_len'])

(PackedSequence(data=tensor([[ 0.2711,  0.0875, -0.0224,  0.0294, -0.2760, -0.2569, -0.3175],
        [ 0.2711,  0.0875, -0.0224,  0.0294, -0.2760, -0.2569, -0.3175],
        [ 0.2711,  0.0875, -0.0224,  0.0294, -0.2760, -0.2569, -0.3175],
        [ 0.4711,  0.1670, -0.0423, -0.2430, -0.1199, -0.3438, -0.2065],
        [ 0.2884, -0.1307, -0.3262,  0.1388,  0.0194, -0.5905, -0.3723],
        [ 0.4711,  0.1670, -0.0423, -0.2430, -0.1199, -0.3438, -0.2065],
        [ 0.6877,  0.2292,  0.0277, -0.3754, -0.5996,  0.0718, -0.4156],
        [ 0.2977, -0.3155, -0.2943, -0.1023,  0.1322, -0.1422, -0.4764],
        [ 0.5842,  0.1597, -0.1575, -0.3117, -0.3304, -0.2241, -0.4912],
        [ 0.4285,  0.0400, -0.0914, -0.3507, -0.2297, -0.2071, -0.4237],
        [ 0.4695, -0.2642, -0.2785, -0.2136,  0.0589, -0.2547, -0.4357],
        [ 0.5648,  0.0905, -0.2146, -0.3627, -0.2205, -0.3412, -0.4964],
        [ 0.3469,  0.1132,  0.1102, -0.0685, -0.0318, -0.4626, -0.3211],
        [ 0.4540,  0.0738,  0.

In [39]:

_encoder(_data_batch['x'], _data_batch['x_len'])[0]

PackedSequence(data=tensor([[ 0.2711,  0.0875, -0.0224,  0.0294, -0.2760, -0.2569, -0.3175],
        [ 0.2711,  0.0875, -0.0224,  0.0294, -0.2760, -0.2569, -0.3175],
        [ 0.2711,  0.0875, -0.0224,  0.0294, -0.2760, -0.2569, -0.3175],
        [ 0.4711,  0.1670, -0.0423, -0.2430, -0.1199, -0.3438, -0.2065],
        [ 0.2884, -0.1307, -0.3262,  0.1388,  0.0194, -0.5905, -0.3723],
        [ 0.4711,  0.1670, -0.0423, -0.2430, -0.1199, -0.3438, -0.2065],
        [ 0.6877,  0.2292,  0.0277, -0.3754, -0.5996,  0.0718, -0.4156],
        [ 0.2977, -0.3155, -0.2943, -0.1023,  0.1322, -0.1422, -0.4764],
        [ 0.5842,  0.1597, -0.1575, -0.3117, -0.3304, -0.2241, -0.4912],
        [ 0.4285,  0.0400, -0.0914, -0.3507, -0.2297, -0.2071, -0.4237],
        [ 0.4695, -0.2642, -0.2785, -0.2136,  0.0589, -0.2547, -0.4357],
        [ 0.5648,  0.0905, -0.2146, -0.3627, -0.2205, -0.3412, -0.4964],
        [ 0.3469,  0.1132,  0.1102, -0.0685, -0.0318, -0.4626, -0.3211],
        [ 0.4540,  0.0738,  0.2

In [47]:
t = Dictionary([['VIknesh is a'],['dads dada']])

In [49]:
special_tokens = {'<pad>': 0, '<unk>':1, '<s>':2, '</s>':3}
t.patch_with_special_tokens(special_tokens)

In [50]:
print (t)

Dictionary(6 unique tokens: ['VIknesh is a', 'dads dada', '<pad>', '<unk>', '<s>']...)


In [52]:
torch.tensor(t.doc2idx(['VIknesh is a']))

tensor([4])