# Assignment 2: Deep N-grams
Your task will be to predict the next set of characters using the previous characters. 
Your model will convert each character to its embedding, run the embeddings through a Gated Recurrent Unit (GRU), and pass the result through a linear layer to predict the next set of characters.

In [29]:
!pip install trax
import os
import trax
import trax.fastmath.numpy as np
import pickle
import numpy
import random as rnd
from trax import fastmath
from trax import layers as tl



# Part 1: Importing the Data

## 1.1 Loading in the data

In [30]:
# Read every file in `dirname` and collect its non-empty lines,
# stripped of surrounding whitespace and lower-cased, into `lines`.
dirname = '/content/drive/My Drive/data'
lines = []
for filename in os.listdir(dirname):
    path = os.path.join(dirname, filename)
    with open(path) as file:
        stripped = (raw.strip() for raw in file)
        # Blank lines become '' after strip() and are dropped here.
        lines.extend(text.lower() for text in stripped if text)

In [31]:
# Hold out the last 1000 lines for validation and train on the rest.
# Both slices are taken from the original list before either name is rebound.
lines, eval_lines = lines[:-1000], lines[-1000:]

print(f"Number of lines for training: {len(lines)}")
print(f"Number of lines for validation: {len(eval_lines)}")

Number of lines for training: 124097
Number of lines for validation: 1000


In [32]:
# Show one training line as a sanity check of the loaded, lower-cased text.
print(lines[111])

through it do well, i do not relish well


## 1.2 Convert a line to tensor

In [33]:
def line_to_tensor(line, EOS_int=1):
    """Encode a line as a list of integer code points followed by an end marker.

    Args:
        line: string to encode; each character is mapped with ``ord``.
        EOS_int: integer appended after the last character to mark the end
            of the line.

    Returns:
        A list of ints of length ``len(line) + 1``.
    """
    return [ord(ch) for ch in line] + [EOS_int]

In [34]:
# Encode the same sample line; the trailing 1 is the default EOS_int marker.
print(line_to_tensor(lines[111]))

[116, 104, 114, 111, 117, 103, 104, 32, 105, 116, 32, 100, 111, 32, 119, 101, 108, 108, 44, 32, 105, 32, 100, 111, 32, 110, 111, 116, 32, 114, 101, 108, 105, 115, 104, 32, 119, 101, 108, 108, 1]


## 1.3 Batch generator

In [35]:
def data_generator(batch_size, max_length, data_lines, line_to_tensor=line_to_tensor, shuffle=True):
    """Endlessly yield padded batches built from `data_lines`.

    Lines are visited in index order (or in a random order that is reshuffled
    after every full pass when `shuffle` is true). Only lines with
    ``len(line) < max_length`` are used; longer ones are skipped.

    Args:
        batch_size: number of lines per yielded batch.
        max_length: width every encoded line is zero-padded to.
        data_lines: sequence of strings to draw from.
        line_to_tensor: callable turning a line into a list of ints.
        shuffle: whether to randomise the visiting order.

    Yields:
        A tuple ``(inputs, targets, mask)`` of arrays with one row per line.
        ``inputs`` and ``targets`` are the same array; ``mask`` holds 0 where
        the padded row is 0 and 1 elsewhere.
    """
    total = len(data_lines)
    order = list(range(total))
    if shuffle:
        rnd.shuffle(order)

    cursor = 0
    pending = []

    while True:
        # Wrap around (and reshuffle) once every line has been visited.
        if cursor >= total:
            cursor = 0
            if shuffle:
                rnd.shuffle(order)

        candidate = data_lines[order[cursor]]
        cursor += 1

        # Strict '<' leaves room for the end marker added by line_to_tensor.
        if len(candidate) < max_length:
            pending.append(candidate)

        if len(pending) != batch_size:
            continue

        padded_rows = []
        mask_rows = []
        for text in pending:
            ids = line_to_tensor(text)
            row = ids + [0] * (max_length - len(ids))
            padded_rows.append(row)
            mask_rows.append([1 if tok != 0 else 0 for tok in row])

        inputs = np.array(padded_rows)
        weights = np.array(mask_rows)

        # The targets are the inputs themselves.
        yield inputs, inputs, weights

        pending = []

In [36]:
# Quick check of data_generator on toy input. The first line has 11
# characters, which is not below max_length=10, so the generator skips it;
# the first batch is therefore built from the second and third lines.
tmp_lines = ['12345678901','123456789','234567890', '345678901']

tmp_data_gen = data_generator(batch_size=2, 
                              max_length=10, 
                              data_lines=tmp_lines,
                              shuffle=False)
# Pull a single (inputs, targets, mask) batch for display.
next(tmp_data_gen)

(DeviceArray([[49, 50, 51, 52, 53, 54, 55, 56, 57,  1],
              [50, 51, 52, 53, 54, 55, 56, 57, 48,  1]], dtype=int32),
 DeviceArray([[49, 50, 51, 52, 53, 54, 55, 56, 57,  1],
              [50, 51, 52, 53, 54, 55, 56, 57, 48,  1]], dtype=int32),
 DeviceArray([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
              [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32))

# Part 2: Defining the GRU model

In [37]:
def GRULM(vocab_size=256, d_model=512, n_layers=2, mode='train'):
    """Build a GRU-based character language model.

    The model is a `tl.Serial` stack of: ShiftRight, Embedding, `n_layers`
    GRU layers, a Dense output layer and LogSoftmax.

    Args:
        vocab_size: number of distinct token ids; used for both the embedding
            table and the width of the output layer.
        d_model: embedding depth, also used as the GRU unit count.
        n_layers: number of stacked GRU layers.
        mode: forwarded to `tl.ShiftRight`.

    Returns:
        The assembled `tl.Serial` model.
    """
    model = tl.Serial(
        tl.ShiftRight(mode=mode),
        tl.Embedding(vocab_size, d_model),
        [tl.GRU(d_model) for _ in range(n_layers)],
        # One output score per vocabulary entry. This was hard-coded to 256,
        # which only matched the default vocab_size.
        tl.Dense(vocab_size),
        tl.LogSoftmax(),
    )
    return model

In [38]:
# Instantiate the model with its default hyperparameters and show its layers.
model = GRULM()
print(model)

Serial[
  ShiftRight(1)
  Embedding_256_512
  GRU_512
  GRU_512
  Dense_256
  LogSoftmax
]


# Part 3: Training

In [39]:
# Hyperparameters shared by the training and evaluation cells.
max_length = 64  # padded width of every encoded line
batch_size = 32  # number of lines per batch

def n_used_lines(lines, max_length):
    """Count the lines whose length does not exceed `max_length`.

    Args:
        lines: iterable of sized items (strings in this notebook).
        max_length: inclusive upper bound on ``len(line)``.

    Returns:
        The number of items with ``len(item) <= max_length``.
    """
    # NOTE(review): this bound is inclusive, while data_generator keeps only
    # lines with len(line) < max_length -- confirm which is intended.
    return sum(1 for entry in lines if len(entry) <= max_length)

# Estimate how many batches make up one pass over the usable training lines.
# NOTE(review): the length cap passed here is the literal 32, not the
# `max_length` variable (64) defined in this cell -- confirm this is intended.
num_used_lines = n_used_lines(lines, 32)
steps_per_epoch = int(num_used_lines/batch_size)

print('Number of used lines from the dataset:', num_used_lines)
print('Batch size (a power of 2):', int(batch_size))
print('Number of steps to cover one epoch:', steps_per_epoch)

Number of used lines from the dataset: 25887
Batch size (a power of 2): 32
Number of steps to cover one epoch: 808


## 3.1 Training the model

In [44]:
from trax.supervised import training
import itertools

In [56]:
def train_model(model, data_generator, batch_size=32, max_length=64, lines=lines, eval_lines=eval_lines, n_steps=1, output_dir='model/'): 
    """Train `model` with a Trax training loop and return the loop.

    Args:
        model: the Trax model to train.
        data_generator: callable invoked as
            ``data_generator(batch_size, max_length, data_lines=...)`` that
            returns an iterable of batches.
        batch_size: number of lines per batch, forwarded to `data_generator`.
        max_length: padded line width, forwarded to `data_generator`.
        lines: training lines. The default is bound to the module-level
            `lines` at the time this function is defined.
        eval_lines: evaluation lines. The default is bound to the
            module-level `eval_lines` at definition time.
        n_steps: number of training steps to run.
        output_dir: directory handed to `training.Loop` as `output_dir`.

    Returns:
        The `training.Loop` instance after `run(n_steps)` has completed.
    """

    def _endless_batches(data_lines):
        # Stream batches forever without buffering them. itertools.cycle
        # keeps a copy of every item it has seen, so wrapping a generator
        # that never ends makes memory grow for the whole training run.
        while True:
            produced = False
            for batch in data_generator(batch_size, max_length, data_lines=data_lines):
                produced = True
                yield batch
            if not produced:
                # An empty source ends the stream instead of spinning forever.
                return

    infinite_train_generator = _endless_batches(lines)
    infinite_eval_generator = _endless_batches(eval_lines)

    train_task = training.TrainTask(
        labeled_data=infinite_train_generator,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(0.0005)
    )

    eval_task = training.EvalTask(
        labeled_data=infinite_eval_generator,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches=3
    )

    training_loop = training.Loop(model,
                                  train_task,
                                  eval_task,
                                  output_dir=output_dir)

    training_loop.run(n_steps=n_steps)

    return training_loop

In [40]:
def test_model(preds, target):
    """Return the negated mean of the target-class scores over non-padding positions.

    Args:
        preds: array of per-class scores whose last axis indexes the classes.
            Presumably log-probabilities, since GRULM ends in LogSoftmax --
            confirm against the caller.
        target: array of integer class ids; positions equal to 0 are treated
            as padding and excluded from the average.

    Returns:
        The negative of (sum of selected scores at non-padding positions
        divided by the number of non-padding positions).
    """
    n_classes = preds.shape[-1]
    target_one_hot = tl.one_hot(target, n_classes)

    # Score the model assigned to the true class at every position.
    token_scores = np.sum(target_one_hot * preds, axis=-1)

    # 1.0 where the target is a real token, 0.0 where it is padding.
    keep = 1.0 - np.equal(target, 0)
    masked_scores = token_scores * keep

    return -(np.sum(masked_scores) / np.sum(keep))

In [None]:
# Evaluate a saved model on the first un-shuffled training batch.
model = GRULM()
model.init_from_file('model.pkl.gz')
batch = next(data_generator(batch_size, max_length, lines, shuffle=False))
# batch[0] holds the inputs and batch[1] the targets.
preds = model(batch[0])
log_ppx = test_model(preds, batch[1])
# Report the result; it was previously computed but never displayed.
print('The log perplexity and perplexity of the model are respectively', log_ppx, np.exp(log_ppx))