# Train Character Language Model
Language Models are used for the following tasks:
* Check the probability of a sentence to be part of some specific language
* Generate Words/Characters given a sentence
On this notebook we will train a Neural Character Language model based on GRUs.
![Input Output](docs/imgs/char_lang_model.png "Title")

#### Character Generation
One way to generate data is to input a character, then sample the output of the RNN and feed this result back to the RNN
![Generation](docs/imgs/char_lang_model_gen.png "Title")

#### Evaluation Metrics
Character Language models are evaluated with Character Error Rate metric, that can be calculated by doing the exp of the cross entropy loss

#### Training
During training we use "masked cross entropy" to ignore pad values and the "plateau" learning rate scheduler that drop the learning rate if the loss functions doesnt drop for few epochs (patience).

![Loss](docs/imgs/loss_plateau_scheduler.png "Title")

#### Datasets
* https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/

#### References:
* https://en.wikipedia.org/wiki/Language_model
* https://github.com/furkanu/deeplearning.ai-pytorch
* https://towardsdatascience.com/writing-like-shakespeare-with-machine-learning-in-pytorch-d77f851d910c
* https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html
* https://web.stanford.edu/class/cs124/lec/languagemodeling.pdf

In [1]:
import numpy as np
import os

import torch
import torch.nn as nn
from torch import optim
from torch.utils.tensorboard import SummaryWriter
import torch.utils.data as utils
from torch.optim import lr_scheduler

import utils_char_dataset
import utils_char_lm
import models
from char_LM_dataset import CharacterLanguageModelDataset

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = 'cpu'
print('Device:', device)
num_gpu = torch.cuda.device_count()
num_gpu = 1
print('Number of GPUs Available:', num_gpu)

pickle_filename_train = "data/shakespeare_corpus_data_train.pkl"
pickle_filename_test = "data/shakespeare_corpus_data_test.pkl"

# The codemap is a dictionary of words to class index
codemap = utils_char_dataset.load_pickle('./codemap_LM.pickle')
num_classes = len(codemap)
print('Num classes:', num_classes)

# Hyperparameters
# Pure sequence to sequence models can't deal with batches
batch_size = 200
clip = 50.0
lr = 0.001
hidden_size = 256
num_layers = 2
epochs = 50

Device: cuda:0
Number of GPUs Available: 1
Num classes: 69


#### Load Data

In [2]:
char_dataset_train = CharacterLanguageModelDataset(pickle_filename_train)
dataloader_train = utils.DataLoader(char_dataset_train, batch_size=batch_size, shuffle=True)
char_dataset_test = CharacterLanguageModelDataset(pickle_filename_test)
dataloader_test = utils.DataLoader(char_dataset_test, batch_size=batch_size, shuffle=False)
sample = char_dataset_train[0]
sample_str = ''.join([utils_char_dataset.char_from_class_id(char, codemap) for char in sample['X'][0:sample['len_x']]])
print(sample_str)

in


#### Start Tensorboard Interface

In [3]:
#writer = SummaryWriter('./logs')
# Default directory "runs"
writer = SummaryWriter()

#### Define Masked Loss
We need to filter out from the gradient the padding values.
##### Parameters
* input(Decoder output): [batch x sequence]
* target(Label): [batch]
* mask shape: [batch]

##### Example
```python
input: torch.Size([32, 83])
target: torch.Size([32])
mask: torch.Size([32])

input: torch.Size([70, 84, 69])
target: torch.Size([70, 84])
mask: torch.Size([70, 84])
```

In [4]:
def maskNLLLoss(predicted, target, mask):    
    # Get total number of valid elements
    nTotal = mask.sum()
    
    # flatten all the labels, mask and prediction
    target = target.view(-1)
    mask = mask.view(-1)
    
    predicted = predicted.view(-1, num_classes)        
    
    # pick the values for the label and zero out the rest with the mask
    #predicted_mask = predicted[range(predicted.shape[0]), target] * mask
    predicted_mask = -torch.log(predicted[range(predicted.shape[0]), target])
    loss = predicted_mask.masked_select(mask).mean()        
        
    loss = loss.to(device)    
    return loss, nTotal.item()

In [5]:
def evaluate_test(seq_model, writer = None):
    
    seq_model.eval()

    # Iterate on the test set
    print_every = 10
    metrics = []
    for iter, sample in enumerate(dataloader_test):
        input_tensor = sample['X'].type(torch.LongTensor).to(device)
        target_tensor = sample['Y'].type(torch.LongTensor).to(device)
        target_mask = sample['label_mask'].type(torch.ByteTensor).to(device)
        len_input = sample['len_x'].to(device)
        len_target = sample['len_y'].to(device)
        curr_batch_size = len_input.shape[0]
        with torch.no_grad():
            hidden_state = models.initHidden(curr_batch_size, False, hidden_size, num_layers, device)
            # Run words through seq_model (all batch at once)   
            seq_model_outputs, hidden_state = seq_model(input_tensor, hidden_state, len_input)        

            loss, nTotal = maskNLLLoss(seq_model_outputs, target_tensor, target_mask)
            
            # Add predicted/target text to tensorboard
            if writer:
                pass

            metric = loss.item()
            
            
            metrics.append(metric)

    distance_test = np.mean(metrics)
    return distance_test

In [6]:
def train_batch(input_tensor, target_tensor, len_input, len_target, target_mask, seq_model,  
          model_optimizer, iterations):
    # Start seq_model hidden state as zero
    curr_batch_size = len_input.shape[0]
    
    hidden_state = models.initHidden(curr_batch_size, False, hidden_size, num_layers, device)

    # Zero the gradient for doing backprop
    model_optimizer.zero_grad()  

    # Initialize Loss
    loss = 0
    print_losses = []
    n_totals = 0
        
    # Run words through seq_model (all batch at once)   
    seq_model_outputs, hidden_state = seq_model(input_tensor, hidden_state, len_input)        
    
    loss, nTotal = maskNLLLoss(seq_model_outputs, target_tensor, target_mask)    
    
    # Calculate the loss gradient wrt to the model weights
    loss.backward()
    
    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(seq_model.parameters(), clip)    

    # Do the gradient descent step
    model_optimizer.step()    

    # Return normalized loss
    return loss.item()

In [7]:
def train_loop(seq_model, n_epochs=100, learning_rate=0.01):

    # Initialize SGD Optimizer to train the network
    seq_model_optimizer = optim.Adam(seq_model.parameters(), lr=learning_rate)    
    sc_plt_enc = torch.optim.lr_scheduler.ReduceLROnPlateau(seq_model_optimizer, patience=2, verbose=True)    
        
    iterations = 0
    best_metric = 1e12
    for epoch in range(n_epochs):
        running_loss = 0.0
        seq_model.train()        
        for iter, sample in enumerate(dataloader_train):
            # Select a sample and transpose padded arrays into (max_len_sequence x batch_size)            
            
            # Each 10 iterations send some input data to tensorboard
            if iterations % 10 == 0:
                pass
                # Select first element from the Batch and send to tensorboard
                #stroke = sample['sequence'][0].numpy()
                #lbl_str = sample['label_str'][0]         
                #fig_input = iam.line_plot(stroke, lbl_str, display=False)
                #writer.add_figure('train/stroke_target', fig_input, iterations)
        
            #input_tensor = sample['X'].type(torch.FloatTensor).to(device)
            input_tensor = sample['X'].type(torch.LongTensor).to(device)
            target_tensor = sample['Y'].type(torch.LongTensor).to(device)
            target_mask = sample['label_mask'].type(torch.ByteTensor).to(device)
            len_input = sample['len_x'].to(device)
            len_target = sample['len_y'].to(device)                        
                
            # Train on that particular input/output sequence
            loss = train_batch(input_tensor, target_tensor, len_input, len_target, target_mask, 
                         seq_model, seq_model_optimizer, iterations)

            # Accumulate Loss for display
            running_loss += loss

            # Send loss to Tensorboard
            #print(loss)
            writer.add_scalar('train/loss', loss, iterations)
            iterations +=1
        
        # Epoch ended        
        sc_plt_enc.step(running_loss)        
        writer.add_scalar('train/running_loss', running_loss, epoch)
        
        # Get current learning rate (To display on Tensorboard)
        for param_group in seq_model_optimizer.param_groups:
            curr_learning_rate = param_group['lr']
            writer.add_scalar('train/learning_rate', curr_learning_rate, epoch)
        
        # Evaluate Model
        metric = evaluate_test(seq_model, writer)
        writer.add_scalar('test/metric_loss', metric, epoch)
        
        test_metric = metric 
        #print('Epoch:', epoch, 'Running loss:', running_loss)
        
        # Save model on the best evaluation distance 
        if test_metric < best_metric:
                print('Smallest metric at epoch:', epoch, 'metric:', test_metric, 'running_loss:', running_loss)
                torch.save({'seq_model': seq_model.state_dict(),}, 
                           os.path.join('./', '{}_{}.pt'.format('model_lm', 'best')))
                best_metric = test_metric

In [8]:
# Add to tensorboard hyper-parameter values
writer.add_text('train/params', 
                'learning_rate:' + str(lr) + 
                ' hidden_size:' + str(hidden_size) + 
                ' num_layers:' + str(num_layers), 0)

# Instantiate Neural Character Language Model Networks
char_LM = models.CharLangModel(num_classes, hidden_size, num_classes, num_layers=num_layers).to(device)

# Train
train_loop(char_LM, n_epochs=epochs, learning_rate=lr)

Smallest metric at epoch: 0 metric: 1.1425018786326744 running_loss: 5438.66271853447
Smallest metric at epoch: 1 metric: 1.1075885187495838 running_loss: 4665.277022123337
Smallest metric at epoch: 2 metric: 1.095850261642769 running_loss: 4564.642035484314
Smallest metric at epoch: 3 metric: 1.0876045322206755 running_loss: 4519.386776983738
Smallest metric at epoch: 4 metric: 1.0823433448100037 running_loss: 4493.608509123325
Smallest metric at epoch: 6 metric: 1.077851315956158 running_loss: 4467.401553630829
Smallest metric at epoch: 8 metric: 1.0776235544496524 running_loss: 4459.083432257175
Smallest metric at epoch: 10 metric: 1.0774076803031887 running_loss: 4456.999837517738
Epoch    12: reducing learning rate of group 0 to 1.0000e-04.
Smallest metric at epoch: 13 metric: 1.0578111486794415 running_loss: 4368.681524336338
Smallest metric at epoch: 14 metric: 1.0542610906180152 running_loss: 4329.041776061058
Smallest metric at epoch: 15 metric: 1.0520535281915093 running_loss

#### Get the Probability of a Sentence

In [9]:
lst_words = ['Thi1', 'This', 'Love', 'Lov1', 'Hellz', 'Hello', 'HellO', 'HeLlo']
for word in lst_words:
    print('P(%s):%f' % (word, utils_char_lm.getProbabilitySentence(word, char_LM, device, codemap)))

P(Thi1):0.000000
P(This):0.051411
P(Love):0.021022
P(Lov1):0.000000
P(Hellz):0.000000
P(Hello):0.000000
P(HellO):0.000000
P(HeLlo):0.000000


#### Generate Characters Given some chars
##### References:
* https://pytorch.org/docs/stable/distributions.html

##### Greedly

In [10]:
lst_words = ['t', 'k', 'de', 'Liv', 'Pre', 'To be or']
for word in lst_words:
    pred = utils_char_lm.getNextChar(word,100, char_LM, device, codemap)
    res_str = ''.join([utils_char_dataset.char_from_class_id(class_id, codemap) for class_id in pred])
    print('('+word+')'+res_str)

(t)he<EOS>
(k)now<EOS>
(de)ath<EOS>
(Liv)e<EOS>
(Pre)sent<EOS>
(To be or)s<EOS>


##### Sampled

In [11]:
lst_words = ['t', 'k', 'de', 'Liv', 'Pre', 'To be or not to b']
for word in lst_words:
    pred = utils_char_lm.getNextChar(word,100, char_LM, device, codemap, greedly=False)
    res_str = ''.join([utils_char_dataset.char_from_class_id(class_id, codemap) for class_id in pred])
    print('('+word+')'+res_str)

(t)hen<EOS>
(k)now<EOS>
(de)ath<EOS>
(Liv)es<EOS>
(Pre)sence<EOS>
(To be or not to b)'s<EOS>
