# RNN/Transformer for Modeling Sentences

In this task, we will use an RNN or a transformer model to model sentences. The task is to predict the next character in a sentence. 

In [None]:
# As usual, a bit of setup
import time
import numpy as np
import torch
import matplotlib.pyplot as plt


%matplotlib inline
# Notebook-wide plotting defaults: figure size, un-smoothed image rendering,
# and a grayscale colormap for images.
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
%autosave 180


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Autosaving every 180 seconds


In [None]:
# Prefer the first CUDA device when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:0


## Load the data


In [None]:

import csv
import string
import numpy as np

def load_data(data_file):
    """Load the sentences of the train or test CSV file into a list of strings.

    The first CSV row is treated as a header and skipped. For 'train.csv'
    every remaining row must have exactly two columns (sentence, label) and
    only the sentence is kept; for 'test.csv' the first column of each row is
    kept. Every non-ASCII character is replaced by a space and a trailing
    '\\n' is appended to each sentence to mark its end.

    Args:
        data_file: path to a file whose name is 'train.csv' or 'test.csv'
            (the directory part is not restricted).

    Returns:
        A list of ASCII-only sentences, each terminated by '\\n'.

    Raises:
        ValueError: if the file name is neither 'train.csv' nor 'test.csv'.
    """
    import os

    # Validate up front: previously an unsupported name only printed a message
    # and then crashed on the undefined `sentences` variable.
    file_name = os.path.basename(data_file)
    if file_name not in ('train.csv', 'test.csv'):
        raise ValueError("Can only load 'train.csv' or 'test.csv'")

    with open(data_file) as csv_file:
        rows = list(csv.reader(csv_file, delimiter=','))

    # rows[0] is the header line
    if file_name == 'train.csv':
        sentences = [sentence for sentence, _label in rows[1:]]
    else:
        sentences = [row[0] for row in rows[1:]]

    # replace non ascii chars with spaces
    count = 0
    for i, sen in enumerate(sentences):
        count += sum(1 for ch in sen if ord(ch) >= 128)

        # '\n' indicates the end of the sentence
        sentences[i] = ''.join(ch if ord(ch) < 128 else ' ' for ch in sen) + '\n'

    print('The total of ', count, 'non-ascii chars are removed \n')

    return sentences

def char_to_index(sentence, str_voc):
    """Encode a string as an array of positions of its characters in `str_voc`.

    A character that does not occur in `str_voc` makes `str.index` raise
    ValueError.
    """
    positions = list(map(str_voc.index, sentence))
    return np.array(positions)

def convert_sen_to_data(sentences, str_voc):
    """Encode every sentence with `char_to_index`, preserving the input order.

    Returns a list with one encoded entry per input sentence.
    """
    return [char_to_index(sentence, str_voc) for sentence in sentences]


train_sentences = load_data('/kaggle/input/dataset-assg4/train.csv')

# NOTE: you need to use the same vocabulary to handle your test sentences.
# The vocabulary is the sorted set of characters seen in the training sentences;
# a character's id is its position in `str_voc`.
vocabulary = sorted(set("".join(train_sentences)))
str_voc = "".join(vocabulary)

train_data = convert_sen_to_data(train_sentences, str_voc)


# Corpus-level statistics
sen_lengths = [len(sen) for sen in train_data]
num_sen = len(train_data)
max_len, min_len = max(sen_lengths), min(sen_lengths)
num_chars = sum(sen_lengths)

print('Data statistics:')
print('Number of sentences: ', num_sen)
print('Maximum and minimum sentence lengths:', max_len, min_len)
print('Total number of characters:', num_chars)
print('Vocabulary size: ', len(vocabulary))

# Per-character occurrence counts, indexed by character id
uniq, uniq_counts = np.unique(np.concatenate(train_data), return_counts=True)
freq = np.zeros_like(uniq_counts)
freq[uniq] = uniq_counts

print('Chars in vocabulary and their frequencies:')
print(list(zip(vocabulary, freq.tolist())))


# a sample sentence, decoded back from its character ids
print("Data exploration -- showing an example sentence:")

sample = "".join(str_voc[char_id] for char_id in train_data[5])
print(sample)


The total of  4328 non-ascii chars are removed 

Data statistics:
Number of sentences:  160000
Maximum and minimum sentence lengths: 100 32
Total number of characters: 10954565
Vocabulary size:  95
Chars in vocabulary and their frequencies:
[('\n', 160000), (' ', 1762678), ('!', 12100), ('#', 496), ('$', 1212), ('%', 450), ('&', 1366), ("'", 88729), ('(', 8734), (')', 8890), ('*', 4310), ('+', 123), (',', 33680), ('-', 20064), ('.', 108694), ('/', 1586), ('0', 11139), ('1', 10960), ('2', 7690), ('3', 3517), ('4', 2882), ('5', 4272), ('6', 2673), ('7', 2496), ('8', 2071), ('9', 2801), (':', 22223), (';', 607), ('<', 12), ('=', 103), ('>', 9), ('?', 48816), ('@', 34), ('A', 8259), ('B', 4063), ('C', 5317), ('D', 6787), ('E', 2239), ('F', 3232), ('G', 2668), ('H', 11482), ('I', 15839), ('J', 2999), ('K', 2315), ('L', 2612), ('M', 7724), ('N', 3017), ('O', 2211), ('P', 3722), ('Q', 1036), ('R', 2942), ('S', 7281), ('T', 15062), ('U', 1014), ('V', 720), ('W', 37161), ('X', 17), ('Y', 2381),

## Implement an RNN or a Transformer with torch

**Q7 (10 points)** In this problem, you are supposed to train an RNN or a transformer to model sentences. Particularly, your model will receive 10 starting characters and should predict the rest of the sentence. The model will be evaluated by per-character cross-entropy loss. You will get 
* 5 points if your per-character cross-entropy loss is less than 2.5 (the loss by predicting with character frequencies is 3.13. Your model needs to be better than that). 
* 8 points if your per-character cross-entropy loss is less than 2
* 10 points if your per-character cross-entropy loss is less than 1.5

\*The performance reported in a [paper](https://arxiv.org/pdf/1808.04444.pdf) indicates that an LSTM can achieve a performance of 1.43 * ln(2) = 0.991. 
\*The `zip` program for compressing files can achieve a performance of roughly 3.522 bits per character, which corresponds to 3.522 * ln(2) = 2.441.

In [None]:
# Set up dataloader

# TODO: please read through the code in this cell so you know the data your model will see. 

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


class StrData(Dataset):
    """Dataset wrapper that serves the stored sentences by integer index."""

    def __init__(self, data):
        # The container is kept as-is; no copy or conversion is made.
        self.sentence = data

    def __getitem__(self, idx):
        """Return the sentence stored at position `idx`."""
        return self.sentence[idx]

    def __len__(self):
        """Return the number of stored sentences."""
        return len(self.sentence)

# Ids of the three special tokens; they are the first three integers after
# the last index of `freq`.
BEGIN_ID = len(freq)
END_ID = 1 + BEGIN_ID
PAD_ID = 2 + BEGIN_ID

def add_begin_and_end(tokens):
    """Return `tokens` as an int64 tensor wrapped in BEGIN_ID ... END_ID."""
    begin_marker = torch.tensor([BEGIN_ID], dtype=torch.long)
    body = torch.tensor(tokens, dtype=torch.long)
    end_marker = torch.tensor([END_ID], dtype=torch.long)
    return torch.cat((begin_marker, body, end_marker))

def collate_fn(batch):
    """Turn a list of sentences into one padded 2-D tensor, one row per sentence.

    Each sentence is wrapped by `add_begin_and_end`; shorter rows are filled
    on the right with PAD_ID.
    """
    wrapped = [add_begin_and_end(sentence) for sentence in batch]
    # pad_sequence is not batch_first, so transpose to put the batch on axis 0
    return pad_sequence(wrapped, padding_value=PAD_ID).T



### Set up a model

Suggestion: you may want to put your model in a `.py` file. Your code might look cleaner if you do so.

In [None]:
import torch.nn as nn

# A simple model
class SentenceModel(nn.Module):
    """Character-level language model: embedding -> single-layer RNN -> linear head.

    Args:
        vocab_size: number of distinct token ids, used both as the embedding
            table size and as the number of output logits. When None (the
            default) it is taken from the module-level `freq` array as
            `freq.shape[0] + 3`, i.e. the previous hard-coded behaviour.
        emb_dim: size of each embedding vector (default 64).
        hidden_size: size of the RNN hidden state (default 256).
    """

    def __init__(self, vocab_size=None, emb_dim=64, hidden_size=256):
        super().__init__()
        if vocab_size is None:
            # Fall back to the notebook-level vocabulary statistics so that
            # `SentenceModel()` keeps working exactly as before.
            vocab_size = freq.shape[0] + 3

        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
        self.rnn = nn.RNN(input_size=emb_dim, hidden_size=hidden_size,
                          num_layers=1, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Map token ids of shape (batch, seq) to logits of shape (batch, seq, vocab_size)."""
        h = self.emb(x)
        # Only the per-step outputs are needed; the final hidden state is dropped.
        h, _ = self.rnn(h)
        return self.linear(h)

# Build the network and move its parameters to the selected device.
model = SentenceModel().to(device)
model

SentenceModel(
  (emb): Embedding(98, 32)
  (rnn): RNN(32, 256, batch_first=True)
  (linear): Linear(in_features=256, out_features=98, bias=True)
)

### Train the model

NOTE: this example only uses 20 sentences to quickly showcase the code, but you should use the entire training set. You can also split out a subset as the validation set. You can make any changes as long as you don't touch the test set.   

In [None]:
epochs = 50

train_loader = DataLoader(StrData(train_data), shuffle=True, batch_size = 1000, collate_fn = collate_fn)

opt = torch.optim.Adam(model.parameters(), lr = 0.001)
# PAD positions are excluded from the loss
loss_fn = torch.nn.CrossEntropyLoss(ignore_index = PAD_ID)

# Mean training loss of every completed epoch (e.g. for plotting afterwards)
train_losses = []

model.train()
for ep in range(epochs):
    running_loss = 0.0
    for batch in train_loader:

        # Next-character prediction: the input is the sequence without its last
        # token, the target is the same sequence shifted left by one position.
        m_input = batch[:, :-1].to(device)
        m_output = batch[:, 1:].to(device)

        # zero the parameter gradients
        opt.zero_grad()

        logits = model(m_input) # batch x no_sequences x logits
        # Question: is this teacher forcing?
        # (At every step the model is fed the ground-truth previous characters.)
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), m_output.reshape(-1)) # * x vocab_size, *
        loss.backward()

        opt.step()
        running_loss += loss.item()

    # Average of the per-batch mean losses over this epoch
    epoch_loss = running_loss / len(train_loader)
    train_losses.append(epoch_loss)
    print(f"Epoch {ep+1}/{epochs}: Training Loss {epoch_loss}")

    # Early stopping must look at the epoch mean, not the sum over batches;
    # the sum grows with the number of batches and never drops below 1.3.
    if epoch_loss < 1.3:
        break

Epoch 1/50: Training Loss 2.559057368338108
Epoch 2/50: Training Loss 2.0814728036522867
Epoch 3/50: Training Loss 1.9154695354402065
Epoch 4/50: Training Loss 1.8177746690809726
Epoch 5/50: Training Loss 1.7528407923877238
Epoch 6/50: Training Loss 1.7056243613362312
Epoch 7/50: Training Loss 1.6695020861923695
Epoch 8/50: Training Loss 1.6412034898996353
Epoch 9/50: Training Loss 1.61897614300251
Epoch 10/50: Training Loss 1.6007990851998328
Epoch 11/50: Training Loss 1.5860715575516224
Epoch 12/50: Training Loss 1.5736069321632384
Epoch 13/50: Training Loss 1.5630849353969096
Epoch 14/50: Training Loss 1.5538190700113774
Epoch 15/50: Training Loss 1.5460318371653556
Epoch 16/50: Training Loss 1.5388614535331726
Epoch 17/50: Training Loss 1.5326403968036175
Epoch 18/50: Training Loss 1.5271154433488845
Epoch 19/50: Training Loss 1.521857362985611
Epoch 20/50: Training Loss 1.5172783233225346
Epoch 21/50: Training Loss 1.5131357699632644
Epoch 22/50: Training Loss 1.5089414209127425
E

### Save the model

In [None]:
# Serialize the whole `model` object (not just a state_dict) to "rnn_lm.sav".
# NOTE(review): loading this file later presumably requires the SentenceModel
# class to be defined/importable in the loading session — confirm.
torch.save(model, "rnn_lm.sav")

### Test the trained model

In [None]:

# load the test data. NOTE: need to use the same vocabulary as the training data
test_sentences = load_data('/kaggle/input/dataset-assg4/test.csv')
test_data = convert_sen_to_data(test_sentences, str_voc)


print('Number of test instances:', len(test_data))

# TODO: replace this stub model with your powerful model
# The file holds a fully pickled model object, so only load files you created
# yourself. NOTE(review): recent PyTorch releases may need `weights_only=False`
# here to unpickle a whole module — confirm against the installed version.
model = torch.load("rnn_lm.sav", map_location=device)
model.eval()

test_dset = StrData(test_data)
# Evaluate on the TEST split (the loader was previously built from train_data);
# shuffling is pointless for evaluation.
test_loader = DataLoader(test_dset, shuffle=False, batch_size = 50, collate_fn = collate_fn)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index = PAD_ID)
print('Evaluating the model ...')
loss_sum = 0.0      # summed cross-entropy over all non-PAD target tokens
target_count = 0    # number of non-PAD target tokens
char_count = 0      # number of real characters (no BEGIN/END/PAD tokens)
with torch.no_grad():
    for batch in test_loader:
        m_input = batch[:, :-1].to(device)
        m_output = batch[:, 1:].to(device)

        logits = model(m_input)
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), m_output.reshape(-1))

        # `loss` is the mean over this batch's non-PAD targets. Batches hold
        # different numbers of targets, so weight each mean by its count
        # instead of averaging the batch means directly.
        n_targets = int((m_output != PAD_ID).sum())
        loss_sum += loss.item() * n_targets
        target_count += n_targets
        char_count += int(torch.sum((batch != PAD_ID) & (batch != BEGIN_ID) & (batch != END_ID)))

per_char_loss = loss_sum / target_count

print('The total number of chars in the test set is ', char_count)

print('The per-char-loss is %.3f' % per_char_loss)


The total of  1131 non-ascii chars are removed 

Number of test instances: 40000
Evaluating the model ...
The total number of chars in the test set is  tensor(10954565)
The per-char-loss is 1.459


### Use the model to generate sentences

Now we can use the trained model to generate text with a starting string. The naive model just predicts frequent characters in the text, so there is no meaningful generation yet. See what you get from your models.

In [None]:
import torch.distributions as distributions

def generate_text(model, start_string, str_voc, temperature=0.5):
    """Generate random text that continues `start_string`.

    Characters are sampled one at a time from the model's next-token
    distribution until the start string plus the generated tokens add up to
    100 tokens. Generation does not stop at an end/newline token; callers can
    cut the result at the first '\\n' themselves.

    Args:
        model: callable mapping a (1, seq) int64 tensor of token ids to logits
            of shape (1, seq, num_tokens).
        start_string: prefix to continue; every character must occur in `str_voc`.
        str_voc: vocabulary string; a character's id is its index in it.
        temperature: positive scaling of the logits before sampling. Low
            values give more predictable text, high values more surprising
            text. Defaults to 0.5.

    Returns:
        `start_string` followed by the generated text. Special tokens are
        rendered as "<BEG>", "<END>" and "<PAD>".

    Raises:
        ValueError: if `temperature` is not positive, or (from `str.index`)
            if `start_string` contains a character missing from `str_voc`.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Number of characters to generate
    num_generate = 100 - len(start_string)

    # Converting our start string to numbers (vectorizing)
    input_int = [BEGIN_ID] + [str_voc.index(s) for s in start_string]

    # Printable names for the ids that are not characters of str_voc
    other_voc = {BEGIN_ID: "<BEG>", END_ID: "<END>", PAD_ID: "<PAD>"}

    text_generated = []

    # No gradients are needed for sampling
    with torch.no_grad():
        for _ in range(num_generate):
            # Here batch size == 1.
            # The calculation has a lot of repetition because computation for the
            # first part of the sequence is the same at every iteration. But it's
            # fine for our example.
            input_tensor = torch.tensor(input_int, dtype = torch.long).view([1, -1])
            input_tensor = input_tensor.to(device)

            outputs = model(input_tensor)

            # Temperature must scale the LOGITS of the last position. Dividing
            # the softmax probabilities by a constant is undone when the
            # categorical distribution renormalises them, so it has no effect.
            next_logits = outputs[0, -1, :] / temperature
            predicted_id = int(distributions.Categorical(logits = next_logits).sample())

            input_int.append(predicted_id)
            text_generated.append(str_voc[predicted_id] if (predicted_id < len(str_voc)) else other_voc[predicted_id])

    return (start_string + ''.join(text_generated))


start_string = 'I hav'

# Keep only the text before the first newline, i.e. the first generated sentence.
gen_sen = generate_text(model, start_string, str_voc).split('\n', 1)[0]

print('Starting from "' + start_string + '", the generated sentence is:')
print('"' + gen_sen + '"')

Starting from "I hav", the generated sentence is:
"I have to eat the man is dead incused if you hate to do in coming in american dark women."
