# Sentiment classification with LSTM
In this notebook we will use LSTMs to do sentiment classification on the [imdb dataset](http://ai.stanford.edu/~amaas/data/sentiment/). 

In [1]:
import numpy as np 
import pandas as pd 
import os
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence

## Data

To get the data: <br>
`wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz`

In [2]:
from pathlib import Path
PATH = Path("/data2/yinterian/aclImdb/")
list(PATH.iterdir())

[PosixPath('/data2/yinterian/aclImdb/README'),
 PosixPath('/data2/yinterian/aclImdb/model-gru-86.pth'),
 PosixPath('/data2/yinterian/aclImdb/model-81.pth'),
 PosixPath('/data2/yinterian/aclImdb/model-gru.pth'),
 PosixPath('/data2/yinterian/aclImdb/model-78.pth'),
 PosixPath('/data2/yinterian/aclImdb/model-gru-88.pth'),
 PosixPath('/data2/yinterian/aclImdb/test'),
 PosixPath('/data2/yinterian/aclImdb/model-gru-87.pth'),
 PosixPath('/data2/yinterian/aclImdb/imdbEr.txt'),
 PosixPath('/data2/yinterian/aclImdb/train'),
 PosixPath('/data2/yinterian/aclImdb/imdb.vocab')]

In [3]:
path = PATH/"train/pos/0_9.txt"
path.read_text()

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

## Tokenization

In [4]:
# first time run this
#!python3 -m spacy download en

In [5]:
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
def sub_br(x): return re_br.sub("\n", x.lower())

my_tok = spacy.load('en')
def spacy_tok(x): return [tok.text for tok in my_tok.tokenizer(sub_br(x))]

In [6]:
path = PATH/"train/pos/0_9.txt"
spacy_tok(path.read_text())[:10]

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at']

### Computing vocab2index

In [7]:
pos_files = list((PATH/"train"/"pos").iterdir())
neg_files = list((PATH/"train"/"neg").iterdir())
all_files = pos_files + neg_files
all_files[:5]

[PosixPath('/data2/yinterian/aclImdb/train/pos/8030_9.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/8819_10.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/6316_8.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/4781_8.txt'),
 PosixPath('/data2/yinterian/aclImdb/train/pos/10085_10.txt')]

In [8]:
counts = Counter()
for path in all_files:
    counts.update(spacy_tok(path.read_text()))

In [9]:
#counts

In [10]:
len(counts.keys())

87130

In [11]:
for word in list(counts):
    if counts[word] < 5:
        del counts[word]

In [12]:
len(counts.keys())

29370

In [13]:
vocab2index = {"<PAD>":0, "UNK":1}
words = ["<PAD>", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [14]:
#vocab2index

## Dataset version 1

In [16]:
# you could imprive this function by taking a random sample
# when sentences are longer than N=400 words 
def encode_sentence_v1(path, vocab2index, N=400):
    x = spacy_tok(path.read_text())
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in x])
    l = min(N, len(enc1))
    enc[N-l:] = enc1[:l]
    return enc

In [17]:
path = PATH/"train/neg/211_4.txt"
encode_sentence(path, vocab2index, N=400)

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
         929,   732,    99,  2115,    99,   231,    23,  2888,    28,
         428,   788,   119,    52,   890,   127,  1885,   264,  1489,
         987,    93,   390,  4499,    93,  4500,    75,  2203,  1047,
          75, 18225,    47,   483,   141,  1574,    23,  2994,    34,
          26, 16331,

In [18]:
class ImdbDatasetv1(Dataset):
    def __init__(self, PATH, train="train", N=400):
        self.N = N
        self.path_to_images = PATH/train
        self.pos_files = list((self.path_to_images/"pos").iterdir())
        self.neg_files = list((self.path_to_images/"neg").iterdir())
        self.files = self.pos_files + self.neg_files
        # pos 1, neg 0
        self.y = np.concatenate((np.ones(len(self.pos_files), dtype=int),
                                np.zeros(len(self.neg_files), dtype=int)), axis=0)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        path = self.files[idx]
        return encode_sentence(path, vocab2index, self.N), self.y[idx]

In [19]:
train_ds_v1 = ImdbDatasetv1(PATH)
test_ds_v1 = ImdbDatasetv1(PATH, "test")

In [21]:
batch_size = 1000
train_dl_v1 = DataLoader(train_ds_v1, batch_size=batch_size, shuffle=True)
test_dl_v1 = DataLoader(test_ds_v1, batch_size=batch_size)

## Understanding LSTMs 

In [31]:
# Input dim is the dimension of the embedding for each word (2 in the example)
# Output dim is the dimension of the hidden layer (4 in this example)
# batch_first – If True, then the input and output tensors are provided as (batch, seq, feature).
lstm = nn.LSTM(2, 4, batch_first=True)  

In [32]:
inputs = [torch.randn(1, 2) for _ in range(5)] # make a sequence of length 5
inputs = torch.cat(inputs).view( 1, len(inputs),  -1)
inputs

tensor([[[ 1.5123, -1.8904],
         [ 1.6441, -0.8196],
         [ 0.4953,  0.8664],
         [ 0.2091, -0.7583],
         [ 0.8346, -0.0706]]])

In [34]:
# RNNs assume this input shape
# input shape should be  bash_size x seq_len x embedding dimension (if batch_first=True)
inputs.shape

torch.Size([1, 5, 2])

In [35]:
out, hidden = lstm(inputs)

In [36]:
out

tensor([[[-0.0340,  0.1957,  0.1785, -0.2185],
         [-0.1098,  0.2564,  0.1409, -0.3286],
         [-0.3621,  0.0015, -0.0616, -0.2750],
         [-0.2830,  0.0663,  0.1183, -0.2650],
         [-0.3131,  0.0708,  0.0988, -0.2679]]], grad_fn=<TransposeBackward0>)

In [37]:
hidden

(tensor([[[-0.3131,  0.0708,  0.0988, -0.2679]]], grad_fn=<ViewBackward>),
 tensor([[[-0.6365,  0.1603,  0.1475, -0.5192]]], grad_fn=<ViewBackward>))

## Model v1

In [109]:
class LSTMModelv1(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(LSTMModelv1,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linearOut = nn.Linear(hidden_dim, 1)
        
    def forward(self, inputs) :
        x = self.embeddings(inputs)
        lstm_out, lstm_h = self.lstm(x)
        x = lstm_out[:,-1]
        x = self.linearOut(x)
        return x, lstm_h

### Debugging our model

In [110]:
batch_size = 7
train_dl = DataLoader(train_ds_v1, batch_size=batch_size, shuffle=True)

In [111]:
x, y = next(iter(train_dl))
x.shape

torch.Size([7, 400])

In [112]:
vocab_size = len(words)
embedding_dim = 10
embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

In [113]:
x = embed(x.long())
x.shape

torch.Size([7, 400, 10])

In [114]:
hidden_dim = 9
lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
out, hidden = lstm(x)

In [115]:
out.shape

torch.Size([7, 400, 9])

In [116]:
# a vector of size 9 (hidden_dim) for every element on the batch
out[:,-1].shape

torch.Size([7, 9])

In [124]:
def train_epocs(model, train_dl, test_dl, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y in train_dl:
            x = x.long().cuda()
            y = y.float().cuda()
            y_pred, _ = model(x)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        print("train loss %.3f" % (sum_loss/total))
        test_metrics(model, test_dl)

In [125]:
def test_metrics(model, test_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x, y in test_dl:
        x = x.long().cuda()
        y = y.float().cuda().unsqueeze(1)
        y_hat, _ = model(x)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    print("test loss %.3f and accuracy %.3f" % (sum_loss/total, correct/total))

In [126]:
vocab_size = len(words)
print(vocab_size)
model = LSTMModelv1(vocab_size, 50, 100).cuda()

29372


In [121]:
batch_size = 1000
train_dl_v1 = DataLoader(train_ds_v1, batch_size=batch_size, shuffle=True)
test_dl_v1 = DataLoader(test_ds_v1, batch_size=batch_size)

In [127]:
train_epocs(model, train_dl_v1, test_dl_v1, epochs=20, lr=0.01)

train loss 0.680
test loss 0.673 and accuracy 0.568
train loss 0.598
test loss 0.569 and accuracy 0.704
train loss 0.449
test loss 0.458 and accuracy 0.798
train loss 0.330
test loss 0.515 and accuracy 0.741
train loss 0.370
test loss 0.486 and accuracy 0.784
train loss 0.261
test loss 0.505 and accuracy 0.806
train loss 0.180
test loss 0.474 and accuracy 0.818
train loss 0.130
test loss 0.501 and accuracy 0.825
train loss 0.089
test loss 0.544 and accuracy 0.830
train loss 0.073
test loss 0.627 and accuracy 0.823
train loss 0.048
test loss 0.635 and accuracy 0.830
train loss 0.030
test loss 0.703 and accuracy 0.819
train loss 0.026
test loss 0.750 and accuracy 0.825
train loss 0.016
test loss 0.863 and accuracy 0.835
train loss 0.010
test loss 0.873 and accuracy 0.830
train loss 0.008
test loss 0.958 and accuracy 0.825
train loss 0.007
test loss 0.962 and accuracy 0.833
train loss 0.013
test loss 0.940 and accuracy 0.823
train loss 0.015
test loss 0.920 and accuracy 0.831
train loss 0

In [128]:
def save_model(m, p): torch.save(m.state_dict(), p)
    
def load_model(m, p): m.load_state_dict(torch.load(p))

In [132]:
p = PATH/"model-82.pth"
save_model(model, p)

## Model 2 GRU

In [129]:
class GRUModel(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(GRUModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.dropout = nn.Dropout(0.3)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.linearOut = nn.Linear(hidden_dim, 1)
        
    def forward(self, inputs) :
        x = self.embeddings(inputs)
        x = self.dropout(x)
        lstm_out, lstm_h = self.gru(x)
        x = lstm_out[:, -1]
        x = self.linearOut(x)
        return x, lstm_h

In [130]:
vocab_size = len(words)
print(vocab_size)
model2 = GRUModel(vocab_size, 50, 50).cuda()

29372


In [131]:
train_epocs(model2, train_dl_v1, test_dl_v1, epochs=10, lr=0.01)

train loss 0.665
test loss 0.606 and accuracy 0.677
train loss 0.632
test loss 0.773 and accuracy 0.624
train loss 0.510
test loss 0.545 and accuracy 0.752
train loss 0.404
test loss 0.413 and accuracy 0.824
train loss 0.339
test loss 0.422 and accuracy 0.834
train loss 0.253
test loss 0.344 and accuracy 0.870
train loss 0.195
test loss 0.321 and accuracy 0.877
train loss 0.161
test loss 0.341 and accuracy 0.880
train loss 0.126
test loss 0.385 and accuracy 0.879
train loss 0.103
test loss 0.422 and accuracy 0.874


In [None]:
p = PATH/"model-82.pth"
save_model(model, p)

## Dataset version 2

In [21]:
# This function encodes sentences without padding 
def encode_sentence_v2(path, vocab2index):
    x = spacy_tok(path.read_text().lower())
    return np.array([vocab2index.get(w, vocab2index["UNK"]) for w in x])

In [22]:
path = PATH/"train/neg/211_4.txt"
encode_sentence(path, vocab2index)

array([  929,   732,    99,  2115,    99,   231,    23,  2888,    28,
         428,   788,   119,    52,   890,   127,  1885,   264,  1489,
         987,    93,   390,  4499,    93,  4500,    75,  2203,  1047,
          75, 18225,    47,   483,   141,  1574,    23,  2994,    34,
          26, 16331,   353,   705,   881,    73,   141,   197,  2624,
        3058,    52,     3,   184,  4303,    52, 10076,    30,   117,
          12,   264,    98,    26,  1498,  1132,     2,     8,    66,
        6401,  2205,    47,   353,    58,    23,  2115,   353,  7441,
       24959,    73,   666,   751,  1362,   141,   287,     2,  1885,
         967,    47,   696,    73,    52,  1074,   801,  5503,    47,
        1966,    73,    26,    62,    71,    52,    68,  4209,     3,
       16734,    73,    42,   490, 18225,    63, 13088,  7768,    47,
           2,    66,  2606,  4048,    58,     5,    73,  5561,   167,
         536,    42,   105,     1,   615,  4459,    71,    66,   161,
         178,   141,

In [34]:
class ImdbDatasetv2(Dataset):
    def __init__(self, PATH, train="train"):
        self.path_to_images = PATH/train
        self.pos_files = list((self.path_to_images/"pos").iterdir())
        self.neg_files = list((self.path_to_images/"neg").iterdir())
        self.files = self.pos_files + self.neg_files
        # pos 1, neg 0
        self.y = np.concatenate((np.ones(len(self.pos_files), dtype=int),
                                np.zeros(len(self.neg_files), dtype=int)), axis=0)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        path = self.files[idx]
        return encode_sentence(path, vocab2index), self.y[idx]

In [35]:
train_ds_v2 = ImdbDatasetv2(PATH)
test_ds_v2 = ImdbDatasetv2(PATH, "test")

In [36]:
x, y = test_ds_v2[0]
len(x)

779

In [37]:
x, y = train_ds_v2[0]
len(x)

103

## collate_fn function

The `collate_fn` merges a list of samples to form a mini-batch. It is an optional parameter to our data loader. 

In [47]:
data = [([4, 545, 23, 1], 0), ([34, 84], 1), ([23, 6, 774], 0)]
sentences, labels = zip(*data)
list(sentences)

[[4, 545, 23, 1], [34, 84], [23, 6, 774]]

In [76]:
def collate_fn(data):
    """Creates mini-batch tensors from the list of tuples (sentences, labels).
    
    Need custom collate_fn because merging sequences (including padding) is not 
    supported in default. Sequences are padded to the maximum length of mini-batch 
    sequences (dynamic padding).
    
    Args:
        data: list of tuple (sentence, label). 
            - list of word indices of variable length
            - label, 0 or 1
    Returns:
        packed_batch: (PackedSequence), see torch.nn.utils.rnn.pack_padded_sequence
        sencences: torch tensor of shape (batch_size, max_len).
        labels: torch tensor of shape (batch_size, 1).
        lengths: list; valid length for each padded sentence. 
    """
    # Sort a data list by sentences length (descending order).
    data.sort(key=lambda x: len(x[0]), reverse=True)
    sentences, labels = zip(*data)
    
    # stack labels
    labels = torch.Tensor(labels).unsqueeze(1)
    
    # Merge sentences
    lengths = [len(s) for s in sentences]
   
    sents = torch.zeros(len(sentences), max(lengths)).long()
    for i, s in enumerate(sentences):
        end = lengths[i]
        sents[i, :end] = torch.Tensor(s[:end])        
    
    return sents, labels, lengths

In [77]:
batch_size = 5
train_dl_v2 = DataLoader(train_ds_v2, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_dl_v2 = DataLoader(test_ds_v2, batch_size=batch_size, collate_fn=collate_fn)

In [78]:
sents, labels, lengths = next(iter(train_dl))

In [85]:
sents.shape, labels.shape

(torch.Size([5, 606]), torch.Size([5, 1]))

In [83]:
lengths

[606, 209, 181, 169, 152]

## Model

In [153]:
class LSTMModel(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(LSTMModel,self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linearOut = nn.Linear(hidden_dim, 1)
        
    def forward(self, inputs, lengths):
        x = self.embeddings(inputs)
        pack = pack_padded_sequence(x, lengths, batch_first=True)
        lstm_out, (ht, ct) = self.lstm(pack)
        x = ht[-1]
        x = self.linearOut(x)
        return x

In [164]:
batch_size = 500
train_dl_v2 = DataLoader(train_ds_v2, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_dl_v2 = DataLoader(test_ds_v2, batch_size=batch_size, collate_fn=collate_fn)

In [165]:
vocab_size = len(words)
print(vocab_size)
model = LSTMModel(vocab_size, 50, 100).cuda()

29372


In [166]:
model = train_epocs(model, train_dl_v2, test_dl_v2, epochs=5, lr=0.01)

train loss 0.651
test loss 0.558 and accuracy 0.718
train loss 0.548
test loss 0.529 and accuracy 0.749
train loss 0.392
test loss 0.396 and accuracy 0.831
train loss 0.275
test loss 0.505 and accuracy 0.768
train loss 0.255
test loss 0.397 and accuracy 0.857


In [184]:
def save_model(m, p): torch.save(m.state_dict(), p)
    
def load_model(m, p): m.load_state_dict(torch.load(p))

In [51]:
p = PATH/"model-85.pth"
save_model(model, p)

In [52]:
test_metrics(model)

test loss 0.939 and accuracy 0.817


In [49]:
load_model(model, p)

## GRU model with dropout

In [186]:
class GRUModel(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(GRUModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.dropout = nn.Dropout(0.5)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.linearOut = nn.Linear(hidden_dim, 1)
        
    def forward(self, inputs, lengths):
        x = self.embeddings(inputs)
        pack = pack_padded_sequence(x, lengths, batch_first=True)
        out, ht = self.gru(pack)
        x = ht[-1]
        x = self.dropout(x)
        x = self.linearOut(x)
        return x

In [187]:
vocab_size = len(words)
print(vocab_size)
model2 = GRUModel(vocab_size, 50, 50).cuda()

29372


In [188]:
model2 = train_epocs(model2, epochs=5, lr=0.01)

train loss 0.648
test loss 0.623 and accuracy 0.663
train loss 0.512
test loss 0.377 and accuracy 0.841
train loss 0.261
test loss 0.270 and accuracy 0.889
train loss 0.144
test loss 0.306 and accuracy 0.887
train loss 0.074
test loss 0.377 and accuracy 0.882


In [189]:
p = PATH/"model-gru-88.pth"
save_model(model2, p)

## Exercise:
Start with pre-trained embeddings.

## References

The model in this notebook is adapted from this [pytorch tutorial](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html). 

The data loader function `collate_fn` has been adapted from [here](https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/03-advanced/image_captioning/data_loader.py).