## 2 - Use LSTM model to conduct sentiment analysis ##

### 2.1 Prepare the data (10 Points) ###
Conduct sentiment analysis using IMDB data with reccurent neural network. Make prediction on sentiment (positive/negative) as a binary classification.
More details can be found here, https://keras.io/api/datasets/imdb/

**Tasks:**
1. Load the data from IMDB review dataset and **print out** the lengths of sequences. **(5 Points)**
2. Preprocess review data to meet the network input requirement by specifying **number of words=1000**, setting **the analysis length of the review = 100**, and **padding the input sequences**. **(5 Points)**

**Hints:**  
1. You may load the IMDB data with keras.datasets.imdb.load_data(num_words=max_features). Here, max_features is set to **1000**.
2. You may use keras.preprocessing.sequence.pad_sequences(x_train, maxlen) to pad the input sequences and set maxlen to **100**.

**Note:**\
We train the build LSTM-based model with ALL training data; the **validation set** (aka **development set**) is set with the **testing set** for model evaluation. This split is common in the application with limited sampled observation data, like NLP problems.

Source: https://github.com/bentrevett/pytorch-sentiment-analysis

In [97]:
# Import Packages 

import collections
import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [98]:
# Create seed for initializing training

seed = 6666
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Import and Process Dataset

In [99]:
#Import IMDB dataset  
train_data, test_data = datasets.load_dataset("imdb", split=["train", "test"])


In [100]:
# Create Token Setup
tokenizer = torchtext.data.utils.get_tokenizer("basic_english")

def tokenize_example(example, tokenizer, max_length):
    tokens = tokenizer(example["text"])[:max_length]
    length = len(tokens)
    return {"tokens": tokens, "length": length}

max_length = 256

train_data = train_data.map(
    tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
)
test_data = test_data.map(
    tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
)

In [101]:
# Split validation data

test_size = 0.25
train_valid_data = train_data.train_test_split(test_size=test_size)
train_data = train_valid_data["train"]
valid_data = train_valid_data["test"]

In [102]:
min_freq = 5
special_tokens = ["<unk>", "<pad>"]

vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

In [103]:
unk_index = vocab["<unk>"]
pad_index = vocab["<pad>"]

In [104]:
vocab.set_default_index(unk_index)

In [105]:
def numericalize_example(example, vocab):
    ids = vocab.lookup_indices(example["tokens"])
    return {"ids": ids}


In [106]:
train_data = train_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
valid_data = valid_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
test_data = test_data.map(numericalize_example, fn_kwargs={"vocab": vocab})

In [107]:
# Convert to Tensor for training
train_data = train_data.with_format(type="torch", columns=["ids", "label", "length"])
valid_data = valid_data.with_format(type="torch", columns=["ids", "label", "length"])
test_data = test_data.with_format(type="torch", columns=["ids", "label", "length"])

In [108]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_ids = [i["ids"] for i in batch]
        batch_ids = nn.utils.rnn.pad_sequence(
            batch_ids, padding_value=pad_index, batch_first=True
        )
        batch_length = [i["length"] for i in batch]
        batch_length = torch.stack(batch_length)
        batch_label = [i["label"] for i in batch]
        batch_label = torch.stack(batch_label)
        batch = {"ids": batch_ids, "length": batch_length, "label": batch_label}
        return batch

    return collate_fn

In [109]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

In [110]:
batch_size = 512

train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

# Build LSTM model

In [111]:


class LSTM(nn.Module):
    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_dim,
        output_dim,
        n_layers,
        bidirectional,
        dropout_rate,
        pad_index,
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            n_layers,
            bidirectional=bidirectional,
            dropout=dropout_rate,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, ids, length):
        # ids = [batch size, seq len]
        # length = [batch size]
        embedded = self.dropout(self.embedding(ids))
        # embedded = [batch size, seq len, embedding dim]
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, length, batch_first=True, enforce_sorted=False
        )
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        output, output_length = nn.utils.rnn.pad_packed_sequence(packed_output)
        # output = [batch size, seq len, hidden dim * n directions]
        if self.lstm.bidirectional:
            hidden = self.dropout(torch.cat([hidden[-1], hidden[-2]], dim=-1))
            # hidden = [batch size, hidden dim * 2]
        else:
            hidden = self.dropout(hidden[-1])
            # hidden = [batch size, hidden dim]
        prediction = self.fc(hidden)
        # prediction = [batch size, output dim]
        return prediction

# Set Up Functions for Training

In [112]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [113]:
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if "bias" in name:
                nn.init.zeros_(param)
            elif "weight" in name:
                nn.init.orthogonal_(param)

In [114]:
def get_accuracy(prediction, label):
    batch_size, _ = prediction.shape
    predicted_classes = prediction.argmax(dim=-1)
    correct_predictions = predicted_classes.eq(label).sum()
    accuracy = correct_predictions / batch_size
    return accuracy

In [115]:
def train(dataloader, model, criterion, optimizer, device):
    model.train()
    epoch_losses = []
    epoch_accs = []
    for batch in tqdm.tqdm(dataloader, desc="training..."):
        ids = batch["ids"].to(device)
        length = batch["length"]
        label = batch["label"].to(device)
        prediction = model(ids, length)
        loss = criterion(prediction, label)
        accuracy = get_accuracy(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
        epoch_accs.append(accuracy.item())
    return np.mean(epoch_losses), np.mean(epoch_accs)

In [116]:
def evaluate(dataloader, model, criterion, device):
    model.eval()
    epoch_losses = []
    epoch_accs = []
    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader, desc="evaluating..."):
            ids = batch["ids"].to(device)
            length = batch["length"]
            label = batch["label"].to(device)
            prediction = model(ids, length)
            loss = criterion(prediction, label)
            accuracy = get_accuracy(prediction, label)
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())
    return np.mean(epoch_losses), np.mean(epoch_accs)

# Set Up Training Parameters

In [117]:
# Hyperparameters
vocab_size = len(vocab)
embedding_dim = 300
hidden_dim = 300
output_dim = len(train_data.unique("label"))
n_layers = 2
bidirectional = True
dropout_rate = 0.5
learning_rate = 5e-4
n_epochs = 10

#Initialize Model
model = LSTM(
    vocab_size,
    embedding_dim,
    hidden_dim,
    output_dim,
    n_layers,
    bidirectional,
    dropout_rate,
    pad_index,
)


model.apply(initialize_weights)
vectors = torchtext.vocab.GloVe()

pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())
model.embedding.weight.data = pretrained_embedding

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()


model = model.to(device)
criterion = criterion.to(device)

# Train Model on Data

In [118]:
best_valid_loss = float("inf")

metrics = collections.defaultdict(list)

for epoch in range(n_epochs):
    train_loss, train_acc = train(
        train_data_loader, model, criterion, optimizer, device
    )
    valid_loss, valid_acc = evaluate(valid_data_loader, model, criterion, device)
    metrics["train_losses"].append(train_loss)
    metrics["train_accs"].append(train_acc)
    metrics["valid_losses"].append(valid_loss)
    metrics["valid_accs"].append(valid_acc)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "lstm.pt")
    print(f"epoch: {epoch}")
    print(f"train_loss: {train_loss:.3f}, train_acc: {train_acc:.3f}")
    print(f"valid_loss: {valid_loss:.3f}, valid_acc: {valid_acc:.3f}")

training...: 100%|██████████| 37/37 [00:05<00:00,  6.70it/s]
evaluating...: 100%|██████████| 13/13 [00:00<00:00, 20.62it/s]


epoch: 0
train_loss: 0.630, train_acc: 0.644
valid_loss: 0.553, valid_acc: 0.721


training...: 100%|██████████| 37/37 [00:05<00:00,  6.72it/s]
evaluating...: 100%|██████████| 13/13 [00:00<00:00, 20.55it/s]


epoch: 1
train_loss: 0.503, train_acc: 0.759
valid_loss: 0.426, valid_acc: 0.795


training...: 100%|██████████| 37/37 [00:05<00:00,  6.63it/s]
evaluating...: 100%|██████████| 13/13 [00:00<00:00, 20.16it/s]


epoch: 2
train_loss: 0.428, train_acc: 0.808
valid_loss: 0.493, valid_acc: 0.768


training...: 100%|██████████| 37/37 [00:05<00:00,  6.62it/s]
evaluating...: 100%|██████████| 13/13 [00:00<00:00, 20.40it/s]


epoch: 3
train_loss: 0.405, train_acc: 0.817
valid_loss: 0.370, valid_acc: 0.852


training...: 100%|██████████| 37/37 [00:05<00:00,  6.59it/s]
evaluating...: 100%|██████████| 13/13 [00:00<00:00, 20.26it/s]


epoch: 4
train_loss: 0.376, train_acc: 0.840
valid_loss: 0.480, valid_acc: 0.770


training...: 100%|██████████| 37/37 [00:05<00:00,  6.60it/s]
evaluating...: 100%|██████████| 13/13 [00:00<00:00, 20.12it/s]


epoch: 5
train_loss: 0.353, train_acc: 0.852
valid_loss: 0.338, valid_acc: 0.871


training...: 100%|██████████| 37/37 [00:05<00:00,  6.62it/s]
evaluating...: 100%|██████████| 13/13 [00:00<00:00, 20.51it/s]


epoch: 6
train_loss: 0.326, train_acc: 0.864
valid_loss: 0.314, valid_acc: 0.873


training...: 100%|██████████| 37/37 [00:05<00:00,  6.65it/s]
evaluating...: 100%|██████████| 13/13 [00:00<00:00, 20.19it/s]


epoch: 7
train_loss: 0.273, train_acc: 0.892
valid_loss: 0.288, valid_acc: 0.883


training...: 100%|██████████| 37/37 [00:05<00:00,  6.51it/s]
evaluating...: 100%|██████████| 13/13 [00:00<00:00, 20.38it/s]


epoch: 8
train_loss: 0.244, train_acc: 0.905
valid_loss: 0.289, valid_acc: 0.886


training...: 100%|██████████| 37/37 [00:05<00:00,  6.58it/s]
evaluating...: 100%|██████████| 13/13 [00:00<00:00, 20.29it/s]


epoch: 9
train_loss: 0.229, train_acc: 0.911
valid_loss: 0.282, valid_acc: 0.886


# Predict IMDB Ratings

In [124]:
# Load Model

model.load_state_dict(torch.load("lstm.pt"))

test_loss, test_acc = evaluate(test_data_loader, model, criterion, device)

evaluating...:   0%|          | 0/49 [00:00<?, ?it/s]

evaluating...: 100%|██████████| 49/49 [00:02<00:00, 19.91it/s]


In [125]:
def imdbRating(text, model, tokenizer, vocab, device):
    tokens = tokenizer(text)
    ids = vocab.lookup_indices(tokens)
    length = torch.LongTensor([len(ids)])
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)
    prediction = model(tensor, length).squeeze(dim=0)
    probability = torch.softmax(prediction, dim=-1)
    predicted_class = prediction.argmax(dim=-1).item()
    predicted_probability = probability[predicted_class].item()
    return predicted_class, predicted_probability

In [126]:
text = "This film is lovely!"

imdbRating(text, model, tokenizer, vocab, device)

(1, 0.7892040014266968)

In [127]:
text = "This film is trash!"

imdbRating(text, model, tokenizer, vocab, device)

(0, 0.7881683111190796)

In [128]:
text = "This film is mid"

imdbRating(text, model, tokenizer, vocab, device)

(1, 0.5759172439575195)