In [None]:
import pandas as pd

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pad_sequence

In [None]:
from torchtext.vocab import build_vocab_from_iterator



In [None]:
from torchtext.data.utils import get_tokenizer



In [None]:
# Data source: "IMDB Dataset of 50K Movie Reviews" on Kaggle
# https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/data
# The CSV is expected to sit next to this notebook.
df = pd.read_csv(filepath_or_buffer='./imdb.csv')
df.shape

(50000, 2)

In [None]:
# Eyeball three randomly drawn rows to sanity-check the loaded columns.
df.sample(n=3)

Unnamed: 0,review,sentiment
30888,What more could I say? The Americans totally h...,positive
1001,Why do movie makers always go against the auth...,negative
28864,Derek Jarman has shown us time and time again ...,negative


In [None]:
# torchtext's 'basic_english' tokenizer; it is called on a raw review string
# further down and its result is fed to the vocabulary.
tokenizer = get_tokenizer(tokenizer='basic_english')
tokenizer

In [None]:
def yield_tokens(data_iter):
    """Lazily tokenize every text in `data_iter`.

    Yields one result of the module-level `tokenizer` per input text, in input
    order, so the whole corpus never has to be tokenized up front.
    """
    yield from (tokenizer(document) for document in data_iter)


In [None]:
# Tokens passed through the `specials` parameter get the first indices (0, 1, ...),
# in the order given: '<unk>' first, then '<pad>'.
vocab = build_vocab_from_iterator(
    yield_tokens(df['review']),
    specials=['<unk>', '<pad>'],
)

# Any token that is not in the vocabulary is mapped to the index of '<unk>'.
vocab.set_default_index(vocab['<unk>'])

In [None]:
# Sanity check: an out-of-vocabulary token ('<qqqqxx>') must resolve to the
# same index as '<unk>'.
tuple(vocab[token] for token in ('<pad>', '<unk>', '<qqqqxx>'))

(1, 0, 0)

In [None]:
# Distinct label values present in the 'sentiment' column.
set(df['sentiment'].unique())

{'negative', 'positive'}

In [None]:
# Encode labels as class ids: 'positive' -> 1, anything else -> 0.
# The tensor is created as int64 (Long) directly, which avoids
# "RuntimeError: expected scalar type Long but found Double" in the loss.
data_sentiments = torch.tensor(
    (df['sentiment'] == 'positive').to_numpy(),
    dtype=torch.long,
)
type(data_sentiments)

torch.Tensor

In [None]:
# Encode each review: raw text -> tokens -> list of vocabulary indices.
data_reviews = df['review'].map(lambda text: vocab(tokenizer(text)))


In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset pairing each encoded review with its sentiment label.

    Args:
        reviews: Iterable (list, pandas Series, ...) whose items are sequences
            of token indices, one item per review.
        sentiments: Labels aligned with `reviews` by position; indexed with
            `sentiments[idx]` and returned as-is.

    Raises:
        ValueError: If `reviews` and `sentiments` do not have the same length.
    """

    def __init__(self, reviews, sentiments):
        # Materialize into a list so __getitem__ is always positional.
        # A pandas Series indexed with `[]` is looked up by *label*, which
        # raises KeyError or silently misaligns reviews and labels as soon as
        # the index is not exactly 0..n-1 (e.g. after filtering or shuffling).
        self.reviews = list(reviews)
        self.sentiments = sentiments
        if len(self.reviews) != len(self.sentiments):
            raise ValueError(
                f'reviews and sentiments must have the same length, '
                f'got {len(self.reviews)} and {len(self.sentiments)}'
            )

    def __len__(self):
        """Return the number of (review, sentiment) pairs."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return `(review_tensor, sentiment)` for position `idx`."""
        # dtype is pinned so that an empty review still yields an integer
        # tensor (torch.tensor([]) would otherwise default to float32).
        # The sentiment is returned untouched.
        review = torch.tensor(self.reviews[idx], dtype=torch.long)
        return review, self.sentiments[idx]


In [None]:
# Pair the encoded reviews with their class-id labels.
dataset = SentimentDataset(reviews=data_reviews, sentiments=data_sentiments)


In [None]:
# Confirm which class the dataset object is an instance of.
dataset.__class__

In [None]:
def collate_fn(batch):
    """Collate `(review, sentiment)` pairs into one padded batch.

    Reviews have different lengths, so each one is right-padded with the
    index of '<pad>' up to the length of the longest review in this batch.

    Returns:
        A tuple `(reviews_padded, sentiments)`: a batch-first tensor with one
        row per review, and a tensor holding the labels in the same order.
    """
    reviews = [review for review, _ in batch]
    sentiments = [sentiment for _, sentiment in batch]
    pad_index = vocab['<pad>']
    reviews_padded = pad_sequence(reviews, batch_first=True, padding_value=pad_index)
    return reviews_padded, torch.tensor(sentiments)

In [None]:
class SentimentModel(nn.Module):
    """Bag-of-embeddings classifier: pooled token embeddings -> linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        num_classes: Number of output logits per review.
        padding_idx: Optional index of the padding token. When given, entries
            equal to this index are excluded from the pooled embedding, so a
            review's representation does not depend on how much padding its
            batch happened to need. Defaults to None, which keeps the previous
            behaviour of pooling over every position.
    """

    def __init__(self, vocab_size, embed_dim, num_classes, padding_idx=None):
        super().__init__()
        # sparse=True produces sparse gradients for the embedding table, which
        # restricts the choice of optimizer (e.g. Adam rejects them).
        self.embedding = nn.EmbeddingBag(
            vocab_size,
            embed_dim,
            sparse=True,
            padding_idx=padding_idx,
        )
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, txt):
        """Return logits of shape (batch, num_classes) for a 2-D batch of token indices."""
        # With 2-D input and no offsets, each row is treated as one bag.
        pooled = self.embedding(txt)
        return self.fc(pooled)


In [None]:
# Model hyper-parameters.
VOCAB_SIZE = len(vocab)   # one embedding row per vocabulary entry
EMBED_SIZE = 128          # width of each token embedding
NUM_CLASSES = 2           # negative / positive

# NOTE(review): the model is built without a padding index, so the '<pad>'
# positions added by collate_fn appear to be pooled into each review's
# embedding -- confirm whether that is intended.
model = SentimentModel(
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBED_SIZE,
    num_classes=NUM_CLASSES,
)

In [None]:
# 80% train / 10% dev / 10% test.
# The split is drawn from an explicitly seeded generator so that the same
# reviews land in the same subset on every Restart & Run All; without it the
# membership of the held-out sets changes from run to run and results are not
# comparable.
data_trn, data_dev, data_tst = random_split(
    dataset,
    [.8, .1, .1],
    generator=torch.Generator().manual_seed(42),
)
len(data_trn)

40000

In [None]:
BATCH_SIZE = 2

# One loader per split, all sharing the padding collate function.
# Only the training loader reshuffles its samples; dev and test keep a fixed order.
loader_trn, loader_dev, loader_tst = (
    DataLoader(split, batch_size=BATCH_SIZE, shuffle=reshuffle, collate_fn=collate_fn)
    for split, reshuffle in ((data_trn, True), (data_dev, False), (data_tst, False))
)

len(loader_trn), len(loader_tst)

(20000, 2500)

In [None]:
# Cross-entropy over the two class logits.
criterion = nn.CrossEntropyLoss()

# Plain SGD is used because the embedding layer is created with sparse=True.
# The previously tried
#   torch.optim.Adam(params= model.parameters(), lr= .001)
# fails with:
#   RuntimeError: Adam does not support sparse gradients, please consider SparseAdam instead
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)

In [None]:
# Train for NUM_EPOCHS full passes over the training loader and report the
# average batch loss after each pass.
NUM_EPOCHS = 15
for e in range(NUM_EPOCHS):
    # Put the model in training mode at the start of every epoch.
    model.train()
    # Running sum of the per-batch losses for this epoch.
    loss_total = 0
    for reviews, sentiments in loader_trn:
        # Clear gradients left from the previous step before computing new ones.
        optimizer.zero_grad()
        logits = model(reviews)
        loss = criterion(logits, sentiments)
        loss.backward()
        optimizer.step()
        # Accumulate the loss as a plain Python number.
        loss_total += loss.item()
    # Mean of the per-batch losses: the sum divided by the number of batches.
    print(f'Epoch {e+1}/{NUM_EPOCHS} --- Loss: {loss_total / len(loader_trn):.4f}')

Epoch 1/15 --- Loss: 0.6584
Epoch 2/15 --- Loss: 0.6194
Epoch 3/15 --- Loss: 0.5973
Epoch 4/15 --- Loss: 0.5803
Epoch 5/15 --- Loss: 0.5646
Epoch 6/15 --- Loss: 0.5497
Epoch 7/15 --- Loss: 0.5382
Epoch 8/15 --- Loss: 0.5256
Epoch 9/15 --- Loss: 0.5144
Epoch 10/15 --- Loss: 0.5036
Epoch 11/15 --- Loss: 0.4949
Epoch 12/15 --- Loss: 0.4877
Epoch 13/15 --- Loss: 0.4798
Epoch 14/15 --- Loss: 0.4721
Epoch 15/15 --- Loss: 0.4653


In [None]:
# Score the dev split: collect every prediction and label, then let
# scikit-learn compute the accuracy.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():  # inference only, no gradients needed
    for reviews, sentiments in loader_dev:
        logits = model(reviews)
        # Predicted class = index of the largest logit in each row.
        preds = logits.argmax(dim=1)
        all_preds += preds.tolist()
        all_labels += sentiments.tolist()

accuracy = accuracy_score(y_true=all_labels, y_pred=all_preds)
print(f'Validation Accuracy: {accuracy:.3f}')


Validation Accuracy: 0.778


In [None]:
# Evaluation loop: hand-rolled accuracy on the dev split, counting matches
# directly instead of going through accuracy_score.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for reviews, sentiments in loader_dev:
        logits = model(reviews)
        # max over the class dimension returns (values, indices); only the
        # indices (predicted classes) are needed.
        _, predicted = logits.max(dim=1)
        total += sentiments.shape[0]
        correct += predicted.eq(sentiments).sum().item()
print(f'Accuracy on validation set: {100 * correct / total:.2f}%')

Accuracy on validation set: 77.84%
