In [1]:
import pandas as pd
import random
from collections import Counter
from torch.utils.data import Dataset as TorchDataset
from torch.utils.data import DataLoader, random_split
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

Device: cuda


# IMDB dataset

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/Ankit152/IMDB-sentiment-analysis/refs/heads/master/IMDB-Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
data = pd.concat([
    df[df['sentiment'] == 'positive'].sample(1500, random_state=101),
    df[df['sentiment'] == 'negative'].sample(1500, random_state=101)
]).sample(frac=1, random_state=42).reset_index(drop=True)

# RNN

In [4]:
def tokenize(text):
  text = text.replace("<br />", " ").replace("<br/>", " ")
  return text.lower().split()

counter = Counter(word for text in data['review'] for word in tokenize(text))
most_common = counter.most_common(5000)

vocab = {word: i+2 for i, (word, _) in enumerate(most_common)}
vocab['<pad>'] = 0
vocab['<unk>'] = 1

In [5]:
def encode(text):
  return torch.tensor([vocab.get(word, 1) for word in tokenize(text)])

In [6]:
class IMDBDataset(TorchDataset):
  def __init__(self, df):
    super().__init__()
    self.X = [encode(text) for text in df['review']]
    self.y = [torch.tensor([1 if s == 'positive' else 0]) for s in df['sentiment']]

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    return self.X[idx], self.y[idx]


In [7]:
def collate_fn(batch):
  Xs, ys = zip(*batch)
  max_len = max(len(x) for x in Xs)
  padded_X = [torch.cat([x, torch.zeros(max_len - len(x), dtype=torch.long)]) for x in Xs]
  ys = torch.stack(ys).float()
  return torch.stack(padded_X), ys

In [8]:
generator = torch.Generator().manual_seed(123)
dataset = IMDBDataset(data)

train_set, test_set = random_split(dataset, [2500, 500], generator=generator)

train_loader = DataLoader(train_set, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader  = DataLoader(test_set, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [9]:
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256):   # each token is a 128 element vector
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)        # initial embeddings(weight matrices) are created randomly for every token, without seeing any token yet
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)  # takes embedded x1 (timestep), computes h1 = W*h0 + U*x1 + bh, then shares h1 to next timestep, in parallel for every sample of batch
        self.fc = nn.Linear(hidden_dim, 1)                              # compresses final hidden state into single score(prediction) per sample: output = U * hn + b0
        self.sigmoid = nn.Sigmoid()                                     # converts the raw score into [0, 1] probability for binary sentiment.

    def forward(self, x):
        x = self.embedding(x)       # converts each token index into its embedding vector, conceptually looks up rows in the embedding table for each token
        _, h = self.rnn(x)          # h = hidden state at last timestep for each sample; '_' = hidden states at all timesteps (ignored here)
        x = self.fc(h.squeeze(0))   # h.squeeze(0) → removes the first dimension (RNN returns [num_layers, batch_size, hidden_dim]) → [batch_size, hidden_dim]
        return self.sigmoid(x)      # probability for class 1('pos')

In [10]:
model_rnn = SentimentRNN(len(vocab))
criterion = nn.BCELoss()
optimizer = optim.Adam(model_rnn.parameters(), lr=0.001)

In [11]:
epochs = 5
for epoch in range(epochs):
    epoch_loss = 0

    for X, y in train_loader:
        y_pred = model_rnn(X)
        loss = criterion(y_pred, y)

        optimizer.zero_grad()
        loss.backward()           # gradients for all parameters (RNN weights, Linear weights, embedding vectors, biases)
        optimizer.step()

        epoch_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {epoch_loss:.4f}')

Epoch 1, Loss: 55.6684
Epoch 2, Loss: 55.5208
Epoch 3, Loss: 55.7555
Epoch 4, Loss: 55.2195
Epoch 5, Loss: 55.1862


In [12]:
model_rnn.eval()
test_loss = 0
correct = 0

with torch.no_grad():
    for X, y in test_loader:
        output = model_rnn(X)
        test_loss += criterion(output, y).item() * X.size(0)

        preds = (output > 0.5).float()
        correct += (preds == y).sum().item()

test_loss /= len(test_loader.dataset)
accuracy = correct / len(test_loader.dataset)

print(f"Accuracy: {accuracy:.4f}, Test Loss: {test_loss:.4f}")

Accuracy: 0.5060, Test Loss: 0.6959


# Transformer

In [13]:
train_df = data.iloc[train_set.indices].reset_index(drop=True)
test_df  = data.iloc[test_set.indices].reset_index(drop=True)

train_df['label'] = train_df['sentiment'].map({'positive': 1, 'negative': 0})
test_df['label']  = test_df['sentiment'].map({'positive': 1, 'negative': 0})

hf_train_set = Dataset.from_pandas(train_df)
hf_test_set  = Dataset.from_pandas(test_df)

In [14]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def preprocess_function(examples):
    return tokenizer(examples['review'], truncation=True, padding='max_length', max_length=256)

tokenized_train_set = hf_train_set.map(preprocess_function, batched=True)
tokenized_test_set  = hf_test_set.map(preprocess_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [15]:
model_trf = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=100,
    report_to='none'
)

trainer = Trainer(
    model=model_trf,
    args=training_args,
    train_dataset=tokenized_train_set,
    eval_dataset=tokenized_test_set,
)

model_trf.to(device)
trainer.train()

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.
`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


Step,Training Loss
100,0.468866
200,0.254536
300,0.197086
400,0.091539


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=471, training_loss=0.2259291337047666, metrics={'train_runtime': 175.5847, 'train_samples_per_second': 42.714, 'train_steps_per_second': 2.682, 'total_flos': 496752744960000.0, 'train_loss': 0.2259291337047666, 'epoch': 3.0})

In [16]:
trainer.evaluate()

{'eval_loss': 0.536452054977417,
 'eval_runtime': 3.4236,
 'eval_samples_per_second': 146.044,
 'eval_steps_per_second': 9.347,
 'epoch': 3.0}

# Deployment

In [28]:
model_trf.save_pretrained("my_model")
tokenizer.save_pretrained("my_model")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('my_model/tokenizer_config.json', 'my_model/tokenizer.json')

In [30]:
from huggingface_hub import login

login()

In [None]:
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="my_model",
    repo_id="javokhirumar/sentiment-analysis",
    repo_type="model"
)

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...y_model/model.safetensors:   2%|2         | 6.03MB /  268MB            

CommitInfo(commit_url='https://huggingface.co/javokhirumar/sentiment-analysis/commit/39d31b01f80e731ea478980b6aab1fcfc27266b7', commit_message='Upload folder using huggingface_hub', commit_description='', oid='39d31b01f80e731ea478980b6aab1fcfc27266b7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/javokhirumar/sentiment-analysis', endpoint='https://huggingface.co', repo_type='model', repo_id='javokhirumar/sentiment-analysis'), pr_revision=None, pr_num=None)