In [1]:
!pip install kaggle



In [2]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 66% 17.0M/25.7M [00:00<00:00, 46.5MB/s]
100% 25.7M/25.7M [00:00<00:00, 62.4MB/s]


In [3]:
!unzip imdb-dataset-of-50k-movie-reviews.zip

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [None]:
# import pandas as pd
# df = pd.read_csv('/content/IMDB Dataset.csv')
# def to_sentiment(rating):
#   rating = str(rating)
#   if rating == 'positive':
#     return 0
#   else:
#     return 1

# df['sentiment'] = df.sentiment.apply(to_sentiment)
# print(df.sample(5))
# df.to_csv('/content/IMDB Dataset_clean.csv', index=False)

In [6]:
%matplotlib inline
%load_ext autoreload

In [5]:
# !pip install pandas_profiling
# !pip install pydantic==1.8.2

In [62]:
import pandas as pd
import numpy as np
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import seaborn as sns

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from sklearn.model_selection import train_test_split

from collections import defaultdict

import matplotlib.pyplot as plt
from textwrap import wrap



import gc



RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

MAX_LEN = 400
BATCH_SIZE = 8

class_names = ['negative', 'positive']
EPOCHS = 2

In [3]:
# df = pd.read_csv('/content/IMDB Dataset_clean.csv',nrows=100)
df = pd.read_csv('/content/IMDB Dataset_clean.csv')
print(len(df))
print(df.sample(5))

50000
                                                  review  sentiment
33553  I really liked this Summerslam due to the look...          0
9427   Not many television shows appeal to quite as m...          0
199    The film quickly gets to a major chase scene w...          1
12447  Jane Austen would definitely approve of this o...          0
39489  Expectations were somewhat high for me when I ...          1


 Using cased version of BERT and tokenizer.

In [4]:
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

In [5]:
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
# Testing Tokenizer
sample_txt = 'I want to learn how to do sentiment analysis using BERT and tokenizer.'

encoding = tokenizer.encode_plus(
  sample_txt,
  max_length=32,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  pad_to_max_length=True,
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
  truncation = True
)

print(encoding.keys())


dict_keys(['input_ids', 'attention_mask'])




In [6]:
class MovieReviewDataset(Dataset):
  def __init__(self, reviews, targets, tokenizer, max_len):
    self.reviews = reviews
    self.targets = targets # Labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.reviews)

  def __getitem__(self, item):
    review = str(self.reviews[item])
    target = self.targets[item]


    # Tokenize with max_len and padding :D
    encoding = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt', # returned as tensor to ve used in pytorch
      truncation = True
    )

    return {
      'review_text': review,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [7]:
# Split
df_train, df_test = train_test_split(df, test_size=0.3, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

In [63]:
def create_data_loader(df, tokenizer, max_len, batch_size):
  ds = MovieReviewDataset(
    reviews=df.review.to_numpy(),
    targets=df.sentiment.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=4
  )

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [11]:
data = next(iter(train_data_loader))
data.keys()

print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

torch.Size([16, 400])
torch.Size([16, 400])
torch.Size([16])



We are using the BERT model and build the sentiment classifier on top of it. We then try to use the model on our sample text.

In [12]:
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [13]:
print(bert_model)
print(bert_model.config)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [27]:
# Test Model on smaple txt above
# last_hidden_state,pooler_output,_,_,_,_ = bert_model(
last_hidden_state,pooler_output = bert_model(
  input_ids=encoding['input_ids'],
  attention_mask=encoding['attention_mask']
)

print(last_hidden_state)
print(pooled_output)

last_hidden_state
pooler_output


In [11]:
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
      super(SentimentClassifier, self).__init__()
      self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

      self.drop = nn.Dropout(p=0.3)
      self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
      bert_out = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      pooler_output=bert_out['pooler_output']
      output = self.drop(pooler_output)
      return self.out(output)


# import torch

model = SentimentClassifier(len(class_names))
model = model.to(device)

# torch.cuda.empty_cache()

In [38]:
# Test Model Code
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

F.softmax(model(input_ids, attention_mask), dim=1)


tensor([[0.6661, 0.3339],
        [0.5883, 0.4117],
        [0.4828, 0.5172],
        [0.5343, 0.4657],
        [0.6297, 0.3703],
        [0.6195, 0.3805],
        [0.5001, 0.4999],
        [0.4291, 0.5709],
        [0.5114, 0.4886],
        [0.5360, 0.4640],
        [0.5868, 0.4132],
        [0.6021, 0.3979],
        [0.4559, 0.5441],
        [0.6366, 0.3634],
        [0.5806, 0.4194],
        [0.5186, 0.4814]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [12]:
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [47]:
def train_epoch(
      model,
      data_loader,
      loss_fn,
      optimizer,
      device,
      scheduler,
      n_examples):

  # Set Training
  model = model.train()

  losses = []
  correct_predictions = 0

  for batch_idx,batch in enumerate(data_loader):
    print(f'[Train]batch {batch_idx + 1}/{len(data_loader)}')
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    targets = batch["targets"].to(device)

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)

    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())

    loss.backward()

    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

    del input_ids
    del attention_mask
    del targets
    del outputs

    torch.cuda.empty_cache()
    gc.collect()


  return correct_predictions.double() / n_examples, np.mean(losses)

In [44]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  # Set to Eval Mode
  model = model.eval()

  losses = []
  correct_predictions = 0

  for batch_idx,batch in enumerate(data_loader):
    print(f'[Eval]batch {batch_idx + 1}/{len(data_loader)}')
    # continue
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    targets = batch["targets"].to(device)

    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
    _, preds = torch.max(outputs, dim=1)

    loss = loss_fn(outputs, targets)

    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())


    del input_ids
    del attention_mask
    del targets
    del outputs

    torch.cuda.empty_cache()
    gc.collect()



  return correct_predictions.double() / n_examples, np.mean(losses)




In [None]:
%%time

history = defaultdict(list)
best_accuracy = 0

import torch
import gc


for epoch in range(EPOCHS):

  torch.cuda.empty_cache()
  gc.collect()


  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 50)


  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(df_train)
  )
  print(f'Train loss {train_loss} accuracy {train_acc}')

  torch.cuda.empty_cache()
  gc.collect()

  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn,
    device,
    len(df_val)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  torch.cuda.empty_cache()
  gc.collect()


  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

Epoch 1/2
--------------------------------------------------




[Train]batch 1/4375
[Train]batch 2/4375
[Train]batch 3/4375
[Train]batch 4/4375
[Train]batch 5/4375
[Train]batch 6/4375
[Train]batch 7/4375
[Train]batch 8/4375
[Train]batch 9/4375
[Train]batch 10/4375
[Train]batch 11/4375
[Train]batch 12/4375
[Train]batch 13/4375
[Train]batch 14/4375
[Train]batch 15/4375
[Train]batch 16/4375
[Train]batch 17/4375
[Train]batch 18/4375
[Train]batch 19/4375
[Train]batch 20/4375
[Train]batch 21/4375
[Train]batch 22/4375
[Train]batch 23/4375
[Train]batch 24/4375
[Train]batch 25/4375
[Train]batch 26/4375
[Train]batch 27/4375
[Train]batch 28/4375
[Train]batch 29/4375
[Train]batch 30/4375
[Train]batch 31/4375
[Train]batch 32/4375
[Train]batch 33/4375
[Train]batch 34/4375
[Train]batch 35/4375
[Train]batch 36/4375
[Train]batch 37/4375
[Train]batch 38/4375
[Train]batch 39/4375
[Train]batch 40/4375
[Train]batch 41/4375
[Train]batch 42/4375
[Train]batch 43/4375
[Train]batch 44/4375
[Train]batch 45/4375
[Train]batch 46/4375
[Train]batch 47/4375
[Train]batch 48/4375
[



[Eval]batch 1/938
[Eval]batch 2/938
[Eval]batch 3/938
[Eval]batch 4/938
[Eval]batch 5/938
[Eval]batch 6/938
[Eval]batch 7/938
[Eval]batch 8/938
[Eval]batch 9/938
[Eval]batch 10/938
[Eval]batch 11/938
[Eval]batch 12/938
[Eval]batch 13/938
[Eval]batch 14/938
[Eval]batch 15/938
[Eval]batch 16/938
[Eval]batch 17/938
[Eval]batch 18/938
[Eval]batch 19/938
[Eval]batch 20/938
[Eval]batch 21/938
[Eval]batch 22/938
[Eval]batch 23/938
[Eval]batch 24/938
[Eval]batch 25/938
[Eval]batch 26/938
[Eval]batch 27/938
[Eval]batch 28/938
[Eval]batch 29/938
[Eval]batch 30/938
[Eval]batch 31/938
[Eval]batch 32/938
[Eval]batch 33/938
[Eval]batch 34/938
[Eval]batch 35/938
[Eval]batch 36/938
[Eval]batch 37/938
[Eval]batch 38/938
[Eval]batch 39/938
[Eval]batch 40/938
[Eval]batch 41/938
[Eval]batch 42/938
[Eval]batch 43/938
[Eval]batch 44/938
[Eval]batch 45/938
[Eval]batch 46/938
[Eval]batch 47/938
[Eval]batch 48/938
[Eval]batch 49/938
[Eval]batch 50/938
[Eval]batch 51/938
[Eval]batch 52/938
[Eval]batch 53/938
[E



[Train]batch 1/4375
[Train]batch 2/4375
[Train]batch 3/4375
[Train]batch 4/4375
[Train]batch 5/4375
[Train]batch 6/4375
[Train]batch 7/4375
[Train]batch 8/4375
[Train]batch 9/4375
[Train]batch 10/4375
[Train]batch 11/4375
[Train]batch 12/4375
[Train]batch 13/4375
[Train]batch 14/4375
[Train]batch 15/4375
[Train]batch 16/4375
[Train]batch 17/4375
[Train]batch 18/4375
[Train]batch 19/4375
[Train]batch 20/4375
[Train]batch 21/4375
[Train]batch 22/4375
[Train]batch 23/4375
[Train]batch 24/4375
[Train]batch 25/4375
[Train]batch 26/4375
[Train]batch 27/4375
[Train]batch 28/4375
[Train]batch 29/4375
[Train]batch 30/4375
[Train]batch 31/4375
[Train]batch 32/4375
[Train]batch 33/4375
[Train]batch 34/4375
[Train]batch 35/4375
[Train]batch 36/4375
[Train]batch 37/4375
[Train]batch 38/4375
[Train]batch 39/4375
[Train]batch 40/4375
[Train]batch 41/4375
[Train]batch 42/4375
[Train]batch 43/4375
[Train]batch 44/4375
[Train]batch 45/4375
[Train]batch 46/4375
[Train]batch 47/4375
[Train]batch 48/4375
[

In [None]:
# #  history to CPU

print(history)

history_1={}
history_1['train_acc'] = [h.cpu() for h in history['train_acc']]
history_1['val_acc'] = [h.cpu() for h in history['val_acc']]
print(history_1)

plt.plot(history_1['train_acc'], label='train accuracy')
plt.plot(history_1['val_acc'], label='validation accuracy')

plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.xlim(left=0)
plt.ylim([0, 1]);
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
torch.cuda.empty_cache()
gc.collect()

test_acc, _ = eval_model(
  model,
  test_data_loader,
  loss_fn,
  device,
  len(df_test)
)

test_acc.item()

In [None]:
def get_predictions(model, data_loader):
  # Eval Mode
  model = model.eval()


  review_texts = []
  predictions = []
  prediction_probs = []
  real_values = []


  with torch.no_grad():
    for batch in data_loader:
        texts =batch['review_text']
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].to(device)


        outputs = model(
          input_ids=input_ids,
          attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)

        probs = F.softmax(outputs, dim=1)

        review_texts.extend(texts)
        predictions.extend(preds)
        prediction_probs.extend(probs)
        real_values.extend(targets)


  predictions = torch.stack(predictions).cpu()
  prediction_probs = torch.stack(prediction_probs).cpu()
  real_values = torch.stack(real_values).cpu()
  return review_texts, predictions, prediction_probs, real_values



y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
  model, test_data_loader)

In [None]:
print(classification_report(y_test, y_pred, target_names=class_names))

In [None]:
def show_confusion_matrix(confusion_matrix):
  hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
  hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
  hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
  plt.ylabel('True sentiment')
  plt.xlabel('Predicted sentiment');

cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

In [None]:
# Inference
idx = 5

review_text = y_review_texts[idx]
true_sentiment = y_test[idx]
pred_sentiment=y_pred[idx]
pred_df = pd.DataFrame({
  'class_names': class_names,
  'values': y_pred_probs[idx]
})

print("\n".join(wrap(review_text)))
print()
print(f'Predicted sentiment: {class_names[pred_sentiment]}')
print(f'True sentiment: {class_names[true_sentiment]}')

In [None]:
sns.barplot(x='values', y='class_names', data=pred_df, orient='h')
plt.ylabel('sentiment')
plt.xlabel('probability')
plt.xlim([0, 1]);

In [74]:
import torch
import gc

torch.cuda.empty_cache()
gc.collect()





0

In [75]:
!nvidia-smi

Tue May 28 23:43:51 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   74C    P0              38W /  70W |   1689MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    