In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel,AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix,classification_report

In [2]:
# Load the IMDB reviews; lines pandas cannot parse are skipped instead of raising.
df = pd.read_csv(
    "IMDB Dataset.csv",
    encoding='utf-8',
    on_bad_lines='skip',
)
# Preview the first ten rows as the cell's rich output.
df.head(n=10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [3]:
# Encode sentiment as the integer class index used for training
# (negative -> 0, positive -> 1).
# Any other value is rejected up front: the classifier is built with two
# output classes, so a third label would only surface later as an
# out-of-range target error inside the loss.
sentiment_to_label = {'negative': 0, 'positive': 1}
unexpected_sentiments = set(df['sentiment'].unique()) - set(sentiment_to_label)
if unexpected_sentiments:
    raise ValueError(
        f"Unexpected sentiment values: {sorted(map(str, unexpected_sentiments))}"
    )
df['label'] = df['sentiment'].map(sentiment_to_label).astype('int64')

In [4]:
# Shuffle every row, then hold out 20% of them as the test split.
# Both steps use a fixed random_state so the split is repeatable.
df = df.sample(frac=1, random_state=42)
train_data, test_data = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

# Report the (rows, columns) shape of each split.
print("Train set shape:", train_data.shape)
print("Test set shape:", test_data.shape)

Train set shape: (40000, 3)
Test set shape: (10000, 3)


In [5]:
class IMDB_Dataset(Dataset):
  """Map-style dataset that tokenizes one review per item.

  Args:
    data: table exposing ``.iloc`` row access with ``'review'`` (text) and
      ``'label'`` (integer class) columns.
    tokenizer: callable tokenizer returning a mapping with ``'input_ids'``
      and ``'attention_mask'`` tensors when called with
      ``return_tensors='pt'``.
    max_length: fixed sequence length every item is truncated/padded to.
      Defaults to 512.
  """

  def __init__(self, data, tokenizer, max_length=512):
    self.data = data
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Return the number of rows in the underlying table."""
    return len(self.data)

  def __getitem__(self, index):
    """Return ``(input_ids, attention_mask, label)`` for the row at ``index``.

    ``input_ids`` and ``attention_mask`` are 1-D tensors of length
    ``max_length``; ``label`` is an int64 tensor of shape ``(1,)``.
    """
    # Fetch the row once instead of re-indexing the frame per column.
    row = self.data.iloc[index]

    # padding='max_length' lets the tokenizer pad every item to the same
    # fixed size with its own pad token, so items can be stacked into a
    # batch without any manual padding afterwards.
    encoding = self.tokenizer(
        row['review'],
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=self.max_length,
    )
    # return_tensors='pt' adds a leading batch dimension of 1; drop it.
    input_ids = encoding['input_ids'][0]
    attention_mask = encoding['attention_mask'][0]

    # Explicit int64 keeps the target dtype independent of the platform's
    # default integer width.
    labels = torch.tensor([int(row['label'])], dtype=torch.long)
    return input_ids, attention_mask, labels

In [6]:
# Checkpoint name used for the tokenizer here and for the BERT encoder later.
model_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_checkpoint)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Wrap each split so it yields tokenized tensors on demand.
train_dataset, test_dataset = (
    IMDB_Dataset(split, tokenizer) for split in (train_data, test_data)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
# Mini-batch size shared by both loaders.
batch_size = 8

# Only the training split is shuffled; the test split keeps a fixed order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)

In [8]:
class BertClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward classification head.

    The head maps the encoder's hidden size through 300, 100 and 50 units
    (ReLU after each) to ``num_labels`` output logits.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_checkpoint)

        # Hidden layers are Linear+ReLU pairs; the output layer has no activation.
        widths = [self.bert.config.hidden_size, 300, 100, 50]
        layers = []
        for in_features, out_features in zip(widths[:-1], widths[1:]):
            layers += [nn.Linear(in_features, out_features), nn.ReLU()]
        layers.append(nn.Linear(widths[-1], num_labels))
        self.classifier = nn.Sequential(*layers)

    def forward(self, input_ids, attention_mask):
        """Return logits computed from the first token's final hidden state."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded['last_hidden_state'][:, 0, :]
        return self.classifier(first_token_state)

In [9]:
# Seed PyTorch's RNG before the randomly initialised classifier head is built,
# using the same seed value as the data split, so a fresh run starts from the
# same weights and draws the same shuffle order.
torch.manual_seed(42)

# Two sentiment classes: negative (0) and positive (1).
num_labels = 2
model = BertClassifier(num_labels).to(device)

# Cross-entropy over the raw logits; Adam updates every parameter of the model.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

num_epochs = 1
n_total_steps = len(train_loader)  # batches per epoch, used in the progress log

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [10]:
# from_pretrained() hands back the encoder in evaluation mode, so dropout stays
# disabled during fine-tuning unless training mode is switched on explicitly.
model.train()

for epoch in range(num_epochs):

  for i, batch in enumerate(train_loader):

    input_ids, attention_mask, labels = batch
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    # Each sample's label is a length-1 tensor, so the batch is flattened to
    # the 1-D class-index vector the loss expects.
    labels = labels.view(-1).to(device)

    # Standard optimisation step: clear old gradients, forward, backward, update.
    optimizer.zero_grad()
    logits = model(input_ids, attention_mask)
    loss = criterion(logits, labels)
    loss.backward()
    optimizer.step()

    # Log the current batch loss every 100 batches.
    if (i+1) % 100 == 0:
      print(f'epoch {epoch + 1}/ {num_epochs}, batch {i+1}/{n_total_steps}, loss = {loss.item():.4f}')

# Leave the model in inference mode so later cells run without dropout.
# The trailing semicolon keeps the notebook from echoing the module repr.
model.eval();

epoch 1/ 1, batch 100/5000, loss = 0.4277
epoch 1/ 1, batch 200/5000, loss = 0.4005
epoch 1/ 1, batch 300/5000, loss = 0.7097
epoch 1/ 1, batch 400/5000, loss = 0.4278
epoch 1/ 1, batch 500/5000, loss = 0.3912
epoch 1/ 1, batch 600/5000, loss = 0.3163
epoch 1/ 1, batch 700/5000, loss = 0.3163
epoch 1/ 1, batch 800/5000, loss = 0.3846
epoch 1/ 1, batch 900/5000, loss = 0.4743
epoch 1/ 1, batch 1000/5000, loss = 0.2989
epoch 1/ 1, batch 1100/5000, loss = 0.2409
epoch 1/ 1, batch 1200/5000, loss = 0.0490
epoch 1/ 1, batch 1300/5000, loss = 0.1305
epoch 1/ 1, batch 1400/5000, loss = 0.1621
epoch 1/ 1, batch 1500/5000, loss = 0.8592
epoch 1/ 1, batch 1600/5000, loss = 0.3984
epoch 1/ 1, batch 1700/5000, loss = 0.0621
epoch 1/ 1, batch 1800/5000, loss = 0.1181
epoch 1/ 1, batch 1900/5000, loss = 0.2332
epoch 1/ 1, batch 2000/5000, loss = 0.0233
epoch 1/ 1, batch 2100/5000, loss = 0.0229
epoch 1/ 1, batch 2200/5000, loss = 0.4325
epoch 1/ 1, batch 2300/5000, loss = 0.3136
epoch 1/ 1, batch 24

In [11]:
# Per-batch ground-truth labels and predicted classes, concatenated after the loop.
all_labels = []
all_preds = []

# Put the model in inference mode (dropout off); no_grad() only disables
# gradient tracking and does not change module modes.
model.eval()

with torch.no_grad():
  for input_ids, attention_mask, labels in test_loader:
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    # Labels are only compared on the host, so they are flattened but never
    # moved to the device.
    labels = labels.view(-1)

    outputs = model(input_ids, attention_mask)

    # Predicted class = index of the largest logit in each row.
    predictions = outputs.argmax(dim=1)

    all_labels.append(labels.cpu().numpy())
    all_preds.append(predictions.cpu().numpy())

all_labels = np.concatenate(all_labels, axis=0)
all_preds = np.concatenate(all_preds, axis=0)

print(classification_report(all_labels, all_preds))
print(accuracy_score(all_labels, all_preds))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      4951
           1       0.95      0.93      0.94      5049

    accuracy                           0.94     10000
   macro avg       0.94      0.94      0.94     10000
weighted avg       0.94      0.94      0.94     10000

0.9394
