# Sentiment Analysis using a BERT Transformer

This notebook demonstrates fine-tuning a pretrained BERT Transformer to predict the sentiment of reviews in the IMDB movie review dataset.

In [2]:
import torch
import random
import numpy as np
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [None]:
# Location of the raw data, relative to the notebook.
path = '../IMDB Dataset.csv' # path to the CSV file

# Seed every RNG the notebook relies on so that DataLoader shuffling, dropout
# and the classifier-head initialisation are repeatable across
# "Restart Kernel -> Run All". torch.manual_seed is called first because it
# returns a Generator object that would otherwise be echoed as the cell output.
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

## Text Preprocessing

In [12]:
# Read the raw reviews; the CSV is expected to expose 'review' and 'sentiment' columns.
df = pd.read_csv(path)
texts = df['review'].tolist()

# Encode the target as an integer: 'positive' -> 1, every other value -> 0.
labels = df['sentiment'].eq('positive').astype(int).tolist()

# Hold out 20% of the samples for evaluation; the fixed random_state keeps the split stable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Name of the pre-trained checkpoint shared by the tokenizer and the encoder.
PRETRAINED_NAME = 'bert-base-uncased'

# Tokenizer that turns raw review text into BERT input ids.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)

# NOTE(review): `bert_model` is not referenced by any other visible cell --
# SentimentClassifier loads its own encoder -- so this second copy may be
# unnecessary. Confirm before removing.
bert_model = BertModel.from_pretrained(PRETRAINED_NAME)

In [14]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one review per item on demand.

    Args:
        texts: Sequence of raw review strings.
        labels: Sequence of integer class labels, aligned index-by-index
            with ``texts``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in ``__getitem__`` and returns
            a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Sequence length every sample is truncated / padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail early: a silent mismatch would otherwise pair reviews with the
        # wrong labels or only surface as an IndexError mid-epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(texts)} and {len(labels)}'
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and
            a scalar ``'label'`` tensor of dtype ``torch.long``.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        # Call the tokenizer directly: `encode_plus` is deprecated in
        # favour of `__call__`, which takes the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' adds a leading batch dimension; flatten it
            # away so the DataLoader can stack samples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [15]:
# Tokenisation / batching hyper-parameters.
MAX_LEN = 64      # every review is truncated or padded to this many tokens
BATCH_SIZE = 256  # number of samples per DataLoader batch

# Wrap each split so samples are tokenized lazily, one item at a time.
train_dataset = SentimentDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)
test_dataset = SentimentDataset(
    texts=test_texts,
    labels=test_labels,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)


In [16]:
# Display the first encoded training sample to sanity-check the dataset output
# (keys: 'input_ids', 'attention_mask', 'label').
train_dataset[0]

{'input_ids': tensor([  101,  2008,  1005,  1055,  2054,  1045,  2921,  4851,  2870,  2076,
          1996,  2116,  9590,  1010,  7491,  3503,  1010, 25082,  1998,  2236,
         26865,  2008,  2566,  4168,  3686,  1996,  6391,  2781,  1012,  1996,
         18539,  2036,  3233,  2039,  2043,  2017,  2228,  1997,  1996,  2028,
          1011,  8789,  3494,  1010,  2040,  2031,  2061,  2210,  5995,  2008,
          2009,  2003,  8990,  5263,  2000,  2729,  2054,  6433,  2000,  2068,
          1012,  2027,  2024,   102]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'label': tensor(0)}

In [None]:
# Training batches are reshuffled every epoch; evaluation keeps the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Training

For training, we will fine-tune the BERT model, an encoder-only transformer suited to tasks such as text classification.
We will take the final representation of the `[CLS]` token and feed it to a linear layer for classification.

In [None]:
class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear head for sentiment prediction.

    Args:
        bert_model_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        num_classes: Width of the output layer. With the default of 1 the
            model emits a single probability per sample.
        dropout_prob: Dropout probability applied to the pooled encoder
            output before the linear head (default 0.1).
    """

    def __init__(self, bert_model_name='bert-base-uncased', num_classes=1, dropout_prob=0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return sigmoid probabilities of shape ``(batch, num_classes)``.

        Args:
            input_ids: Token-id tensor forwarded to the BERT encoder.
            attention_mask: Mask tensor forwarded to the BERT encoder.

        Note:
            The sigmoid is applied here, so the output must be paired with a
            probability-based loss such as ``BCELoss`` rather than
            ``BCEWithLogitsLoss``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # `pooler_output` is not the raw [CLS] hidden state: it is that state
        # after BERT's pooling head (a dense layer followed by tanh).
        pooled_output = self.dropout(outputs.pooler_output)
        logits = self.classifier(pooled_output)
        return torch.sigmoid(logits)  # Apply sigmoid for binary classification

Since our model is quite large, we will use Colab's GPU to train it.

In [7]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [9]:
# Build the classifier and move its parameters to the selected device.
model = SentimentClassifier()
model = model.to(device)

# Adam over all parameters (encoder and head) with a fine-tuning learning rate of 2e-5.
optimizer = optim.Adam(params=model.parameters(), lr=2e-5)

# BCELoss consumes probabilities; the model's forward() already applies the sigmoid.
criterion = nn.BCELoss()

In [10]:

EPOCHS = 3  # number of full passes over the training set

for epoch in range(EPOCHS):
    model.train()  # enable dropout for training
    total_loss = 0.0

    # Create a tqdm progress bar for the training loop
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{EPOCHS}', leave=False)

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # BCELoss needs float targets. A distinct name is used so the global
        # `labels` list built during preprocessing is not overwritten.
        batch_labels = batch['label'].to(device).float()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        # Squeeze only the trailing class dimension: a bare squeeze() would
        # also drop the batch dimension when the final batch holds a single
        # sample, and BCELoss would then reject the shape mismatch.
        loss = criterion(outputs.squeeze(-1), batch_labels)
        loss.backward()
        optimizer.step()

        # Read the scalar once; each .item() call forces a device sync.
        batch_loss = loss.item()
        total_loss += batch_loss

        # Update the progress bar with the current loss
        progress_bar.set_postfix({'loss': batch_loss})

    # Calculate average loss for the epoch (guard against an empty loader)
    avg_loss = total_loss / max(len(train_loader), 1)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Loss: {avg_loss:.4f}')




Epoch 1/3, Loss: 0.4413




Epoch 2/3, Loss: 0.3187


                                                                        

Epoch 3/3, Loss: 0.2466




In [11]:
model.eval()  # disable dropout for evaluation
predictions = []
true_labels = []

with torch.no_grad():  # no gradients needed for inference
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask)
        # Squeeze only the trailing class dimension so a final batch of one
        # sample stays 1-D; a 0-d result cannot be iterated by extend().
        preds = (outputs.squeeze(-1) > 0.5).long()
        predictions.extend(preds.cpu().tolist())
        # Labels are only compared on the host, so they never need to
        # travel to the device.
        true_labels.extend(batch['label'].tolist())

accuracy = accuracy_score(true_labels, predictions)
print(f'Test Accuracy: {accuracy:.4f}')

Test Accuracy: 0.8486


As we can see, our transformer-based model performs significantly better than the RNN, achieving a test accuracy of about 85% (0.8486).