Assignment 5.1: Sentiment Analysis with BERT on IMDb
Movie Reviews

DATA LOADING AND PREPROCESSING

In [2]:
import torch
import pandas as pd

In [3]:
# Mount Google Drive at /content/drive so files stored in the user's Drive
# (the dataset CSV read in a later cell) are reachable from this runtime.
# NOTE(review): the google.colab import ties this notebook to a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the IMDb reviews CSV from the mounted Drive into a DataFrame.
# NOTE(review): the file is decoded as ISO-8859-1; if the CSV is actually UTF-8,
# non-ASCII characters will be silently mis-decoded — confirm the real encoding.
df=pd.read_csv('/content/drive/MyDrive/IMDB Dataset.csv', encoding='ISO-8859-1')



In [5]:
# Take a 10% random sample from the dataset to keep fine-tuning time manageable.
# random_state is fixed so the same rows are drawn on every run.
sample_df = df.sample(frac=0.1, random_state=42)


In [6]:
# Partition the sample into train / validation / test sets (80% / 10% / 10%).
# The training rows are drawn first; the remaining 20% is then halved at random
# into validation, and whatever is left becomes the test set.
train_df = sample_df.sample(frac=0.8, random_state=42)
holdout_df = sample_df.drop(index=train_df.index)
val_df = holdout_df.sample(frac=0.5, random_state=42)
test_df = holdout_df.drop(index=val_df.index)

# Display the resulting shapes as a sanity check on the split sizes.
train_df.shape, val_df.shape, test_df.shape


((4000, 2), (500, 2), (500, 2))

In [7]:
# Preview the first rows of the training split (raw review text and sentiment).
train_df.head()

Unnamed: 0,review,sentiment
34234,"When it comes to the erotic genre, I'm lucky t...",positive
28241,Two films are useful for scaring people to God...,negative
1226,I have just recently purchased collection one ...,positive
27004,This was an atrocious waste of my time. No plo...,negative
35839,Seeing Gary Busey in a G rated film was a firs...,positive


In [8]:
import re

def preprocess_text(text):
    """Normalise a raw review string before tokenization.

    Steps, in order:
      1. lowercase the text;
      2. replace HTML tags (e.g. ``<br />``) with a single space;
      3. drop every character that is not an ASCII letter or whitespace;
      4. collapse runs of whitespace and strip the ends.

    Args:
        text: The raw review as a ``str``.

    Returns:
        The cleaned, lowercase, letters-and-single-spaces-only string.
    """
    # Convert to lowercase
    text = text.lower()

    # Replace HTML tags with a space rather than deleting them outright:
    # reviews use "<br /><br />" as a paragraph break with no surrounding
    # spaces, so deleting the tag would glue the neighbouring words together
    # ("good.<br /><br />bad" -> "goodbad").
    text = re.sub(r'<.*?>', ' ', text)

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Collapse the extra whitespace left behind by the substitutions above.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Clean the review column of every split in place with preprocess_text.
for split_df in (train_df, val_df, test_df):
    split_df['review'] = split_df['review'].map(preprocess_text)

# Show the first few cleaned training reviews as a sanity check.
train_df['review'].head()


34234    when it comes to the erotic genre im lucky to ...
28241    two films are useful for scaring people to god...
1226     i have just recently purchased collection one ...
27004    this was an atrocious waste of my time no plot...
35839    seeing gary busey in a g rated film was a firs...
Name: review, dtype: object

In [9]:
pip install torch transformers


Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.7/7.7 MB 20.8 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 295.0/295.0 kB 32.5 MB/s eta 0:00:00
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.8/3.8 MB 50.3 MB/s eta 0:00:00
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 51.8 MB/s eta 0:00:00
Insta

TEXT TOKENIZATION AND CONVERSION TO BERT INPUT FEATURES

Step 3: Tokenize the movie reviews using the BERT tokenizer

In [10]:
from transformers import BertTokenizer

# Load the BERT tokenizer published under the 'bert-base-uncased' checkpoint name.
# The tokenizer files are downloaded the first time this runs (see the progress
# bars in this cell's output).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

MODEL DEFINITION, TRAINING, AND EVALUATION

Step 4: Convert the tokenized reviews into input features suitable for BERT

In [11]:
# Encode the sentiment strings as integer class ids:
# 'positive' -> 1, every other value -> 0.
for split_df in (train_df, val_df, test_df):
    split_df['label'] = split_df['sentiment'].eq('positive').astype('int64')

# Tokenizer settings shared by all three splits: each review is truncated or
# padded to exactly 256 token ids (special tokens included) and returned as
# PyTorch tensors.
encode_kwargs = dict(
    truncation=True,
    padding='max_length',
    max_length=256,
    return_tensors='pt',
    add_special_tokens=True,
)

# Convert the cleaned reviews of each split to BERT input features.
train_encodings = tokenizer(train_df['review'].tolist(), **encode_kwargs)
val_encodings = tokenizer(val_df['review'].tolist(), **encode_kwargs)
test_encodings = tokenizer(test_df['review'].tolist(), **encode_kwargs)

Step 5: Load the pre-trained BERT model for sequence classification

In [12]:
from transformers import BertForSequenceClassification

# Load the pre-trained BERT model for sequence classification.
# The classification head ('classifier.weight' / 'classifier.bias') is not part
# of the checkpoint and is newly initialised — see the warning in this cell's
# output — so the model has to be fine-tuned before its predictions mean anything.
# NOTE(review): num_labels is not passed; presumably the library default is 2,
# which is what the 0/1 labels used later require — confirm.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Steps 6 & 7: Fine-tune the BERT model on the preprocessed IMDb dataset for sentiment analysis, implementing the training loop and loss calculation

In [13]:
import torch
# torch.optim.AdamW replaces transformers.AdamW, which is deprecated and no
# longer importable from recent transformers releases.
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset

# Seed PyTorch's global RNG so batch shuffling and dropout are repeatable from
# run to run (given the same starting model weights).
torch.manual_seed(42)

# Bundle each split's token ids, attention masks and integer labels into a
# TensorDataset, then wrap it in a DataLoader that yields batches of 16.
# Only the training loader shuffles; validation/test keep their original order.
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_df['label'].values))
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(val_df['label'].values))
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], torch.tensor(test_df['label'].values))
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Define a device (use GPU if available) and move the model onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# eps and weight_decay are set explicitly to the defaults of the old
# transformers.AdamW (1e-6 and 0.0) so the optimisation behaves as before;
# torch's own defaults are 1e-8 and 0.01.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Training loop: one full pass over the training data per epoch, followed by a
# gradient-free pass over the validation data to report the mean batch loss.
epochs = 3
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation loop to evaluate the model on the validation set
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()
    # Average over the number of validation batches, not the number of samples.
    print(f"Epoch: {epoch + 1}, Validation Loss: {val_loss / len(val_loader)}")




Epoch: 1, Validation Loss: 0.25770165654830635
Epoch: 2, Validation Loss: 0.22344993462320417
Epoch: 3, Validation Loss: 0.2518835043301806


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Build the test DataLoader from the test encodings and labels
# (batches of 16, original order preserved).
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], torch.tensor(test_df['label'].values))
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

all_preds = []
all_true = []

# Run the fine-tuned model over the test set without tracking gradients and
# collect the predicted and true class ids.
model.eval()
with torch.no_grad():
    for batch_ids, batch_mask, batch_labels in test_loader:
        # No labels are passed: only the logits are needed for prediction.
        batch_logits = model(batch_ids.to(device), attention_mask=batch_mask.to(device)).logits
        all_preds.extend(batch_logits.argmax(dim=1).cpu().numpy())
        all_true.extend(batch_labels.cpu().numpy())

# Compute the metrics (class 1 = positive is treated as the positive class).
accuracy = accuracy_score(all_true, all_preds)
precision = precision_score(all_true, all_preds)
recall = recall_score(all_true, all_preds)
f1 = f1_score(all_true, all_preds)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-Score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")


Accuracy: 0.8740
Precision: 0.8880
Recall: 0.8560
F1-Score: 0.8717


SAMPLE MOVIE REVIEW PREDICTION AND EXPLANATIONS

In [15]:
# Hand-written example reviews used to spot-check the fine-tuned model:
# two clearly positive, one clearly negative and one deliberately lukewarm.
sample_reviews = [
    "The movie was fantastic! The storyline was gripping and the actors did an excellent job.",
    "I didn't enjoy the film. It felt too long and the plot was hard to follow.",
    "It's an okay movie. Not the best I've seen, but not the worst either.",
    "Absolutely loved it! Would definitely recommend watching."
]


In [16]:
# Tokenize the sample reviews for inference.
# The model was fine-tuned on text cleaned by preprocess_text (lowercased,
# punctuation and digits removed) and truncated to 256 tokens, so the same
# cleaning and length limit are applied here to keep inference inputs
# consistent with the training inputs. Padding is to the longest sample only.
sample_encodings = tokenizer(
    [preprocess_text(review) for review in sample_reviews],
    truncation=True,
    max_length=256,
    padding=True,
    return_tensors='pt',
)


In [17]:
# Switch to inference mode (disables dropout) before predicting.
model.eval()

# Move every encoded tensor (input ids, attention mask, ...) to the model's device.
sample_encodings = {name: tensor.to(device) for name, tensor in sample_encodings.items()}

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**sample_encodings)

# Highest-scoring class per review, as a NumPy array of class ids.
preds = outputs.logits.argmax(dim=1).cpu().numpy()


In [18]:
# Class-id -> human-readable name lookup: index 0 is negative, index 1 is positive.
sentiments = ["negative", "positive"]

# Print each sample review next to its predicted sentiment, separated by a rule.
for text, class_id in zip(sample_reviews, preds):
    print(f"Review: {text}\nPredicted Sentiment: {sentiments[class_id]}\n{'-'*50}\n")


Review: The movie was fantastic! The storyline was gripping and the actors did an excellent job.
Predicted Sentiment: positive
--------------------------------------------------

Review: I didn't enjoy the film. It felt too long and the plot was hard to follow.
Predicted Sentiment: negative
--------------------------------------------------

Review: It's an okay movie. Not the best I've seen, but not the worst either.
Predicted Sentiment: positive
--------------------------------------------------

Review: Absolutely loved it! Would definitely recommend watching.
Predicted Sentiment: positive
--------------------------------------------------

