### **Sentiment Analysis on Movie Reviews Using BERT**

### Import Required Packages

In [1]:
import os
import requests
import zipfile
import warnings
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report,
    precision_recall_fscore_support
)
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    get_linear_schedule_with_warmup
)
from tqdm.auto import tqdm

# Suppress warnings
warnings.filterwarnings("ignore")


In [2]:
# Seed PyTorch's global RNG so that random weight initialisation and shuffled
# batch order start from the same state on every "Restart & Run All"
# (GPU kernels may still introduce small run-to-run differences).
torch.manual_seed(42)

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


### Data Ingestion: Download and extract data

In [3]:
data_url = 'https://github.com/pmensah28/data/raw/main/IMDB-Dataset.csv.zip'
raw_data_dir = 'data/raw'
raw_data_path = os.path.join(raw_data_dir, 'IMDB-Dataset.csv')
os.makedirs(raw_data_dir, exist_ok=True)  # Create the data directory if it does not exist.

zip_filename = os.path.basename(data_url)
zip_path = os.path.join(raw_data_dir, zip_filename)

print("\n<<<<<<<< Downloading the dataset >>>>>>>>")
# The `with` block closes the streamed connection once the body has been read.
# The timeout (seconds) keeps the cell from hanging forever on a stalled server.
with requests.get(data_url, stream=True, timeout=60) as response:
    # Fail here with a clear HTTP error instead of saving an error page as the
    # "zip" and hitting an obscure bad-zip failure during extraction.
    response.raise_for_status()
    with open(zip_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:  # skip empty chunks
                f.write(chunk)
print("\nDownload completed!")

# Extract the dataset
print("\n<<<<<<<< Extracting the dataset >>>>>>>>")
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(raw_data_dir)
print("\nExtraction completed!")
os.remove(zip_path)  # remove zip file after extracting

# Verify the extracted CSV file
if os.path.exists(raw_data_path):
    print(f"\nData ingestion successful! File saved to {raw_data_path}")
else:
    print(f"Expected file {raw_data_path} not found after extraction.")



<<<<<<<< Downloading the dataset >>>>>>>>

Download completed!

<<<<<<<< Extracting the dataset >>>>>>>>

Extraction completed!

Data ingestion successful! File saved to data/raw/IMDB-Dataset.csv


### Data Validation

In [4]:
# Read the extracted CSV into a DataFrame
print("\n<<<<<<<< Loading the dataset >>>>>>>>")
df = pd.read_csv(raw_data_path)

# Report any expected column that the file does not provide
required_columns = ['review', 'sentiment']
print("\nChecking for required columns...")
missing_columns = list(filter(lambda col: col not in df.columns, required_columns))
print(
    f"Missing columns: {missing_columns}"
    if missing_columns
    else "All required columns are present."
)

# Per-column count of missing values
print("\nChecking for missing values...")
missing_values = df.isna().sum()
print(missing_values)

# Rows with any missing value are discarded
df = df.dropna()

# Keep only the first occurrence of each review text and report how many rows went away
print("\nChecking for duplicate records...")
initial_count = df.shape[0]
df.drop_duplicates(subset=['review'], inplace=True)
duplicates_removed = initial_count - df.shape[0]
print(f"Done: Removed {duplicates_removed} duplicate records.")


print("\nData validation completed!")



<<<<<<<< Loading the dataset >>>>>>>>

Checking for required columns...
All required columns are present.

Checking for missing values...
review       0
sentiment    0
dtype: int64

Checking for duplicate records...
Done: Removed 418 duplicate records.

Data validation completed!


### Data Transformation

In [5]:
processed_data_dir = 'data/processed'  # processed data path
os.makedirs(processed_data_dir, exist_ok=True)

# Character length of each review; saved as an extra column in the CSVs below
df['review_length'] = df['review'].apply(lambda x: len(str(x)))

print("\nConverting reviews to lowercase...")
df['review'] = df['review'].str.lower()

# Map sentiment labels to numeric values.
# Mapping runs only while the column still holds text labels, so re-running
# this cell no longer turns already-encoded labels into NaN. Labels outside
# the mapping raise an error instead of silently becoming NaN.
label_map = {'positive': 1, 'negative': 0}
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    encoded_sentiment = df['sentiment'].map(label_map)
    unmapped_mask = encoded_sentiment.isnull()
    if unmapped_mask.any():
        unknown_labels = sorted(df.loc[unmapped_mask, 'sentiment'].astype(str).unique())
        raise ValueError(f"Unexpected sentiment labels: {unknown_labels}")
    df['sentiment'] = encoded_sentiment

# Split the data into training (70%), validation (15%), and test (15%) sets,
# stratified on the label so each split keeps the same class balance.
print("\nSplitting data into training, validation, and test sets...")
train_df, temp_df = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df['sentiment']
)
validation_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['sentiment']
)

# Save the datasets
train_path = os.path.join(processed_data_dir, 'train.csv')
validation_path = os.path.join(processed_data_dir, 'validation.csv')
test_path = os.path.join(processed_data_dir, 'test.csv')

train_df.to_csv(train_path, index=False)
validation_df.to_csv(validation_path, index=False)
test_df.to_csv(test_path, index=False)

print('Saving processed data to', processed_data_dir)

print("\nData transformation completed!")



Converting reviews to lowercase...

Splitting data into training, validation, and test sets...
Saving processed data to data/processed

Data transformation completed!


In [6]:
# Define the dataset class
class IMDBDataset(Dataset):
    """Map-style dataset that tokenizes one review per item on access.

    Args:
        reviews: pandas Series of review texts.
        sentiments: pandas Series of integer labels, aligned with ``reviews``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_length: Length every encoded review is padded or truncated to.
    """

    def __init__(self, reviews, sentiments, tokenizer, max_length):
        # Reset to a 0..n-1 index so positional lookups in __getitem__ work
        # regardless of the index the caller's Series carried.
        self.reviews = reviews.reset_index(drop=True)
        self.sentiments = sentiments.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return the raw text, encoded tensors and label for item ``idx``."""
        review = str(self.reviews[idx])
        sentiment = self.sentiments[idx]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Drop only the leading batch dimension added by return_tensors='pt'.
        # A bare squeeze() would also remove the sequence dimension when
        # max_length == 1, yielding 0-d tensors.
        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(sentiment, dtype=torch.long)
        }

In [7]:
# Read the three saved splits back from disk, in train / validation / test order
train_df, validation_df, test_df = (
    pd.read_csv(csv_file)
    for csv_file in (
        'data/processed/train.csv',
        'data/processed/validation.csv',
        'data/processed/test.csv',
    )
)

In [8]:
# Pretrained uncased BERT tokenizer and a sequence-classification model with two labels
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=2, hidden_dropout_prob=0.1
)

# Hyperparameters shared by the dataset, loader and training cells
max_length, batch_size, epochs = 128, 16, 5

# One IMDBDataset per split, all built with the same tokenizer and sequence length
train_dataset, validation_dataset, test_dataset = (
    IMDBDataset(
        reviews=split_df['review'],
        sentiments=split_df['sentiment'],
        tokenizer=tokenizer,
        max_length=max_length,
    )
    for split_df in (train_df, validation_df, test_df)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
## Batch iterators: only the training split is shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
validation_loader, test_loader = (
    DataLoader(dataset=eval_split, batch_size=batch_size, shuffle=False)
    for eval_split in (validation_dataset, test_dataset)
)

### Model Training

In [10]:
# AdamW optimizer with a linear learning-rate schedule (no warmup steps)
# that spans every optimizer step of the run.
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
total_steps = epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)
model = model.to(device)

# One pass over the training data followed by one pass over the validation data per epoch
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    print("-" * 20)

    # ---- Training pass ----
    model.train()
    total_train_loss = 0
    train_predictions, train_true_labels = [], []
    progress_bar = tqdm(train_loader, desc='Training')

    for batch in progress_bar:
        optimizer.zero_grad()

        # Move only the tensor fields to the device; 'review_text' stays behind
        inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }
        outputs = model(**inputs)

        loss = outputs.loss
        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Backpropagate, clip the gradient norm to 1.0, then advance optimizer and schedule
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        # Predicted class = index of the larger logit
        preds = outputs.logits.argmax(dim=1)
        train_predictions.extend(preds.cpu().numpy())
        train_true_labels.extend(inputs['labels'].cpu().numpy())

        progress_bar.set_postfix({'loss': batch_loss})

    avg_train_loss = total_train_loss / len(train_loader)
    train_accuracy = accuracy_score(train_true_labels, train_predictions)

    # ---- Validation pass (no gradient tracking) ----
    model.eval()
    total_eval_loss = 0
    predictions, true_labels = [], []

    progress_bar = tqdm(validation_loader, desc='Validation')
    with torch.no_grad():
        for batch in progress_bar:
            inputs = {
                key: batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            }
            outputs = model(**inputs)

            batch_loss = outputs.loss.item()
            total_eval_loss += batch_loss

            preds = outputs.logits.argmax(dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(inputs['labels'].cpu().numpy())

            progress_bar.set_postfix({'val_loss': batch_loss})

    avg_val_loss = total_eval_loss / len(validation_loader)
    val_accuracy = accuracy_score(true_labels, predictions)

    # Per-epoch summary line
    print(
        f"Epoch {epoch + 1}: Training loss: {avg_train_loss:.4f}\t "
        f"Training accuracy: {train_accuracy:.4f}\t "
        f"Validation loss: {avg_val_loss:.4f}\t "
        f"Validation accuracy: {val_accuracy:.4f}"
    )

# Write the weights from the end of the last epoch to disk
model_output_dir = 'models'
os.makedirs(model_output_dir, exist_ok=True)
model_save_path = os.path.join(model_output_dir, 'bert_classifier.pth')
torch.save(model.state_dict(), model_save_path)

print("\nTraining complete!")
print(f"\nModel saved to {model_save_path}")


Epoch 1/5
--------------------


Training:   0%|          | 0/2170 [00:00<?, ?it/s]

Validation:   0%|          | 0/465 [00:00<?, ?it/s]

Epoch 1: Training loss: 0.3236	 Training accuracy: 0.8635	 Validation loss: 0.2933	 Validation accuracy: 0.8881

Epoch 2/5
--------------------


Training:   0%|          | 0/2170 [00:00<?, ?it/s]

Validation:   0%|          | 0/465 [00:00<?, ?it/s]

Epoch 2: Training loss: 0.1993	 Training accuracy: 0.9293	 Validation loss: 0.3460	 Validation accuracy: 0.8860

Epoch 3/5
--------------------


Training:   0%|          | 0/2170 [00:00<?, ?it/s]

Validation:   0%|          | 0/465 [00:00<?, ?it/s]

Epoch 3: Training loss: 0.1141	 Training accuracy: 0.9682	 Validation loss: 0.4705	 Validation accuracy: 0.8931

Epoch 4/5
--------------------


Training:   0%|          | 0/2170 [00:00<?, ?it/s]

Validation:   0%|          | 0/465 [00:00<?, ?it/s]

Epoch 4: Training loss: 0.0605	 Training accuracy: 0.9861	 Validation loss: 0.5916	 Validation accuracy: 0.8928

Epoch 5/5
--------------------


Training:   0%|          | 0/2170 [00:00<?, ?it/s]

Validation:   0%|          | 0/465 [00:00<?, ?it/s]

Epoch 5: Training loss: 0.0306	 Training accuracy: 0.9936	 Validation loss: 0.6104	 Validation accuracy: 0.8962

Training complete!

Model saved to models/bert_classifier.pth


### Model Evaluation

In [11]:
# Restore the saved weights (deserialised onto the CPU first), then move the
# model to the active device and switch it to inference mode.
model.load_state_dict(torch.load(model_save_path, map_location=torch.device('cpu')))
model = model.to(device)
model.eval()

# Collect predictions and ground-truth labels over the whole test split
print("\n<<<<<<< Evaluating the model on the test set >>>>>>>")
all_preds, all_labels = [], []

progress_bar = tqdm(test_loader, desc='Testing')
with torch.no_grad():
    for batch in progress_bar:
        # Labels are not passed to the model here, so no loss is computed
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )

        preds = outputs.logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

# Overall accuracy followed by the per-class precision / recall / F1 table
accuracy = accuracy_score(all_labels, all_preds)
print(f"\nTest Accuracy: {accuracy:.4f}")

print("\nTest Classification Report:")
print(classification_report(all_labels, all_preds, digits=4))



<<<<<<< Evaluating the model on the test set >>>>>>>


Testing:   0%|          | 0/465 [00:00<?, ?it/s]


Test Accuracy: 0.8992

Test Classification Report:
              precision    recall  f1-score   support

           0     0.9003    0.8969    0.8986      3705
           1     0.8981    0.9014    0.8997      3733

    accuracy                         0.8992      7438
   macro avg     0.8992    0.8992    0.8992      7438
weighted avg     0.8992    0.8992    0.8992      7438



### Making Predictions

In [None]:
# Reload the saved weights into the existing model (deserialised onto the CPU first)
saved_weights = torch.load(model_save_path, map_location=torch.device('cpu'))
model.load_state_dict(saved_weights)

# Place the model on the active device and switch it to inference mode
model = model.to(device)
model.eval()

# Function to make predictions
def predict_sentiment(text, max_length=128):
    """Classify a single piece of text as 'positive' or 'negative'.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Review to classify. It is coerced with ``str()``, as the
            dataset class does for training samples.
        max_length: Number of tokens the input is padded or truncated to.
            Defaults to 128, the sequence length the notebook's datasets use.

    Returns:
        'positive' when the model's highest-scoring class is 1, otherwise
        'negative'.
    """
    # Preprocess the text with the same encoding options as the dataset class
    encoding = tokenizer.encode_plus(
        str(text),
        add_special_tokens=True,
        max_length=max_length,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Make prediction without tracking gradients
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()

    # Map predicted class to sentiment
    sentiment = 'positive' if predicted_class == 1 else 'negative'
    return sentiment



In [14]:
# Hand-written sample reviews used to try out the trained classifier
new_reviews = [
    "I absolutely loved this movie! The plot was exciting and the characters were well developed.",
    "This was the worst movie I've ever seen. It was a complete waste of time.",
    "The movie was okay, but I wouldn't watch it again.",
    "An outstanding performance by the lead actor. Highly recommend watching it!",
    "The storyline was boring and predictable.",
]

# Print each review followed by the label the model assigns to it
for review in new_reviews:
    sentiment = predict_sentiment(review)
    print("Review: {}\nPredicted Sentiment: {}\n".format(review, sentiment))


Review: I absolutely loved this movie! The plot was exciting and the characters were well developed.
Predicted Sentiment: positive

Review: This was the worst movie I've ever seen. It was a complete waste of time.
Predicted Sentiment: negative

Review: The movie was okay, but I wouldn't watch it again.
Predicted Sentiment: negative

Review: An outstanding performance by the lead actor. Highly recommend watching it!
Predicted Sentiment: positive

Review: The storyline was boring and predictable.
Predicted Sentiment: negative

