Step 1: Load and Preprocess the Dataset

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the raw IMDB reviews from disk
data = pd.read_csv("../data/IMDB_Dataset.csv")

# Encode sentiment as an integer label: 1 where the value is exactly
# "positive", 0 for everything else (vectorised instead of a per-row lambda)
data["sentiment"] = (data["sentiment"] == "positive").astype("int64")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)


Step 2: Tokenize the Data

In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer
# NOTE(review): the classifier loaded later in this notebook comes from the
# "distilbert-base-uncased" checkpoint, while this tokenizer comes from
# "bert-base-uncased"; predict_sentiment drops "token_type_ids" before calling
# the model. Confirm this tokenizer/model pairing is intentional.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the text data
def tokenize_data(df, max_length=128):
    """Tokenize the "review" column of a DataFrame with the notebook-level tokenizer.

    Args:
        df: DataFrame that has a "review" column of texts.
        max_length: Upper bound on tokens per review; longer reviews are truncated.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch, requested
        as PyTorch tensors (``return_tensors="pt"``) with padding enabled.
    """
    reviews = df["review"].tolist()
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(reviews, **tokenizer_options)

# Encode both splits with identical settings (tokenize_data's default max_length)
train_encodings, test_encodings = map(tokenize_data, (train_data, test_data))

Step 4: Create a PyTorch Dataset


In [None]:
import torch
from torch.utils.data import Dataset

class MovieReviewDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from field name to an indexable batch of values;
            each value is indexed with the example index.
        labels: Sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of its encoding fields plus a "labels" tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Pair each split's encodings with its sentiment labels
train_labels = train_data["sentiment"].tolist()
test_labels = test_data["sentiment"].tolist()

train_dataset = MovieReviewDataset(train_encodings, train_labels)
test_dataset = MovieReviewDataset(test_encodings, test_labels)

Step 5: Load the Pre-trained DistilBERT Model

In [None]:
from transformers import DistilBertForSequenceClassification

# Load the pre-trained DistilBERT checkpoint with a sequence-classification head;
# num_labels=2 matches the binary (0 = negative, 1 = positive) sentiment labels
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Step 6: Set Up Training

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

# Create data loaders (shuffle only the training split)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Set up optimizer and device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
# NOTE(review): transformers' AdamW is deprecated in favour of torch.optim.AdamW;
# if switching, check the weight_decay default so training behaviour is unchanged.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
# Gradients from `gradient_accumulation_steps` consecutive batches are summed
# before each optimizer step (effective batch size = 16 * 4).
gradient_accumulation_steps = 4
epochs = 3

for epoch in range(epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    num_train_batches = len(train_loader)

    for i, batch in enumerate(tqdm(train_loader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # Track the *unscaled* batch loss so the reported epoch average is on
        # the same scale as the validation loss below.
        total_loss += outputs.loss.item()

        # Scale only the value used for backprop so the accumulated gradient
        # is the mean over the accumulation window.
        loss = outputs.loss / gradient_accumulation_steps
        loss.backward()

        # Step at the end of every accumulation window, and also on the final
        # batch so gradients from a partial window are applied instead of being
        # discarded by the next epoch's zero_grad().
        window_complete = (i + 1) % gradient_accumulation_steps == 0
        last_batch = (i + 1) == num_train_batches
        if window_complete or last_batch:
            optimizer.step()
            optimizer.zero_grad()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Training Loss: {avg_train_loss}")

    # Validation loop: no gradient tracking, dropout disabled via eval()
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            # Predicted class is the index of the larger logit for each example
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    avg_val_loss = val_loss / len(val_loader)
    accuracy = correct / total
    print(f"Validation Loss: {avg_val_loss}, Accuracy: {accuracy}")

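Step 7: Test the model (this loads the artifacts written by the "Save the Model" step below, so run that cell first on a fresh kernel)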

In [20]:
# Reload the fine-tuned model and tokenizer from their saved directories
saved_model_dir = "bert-sentiment-model"
saved_tokenizer_dir = "bert-sentiment-tokenizer"

model = DistilBertForSequenceClassification.from_pretrained(saved_model_dir).to(device)
tokenizer = BertTokenizer.from_pretrained(saved_tokenizer_dir)

def predict_sentiment(text):
    """Classify a single review text as "positive" or "negative".

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The review text to classify (a single string).

    Returns:
        "positive" if class 1 has the highest logit, otherwise "negative".
    """
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)

    # Remove 'token_type_ids' if present; the model is not called with it
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    # Move inputs to the correct device
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Inference only: make sure dropout is off and skip building the autograd graph
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # argmax over the class scores of the (single) input row; softmax is
    # monotonic, so it is not needed to pick the winning class
    predicted_class = torch.argmax(logits[0], dim=-1).item()
    return "positive" if predicted_class == 1 else "negative"

# Smoke-test the helper on a short, clearly positive phrase
print(predict_sentiment("movie good"))  # Should output "positive"

positive


Step 8: Save the Model


In [None]:
# Persist the model and tokenizer to their own directories so they can be
# reloaded with from_pretrained()
for artifact, output_dir in (
    (model, "bert-sentiment-model"),
    (tokenizer, "bert-sentiment-tokenizer"),
):
    artifact.save_pretrained(output_dir)