In [None]:
"""
Fine-tuning Sentiment Analysis Model for Movie Reviews

This notebook demonstrates how to fine-tune a DistilBERT model for binary
(positive/negative) sentiment analysis of movie reviews using the IMDB dataset.
"""

In [None]:
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm.auto import tqdm
import pandas as pd
import logging

In [None]:
# Set up logging: configure the root logger at INFO level so the progress
# messages emitted via logging.info() in the functions below are not filtered out.
logging.basicConfig(level=logging.INFO)

In [None]:
def load_and_prepare_data():
    """Fetch the IMDB dataset and print a short summary of it.

    Returns:
        The object returned by ``load_dataset("imdb")``. Its ``"train"`` and
        ``"test"`` entries are indexed here only to report their sizes.
    """
    logging.info("Loading IMDB dataset...")
    imdb = load_dataset("imdb")

    # Quick sanity report: overall structure, then the size of each split.
    print(f"Dataset format: {imdb}")
    print(f"Training samples: {len(imdb['train'])}")
    print(f"Testing samples: {len(imdb['test'])}")

    return imdb

In [None]:
def initialize_model():
    """Load the pretrained checkpoint and its matching tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is loaded through
        ``AutoModelForSequenceClassification`` with two labels
        (positive/negative).
    """
    checkpoint_name = "distilbert-base-uncased"
    logging.info(f"Loading model and tokenizer from {checkpoint_name}")

    # Tokenizer and model are loaded from the same checkpoint name.
    loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        checkpoint_name,
        num_labels=2,  # binary task: positive vs. negative
    )

    return classifier, loaded_tokenizer

In [None]:
def preprocess_data(dataset, tokenizer, max_length=512):
    """Tokenize the ``"text"`` column of every split while keeping the labels.

    Args:
        dataset: Mapping of splits that exposes ``map(...)`` and whose
            ``"train"`` split exposes ``column_names``.
        tokenizer: Callable applied to a batch of texts; it is invoked with
            ``truncation=True`` and ``max_length=max_length``.
        max_length: Maximum sequence length after truncation (default 512,
            the value that was previously hard-coded).

    Returns:
        The result of ``dataset.map(...)``: the tokenizer outputs plus the
        ``"label"`` column.
    """
    def preprocess_function(examples):
        # No padding here: sequences are padded per training batch by the
        # data collator, which avoids padding each map batch to its longest
        # member.
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=max_length
        )

    # Drop the raw input columns but keep "label". Removing every original
    # column would also discard the labels, leaving nothing to compute the
    # loss or the evaluation metrics from.
    columns_to_drop = [
        name for name in dataset["train"].column_names if name != "label"
    ]

    logging.info("Tokenizing dataset...")
    tokenized_dataset = dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=columns_to_drop
    )

    return tokenized_dataset

In [None]:
def compute_metrics(pred):
    """Score a batch of predictions with accuracy, precision, recall and F1.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (scores; the arg-max over the last axis is taken
            as the predicted class).

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth returned element (support) is not reported.
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )
    accuracy = accuracy_score(y_true, y_pred)

    return {
        'accuracy': accuracy,
        'f1': f_score,
        'precision': prec,
        'recall': rec,
    }

In [None]:
def setup_training(model, tokenizer, tokenized_dataset):
    """Build the ``TrainingArguments`` and the ``Trainer`` for fine-tuning.

    Two keyword names differ between transformers releases, so they are chosen
    by inspecting the installed signatures instead of being hard-coded:

    * ``eval_strategy`` (newer) vs. ``evaluation_strategy`` (older) on
      ``TrainingArguments``;
    * ``processing_class`` (newer) vs. ``tokenizer`` (older) on ``Trainer``.

    Args:
        model: Model to fine-tune.
        tokenizer: Tokenizer handed to the trainer and to the padding collator.
        tokenized_dataset: Mapping with ``"train"`` and ``"test"`` splits.

    Returns:
        The configured ``Trainer`` instance.
    """
    import inspect  # local import keeps this cell self-contained

    args_kwargs = dict(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        logging_dir='./logs',
        logging_steps=100,
        seed=42
    )
    # Evaluate once per epoch, using whichever keyword this release accepts.
    # Prefer the newer name when both exist so no deprecation path is hit.
    accepted_args = inspect.signature(TrainingArguments).parameters
    strategy_key = (
        "eval_strategy" if "eval_strategy" in accepted_args
        else "evaluation_strategy"
    )
    args_kwargs[strategy_key] = "epoch"
    training_args = TrainingArguments(**args_kwargs)

    # Same idea for the tokenizer argument of Trainer.
    accepted_trainer = inspect.signature(Trainer).parameters
    tokenizer_key = (
        "processing_class" if "processing_class" in accepted_trainer
        else "tokenizer"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
        **{tokenizer_key: tokenizer}
    )

    return trainer

In [None]:
def train_model(trainer):
    """Run training, then evaluation, printing the metrics of both.

    Args:
        trainer: Object exposing ``train()`` (whose result has a ``metrics``
            attribute) and ``evaluate()`` (returning a mapping of metric
            names to numeric values).

    Returns:
        A ``(train_results, eval_results)`` tuple with the raw return values
        of ``trainer.train()`` and ``trainer.evaluate()``.
    """
    logging.info("Starting model training...")
    training_output = trainer.train()

    print("\nTraining completed!")
    print("Training metrics:")
    print(training_output.metrics)

    logging.info("Evaluating model performance...")
    evaluation_output = trainer.evaluate()

    print("\nEvaluation Results:")
    # One "name: value" line per metric, values shown with four decimals.
    for metric_name in evaluation_output:
        metric_value = evaluation_output[metric_name]
        print(f"{metric_name}: {metric_value:.4f}")

    return training_output, evaluation_output

In [None]:
def save_model(model, tokenizer, output_dir="../models/fine_tuned_sentiment"):
    """Write the model and then the tokenizer to ``output_dir``.

    Args:
        model: Object exposing ``save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.
        output_dir: Destination directory passed to both ``save_pretrained``
            calls.
    """
    logging.info(f"Saving model to {output_dir}")
    # Same destination for both artifacts; the model is written first.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(output_dir)
    print(f"Model and tokenizer saved to {output_dir}")

In [None]:
def test_model(model, tokenizer):
    """Test the model with sample reviews."""
    def predict_sentiment(text):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        prediction = torch.argmax(probabilities, dim=-1).item()
        confidence = probabilities[0][prediction].item()
        
        return {
            "sentiment": "POSITIVE" if prediction == 1 else "NEGATIVE",
            "confidence": confidence
        }
    
    sample_reviews = [
        "This movie was absolutely fantastic! The acting was superb and the story was engaging throughout.",
        "I was really disappointed with this film. The plot was confusing and the pacing was too slow.",
        "An average movie with some good moments but nothing spectacular. The acting was decent.",
        "The special effects were amazing, but the story lacked depth and character development."
    ]
    
    print("Testing model with sample reviews:\n")
    for review in sample_reviews:
        result = predict_sentiment(review)
        print(f"Review: {review}")
        print(f"Sentiment: {result['sentiment']}")
        print(f"Confidence: {result['confidence']:.4f}\n")

In [None]:
def main():
    """Run the full pipeline: load, tokenize, train, evaluate, save, test."""
    # Data and model come first; the tokenizer is needed for preprocessing.
    imdb = load_and_prepare_data()
    model, tokenizer = initialize_model()
    tokenized = preprocess_data(imdb, tokenizer)

    # Fine-tune and evaluate; the returned metrics are printed by train_model.
    trainer = setup_training(model, tokenizer, tokenized)
    train_model(trainer)

    # Persist the result, then run it on the sample reviews.
    save_model(model, tokenizer)
    test_model(model, tokenizer)

In [None]:
# Entry point: run the full pipeline when this code is executed directly
# (i.e. when __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()