In [None]:
!pip install -U datasets

In [None]:
# Import necessary libraries:
import inspect
import re
import warnings

import numpy as np
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments
)
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Seed NumPy and PyTorch with the same value so runs are repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Report which device is available.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device_name}")


In [None]:
# Load IMDB dataset:
print("Loading IMDB dataset...")
dataset = load_dataset("imdb")

# The IMDB splits on the Hub are stored grouped by label, so taking the first
# N rows yields a single-class subset. Shuffle (with a fixed seed, for
# reproducibility) before sub-sampling so both classes are represented.
N_TRAIN_EXAMPLES = 5000
N_TEST_EXAMPLES = 500
dataset["train"] = dataset["train"].shuffle(seed=42).select(range(N_TRAIN_EXAMPLES))
dataset["test"] = dataset["test"].shuffle(seed=42).select(range(N_TEST_EXAMPLES))

# Display dataset info:
print("Dataset structure:")
print(dataset)
print(f"\nTraining examples: {len(dataset['train'])}")
print(f"Test examples: {len(dataset['test'])}")

# Show sample data:
print("\nSample training example:")
sample = dataset['train'][0]
print(f"Text: {sample['text'][:200]}...")
print(f"Label: {sample['label']} ({'positive' if sample['label'] == 1 else 'negative'})")

# Label distribution:
# Materialise the column as a plain list so .count() works regardless of the
# column type returned by the installed `datasets` version.
train_labels = list(dataset['train']['label'])
print("\nLabel distribution in training set:")
print(f"Negative (0): {train_labels.count(0)}")
print(f"Positive (1): {train_labels.count(1)}")

# Guard against silently training on a single class.
assert set(train_labels) == {0, 1}, "training subset must contain both classes"


In [None]:
# Text preprocessing function:
def preprocess_text(text):
    """Clean a raw review string.

    HTML tags are replaced by a single space (rather than removed outright)
    so the words on either side of a tag such as ``<br />`` are not fused
    together. The text is then lower-cased, every run of whitespace is
    collapsed to one space, and leading/trailing whitespace is stripped.

    Args:
        text: Raw review text.

    Returns:
        The cleaned string.
    """
    text = re.sub(r'<.*?>', ' ', text)  # Tag -> space keeps word boundaries.
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Also collapses the spaces added above.
    return text.strip()

# Run preprocess_text over every example; labels pass through unchanged.
print("Preprocessing text data...")


def _clean_example(example):
    """Return the example with its 'text' field cleaned by preprocess_text."""
    return {'text': preprocess_text(example['text']), 'label': example['label']}


dataset = dataset.map(_clean_example, batched=False)

# Signal that the cleaning pass has finished.
print("Preprocessing complete!")


In [None]:
# Model configuration:
MODEL_NAME = 'distilbert-base-uncased'  # checkpoint for both tokenizer and classifier
MAX_LENGTH = 512  # token cap passed to the tokenizer

# Load the two-label classifier and the tokenizer from the same checkpoint.
print(f"Loading {MODEL_NAME} tokenizer and model...")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("Model loaded successfully!")

# Tokenize datasets:
def tokenize_function(examples):
    """Tokenize a batch of examples for the model.

    Sequences are truncated to ``MAX_LENGTH`` tokens but deliberately left
    unpadded. Padding here would pad every row to the longest sequence in the
    whole ``map`` batch; the Trainer, which is given the tokenizer, instead
    pads each training batch only to that batch's longest sequence.

    Args:
        examples: Batch mapping whose ``'text'`` entry holds the reviews.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples['text'], truncation=True, max_length=MAX_LENGTH)

print("Tokenizing datasets...")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Expose only the model inputs and the label, formatted as PyTorch tensors.
tensor_columns = ['input_ids', 'attention_mask', 'label']
tokenized_datasets.set_format(type='torch', columns=tensor_columns)

print("Tokenization complete!")


In [None]:
# Pull the two tokenized splits out under the names used by the Trainer cell.
train_dataset, test_dataset = tokenized_datasets['train'], tokenized_datasets['test']

# Report the size of each split.
for split_name, split in (("Training", train_dataset), ("Test", test_dataset)):
    print(f"{split_name} examples: {len(split)}")


In [None]:
# Define compute metrics function for evaluation:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 / precision / recall.

    Args:
        eval_pred: Object that unpacks into ``(predictions, labels)``.
            Predictions are reduced to class indices with an arg-max over
            axis 1 (assumes per-class scores along that axis; confirm
            against the caller).

    Returns:
        Dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)

    # The three class-wise metrics share the same averaging mode.
    averaging = {'average': 'weighted'}

    return {
        'accuracy': accuracy_score(labels, predicted),
        'f1': f1_score(labels, predicted, **averaging),
        'precision': precision_score(labels, predicted, **averaging),
        'recall': recall_score(labels, predicted, **averaging),
    }

# Collect the fine-tuning hyper-parameters in one mapping, then build the
# TrainingArguments object from it.
training_config = dict(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    seed=42,
    report_to="none",
)
training_args = TrainingArguments(**training_config)

print("Training arguments set up successfully!")


In [None]:
# Initialize trainer:
# Newer transformers releases take the tokenizer through `processing_class`
# (the older `tokenizer` keyword is deprecated). Pick whichever keyword the
# installed Trainer accepts so this cell runs on both.
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = 'processing_class' if 'processing_class' in trainer_params else 'tokenizer'

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer}
)

print("Trainer initialized successfully!")


In [None]:
# Announce the run, then fine-tune with the configured Trainer.
print("Starting training...", "=" * 50, sep="\n")

trainer.train()

print("Training completed!")


In [None]:
# Evaluate the model:
banner = "=" * 50
print(banner, "EVALUATING MODEL", banner, sep="\n")

# Run evaluation on the dataset the Trainer was given as eval_dataset.
eval_results = trainer.evaluate()

# Print each reported metric to four decimals, in a fixed order.
print("Final Test Results:")
metric_labels = {
    'eval_accuracy': "Accuracy",
    'eval_f1': "F1 Score",
    'eval_precision': "Precision",
    'eval_recall': "Recall",
    'eval_loss': "Loss",
}
for metric_key, metric_label in metric_labels.items():
    print(f"{metric_label}: {eval_results[metric_key]:.4f}")

# Save the model:
print("\n" + banner)
print("SAVING MODEL")
print(banner)

# Write the model and the tokenizer to the same directory.
model_save_path = "./fine_tuned_sentiment_model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model saved to {model_save_path}")
print("Model saved successfully!")


In [None]:
# Section header for the sample-prediction demo.
section_rule = "=" * 50
print(section_rule, "SAMPLE PREDICTIONS", section_rule, sep="\n")

def predict_sentiment(text):
    """Predict the sentiment of a single review.

    Args:
        text: Review text to classify.

    Returns:
        A ``(sentiment, confidence)`` tuple. ``sentiment`` is ``"Positive"``
        when the arg-max class index is 1 and ``"Negative"`` otherwise;
        ``confidence`` is the softmax probability of that class as a float.
    """
    # Move the inputs to whatever device the model lives on instead of
    # assuming CUDA, so this also works on CPU-only machines.
    inputs = tokenizer(
        text, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='pt'
    ).to(model.device)

    model.eval()  # Inference mode, so the result does not depend on prior cells.
    with torch.no_grad():
        logits = model(**inputs).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        predicted_class = int(torch.argmax(probabilities, dim=-1).item())

    sentiment = "Positive" if predicted_class == 1 else "Negative"
    confidence = probabilities[0, predicted_class].item()

    return sentiment, confidence

# Hand-written reviews used to exercise the fine-tuned model.
sample_reviews = [
    "This movie was absolutely fantastic! The acting was superb and the plot was engaging.",
    "Terrible movie. I wasted my time watching this boring and poorly written film.",
    "One of the best movies I've ever seen! Brilliant cinematography and outstanding performances."
]

# Print a 50-character preview of each review with its predicted sentiment.
for number, review_text in enumerate(sample_reviews, start=1):
    preview = review_text[:50]
    sentiment, confidence = predict_sentiment(review_text)
    print(f"Review {number}: {preview}...")
    print(f"Sentiment: {sentiment} (Confidence: {confidence:.4f})")
    print()

closing_rule = "=" * 50
print(closing_rule, "SENTIMENT CLASSIFICATION COMPLETE!", closing_rule, sep="\n")
