 # Fine-tune DistilBERT model for Deepfake Tweet Detection



 This notebook implements a fine-tuning pipeline for the DistilBERT model, aiming to achieve better performance

 than the BERT baseline while being more efficient.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from transformers import (
    DistilBertTokenizer, 
    DistilBertForSequenceClassification, 
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from sklearn.metrics import classification_report

# Import our modular utilities
from model_utils import (
    set_seed, prepare_data, train_model, evaluate_model,
    plot_training_stats, plot_confusion_matrix, analyze_results_by_length, analyze_results_by_class
)


In [None]:
# Seed through the project helper so repeated runs are reproducible
set_seed(42)

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


 ## Load Dataset

In [None]:
# Read the three preprocessed splits (train / validation / test) from data_path
data_path = "data/preprocessed/"
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ("tweepfake_train.csv", "tweepfake_val.csv", "tweepfake_test.csv")
)

# Report (rows, columns) for each split, one line per split
print(
    f"Train set shape: {train_df.shape}",
    f"Validation set shape: {val_df.shape}",
    f"Test set shape: {test_df.shape}",
    sep="\n",
)


In [None]:
# Preview the first five training rows (rendered as the cell's rich output)
train_df.head(n=5)


In [None]:
# Row counts per value of 'account.type' in the training split
print(
    "Train set class distribution:",
    train_df['account.type'].value_counts(),
    sep="\n",
)
# Row counts per value of the finer-grained 'class_type' column
print(
    "\nDetailed class distribution:",
    train_df['class_type'].value_counts(),
    sep="\n",
)


 ## Prepare Data and Load Model

In [None]:
# Load the DistilBERT tokenizer that matches the checkpoint fine-tuned below
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Single source of truth for the batch size, so the log message below cannot
# drift from the value actually passed to prepare_data
BATCH_SIZE = 32

# Build the dataloaders for all three splits. prepare_data returns six values;
# only the last three (the dataloaders) are used in this notebook.
_, _, _, train_dataloader, val_dataloader, test_dataloader = prepare_data(
    train_df, val_df, test_df, tokenizer, batch_size=BATCH_SIZE
)

print(f"Created dataloaders with batch size {BATCH_SIZE}")


 ## Load DistilBERT Model

 DistilBERT is a distilled version of BERT: according to its authors, it retains about 97% of BERT's language-understanding performance while being 40% smaller and 60% faster at inference.

In [None]:
# Instantiate the pre-trained DistilBERT checkpoint with a 2-label classification
# head and move it to the selected device in one step. Attention maps and hidden
# states are not requested in the model outputs.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
).to(device)
print("Loaded pre-trained DistilBERT model")


 ## Training Configuration

In [None]:
# Training length: 5 epochs (per the original author's note, one more than the
# 4 used for the BERT baseline, to compensate for the smaller model)
epochs = 5

# One optimizer/scheduler step per batch, for every epoch
total_steps = len(train_dataloader) * epochs

# AdamW over all model parameters; the learning rate is deliberately a little
# higher than the one used for BERT
optimizer = AdamW(params=model.parameters(), lr=5e-5, eps=1e-8)

# Linear schedule with warmup: the first 10% of the steps are warmup steps
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps,
)


 ## Train the Model

In [None]:
# Run the fine-tuning loop. train_model returns the trained model together with
# the training statistics that later cells plot.
fine_tuned_model, training_stats = train_model(
    model,
    train_dataloader,
    val_dataloader,
    optimizer,
    scheduler,
    epochs,
    device,
)


 ## Visualize Training Results

In [None]:
# Visualize the statistics returned by train_model.
# NOTE(review): which curves are drawn (loss, accuracy, ...) is decided inside
# model_utils.plot_training_stats — confirm there.
plot_training_stats(training_stats)


 ## Evaluate on Test Set

In [None]:
# Score the fine-tuned model on the held-out test split
print("Evaluating DistilBERT model on test set...")
eval_results = evaluate_model(fine_tuned_model, test_dataloader, device)

# Headline metrics, each shown to four decimal places
print(
    f"Test Loss: {eval_results['loss']:.4f}",
    f"Test Accuracy: {eval_results['accuracy']:.4f}",
    f"Test F1 Score: {eval_results['f1']:.4f}",
    sep="\n",
)

# Per-class precision / recall / F1 from scikit-learn.
# NOTE(review): target_names assumes label 0 = Human and label 1 = Bot —
# confirm against the label encoding used in prepare_data.
print("\nClassification Report:")
print(
    classification_report(
        eval_results['true_labels'],
        eval_results['predictions'],
        target_names=['Human', 'Bot'],
    )
)

# Confusion matrix of true vs. predicted labels
plot_confusion_matrix(eval_results['true_labels'], eval_results['predictions'])


 ## Error Analysis

In [None]:
# Break the test-set results down by tweet length, pairing the test rows with
# the model's predictions.
# NOTE(review): presumably requires eval_results['predictions'] to be in the same
# row order as test_df (i.e. an unshuffled test_dataloader) — confirm in model_utils.
accuracy_by_length = analyze_results_by_length(test_df, eval_results['predictions'])


In [None]:
# Break the test-set results down by tweet class, pairing the test rows with
# the model's predictions.
# NOTE(review): presumably requires eval_results['predictions'] to be in the same
# row order as test_df (i.e. an unshuffled test_dataloader) — confirm in model_utils.
accuracy_by_class = analyze_results_by_class(test_df, eval_results['predictions'])


 ## Compare with BERT Baseline

In [None]:
# Recap DistilBERT's measured test metrics
print("DistilBERT performed with the following metrics:")
print(
    f"Accuracy: {eval_results['accuracy']:.4f}",
    f"F1 Score: {eval_results['f1']:.4f}",
    sep="\n",
)

# NOTE(review): the bullet points below are static text; no BERT metrics are
# loaded or compared in this cell.
print(
    "\nCompared to the BERT baseline, DistilBERT offers:",
    "- Faster training and inference times",
    "- Smaller model size (40% smaller than BERT)",
    "- Comparable performance while being more efficient",
    sep="\n",
)


 ## Save Model (Optional)

In [None]:
# Optionally persist the fine-tuned model and its tokenizer to disk.
# Saving is off by default; set SAVE_MODEL = True to enable it. An explicit flag
# is used rather than wrapping the code in a string literal, because a bare
# string at the end of a cell is echoed by Jupyter as the cell's output.
SAVE_MODEL = False
model_save_path = "models/distilbert_deepfake_detector/"

if SAVE_MODEL:
    # Model weights/config and tokenizer files go to the same directory so the
    # pair can be reloaded together with from_pretrained(model_save_path)
    fine_tuned_model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    print(f"Model saved to {model_save_path}")


