1. SETUP & INSTALLATION

In [None]:
# Install required libraries
!pip install -q transformers torch pandas scikit-learn nltk seaborn matplotlib
!pip install -q accelerate

# For MentalRoBERTa authentication
!pip install -q huggingface_hub

# Import libraries
import os
import sys
import json
import pickle
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
def set_seed(seed=42):
    """Seed every random number generator this notebook may draw from.

    Seeds Python's built-in ``random`` module, NumPy, and PyTorch (CPU, plus
    all CUDA devices when CUDA is available).

    Args:
        seed: Integer seed applied to all generators.
    """
    # Local import: the notebook's import cell does not bring in ``random``.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)

# Pick the compute device: CUDA when PyTorch reports it as available, else CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")
print(f"GPU available: {torch.cuda.is_available()}")
if device.type == 'cuda':
    # Only query the GPU name when a CUDA device was actually selected
    print(f"GPU: {torch.cuda.get_device_name(0)}")

2. DATA LOADING & PREPROCESSING FUNCTIONS

In [None]:
# Download NLTK resources
import nltk
# Fetch each NLTK data package requested by this notebook, in the original order
for _nltk_resource in (
    'punkt_tab',
    'wordnet',
    'stopwords',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource)

from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re

# Configuration class
class Config:
    """Mutable bag of settings shared by every experiment.

    Holds the dataset path, text-preprocessing switches, model/tokenizer
    parameters, training hyperparameters and the root results directory.
    ``update_for_experiment`` specialises an instance for one named run.
    """

    def __init__(self):
        # --- Input data ---
        self.data_path = '/content/Dataset.csv'

        # --- Text preprocessing switches ---
        self.remove_stopwords = True
        self.use_stemming = True
        self.expand_contractions = True
        self.expand_abbreviations = True

        # --- Model / tokenizer ---
        self.model_name = 'roberta-base'  # overwritten by update_for_experiment
        self.num_labels = 6
        self.max_length = 128

        # --- Training hyperparameters ---
        self.batch_size = 8
        self.eval_batch_size = 32
        self.learning_rate = 1e-5
        self.fine_tune_epochs = 2
        self.classifier_epochs = 1

        # --- Output location ---
        self.results_dir = '/content/results'

    def update_for_experiment(self, exp_name, model_name, preprocessing_type='basic'):
        """Specialise this config for one experiment and create its output folder.

        Args:
            exp_name: Experiment name; also the sub-directory name under
                ``results_dir``.
            model_name: Model identifier stored in ``self.model_name``.
            preprocessing_type: ``'advanced'`` turns stop-word removal and
                stemming on; any other value turns both off.

        Returns:
            ``self``, so the call can be chained onto ``Config()``.
        """
        self.exp_name = exp_name
        self.model_name = model_name

        # Both switches follow the same rule, so derive them from one flag
        use_advanced = preprocessing_type == 'advanced'
        self.remove_stopwords = use_advanced
        self.use_stemming = use_advanced

        # Each experiment writes into its own directory
        self.exp_dir = os.path.join(self.results_dir, exp_name)
        os.makedirs(self.exp_dir, exist_ok=True)

        return self

# Preprocessing functions
def get_wordnet_pos(token):
    """Map a single token to the WordNet POS constant used for lemmatization.

    The token is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects noun / verb / adverb / adjective.

    Args:
        token: Word to tag.

    Returns:
        One of ``wordnet.NOUN``, ``wordnet.VERB``, ``wordnet.ADV`` or
        ``wordnet.ADJ``. ``wordnet.NOUN`` is returned for an empty token, for
        a tag with no mapping, and when tagging raises.
    """
    if not token:
        return wordnet.NOUN
    try:
        tag = nltk.pos_tag([token])[0][1][0].upper()
    except Exception:
        # Best-effort fallback to noun. ``Exception`` rather than a bare
        # ``except`` so KeyboardInterrupt / SystemExit still propagate and a
        # long preprocessing loop can be interrupted.
        return wordnet.NOUN
    tag_dict = {"N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV, "J": wordnet.ADJ}
    return tag_dict.get(tag, wordnet.NOUN)

def preprocess_tokens(tokens, config):
    """Optionally drop English stop words and Porter-stem the remaining tokens.

    Args:
        tokens: List of token strings.
        config: Object exposing boolean ``remove_stopwords`` and
            ``use_stemming`` attributes.

    Returns:
        The processed token list; the input list itself when both switches
        are off.
    """
    result = tokens

    if config.remove_stopwords:
        # Stop-word comparison is case-insensitive
        english_stops = set(stopwords.words('english'))
        result = list(filter(lambda tok: tok.lower() not in english_stops, result))

    if config.use_stemming:
        porter = PorterStemmer()
        result = list(map(porter.stem, result))

    return result

# Clitic tokens (as split off by the tokenizer) -> full word.
# Values carry no leading space: tokens are joined with ' ' at the end, so a
# leading space would produce a double space in the output.
_CONTRACTION_MAPPING = {
    "n't": "not", "'s": "is", "'m": "am", "'re": "are", "'ve": "have", "'ll": "will",
}

# Upper-case chat abbreviations -> expansion (matched case-sensitively).
# NOTE(review): a few expansions carry glossary-style annotations
# ("God/Goodness", "(used for old friends)", "(I Swear)"); confirm whether
# those should really end up in the model input.
_ABBREVIATION_MAPPING = {
    "BRB": "Be Right Back",
    "BTW": "By the Way",
    "OMG": "Oh My God/Goodness",
    "IDK": "I Don’t Know",
    "TTYL": "Talk to You Later",
    "OMW": "On My Way",
    "SMH": "Shaking My Head",
    "LOL": "Laugh Out Loud",
    "TBD": "To be Determined",
    # Previously one fused "IMHO/IMO" key, which is not one abbreviation.
    "IMHO": "In My Humble Opinion",
    "IMO": "In My Opinion",
    "HMU": "Hit Me Up",
    "LMK": "Let Me Know",
    "OG": "Original Gangsters (used for old friends)",
    "FTW": "For The Win",
    "NVM": "Nevermind",
    "OOTD": "Outfit of the Day",
    "FWIW": "For What It’s Worth",
    "NGL": "Not Gonna Lie",
    "RQ": "Real Quick",
    "IYKYK": "If You Know, You Know",
    "ONG": "On God (I Swear)",
    "BRT": "Be Right There",
    "SM": "So Much",
    "IG": "I Guess",
    "WYA": "Where You At",
    "ISTG": "I Swear to God",
    "HBU": "How About You",
    "ATM": "At the Moment",
    "NP": "No Problem",
    "FOMO": "Fear of Missing Out",
    "OBV": "Obviously",
    "RN": "Right Now",
}


def preprocess_text(text, config):
    """Clean one raw text and return it as a single space-joined string.

    Pipeline: strip URLs -> tokenize -> lemmatize (POS chosen per token) ->
    expand abbreviations -> stop-word removal / stemming (per ``config``) ->
    expand contractions -> join with single spaces.

    Args:
        text: Raw input. Anything that is not a ``str`` (e.g. a NaN cell)
            yields an empty string.
        config: Object exposing ``remove_stopwords``, ``use_stemming``,
            ``expand_contractions`` and ``expand_abbreviations``.

    Returns:
        The preprocessed text.
    """
    if not isinstance(text, str):
        return ''

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Tokenize and lemmatize
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]

    # Expand abbreviations BEFORE stemming: the stemmer lowercases its input,
    # so the upper-case keys could never match afterwards. Each expansion is
    # split into words so the following steps treat it like ordinary text.
    if config.expand_abbreviations:
        expanded = []
        for token in tokens:
            expanded.extend(_ABBREVIATION_MAPPING.get(token, token).split())
        tokens = expanded

    # Apply stopword removal and stemming
    tokens = preprocess_tokens(tokens, config)

    # Expand contractions AFTER stop-word removal (original ordering), so the
    # expanded words such as "not" are not subject to stop-word filtering.
    if config.expand_contractions:
        tokens = [_CONTRACTION_MAPPING.get(token, token) for token in tokens]

    return ' '.join(tokens)

def load_and_prepare_data(config):
    """Read the CSV dataset, preprocess the titles and split train/test.

    Reads ``config.data_path``, runs ``preprocess_text`` over the ``Title``
    column and makes an 80/20 split stratified on the ``Target`` column with
    a fixed random state.

    Args:
        config: Configuration providing ``data_path`` and the preprocessing
            switches consumed by ``preprocess_text``.

    Returns:
        Tuple ``(train_data, test_data, train_labels, test_labels)``.
    """
    print(f"Loading data from: {config.data_path}")
    data = pd.read_csv(config.data_path)
    targets = data['Target']

    print(f"Dataset shape: {data.shape}")
    print(f"Target distribution:\n{targets.value_counts()}")

    # Preprocess every title, with a progress bar
    print("Applying text preprocessing...")
    processed_texts = []
    for title in tqdm(data['Title']):
        processed_texts.append(preprocess_text(title, config))

    # Stratified split keeps the class proportions in both partitions
    splits = train_test_split(
        processed_texts,
        targets,
        test_size=0.2,
        random_state=42,
        stratify=targets,
    )
    return tuple(splits)

3. MODEL & TRAINING CLASSES

In [3]:
from torch.optim import AdamW
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Simple Classifier
class SimpleClassifier(nn.Module):
    """Single linear layer mapping ``input_size`` features to ``output_size`` outputs."""

    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        projected = self.fc(x)
        return projected

# Experiment Runner
class ExperimentRunner:
    """Run one fine-tuning experiment end to end for a given config.

    Loads the tokenizer, builds dataloaders, loads the sequence-classification
    model, fine-tunes it, evaluates it, plots the confusion matrix and saves
    the results under ``config.exp_dir``.
    """

    def __init__(self, config):
        """Store ``config`` and load the tokenizer for ``config.model_name``.

        Uses the module-level ``device`` as the compute device.
        """
        self.config = config
        self.device = device
        # Load tokenizer based on model
        if 'mental' in config.model_name:
            self.tokenizer = AutoTokenizer.from_pretrained(config.model_name)
        else:
            print("Loading tokenizer for RoBERTa...")
            self.tokenizer = RobertaTokenizer.from_pretrained(config.model_name)

    def prepare_dataloaders(self, train_data, test_data, train_labels, test_labels):
        """Tokenize the texts, encode the labels and build both dataloaders.

        Args:
            train_data: Training texts.
            test_data: Test texts.
            train_labels: Labels for ``train_data``; the label encoder is
                fitted on these.
            test_labels: Labels for ``test_data``.

        Returns:
            Tuple ``(train_dataloader, test_dataloader, label_encoder)``.
            Training batches are randomly sampled; test batches are served in
            dataset order so evaluation is reproducible.
        """
        print("Tokenizing training data...")
        train_inputs = self.tokenizer(
            train_data, padding=True, truncation=True,
            return_tensors='pt', max_length=self.config.max_length
        )
        print("Tokenizing test data...")
        test_inputs = self.tokenizer(
            test_data, padding=True, truncation=True,
            return_tensors='pt', max_length=self.config.max_length
        )
        # Encode labels (fit on train only, then apply to test)
        label_encoder = LabelEncoder()
        train_labels_encoded = label_encoder.fit_transform(train_labels)
        test_labels_encoded = label_encoder.transform(test_labels)
        # Create datasets
        train_dataset = TensorDataset(
            train_inputs.input_ids,
            train_inputs.attention_mask,
            torch.tensor(train_labels_encoded)
        )
        test_dataset = TensorDataset(
            test_inputs.input_ids,
            test_inputs.attention_mask,
            torch.tensor(test_labels_encoded)
        )
        # Create dataloaders
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=self.config.batch_size,
            sampler=RandomSampler(train_dataset)
        )
        # No shuffling at evaluation time: random order adds nothing to the
        # metrics and makes the returned prediction order differ between runs.
        test_dataloader = DataLoader(
            test_dataset,
            batch_size=self.config.eval_batch_size,
            shuffle=False
        )
        return train_dataloader, test_dataloader, label_encoder

    def load_model(self):
        """Load the classification model named by ``config.model_name``.

        Returns:
            The model, moved to ``self.device``.
        """
        if 'mental' in self.config.model_name:
            print("Loading MentalRoBERTa model...")
            model = AutoModelForSequenceClassification.from_pretrained(
                self.config.model_name,
                num_labels=self.config.num_labels,
                ignore_mismatched_sizes=True
            )
        else:
            print("Loading RoBERTa model...")
            # Use the configured checkpoint (was hard-coded to 'roberta-base'),
            # so the model always matches the tokenizer loaded in __init__.
            model = RobertaForSequenceClassification.from_pretrained(
                self.config.model_name,
                num_labels=self.config.num_labels
            )
        return model.to(self.device)

    def train_fine_tuning(self, model, train_dataloader):
        """Fine-tune all model parameters with AdamW.

        Args:
            model: Model to train in place.
            train_dataloader: Yields ``(input_ids, attention_mask, labels)``
                batches.

        Returns:
            List with one dict per epoch: ``epoch`` (1-based), ``loss`` (mean
            of the per-batch losses) and ``accuracy`` (percent).
        """
        print("\n" + "="*50)
        print("FINE-TUNING PHASE")
        print("="*50)
        optimizer = AdamW(model.parameters(), lr=self.config.learning_rate)
        fine_tune_history = []
        for epoch in range(self.config.fine_tune_epochs):
            model.train()
            total_loss = 0
            total_correct = 0
            total_samples = 0
            progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{self.config.fine_tune_epochs}')
            for batch in progress_bar:
                input_ids = batch[0].to(self.device)
                attention_mask = batch[1].to(self.device)
                labels = batch[2].to(self.device)
                optimizer.zero_grad()
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                # Running training accuracy from the same forward pass
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=1)
                batch_correct = (predictions == labels).sum().item()
                total_correct += batch_correct
                total_samples += labels.size(0)
                progress_bar.set_postfix({
                    'loss': f'{loss.item():.4f}',
                    'acc': f'{(batch_correct/labels.size(0)*100):.1f}%'
                })
            epoch_loss = total_loss / len(train_dataloader)
            epoch_acc = total_correct / total_samples * 100
            fine_tune_history.append({
                'epoch': epoch + 1,
                'loss': epoch_loss,
                'accuracy': epoch_acc
            })
            print(f'Epoch {epoch+1}: Loss = {epoch_loss:.4f}, Accuracy = {epoch_acc:.2f}%')
        return fine_tune_history

    def evaluate(self, model, test_dataloader):
        """Evaluate the model on the test set.

        Args:
            model: Model to evaluate.
            test_dataloader: Yields ``(input_ids, attention_mask, labels)``
                batches.

        Returns:
            Tuple ``(metrics, all_predictions, all_labels)``. ``metrics`` has
            ``test_loss`` (mean cross-entropy per sample), ``accuracy`` and
            weighted ``precision`` / ``recall`` / ``f1_score``.
        """
        print("\n" + "="*50)
        print("EVALUATION")
        print("="*50)
        model.eval()
        all_predictions = []
        all_labels = []
        total_loss = 0.0
        # Sum per-sample losses and divide by the sample count afterwards:
        # averaging batch means would over-weight a smaller final batch.
        criterion = nn.CrossEntropyLoss(reduction='sum')
        with torch.no_grad():
            for batch in tqdm(test_dataloader, desc='Evaluating'):
                input_ids = batch[0].to(self.device)
                attention_mask = batch[1].to(self.device)
                labels = batch[2].to(self.device)
                # Forward pass through the complete model
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                total_loss += criterion(logits, labels).item()
                predictions = torch.argmax(logits, dim=1)
                all_predictions.extend(predictions.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
        # Calculate metrics (guard the division for an empty dataloader)
        test_loss = total_loss / max(len(all_labels), 1)
        accuracy = accuracy_score(all_labels, all_predictions)
        # zero_division=0 keeps the default score for never-predicted classes
        # while making that choice explicit.
        precision = precision_score(all_labels, all_predictions, average='weighted', zero_division=0)
        recall = recall_score(all_labels, all_predictions, average='weighted', zero_division=0)
        f1 = f1_score(all_labels, all_predictions, average='weighted', zero_division=0)
        metrics = {
            'test_loss': test_loss,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        }
        print(f"Test Loss: {test_loss:.4f}")
        print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}")
        return metrics, all_predictions, all_labels

    def plot_confusion_matrix(self, predictions, labels, label_encoder, exp_name):
        """Plot the confusion matrix, save it as a PNG and return it.

        Args:
            predictions: Predicted (encoded) class ids.
            labels: True (encoded) class ids.
            label_encoder: Fitted encoder; ``classes_`` supplies tick labels.
            exp_name: Experiment name shown in the plot title.

        Returns:
            The confusion matrix array.
        """
        # Pin the label set to every encoded class so the matrix shape always
        # matches the tick labels, even if a class is absent from this split.
        class_ids = np.arange(len(label_encoder.classes_))
        cm = confusion_matrix(labels, predictions, labels=class_ids)
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=label_encoder.classes_,
                   yticklabels=label_encoder.classes_)
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title(f'Confusion Matrix - {exp_name}')
        # Save plot
        cm_path = os.path.join(self.config.exp_dir, 'confusion_matrix.png')
        plt.savefig(cm_path, bbox_inches='tight', dpi=300)
        plt.show()
        return cm

    def save_results(self, fine_tune_history, metrics, label_encoder):
        """Write ``results.json`` and ``label_encoder.pkl`` into ``config.exp_dir``.

        Args:
            fine_tune_history: Per-epoch history from ``train_fine_tuning``.
            metrics: Metrics dict from ``evaluate``.
            label_encoder: Object pickled alongside the results.
        """
        results = {
            'config': {
                'model_name': self.config.model_name,
                'preprocessing': {
                    'remove_stopwords': self.config.remove_stopwords,
                    'use_stemming': self.config.use_stemming
                },
                'training_params': {
                    'learning_rate': self.config.learning_rate,
                    'fine_tune_epochs': self.config.fine_tune_epochs,
                }
            },
            'fine_tune_history': fine_tune_history,
            'metrics': metrics
        }
        # Save results as JSON (explicit encoding: do not depend on the locale)
        results_path = os.path.join(self.config.exp_dir, 'results.json')
        with open(results_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)
        # Save label encoder
        le_path = os.path.join(self.config.exp_dir, 'label_encoder.pkl')
        with open(le_path, 'wb') as f:
            pickle.dump(label_encoder, f)
        print(f"Results saved to: {self.config.exp_dir}")

4. EXPERIMENT 1: RoBERTa (Basic Preprocessing)

In [None]:
print("\n" + "=" * 60)
print("EXPERIMENT 1: RoBERTa Baseline")
print("=" * 60)

# Configuration: roberta-base with basic preprocessing
config = Config()
config.update_for_experiment(
    exp_name="roberta_baseline",
    model_name="roberta-base",
    preprocessing_type="basic",
)

# Runner (loads the tokenizer for the configured model)
runner = ExperimentRunner(config)

# Data: preprocess + split, then tokenize into dataloaders
train_data, test_data, train_labels, test_labels = load_and_prepare_data(config)
train_dataloader, test_dataloader, label_encoder = runner.prepare_dataloaders(
    train_data=train_data,
    test_data=test_data,
    train_labels=train_labels,
    test_labels=test_labels,
)

# Model
model = runner.load_model()

# Fine-tune
fine_tune_history = runner.train_fine_tuning(model=model, train_dataloader=train_dataloader)

# Evaluate on the held-out split
metrics, predictions, true_labels = runner.evaluate(model=model, test_dataloader=test_dataloader)

# Confusion matrix (also saved into the experiment directory)
cm = runner.plot_confusion_matrix(
    predictions=predictions,
    labels=true_labels,
    label_encoder=label_encoder,
    exp_name=config.exp_name,
)

# Persist history, metrics and the label encoder
runner.save_results(
    fine_tune_history=fine_tune_history,
    metrics=metrics,
    label_encoder=label_encoder,
)

5. EXPERIMENT 2: RoBERTa Advanced Preprocessing

In [None]:
print("\n" + "=" * 60)
print("EXPERIMENT 2: RoBERTa with Advanced Preprocessing")
print("=" * 60)

# Configuration: roberta-base with stop-word removal and stemming enabled
config = Config()
config.update_for_experiment(
    exp_name="roberta_advanced",
    model_name="roberta-base",
    preprocessing_type="advanced",
)

# Runner (loads the tokenizer for the configured model)
runner = ExperimentRunner(config)

# Data: preprocess + split, then tokenize into dataloaders
train_data, test_data, train_labels, test_labels = load_and_prepare_data(config)
train_dataloader, test_dataloader, label_encoder = runner.prepare_dataloaders(
    train_data=train_data,
    test_data=test_data,
    train_labels=train_labels,
    test_labels=test_labels,
)

# Model
model = runner.load_model()

# Fine-tune, evaluate, plot and persist
fine_tune_history = runner.train_fine_tuning(model=model, train_dataloader=train_dataloader)
metrics, predictions, true_labels = runner.evaluate(model=model, test_dataloader=test_dataloader)
cm = runner.plot_confusion_matrix(
    predictions=predictions,
    labels=true_labels,
    label_encoder=label_encoder,
    exp_name=config.exp_name,
)
runner.save_results(
    fine_tune_history=fine_tune_history,
    metrics=metrics,
    label_encoder=label_encoder,
)

6. EXPERIMENT 3: MentalRoBERTa (Basic Preprocessing)

In [None]:
print("\n" + "=" * 60)
print("EXPERIMENT 3: MentalRoBERTa Baseline")
print("=" * 60)

# Hugging Face login for MentalRoBERTa (only needed once per session)
from huggingface_hub import notebook_login
notebook_login()

# Configuration: MentalRoBERTa with basic preprocessing
config = Config()
config.update_for_experiment(
    exp_name="mentalroberta_baseline",
    model_name="mental/mental-roberta-base",
    preprocessing_type="basic",
)

# Runner (loads the tokenizer for the configured model)
runner = ExperimentRunner(config)

# Data: preprocess + split, then tokenize into dataloaders
train_data, test_data, train_labels, test_labels = load_and_prepare_data(config)
train_dataloader, test_dataloader, label_encoder = runner.prepare_dataloaders(
    train_data=train_data,
    test_data=test_data,
    train_labels=train_labels,
    test_labels=test_labels,
)

# Model
model = runner.load_model()

# Fine-tune, evaluate, plot and persist
fine_tune_history = runner.train_fine_tuning(model=model, train_dataloader=train_dataloader)
metrics, predictions, true_labels = runner.evaluate(model=model, test_dataloader=test_dataloader)
cm = runner.plot_confusion_matrix(
    predictions=predictions,
    labels=true_labels,
    label_encoder=label_encoder,
    exp_name=config.exp_name,
)
runner.save_results(
    fine_tune_history=fine_tune_history,
    metrics=metrics,
    label_encoder=label_encoder,
)

7. EXPERIMENT 4: MentalRoBERTa Advanced Preprocessing

In [None]:
print("\n" + "=" * 60)
print("EXPERIMENT 4: MentalRoBERTa with Advanced Preprocessing")
print("=" * 60)

# Configuration: MentalRoBERTa with stop-word removal and stemming enabled
config = Config()
config.update_for_experiment(
    exp_name="mentalroberta_advanced",
    model_name="mental/mental-roberta-base",
    preprocessing_type="advanced",
)

# Runner (loads the tokenizer for the configured model)
runner = ExperimentRunner(config)

# Data: preprocess + split, then tokenize into dataloaders
train_data, test_data, train_labels, test_labels = load_and_prepare_data(config)
train_dataloader, test_dataloader, label_encoder = runner.prepare_dataloaders(
    train_data=train_data,
    test_data=test_data,
    train_labels=train_labels,
    test_labels=test_labels,
)

# Model
model = runner.load_model()

# Fine-tune, evaluate, plot and persist
fine_tune_history = runner.train_fine_tuning(model=model, train_dataloader=train_dataloader)
metrics, predictions, true_labels = runner.evaluate(model=model, test_dataloader=test_dataloader)
cm = runner.plot_confusion_matrix(
    predictions=predictions,
    labels=true_labels,
    label_encoder=label_encoder,
    exp_name=config.exp_name,
)
runner.save_results(
    fine_tune_history=fine_tune_history,
    metrics=metrics,
    label_encoder=label_encoder,
)

8. RESULTS COMPARISON & VISUALIZATION

In [None]:
import glob

def load_and_compare_results(results_dir='/content/results'):
    """Collect every experiment's ``results.json`` into one comparison table.

    Each direct child of ``results_dir`` that contains a ``results.json`` is
    treated as one experiment; other entries are skipped.

    Args:
        results_dir: Directory holding one sub-directory per experiment.

    Returns:
        DataFrame with one row per experiment, ordered by path, with columns
        ``Experiment``, ``Model``, ``Preprocessing``, ``Accuracy``,
        ``Precision``, ``Recall``, ``F1-Score`` and ``Test Loss``. The columns
        are present even when no results are found.
    """
    columns = [
        'Experiment', 'Model', 'Preprocessing',
        'Accuracy', 'Precision', 'Recall', 'F1-Score', 'Test Loss',
    ]
    comparison_data = []

    # sorted(): glob order is filesystem-dependent, which would make the row
    # order (and every plot built from it) vary between runs.
    for exp_dir in sorted(glob.glob(os.path.join(results_dir, '*'))):
        results_path = os.path.join(exp_dir, 'results.json')
        if not os.path.exists(results_path):
            continue

        with open(results_path, 'r', encoding='utf-8') as f:
            results = json.load(f)

        exp_config = results['config']
        exp_metrics = results['metrics']
        comparison_data.append({
            'Experiment': os.path.basename(exp_dir),
            # Drop the organisation prefix, e.g. "org/name" -> "name"
            'Model': exp_config['model_name'].split('/')[-1],
            'Preprocessing': 'Advanced' if exp_config['preprocessing']['remove_stopwords'] else 'Basic',
            'Accuracy': exp_metrics['accuracy'],
            'Precision': exp_metrics['precision'],
            'Recall': exp_metrics['recall'],
            'F1-Score': exp_metrics['f1_score'],
            'Test Loss': exp_metrics['test_loss'],
        })

    # Explicit columns keep the schema stable for an empty result set
    return pd.DataFrame(comparison_data, columns=columns)

def plot_comparison(comparison_df, results_dir):
    """Draw a 2x2 summary figure comparing all experiments and save it.

    Panels: horizontal bars for Accuracy and for F1-Score (top row), a grouped
    bar chart of four metrics and a heatmap of the same metrics (bottom row).
    The figure is written to ``comparison_summary.png`` inside ``results_dir``
    and then shown.

    Args:
        comparison_df: One row per experiment with an ``Experiment`` column
            plus ``Accuracy``, ``Precision``, ``Recall`` and ``F1-Score``.
        results_dir: Directory the PNG is saved into.

    Returns:
        ``comparison_df``, unchanged.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    experiment_names = comparison_df['Experiment']
    row_positions = range(len(comparison_df))

    # Top row: one horizontal bar chart per headline metric
    for ax, metric in zip(axes[0], ('Accuracy', 'F1-Score')):
        ax.barh(row_positions, comparison_df[metric])
        ax.set_yticks(row_positions)
        ax.set_yticklabels(experiment_names)
        ax.set_xlabel(metric)
        ax.set_title(f'{metric} Comparison')
        ax.grid(True, alpha=0.3)

    # Bottom-left: the four metrics side by side for each experiment
    metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
    grouped_ax = axes[1, 0]
    x = np.arange(len(comparison_df))
    width = 0.2
    for slot, metric in enumerate(metrics_to_plot):
        grouped_ax.bar(x + slot*width, comparison_df[metric], width, label=metric)

    # Centre each tick under its group of four bars
    grouped_ax.set_xticks(x + width*1.5)
    grouped_ax.set_xticklabels(experiment_names, rotation=45)
    grouped_ax.set_ylabel('Score')
    grouped_ax.set_title('All Metrics Comparison')
    grouped_ax.legend()
    grouped_ax.grid(True, alpha=0.3)

    # Bottom-right: same numbers as an annotated heatmap
    heatmap_ax = axes[1, 1]
    metrics_df = comparison_df.set_index('Experiment')[metrics_to_plot]
    sns.heatmap(metrics_df, annot=True, fmt='.3f', cmap='YlOrRd', ax=heatmap_ax)
    heatmap_ax.set_title('Metrics Heatmap')

    plt.tight_layout()
    summary_path = os.path.join(results_dir, 'comparison_summary.png')
    plt.savefig(summary_path, dpi=300, bbox_inches='tight')
    plt.show()

    return comparison_df

# Gather every saved experiment result and print/plot a side-by-side summary
print("\n" + "=" * 60)
print("RESULTS COMPARISON")
print("=" * 60)

# Root results directory (same value as Config.results_dir)
RESULTS_DIR = '/content/results'

# One row per experiment that has a results.json
comparison_df = load_and_compare_results(results_dir=RESULTS_DIR)

print("\nComparison DataFrame:")
print(comparison_df.to_string())

# Fixed-width text table of the headline numbers
print("\n FINAL RESULTS SUMMARY:")
print("-" * 80)
print(f"{'Experiment':<30} {'Accuracy':<10} {'F1-Score':<10} {'Model':<20} {'Preprocessing':<15}")
print("-" * 80)
for idx, row in comparison_df.iterrows():
    print(f"{row['Experiment']:<30} {row['Accuracy']:<10.4f} {row['F1-Score']:<10.4f} {row['Model']:<20} {row['Preprocessing']:<15}")
print("-" * 80)

# Best experiment = highest F1-Score; nothing to rank when the table is empty
if len(comparison_df) == 0:
    print("\n No results found. Please run the experiments first.")
else:
    best_idx = comparison_df['F1-Score'].idxmax()
    best_model = comparison_df.loc[best_idx]
    print(f"\n BEST MODEL: {best_model['Experiment']}")
    print(f"   Accuracy: {best_model['Accuracy']:.4f}")
    print(f"   F1-Score: {best_model['F1-Score']:.4f}")
    print(f"   Model: {best_model['Model']}")
    print(f"   Preprocessing: {best_model['Preprocessing']}")

    # Visual comparison
    plot_comparison(comparison_df, RESULTS_DIR)