# Model Training

## Environment Setup

In [None]:
import os
import sys
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
    set_seed
)

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

# True when the google.colab module is already loaded, i.e. the notebook is
# running inside Google Colab.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    # Mount Google Drive so that DATA_PATH below (under MyDrive) is reachable.
    from google.colab import drive
    drive.mount('/content/drive')
    # NOTE(review): this install runs after `from transformers import ...`
    # earlier in this cell, so it cannot rescue a missing transformers
    # install on a fresh runtime -- confirm whether it should move to the top.
    !pip install transformers accelerate -q
    DATA_PATH = '/content/drive/MyDrive/multi-class-phishing/scripts/'
else:
    # Outside Colab the data files are resolved relative to the working directory.
    DATA_PATH = './'

# Seed every random number source used by this notebook with one value.
SEED = 42
set_seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

## Configuration

In [None]:
# Short model key -> checkpoint name handed to `from_pretrained`.
MODEL_CONFIGS = dict(
    distilbert='distilbert-base-uncased',
    roberta='roberta-base',
    albert='albert-base-v2',
)

# Hyperparameters shared by every training run in this notebook.
TRAINING_CONFIG = dict(
    max_length=512,        # tokenizer max_length (truncation / padding target)
    batch_size=16,         # DataLoader batch size
    epochs=4,              # passes over the training set
    learning_rate=2e-5,    # optimizer learning rate
    warmup_ratio=0.1,      # fraction of total steps used for LR warmup
    weight_decay=0.01,     # optimizer weight decay
    max_grad_norm=1.0,     # gradient-norm clipping threshold
)

# Integer class id -> human-readable class name; ids follow list order from 0.
CLASS_LABELS = dict(enumerate([
    'Legitimate',
    'Credential Theft',
    'Financial Fraud',
    'Malware Delivery',
    'Personal Information Theft',
    'Other/Generic Phishing',
]))

## Data Loading

In [None]:
# Read both corpora from DATA_PATH: malicious first, then benign.
malicious_df, benign_df = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ('malicious_phishing_corpus.csv', 'benign_corpus.csv')
)

# Report the size of each corpus as loaded.
print(f"Malicious: {len(malicious_df):,} records")
print(f"Benign: {len(benign_df):,} records")

In [None]:
def prepare_dataset(malicious_df, benign_df):
    """Merge the benign and malicious corpora into one shuffled, labeled frame.

    Benign rows receive label 0. Malicious rows keep their
    ``annotation_label`` when it converts to an integer in 1-5; other rows
    are dropped. When the ``annotation_label`` column is missing, or no row
    carries a usable annotation, every malicious row is labeled 1 instead
    (binary classification).

    Args:
        malicious_df: DataFrame with a ``text_cleaned`` column and,
            optionally, an ``annotation_label`` column.
        benign_df: DataFrame with a ``text_cleaned`` column.

    Returns:
        DataFrame with ``text`` and ``label`` columns, free of missing or
        whitespace-only texts, shuffled with ``SEED`` and re-indexed from 0.
    """
    benign_data = benign_df[['text_cleaned']].copy()
    benign_data['label'] = 0
    benign_data = benign_data.rename(columns={'text_cleaned': 'text'})

    malicious_data = None
    # An un-annotated corpus may not have the column at all; indexing it
    # unconditionally would raise KeyError before the fallback is reached.
    if 'annotation_label' in malicious_df.columns:
        malicious_data = malicious_df[['text_cleaned', 'annotation_label']].copy()
        malicious_data = malicious_data.rename(
            columns={'text_cleaned': 'text', 'annotation_label': 'label'}
        )
        malicious_data['label'] = pd.to_numeric(malicious_data['label'], errors='coerce')
        malicious_data = malicious_data.dropna(subset=['label'])
        malicious_data['label'] = malicious_data['label'].astype(int)
        malicious_data = malicious_data[malicious_data['label'].isin([1, 2, 3, 4, 5])]

    if malicious_data is None or len(malicious_data) == 0:
        print("No annotations found. Using binary classification.")
        malicious_data = malicious_df[['text_cleaned']].copy()
        malicious_data = malicious_data.rename(columns={'text_cleaned': 'text'})
        malicious_data['label'] = 1

    combined_df = pd.concat([benign_data, malicious_data], ignore_index=True)
    # Drop missing texts first, then cast to str so the whitespace check also
    # works when the column is not object-typed (e.g. entirely numeric).
    combined_df = combined_df.dropna(subset=['text'])
    has_content = combined_df['text'].astype(str).str.strip() != ''
    combined_df = combined_df[has_content]
    combined_df = combined_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

    print(f"Total samples: {len(combined_df):,}")
    print(f"Class distribution: {combined_df['label'].value_counts().sort_index().to_dict()}")

    return combined_df

# Build the combined, shuffled text/label frame used by the rest of the notebook.
df = prepare_dataset(malicious_df, benign_df)

## Tokenization

In [None]:
class PhishingDataset(Dataset):
    """Dataset that tokenizes a single text on every item lookup.

    Args:
        texts: Indexable collection of texts; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of integer class ids, aligned with
            ``texts``.
        tokenizer: Callable tokenizer invoked with the text plus keyword
            options; its result must expose ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length the tokenizer is asked to truncate / pad to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return flattened ``input_ids`` / ``attention_mask`` and a long ``label``."""
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(str(self.texts[idx]), **tokenizer_options)

        # The tokenizer output is flattened so each sample is a 1-D tensor.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

## Data Splitting

In [None]:
def create_data_splits(df, test_size=0.15, val_size=0.15):
    """Split ``df`` into stratified train / validation / test frames.

    Args:
        df: DataFrame with ``text`` and ``label`` columns.
        test_size: Fraction of all rows held out for the test set.
        val_size: Fraction of all rows held out for the validation set.

    Returns:
        Tuple ``(train_df, val_df, test_df)``; each frame has ``text`` and
        ``label`` columns and a fresh default index.
    """
    all_texts, all_labels = df['text'].values, df['label'].values

    # Stage 1: carve the test set off the full data.
    rest_texts, test_texts, rest_labels, test_labels = train_test_split(
        all_texts,
        all_labels,
        test_size=test_size,
        stratify=all_labels,
        random_state=SEED,
    )

    # Stage 2: val_size is expressed against the full data, so rescale it to
    # the share of rows that remain after the test split.
    val_proportion = val_size / (1 - test_size)
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        rest_texts,
        rest_labels,
        test_size=val_proportion,
        stratify=rest_labels,
        random_state=SEED,
    )

    train_df, val_df, test_df = (
        pd.DataFrame({'text': part_texts, 'label': part_labels})
        for part_texts, part_labels in (
            (train_texts, train_labels),
            (val_texts, val_labels),
            (test_texts, test_labels),
        )
    )

    print(f"Train: {len(train_df):,} | Val: {len(val_df):,} | Test: {len(test_df):,}")

    return train_df, val_df, test_df

# Stratified split with the function's default 15% test / 15% validation hold-outs.
train_df, val_df, test_df = create_data_splits(df)

## Class Weights

In [None]:
# Size by the largest class id, not by the number of distinct labels: class
# ids index straight into the weight vector below, so a gap in the ids
# (e.g. {0, 1, 2, 5}) must not produce a vector that is too short.
num_classes = int(df['label'].max()) + 1
unique_classes = np.unique(train_df['label'].values)

# 'balanced' weights are computed from the training split only.
weights = compute_class_weight(
    class_weight='balanced',
    classes=unique_classes,
    y=train_df['label'].values
)

# Class ids absent from the training split keep a neutral weight of 1.0.
class_weights = np.ones(num_classes)
class_weights[unique_classes.astype(int)] = weights

class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
print(f"Class weights: {class_weights}")

## Training Functions

In [None]:
def train_epoch(model, dataloader, optimizer, scheduler, loss_fn, device, max_grad_norm=None):
    """Run one training pass over ``dataloader``.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``
            keyword arguments; its output must expose ``.logits``.
        dataloader: Sized iterable of batches, each a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.
        device: Device the batch tensors are moved to.
        max_grad_norm: Gradient-norm clipping threshold. When ``None``
            (the default) the value is read from
            ``TRAINING_CONFIG['max_grad_norm']``.

    Returns:
        Tuple ``(mean loss per batch, accuracy over all samples)``.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    if max_grad_norm is None:
        # Backward-compatible default: fall back to the notebook-wide setting.
        max_grad_norm = TRAINING_CONFIG['max_grad_norm']

    model.train()
    total_loss = 0.0
    correct = 0
    total = 0

    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()

        # Clip before stepping so the update uses the clipped gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        predictions = torch.argmax(outputs.logits, dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

    return total_loss / len(dataloader), correct / total


def evaluate(model, dataloader, loss_fn, device):
    """Score ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``
            keyword arguments; its output must expose ``.logits``.
        dataloader: Sized iterable of batches, each a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean loss per batch, accuracy, predictions, labels)`` where
        the last two are flat lists collected across all batches.
    """
    model.eval()
    running_loss = 0
    predicted, expected = [], []

    with torch.no_grad():
        for batch in dataloader:
            targets = batch['label'].to(device)
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits

            running_loss += loss_fn(logits, targets).item()
            # The highest-scoring class per row is the prediction.
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
            expected.extend(targets.cpu().numpy())

    accuracy = (np.asarray(predicted) == np.asarray(expected)).mean()
    mean_loss = running_loss / len(dataloader)
    return mean_loss, accuracy, predicted, expected

In [None]:
def train_model(model_key, train_df, val_df, num_classes, class_weights, config, device):
    """Fine-tune one pretrained classifier and return it at its best epoch.

    Args:
        model_key: Key into ``MODEL_CONFIGS`` selecting the checkpoint.
        train_df: DataFrame with ``text`` and ``label`` columns for training.
        val_df: DataFrame with ``text`` and ``label`` columns for validation.
        num_classes: Number of output labels for the classification head.
        class_weights: Per-class weight tensor for the cross-entropy loss.
        config: Mapping with ``max_length``, ``batch_size``, ``epochs``,
            ``learning_rate``, ``warmup_ratio`` and ``weight_decay``.
        device: Device the model and batches are placed on.

    Returns:
        Tuple ``(model, tokenizer, history)``. ``model`` carries the weights
        of the epoch with the highest validation accuracy; ``history`` maps
        ``train_loss`` / ``train_acc`` / ``val_loss`` / ``val_acc`` to
        per-epoch lists.
    """
    print(f"\n{'='*60}")
    print(f"Training: {model_key}")
    print(f"{'='*60}")

    model_name = MODEL_CONFIGS[model_key]
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_classes
    ).to(device)

    train_dataset = PhishingDataset(
        train_df['text'].values, train_df['label'].values,
        tokenizer, config['max_length']
    )
    val_dataset = PhishingDataset(
        val_df['text'].values, val_df['label'].values,
        tokenizer, config['max_length']
    )

    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False)

    # The scheduler is stepped once per batch, so its horizon is counted in batches.
    total_steps = len(train_loader) * config['epochs']
    warmup_steps = int(total_steps * config['warmup_ratio'])

    # NOTE(review): the transformers-provided AdamW is reported as deprecated
    # in favor of torch.optim.AdamW in recent releases -- confirm against the
    # installed transformers version before upgrading.
    optimizer = AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
    scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
    # Start below any reachable accuracy so the first epoch is always captured.
    best_val_acc = float('-inf')
    best_model_state = None

    for epoch in range(config['epochs']):
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, scheduler, loss_fn, device)
        val_loss, val_acc, _, _ = evaluate(model, val_loader, loss_fn, device)

        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        print(f"Epoch {epoch+1}/{config['epochs']} - "
              f"Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f} | "
              f"Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            # state_dict() hands back the live parameter tensors and
            # dict.copy() is shallow, so a plain copy keeps changing as
            # training continues. Clone each tensor (onto the CPU, to spare
            # accelerator memory) to freeze a real snapshot.
            best_model_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }

    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return model, tokenizer, history

## Training Execution

In [None]:
# trained_models = {}
# for model_key in ['distilbert', 'roberta', 'albert']:
#     model, tokenizer, history = train_model(
#         model_key, train_df, val_df,
#         num_classes, class_weights,
#         TRAINING_CONFIG, device
#     )
#     trained_models[model_key] = (model, tokenizer, history)

## Evaluation Metrics

In [None]:
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_recall_fscore_support, cohen_kappa_score
)
import matplotlib.pyplot as plt
import seaborn as sns

def compute_metrics(y_true, y_pred, class_labels):
    """Print and return classification metrics for one set of predictions.

    Args:
        y_true: Sequence of ground-truth class ids.
        y_pred: Sequence of predicted class ids, aligned with ``y_true``.
        class_labels: Mapping of class id to display name.

    Returns:
        Dict with scalar ``accuracy``, ``macro_f1``, ``weighted_f1`` and
        ``kappa``, plus per-class ``precision``, ``recall`` and ``f1`` arrays.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _support = precision_recall_fscore_support(y_true, y_pred, average=None)
    macro_f1 = precision_recall_fscore_support(y_true, y_pred, average='macro')[2]
    weighted_f1 = precision_recall_fscore_support(y_true, y_pred, average='weighted')[2]
    kappa = cohen_kappa_score(y_true, y_pred)

    # Name only the classes that actually occur. Handing the report every
    # name in class_labels raises ValueError whenever a class is missing from
    # both y_true and y_pred (for example in the binary fallback).
    present_ids = sorted(set(y_true) | set(y_pred))
    present_names = [class_labels.get(class_id, str(class_id)) for class_id in present_ids]

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"Weighted F1: {weighted_f1:.4f}")
    print(f"Cohen's Kappa: {kappa:.4f}")
    print(f"\nClassification Report:")
    print(classification_report(y_true, y_pred, labels=present_ids, target_names=present_names))

    return {
        'accuracy': accuracy,
        'macro_f1': macro_f1,
        'weighted_f1': weighted_f1,
        'kappa': kappa,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

def plot_confusion_matrix(y_true, y_pred, class_labels, normalize=False, save_path=None):
    """Draw, save and show a confusion-matrix heatmap.

    Args:
        y_true: Sequence of ground-truth class ids.
        y_pred: Sequence of predicted class ids, aligned with ``y_true``.
        class_labels: Mapping of class id to display name; its sorted keys
            fix the rows and columns of the matrix.
        normalize: When true, each row is divided by its total so cells show
            the fraction of that actual class.
        save_path: Where to write the PNG. Defaults to
            ``'confusion_matrix.png'`` for raw counts and
            ``'confusion_matrix_normalized.png'`` for the normalized plot, so
            the two variants no longer overwrite each other.
    """
    label_ids = sorted(class_labels.keys())
    labels = [class_labels[i] for i in label_ids]
    # Pin the matrix to the full label set so it always lines up with the
    # tick labels, even when some classes are absent from the data.
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows with no samples stay at 0 instead of becoming NaN from 0 / 0.
        cm = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals > 0,
        )
        fmt = '.2f'
        title = 'Confusion Matrix (Normalized)'
        default_path = 'confusion_matrix_normalized.png'
    else:
        fmt = 'd'
        title = 'Confusion Matrix'
        default_path = 'confusion_matrix.png'

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt=fmt, cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(title)
    plt.tight_layout()
    plt.savefig(save_path if save_path is not None else default_path, dpi=300)
    plt.show()

## K-Fold Cross-Validation

In [None]:
from sklearn.model_selection import StratifiedKFold

def cross_validate_model(model_key, df, num_classes, config, device, n_folds=5):
    """Train ``model_key`` once per stratified fold and summarize the results.

    Args:
        model_key: Key into ``MODEL_CONFIGS`` selecting the checkpoint.
        df: DataFrame with ``text`` and ``label`` columns.
        num_classes: Number of output labels; also the length of the
            per-fold class-weight vector.
        config: Training configuration forwarded to ``train_model``.
        device: Device used for the weight tensor and training.
        n_folds: Number of stratified folds.

    Returns:
        List with one dict per fold holding ``fold``, ``best_val_acc``,
        ``final_train_loss`` and ``final_val_loss``.
    """
    print(f"\n{'='*60}")
    print(f"Cross-Validation: {model_key} ({n_folds} folds)")
    print(f"{'='*60}")

    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=SEED)
    fold_results = []

    for fold, (train_idx, val_idx) in enumerate(splitter.split(df['text'], df['label'])):
        print(f"\nFold {fold + 1}/{n_folds}")

        fold_train = df.iloc[train_idx].reset_index(drop=True)
        fold_val = df.iloc[val_idx].reset_index(drop=True)

        # Balanced weights come from this fold's training labels only;
        # classes missing from the fold keep a weight of 1.
        fold_labels = fold_train['label'].values
        present_classes = np.unique(fold_labels)
        balanced = compute_class_weight('balanced', classes=present_classes, y=fold_labels)
        weight_vector = np.ones(num_classes)
        weight_vector[present_classes.astype(int)] = balanced
        fold_weights = torch.tensor(weight_vector, dtype=torch.float).to(device)

        model, tokenizer, history = train_model(
            model_key, fold_train, fold_val, num_classes, fold_weights, config, device
        )

        fold_results.append(dict(
            fold=fold + 1,
            best_val_acc=max(history['val_acc']),
            final_train_loss=history['train_loss'][-1],
            final_val_loss=history['val_loss'][-1],
        ))

        # Drop this fold's model before the next one is created.
        del model, tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    fold_accuracies = [result['best_val_acc'] for result in fold_results]
    avg_acc = np.mean(fold_accuracies)
    std_acc = np.std(fold_accuracies)
    print(f"\nCross-Validation Results: {avg_acc:.4f} (+/- {std_acc:.4f})")

    return fold_results

## Model Comparison

In [None]:
def evaluate_on_test(model, tokenizer, test_df, class_weights, config, device, class_labels):
    """Evaluate one trained model on the test frame and plot its confusion matrices.

    Args:
        model: Trained classifier.
        tokenizer: Tokenizer paired with ``model``.
        test_df: DataFrame with ``text`` and ``label`` columns.
        class_weights: Per-class weight tensor for the cross-entropy loss.
        config: Mapping providing ``max_length`` and ``batch_size``.
        device: Device used for evaluation.
        class_labels: Mapping of class id to display name.

    Returns:
        The metrics dict produced by ``compute_metrics``.
    """
    loader = DataLoader(
        PhishingDataset(
            test_df['text'].values,
            test_df['label'].values,
            tokenizer,
            config['max_length'],
        ),
        batch_size=config['batch_size'],
        shuffle=False,
    )
    criterion = nn.CrossEntropyLoss(weight=class_weights)

    # Only the predictions and labels are needed; loss and accuracy are discarded.
    *_, y_pred, y_true = evaluate(model, loader, criterion, device)

    metrics = compute_metrics(y_true, y_pred, class_labels)
    # Raw counts first, then the row-normalized view.
    for use_normalized in (False, True):
        plot_confusion_matrix(y_true, y_pred, class_labels, normalize=use_normalized)

    return metrics

def compare_models(trained_models, test_df, class_weights, config, device, class_labels):
    """Evaluate every trained model on the test set and print a comparison table.

    Args:
        trained_models: Mapping of model key to a
            ``(model, tokenizer, history)`` tuple.
        test_df: DataFrame with ``text`` and ``label`` columns.
        class_weights: Per-class weight tensor for the cross-entropy loss.
        config: Mapping providing ``max_length`` and ``batch_size``.
        device: Device used for evaluation.
        class_labels: Mapping of class id to display name.

    Returns:
        Dict mapping each model key to its metrics dict; empty when
        ``trained_models`` is empty.
    """
    results = {}

    for model_key, (model, tokenizer, _history) in trained_models.items():
        print(f"\n{'='*60}")
        print(f"Evaluating: {model_key}")
        print(f"{'='*60}")

        metrics = evaluate_on_test(model, tokenizer, test_df, class_weights, config, device, class_labels)
        results[model_key] = metrics

    # Summary table
    print(f"\n{'='*60}")
    print("Model Comparison Summary")
    print(f"{'='*60}")
    print(f"{'Model':<15} {'Accuracy':<10} {'Macro F1':<10} {'Weighted F1':<12} {'Kappa':<10}")
    print("-" * 57)
    for model_key, metrics in results.items():
        print(f"{model_key:<15} {metrics['accuracy']:<10.4f} {metrics['macro_f1']:<10.4f} "
              f"{metrics['weighted_f1']:<12.4f} {metrics['kappa']:<10.4f}")

    # max() raises ValueError on an empty mapping, so only pick a winner when
    # at least one model was evaluated.
    if results:
        best_model = max(results, key=lambda k: results[k]['macro_f1'])
        print(f"\nBest model (by Macro F1): {best_model}")
    else:
        print("\nNo trained models to compare.")

    return results

## Run Evaluation

In [None]:
# Evaluate trained models on test set
# results = compare_models(trained_models, test_df, class_weights, TRAINING_CONFIG, device, CLASS_LABELS)

# Cross-validation (optional)
# cv_results = {}
# for model_key in ['distilbert', 'roberta', 'albert']:
#     cv_results[model_key] = cross_validate_model(
#         model_key, df, num_classes, TRAINING_CONFIG, device, n_folds=5
#     )