# Modeling

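This notebook trains three classifiers (Random Forest, XGBoost, and Logistic Regression) on the extracted features, compares their test-set performance, inspects their errors through confusion matrices and ROC curves, analyzes Random Forest feature importance, and saves the best-performing model.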


In [None]:
# Standard-library and third-party dependencies for the notebook.
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# kernel session, including convergence and deprecation warnings raised by the
# models trained below. Consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

# Append the parent of the current working directory to the import path before
# importing the project-local `src` package and `config` module.
# Presumably the notebook is launched from a sub-directory of the project root;
# verify, because Path() resolves against the kernel's working directory.
sys.path.append(str(Path().resolve().parent))
from src import models, data_preprocessing, feature_extractor, network_builder, visualization
import config

print("Imports successful!")


## Prepare Data and Features


In [None]:
# Create the 1000-row sample dataset that the rest of the notebook works on
print("Loading data...")
df = data_preprocessing.create_sample_dataset(n_samples=1000)

# Column mapping and options handed to the preprocessing step, which returns
# the train / validation / test frames.
# save_processed=False presumably skips writing the processed data to disk;
# confirm against data_preprocessing.preprocess_dataset.
print("Preprocessing data...")
preprocess_options = {
    "text_column": "text",
    "label_column": "label",
    "timestamp_column": "timestamp",
    "save_processed": False,
}
train_df, val_df, test_df = data_preprocessing.preprocess_dataset(df, **preprocess_options)

# Report how many rows ended up in each split
n_train, n_val, n_test = len(train_df), len(val_df), len(test_df)
print(f"Train: {n_train}, Val: {n_val}, Test: {n_test}")


In [None]:
# Extract features
print("Extracting features...")
extractor = feature_extractor.FeatureExtractor(use_bert=False)

# Every split goes through the same extractor with the same column mapping, so
# the call is written once and applied to train, val and test in that order.
split_frames = {"train": train_df, "val": val_df, "test": test_df}
split_features = {
    split_name: extractor.extract_all_features(
        split_df,
        text_column="text",
        user_column="user_id",
        timestamp_column="timestamp",
    )
    for split_name, split_df in split_frames.items()
}
train_features = split_features["train"]
val_features = split_features["val"]
test_features = split_features["test"]

# The matrices below are positional: rows are paired with labels by position and
# columns are paired with the training columns by position. Fail loudly here
# rather than train or evaluate on silently misaligned data.
expected_columns = list(train_features.columns)
for split_name, split_df in split_frames.items():
    features = split_features[split_name]
    assert len(features) == len(split_df), (
        f"{split_name}: {len(features)} feature rows for {len(split_df)} labelled rows"
    )
    assert list(features.columns) == expected_columns, (
        f"{split_name}: feature columns differ from the training feature columns"
    )

# Prepare X and y
X_train = train_features.values
y_train = train_df['label'].values
X_val = val_features.values
y_val = val_df['label'].values
X_test = test_features.values
y_test = test_df['label'].values

print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test.shape}")
print(f"Feature names: {list(train_features.columns)[:10]}...")


## Train Random Forest Model


In [None]:
# Fit the Random Forest model; the validation split is passed to train()
# together with the training split.
print("Training Random Forest...")
rf_params = {"n_estimators": 100, "max_depth": 20}
rf_model = models.TraditionalMLModel("random_forest", **rf_params)
rf_model.train(X_train, y_train, X_val, y_val)

# Score on the test split and print only the int/float-valued metrics
rf_results = rf_model.evaluate(X_test, y_test)
print("\nRandom Forest Results:")
print("="*50)
rf_scalar_metrics = (
    (name, score)
    for name, score in rf_results.items()
    if isinstance(score, (int, float))
)
for name, score in rf_scalar_metrics:
    print(f"{name:20s}: {score:.4f}")


## Train XGBoost Model


In [None]:
# Fit the XGBoost model; the validation split is passed to train()
# together with the training split.
print("Training XGBoost...")
xgb_params = {"n_estimators": 100, "max_depth": 6}
xgb_model = models.TraditionalMLModel("xgboost", **xgb_params)
xgb_model.train(X_train, y_train, X_val, y_val)

# Score on the test split and print only the int/float-valued metrics
xgb_results = xgb_model.evaluate(X_test, y_test)
print("\nXGBoost Results:")
print("="*50)
xgb_scalar_metrics = (
    (name, score)
    for name, score in xgb_results.items()
    if isinstance(score, (int, float))
)
for name, score in xgb_scalar_metrics:
    print(f"{name:20s}: {score:.4f}")


## Train Logistic Regression Model


In [None]:
# Fit the Logistic Regression model with the wrapper's default settings; the
# validation split is passed to train() together with the training split.
print("Training Logistic Regression...")
lr_model = models.TraditionalMLModel("logistic_regression")
lr_model.train(X_train, y_train, X_val, y_val)

# Score on the test split and print only the int/float-valued metrics
lr_results = lr_model.evaluate(X_test, y_test)
print("\nLogistic Regression Results:")
print("="*50)
lr_scalar_metrics = (
    (name, score)
    for name, score in lr_results.items()
    if isinstance(score, (int, float))
)
for name, score in lr_scalar_metrics:
    print(f"{name:20s}: {score:.4f}")


## Model Comparison


In [None]:
# Collect each model's test-set metrics under its display name
model_labels = ("Random Forest", "XGBoost", "Logistic Regression")
model_metrics = (rf_results, xgb_results, lr_results)
results = dict(zip(model_labels, model_metrics))

# One comparison plot per metric, F1 first and accuracy second
for comparison_metric in ("f1", "accuracy"):
    visualization.plot_model_comparison(results, metric=comparison_metric)


## Confusion Matrices


In [None]:
# Predict on the test split with every model before plotting anything
y_pred_rf = rf_model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)
y_pred_lr = lr_model.predict(X_test)

# Heading printed above each plot, paired with that model's predictions.
# Only the first heading has no leading blank line.
confusion_plots = (
    ("Random Forest Confusion Matrix:", y_pred_rf),
    ("\nXGBoost Confusion Matrix:", y_pred_xgb),
    ("\nLogistic Regression Confusion Matrix:", y_pred_lr),
)
for heading, predictions in confusion_plots:
    print(heading)
    visualization.plot_confusion_matrix(y_test, predictions, class_names=["Real", "Fake"])


## ROC Curves


In [None]:
# Take column 1 of each model's predict_proba output on the test split.
# Presumably that is the "Fake" class, matching the ["Real", "Fake"] order used
# for the confusion matrices; confirm against the label encoding.
y_proba_rf, y_proba_xgb, y_proba_lr = (
    fitted_model.predict_proba(X_test)[:, 1]
    for fitted_model in (rf_model, xgb_model, lr_model)
)

# One ROC plot per model, labelled with the model's display name
roc_plots = (
    ("Random Forest", y_proba_rf),
    ("XGBoost", y_proba_xgb),
    ("Logistic Regression", y_proba_lr),
)
for model_label, scores in roc_plots:
    visualization.plot_roc_curve(y_test, scores, model_label)


## Feature Importance


In [None]:
# Plot the 15 highest-importance features of the fitted Random Forest
feature_names = train_features.columns.tolist()
forest = rf_model.model
visualization.plot_feature_importance(forest, feature_names, top_n=15)

# List the same ranking as text: argsort is ascending, so keep the last 15
# positions and reverse them to get the largest importance first.
importances = forest.feature_importances_
indices = np.argsort(importances)[-15:][::-1]
print("\nTop 15 Most Important Features:")
print("="*50)
ranked_features = ((feature_names[pos], importances[pos]) for pos in indices)
for name, weight in ranked_features:
    print(f"{name:30s}: {weight:.4f}")


## Save Best Model


In [None]:
# Save the best performing model
# Every trained model is a candidate, including Logistic Regression, which the
# comparison plots cover as well. Candidates are ranked by test-set F1.
# max() returns the first maximal entry, so on a tie the earlier entry wins
# (Random Forest over XGBoost, as before).
# NOTE(review): selecting on the test split makes the reported test score of the
# saved model optimistic; consider ranking on validation metrics instead.
candidates = {
    "Random Forest": (rf_model, rf_results),
    "XGBoost": (xgb_model, xgb_results),
    "Logistic Regression": (lr_model, lr_results),
}
best_model_name = max(candidates, key=lambda name: candidates[name][1]['f1'])
best_model = candidates[best_model_name][0]

# The path is relative to the kernel's working directory; create the target
# directory first so save() does not fail on a fresh checkout.
model_path = Path("../models/best_model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
best_model.save(model_path)

print(f"Saved best model ({best_model_name}) to {model_path}")
