In [None]:
# Baseline Machine Learning Models for ADR Prediction
# This notebook trains and evaluates classical ML models as baselines

import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score,
    classification_report, confusion_matrix, roc_curve,
    precision_recall_curve
)
# Notebook-wide setup: silence every warning category for the whole session
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn grid theme plus a wide default figure size
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# =============================================================================
# 1. LOAD DATA
# =============================================================================

# Project root. Defaults to the original author's checkout; set the
# ADR_BASE_DIR environment variable to run the notebook from another location.
BASE_DIR = os.environ.get(
    "ADR_BASE_DIR",
    "/Users/hithaishireddy/Desktop/ADR-project/Prediction-Model-for-Adverse-Drug-Reactions-Using-Deep-Learning-Methods",
)
DATA_DIR = f"{BASE_DIR}/processed_data"


def _load_split(split):
    """Read the feature table and flattened label vector for one data split.

    Args:
        split: Split suffix used in the file names, e.g. "train" reads
            X_train.csv and y_train.csv from DATA_DIR.

    Returns:
        A (features, labels) tuple: the features as a DataFrame and the
        labels as a 1-D NumPy array.

    Raises:
        ValueError: If the two files do not contain the same number of rows,
            which would otherwise only surface later as a fit/metric error.
    """
    features = pd.read_csv(f"{DATA_DIR}/X_{split}.csv")
    labels = pd.read_csv(f"{DATA_DIR}/y_{split}.csv").values.ravel()
    if len(features) != len(labels):
        raise ValueError(
            f"Row count mismatch for split '{split}': "
            f"{len(features)} feature rows vs {len(labels)} labels"
        )
    return features, labels


print("Loading preprocessed data...")
X_train, y_train = _load_split("train")
X_val, y_val = _load_split("val")
X_test, y_test = _load_split("test")

# Sanity check: split shapes and the positive-label rate of each split
print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
print(f"ADR rate - Train: {y_train.mean():.3f}, Val: {y_val.mean():.3f}, Test: {y_test.mean():.3f}")

# =============================================================================
# 2. TRAIN BASELINE MODELS
# =============================================================================

# Calculate class weight for imbalanced data: the positive class is weighted
# by the negative/positive count ratio of the training labels.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    # Warnings are silenced in this notebook, so a zero division would
    # otherwise pass an infinite weight to XGBoost without any message.
    raise ValueError(
        "y_train contains no positive (label 1) samples; "
        "cannot compute the class weight ratio"
    )
class_weight = {0: 1.0, 1: n_negative / n_positive}
print(f"\nClass weight: {class_weight}")

# Define models
# Imbalance handling differs per model: Logistic Regression and Random Forest
# use class_weight='balanced', XGBoost receives the ratio computed above via
# scale_pos_weight, and Gradient Boosting is given no weighting at all.
# NOTE(review): Gradient Boosting is therefore the only unweighted baseline —
# confirm this is intended when comparing scores across models.
models = {
    'Logistic Regression': LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=20,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        random_state=42
    ),
    'XGBoost': XGBClassifier(
        n_estimators=100,
        max_depth=7,
        learning_rate=0.1,
        scale_pos_weight=class_weight[1],
        random_state=42,
        n_jobs=-1,
        eval_metric='logloss'
    )
}

# Fit every baseline on the training split, then score it on val and test.
# `results` maps model name -> fitted estimator, the four ranking metrics and
# the raw test scores (kept so the curves can be plotted later).
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)

    # Column 1 of predict_proba — the positive (ADR) class for 0/1 labels
    val_scores = model.predict_proba(X_val)[:, 1]
    test_scores = model.predict_proba(X_test)[:, 1]

    entry = {
        'model': model,
        'val_auroc': roc_auc_score(y_val, val_scores),
        'val_auprc': average_precision_score(y_val, val_scores),
        'test_auroc': roc_auc_score(y_test, test_scores),
        'test_auprc': average_precision_score(y_test, test_scores),
        'y_test_proba': test_scores,
    }
    results[name] = entry

    print(f"Val AUROC: {entry['val_auroc']:.4f}, Val AUPRC: {entry['val_auprc']:.4f}")
    print(f"Test AUROC: {entry['test_auroc']:.4f}, Test AUPRC: {entry['test_auprc']:.4f}")

# =============================================================================
# 3. RESULTS SUMMARY
# =============================================================================

banner = "=" * 80
print("\n" + banner)
print("BASELINE MODELS SUMMARY")
print(banner)

# One table row per trained model, in training order; the column list pins
# both the column order and the metric each column is read from.
summary_columns = {
    'Val AUROC': 'val_auroc',
    'Val AUPRC': 'val_auprc',
    'Test AUROC': 'test_auroc',
    'Test AUPRC': 'test_auprc',
}
summary_rows = [
    {'Model': model_name,
     **{column: scores[key] for column, key in summary_columns.items()}}
    for model_name, scores in results.items()
]
results_df = pd.DataFrame(summary_rows, columns=['Model', *summary_columns])

print("\n", results_df.to_string(index=False))

# =============================================================================
# 4. VISUALIZATIONS
# =============================================================================

# Side-by-side test-set ROC (left) and precision-recall (right) curves,
# one line per baseline model.
fig, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(12, 5))

# ROC Curves
for name, metrics in results.items():
    fpr, tpr, _ = roc_curve(y_test, metrics['y_test_proba'])
    roc_ax.plot(fpr, tpr, label=f"{name} (AUC={metrics['test_auroc']:.3f})", linewidth=2)

# Diagonal reference line for a random classifier
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves - Baseline Models')
roc_ax.legend()
roc_ax.grid(alpha=0.3)

# PR Curves
for name, metrics in results.items():
    precision, recall, _ = precision_recall_curve(y_test, metrics['y_test_proba'])
    pr_ax.plot(recall, precision, label=f"{name} (AUPRC={metrics['test_auprc']:.3f})", linewidth=2)

pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curves - Baseline Models')
pr_ax.legend()
pr_ax.grid(alpha=0.3)

fig.tight_layout()

# savefig does not create missing directories, so make sure the output
# folder exists before writing the figure.
results_dir = f"{BASE_DIR}/results"
os.makedirs(results_dir, exist_ok=True)
fig.savefig(f"{results_dir}/baseline_models_comparison.png", dpi=300, bbox_inches='tight')
plt.show()

# =============================================================================
# 5. BEST MODEL ANALYSIS
# =============================================================================

# Pick the winner on the VALIDATION split. Selecting on test AUROC would use
# the test set for model selection and make the test metrics reported below
# optimistically biased.
best_model_name = max(results, key=lambda model_name: results[model_name]['val_auroc'])
best_model_results = results[best_model_name]

print(f"\n{'='*80}")
print(f"BEST MODEL: {best_model_name}")
print(f"{'='*80}")

# Predictions at threshold 0.5
y_test_pred = (best_model_results['y_test_proba'] >= 0.5).astype(int)

# Label order is pinned to [0, 1] so the report rows and the matrix axes
# always line up with the two display names, even if one class is never
# predicted or never present in the test labels.
class_labels = [0, 1]
class_names = ['No ADR', 'ADR']

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred, labels=class_labels, target_names=class_names))

# Confusion matrix (rows = true label, columns = predicted label)
cm = confusion_matrix(y_test, y_test_pred, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title(f'Confusion Matrix - {best_model_name}')

# savefig does not create missing directories, so make sure the output
# folder exists before writing the figure.
results_dir = f"{BASE_DIR}/results"
os.makedirs(results_dir, exist_ok=True)
plt.savefig(f"{results_dir}/best_baseline_confusion_matrix.png", dpi=300, bbox_inches='tight')
plt.show()

# Feature importance (if available)
# Only estimators exposing `feature_importances_` are plotted; any other
# best model skips this section silently.
best_estimator = best_model_results['model']
if hasattr(best_estimator, 'feature_importances_'):
    feature_names = X_train.columns
    importances = best_estimator.feature_importances_

    # Show at most 10 bars, but never more than the number of features:
    # a fixed range(10) would not match the data length on narrow datasets.
    top_k = min(10, len(importances))
    indices = np.argsort(importances)[::-1][:top_k]  # descending importance

    plt.figure(figsize=(10, 6))
    plt.bar(range(top_k), importances[indices])
    plt.xticks(range(top_k), [feature_names[i] for i in indices], rotation=45, ha='right')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.title(f'Top {top_k} Feature Importances - {best_model_name}')
    plt.tight_layout()

    # savefig does not create missing directories, so make sure the output
    # folder exists before writing the figure.
    results_dir = f"{BASE_DIR}/results"
    os.makedirs(results_dir, exist_ok=True)
    plt.savefig(f"{results_dir}/feature_importance.png", dpi=300, bbox_inches='tight')
    plt.show()

print("\n" + "="*80)
print("Baseline model evaluation complete!")
print("="*80)

Patients: (299712, 6)
Prescriptions (sample): (200000, 21)
Diagnoses (sample): (200000, 5)


  prescriptions = pd.read_csv(
