# Linear Models Category - Kaggle Playground Series S5E8

**Category**: Linear Models  
**Sub-models**: LogisticRegression (lbfgs, saga + L1/L2/ElasticNet), RidgeClassifier, SGDClassifier, Perceptron, LinearSVC  
**Split Strategy**: 70/30 stratified split  
**Cross-Validation**: 5-fold StratifiedKFold  
**Random Seed**: 42  
**Artifact Paths**: ../outputs/linear_models/ (relative to the notebook's working directory)  

This notebook compares multiple linear model variants using the same data preprocessing and evaluation protocol.

In [None]:
# Bootstrap installation and imports
%pip install numpy pandas scikit-learn matplotlib shap xgboost lightgbm catboost tensorflow --quiet

import json
import os
import pickle
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, Perceptron
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score, accuracy_score,
    precision_score, recall_score, log_loss, roc_curve, precision_recall_curve,
    confusion_matrix
)
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
warnings.filterwarnings('ignore')

# Pin the stdlib and NumPy global RNGs to one fixed seed so that repeated
# runs of the notebook draw the same random numbers.
random.seed(42)
np.random.seed(42)

# Create a local 'outputs' folder next to the notebook if it is missing.
os.makedirs('outputs', exist_ok=True)

print("Linear Models Category - Setup Complete")
print(f"Working directory: {os.getcwd()}")

In [None]:
# Read the competition CSVs; both paths are relative to the working directory.
print("Loading data...")
train_df = pd.read_csv('../playground-series-s5e8/train.csv')
test_df = pd.read_csv('../playground-series-s5e8/test.csv')

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Columns: {train_df.columns.tolist()}")
print(f"Target distribution: {train_df['target'].value_counts().to_dict()}")

# Model inputs are every training column except the 'id' and 'target'
# columns; the label vector is the 'target' column itself.
feature_cols = [
    name for name in train_df.columns
    if name != 'id' and name != 'target'
]
y = train_df['target']
X = train_df[feature_cols]

print(f"Features: {len(feature_cols)}")
print(f"Feature columns: {feature_cols[:10]}...")  # only the first 10 names

In [None]:
# Hold out 30% of the labelled rows as a final test set. The split is
# stratified on y and uses a fixed random_state, so it is repeatable.
split_parts = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_train_pool, X_test_holdout, y_train_pool, y_test_holdout = split_parts

print(f"Train pool: {X_train_pool.shape}")
print(f"Test holdout: {X_test_holdout.shape}")
print(f"Train pool target distribution: {y_train_pool.value_counts().to_dict()}")
print(f"Test holdout target distribution: {y_test_holdout.value_counts().to_dict()}")

# Shared fold generator: 5 shuffled, stratified folds with a fixed seed.
# The same object is reused by the grid search and the per-fold analysis.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
print("\nCross-validation setup: 5-fold StratifiedKFold")

In [None]:
# Define linear models and their hyperparameter grids.
#
# Each entry maps a display name to:
#   'estimator'  - an unfitted classifier that becomes the 'classifier' step
#                  of the preprocessing pipeline, and
#   'param_grid' - the GridSearchCV grid; keys are prefixed 'classifier__'
#                  so they address that pipeline step.
#
# NOTE: the spellings below (loss='log_loss', penalty=None, the
# CalibratedClassifierCV 'estimator' parameter) follow the current
# scikit-learn API; the older 'none' / 'base_estimator' names were removed.
models_config = {
    'LogisticRegression_lbfgs': {
        'estimator': LogisticRegression(solver='lbfgs', max_iter=2000, random_state=42),
        'param_grid': {
            'classifier__C': [0.1, 1, 10],
            # None (not the string 'none') selects the unpenalised model;
            # C has no effect for those candidates.
            'classifier__penalty': ['l2', None]
        }
    },
    'LogisticRegression_saga_l1': {
        'estimator': LogisticRegression(solver='saga', max_iter=2000, random_state=42),
        'param_grid': {
            'classifier__C': [0.1, 1, 10],
            'classifier__penalty': ['l1']
        }
    },
    'LogisticRegression_saga_elasticnet': {
        'estimator': LogisticRegression(solver='saga', max_iter=2000, random_state=42),
        'param_grid': {
            'classifier__C': [0.1, 1, 10],
            'classifier__penalty': ['elasticnet'],
            'classifier__l1_ratio': [0.5]
        }
    },
    'RidgeClassifier': {
        'estimator': RidgeClassifier(random_state=42),
        'param_grid': {
            'classifier__alpha': [0.1, 1.0, 10.0]
        }
    },
    'SGDClassifier': {
        # l1_ratio is only used with penalty='elasticnet'; with the default
        # 'l2' penalty the l1_ratio grid below would fit identical models.
        'estimator': SGDClassifier(loss='log_loss', penalty='elasticnet', random_state=42),
        'param_grid': {
            'classifier__alpha': [0.0001, 0.001, 0.01],
            'classifier__l1_ratio': [0.0, 0.5, 1.0]
        }
    },
    'Perceptron': {
        # Perceptron applies no penalty by default, which makes alpha a
        # no-op; enable L2 so the alpha grid actually changes the model.
        'estimator': Perceptron(penalty='l2', random_state=42),
        'param_grid': {
            'classifier__alpha': [0.0001, 0.001, 0.01]
        }
    },
    'LinearSVC': {
        # LinearSVC has no predict_proba, so it is wrapped in a sigmoid
        # calibrator fitted with 3 internal folds.
        'estimator': CalibratedClassifierCV(
            LinearSVC(random_state=42, max_iter=2000),
            method='sigmoid', cv=3
        ),
        'param_grid': {
            # The wrapped model is reached through the calibrator's
            # 'estimator' parameter (formerly 'base_estimator').
            'classifier__estimator__C': [0.1, 1, 10]
        }
    }
}

print(f"Configured {len(models_config)} linear model variants:")
for name in models_config:
    print(f"  - {name}")

In [None]:
# Helper functions
def create_pipeline(estimator):
    """Wrap ``estimator`` in the shared preprocessing pipeline.

    The returned Pipeline has three named steps, in this order:
    'imputer' (median SimpleImputer), 'scaler' (StandardScaler) and
    'classifier' (the estimator object passed in, not a copy).
    """
    preprocessing_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    final_step = ('classifier', estimator)
    return Pipeline(preprocessing_steps + [final_step])

def get_probabilities(estimator, X):
    """Return a positive-class score in [0, 1] for every row of ``X``.

    Args:
        estimator: Fitted object exposing ``predict_proba`` and/or
            ``decision_function``.
        X: Samples to score; passed through to the estimator unchanged.

    Returns:
        1-D array. Column 1 of ``predict_proba`` when that method exists;
        otherwise the logistic sigmoid of ``decision_function``. The
        sigmoid output is a squashed margin, not a calibrated probability.

    Raises:
        ValueError: If the estimator exposes neither method.
    """
    if hasattr(estimator, 'predict_proba'):
        return estimator.predict_proba(X)[:, 1]

    if hasattr(estimator, 'decision_function'):
        decision = np.asarray(estimator.decision_function(X), dtype=float)
        # sigmoid(d) = exp(-log(1 + exp(-d))). logaddexp evaluates the inner
        # log without overflowing, unlike 1 / (1 + exp(-d)), which overflows
        # in exp() for large negative margins.
        return np.exp(-np.logaddexp(0.0, -decision))

    raise ValueError("Estimator doesn't support probability prediction")

def compute_metrics(y_true, y_prob, threshold=0.5):
    """Score predictions with ranking, threshold and probability metrics.

    ``y_prob`` is turned into hard 0/1 labels with ``y_prob >= threshold``.
    ROC AUC, average precision and log loss are computed from ``y_prob``
    itself; F1, accuracy, precision and recall use the hard labels.

    Returns a dict keyed 'roc_auc', 'average_precision', 'f1', 'accuracy',
    'precision', 'recall' and 'logloss'.
    """
    hard_labels = (y_prob >= threshold).astype(int)

    # Metrics computed directly from the continuous scores.
    scores = {
        'roc_auc': roc_auc_score(y_true, y_prob),
        'average_precision': average_precision_score(y_true, y_prob),
    }

    # Metrics that depend on the chosen decision threshold.
    label_metrics = (
        ('f1', f1_score),
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    for key, metric_fn in label_metrics:
        scores[key] = metric_fn(y_true, hard_labels)

    scores['logloss'] = log_loss(y_true, y_prob)
    return scores

def find_best_threshold(y_true, y_prob):
    """Return the ROC threshold that maximises Youden's J (TPR - FPR).

    Args:
        y_true: Binary ground-truth labels.
        y_prob: Continuous scores for the positive class.

    Returns:
        A threshold taken from ``roc_curve``'s threshold array.
    """
    fpr, tpr, thresholds = roc_curve(y_true, y_prob)
    j_scores = tpr - fpr

    if len(thresholds) > 1:
        # roc_curve prepends a sentinel threshold at index 0 that
        # corresponds to "predict nothing positive" (TPR = FPR = 0, J = 0).
        # A plain argmax picks it whenever no real threshold has J > 0, so
        # the search is restricted to the thresholds after it.
        best_idx = int(np.argmax(j_scores[1:])) + 1
    else:
        best_idx = 0
    return thresholds[best_idx]

print("Helper functions defined")

In [None]:
# Main evaluation loop.
#
# For every configured model:
#   1. grid-search the hyperparameters with the shared CV splitter (ROC AUC),
#   2. refit the winning configuration on each CV fold to collect per-fold
#      metrics, curves and a Youden-J threshold,
#   3. score the grid search's refit best estimator on the holdout set using
#      the mean of the per-fold thresholds.
results = {}        # model name -> metrics, curves, best params, output dir
all_cv_results = [] # flat list with one row per (model, fold)

for model_name, config in models_config.items():
    print(f"\n{'='*60}")
    print(f"Evaluating: {model_name}")
    print(f"{'='*60}")

    # Create directories for this model
    model_dir = f"../outputs/linear_models/{model_name}"
    os.makedirs(f"{model_dir}/logs", exist_ok=True)
    os.makedirs(f"{model_dir}/models", exist_ok=True)
    os.makedirs(f"{model_dir}/figures", exist_ok=True)

    # Create pipeline
    pipeline = create_pipeline(config['estimator'])

    # Cross-validation with hyperparameter tuning
    grid_search = GridSearchCV(
        pipeline, config['param_grid'], cv=cv,
        scoring='roc_auc', n_jobs=-1, verbose=1
    )

    # Fit on train pool
    grid_search.fit(X_train_pool, y_train_pool)
    best_pipeline = grid_search.best_estimator_

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV AUC: {grid_search.best_score_:.4f}")

    # Collect CV results for detailed analysis
    cv_metrics = []
    cv_roc_curves = []
    cv_pr_curves = []
    cv_thresholds = []

    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_train_pool, y_train_pool)):
        X_fold_train = X_train_pool.iloc[train_idx]
        X_fold_val = X_train_pool.iloc[val_idx]
        y_fold_train = y_train_pool.iloc[train_idx]
        y_fold_val = y_train_pool.iloc[val_idx]

        # Fit the best configuration on this fold. clone() gives an unfitted
        # copy carrying the best hyperparameters, so neither the shared
        # estimator in models_config nor best_pipeline is mutated or refit.
        fold_pipeline = clone(best_pipeline)
        fold_pipeline.fit(X_fold_train, y_fold_train)

        # Predict on validation fold
        y_val_prob = get_probabilities(fold_pipeline, X_fold_val)

        # Find best threshold for this fold
        best_threshold = find_best_threshold(y_fold_val, y_val_prob)
        cv_thresholds.append(best_threshold)

        # NOTE: the threshold is selected and evaluated on the same fold, so
        # the threshold-dependent fold metrics are optimistic.
        fold_metrics = compute_metrics(y_fold_val, y_val_prob, best_threshold)
        fold_metrics['fold'] = fold_idx + 1
        fold_metrics['threshold'] = best_threshold
        cv_metrics.append(fold_metrics)

        # Store curves for plotting
        fpr, tpr, _ = roc_curve(y_fold_val, y_val_prob)
        precision, recall, _ = precision_recall_curve(y_fold_val, y_val_prob)
        cv_roc_curves.append((fpr, tpr))
        cv_pr_curves.append((precision, recall))

        print(f"  Fold {fold_idx + 1}: AUC={fold_metrics['roc_auc']:.4f}, "
              f"AP={fold_metrics['average_precision']:.4f}, "
              f"F1={fold_metrics['f1']:.4f}")

    # Calculate mean threshold from CV
    mean_threshold = np.mean(cv_thresholds)

    # Evaluate on test holdout using mean threshold
    y_test_prob = get_probabilities(best_pipeline, X_test_holdout)
    test_metrics = compute_metrics(y_test_holdout, y_test_prob, mean_threshold)
    test_metrics['chosen_threshold'] = mean_threshold
    # Stored as nested lists so the dict can be written with json.dump.
    test_metrics['confusion_matrix'] = confusion_matrix(
        y_test_holdout, (y_test_prob >= mean_threshold).astype(int)
    ).tolist()

    print(f"\nTest Results (threshold={mean_threshold:.4f}):")
    print(f"  AUC: {test_metrics['roc_auc']:.4f}")
    print(f"  AP: {test_metrics['average_precision']:.4f}")
    print(f"  F1: {test_metrics['f1']:.4f}")

    # Store results
    results[model_name] = {
        'cv_metrics': cv_metrics,
        'test_metrics': test_metrics,
        'cv_roc_curves': cv_roc_curves,
        'cv_pr_curves': cv_pr_curves,
        'best_params': grid_search.best_params_,
        'model_dir': model_dir
    }

    # Add to overall results
    all_cv_results.extend([{
        'model': model_name,
        'category': 'linear_models',
        **metrics
    } for metrics in cv_metrics])

print(f"\n{'='*60}")
print("All models evaluated!")

In [None]:
# Save artifacts for each model: per-fold CV metrics (plus mean/std rows),
# holdout test metrics, and a pickled pipeline fitted on the full train pool.
summary_columns = [
    'roc_auc', 'average_precision', 'f1', 'accuracy',
    'precision', 'recall', 'logloss', 'threshold',
]

for model_name, model_results in results.items():
    model_dir = model_results['model_dir']

    # Save CV metrics
    cv_df = pd.DataFrame(model_results['cv_metrics'])

    # Append exactly one 'mean' row and one 'std' row covering every metric.
    # (Building one row per metric would leave every other column NaN.)
    summary_rows = pd.DataFrame([
        {'fold': 'mean', **cv_df[summary_columns].mean().to_dict()},
        {'fold': 'std', **cv_df[summary_columns].std().to_dict()},
    ])

    cv_summary_df = pd.concat([cv_df, summary_rows], ignore_index=True)
    cv_summary_df.to_csv(f"{model_dir}/logs/cv_metrics.csv", index=False)

    # Save test metrics
    with open(f"{model_dir}/logs/test_metrics.json", 'w') as f:
        json.dump(model_results['test_metrics'], f, indent=2)

    # Save model: fit the already-selected hyperparameters once on the full
    # train pool. Repeating the whole grid search here would only rediscover
    # the parameters stored in model_results['best_params'].
    final_pipeline = create_pipeline(clone(models_config[model_name]['estimator']))
    final_pipeline.set_params(**model_results['best_params'])
    final_pipeline.fit(X_train_pool, y_train_pool)

    with open(f"{model_dir}/models/final_model.pkl", 'wb') as f:
        pickle.dump(final_pipeline, f)

    print(f"Artifacts saved for {model_name}")

print("\nAll artifacts saved!")

In [None]:
# Generate figures for each model. Every figure is written as a PNG under
# <model_dir>/figures/ and closed right away so figures do not accumulate.
for model_name, model_results in results.items():
    model_dir = model_results['model_dir']

    print(f"Generating figures for {model_name}...")

    # 1. ROC Curve with CV mean and std
    plt.figure(figsize=(8, 6))

    # Draw each fold's curve in light gray and resample it onto a common FPR
    # grid so the folds can be averaged point-wise.
    mean_fpr = np.linspace(0, 1, 100)
    tprs = []
    for fpr, tpr in model_results['cv_roc_curves']:
        plt.plot(fpr, tpr, alpha=0.3, color='gray')
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        # ROC curves usually have several points at FPR == 0; pin the
        # resampled curve to the origin instead of whichever one interp picks.
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)

    mean_tpr = np.mean(tprs, axis=0)
    std_tpr = np.std(tprs, axis=0)

    fold_aucs = [fold['roc_auc'] for fold in model_results['cv_metrics']]
    mean_auc = np.mean(fold_aucs)
    std_auc = np.std(fold_aucs)

    plt.plot(mean_fpr, mean_tpr, 'b-', label=f'Mean ROC (AUC = {mean_auc:.3f} ± {std_auc:.3f})')
    plt.fill_between(mean_fpr, mean_tpr - std_tpr, mean_tpr + std_tpr, alpha=0.2, color='blue')
    plt.plot([0, 1], [0, 1], 'k--', label='Random')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {model_name}')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{model_dir}/figures/roc_cv.png", dpi=200, bbox_inches='tight')
    plt.close()

    # 2. Precision-Recall Curve
    plt.figure(figsize=(8, 6))

    # Plot individual fold curves
    for precision, recall in model_results['cv_pr_curves']:
        plt.plot(recall, precision, alpha=0.3, color='gray')

    # No averaged PR curve is drawn; the title reports mean/std AP instead.
    fold_aps = [fold['average_precision'] for fold in model_results['cv_metrics']]
    mean_ap = np.mean(fold_aps)
    std_ap = np.std(fold_aps)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve - {model_name}\nAP = {mean_ap:.3f} ± {std_ap:.3f}')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{model_dir}/figures/pr_cv.png", dpi=200, bbox_inches='tight')
    plt.close()

    # 3. Load final model for remaining plots
    # NOTE: unpickling executes arbitrary code; only load files that this
    # notebook wrote itself.
    with open(f"{model_dir}/models/final_model.pkl", 'rb') as f:
        final_model = pickle.load(f)

    y_test_prob = get_probabilities(final_model, X_test_holdout)

    # 4. Calibration curve
    plt.figure(figsize=(8, 6))

    try:
        prob_true, prob_pred = calibration_curve(y_test_holdout, y_test_prob, n_bins=10)
        plt.plot(prob_pred, prob_true, 'o-', label='Calibration curve')
        plt.plot([0, 1], [0, 1], 'k--', label='Perfect calibration')
        plt.xlabel('Mean Predicted Probability')
        plt.ylabel('Fraction of Positives')
        plt.title(f'Calibration Curve - {model_name}')
        plt.legend()
        plt.grid(True, alpha=0.3)
    except Exception as exc:
        # Best-effort plot: fall back to a placeholder, but report why and do
        # not swallow KeyboardInterrupt/SystemExit as a bare except would.
        print(f"  Calibration curve unavailable for {model_name}: {exc}")
        plt.text(0.5, 0.5, 'Calibration curve not available',
                ha='center', va='center', transform=plt.gca().transAxes)
        plt.title(f'Calibration Curve - {model_name}')

    plt.tight_layout()
    plt.savefig(f"{model_dir}/figures/calibration_curve.png", dpi=200, bbox_inches='tight')
    plt.close()

    # 5. Confusion Matrix (taken from the stored holdout test metrics)
    plt.figure(figsize=(6, 5))

    cm = model_results['test_metrics']['confusion_matrix']
    plt.imshow(cm, interpolation='nearest', cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.colorbar()

    tick_marks = np.arange(2)
    plt.xticks(tick_marks, ['Negative', 'Positive'])
    plt.yticks(tick_marks, ['Negative', 'Positive'])

    # Add text annotations: rows are true labels, columns are predictions.
    for i in range(2):
        for j in range(2):
            plt.text(j, i, str(cm[i][j]), ha='center', va='center', fontsize=14)

    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig(f"{model_dir}/figures/confusion_matrix.png", dpi=200, bbox_inches='tight')
    plt.close()

    # 6. Feature importance (for linear models - coefficient magnitudes)
    plt.figure(figsize=(10, 6))

    try:
        if hasattr(final_model.named_steps['classifier'], 'coef_'):
            coef = final_model.named_steps['classifier'].coef_[0]
            abs_coef = np.abs(coef)

            # Get top 20 features by absolute coefficient; the bars keep the
            # signed value so the direction of the effect stays visible.
            top_indices = np.argsort(abs_coef)[-20:]
            top_coef = coef[top_indices]
            top_features = [feature_cols[i] for i in top_indices]

            colors = ['red' if c < 0 else 'blue' for c in top_coef]
            plt.barh(range(len(top_coef)), top_coef, color=colors, alpha=0.7)
            plt.yticks(range(len(top_features)), top_features)
            plt.xlabel('Coefficient Value')
            plt.title(f'Top 20 Feature Coefficients - {model_name}')
            plt.grid(True, alpha=0.3)
        else:
            # Classifier steps without a coef_ attribute get a placeholder.
            plt.text(0.5, 0.5, 'Feature importance not available',
                    ha='center', va='center', transform=plt.gca().transAxes)
            plt.title(f'Feature Importance - {model_name}')
    except Exception as e:
        plt.text(0.5, 0.5, f'Feature importance error: {str(e)}',
                ha='center', va='center', transform=plt.gca().transAxes)
        plt.title(f'Feature Importance - {model_name}')

    plt.tight_layout()
    plt.savefig(f"{model_dir}/figures/feature_importance.png", dpi=200, bbox_inches='tight')
    plt.close()

    # 7. Error analysis: absolute gap between the label and the predicted score
    plt.figure(figsize=(8, 6))

    errors = np.abs(y_test_holdout - y_test_prob)
    plt.scatter(y_test_prob, errors, alpha=0.6)
    plt.xlabel('Predicted Probability')
    plt.ylabel('Prediction Error')
    plt.title(f'Error Analysis - {model_name}')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{model_dir}/figures/error_analysis.png", dpi=200, bbox_inches='tight')
    plt.close()

print("All figures generated!")

In [None]:
# Build one summary row per model, rank the rows by holdout AUC, write the
# table to disk and print a ranked report.
summary_data = []

for model_name, model_results in results.items():
    test_metrics = model_results['test_metrics']
    cv_metrics = model_results['cv_metrics']
    fold_aucs = [fold['roc_auc'] for fold in cv_metrics]

    entry = {'model': model_name}
    # Holdout metrics, renamed with a 'test_' prefix.
    for column, key in (
        ('test_auc', 'roc_auc'),
        ('test_ap', 'average_precision'),
        ('test_f1', 'f1'),
        ('test_accuracy', 'accuracy'),
    ):
        entry[column] = test_metrics[key]
    # Spread of the per-fold AUCs from cross-validation.
    entry['cv_auc_mean'] = np.mean(fold_aucs)
    entry['cv_auc_std'] = np.std(fold_aucs)
    entry['best_params'] = str(model_results['best_params'])
    entry['artifacts_path'] = model_results['model_dir']
    summary_data.append(entry)

# Sort by test AUC
summary_df = pd.DataFrame(summary_data).sort_values('test_auc', ascending=False)

# Save summary
os.makedirs('../outputs/linear_models', exist_ok=True)
summary_df.to_csv('../outputs/linear_models/summary.csv', index=False)

# Display results
separator = "-" * 60
print("\n" + "=" * 80)
print("LINEAR MODELS CATEGORY - FINAL RESULTS")
print("=" * 80)
print("\nRanked by Test AUC:")
print(separator)

for _, row in summary_df.iterrows():
    print(f"{row['model']:30s} | AUC: {row['test_auc']:.4f} | AP: {row['test_ap']:.4f} | F1: {row['test_f1']:.4f}")
    print(f"{'':30s} | CV AUC: {row['cv_auc_mean']:.4f}±{row['cv_auc_std']:.4f}")
    print(f"{'':30s} | Artifacts: {row['artifacts_path']}")
    print(separator)

# The frame is sorted in descending AUC order, so the first row is the best model.
best_row = summary_df.iloc[0]
print(f"\nBest Model: {best_row['model']}")
print(f"Best Test AUC: {best_row['test_auc']:.4f}")
print("\nAll results saved to: ../outputs/linear_models/")
print("Summary saved to: ../outputs/linear_models/summary.csv")