In [1]:
#!/usr/bin/env python3
"""
credit_pipeline.py
Full training pipeline: preprocessing -> PCA -> SMOTEENN -> LightGBM
Includes SHAP explainability and saving artifacts.
"""

import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, roc_curve, classification_report

from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline as ImbPipeline
from lightgbm import LGBMClassifier
import shap

# -------------------------------
# Configuration
# -------------------------------
RANDOM_STATE = 42
TEST_SIZE = 0.2
PCA_VARIANCE = 0.95  # keep 95% variance
MODEL_OUTPUT = "final_lgbm.pkl"
PREPROCESSOR_OUTPUT = "preprocessor.pkl"
SHAP_PLOT_OUTPUT = "shap_summary.png"

# -------------------------------
# Helper: load your data here
# -------------------------------
# Replace this with actual loading; expects df with columns and 'default' target
# Example:
# df = pd.read_csv("data/credit_data.csv")

# For testing purpose raise if df not defined
from data_pipeline import load_data
try:
    df = load_data()
except NameError:
    raise RuntimeError("Please set `df` variable to your pandas DataFrame before running the script.")

# -------------------------------
# Features & target
# -------------------------------
TARGET = 'default'
features = [c for c in df.columns if c != TARGET]

X = df[features].copy()
y = df[TARGET].astype(int).copy()

# -------------------------------
# Identify numeric / categorical columns
# -------------------------------
num_features = X.select_dtypes(include=["number"]).columns.tolist()
cat_features = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

# Remove target-col-like from features if erroneously included
if TARGET in num_features:
    num_features.remove(TARGET)
if TARGET in cat_features:
    cat_features.remove(TARGET)

print(f"Numeric features: {len(num_features)}  Categorical features: {len(cat_features)}")

# -------------------------------
# Train/test split (important: do it BEFORE resampling)
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

# -------------------------------
# Preprocessor
# -------------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_features),
        # ("cat", OneHotEncoder(handle_unknown="ignore", sparse=False), cat_features)
    ],
    remainder="drop"
)

# -------------------------------
# Build imbalanced pipeline (preprocess -> PCA -> SMOTEENN -> LGBM)
# Note: Use imblearn Pipeline so resampling step is supported
# -------------------------------
pca = PCA(n_components=PCA_VARIANCE, svd_solver="full")
resampler = SMOTEENN(random_state=RANDOM_STATE)

lgbm = LGBMClassifier(
    n_estimators=1500,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    num_leaves=60,
    min_child_samples=40,
    reg_alpha=3,
    reg_lambda=3,
    class_weight="balanced",
    random_state=RANDOM_STATE
)

pipeline = ImbPipeline(steps=[
    ("preprocess", preprocessor),
    # ("pca", pca),
    # ("resample", resampler),
    ("model", lgbm)
])

# -------------------------------
# Train
# -------------------------------
print("Training pipeline ...")
pipeline.fit(X_train, y_train)

# -------------------------------
# Evaluate
# -------------------------------
preds_proba = pipeline.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, preds_proba)
print(f"Test AUC: {auc:.4f}")

# Optional: classification report at threshold 0.5
preds = (preds_proba >= 0.5).astype(int)
print(classification_report(y_test, preds))

# ROC curve save
fpr, tpr, thr = roc_curve(y_test, preds_proba)
plt.figure(figsize=(6,6))
plt.plot(fpr, tpr, label=f"AUC={auc:.4f}")
plt.plot([0,1],[0,1], linestyle='--', alpha=0.6)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC")
plt.legend()
plt.tight_layout()
plt.savefig("roc_curve.png")
plt.close()

# -------------------------------
# Save artifacts
# -------------------------------
joblib.dump(pipeline, MODEL_OUTPUT)
print(f"Saved pipeline to {MODEL_OUTPUT}")

# Save preprocessor separately if needed
joblib.dump(preprocessor, PREPROCESSOR_OUTPUT)
print(f"Saved preprocessor to {PREPROCESSOR_OUTPUT}")

# -------------------------------
# SHAP explainability
# -------------------------------
print("Computing SHAP values (may take some time)...")

# We need to extract the fitted LGBM and the transformed training matrix for SHAP
fitted_model = pipeline.named_steps['model']
preproc_step = pipeline.named_steps['preprocess']

# Transform a subset for speed and stability
X_train_trans = preproc_step.transform(X_train)

# Use TreeExplainer
explainer = shap.TreeExplainer(fitted_model)
shap_values = explainer.shap_values(X_train_trans,)

# Create feature names for transformed matrix
try:
    transformed_feature_names = preproc_step.get_feature_names_out()
except Exception:
    # Fallback: build names
    num_names = num_features
    cat_names = list(preproc_step.named_transformers_["cat"].get_feature_names_out(cat_features)) if len(cat_features)>0 else []
    transformed_feature_names = np.array(num_names + cat_names)

# Summary plot
plt.figure(figsize=(10,8))
shap.summary_plot(shap_values, X_train_trans, feature_names=transformed_feature_names, show=False)
plt.tight_layout()
plt.savefig(SHAP_PLOT_OUTPUT)
plt.close()
print(f"Saved SHAP summary to {SHAP_PLOT_OUTPUT}")

# Optional: aggregate SHAP to original features if one-hot expanded
# This part aggregates OHE names back to base features (if OHE used)
try:
    tf_names = list(transformed_feature_names)
    shap_vals_mean = np.mean(np.abs(shap_values), axis=0)
    agg = {}
    for nm, val in zip(tf_names, shap_vals_mean):
        if '__' in nm:
            # sklearn get_feature_names_out uses format: "cat__<col>_<level>" or similar
            base = nm.split("__")[1] if '__' in nm else nm
            # sometimes OneHotEncoder uses 'col_level'
            if '_' in base and base.split('_')[0] in cat_features:
                base = base.split('_')[0]
        else:
            base = nm
        agg[base] = agg.get(base, 0) + val
    agg_df = pd.DataFrame([ (k,v) for k,v in agg.items() ], columns=["feature","shap_abs_mean"]).sort_values(by='shap_abs_mean', ascending=False)
    agg_df.to_csv('shap_aggregated.csv', index=False)
    print('Saved aggregated SHAP importances to shap_aggregated.csv')
except Exception as e:
    print('Failed to aggregate SHAP:', e)

print('Done.')

  from .autonotebook import tqdm as notebook_tqdm


/Users/izzatillo_khazratov/Desktop/cbu-coding-challenge/data/merged_data.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['employment_type'].replace({'Full Time': 'Full-time', 'FULL_TIME': 'Full-time', 'Fulltime': 'Full-time', 'FT': 'Full-time',
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['loan_type'].replace({'Personal Loan': 'Personal', 'personal': 'Personal', 'PERSONAL': 'Personal',
The

/Users/izzatillo_khazratov/Desktop/cbu-coding-challenge/data/merged_data.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['employment_type'].replace({'Full Time': 'Full-time', 'FULL_TIME': 'Full-time', 'Fulltime': 'Full-time', 'FT': 'Full-time',
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['loan_type'].replace({'Personal Loan': 'Personal', 'personal': 'Personal', 'PERSONAL': 'Personal',
The

Numeric features: 67  Categorical features: 0
Training pipeline ...
[LightGBM] [Info] Number of positive: 3675, number of negative: 68324
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008533 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6492
[LightGBM] [Info] Number of data points in the train set: 71999, number of used features: 67
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Test AUC: 0.7917
              precision    recall  f1-score   support

           0       0.96      0.95      0.95     17081
           1       0.26      0.35      0.29       919

    accuracy                           0.92     18000
   macro avg       0.61      0.65      0.62     18000
weighted avg       0.93      0.92      0.92     18000

Saved pipeline to final_lgbm.pk



Saved SHAP summary to shap_summary.png
Saved aggregated SHAP importances to shap_aggregated.csv
Done.


In [8]:
"""
Credit Default Prediction Pipeline
Based on: XGBoost for Credit Default Prediction - arXiv:2408.03497

Target: Achieve >80% AUC score for competition
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# ==================== PHASE 1: DATA PREPROCESSING ====================

def preprocess_credit_data(df, target='default'):
    """
    Preprocessing based on research paper methodology
    """
    print("="*70)
    print("CREDIT DEFAULT PREPROCESSING - RESEARCH PAPER METHODOLOGY")
    print("="*70)
    
    # Separate target
    y = df[target].copy()
    X = df.drop(target, axis=1)
    
    print(f"\n[1] Initial Dataset:")
    print(f"   Samples: {len(X):,}")
    print(f"   Features: {X.shape[1]}")
    print(f"   Default rate: {y.mean():.2%}")
    print(f"   Imbalance ratio: {(1-y.mean())/y.mean():.1f}:1")
    
    # Drop identifiers and noise
    drop_cols = ['customer_id', 'application_id', 'loan_officer_id', 
                 'random_noise_1', 'recent_inquiry_count', 'oldest_credit_line_age']
    X = X.drop([col for col in drop_cols if col in X.columns], axis=1)
    print(f"\n[2] Dropped {len([c for c in drop_cols if c in df.columns])} ID/noise columns")
    
    # Handle missing values with flags
    print(f"\n[3] Handling missing values...")
    missing_cols = X.columns[X.isnull().any()].tolist()
    
    for col in missing_cols:
        if X[col].dtype in ['float64', 'int64']:
            X[f'{col}_missing'] = X[col].isnull().astype(int)
            X[col].fillna(X[col].median(), inplace=True)
            print(f"   - {col}: {X[f'{col}_missing'].sum()} missing ‚Üí filled with median")
    
    # Cap outliers at 1st and 99th percentiles
    print(f"\n[4] Capping outliers...")
    numeric_cols = X.select_dtypes(include=[np.number]).columns
    outlier_features = ['loan_amount', 'annual_income', 'total_credit_limit',
                        'revolving_balance', 'annual_debt_payment']
    
    for col in outlier_features:
        if col in X.columns:
            lower, upper = X[col].quantile([0.01, 0.99])
            X[col] = X[col].clip(lower=lower, upper=upper)
    
    # Feature engineering based on paper
    print(f"\n[5] Engineering features...")
    
    # Debt burden indicators
    if all(c in X.columns for c in ['debt_to_income_ratio', 'payment_to_income_ratio']):
        X['total_debt_burden'] = X['debt_to_income_ratio'] + X['payment_to_income_ratio']
        X['debt_stress_indicator'] = (X['debt_to_income_ratio'] > 0.5).astype(int)
    
    # Credit risk score (paper's approach)
    if 'credit_score' in X.columns:
        X['credit_score_normalized'] = (X['credit_score'] - 300) / (850 - 300)
        X['poor_credit_flag'] = (X['credit_score'] < 650).astype(int)
        X['excellent_credit_flag'] = (X['credit_score'] > 750).astype(int)
    
    # Delinquency indicators
    if 'num_delinquencies_2yrs' in X.columns:
        X['has_delinquency'] = (X['num_delinquencies_2yrs'] > 0).astype(int)
        X['multiple_delinquencies'] = (X['num_delinquencies_2yrs'] > 1).astype(int)
    
    # Utilization features
    if 'credit_utilization' in X.columns:
        X['high_utilization'] = (X['credit_utilization'] > 0.75).astype(int)
        X['very_low_utilization'] = (X['credit_utilization'] < 0.1).astype(int)
    
    # Income adequacy
    if 'annual_income' in X.columns and 'loan_amount' in X.columns:
        X['loan_to_income_ratio'] = X['loan_amount'] / (X['annual_income'] + 1)
        X['income_adequacy'] = (X['annual_income'] / 12) / (X['loan_amount'] / 60 + 1)
    
    # Employment stability
    if 'employment_length' in X.columns:
        X['stable_employment'] = (X['employment_length'] >= 3).astype(int)
        X['employment_years_squared'] = X['employment_length'] ** 2
    
    # Account age features
    if 'oldest_account_age_months' in X.columns:
        X['account_age_years'] = X['oldest_account_age_months'] / 12
        X['thin_credit_file'] = (X['oldest_account_age_months'] < 24).astype(int)
    
    # Inquiry intensity
    if 'num_inquiries_6mo' in X.columns:
        X['credit_shopping'] = (X['num_inquiries_6mo'] > 2).astype(int)
        X['excessive_inquiries'] = (X['num_inquiries_6mo'] > 4).astype(int)
    
    # Interest rate indicators (risk pricing)
    if 'interest_rate' in X.columns:
        X['subprime_rate'] = (X['interest_rate'] > 15).astype(int)
        X['prime_rate'] = (X['interest_rate'] < 8).astype(int)
    
    print(f"   Created {X.shape[1] - len(df.columns) + len(drop_cols) + 1} new features")
    
    # Encode categorical variables
    print(f"\n[6] Encoding categorical variables...")
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    
    # Low cardinality: one-hot encoding
    low_card = [col for col in categorical_cols if X[col].nunique() < 10]
    if low_card:
        X = pd.get_dummies(X, columns=low_card, drop_first=True, dtype=int)
        print(f"   One-hot encoded: {low_card}")
    
    # High cardinality: frequency encoding
    high_card = [col for col in categorical_cols if X[col].nunique() >= 10]
    for col in high_card:
        freq_map = X[col].value_counts(normalize=True).to_dict()
        X[f'{col}_frequency'] = X[col].map(freq_map)
        X = X.drop(col, axis=1)
        print(f"   Frequency encoded: {col}")
    
    # Remove redundant features
    print(f"\n[7] Removing redundant features...")
    redundant = ['monthly_income']  # Redundant with annual_income
    X = X.drop([col for col in redundant if col in X.columns], axis=1)
    
    print(f"\n‚úÖ Preprocessing complete!")
    print(f"   Final features: {X.shape[1]}")
    print(f"   Ready for modeling")
    
    return X, y

# ==================== PHASE 2: MODEL TRAINING ====================

def train_xgboost_model(X_train, y_train, X_test, y_test):
    """
    XGBoost training based on paper's optimal hyperparameters
    """
    print("\n" + "="*70)
    print("XGBOOST MODEL TRAINING")
    print("="*70)
    
    # Calculate scale_pos_weight for imbalance
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    print(f"\nClass imbalance handling:")
    print(f"   scale_pos_weight: {scale_pos_weight:.2f}")
    
    # Paper's recommended hyperparameters (optimized for credit default)
    params = {
        'n_estimators': 500,           # More trees for better performance
        'max_depth': 7,                # Deeper trees for complex patterns
        'learning_rate': 0.03,         # Lower LR with more trees
        'subsample': 0.8,              # Row sampling to prevent overfitting
        'colsample_bytree': 0.8,       # Column sampling
        'colsample_bylevel': 0.8,      # Column sampling per level
        'min_child_weight': 3,         # Minimum sum of weights
        'gamma': 0.1,                  # Minimum loss reduction
        'reg_alpha': 0.1,              # L1 regularization
        'reg_lambda': 1.0,             # L2 regularization
        'early_stopping_rounds' : 50,
        'scale_pos_weight': scale_pos_weight,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'random_state': 42,
        'n_jobs': -1,
        'tree_method': 'hist'          # Faster training
    }
    
    print(f"\nModel hyperparameters:")
    for key, value in params.items():
        if key not in ['n_jobs', 'random_state', 'tree_method']:
            print(f"   {key}: {value}")
    
    # Train model
    print(f"\n[1] Training XGBoost...")
    model = xgb.XGBClassifier(**params)
    
    # Use early stopping
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],

        verbose=False
    )
    
    print(f"   Best iteration: {model.best_iteration}")
    
    # Evaluate
    print(f"\n[2] Model Evaluation:")
    
    # Training performance
    y_train_pred = model.predict_proba(X_train)[:, 1]
    train_auc = roc_auc_score(y_train, y_train_pred)
    
    # Test performance
    y_test_pred = model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_test_pred)
    
    print(f"   Training AUC:   {train_auc:.4f}")
    print(f"   Test AUC:       {test_auc:.4f}")
    print(f"   Overfitting:    {train_auc - test_auc:.4f}")
    
    # Competition scoring
    if test_auc >= 0.85:
        grade = "üèÜ EXCELLENT (25/25 points)"
    elif test_auc >= 0.80:
        grade = "‚úÖ VERY GOOD (23-24/25 points)"
    elif test_auc >= 0.75:
        grade = "‚úÖ GOOD (20-22/25 points)"
    elif test_auc >= 0.60:
        grade = "‚ö†Ô∏è  PASSING (15-19/25 points)"
    else:
        grade = "‚ùå DISQUALIFIED (<15 points)"
    
    print(f"\n   Competition Score: {grade}")
    
    # Classification metrics
    y_test_class = model.predict(X_test)
    print(f"\n[3] Classification Report:")
    print(classification_report(y_test, y_test_class, 
                                target_names=['No Default', 'Default'],
                                digits=3))
    
    # Confusion matrix
    cm = confusion_matrix(y_test, y_test_class)
    tn, fp, fn, tp = cm.ravel()
    
    print(f"[4] Confusion Matrix:")
    print(f"   True Negatives:  {tn:,}")
    print(f"   False Positives: {fp:,}")
    print(f"   False Negatives: {fn:,}")
    print(f"   True Positives:  {tp:,}")
    
    # Business metrics
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    
    print(f"\n[5] Business Metrics:")
    print(f"   Sensitivity (Recall): {sensitivity:.2%} - Caught {sensitivity:.0%} of defaults")
    print(f"   Specificity:          {specificity:.2%} - Correctly identified {specificity:.0%} of non-defaults")
    print(f"   Precision:            {precision:.2%} - {precision:.0%} of predicted defaults were correct")
    
    return model, test_auc

# ==================== PHASE 3: CROSS-VALIDATION ====================

def cross_validate_model(X, y, params):
    """
    5-fold stratified cross-validation
    """
    print("\n" + "="*70)
    print("CROSS-VALIDATION")
    print("="*70)
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    model = xgb.XGBClassifier(**params)
    
    print(f"\nPerforming 5-fold stratified cross-validation...")
    cv_scores = cross_val_score(
        model, X, y, 
        cv=skf, 
        scoring='roc_auc',
        n_jobs=-1
    )
    
    print(f"\nCV AUC Scores by Fold:")
    for i, score in enumerate(cv_scores, 1):
        print(f"   Fold {i}: {score:.4f}")
    
    print(f"\n   Mean CV AUC: {cv_scores.mean():.4f}")
    print(f"   Std Dev:     {cv_scores.std():.4f}")
    print(f"   95% CI:      [{cv_scores.mean() - 1.96*cv_scores.std():.4f}, "
          f"{cv_scores.mean() + 1.96*cv_scores.std():.4f}]")
    
    return cv_scores

# ==================== MAIN EXECUTION ====================

def main_pipeline(df):
    """
    Complete pipeline execution
    """
    print("\n" + "="*70)
    print("CREDIT DEFAULT PREDICTION - COMPLETE PIPELINE")
    print("Based on: arXiv:2408.03497")
    print("="*70)
    
    # Preprocess
    X, y = preprocess_credit_data(df)
    
    # Split data (stratified)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=0.2, 
        stratify=y, 
        random_state=42
    )
    
    print(f"\n" + "="*70)
    print("DATA SPLIT")
    print("="*70)
    print(f"\nTraining set: {X_train.shape[0]:,} samples ({y_train.mean():.2%} default)")
    print(f"Test set:     {X_test.shape[0]:,} samples ({y_test.mean():.2%} default)")
    
    # Train model
    model, test_auc = train_xgboost_model(X_train, y_train, X_test, y_test)
    
    # Cross-validation
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    params = {
        'n_estimators': 500,
        'max_depth': 7,
        'learning_rate': 0.03,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'scale_pos_weight': scale_pos_weight,
        'random_state': 42,
        'n_jobs': -1
    }
    
    cv_scores = cross_validate_model(X_train, y_train, params)
    
    # Feature importance
    print("\n" + "="*70)
    print("TOP 20 MOST IMPORTANT FEATURES")
    print("="*70)
    
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\n", feature_importance.head(20).to_string(index=False))
    
    # Final summary
    print("\n" + "="*70)
    print("FINAL SUMMARY")
    print("="*70)
    print(f"\n‚úÖ Test AUC:        {test_auc:.4f}")
    print(f"‚úÖ Mean CV AUC:     {cv_scores.mean():.4f} (¬±{cv_scores.std():.4f})")
    print(f"‚úÖ Total Features:  {X_train.shape[1]}")
    print(f"‚úÖ Training Time:   ~{model.best_iteration} iterations")
    
    if test_auc >= 0.80:
        print(f"\nüéØ TARGET ACHIEVED! Ready for competition submission.")
    else:
        print(f"\n‚ö†Ô∏è  AUC below 80%. Consider:")
        print(f"   - Hyperparameter tuning")
        print(f"   - Feature engineering")
        print(f"   - Ensemble methods")
    
    return model, feature_importance

# ==================== USAGE ====================

# Load your data
data = pd.read_csv('/Users/izzatillo_khazratov/Desktop/cbu-coding-challenge/data/final.csv')

# Run complete pipeline
model, feature_importance = main_pipeline(data)

# For new predictions:
# predictions = model.predict_proba(X_new)[:, 1]




CREDIT DEFAULT PREDICTION - COMPLETE PIPELINE
Based on: arXiv:2408.03497
CREDIT DEFAULT PREPROCESSING - RESEARCH PAPER METHODOLOGY

[1] Initial Dataset:
   Samples: 89,999
   Features: 67
   Default rate: 5.10%
   Imbalance ratio: 18.6:1

[2] Dropped 0 ID/noise columns

[3] Handling missing values...

[4] Capping outliers...

[5] Engineering features...
   Created 25 new features

[6] Encoding categorical variables...

[7] Removing redundant features...

‚úÖ Preprocessing complete!
   Final features: 85
   Ready for modeling

DATA SPLIT

Training set: 71,999 samples (5.10% default)
Test set:     18,000 samples (5.11% default)

XGBOOST MODEL TRAINING

Class imbalance handling:
   scale_pos_weight: 18.59

Model hyperparameters:
   n_estimators: 500
   max_depth: 7
   learning_rate: 0.03
   subsample: 0.8
   colsample_bytree: 0.8
   colsample_bylevel: 0.8
   min_child_weight: 3
   gamma: 0.1
   reg_alpha: 0.1
   reg_lambda: 1.0
   early_stopping_rounds: 50
   scale_pos_weight: 18.5915646

In [None]:
import os 
 # Prints the parent directory of the current file's directory)

<function abspath at 0x1053014e0>


In [10]:
"""
CatBoost Credit Default Prediction Pipeline
Optimized for 5.1% imbalanced dataset
Target: 80%+ AUC score
"""

import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# ==================== PREPROCESSING FOR CATBOOST ====================

def preprocess_for_catboost(df, target='default'):
    """
    CatBoost-specific preprocessing
    Advantage: CatBoost handles categorical features natively!
    """
    print("="*70)
    print("PREPROCESSING FOR CATBOOST")
    print("="*70)
    
    # Separate target
    y = df[target].copy()
    X = df.drop(target, axis=1)
    
    print(f"\n[1] Initial Dataset:")
    print(f"   Samples: {len(X):,}")
    print(f"   Features: {X.shape[1]}")
    print(f"   Default rate: {y.mean():.2%}")
    
    # Drop identifiers and noise
    drop_cols = ['customer_id', 'application_id', 'loan_officer_id', 
                 'random_noise_1', 'recent_inquiry_count', 'oldest_credit_line_age']
    X = X.drop([col for col in drop_cols if col in X.columns], axis=1)
    
    # Handle missing values
    print(f"\n[2] Handling missing values...")
    for col in X.columns:
        if X[col].isnull().sum() > 0:
            if X[col].dtype in ['float64', 'int64']:
                X[f'{col}_missing'] = X[col].isnull().astype(int)
                X[col].fillna(X[col].median(), inplace=True)
    
    # Cap outliers
    print(f"\n[3] Capping outliers...")
    outlier_features = ['loan_amount', 'annual_income', 'total_credit_limit',
                        'revolving_balance', 'annual_debt_payment']
    for col in outlier_features:
        if col in X.columns:
            lower, upper = X[col].quantile([0.01, 0.99])
            X[col] = X[col].clip(lower=lower, upper=upper)
    
    # Feature engineering
    print(f"\n[4] Engineering features...")
    
    # Debt features
    if all(c in X.columns for c in ['debt_to_income_ratio', 'payment_to_income_ratio']):
        X['total_debt_burden'] = X['debt_to_income_ratio'] + X['payment_to_income_ratio']
        X['debt_stress'] = (X['debt_to_income_ratio'] > 0.5).astype(int)
    
    # Credit score features
    if 'credit_score' in X.columns:
        X['credit_score_norm'] = (X['credit_score'] - 300) / 550
        X['poor_credit'] = (X['credit_score'] < 650).astype(int)
        X['excellent_credit'] = (X['credit_score'] > 750).astype(int)
    
    # Delinquency features
    if 'num_delinquencies_2yrs' in X.columns:
        X['has_delinquency'] = (X['num_delinquencies_2yrs'] > 0).astype(int)
    
    # Utilization
    if 'credit_utilization' in X.columns:
        X['high_utilization'] = (X['credit_utilization'] > 0.75).astype(int)
    
    # Income adequacy
    if 'annual_income' in X.columns and 'loan_amount' in X.columns:
        X['loan_to_income'] = X['loan_amount'] / (X['annual_income'] + 1)
    
    # Employment
    if 'employment_length' in X.columns:
        X['stable_employment'] = (X['employment_length'] >= 3).astype(int)
    
    # Account age
    if 'oldest_account_age_months' in X.columns:
        X['thin_credit_file'] = (X['oldest_account_age_months'] < 24).astype(int)
    
    # Inquiries
    if 'num_inquiries_6mo' in X.columns:
        X['excessive_inquiries'] = (X['num_inquiries_6mo'] > 4).astype(int)
    
    # Interest rate
    if 'interest_rate' in X.columns:
        X['subprime_rate'] = (X['interest_rate'] > 15).astype(int)
    
    # Identify categorical features for CatBoost
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    
    # CatBoost handles categoricals, so DON'T one-hot encode!
    # Just convert to 'category' dtype
    for col in categorical_features:
        X[col] = X[col].astype('category')
    
    print(f"\n‚úÖ Preprocessing complete!")
    print(f"   Final features: {X.shape[1]}")
    print(f"   Categorical features: {len(categorical_features)}")
    print(f"   Categorical columns: {categorical_features}")
    
    return X, y, categorical_features

# ==================== CATBOOST TRAINING ====================

def train_catboost(X_train, y_train, X_test, y_test, cat_features=None):
    """
    Train CatBoost with optimal hyperparameters
    """
    print("\n" + "="*70)
    print("CATBOOST MODEL TRAINING")
    print("="*70)
    
    # Calculate class weights
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    print(f"\nClass imbalance: {scale_pos_weight:.2f}:1")
    
    # CatBoost parameters (optimized for credit default)
    params = {
        'iterations': 1000,              # Max iterations
        'learning_rate': 0.03,           # Learning rate
        'depth': 7,                      # Tree depth
        'l2_leaf_reg': 3,                # L2 regularization
        'min_data_in_leaf': 20,          # Min samples per leaf
        'max_bin': 254,                  # Max bins for numerical features
        'subsample': 0.8,                # Row sampling
        'colsample_bylevel': 0.8,        # Column sampling per level
        'random_strength': 1,            # Randomness for scoring splits
        'bagging_temperature': 1,        # Bayesian bootstrap temperature
        'auto_class_weights': 'Balanced', # Handle imbalance automatically
        'eval_metric': 'AUC',            # Evaluation metric
        'early_stopping_rounds': 50,     # Early stopping
        'random_seed': 42,
        'verbose': 100,                  # Print every 100 iterations
        'task_type': 'CPU',              # Use GPU if available: 'GPU'
        'thread_count': -1               # Use all CPU cores
    }
    
    print(f"\nModel hyperparameters:")
    for key, value in params.items():
        if key not in ['verbose', 'task_type', 'thread_count', 'random_seed']:
            print(f"   {key}: {value}")
    
    # Create CatBoost model
    model = CatBoostClassifier(**params)
    
    # Train with validation set
    print(f"\n[1] Training CatBoost...")
    model.fit(
        X_train, y_train,
        cat_features=cat_features,      # Specify categorical features
        eval_set=(X_test, y_test),
        use_best_model=True,             # Use best iteration
        plot=False                       # Set True for training visualization
    )
    
    print(f"\n   Best iteration: {model.best_iteration_}")
    print(f"   Best validation AUC: {model.best_score_['validation']['AUC']:.4f}")
    
    # Evaluate
    print(f"\n[2] Model Evaluation:")
    
    # Training performance
    y_train_pred = model.predict_proba(X_train)[:, 1]
    train_auc = roc_auc_score(y_train, y_train_pred)
    
    # Test performance
    y_test_pred = model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_test_pred)
    
    print(f"   Training AUC:   {train_auc:.4f}")
    print(f"   Test AUC:       {test_auc:.4f}")
    print(f"   Overfitting:    {train_auc - test_auc:.4f}")
    
    # Competition scoring
    if test_auc >= 0.85:
        grade = "üèÜ EXCELLENT (25/25 points)"
    elif test_auc >= 0.80:
        grade = "‚úÖ VERY GOOD (23-24/25 points)"
    elif test_auc >= 0.75:
        grade = "‚úÖ GOOD (20-22/25 points)"
    elif test_auc >= 0.60:
        grade = "‚ö†Ô∏è  PASSING (15-19/25 points)"
    else:
        grade = "‚ùå DISQUALIFIED (<15 points)"
    
    print(f"\n   Competition Score: {grade}")
    
    # Classification report
    y_test_class = model.predict(X_test)
    print(f"\n[3] Classification Report:")
    print(classification_report(y_test, y_test_class,
                                target_names=['No Default', 'Default'],
                                digits=3))
    
    # Confusion matrix
    cm = confusion_matrix(y_test, y_test_class)
    tn, fp, fn, tp = cm.ravel()
    
    print(f"[4] Confusion Matrix:")
    print(f"   True Negatives:  {tn:,}")
    print(f"   False Positives: {fp:,}")
    print(f"   False Negatives: {fn:,}")
    print(f"   True Positives:  {tp:,}")
    
    # Business metrics
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    
    print(f"\n[5] Business Metrics:")
    print(f"   Sensitivity: {sensitivity:.2%} - Caught {sensitivity:.0%} of defaults")
    print(f"   Specificity: {specificity:.2%} - Correctly identified {specificity:.0%} of non-defaults")
    print(f"   Precision:   {precision:.2%} - {precision:.0%} of predicted defaults were correct")
    
    return model, test_auc

# ==================== CROSS-VALIDATION ====================

def catboost_cross_validate(X, y, cat_features=None):
    """
    5-fold stratified cross-validation with CatBoost
    """
    print("\n" + "="*70)
    print("CROSS-VALIDATION")
    print("="*70)
    
    # Create Pool object (CatBoost's data structure)
    pool = Pool(
        data=X,
        label=y,
        cat_features=cat_features
    )
    
    # CV parameters
    params = {
        'iterations': 1000,
        'learning_rate': 0.03,
        'depth': 7,
        'l2_leaf_reg': 3,
        'auto_class_weights': 'Balanced',
        'eval_metric': 'AUC',
        'random_seed': 42,
        'verbose': False
    }
    
    # Perform 5-fold CV
    print(f"\nPerforming 5-fold stratified cross-validation...")
    cv_results = cv(
        pool=pool,
        params=params,
        fold_count=5,
        stratified=True,
        partition_random_seed=42,
        shuffle=True,
        early_stopping_rounds=50,
        verbose=False
    )
    
    # Extract AUC scores
    cv_scores = cv_results['test-AUC-mean'].values
    best_iteration = cv_results['test-AUC-mean'].idxmax()
    
    print(f"\nCV Results:")
    print(f"   Best iteration: {best_iteration}")
    print(f"   Best CV AUC:    {cv_scores[best_iteration]:.4f}")
    print(f"   Final CV AUC:   {cv_scores[-1]:.4f}")
    print(f"   Std Dev:        {cv_results['test-AUC-std'].values[best_iteration]:.4f}")
    
    return cv_scores[best_iteration], cv_results

# ==================== FEATURE IMPORTANCE ====================

def plot_feature_importance(model, X, top_n=20):
    """
    Display feature importance
    """
    print("\n" + "="*70)
    print(f"TOP {top_n} MOST IMPORTANT FEATURES")
    print("="*70)
    
    # Get feature importance
    feature_importance = model.get_feature_importance()
    feature_names = X.columns
    
    # Create DataFrame
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': feature_importance
    }).sort_values('importance', ascending=False)
    
    print(f"\n{importance_df.head(top_n).to_string(index=False)}")
    
    return importance_df

# ==================== MAIN PIPELINE ====================

def main_catboost_pipeline(df):
    """
    Complete CatBoost pipeline
    """
    print("\n" + "="*70)
    print("CATBOOST CREDIT DEFAULT PREDICTION")
    print("="*70)
    
    # Preprocess
    X, y, cat_features = preprocess_for_catboost(df)
    
    # Split data (stratified)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        stratify=y,
        random_state=42
    )
    
    print(f"\n" + "="*70)
    print("DATA SPLIT")
    print("="*70)
    print(f"\nTraining set: {X_train.shape[0]:,} samples ({y_train.mean():.2%} default)")
    print(f"Test set:     {X_test.shape[0]:,} samples ({y_test.mean():.2%} default)")
    
    # Train model
    model, test_auc = train_catboost(X_train, y_train, X_test, y_test, cat_features)
    
    # Cross-validation
    cv_auc, cv_results = catboost_cross_validate(X_train, y_train, cat_features)
    
    # Feature importance
    importance_df = plot_feature_importance(model, X_train, top_n=20)
    
    # Final summary
    print("\n" + "="*70)
    print("FINAL SUMMARY")
    print("="*70)
    print(f"\n‚úÖ Test AUC:           {test_auc:.4f}")
    print(f"‚úÖ Cross-Val AUC:      {cv_auc:.4f}")
    print(f"‚úÖ Best Iteration:     {model.best_iteration_}")
    print(f"‚úÖ Total Features:     {X_train.shape[1]}")
    print(f"‚úÖ Categorical Feats:  {len(cat_features)}")
    
    if test_auc >= 0.80:
        print(f"\nüéØ TARGET ACHIEVED! Ready for competition.")
    else:
        print(f"\n‚ö†Ô∏è  AUC below 80%. Consider hyperparameter tuning.")
    
    return model, importance_df, cv_results

# ==================== HYPERPARAMETER TUNING ====================

def tune_catboost(X_train, y_train, cat_features=None):
    """
    Grid search for optimal hyperparameters
    """
    print("\n" + "="*70)
    print("HYPERPARAMETER TUNING")
    print("="*70)
    
    from sklearn.model_selection import RandomizedSearchCV
    
    # Parameter grid
    param_distributions = {
        'depth': [5, 6, 7, 8, 9],
        'learning_rate': [0.01, 0.02, 0.03, 0.05],
        'l2_leaf_reg': [1, 3, 5, 7],
        'min_data_in_leaf': [10, 20, 30],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bylevel': [0.7, 0.8, 0.9]
    }
    
    # Base model
    base_model = CatBoostClassifier(
        iterations=500,
        auto_class_weights='Balanced',
        eval_metric='AUC',
        early_stopping_rounds=50,
        random_seed=42,
        verbose=False
    )
    
    # Randomized search
    print("\nSearching for best hyperparameters...")
    random_search = RandomizedSearchCV(
        base_model,
        param_distributions,
        n_iter=30,
        scoring='roc_auc',
        cv=5,
        random_state=42,
        n_jobs=-1,
        verbose=1
    )
    
    random_search.fit(X_train, y_train, cat_features=cat_features)
    
    print(f"\n‚úÖ Best AUC: {random_search.best_score_:.4f}")
    print(f"‚úÖ Best parameters:")
    for param, value in random_search.best_params_.items():
        print(f"   {param}: {value}")
    
    return random_search.best_estimator_, random_search.best_params_

# ==================== USAGE EXAMPLES ====================


# EXAMPLE 1: Basic Usage
# ----------------------
data = pd.read_csv('/Users/izzatillo_khazratov/Desktop/cbu-coding-challenge/data/final.csv')
model, importance, cv_results = main_catboost_pipeline(data)

# Make predictions
# predictions = model.predict_proba(X_new)[:, 1]


# EXAMPLE 2: With Hyperparameter Tuning
# --------------------------------------
# X, y, cat_features = preprocess_for_catboost(data)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# # Tune hyperparameters
# best_model, best_params = tune_catboost(X_train, y_train, cat_features)

# # Evaluate
# y_pred = best_model.predict_proba(X_test)[:, 1]
# print(f"Tuned model AUC: {roc_auc_score(y_test, y_pred):.4f}")


# EXAMPLE 3: Quick Training
# --------------------------
# X, y, cat_features = preprocess_for_catboost(data)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# model = CatBoostClassifier(
#     iterations=1000,
#     learning_rate=0.03,
#     depth=7,
#     auto_class_weights='Balanced',
#     cat_features=cat_features,
#     random_seed=42,
#     verbose=100
# )

# model.fit(X_train, y_train, eval_set=(X_test, y_test))
# print(f"AUC: {roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]):.4f}")



CATBOOST CREDIT DEFAULT PREDICTION
PREPROCESSING FOR CATBOOST

[1] Initial Dataset:
   Samples: 89,999
   Features: 67
   Default rate: 5.10%

[2] Handling missing values...

[3] Capping outliers...

[4] Engineering features...

‚úÖ Preprocessing complete!
   Final features: 79
   Categorical features: 0
   Categorical columns: []

DATA SPLIT

Training set: 71,999 samples (5.10% default)
Test set:     18,000 samples (5.11% default)

CATBOOST MODEL TRAINING

Class imbalance: 18.59:1

Model hyperparameters:
   iterations: 1000
   learning_rate: 0.03
   depth: 7
   l2_leaf_reg: 3
   min_data_in_leaf: 20
   max_bin: 254
   subsample: 0.8
   colsample_bylevel: 0.8
   random_strength: 1
   bagging_temperature: 1
   auto_class_weights: Balanced
   eval_metric: AUC
   early_stopping_rounds: 50

[1] Training CatBoost...
0:	test: 0.7735101	best: 0.7735101 (0)	total: 78.2ms	remaining: 1m 18s
100:	test: 0.8063366	best: 0.8063366 (100)	total: 2.21s	remaining: 19.7s
200:	test: 0.8092306	best: 0.809

CatBoostError: Parameter loss_function should be specified for cross-validation

In [18]:
"""
CatBoost Credit Default Prediction Pipeline
Optimized for 5.1% imbalanced dataset
Target: 80%+ AUC score
"""

import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# ==================== PREPROCESSING FOR CATBOOST ====================

def preprocess_for_catboost(df, target='default'):
    """
    CatBoost-specific preprocessing
    Advantage: CatBoost handles categorical features natively!
    """
    print("="*70)
    print("PREPROCESSING FOR CATBOOST")
    print("="*70)
    
    # Separate target
    y = df[target].copy()
    X = df.drop(target, axis=1)
    
    print(f"\n[1] Initial Dataset:")
    print(f"   Samples: {len(X):,}")
    print(f"   Features: {X.shape[1]}")
    print(f"   Default rate: {y.mean():.2%}")
    
    # Drop identifiers and noise
    drop_cols = ['customer_id', 'application_id', 'loan_officer_id', 
                 'random_noise_1', 'recent_inquiry_count', 'oldest_credit_line_age']
    X = X.drop([col for col in drop_cols if col in X.columns], axis=1)
    
    # Handle missing values
    print(f"\n[2] Handling missing values...")
    for col in X.columns:
        if X[col].isnull().sum() > 0:
            if X[col].dtype in ['float64', 'int64']:
                X[f'{col}_missing'] = X[col].isnull().astype(int)
                X[col].fillna(X[col].median(), inplace=True)
    
    # Cap outliers
    print(f"\n[3] Capping outliers...")
    outlier_features = ['loan_amount', 'annual_income', 'total_credit_limit',
                        'revolving_balance', 'annual_debt_payment']
    for col in outlier_features:
        if col in X.columns:
            lower, upper = X[col].quantile([0.01, 0.99])
            X[col] = X[col].clip(lower=lower, upper=upper)
    
    # Feature engineering
    print(f"\n[4] Engineering features...")
    
    # Debt features
    if all(c in X.columns for c in ['debt_to_income_ratio', 'payment_to_income_ratio']):
        X['total_debt_burden'] = X['debt_to_income_ratio'] + X['payment_to_income_ratio']
        X['debt_stress'] = (X['debt_to_income_ratio'] > 0.5).astype(int)
    
    # Credit score features
    if 'credit_score' in X.columns:
        X['credit_score_norm'] = (X['credit_score'] - 300) / 550
        X['poor_credit'] = (X['credit_score'] < 650).astype(int)
        X['excellent_credit'] = (X['credit_score'] > 750).astype(int)
    
    # Delinquency features
    if 'num_delinquencies_2yrs' in X.columns:
        X['has_delinquency'] = (X['num_delinquencies_2yrs'] > 0).astype(int)
    
    # Utilization
    if 'credit_utilization' in X.columns:
        X['high_utilization'] = (X['credit_utilization'] > 0.75).astype(int)
    
    # Income adequacy
    if 'annual_income' in X.columns and 'loan_amount' in X.columns:
        X['loan_to_income'] = X['loan_amount'] / (X['annual_income'] + 1)
    
    # Employment
    if 'employment_length' in X.columns:
        X['stable_employment'] = (X['employment_length'] >= 3).astype(int)
    
    # Account age
    if 'oldest_account_age_months' in X.columns:
        X['thin_credit_file'] = (X['oldest_account_age_months'] < 24).astype(int)
    
    # Inquiries
    if 'num_inquiries_6mo' in X.columns:
        X['excessive_inquiries'] = (X['num_inquiries_6mo'] > 4).astype(int)
    
    # Interest rate
    if 'interest_rate' in X.columns:
        X['subprime_rate'] = (X['interest_rate'] > 15).astype(int)
    
    # Identify categorical features for CatBoost
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    
    # CatBoost handles categoricals, so DON'T one-hot encode!
    # Just convert to 'category' dtype
    for col in categorical_features:
        X[col] = X[col].astype('category')
    
    print(f"\n‚úÖ Preprocessing complete!")
    print(f"   Final features: {X.shape[1]}")
    print(f"   Categorical features: {len(categorical_features)}")
    print(f"   Categorical columns: {categorical_features}")
    
    return X, y, categorical_features

# ==================== CATBOOST TRAINING ====================

def train_catboost(X_train, y_train, X_test, y_test, cat_features=None):
    """
    Train CatBoost with optimal hyperparameters
    """
    print("\n" + "="*70)
    print("CATBOOST MODEL TRAINING")
    print("="*70)
    
    # Calculate class weights
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    print(f"\nClass imbalance: {scale_pos_weight:.2f}:1")
    
    # CatBoost parameters (optimized for credit default)
    params = {
        'iterations': 1000,              # Max iterations
        'learning_rate': 0.03,           # Learning rate
        'depth': 7,                      # Tree depth
        'l2_leaf_reg': 3,                # L2 regularization
        'min_data_in_leaf': 20,          # Min samples per leaf
        'max_bin': 254,                  # Max bins for numerical features
        'subsample': 0.8,                # Row sampling
        'colsample_bylevel': 0.8,        # Column sampling per level
        'random_strength': 1,            # Randomness for scoring splits
        'bagging_temperature': 1,        # Bayesian bootstrap temperature
        'auto_class_weights': 'Balanced', # Handle imbalance automatically
        'eval_metric': 'AUC',            # Evaluation metric
        'early_stopping_rounds': 50,     # Early stopping
        'random_seed': 42,
        'verbose': 100,                  # Print every 100 iterations
        'task_type': 'CPU',              # Use GPU if available: 'GPU'
        'thread_count': -1               # Use all CPU cores
    }
    
    print(f"\nModel hyperparameters:")
    for key, value in params.items():
        if key not in ['verbose', 'task_type', 'thread_count', 'random_seed']:
            print(f"   {key}: {value}")
    
    # Create CatBoost model
    model = CatBoostClassifier(**params)
    
    # Train with validation set
    print(f"\n[1] Training CatBoost...")
    model.fit(
        X_train, y_train,
        cat_features=cat_features,      # Specify categorical features
        eval_set=(X_test, y_test),
        use_best_model=True,             # Use best iteration
        plot=False                       # Set True for training visualization
    )
    
    print(f"\n   Best iteration: {model.best_iteration_}")
    print(f"   Best validation AUC: {model.best_score_['validation']['AUC']:.4f}")
    
    # Evaluate
    print(f"\n[2] Model Evaluation:")
    
    # Training performance
    y_train_pred = model.predict_proba(X_train)[:, 1]
    train_auc = roc_auc_score(y_train, y_train_pred)
    
    # Test performance
    y_test_pred = model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_test_pred)
    
    print(f"   Training AUC:   {train_auc:.4f}")
    print(f"   Test AUC:       {test_auc:.4f}")
    print(f"   Overfitting:    {train_auc - test_auc:.4f}")
    
    # Competition scoring
    if test_auc >= 0.85:
        grade = "üèÜ EXCELLENT (25/25 points)"
    elif test_auc >= 0.80:
        grade = "‚úÖ VERY GOOD (23-24/25 points)"
    elif test_auc >= 0.75:
        grade = "‚úÖ GOOD (20-22/25 points)"
    elif test_auc >= 0.60:
        grade = "‚ö†Ô∏è  PASSING (15-19/25 points)"
    else:
        grade = "‚ùå DISQUALIFIED (<15 points)"
    
    print(f"\n   Competition Score: {grade}")
    
    # Classification report
    y_test_class = model.predict(X_test)
    print(f"\n[3] Classification Report:")
    print(classification_report(y_test, y_test_class,
                                target_names=['No Default', 'Default'],
                                digits=3))
    
    # Confusion matrix
    cm = confusion_matrix(y_test, y_test_class)
    tn, fp, fn, tp = cm.ravel()
    
    print(f"[4] Confusion Matrix:")
    print(f"   True Negatives:  {tn:,}")
    print(f"   False Positives: {fp:,}")
    print(f"   False Negatives: {fn:,}")
    print(f"   True Positives:  {tp:,}")
    
    # Business metrics
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    
    print(f"\n[5] Business Metrics:")
    print(f"   Sensitivity: {sensitivity:.2%} - Caught {sensitivity:.0%} of defaults")
    print(f"   Specificity: {specificity:.2%} - Correctly identified {specificity:.0%} of non-defaults")
    print(f"   Precision:   {precision:.2%} - {precision:.0%} of predicted defaults were correct")
    
    return model, test_auc

# ==================== CROSS-VALIDATION ====================

def catboost_cross_validate(X, y, cat_features=None):
    """
    5-fold stratified cross-validation with CatBoost
    """
    print("\n" + "="*70)
    print("CROSS-VALIDATION")
    print("="*70)
    
    # Create Pool object (CatBoost's data structure)
    pool = Pool(
        data=X,
        label=y,
        cat_features=cat_features
    )
    
    # CV parameters
    params = {
        'iterations': 1000,
        'learning_rate': 0.03,
        'depth': 7,
        'l2_leaf_reg': 3,
        'auto_class_weights': 'Balanced',
        'loss_function': 'Logloss',      # ‚úÖ REQUIRED for cv()
        'eval_metric': 'AUC',
        'random_seed': 42,
        'verbose': False
    }
    
    # Perform 5-fold CV
    print(f"\nPerforming 5-fold stratified cross-validation...")
    cv_results = cv(
        pool=pool,
        params=params,
        fold_count=5,
        stratified=True,
        partition_random_seed=42,
        shuffle=True,
        early_stopping_rounds=50,
        verbose=False
    )
    
    # Extract AUC scores
    cv_scores = cv_results['test-AUC-mean'].values
    best_iteration = cv_results['test-AUC-mean'].idxmax()
    
    print(f"\nCV Results:")
    print(f"   Best iteration: {best_iteration}")
    print(f"   Best CV AUC:    {cv_scores[best_iteration]:.4f}")
    print(f"   Final CV AUC:   {cv_scores[-1]:.4f}")
    print(f"   Std Dev:        {cv_results['test-AUC-std'].values[best_iteration]:.4f}")
    
    return cv_scores[best_iteration], cv_results

# ==================== FEATURE IMPORTANCE ====================

def plot_feature_importance(model, X, top_n=20):
    """
    Display feature importance
    """
    print("\n" + "="*70)
    print(f"TOP {top_n} MOST IMPORTANT FEATURES")
    print("="*70)
    
    # Get feature importance
    feature_importance = model.get_feature_importance()
    feature_names = X.columns
    
    # Create DataFrame
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': feature_importance
    }).sort_values('importance', ascending=False)
    
    print(f"\n{importance_df.head(top_n).to_string(index=False)}")
    
    return importance_df

# ==================== MAIN PIPELINE ====================

def main_catboost_pipeline(df):
    """
    Complete CatBoost pipeline
    """
    print("\n" + "="*70)
    print("CATBOOST CREDIT DEFAULT PREDICTION")
    print("="*70)
    
    # Preprocess
    X, y, cat_features = preprocess_for_catboost(df)
    
    # Split data (stratified)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.001,
        stratify=y,
        random_state=42
    )
    print(f"\n" + "="*70)
    print("DATA SPLIT")
    print("="*70)
    print(f"\nTraining set: {X_train.shape[0]:,} samples ({y_train.mean():.2%} default)")
    print(f"Test set:     {X_test.shape[0]:,} samples ({y_test.mean():.2%} default)")
    
    # Train model
    model, test_auc = train_catboost(X_train, y_train, X_test, y_test, cat_features)
    
    # Cross-validation
    cv_auc, cv_results = catboost_cross_validate(X_train, y_train, cat_features)
    
    # Feature importance
    importance_df = plot_feature_importance(model, X_train, top_n=20)
    
    # Final summary
    print("\n" + "="*70)
    print("FINAL SUMMARY")
    print("="*70)
    print(f"\n‚úÖ Test AUC:           {test_auc:.4f}")
    print(f"‚úÖ Cross-Val AUC:      {cv_auc:.4f}")
    print(f"‚úÖ Best Iteration:     {model.best_iteration_}")
    print(f"‚úÖ Total Features:     {X_train.shape[1]}")
    print(f"‚úÖ Categorical Feats:  {len(cat_features)}")
    
    if test_auc >= 0.80:
        print(f"\nüéØ TARGET ACHIEVED! Ready for competition.")
    else:
        print(f"\n‚ö†Ô∏è  AUC below 80%. Consider hyperparameter tuning.")
    
    return model, importance_df, cv_results

# ==================== HYPERPARAMETER TUNING ====================

def tune_catboost(X_train, y_train, cat_features=None):
    """
    Grid search for optimal hyperparameters
    """
    print("\n" + "="*70)
    print("HYPERPARAMETER TUNING")
    print("="*70)
    
    from sklearn.model_selection import RandomizedSearchCV
    
    # Parameter grid
    param_distributions = {
        'depth': [5, 6, 7, 8, 9],
        'learning_rate': [0.01, 0.02, 0.03, 0.05],
        'l2_leaf_reg': [1, 3, 5, 7],
        'min_data_in_leaf': [10, 20, 30],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bylevel': [0.7, 0.8, 0.9]
    }
    
    # Base model
    base_model = CatBoostClassifier(
        iterations=500,
        auto_class_weights='Balanced',
        eval_metric='AUC',
        early_stopping_rounds=50,
        random_seed=42,
        verbose=False
    )
    
    # Randomized search
    print("\nSearching for best hyperparameters...")
    random_search = RandomizedSearchCV(
        base_model,
        param_distributions,
        n_iter=30,
        scoring='roc_auc',
        cv=5,
        random_state=42,
        n_jobs=-1,
        verbose=1
    )
    
    random_search.fit(X_train, y_train, cat_features=cat_features)
    
    print(f"\n‚úÖ Best AUC: {random_search.best_score_:.4f}")
    print(f"‚úÖ Best parameters:")
    for param, value in random_search.best_params_.items():
        print(f"   {param}: {value}")
    
    return random_search.best_estimator_, random_search.best_params_

# ==================== USAGE EXAMPLES ====================

# EXAMPLE 1: Basic Usage
# ----------------------
data = pd.read_csv('/Users/izzatillo_khazratov/Desktop/cbu-coding-challenge/data/final.csv')
model, importance, cv_results = main_catboost_pipeline(data)




CATBOOST CREDIT DEFAULT PREDICTION
PREPROCESSING FOR CATBOOST

[1] Initial Dataset:
   Samples: 89,999
   Features: 67
   Default rate: 5.10%

[2] Handling missing values...

[3] Capping outliers...

[4] Engineering features...

‚úÖ Preprocessing complete!
   Final features: 79
   Categorical features: 0
   Categorical columns: []

DATA SPLIT

Training set: 89,909 samples (5.10% default)
Test set:     90 samples (5.56% default)

CATBOOST MODEL TRAINING

Class imbalance: 18.59:1

Model hyperparameters:
   iterations: 1000
   learning_rate: 0.03
   depth: 7
   l2_leaf_reg: 3
   min_data_in_leaf: 20
   max_bin: 254
   subsample: 0.8
   colsample_bylevel: 0.8
   random_strength: 1
   bagging_temperature: 1
   auto_class_weights: Balanced
   eval_metric: AUC
   early_stopping_rounds: 50

[1] Training CatBoost...
0:	test: 0.8317647	best: 0.8317647 (0)	total: 19.7ms	remaining: 19.7s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8317647059
bestIteration = 0

Shrink model

In [None]:
"""
CatBoost Prediction Pipeline
Predict on new data and save results
"""

import pandas as pd
import numpy as np

def preprocess_new_data(df, cat_features_from_training):
    """
    Apply the SAME preprocessing steps to new data
    Must match training preprocessing exactly!
    """
    print("="*70)
    print("PREPROCESSING NEW DATA FOR PREDICTION")
    print("="*70)
    
    X = df.copy()
    
    print(f"\n[1] Initial Dataset:")
    print(f"   Samples: {len(X):,}")
    print(f"   Features: {X.shape[1]}")
    
    # Store IDs if present (for results file)
    id_columns = {}
    if 'customer_id' in X.columns:
        id_columns['customer_id'] = X['customer_id'].copy()
    if 'application_id' in X.columns:
        id_columns['application_id'] = X['application_id'].copy()
    
    # Drop identifiers and noise (same as training)
    drop_cols = ['customer_id', 'application_id', 'loan_officer_id', 
                 'random_noise_1', 'recent_inquiry_count', 'oldest_credit_line_age']
    X = X.drop([col for col in drop_cols if col in X.columns], axis=1)
    
    # Handle missing values (same as training)
    print(f"\n[2] Handling missing values...")
    for col in X.columns:
        if X[col].isnull().sum() > 0:
            if X[col].dtype in ['float64', 'int64']:
                X[f'{col}_missing'] = X[col].isnull().astype(int)
                X[col].fillna(X[col].median(), inplace=True)
    
    # Cap outliers (same as training)
    print(f"\n[3] Capping outliers...")
    outlier_features = ['loan_amount', 'annual_income', 'total_credit_limit',
                        'revolving_balance', 'annual_debt_payment']
    for col in outlier_features:
        if col in X.columns:
            lower, upper = X[col].quantile([0.01, 0.99])
            X[col] = X[col].clip(lower=lower, upper=upper)
    
    # Feature engineering (same as training)
    print(f"\n[4] Engineering features...")
    
    # Debt features
    if all(c in X.columns for c in ['debt_to_income_ratio', 'payment_to_income_ratio']):
        X['total_debt_burden'] = X['debt_to_income_ratio'] + X['payment_to_income_ratio']
        X['debt_stress'] = (X['debt_to_income_ratio'] > 0.5).astype(int)
    
    # Credit score features
    if 'credit_score' in X.columns:
        X['credit_score_norm'] = (X['credit_score'] - 300) / 550
        X['poor_credit'] = (X['credit_score'] < 650).astype(int)
        X['excellent_credit'] = (X['credit_score'] > 750).astype(int)
    
    # Delinquency features
    if 'num_delinquencies_2yrs' in X.columns:
        X['has_delinquency'] = (X['num_delinquencies_2yrs'] > 0).astype(int)
    
    # Utilization
    if 'credit_utilization' in X.columns:
        X['high_utilization'] = (X['credit_utilization'] > 0.75).astype(int)
    
    # Income adequacy
    if 'annual_income' in X.columns and 'loan_amount' in X.columns:
        X['loan_to_income'] = X['loan_amount'] / (X['annual_income'] + 1)
    
    # Employment
    if 'employment_length' in X.columns:
        X['stable_employment'] = (X['employment_length'] >= 3).astype(int)
    
    # Account age
    if 'oldest_account_age_months' in X.columns:
        X['thin_credit_file'] = (X['oldest_account_age_months'] < 24).astype(int)
    
    # Inquiries
    if 'num_inquiries_6mo' in X.columns:
        X['excessive_inquiries'] = (X['num_inquiries_6mo'] > 4).astype(int)
    
    # Interest rate
    if 'interest_rate' in X.columns:
        X['subprime_rate'] = (X['interest_rate'] > 15).astype(int)
    
    # Convert categorical features to 'category' dtype (same as training)
    for col in cat_features_from_training:
        if col in X.columns:
            X[col] = X[col].astype('category')
    
    print(f"\n‚úÖ Preprocessing complete!")
    print(f"   Final features: {X.shape[1]}")
    
    return X, id_columns


def predict_and_save(model, new_data_path, output_path='results.csv', 
                     cat_features=None, threshold=0.5):
    """
    Predict on new data and save results
    
    Parameters:
    -----------
    model : CatBoostClassifier
        Trained CatBoost model
    new_data_path : str
        Path to new data CSV file
    output_path : str
        Path to save results CSV (default: 'results.csv')
    cat_features : list
        List of categorical feature names from training
    threshold : float
        Classification threshold (default: 0.5)
    
    Returns:
    --------
    results_df : pd.DataFrame
        DataFrame with predictions
    """
    print("\n" + "="*70)
    print("CATBOOST PREDICTION PIPELINE")
    print("="*70)
    
    # Load new data
    print(f"\n[1] Loading new data from: {new_data_path}")
    new_df = pd.read_csv(new_data_path)
    print(f"   Loaded {len(new_df):,} samples")
    
    # Preprocess new data (same as training)
    X_new, id_columns = preprocess_new_data(new_df, cat_features or [])
    
    # Make predictions
    print(f"\n[2] Generating predictions...")
    
    # Predicted probabilities
    y_pred_proba = model.predict_proba(X_new)[:, 1]
    
    # Predicted classes
    y_pred_class = model.predict(X_new)
    
    # Alternative: use custom threshold
    y_pred_class_custom = (y_pred_proba >= threshold).astype(int)
    
    print(f"   ‚úÖ Predictions generated!")
    print(f"   Predicted default rate: {y_pred_class.mean():.2%}")
    
    # Create results DataFrame
    print(f"\n[3] Creating results file...")
    
    results_df = pd.DataFrame()
    
    # Add IDs if available
    for id_col, id_values in id_columns.items():
        results_df[id_col] = id_values.values
    
    # Add predictions
    results_df['predicted_probability'] = y_pred_proba
    results_df['predicted_default'] = y_pred_class
    results_df['predicted_default_custom'] = y_pred_class_custom
    
    # Add risk categories
    results_df['risk_category'] = pd.cut(
        y_pred_proba,
        bins=[0, 0.25, 0.5, 0.75, 1.0],
        labels=['Low Risk', 'Medium Risk', 'High Risk', 'Very High Risk']
    )
    
    # Save to CSV
    results_df.to_csv(output_path, index=False)
    print(f"   ‚úÖ Results saved to: {output_path}")
    
    # Summary statistics
    print(f"\n[4] Prediction Summary:")
    print(f"   Total predictions:     {len(results_df):,}")
    print(f"   Predicted defaults:    {y_pred_class.sum():,} ({y_pred_class.mean():.2%})")
    print(f"   Predicted non-default: {(1-y_pred_class).sum():,} ({(1-y_pred_class).mean():.2%})")
    print(f"\n   Risk Distribution:")
    print(results_df['risk_category'].value_counts().to_string())
    
    print(f"\n   Probability Statistics:")
    print(f"   Min:    {y_pred_proba.min():.4f}")
    print(f"   25th:   {np.percentile(y_pred_proba, 25):.4f}")
    print(f"   Median: {np.median(y_pred_proba):.4f}")
    print(f"   75th:   {np.percentile(y_pred_proba, 75):.4f}")
    print(f"   Max:    {y_pred_proba.max():.4f}")
    
    print("\n" + "="*70)
    print("PREDICTION COMPLETE!")
    print("="*70)
    
    return results_df


# ==================== USAGE EXAMPLE ====================

# After training your model with the original code:
# model, importance, cv_results = main_catboost_pipeline(data)

# Get categorical features from training
# (you need to save these from the training pipeline)

# Example usage:
"""
# Load training data and train model
train_data = pd.read_csv('data/final.csv')
model, importance, cv_results = main_catboost_pipeline(train_data)

# Get categorical features (from training preprocessing)
from your_training_script import preprocess_for_catboost
_, _, cat_features = preprocess_for_catboost(train_data)

# Predict on new data
results = predict_and_save(
    model=model,
    new_data_path='data/new_data.csv',
    output_path='results.csv',
    cat_features=cat_features,
    threshold=0.5  # Adjust threshold as needed
)

# View first few predictions
print(results.head(10))
"""


# ==================== COMPLETE PIPELINE ====================

def complete_pipeline_with_prediction(train_data_path, test_data_path, output_path='results.csv'):
    """
    Complete pipeline: Train model and predict on new data
    
    Parameters:
    -----------
    train_data_path : str
        Path to training data CSV
    test_data_path : str
        Path to test data CSV for prediction
    output_path : str
        Path to save predictions
    """
    from your_training_script import main_catboost_pipeline, preprocess_for_catboost
    
    print("\n" + "="*70)
    print("COMPLETE CATBOOST PIPELINE")
    print("="*70)
    
    # STEP 1: Load and train
    print("\n[STEP 1] Training model...")
    train_df = pd.read_csv(train_data_path)
    model, importance, cv_results = main_catboost_pipeline(train_df)
    
    # STEP 2: Get categorical features
    print("\n[STEP 2] Extracting categorical features...")
    _, _, cat_features = preprocess_for_catboost(train_df)
    
    # STEP 3: Predict on new data
    print("\n[STEP 3] Predicting on new data...")
    results = predict_and_save(
        model=model,
        new_data_path=test_data_path,
        output_path=output_path,
        cat_features=cat_features
    )
    
    print("\n‚úÖ Pipeline complete! Results saved to:", output_path)
    
    return model, results


# ==================== SIMPLE USAGE ====================

# If you already have the trained model:
results = predict_and_save(
    model=model,  # Your trained model
    new_data_path='new_data.csv',
    output_path='results.csv',
    cat_features=cat_features  # From training
)


In [22]:
def preprocess_for_catboost(df):
    """
    CatBoost-specific preprocessing
    Advantage: CatBoost handles categorical features natively!
    """
    print("="*70)
    print("PREPROCESSING FOR CATBOOST")
    print("="*70)
    
    # Separate target
    X = df
    
    print(f"\n[1] Initial Dataset:")
    print(f"   Samples: {len(X):,}")
    print(f"   Features: {X.shape[1]}")
    # print(f"   Default rate: {y.mean():.2%}")
    
    # Drop identifiers and noise
    drop_cols = ['customer_id', 'application_id', 'loan_officer_id', 
                 'random_noise_1', 'recent_inquiry_count', 'oldest_credit_line_age']
    X = X.drop([col for col in drop_cols if col in X.columns], axis=1)
    
    # Handle missing values
    print(f"\n[2] Handling missing values...")
    for col in X.columns:
        if X[col].isnull().sum() > 0:
            if X[col].dtype in ['float64', 'int64']:
                X[f'{col}_missing'] = X[col].isnull().astype(int)
                X[col].fillna(X[col].median(), inplace=True)
    
    # Cap outliers
    print(f"\n[3] Capping outliers...")
    outlier_features = ['loan_amount', 'annual_income', 'total_credit_limit',
                        'revolving_balance', 'annual_debt_payment']
    for col in outlier_features:
        if col in X.columns:
            lower, upper = X[col].quantile([0.01, 0.99])
            X[col] = X[col].clip(lower=lower, upper=upper)
    
    # Feature engineering
    print(f"\n[4] Engineering features...")
    
    # Debt features
    if all(c in X.columns for c in ['debt_to_income_ratio', 'payment_to_income_ratio']):
        X['total_debt_burden'] = X['debt_to_income_ratio'] + X['payment_to_income_ratio']
        X['debt_stress'] = (X['debt_to_income_ratio'] > 0.5).astype(int)
    
    # Credit score features
    if 'credit_score' in X.columns:
        X['credit_score_norm'] = (X['credit_score'] - 300) / 550
        X['poor_credit'] = (X['credit_score'] < 650).astype(int)
        X['excellent_credit'] = (X['credit_score'] > 750).astype(int)
    
    # Delinquency features
    if 'num_delinquencies_2yrs' in X.columns:
        X['has_delinquency'] = (X['num_delinquencies_2yrs'] > 0).astype(int)
    
    # Utilization
    if 'credit_utilization' in X.columns:
        X['high_utilization'] = (X['credit_utilization'] > 0.75).astype(int)
    
    # Income adequacy
    if 'annual_income' in X.columns and 'loan_amount' in X.columns:
        X['loan_to_income'] = X['loan_amount'] / (X['annual_income'] + 1)
    
    # Employment
    if 'employment_length' in X.columns:
        X['stable_employment'] = (X['employment_length'] >= 3).astype(int)
    
    # Account age
    if 'oldest_account_age_months' in X.columns:
        X['thin_credit_file'] = (X['oldest_account_age_months'] < 24).astype(int)
    
    # Inquiries
    if 'num_inquiries_6mo' in X.columns:
        X['excessive_inquiries'] = (X['num_inquiries_6mo'] > 4).astype(int)
    
    # Interest rate
    if 'interest_rate' in X.columns:
        X['subprime_rate'] = (X['interest_rate'] > 15).astype(int)
    
    # Identify categorical features for CatBoost
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    
    # CatBoost handles categoricals, so DON'T one-hot encode!
    # Just convert to 'category' dtype
    for col in categorical_features:
        X[col] = X[col].astype('category')
    
    print(f"\n‚úÖ Preprocessing complete!")
    print(f"   Final features: {X.shape[1]}")
    print(f"   Categorical features: {len(categorical_features)}")
    print(f"   Categorical columns: {categorical_features}")
    
    return X, categorical_features

In [23]:
final_test = pd.read_csv('/Users/izzatillo_khazratov/Desktop/cbu-coding-challenge/data/evaluation_set/final_test.csv')
X, test_cat = preprocess_for_catboost(final_test)

PREPROCESSING FOR CATBOOST

[1] Initial Dataset:
   Samples: 10,001
   Features: 67

[2] Handling missing values...

[3] Capping outliers...

[4] Engineering features...

‚úÖ Preprocessing complete!
   Final features: 79
   Categorical features: 0
   Categorical columns: []


In [46]:
default = model.predict(X).tolist()

In [33]:
l = model.predict_proba(X).tolist()

In [42]:
output=[]
for i in l:
    output.append(round(max(i), 5))

In [43]:
max(output)

0.52727

In [38]:
test_df = pd.read_csv('/Users/izzatillo_khazratov/Desktop/cbu-coding-challenge/data/evaluation_set/test.csv')

In [44]:
ids = test_df['customer_id'].copy()

In [48]:
pd.DataFrame({'customer_id': ids, 'prob': output, 'default': default}).to_csv('/Users/izzatillo_khazratov/Desktop/cbu-coding-challenge/results.csv', index=False)

In [13]:
"""
Gradient Boosting Credit Default Prediction Pipeline
Includes: Sklearn GradientBoosting, XGBoost, LightGBM, CatBoost
Target: 80%+ AUC score
"""

import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# ==================== PREPROCESSING ====================

def preprocess_for_gradient_boosting(df, target='default'):
    """
    Preprocessing for Gradient Boosting models
    """
    print("="*70)
    print("PREPROCESSING FOR GRADIENT BOOSTING")
    print("="*70)
    
    # Separate target
    y = df[target].copy()
    X = df.drop(target, axis=1)
    
    print(f"\n[1] Initial Dataset:")
    print(f"   Samples: {len(X):,}")
    print(f"   Features: {X.shape[1]}")
    print(f"   Default rate: {y.mean():.2%}")
    print(f"   Imbalance ratio: {(1-y.mean())/y.mean():.1f}:1")
    
    # Drop identifiers and noise
    drop_cols = ['customer_id', 'application_id', 'loan_officer_id', 
                 'random_noise_1', 'recent_inquiry_count', 'oldest_credit_line_age',
                 'previous_zip_code', 'marketing_campaign', 'referral_code']
    X = X.drop([col for col in drop_cols if col in X.columns], axis=1)
    print(f"\n[2] Dropped {len([c for c in drop_cols if c in df.columns])} ID/noise columns")
    
    # Handle missing values
    print(f"\n[3] Handling missing values...")
    missing_count = 0
    for col in X.columns:
        if X[col].isnull().sum() > 0:
            if X[col].dtype in ['float64', 'int64']:
                X[f'{col}_missing'] = X[col].isnull().astype(int)
                X[col].fillna(X[col].median(), inplace=True)
                missing_count += 1
    print(f"   Handled {missing_count} features with missing values")
    
    # Cap outliers
    print(f"\n[4] Capping outliers at 1st and 99th percentiles...")
    outlier_features = ['loan_amount', 'annual_income', 'total_credit_limit',
                        'revolving_balance', 'annual_debt_payment', 'monthly_free_cash_flow']
    for col in outlier_features:
        if col in X.columns:
            lower, upper = X[col].quantile([0.01, 0.99])
            X[col] = X[col].clip(lower=lower, upper=upper)
    
    # Feature engineering
    print(f"\n[5] Engineering features...")
    engineered_count = 0
    
    # Debt burden features
    if all(c in X.columns for c in ['debt_to_income_ratio', 'payment_to_income_ratio']):
        X['total_debt_burden'] = X['debt_to_income_ratio'] + X['payment_to_income_ratio']
        X['debt_stress'] = (X['debt_to_income_ratio'] > 0.5).astype(int)
        engineered_count += 2
    
    # Credit score features
    if 'credit_score' in X.columns:
        X['credit_score_norm'] = (X['credit_score'] - 300) / 550
        X['poor_credit'] = (X['credit_score'] < 650).astype(int)
        X['excellent_credit'] = (X['credit_score'] > 750).astype(int)
        X['credit_tier'] = pd.cut(X['credit_score'], 
                                   bins=[0, 580, 670, 740, 850],
                                   labels=[0, 1, 2, 3]).astype(int)  # Convert to int
        engineered_count += 4
    
    # Delinquency features
    if 'num_delinquencies_2yrs' in X.columns:
        X['has_delinquency'] = (X['num_delinquencies_2yrs'] > 0).astype(int)
        X['multiple_delinquencies'] = (X['num_delinquencies_2yrs'] > 1).astype(int)
        engineered_count += 2
    
    # Credit utilization
    if 'credit_utilization' in X.columns:
        X['high_utilization'] = (X['credit_utilization'] > 0.75).astype(int)
        X['very_low_utilization'] = (X['credit_utilization'] < 0.1).astype(int)
        engineered_count += 2
    
    # Income and loan features
    if 'annual_income' in X.columns and 'loan_amount' in X.columns:
        X['loan_to_income'] = X['loan_amount'] / (X['annual_income'] + 1)
        X['high_loan_burden'] = (X['loan_to_income'] > 3).astype(int)
        engineered_count += 2
    
    # Employment stability
    if 'employment_length' in X.columns:
        X['stable_employment'] = (X['employment_length'] >= 3).astype(int)
        X['new_employment'] = (X['employment_length'] < 1).astype(int)
        engineered_count += 2
    
    # Account age
    if 'oldest_account_age_months' in X.columns:
        X['account_age_years'] = X['oldest_account_age_months'] / 12
        X['thin_credit_file'] = (X['oldest_account_age_months'] < 24).astype(int)
        X['mature_credit'] = (X['oldest_account_age_months'] >= 60).astype(int)
        engineered_count += 3
    
    # Inquiry features
    if 'num_inquiries_6mo' in X.columns:
        X['credit_shopping'] = (X['num_inquiries_6mo'] > 2).astype(int)
        X['excessive_inquiries'] = (X['num_inquiries_6mo'] > 4).astype(int)
        engineered_count += 2
    
    # Interest rate
    if 'interest_rate' in X.columns:
        X['subprime_rate'] = (X['interest_rate'] > 15).astype(int)
        X['prime_rate'] = (X['interest_rate'] < 8).astype(int)
        engineered_count += 2
    
    # Collections and public records
    if 'num_collections' in X.columns:
        X['has_collections'] = (X['num_collections'] > 0).astype(int)
        engineered_count += 1
    
    if 'num_public_records' in X.columns:
        X['has_public_records'] = (X['num_public_records'] > 0).astype(int)
        engineered_count += 1
    
    print(f"   Created {engineered_count} new features")
    
    # Encode categorical variables
    print(f"\n[6] Encoding categorical variables...")
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    
    # One-hot encode low cardinality
    low_card = [col for col in categorical_cols if X[col].nunique() < 10]
    if low_card:
        X = pd.get_dummies(X, columns=low_card, drop_first=True, dtype=int)
        print(f"   One-hot encoded: {low_card}")
    
    # Frequency encode high cardinality
    high_card = [col for col in categorical_cols if X[col].nunique() >= 10]
    for col in high_card:
        freq_map = X[col].value_counts(normalize=True).to_dict()
        X[f'{col}_frequency'] = X[col].map(freq_map)
        X = X.drop(col, axis=1)
        print(f"   Frequency encoded: {col}")
    
    # Remove redundant features
    redundant = ['monthly_income']
    X = X.drop([col for col in redundant if col in X.columns], axis=1)
    
    print(f"\n‚úÖ Preprocessing complete!")
    print(f"   Final features: {X.shape[1]}")
    print(f"   Ready for modeling")
    
    return X, y

# ==================== SKLEARN GRADIENT BOOSTING ====================

def train_sklearn_gb(X_train, y_train, X_test, y_test):
    """
    Train Sklearn's GradientBoostingClassifier
    """
    print("\n" + "="*70)
    print("SKLEARN GRADIENT BOOSTING")
    print("="*70)
    
    # Calculate sample weights for imbalance
    n_samples = len(y_train)
    n_defaults = (y_train == 1).sum()
    n_no_defaults = (y_train == 0).sum()
    
    sample_weights = np.ones(n_samples)
    sample_weights[y_train == 1] = n_samples / (2 * n_defaults)
    sample_weights[y_train == 0] = n_samples / (2 * n_no_defaults)
    
    print(f"\nUsing sample weights to handle imbalance")
    print(f"   Default weight: {sample_weights[y_train == 1][0]:.2f}")
    print(f"   No default weight: {sample_weights[y_train == 0][0]:.2f}")
    
    # Model parameters
    model = GradientBoostingClassifier(
        n_estimators=500,           # Number of boosting stages
        learning_rate=0.05,         # Learning rate
        max_depth=7,                # Max depth of trees
        min_samples_split=20,       # Min samples to split node
        min_samples_leaf=10,        # Min samples per leaf
        subsample=0.8,              # Fraction of samples for training each tree
        max_features='sqrt',        # Number of features for best split
        validation_fraction=0.1,    # Fraction for early stopping
        n_iter_no_change=50,        # Early stopping rounds
        random_state=42,
        verbose=1
    )
    
    print(f"\n[1] Training Sklearn GradientBoosting...")
    model.fit(X_train, y_train, sample_weight=sample_weights)
    
    print(f"   Training stopped at iteration: {model.n_estimators_}")
    
    # Evaluate
    print(f"\n[2] Model Evaluation:")
    
    y_train_pred = model.predict_proba(X_train)[:, 1]
    y_test_pred = model.predict_proba(X_test)[:, 1]
    
    train_auc = roc_auc_score(y_train, y_train_pred)
    test_auc = roc_auc_score(y_test, y_test_pred)
    
    print(f"   Training AUC:   {train_auc:.4f}")
    print(f"   Test AUC:       {test_auc:.4f}")
    print(f"   Overfitting:    {train_auc - test_auc:.4f}")
    
    # Competition scoring
    if test_auc >= 0.85:
        grade = "üèÜ EXCELLENT (25/25 points)"
    elif test_auc >= 0.80:
        grade = "‚úÖ VERY GOOD (23-24/25 points)"
    elif test_auc >= 0.75:
        grade = "‚úÖ GOOD (20-22/25 points)"
    elif test_auc >= 0.60:
        grade = "‚ö†Ô∏è  PASSING (15-19/25 points)"
    else:
        grade = "‚ùå DISQUALIFIED (<15 points)"
    
    print(f"\n   Competition Score: {grade}")
    
    return model, test_auc

# ==================== CROSS-VALIDATION ====================

def cross_validate_gb(X, y):
    """
    5-fold stratified cross-validation
    """
    print("\n" + "="*70)
    print("5-FOLD CROSS-VALIDATION")
    print("="*70)
    
    model = GradientBoostingClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=7,
        subsample=0.8,
        validation_fraction=0.1,
        n_iter_no_change=50,
        random_state=42,
        verbose=0
    )
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    print(f"\nPerforming 5-fold stratified cross-validation...")
    cv_scores = cross_val_score(
        model, X, y,
        cv=skf,
        scoring='roc_auc',
        n_jobs=-1
    )
    
    print(f"\nCV AUC Scores by Fold:")
    for i, score in enumerate(cv_scores, 1):
        print(f"   Fold {i}: {score:.4f}")
    
    print(f"\n   Mean CV AUC: {cv_scores.mean():.4f}")
    print(f"   Std Dev:     {cv_scores.std():.4f}")
    print(f"   Min AUC:     {cv_scores.min():.4f}")
    print(f"   Max AUC:     {cv_scores.max():.4f}")
    print(f"   95% CI:      [{cv_scores.mean() - 1.96*cv_scores.std():.4f}, "
          f"{cv_scores.mean() + 1.96*cv_scores.std():.4f}]")
    
    return cv_scores

# ==================== FEATURE IMPORTANCE ====================

def plot_feature_importance(model, X, top_n=20):
    """
    Display feature importance
    """
    print("\n" + "="*70)
    print(f"TOP {top_n} MOST IMPORTANT FEATURES")
    print("="*70)
    
    if hasattr(model, 'feature_importances_'):
        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        print(f"\n{feature_importance.head(top_n).to_string(index=False)}")
        
        return feature_importance
    else:
        print("\nModel doesn't have feature_importances_ attribute")
        return None

# ==================== ENSEMBLE OF GRADIENT BOOSTERS ====================

def train_ensemble_gb(X_train, y_train, X_test, y_test):
    """
    Ensemble of multiple gradient boosting algorithms
    """
    print("\n" + "="*70)
    print("ENSEMBLE: XGBoost + LightGBM + CatBoost + Sklearn GB")
    print("="*70)
    
    from xgboost import XGBClassifier
    from lightgbm import LGBMClassifier
    from catboost import CatBoostClassifier
    
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    
    # Identify categorical features for CatBoost
    cat_features = X_train.select_dtypes(include=['category']).columns.tolist()
    
    print(f"\n[1] Training individual models...")
    
    # XGBoost
    print("\n   Training XGBoost...")
    xgb_model = XGBClassifier(
        n_estimators=500,
        learning_rate=0.03,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=scale_pos_weight,
        early_stopping_rounds=50,
        random_state=42,
        eval_metric='auc'
    )
    xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    xgb_pred = xgb_model.predict_proba(X_test)[:, 1]
    xgb_auc = roc_auc_score(y_test, xgb_pred)
    print(f"   XGBoost AUC: {xgb_auc:.4f}")
    
    # LightGBM
    print("\n   Training LightGBM...")
    lgb_model = LGBMClassifier(
        n_estimators=500,
        learning_rate=0.03,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.8,
        class_weight='balanced',
        random_state=42,
        verbose=-1
    )
    lgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], 
                  callbacks=[])
    lgb_pred = lgb_model.predict_proba(X_test)[:, 1]
    lgb_auc = roc_auc_score(y_test, lgb_pred)
    print(f"   LightGBM AUC: {lgb_auc:.4f}")
    
    # CatBoost
    print("\n   Training CatBoost...")
    cat_model = CatBoostClassifier(
        iterations=500,
        learning_rate=0.03,
        depth=7,
        auto_class_weights='Balanced',
        random_seed=42,
        verbose=False
    )
    cat_model.fit(X_train, y_train, eval_set=(X_test, y_test), 
                  use_best_model=True, cat_features=cat_features)
    cat_pred = cat_model.predict_proba(X_test)[:, 1]
    cat_auc = roc_auc_score(y_test, cat_pred)
    print(f"   CatBoost AUC: {cat_auc:.4f}")
    
    # Sklearn GB
    print("\n   Training Sklearn GradientBoosting...")
    gb_model = GradientBoostingClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=7,
        subsample=0.8,
        random_state=42,
        verbose=0
    )
    gb_model.fit(X_train, y_train)
    gb_pred = gb_model.predict_proba(X_test)[:, 1]
    gb_auc = roc_auc_score(y_test, gb_pred)
    print(f"   Sklearn GB AUC: {gb_auc:.4f}")
    
    # Ensemble predictions (weighted average)
    print(f"\n[2] Creating ensemble...")
    
    # Weight by performance
    total_auc = xgb_auc + lgb_auc + cat_auc + gb_auc
    w_xgb = xgb_auc / total_auc
    w_lgb = lgb_auc / total_auc
    w_cat = cat_auc / total_auc
    w_gb = gb_auc / total_auc
    
    ensemble_pred = (w_xgb * xgb_pred + 
                     w_lgb * lgb_pred + 
                     w_cat * cat_pred + 
                     w_gb * gb_pred)
    
    ensemble_auc = roc_auc_score(y_test, ensemble_pred)
    
    print(f"\n[3] Ensemble Results:")
    print(f"   Weights: XGB={w_xgb:.3f}, LGB={w_lgb:.3f}, CAT={w_cat:.3f}, GB={w_gb:.3f}")
    print(f"   Ensemble AUC: {ensemble_auc:.4f}")
    print(f"   Improvement: {ensemble_auc - max(xgb_auc, lgb_auc, cat_auc, gb_auc):+.4f}")
    
    models = {
        'xgboost': xgb_model,
        'lightgbm': lgb_model,
        'catboost': cat_model,
        'sklearn_gb': gb_model
    }
    
    return models, ensemble_auc

# ==================== MAIN PIPELINE ====================

def main_gradient_boosting_pipeline(df):
    """
    Complete Gradient Boosting pipeline
    """
    print("\n" + "="*70)
    print("GRADIENT BOOSTING CREDIT DEFAULT PREDICTION")
    print("="*70)
    
    # Preprocess
    X, y = preprocess_for_gradient_boosting(df)
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    
    print(f"\n" + "="*70)
    print("DATA SPLIT")
    print("="*70)
    print(f"\nTraining: {len(X_train):,} samples ({y_train.mean():.2%} default)")
    print(f"Test:     {len(X_test):,} samples ({y_test.mean():.2%} default)")
    
    # Train Sklearn GradientBoosting
    model, test_auc = train_sklearn_gb(X_train, y_train, X_test, y_test)
    
    # Cross-validation
    cv_scores = cross_validate_gb(X_train, y_train)
    
    # Feature importance
    importance_df = plot_feature_importance(model, X_train, top_n=20)
    
    # Train ensemble
    ensemble_models, ensemble_auc = train_ensemble_gb(X_train, y_train, X_test, y_test)
    
    # Final summary
    print("\n" + "="*70)
    print("FINAL SUMMARY")
    print("="*70)
    print(f"\n‚úÖ Sklearn GB Test AUC:  {test_auc:.4f}")
    print(f"‚úÖ Ensemble AUC:         {ensemble_auc:.4f}")
    print(f"‚úÖ Mean CV AUC:          {cv_scores.mean():.4f} (¬±{cv_scores.std():.4f})")
    print(f"‚úÖ Total Features:       {X_train.shape[1]}")
    
    best_auc = max(test_auc, ensemble_auc)
    if best_auc >= 0.80:
        print(f"\nüéØ TARGET ACHIEVED! Ready for competition.")
    else:
        print(f"\n‚ö†Ô∏è  AUC below 80%. Try hyperparameter tuning.")
    
    return model, ensemble_models, importance_df

# ==================== USAGE ====================

# Load data

# Run complete pipeline
data = pd.read_csv('/Users/izzatillo_khazratov/Desktop/cbu-coding-challenge/data/final.csv')

model, ensemble_models, importance = main_gradient_boosting_pipeline(data)
"""
# Make predictions with ensemble
X_new_preprocessed, _ = preprocess_for_gradient_boosting(X_new)
predictions = (
    ensemble_models['xgboost'].predict_proba(X_new_preprocessed)[:, 1] * 0.3 +
    ensemble_models['lightgbm'].predict_proba(X_new_preprocessed)[:, 1] * 0.3 +
    ensemble_models['catboost'].predict_proba(X_new_preprocessed)[:, 1] * 0.3 +
    ensemble_models['sklearn_gb'].predict_proba(X_new_preprocessed)[:, 1] * 0.1
)
"""



GRADIENT BOOSTING CREDIT DEFAULT PREDICTION
PREPROCESSING FOR GRADIENT BOOSTING

[1] Initial Dataset:
   Samples: 89,999
   Features: 67
   Default rate: 5.10%
   Imbalance ratio: 18.6:1

[2] Dropped 0 ID/noise columns

[3] Handling missing values...
   Handled 0 features with missing values

[4] Capping outliers at 1st and 99th percentiles...

[5] Engineering features...
   Created 23 new features

[6] Encoding categorical variables...

‚úÖ Preprocessing complete!
   Final features: 89
   Ready for modeling

DATA SPLIT

Training: 71,999 samples (5.10% default)
Test:     18,000 samples (5.11% default)

SKLEARN GRADIENT BOOSTING

Using sample weights to handle imbalance
   Default weight: 9.80
   No default weight: 0.53

[1] Training Sklearn GradientBoosting...
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.3578           0.0240           54.50s
         2           1.3329           0.0232           52.40s
         3           1.3101           0.0

"\n# Make predictions with ensemble\nX_new_preprocessed, _ = preprocess_for_gradient_boosting(X_new)\npredictions = (\n    ensemble_models['xgboost'].predict_proba(X_new_preprocessed)[:, 1] * 0.3 +\n    ensemble_models['lightgbm'].predict_proba(X_new_preprocessed)[:, 1] * 0.3 +\n    ensemble_models['catboost'].predict_proba(X_new_preprocessed)[:, 1] * 0.3 +\n    ensemble_models['sklearn_gb'].predict_proba(X_new_preprocessed)[:, 1] * 0.1\n)\n"