In [1]:
# ------------------------------------------------------------------------------
# 1. Environment Setup and Imports
# ------------------------------------------------------------------------------

from dotenv import load_dotenv
import os
import sys
import logging
import warnings

# Load environment variables from the .env file
load_dotenv(dotenv_path='.env')

# System & OS utilities
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Add the project source directory if needed
sys.path.append(os.getenv('SRC_DIR'))

# Scikit-learn configuration: Force transformers to return DataFrames
from sklearn import set_config
set_config(transform_output="pandas")

# Scikit-learn: Preprocessing, Pipelines, Model Selection, and Metrics
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import (
    train_test_split, 
    GridSearchCV, 
    ParameterGrid
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin

# For parallel processing and progress monitoring
from tqdm import tqdm
from joblib import Parallel, delayed

# For model persistence
import joblib

# Optionally, set warnings to show by default
warnings.filterwarnings("default")

# ------------------------------------------------------------------------------
# 2. Logging Environment Information
# ------------------------------------------------------------------------------

log_dir = os.getenv('LOG_DIR')
db_url = os.getenv('DB_URL')
tickers_file = os.getenv('TICKERS')

print(f"Log Directory: {log_dir}")
print(f"Tickers File: {tickers_file}")
print(f"Database URL: {db_url}")
print("Current working directory:", os.getcwd())
print("DEBUG: DB_URL =", os.getenv("DB_URL"))

# ------------------------------------------------------------------------------
# 3. Data Loading and Preprocessing
# ------------------------------------------------------------------------------

# Load the GiveMeSomeCredit dataset
ft_path = os.getenv("CREDIT_DATA")
df_raw = pd.read_csv(ft_path)

# Rename columns and create additional features
df = df_raw.drop(columns=["Unnamed: 0"]).rename(
    columns={
        'SeriousDlqin2yrs': 'delinquency',
        'RevolvingUtilizationOfUnsecuredLines': 'revolving_unsecured_line_utilization',
        'age': 'age',
        'NumberOfTime30-59DaysPastDueNotWorse': 'num_30_59_days_late',
        'DebtRatio': 'debt_ratio',
        'MonthlyIncome': 'monthly_income',
        'NumberOfOpenCreditLinesAndLoans': 'num_open_credit_loans',
        'NumberOfTimes90DaysLate': 'num_90_days_late',
        'NumberRealEstateLoansOrLines': 'num_real_estate_loans',
        'NumberOfTime60-89DaysPastDueNotWorse': 'num_60_89_days_late',
        'NumberOfDependents': 'num_dependents'
    }
).assign(
    high_debt_ratio=lambda x: (x['debt_ratio'] > 1) * 1,
    missing_monthly_income=lambda x: x['monthly_income'].isna() * 1,
    missing_num_dependents=lambda x: x['num_dependents'].isna() * 1
)

# Define feature matrix X and target variable Y
X = df.drop(columns='delinquency')
Y = df['delinquency']

# Split the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
# Ensure they remain DataFrames
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

# Define numerical columns for transformation
num_cols = [
    'revolving_unsecured_line_utilization', 'age',
    'num_30_59_days_late', 'debt_ratio', 'monthly_income',
    'num_open_credit_loans', 'num_90_days_late', 'num_real_estate_loans',
    'num_60_89_days_late', 'num_dependents'
]

# Build a simple numeric preprocessing pipeline
pipe_num_simple = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('standardizer', StandardScaler())
])

# Create a ColumnTransformer that applies the numeric pipeline to num_cols
ctransform_simple = ColumnTransformer(
    transformers=[('numeric_simple', pipe_num_simple, num_cols)],
    remainder='passthrough',
    force_int_remainder_cols=False
)

# ------------------------------------------------------------------------------
# 4. Pipeline Construction
# ------------------------------------------------------------------------------

# Create a pipeline for Logistic Regression
pipe_lr = Pipeline([
    ("scaler", StandardScaler()),
    ("preprocess", ctransform_simple),
    ("clf", LogisticRegression(max_iter=500, tol=1e-4))
])

print("Pipeline:")
print(pipe_lr)

# ------------------------------------------------------------------------------
# 5. Construct a Refined Hyperparameter Grid
# ------------------------------------------------------------------------------

# Define separate grids to avoid l1_ratio warnings and improve convergence.
# Grid 1: For lbfgs, newton-cg, sag (which support only l2 penalty)
grid1 = {
    "clf__solver": ["lbfgs", "newton-cg", "sag"],
    "clf__penalty": ["l2"],
    "clf__C": [0.001, 0.01, 0.1, 0.5, 1.0],
    "clf__max_iter": [500]
}

# Grid 2: For liblinear (supports l1 and l2 without l1_ratio)
grid2 = {
    "clf__solver": ["liblinear"],
    "clf__penalty": ["l1", "l2"],
    "clf__C": [0.001, 0.01, 0.1, 0.5, 1.0],
    "clf__max_iter": [500]
}

# Grid 3: For saga with l1 and l2 penalties (without l1_ratio)
grid3 = {
    "clf__solver": ["saga"],
    "clf__penalty": ["l1", "l2"],
    "clf__C": [0.001, 0.01, 0.1, 0.5, 1.0],
    "clf__max_iter": [500]
}

# Grid 4: For saga with elasticnet penalty (requires l1_ratio)
grid4 = {
    "clf__solver": ["saga"],
    "clf__penalty": ["elasticnet"],
    "clf__C": [0.001, 0.01, 0.1, 0.5, 1.0],
    "clf__l1_ratio": [0.1, 0.5, 0.9],
    "clf__max_iter": [500]
}

# Combine grids into one list
param_grid = [grid1, grid2, grid3, grid4]

# Define scoring metrics (multiple metrics are computed)
scoring = ['neg_log_loss', 'roc_auc', 'f1', 'accuracy', 'precision', 'recall']

# ------------------------------------------------------------------------------
# 6. Grid Search with Parallel Processing and Verbose Output
# ------------------------------------------------------------------------------

grid_cv = GridSearchCV(
    estimator=pipe_lr,
    param_grid=param_grid,
    scoring=scoring,
    cv=5,
    refit="neg_log_loss",
    n_jobs=-1,
    verbose=3  # Built-in progress updates
)

print("\nStarting Grid Search...")
grid_cv.fit(X_train, Y_train)

# ------------------------------------------------------------------------------
# 7. Results and Evaluation
# ------------------------------------------------------------------------------

# Convert CV results to DataFrame
results_df = pd.DataFrame(grid_cv.cv_results_)
print("\nAvailable Columns in Results:")
print(results_df.columns)

# Optionally, filter out failed fits (if any nan scores)
results_df_filtered = results_df.dropna(subset=["mean_test_neg_log_loss"])
print("\nTop 5 Best Results (by rank_test_neg_log_loss):")
print(results_df_filtered.nsmallest(5, "rank_test_neg_log_loss"))

print("\nBest Parameters:")
print(grid_cv.best_params_)

# Evaluate the best model on the test set
Y_pred = grid_cv.best_estimator_.predict(X_test)
print("\nClassification Report on Test Set:")
print(classification_report(Y_test, Y_pred))
print("ROC AUC Score:", roc_auc_score(Y_test, grid_cv.best_estimator_.predict_proba(X_test)[:, 1]))

# ------------------------------------------------------------------------------
# 8. Save the Best Model and Results
# ------------------------------------------------------------------------------

joblib.dump(grid_cv.best_estimator_, os.path.join(log_dir, "best_model.pkl"))
results_df.to_csv(os.path.join(log_dir, "grid_search_results.csv"), index=False)

print("\nModel and results saved.")


Log Directory: ../../07_logs/
Tickers File: ../../05_src/data/tickers/sp500_wiki.csv
Database URL: postgresql://postgres:HumanAfterAll@localhost:5432/model_db
Current working directory: c:\Users\AC\Documents\GitHub\Programs\UofT-DSI\production\01_materials\labs
DEBUG: DB_URL = postgresql://postgres:HumanAfterAll@localhost:5432/model_db
Pipeline:
Pipeline(steps=[('scaler', StandardScaler()),
                ('preprocess',
                 ColumnTransformer(force_int_remainder_cols=False,
                                   remainder='passthrough',
                                   transformers=[('numeric_simple',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardizer',
                                                                   StandardScaler())]),
                             