In [1]:
import pandas as pd
import numpy as np
import joblib
import gc
import kagglehub

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    classification_report, confusion_matrix
)
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the LendingClub dataset via kagglehub; the printed value is the local
# directory that contains the downloaded files.
path = kagglehub.dataset_download("ethon0426/lending-club-20072020q1")
print(path)

# Join with os.path.join instead of a hard-coded "/" so the path is valid on
# every operating system.
file_path = os.path.join(path, "Loan_status_2007-2020Q3.gzip")

ROW_LIMIT = 10000   # Max number of rows to read; set to None to read the whole file

# NOTE(review): compression="infer" lets pandas pick the codec from the file
# extension — confirm that a ".gzip" suffix is treated the way this file needs.
read_kwargs = dict(low_memory=False, compression="infer")
if ROW_LIMIT is not None:
    # Only pass nrows when a limit is requested, so None means "no limit".
    read_kwargs["nrows"] = ROW_LIMIT

df = pd.read_csv(file_path, **read_kwargs)

/Users/abao/.cache/kagglehub/datasets/ethon0426/lending-club-20072020q1/versions/3


In [None]:
# Build the binary label: keep only loans with a final outcome and encode
# "Charged Off" as 1 and "Fully Paid" as 0 in a new "target" column.
TARGET_COL = "loan_status"
FINAL_STATUSES = ["Fully Paid", "Charged Off"]

if TARGET_COL in df.columns:
    # Chained, non-inplace form: each step returns a new frame, so the result
    # does not alias the frame that was read from disk.
    df = (
        df[df[TARGET_COL].isin(FINAL_STATUSES)]
        .assign(target=lambda frame: (frame[TARGET_COL] == "Charged Off").astype(int))
        .drop(columns=[TARGET_COL])
    )
elif "target" not in df.columns:
    # Neither the raw status nor the derived label is available: fail here with
    # a clear message instead of a bare KeyError further down.
    raise KeyError(
        f"Column '{TARGET_COL}' not found and 'target' has not been built yet"
    )
# Otherwise this cell already ran (status column dropped, 'target' present),
# so re-running it is a no-op rather than an error.

In [4]:
# Candidate model inputs, split by how they are preprocessed later on.
numeric_features = [
    "loan_amnt", "int_rate", "installment", "annual_inc", "dti",
    "inq_last_6mths", "open_acc", "pub_rec", "revol_bal", "revol_util",
    "total_acc"
]
categorical_features = [
    "term", "grade", "sub_grade", "emp_length", "home_ownership",
    "verification_status", "purpose", "application_type"
]

# Keep only the columns that are actually present in the loaded frame.
numeric_features = [c for c in numeric_features if c in df.columns]
categorical_features = [c for c in categorical_features if c in df.columns]

# Columns that may arrive as text with a trailing "%" and must become floats.
PERCENT_COLS = ["int_rate", "revol_util"]

for col in PERCENT_COLS:
    # Test for "not numeric" rather than "dtype == object": text columns are
    # not guaranteed to be object-typed (pandas can use a dedicated string
    # dtype), and an already-converted column is skipped either way, which
    # keeps this cell safe to re-run.
    if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
        cleaned = df[col].astype(str).str.strip().str.rstrip("%")
        # Unparseable values (including the text form of missing values)
        # become NaN.
        df[col] = pd.to_numeric(cleaned, errors="coerce")


In [None]:
# Train and evaluate a logistic-regression baseline.
#
# `numeric_features`, `categorical_features` and the percent-string clean-up of
# `df` come from the feature-definition cell directly above; that code used to
# be repeated verbatim here and has been removed to avoid the two copies
# drifting apart.

# --- Tunable settings -------------------------------------------------------
TEST_SIZE = 0.2              # fraction of rows held out for evaluation
RANDOM_STATE = 42            # seed for the train/test split
MAX_ITER = 500               # iteration cap for the lbfgs solver
DECISION_THRESHOLD = 0.5     # probability cut-off for predicting class 1
SAVE_MODEL = False           # set to True to persist the fitted pipeline
MODEL_DIR = "artifacts"
MODEL_PATH = os.path.join(MODEL_DIR, "logreg_lendingclub.joblib")

# --- Feature matrix / label vector -------------------------------------------
X = df[numeric_features + categorical_features].copy()
y = df["target"].values
# The full frame is no longer needed once X and y exist; release it.
# Note that cells reading `df` cannot be re-run after this point without
# reloading the data.
del df
gc.collect()

# Stratify on y so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# --- Preprocessing -----------------------------------------------------------
# Numeric columns: fill gaps with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at predict time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop"
)

# --- Model -------------------------------------------------------------------
# class_weight="balanced" reweights the classes, since the labels are imbalanced.
log_reg = LogisticRegression(
    solver="lbfgs",
    max_iter=MAX_ITER,
    class_weight="balanced"
)

# Preprocessing lives inside the pipeline so it is fitted on the training split
# only and applied unchanged to the test split.
model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", log_reg)
])

model.fit(X_train, y_train)

# --- Evaluation --------------------------------------------------------------
proba = model.predict_proba(X_test)[:, 1]            # P(class 1) per test row
pred = (proba >= DECISION_THRESHOLD).astype(int)

print("\n=== Metrics ===")
print("ROC-AUC: ", roc_auc_score(y_test, proba))
print("PR-AUC:  ", average_precision_score(y_test, proba))
print("Confusion Matrix:\n", confusion_matrix(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred, digits=4))

# --- Optional persistence ----------------------------------------------------
if SAVE_MODEL:
    os.makedirs(MODEL_DIR, exist_ok=True)
    joblib.dump(model, MODEL_PATH)
    print(f"\nModel saved at: {MODEL_PATH}")


=== Metrics ===
ROC-AUC:  0.7195771093531593
PR-AUC:   0.31650533173247436
Confusion Matrix:
 [[1109  577]
 [ 110  204]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9098    0.6578    0.7635      1686
           1     0.2612    0.6497    0.3726       314

    accuracy                         0.6565      2000
   macro avg     0.5855    0.6537    0.5681      2000
weighted avg     0.8079    0.6565    0.7021      2000

