In [1]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting numpy>=1.19.5 (from scikit-learn)
  Using cached numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp39-cp39-macosx_12_0_arm64.whl (11.1 MB)
Using cached joblib-1.5.2-py3-none-any.whl (308 kB)
Using cached numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl (5.3 MB)
Using cached scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl (30.3 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, numpy, joblib, scipy, scikit-learn
[2K   [90m━━━━━━━━━━━

In [2]:
# Baseline for the credit-approval data: load the file, build a shared preprocessing
# recipe, and fit/evaluate a logistic-regression pipeline on a stratified 80/20 split.
# Names defined here (Pipeline, preprocess, X_train/X_test, y_train/y_test, the metric
# functions) are reused by the model cells further down.

# --- Imports (grouped at the top of the cell) --------------------------------
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# --- Load --------------------------------------------------------------------
# The file is read without a header row, and "?" entries are parsed as missing values.
df = pd.read_csv("../datasets/cc_approvals.data", header=None, na_values=["?"])

# Name the 16 columns A1..A16 (raises if the file does not have exactly 16 columns).
df.columns = [f"A{i}" for i in range(1, 17)]

# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
df.info()
print(df.isna().sum())
print(df.head())

# --- Features / target -------------------------------------------------------
X = df.drop(columns=['A16'])
y_raw = df['A16']

# Encode the target: '+' -> 1, '-' -> 0.
y = y_raw.map({'+': 1, '-': 0})

# Series.map turns anything outside the dict (including missing labels) into NaN.
# Fail here with a clear message rather than deep inside the split / fit calls.
if y.isna().any():
    bad_labels = y_raw[y.isna()].unique().tolist()
    raise ValueError(f"Unexpected or missing target labels in A16: {bad_labels}")

print(y.value_counts())

# Columns that should hold numbers; anything unparseable is coerced to NaN and is
# later filled by the median imputer.
numeric_like = ["A2", "A3", "A8", "A11", "A14", "A15"]
X[numeric_like] = X[numeric_like].apply(pd.to_numeric, errors="coerce")

# Object-dtype columns are treated as categorical, every other column as numeric.
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = [c for c in X.columns if c not in cat_cols]

# --- Preprocessing -----------------------------------------------------------
# Numeric columns: median imputation, then standardisation.
numeric_preprocess = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: most-frequent imputation, then dense one-hot encoding.
# handle_unknown="ignore" keeps predict() from failing on categories unseen in training.
cat_preprocess = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_preprocess, num_cols),
        ("cat", cat_preprocess, cat_cols),
    ],
    remainder="drop",
)

# --- Model -------------------------------------------------------------------
modelLR = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=200)),
])

# Stratified 80/20 split with a fixed seed so the reported numbers are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

modelLR.fit(X_train, y_train)

# --- Evaluation --------------------------------------------------------------
print("\n=== Train/Test accuracy (quick sanity check) ===")
print("Train:", modelLR.score(X_train, y_train))
print("Test :", modelLR.score(X_test, y_test))

y_pred = modelLR.predict(X_test)
y_proba = modelLR.predict_proba(X_test)[:, 1]  # probability of class 1 ('+')

print(classification_report(y_test, y_pred, digits=3))
print("ROC AUC:", roc_auc_score(y_test, y_proba))


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Random-forest model on the same train/test split and preprocessing recipe as the
# logistic-regression baseline. Relies on Pipeline, preprocess, X_train/X_test and
# y_train/y_test defined in an earlier cell.
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# A Pipeline fits its steps in place, so passing `preprocess` directly would make this
# model share (and refit) the very transformer object used by the other pipelines.
# clone() gives this pipeline its own unfitted copy with identical settings.
rf_model = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("clf", RandomForestClassifier(
        n_estimators=300,         # number of trees
        max_depth=None,           # no depth cap; growth is limited by min_samples_split
        min_samples_split=5,      # nodes with fewer than 5 samples are not split further
        class_weight="balanced",  # reweight classes inversely to their frequency
        random_state=42,
        n_jobs=-1                 # use all available cores
    ))
])

rf_model.fit(X_train, y_train)

print("\n=== Random Forest ===")
print("Train:", rf_model.score(X_train, y_train))
print("Test :", rf_model.score(X_test, y_test))

y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]  # probability of class 1

print(classification_report(y_test, y_pred_rf, digits=3))
print("ROC AUC:", roc_auc_score(y_test, y_proba_rf))



=== Random Forest ===
Train: 0.9655797101449275
Test : 0.9130434782608695
              precision    recall  f1-score   support

           0      0.911     0.935     0.923        77
           1      0.915     0.885     0.900        61

    accuracy                          0.913       138
   macro avg      0.913     0.910     0.912       138
weighted avg      0.913     0.913     0.913       138

ROC AUC: 0.9661486054928679


In [None]:
# Gradient-boosting model (scikit-learn's GradientBoostingClassifier, not the xgboost
# library) on the same train/test split and preprocessing recipe as the other models.
# Relies on Pipeline, preprocess, X_train/X_test, y_train/y_test, classification_report
# and roc_auc_score defined in earlier cells.
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier

# A Pipeline fits its steps in place, so passing `preprocess` directly would make this
# model share (and refit) the very transformer object used by the other pipelines.
# clone() gives this pipeline its own unfitted copy with identical settings.
gb_model = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("clf", GradientBoostingClassifier(
        n_estimators=300,     # number of boosting stages
        learning_rate=0.05,   # shrinkage applied to each stage's contribution
        max_depth=3,          # depth of the individual trees
        random_state=42
    ))
])

gb_model.fit(X_train, y_train)

# The header used to read "XGBoost", which mislabelled the results: no xgboost model
# is trained in this cell.
print("\n=== Gradient Boosting ===")
print("Train:", gb_model.score(X_train, y_train))
print("Test :", gb_model.score(X_test, y_test))

y_pred_gb = gb_model.predict(X_test)
y_proba_gb = gb_model.predict_proba(X_test)[:, 1]  # probability of class 1

# Backward-compatible aliases for the original (misleading) variable names, in case
# other cells still refer to them.
y_pred_xgb = y_pred_gb
y_proba_xgb = y_proba_gb

print(classification_report(y_test, y_pred_gb, digits=3))
print("ROC AUC:", roc_auc_score(y_test, y_proba_gb))



=== XGBoost ===
Train: 0.9909420289855072
Test : 0.8913043478260869
              precision    recall  f1-score   support

           0      0.908     0.896     0.902        77
           1      0.871     0.885     0.878        61

    accuracy                          0.891       138
   macro avg      0.889     0.891     0.890       138
weighted avg      0.892     0.891     0.891       138

ROC AUC: 0.9510325739833936
