In [3]:
!pip install scikit-learn



In [4]:
import pandas as pd
import numpy as np

# Load the raw file. It has no header row, and "?" is treated as a missing
# value so it becomes NaN at read time.
df = pd.read_csv("../datasets/cc_approvals.data", header=None, na_values=["?"])

# 16 columns, no header row: name them A1..A16 (A16 is used as the target below).
df.columns = [f"A{i}" for i in range(1, 17)]

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()
print(df.isna().sum())
print(df.head())

# Split features from the target column.
X = df.drop(columns=['A16'])
y_raw = df['A16']

# Encode the target labels as integers: '+' -> 1, '-' -> 0.
y = y_raw.map({
    '+': 1,
    '-': 0
})

# Series.map turns every label that is not a key of the dict into NaN. Fail
# loudly here rather than handing NaN targets to the estimators later on.
unmapped = y_raw[y.isna()]
if not unmapped.empty:
    raise ValueError(
        f"Unexpected target labels in A16: {unmapped.astype(str).unique().tolist()}"
    )

# Show the class balance. A bare expression in the middle of a cell displays
# nothing, so the counts are printed explicitly.
print(y.value_counts())

# Columns that should hold numbers. Re-parse each of them so its dtype is
# numeric; any value that cannot be parsed becomes NaN (errors="coerce").
numeric_like = ["A2","A3","A8","A11","A14","A15"]
X = X.assign(**{
    name: pd.to_numeric(X[name], errors="coerce") for name in numeric_like
})

# Whatever is still object-typed is treated as categorical; every remaining
# column is treated as numeric. Column order follows X.columns.
cat_cols = list(X.select_dtypes(include=["object"]).columns)
num_cols = [name for name in X.columns if name not in cat_cols]

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Numeric features: fill missing values with the column median, then scale
# with StandardScaler.
numeric_preprocess = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode into a dense array. handle_unknown="ignore" keeps
# transform from raising on categories that were not present during fit.
cat_preprocess = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Route each column group to its own pipeline; columns listed in neither
# group are dropped.
preprocess = ColumnTransformer(
    [
        ("num", numeric_preprocess, num_cols),
        ("cat", cat_preprocess, cat_cols),
    ],
    remainder="drop",
)

# Baseline model: the shared preprocessing followed by logistic regression.
modelLR = Pipeline([
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=200)),
])

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. stratify=y keeps the class ratio of y
# in both parts, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

modelLR.fit(X_train, y_train)

# Accuracy on both splits; a large gap between them would hint at overfitting.
print("\n=== Train/Test accuracy (quick sanity check) ===")
for split_label, features, target in (
    ("Train:", X_train, y_train),
    ("Test :", X_test, y_test),
):
    print(split_label, modelLR.score(features, target))

from sklearn.metrics import classification_report, roc_auc_score

# Hard labels feed the per-class report. Column 1 of predict_proba is the
# probability of class 1 (the target was encoded as 0/1) and feeds ROC AUC.
y_pred = modelLR.predict(X_test)
y_proba = modelLR.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred, digits=3))
print("ROC AUC:", roc_auc_score(y_test, y_proba))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      678 non-null    object 
 1   A2      678 non-null    float64
 2   A3      690 non-null    float64
 3   A4      684 non-null    object 
 4   A5      684 non-null    object 
 5   A6      681 non-null    object 
 6   A7      681 non-null    object 
 7   A8      690 non-null    float64
 8   A9      690 non-null    object 
 9   A10     690 non-null    object 
 10  A11     690 non-null    int64  
 11  A12     690 non-null    object 
 12  A13     690 non-null    object 
 13  A14     677 non-null    float64
 14  A15     690 non-null    int64  
 15  A16     690 non-null    object 
dtypes: float64(4), int64(2), object(10)
memory usage: 86.4+ KB
None
A1     12
A2     12
A3      0
A4      6
A5      6
A6      9
A7      9
A8      0
A9      0
A10     0
A11     0
A12     0
A13     0
A14    13
A15     0
A16     0


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

from sklearn.base import clone

# Random-forest pipeline.
# Pipeline.fit fits its steps in place (steps are not cloned when no `memory`
# cache is configured). Passing the very same `preprocess` object that another
# pipeline already holds would make both pipelines share, and refit, a single
# transformer. An unfitted clone gives this model its own preprocessing state.
rf_model = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("clf", RandomForestClassifier(
        n_estimators=300,         # number of trees
        max_depth=None,           # no depth cap; growth is limited by min_samples_split
        min_samples_split=5,      # a node needs at least 5 samples to be split further
        class_weight="balanced",  # weight classes inversely to their frequency
        random_state=42,          # fixed seed for reproducible results
        n_jobs=-1                 # use all available cores
    ))
])

rf_model.fit(X_train, y_train)

# Accuracy on both splits as a quick overfitting check.
print("\n=== Random Forest ===")
print("Train:", rf_model.score(X_train, y_train))
print("Test :", rf_model.score(X_test, y_test))

# Hard labels for the per-class report; probability of class 1 for ROC AUC.
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred_rf, digits=3))
print("ROC AUC:", roc_auc_score(y_test, y_proba_rf))



=== Random Forest ===
Train: 0.9655797101449275
Test : 0.9130434782608695
              precision    recall  f1-score   support

           0      0.911     0.935     0.923        77
           1      0.915     0.885     0.900        61

    accuracy                          0.913       138
   macro avg      0.913     0.910     0.912       138
weighted avg      0.913     0.913     0.913       138

ROC AUC: 0.9661486054928679


In [6]:
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier

from sklearn.base import clone

# Gradient-boosting pipeline (scikit-learn's GradientBoostingClassifier; this
# is not the XGBoost library).
# Pipeline.fit fits its steps in place, so an unfitted clone of `preprocess`
# is used to keep this model's preprocessing state separate from any other
# pipeline built on the same object.
gb_model = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("clf", GradientBoostingClassifier(
        n_estimators=300,     # number of boosting stages
        learning_rate=0.05,   # shrinkage applied to each stage's contribution
        max_depth=3,          # depth of each individual tree
        random_state=42       # fixed seed for reproducible results
    ))
])

gb_model.fit(X_train, y_train)

# The header names the estimator that is actually fitted above.
print("\n=== Gradient Boosting ===")
print("Train:", gb_model.score(X_train, y_train))
print("Test :", gb_model.score(X_test, y_test))

# Hard labels for the per-class report; probability of class 1 for ROC AUC.
y_pred_gb = gb_model.predict(X_test)
y_proba_gb = gb_model.predict_proba(X_test)[:,1]

# Backward-compatible aliases: these results were previously stored under
# "_xgb" names, which other cells may still reference.
y_pred_xgb = y_pred_gb
y_proba_xgb = y_proba_gb

print(classification_report(y_test, y_pred_gb, digits=3))
print("ROC AUC:", roc_auc_score(y_test, y_proba_gb))



=== XGBoost ===
Train: 0.9909420289855072
Test : 0.8913043478260869
              precision    recall  f1-score   support

           0      0.908     0.896     0.902        77
           1      0.871     0.885     0.878        61

    accuracy                          0.891       138
   macro avg      0.889     0.891     0.890       138
weighted avg      0.892     0.891     0.891       138

ROC AUC: 0.9510325739833936
