In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, FunctionTransformer

pd.options.display.max_columns = 500

df = pd.read_csv("../data/credit.csv")
df_val = pd.read_csv("../data/credit2.csv")

X, y = df.loc[:, ~df.columns.isin(['bad_loan'])], df['bad_loan']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2345, stratify=y
)

# save model training / test
pd.concat([X_train, y_train]).to_csv("../models/artifacts/training_data.csv", index=False)
pd.concat([X_test, y_test]).to_csv("../models/artifacts/test_data.csv", index=False)

# clean up categorical fields
ordinal_features = ["account_status", "savings", "employment"]
oh_features = ["credit_history","purpose", "personal_status_and_sex", "other_debtors", "property", "other_installments", "housing", "job", "telephone", "foreign_worker"]

# Pipeline
# processor = ColumnTransformer([
    # ("ordinal_transform", OrdinalEncoder(), ordinal_features),
    # ("oh_transformer", OneHotEncoder(), oh_features),
    # ("credit_amount_transform", FunctionTransformer(lambda x: np.log1p(x)), ["credit_amount"])],
    # remainder="passthrough"
# )

# pipeline = Pipeline([
    # ("process", processor),
# ])

# pipeline.fit_transform(X_train)

X_train.shape

(480, 20)

In [64]:
# Training set up --------------------------------------------------------------
ord_enc = OrdinalEncoder()
oh_enc = OneHotEncoder(drop="first")
log_trans = FunctionTransformer(np.log1p)

# Transformer ordinal features
X_train[ordinal_features] = ord_enc.fit_transform(X_train[ordinal_features])
# Log transform credit amount
X_train["credit_amount"] = log_trans.fit_transform(X_train["credit_amount"])

# fit one hot encoded features
oh_enc.fit(X_train[oh_features])
oh_enc_features = pd.DataFrame(oh_enc.transform(X_train[oh_features]).toarray(), columns=oh_enc.get_feature_names_out(), index=X_train.index)
X_train = pd.concat([X_train, oh_enc_features], axis=1)
X_train.drop(columns=oh_features, inplace=True)


# Test set up ------------------------------------------------------
# Transformer ordinal features
X_test[ordinal_features] = ord_enc.transform(X_test[ordinal_features])
# Log transform credit amount
X_test["credit_amount"] = log_trans.transform(X_test["credit_amount"])

# fit one hot encoded features
oh_enc_features = pd.DataFrame(oh_enc.transform(X_test[oh_features]).toarray(), columns=oh_enc.get_feature_names_out(), index=X_test.index)
X_test = pd.concat([X_test, oh_enc_features], axis=1)
X_test.drop(columns=oh_features, inplace=True)

In [53]:
X_train.shape

(480, 40)

In [54]:
X_test.shape

(120, 40)

In [70]:
this

0    336
1    144
Name: bad_loan, dtype: int64

In [62]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics


# check models
models = {
    "xgb": XGBClassifier(),
    "logisitc": LogisticRegression(max_iter=10_000),
    "rf": RandomForestClassifier(),
    "svm": SVC(),
    "knn": KNeighborsClassifier(n_neighbors=9)
}

results = {}

for name, model in models.items():
    results[name] = {}
    try:
        model.fit(X_train, y_train)
        results[name]["model"] = model
        # get predictions
        y_pred = model.predict(X_test)
        results[name]["accuracy"] = metrics.accuracy_score(y_test, y_pred)
        results[name]["recall"] = metrics.recall_score(y_test, y_pred)
        results[name]["precision"] = metrics.precision_score(y_test, y_pred)
        results[name]["auc"] = metrics.roc_auc_score(y_test, y_pred)
    except:
        print(name)
        raise

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


In [61]:
results

{'xgb': {'accuracy': 0.725,
  'recall': 0.4444444444444444,
  'precision': 0.5517241379310345,
  'auc': 0.6448412698412699},
 'logisitc': {'accuracy': 0.725,
  'recall': 0.3888888888888889,
  'precision': 0.56,
  'auc': 0.628968253968254},
 'rf': {'accuracy': 0.7083333333333334,
  'recall': 0.2777777777777778,
  'precision': 0.5263157894736842,
  'auc': 0.5853174603174602},
 'svm': {'accuracy': 0.7, 'recall': 0.0, 'precision': 0.0, 'auc': 0.5},
 'knn': {'accuracy': 0.7083333333333334,
  'recall': 0.25,
  'precision': 0.5294117647058824,
  'auc': 0.5773809523809523}}