In [127]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import optuna
from lightgbm import LGBMClassifier
from isotree import IsolationForest
from sklearn.metrics import balanced_accuracy_score
import sys
sys.path.append("../")

from cfmining.utils import OutlierWrap

import os
import joblib

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [128]:
# Global experiment configuration.
# SEED is reused for every data split and for the classifier's random_state.
SEED = 0
# Fraction of the full dataset held out as the test split.
TEST_RATIO = 0.5
# Fraction of the *remaining* training portion held out for validation
# (i.e. it is applied after the test split, not to the full dataset).
VAL_RATIO = 0.1

In [129]:
# Search spaces keyed by model class name. Each hyperparameter maps to a spec
# that `objective` turns into an Optuna suggestion:
#   - "choices"       -> categorical (single-element lists pin a fixed value)
#   - "low" / "high"  -> numeric range, "type" selects int vs. float
#   - "log"           -> sample the range on a log scale
hyperparam_spaces = {
    "LGBMClassifier": dict(
        n_estimators=dict(low=5, high=250, type="int"),
        learning_rate=dict(low=0.05, high=1.0, type="float"),
        max_depth=dict(low=2, high=12, type="int"),
        colsample_bytree=dict(low=0.1, high=1.0, type="float"),
        reg_alpha=dict(low=1e-3, high=1e3, log=True, type="float"),
        # Fixed (non-tuned) settings, expressed as one-element categoricals.
        verbose=dict(choices=[-1], type="categorical"),
        random_state=dict(choices=[SEED], type="categorical"),
    ),
}

In [130]:
# Raise Optuna's logging threshold to WARNING for the rest of the notebook.
optuna.logging.set_verbosity(optuna.logging.WARNING)

def _suggest_params(trial, hyperparams):
    """Draw one value per hyperparameter from ``trial`` according to its spec.

    Parameters
    ----------
    trial : object exposing ``suggest_categorical``, ``suggest_int`` and
        ``suggest_float`` (an Optuna trial).
    hyperparams : dict
        Maps a hyperparameter name to a spec dict. A spec containing
        ``"choices"`` is sampled as a categorical; otherwise ``"type"`` must
        be ``"int"`` or ``"float"`` and ``"low"`` / ``"high"`` give the range.
        An optional ``"log"`` flag requests log-scale sampling for both
        numeric types.

    Returns
    -------
    dict
        Hyperparameter name -> sampled value.

    Raises
    ------
    ValueError
        If a spec has no ``"choices"`` and its ``"type"`` is not recognised.
        Failing loudly prevents a typo from silently leaving the
        hyperparameter at the model's default value.
    """
    params = {}
    for name, spec in hyperparams.items():
        # "choices" takes precedence over "type", as in the original dispatch.
        if "choices" in spec:
            params[name] = trial.suggest_categorical(name, spec["choices"])
            continue

        kind = spec.get("type")
        use_log = spec.get("log", False)
        if kind == "int":
            params[name] = trial.suggest_int(name, spec["low"], spec["high"], log=use_log)
        elif kind == "float":
            params[name] = trial.suggest_float(name, spec["low"], spec["high"], log=use_log)
        else:
            raise ValueError(
                f"Unsupported hyperparameter type {kind!r} for '{name}'; "
                "expected 'int', 'float' or a spec with 'choices'."
            )
    return params


def objective(
    trial,
    hyperparams,
    X_train,
    Y_train,
    X_val,
    Y_val
    ):
    """Optuna objective: fit an LGBMClassifier and score it on the validation split.

    Parameters
    ----------
    trial : Optuna trial used to sample the hyperparameters.
    hyperparams : dict
        Search-space description (see ``_suggest_params`` for the spec format).
    X_train, Y_train : training features and labels passed to ``fit``.
    X_val, Y_val : validation features and labels used for scoring.

    Returns
    -------
    float
        Balanced accuracy of the fitted model's predictions on ``X_val``.

    Raises
    ------
    ValueError
        If ``hyperparams`` contains a spec with an unsupported ``"type"``.
    """
    params = _suggest_params(trial, hyperparams)

    model = LGBMClassifier(**params)
    model.fit(X_train, Y_train)
    Y_val_pred = model.predict(X_val)
    return balanced_accuracy_score(Y_val, Y_val_pred)

## German

In [131]:
# Make sure the output directory for the German artefacts exists;
# exist_ok=True keeps the cell safe to re-run.
os.makedirs("../models/german", exist_ok=True)

In [132]:
# Load the German dataset and separate the target column from the features.
df = pd.read_csv("../data/german.csv")
Y = df["GoodCustomer"]
X = df.drop(columns="GoodCustomer")

# Two-stage split: first hold out the test set from the full data, then carve
# the validation set out of what is left for training.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=TEST_RATIO,
    shuffle=True,
    random_state=SEED,
)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=VAL_RATIO,
    shuffle=True,
    random_state=SEED,
)

In [133]:
# Hyperparameter search for the German classifier.
# The sampler is seeded explicitly: without it Optuna draws a fresh random
# seed on every run, so the selected hyperparameters (and the saved model)
# would differ between otherwise identical executions of the notebook.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)

# n_jobs=1 keeps trials sequential, which the seeded sampler needs in order
# to produce the same sequence of suggestions on every run.
study.optimize(
    lambda trial: objective(trial, hyperparam_spaces["LGBMClassifier"], X_train, Y_train, X_val, Y_val),
    n_trials=50,
    n_jobs=1,
    show_progress_bar=True,
)

# Refit on the training split with the best trial's parameters and report
# balanced accuracy on the training and held-out test splits.
params = study.best_params
model = LGBMClassifier(**params)
model.fit(X_train, Y_train)
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)
print(f"Score  training: {balanced_accuracy_score(Y_train, Y_train_pred):.3f} test: {balanced_accuracy_score(Y_test, Y_test_pred):.3f}")

# Persist the fitted classifier.
joblib.dump(model, "../models/german/LGBMClassifier.pkl")

  0%|          | 0/50 [00:00<?, ?it/s]

Score  training: 1.000 test: 0.585


['../models/german/LGBMClassifier.pkl']

In [134]:
# Both detectors share the same IsolationForest configuration; only the data
# they are fitted on differs.
iforest_kwargs = dict(ndim=1, sample_size=256, max_depth=8, ntrees=100, missing_action="divide")

# Detector fitted on the training split.
# 0.6 is OutlierWrap's second positional argument (presumably the outlier
# score threshold; confirm in cfmining.utils).
train_forest = IsolationForest(**iforest_kwargs)
train_forest.fit(X_train)
outlier_detection = OutlierWrap(train_forest, 0.6)
joblib.dump(outlier_detection, "../models/german/IsolationForest.pkl")

# Detector fitted on the test split, saved under a separate "_test" file name.
test_forest = IsolationForest(**iforest_kwargs)
test_forest.fit(X_test)
outlier_detection = OutlierWrap(test_forest, 0.6)
joblib.dump(outlier_detection, "../models/german/IsolationForest_test.pkl")

['../models/german/IsolationForest_test.pkl']

## Taiwan

In [135]:
# Make sure the output directory for the Taiwan artefacts exists;
# exist_ok=True keeps the cell safe to re-run.
os.makedirs("../models/taiwan", exist_ok=True)

In [136]:
# Load the Taiwan dataset and separate the target column from the features.
df = pd.read_csv("../data/taiwan.csv")
Y = df["NoDefaultNextMonth"]
X = df.drop(columns="NoDefaultNextMonth")

# Two-stage split: first hold out the test set from the full data, then carve
# the validation set out of what is left for training.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=TEST_RATIO,
    shuffle=True,
    random_state=SEED,
)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=VAL_RATIO,
    shuffle=True,
    random_state=SEED,
)

In [138]:
# Hyperparameter search for the Taiwan classifier.
# The sampler is seeded explicitly: without it Optuna draws a fresh random
# seed on every run, so the selected hyperparameters (and the saved model)
# would differ between otherwise identical executions of the notebook.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)

# n_jobs=1 keeps trials sequential, which the seeded sampler needs in order
# to produce the same sequence of suggestions on every run.
study.optimize(
    lambda trial: objective(trial, hyperparam_spaces["LGBMClassifier"], X_train, Y_train, X_val, Y_val),
    n_trials=50,
    n_jobs=1,
    show_progress_bar=True,
)

# Refit on the training split with the best trial's parameters and report
# balanced accuracy on the training and held-out test splits.
params = study.best_params
model = LGBMClassifier(**params)
model.fit(X_train, Y_train)
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)
print(f"Score  training: {balanced_accuracy_score(Y_train, Y_train_pred):.3f} test: {balanced_accuracy_score(Y_test, Y_test_pred):.3f}")

# Persist the fitted classifier.
joblib.dump(model, "../models/taiwan/LGBMClassifier.pkl")

  0%|          | 0/50 [00:00<?, ?it/s]

Score  training: 0.702 test: 0.637


['../models/taiwan/LGBMClassifier.pkl']

In [139]:
# Both detectors share the same IsolationForest configuration; only the data
# they are fitted on differs.
iforest_kwargs = dict(ndim=1, sample_size=256, max_depth=8, ntrees=100, missing_action="divide")

# Detector fitted on the training split.
# 0.6 is OutlierWrap's second positional argument (presumably the outlier
# score threshold; confirm in cfmining.utils).
train_forest = IsolationForest(**iforest_kwargs)
train_forest.fit(X_train)
outlier_detection = OutlierWrap(train_forest, 0.6)
joblib.dump(outlier_detection, "../models/taiwan/IsolationForest.pkl")

# Detector fitted on the test split, saved under a separate "_test" file name.
test_forest = IsolationForest(**iforest_kwargs)
test_forest.fit(X_test)
outlier_detection = OutlierWrap(test_forest, 0.6)
joblib.dump(outlier_detection, "../models/taiwan/IsolationForest_test.pkl")

['../models/taiwan/IsolationForest_test.pkl']