In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import optuna
from lightgbm import LGBMClassifier
from isotree import IsolationForest
import sys
sys.path.append("../cfmining")

from utils import OutlierWrap

import os
import joblib

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed shared by the data splits and the model's random_state.
SEED = 0
# Fraction of the full dataset held out as the test set (first split).
TEST_RATIO = 0.3
# Fraction of the remaining training rows held out for validation (second
# split): 1/7 of the 70% left after the test split, i.e. 10% of all rows.
VAL_RATIO = 1 / 7

In [3]:
# Search spaces keyed by model class name. Each inner entry maps a constructor
# argument to the spec read by `objective`: numeric ranges carry low/high (and
# optionally log=True for log-scale sampling), while fixed values are encoded
# as single-choice categoricals so they are still forwarded to the model.
hyperparam_spaces = {
    "LGBMClassifier": dict(
        n_estimators=dict(low=5, high=250, type="int"),
        learning_rate=dict(low=0.05, high=1.0, type="float"),
        max_depth=dict(low=2, high=12, type="int"),
        colsample_bytree=dict(low=0.1, high=1.0, type="float"),
        reg_alpha=dict(low=1e-3, high=1e3, log=True, type="float"),
        verbose=dict(choices=[-1], type="categorical"),
        random_state=dict(choices=[SEED], type="categorical"),
    )
}

In [4]:
# Raise optuna's log level to WARNING so messages below that level are not printed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective(
    trial,
    hyperparams,
    X_train,
    Y_train,
    X_val,
    Y_val
    ):
    """Optuna objective: sample hyperparameters, fit an LGBMClassifier, score it.

    Parameters
    ----------
    trial
        Object exposing ``suggest_categorical``, ``suggest_int`` and
        ``suggest_float`` (an optuna trial in this notebook).
    hyperparams : dict
        Maps each parameter name to its search-space spec. A spec containing
        ``"choices"`` is sampled as a categorical. Otherwise ``"type"`` must be
        ``"int"`` or ``"float"`` with ``"low"``/``"high"`` bounds; float specs
        may additionally set ``"log": True``.
    X_train, Y_train
        Data passed to ``model.fit``.
    X_val, Y_val
        Data passed to ``model.score``.

    Returns
    -------
    The value of ``model.score(X_val, Y_val)`` for the fitted model.

    Raises
    ------
    ValueError
        If a spec has no ``"choices"`` and its ``"type"`` is missing or is
        neither ``"int"`` nor ``"float"``.
    """
    params = {}
    for name, spec in hyperparams.items():
        if "choices" in spec:
            params[name] = trial.suggest_categorical(name, spec["choices"])
            continue

        kind = spec.get("type")
        if kind == "int":
            params[name] = trial.suggest_int(name, spec["low"], spec["high"])
        elif kind == "float":
            params[name] = trial.suggest_float(
                name, spec["low"], spec["high"], log=spec.get("log", False)
            )
        else:
            # Fail loudly: silently skipping the entry would train the model
            # with a default value for a parameter the caller meant to tune.
            raise ValueError(
                f"Unsupported hyperparameter spec for {name!r}: "
                f"expected 'choices' or type 'int'/'float', got type {kind!r}"
            )

    model = LGBMClassifier(**params)
    model.fit(X_train, Y_train)
    return model.score(X_val, Y_val)

## German

In [5]:
# Create the output directory for this dataset's models; exist_ok=True keeps
# the cell from failing when the directory is already there.
os.makedirs("../models/german", exist_ok=True)

In [6]:
# Load the dataset: "GoodCustomer" is the target, every other column is a feature.
df = pd.read_csv("../data/german.csv")
Y = df["GoodCustomer"]
X = df.drop(columns="GoodCustomer")

# First split: hold out TEST_RATIO of all rows as the test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=TEST_RATIO,
    shuffle=True,
    random_state=SEED,
)
# Second split: carve VAL_RATIO of the remaining training rows out for validation.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=VAL_RATIO,
    shuffle=True,
    random_state=SEED,
)

In [7]:
# Seed the sampler explicitly: without it the hyperparameter search draws from
# an unseeded RNG, so the best trial (and the model saved from it) changes on
# every Restart & Run All even though the data splits and the classifier are
# seeded with SEED. TPESampler is the sampler create_study uses by default.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)

# Maximise the validation score returned by `objective` over 100 trials.
# n_jobs=1 runs the trials sequentially, which the seeded sampler needs in
# order to produce the same sequence of suggestions on every run.
study.optimize(
    lambda trial: objective(trial, hyperparam_spaces["LGBMClassifier"], X_train, Y_train, X_val, Y_val),
    n_trials=100,
    n_jobs=1,
    show_progress_bar=True,
)

  0%|          | 0/100 [00:00<?, ?it/s]

Best trial: 5. Best value: 0.75: 100%|██████████| 100/100 [00:05<00:00, 19.42it/s]


In [8]:
# Refit on the training split with the best hyperparameters found by the study.
params = study.best_params
model = LGBMClassifier(**params)
model.fit(X_train, Y_train)

# Report the score on the data the model was fitted on and on the held-out
# test split; each label and its value are printed on separate lines.
print("Score on training set:", model.score(X_train, Y_train), sep="\n")
print("Score on test set:", model.score(X_test, Y_test), sep="\n")

Score on training set:
1.0
Score on test set:
0.6833333333333333


In [9]:
# Persist the fitted classifier to disk with joblib.
joblib.dump(model, "../models/german/LGBMClassifier.pkl")

['../models/german/LGBMClassifier.pkl']

In [10]:
# Fit an isolation forest on the training features only.
iso_forest = IsolationForest(
    ndim=1,
    sample_size=256,
    max_depth=8,
    ntrees=100,
    missing_action="divide",
)
iso_forest.fit(X_train)

# Wrap the fitted forest in the project's OutlierWrap helper.
# NOTE(review): 0.6 is presumably the outlier-score threshold — confirm
# against utils.OutlierWrap.
outlier_detection = OutlierWrap(iso_forest, 0.6)

In [15]:
# Count how many training rows receive each predicted label.
# NOTE(review): presumably -1 marks outliers and 1 inliers — confirm against
# utils.OutlierWrap.
np.unique(outlier_detection.predict(X_train), return_counts=True)

(array([-1,  1]), array([  9, 591]))

In [16]:
# Persist the wrapped outlier detector to disk with joblib, alongside the classifier.
joblib.dump(outlier_detection, "../models/german/IsolationForest.pkl")

['../models/german/IsolationForest.pkl']