# Training Classifier Models and Outlier Detection

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import optuna
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import sys
sys.path.append("../")

import cfmining.models as models
from cfmining.outlier_detector import *
from cfmining.datasets import *
import matplotlib.pyplot as plt

import os
import joblib


## General Configs

In [5]:
# --- Data splitting ---------------------------------------------------------
# Fraction of the full dataset held out as the test split.
TEST_RATIO = 0.5
# Fraction of the remaining (non-test) data held out as the validation split.
VAL_RATIO = 0.1

# --- Reproducibility --------------------------------------------------------
# Seed shared by the train/val/test splits and the models' ``random_state``.
SEED = 0

# --- Hyperparameter search budget -------------------------------------------
# Number of Optuna trials run for each model.
n_trials = 50
# Value passed as ``n_jobs`` to ``study.optimize`` (parallel trial workers).
n_jobs = 8

In [3]:
def _choice(*options):
    """Return a categorical spec whose candidate values are ``options``."""
    return {"choices": list(options), "type": "categorical"}


def _span(kind, low, high, **extras):
    """Return a numeric spec of type ``kind`` ("int" or "float").

    ``low``/``high`` are the bounds of the range; ``extras`` carries any
    optional keys of the spec (e.g. ``log`` or ``step``).
    """
    return {"low": low, "high": high, "type": kind, **extras}


# Search space of every model, keyed by model name and then by the name of
# the constructor argument being tuned. Parameters with a single choice (or
# with low == high) are effectively constants that are still routed through
# the search so they end up in the study's best parameters.
hyperparam_spaces = {
    "LogisticRegression": {
        "C": _span("float", 1e-5, 1, log=True),
        "class_weight": _choice("balanced"),
        "random_state": _choice(SEED),
        "max_iter": _span("int", 1000, 1000),
    },
    "LGBMClassifier": {
        "n_estimators": _span("int", 5, 250),
        "learning_rate": _span("float", 0.05, 1.0),
        "max_depth": _span("int", 2, 12),
        "colsample_bytree": _span("float", 0.1, 1.0),
        "reg_lambda": _span("float", 1e-2, 1e4, log=True),
        "verbose": _choice(-1),
        "random_state": _choice(SEED),
    },
    "MLPClassifier": {
        "hidden_layer_sizes": _choice(
            [30],
            [30, 30],
            [30, 30, 30],
            [64],
            [30, 64],
        ),
        "learning_rate_init": _span("float", 1e-5, 1e-3, log=True),
        "weight_decay": _span("float", 1e-5, 1e-3, log=True),
        "epochs": _span("int", 10, 100, step=10),
        "class_weight": _choice("balanced"),
        "batch_size": _span("int", 128, 128),
        "random_state": _choice(SEED),
    },
}

In [4]:
# Restrict Optuna's logger to ERROR level so per-trial messages are not printed.
optuna.logging.set_verbosity(optuna.logging.ERROR)

def get_model(X, model_name, categorical_features, params):
    """Build an unfitted preprocessing + classifier pipeline.

    Columns of ``X`` listed in ``categorical_features`` are one-hot encoded
    (dense output); every other column is standardized. The classifier
    selected by ``model_name`` is appended as the final pipeline step.

    Parameters
    ----------
    X : object exposing ``.columns`` (e.g. a pandas DataFrame)
        Only the column names are read; no data is fitted here.
    model_name : str
        One of "LGBMClassifier", "MLPClassifier" or "LogisticRegression".
    categorical_features : sequence
        Names of the columns to one-hot encode.
    params : dict
        Keyword arguments forwarded to the classifier constructor.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with steps "preprocess" and "classifier".

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    supported = ("LGBMClassifier", "MLPClassifier", "LogisticRegression")
    # Fail fast: without this check an unknown name would yield a pipeline
    # with no classifier step, which only breaks later at predict time.
    if model_name not in supported:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of {supported}"
        )

    num_features = [col for col in X.columns if col not in categorical_features]
    preprocess = ColumnTransformer([
        ("num", StandardScaler(), num_features),
        ("cat", OneHotEncoder(sparse_output=False), categorical_features),
    ])

    if model_name == "LGBMClassifier":
        classifier = LGBMClassifier(**params)
    elif model_name == "MLPClassifier":
        classifier = models.MLPClassifier(**params)
    else:  # "LogisticRegression"
        classifier = LogisticRegression(**params)

    return Pipeline([
        ("preprocess", preprocess),
        ("classifier", classifier),
    ])


def objective(
    trial,
    hyperparams,
    X_train,
    Y_train,
    X_val,
    Y_val,
    model_name="LGBMClassifier",
    categorical_features=None,
):
    """Optuna objective: sample hyperparameters, fit, score on validation.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample one value for every entry of ``hyperparams``.
    hyperparams : dict
        Maps a parameter name to its spec. A spec has a ``"type"`` of
        "categorical" (with ``"choices"``), or "int" / "float" (with
        ``"low"`` and ``"high"`` and optional ``"log"`` / ``"step"``).
    X_train, Y_train
        Data the model is fitted on.
    X_val, Y_val
        Data the model is scored on.
    model_name : str
        Name forwarded to ``get_model``.
    categorical_features : sequence, optional
        Names of the categorical columns; defaults to no categorical columns.

    Returns
    -------
    float
        Balanced accuracy of the fitted model on the validation data.

    Raises
    ------
    ValueError
        If a spec declares a ``"type"`` other than the three supported ones.
    """
    # Avoid a mutable default argument shared across calls.
    if categorical_features is None:
        categorical_features = []

    params = {}
    for name, spec in hyperparams.items():
        kind = spec["type"]
        if kind == "categorical":
            params[name] = trial.suggest_categorical(name, spec["choices"])
        elif kind == "int":
            # Honor the optional "step"/"log" keys of the spec (Optuna's own
            # defaults are step=1, log=False).
            params[name] = trial.suggest_int(
                name,
                spec["low"],
                spec["high"],
                step=spec.get("step", 1),
                log=spec.get("log", False),
            )
        elif kind == "float":
            params[name] = trial.suggest_float(
                name,
                spec["low"],
                spec["high"],
                step=spec.get("step"),
                log=spec.get("log", False),
            )
        else:
            # Silently skipping would drop the hyperparameter from the model.
            raise ValueError(
                f"Unknown hyperparameter type {kind!r} for parameter {name!r}"
            )

    model = get_model(X_train, model_name, categorical_features, params)
    model.fit(X_train, Y_train)

    Y_val_pred = model.predict(X_val)
    return balanced_accuracy_score(Y_val, Y_val_pred)

In [8]:
# For every dataset: split the data, tune and persist each classifier in
# ``model_list``, then fit and persist the outlier detectors (one fitted on
# the training split and one on the test split for each detector type).
model_list = [
    "LogisticRegression",
    "LGBMClassifier",
    "MLPClassifier",
]

for dataset_name in ["german", "german_cat", "taiwan", "taiwan_cat", "adult", "adult_cat"]:

    #### LOAD DATA
    os.makedirs(f"results/{dataset_name}", exist_ok=True)
    # Every artifact below is dumped under ../models/<dataset_name>; create it
    # as well, otherwise joblib.dump fails when the directory does not exist.
    models_dir = f"../models/{dataset_name}"
    os.makedirs(models_dir, exist_ok=True)

    # The "_cat" suffix selects the categorical variant of the same dataset.
    cat = "_cat" in dataset_name

    dataset = DATASETS_[dataset_name.replace("_cat", "")](use_categorical=cat)
    X, Y = dataset.load_data()

    # First carve out the test split, then take the validation split from
    # what remains of the training data.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=TEST_RATIO, random_state=SEED, shuffle=True
    )
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_train, Y_train, test_size=VAL_RATIO, random_state=SEED, shuffle=True
    )

    # reset index
    X_train = X_train.reset_index(drop=True)
    X_val = X_val.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    Y_train = Y_train.reset_index(drop=True)
    Y_val = Y_val.reset_index(drop=True)
    Y_test = Y_test.reset_index(drop=True)

    #### HYPERPARAMETER OPTIMIZATION OF EACH MODEL
    for model_name in model_list:
        # Seed the sampler like every other random component of this script.
        # NOTE: with n_jobs > 1 trials run concurrently, so the search is
        # still not exactly reproducible; use n_jobs = 1 for that.
        study = optuna.create_study(
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=SEED),
        )
        study.optimize(
            lambda trial: objective(
                trial,
                hyperparam_spaces[model_name],
                X_train,
                Y_train,
                X_val,
                Y_val,
                model_name,
                dataset.categoric_features,
            ),
            n_trials=n_trials,
            # The MLP is tuned with a single worker; the others use n_jobs.
            n_jobs=n_jobs if model_name != "MLPClassifier" else 1,
            show_progress_bar=True,
        )

        # Refit on the training split with the best hyperparameters found.
        params = study.best_params
        model = get_model(X_train, model_name, dataset.categoric_features, params)

        model.fit(X_train, Y_train)
        Y_train_pred = model.predict(X_train)
        Y_test_pred = model.predict(X_test)

        # Label the report so the scores can be attributed to a run.
        print(f"Dataset: {dataset_name} | Model: {model_name}")
        # Presumably predictions are 0/1 with 0 meaning "denied" - TODO confirm.
        print(f"Number of denied samples from test: {(1 - Y_test_pred).sum():.0f}")
        print(f"Score  training: {balanced_accuracy_score(Y_train, Y_train_pred):.3f}")
        print(f"Score validation: {study.best_value:.3f}")
        print(f"Score test: {balanced_accuracy_score(Y_test, Y_test_pred):.3f}")

        joblib.dump(model, f"{models_dir}/{model_name}.pkl")

    #### SAVE ISOLATION FOREST
    outlier_detection = IsolationForest(
        contamination=dataset.outlier_contamination,
        ndim=1,
        sample_size=0.25,
        max_depth="auto",
        ntrees=100,
        missing_action="divide",
    )
    outlier_detection.fit(X_train)
    joblib.dump(outlier_detection, f"{models_dir}/IsolationForest.pkl")

    # NOTE(review): the detector fitted on the test split uses ndim=2 and no
    # missing_action, unlike the training one (ndim=1, missing_action="divide")
    # - confirm this difference is intentional.
    outlier_detection = IsolationForest(
        contamination=dataset.outlier_contamination,
        ndim=2,
        sample_size=0.25,
        max_depth="auto",
        ntrees=100,
    )
    outlier_detection.fit(X_test)
    joblib.dump(outlier_detection, f"{models_dir}/IsolationForest_test.pkl")

    #### SAVE AE OUTLIER DETECTOR
    # Same configuration fitted once per split; only the data and the file
    # name suffix differ.
    for split_data, suffix in ((X_train, ""), (X_test, "_test")):
        outlier_detection = AE_OutlierDetector(
            categoric_features=dataset.categoric_features,
            contamination=dataset.outlier_contamination,
            hidden_layer_sizes=[128, 64, 64, 64],
        )
        outlier_detection.fit(split_data)
        joblib.dump(outlier_detection, f"{models_dir}/AE_OutlierDetection{suffix}.pkl")