In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import optuna
from lightgbm import LGBMClassifier
from isotree import IsolationForest
from sklearn.metrics import balanced_accuracy_score
import sys
sys.path.append("../")

from cfmining.utils import OutlierWrap

import os
import joblib

%load_ext autoreload
%autoreload 2

In [119]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.base import BaseEstimator, ClassifierMixin

class MLPClassifier(BaseEstimator, ClassifierMixin):
    """MLPClassifier in the Sklearn API using PyTorch.
    It mimics the MLPClassifier from Sklearn, but it uses PyTorch to train the model.
    The extra functionalities are the possibility to use class weights and sample weights.

    Parameters
    ----------
    hidden_layer_sizes : tuple, optional
            List of hidden layer sizes as a tuple with has n_layers-2 elements, by default (100,)
        batch_size : int, optional
            Size of batch for training, by default 32
        learning_rate_init : float, optional
            Initial learning rate, by default 0.1
        learning_rate_decay_rate : float, optional
            Decay rate of learning rate, equal to 1 to constant learning rate, by default 0.1
        alpha : float, optional
            Weight of L2 regularization, by default 0.0001
        epochs : int, optional
            Number of epochs to train model, by default 100
        class_weight : string, optional
            If want to use class weights in the loss, pass the value "balanced", by default None
        random_state : int, optional
            Random seed, by default None
    """

    def __init__(
        self,
        hidden_layer_sizes=(100,),
        batch_size=32,
        learning_rate_init=0.1,
        learning_rate_decay_rate=0.1,
        alpha=0.0001,
        epochs=100,
        class_weight=None,
        random_state=None,
    ):
        self._random_state = random_state
        self._seed_everything(random_state)
        self.hidden_layer_sizes = hidden_layer_sizes
        self.batch_size = batch_size
        self.learning_rate_init = learning_rate_init
        self.learning_rate_decay_rate = learning_rate_decay_rate
        self.alpha = alpha
        self.epochs = epochs
        self.class_weight = class_weight

    @property
    def random_state(self):
        return self._random_state

    @random_state.setter
    def random_state(self, value):
        self._random_state = value
        self._seed_everything(value)

    def _seed_everything(self, value):
        if value is not None:
            torch.manual_seed(self.random_state)
            np.random.seed(self.random_state)

    def set_model(self, input_dim):
        layers = []
        prev_size = input_dim
        for layer_size in self.hidden_layer_sizes:
            layers.append(nn.Linear(prev_size, layer_size))
            layers.append(nn.ReLU())
            prev_size = layer_size
        layers.append(nn.Linear(prev_size, 1))
        layers.append(nn.Sigmoid())
        model = nn.Sequential(*layers)
        return model

    def fit(self, X, y, sample_weight=None):
        if self.class_weight == "balanced":
            class_counts = np.bincount(y)
            class_weights = torch.tensor([1 / class_counts[i] for i in range(len(class_counts))], dtype=torch.float)
            if torch.cuda.is_available():
                class_weights = class_weights.cuda()
        else:
            class_weights = None

        self.model = self.set_model(X.shape[1])

        criterion = nn.BCELoss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate_init)
        scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=1 - self.learning_rate_decay_rate)

        X_tensor = torch.tensor(X, dtype=torch.float32)
        y_tensor = torch.tensor(y, dtype=torch.float32).view(-1, 1)

        if torch.cuda.is_available():
            self.model = self.model.cuda()
            X_tensor = X_tensor.cuda()
            y_tensor = y_tensor.cuda()

        dataset = TensorDataset(X_tensor, y_tensor)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        for epoch in range(self.epochs):
            self.model.train()
            running_loss = 0.0
            for inputs, labels in dataloader:
                optimizer.zero_grad()
                outputs = self.model(inputs)
                print(outputs.shape, labels.shape)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
            scheduler.step()
            epoch_loss = running_loss / len(dataloader)
            print(f"Epoch {epoch + 1}/{self.epochs}, Loss: {epoch_loss:.4f}")

    def predict_proba(self, X):
        self.model.eval()
        X_tensor = torch.tensor(X, dtype=torch.float32)
        if torch.cuda.is_available():
            X_tensor = X_tensor.cuda()
        with torch.no_grad():
            prob = self.model(X_tensor).cpu().numpy()
        return np.concatenate([1 - prob, prob], axis=1)

    def predict(self, X):
        prob = self.predict_proba(X)
        return prob[:, 1] > 0.5

    def score(self, X, y):
        prob = self.predict_proba(X)
        return prob[:, 1]


In [3]:
VAL_RATIO = 1/10
TEST_RATIO = 0.5
SEED = 0

In [4]:
hyperparam_spaces = {
    "LGBMClassifier": {
        "n_estimators": {"low": 5, "high": 250, "type": "int"},
        "learning_rate": {"low": 0.05, "high": 1.0, "type": "float"},
        "max_depth": {"low": 2, "high": 12, "type": "int"},
        "colsample_bytree": {"low": 0.1, "high": 1.0, "type": "float"},
        "reg_alpha": {"low": 1e-3, "high": 1e3, "log": True, "type": "float"},
        "verbose": {"choices": [-1], "type": "categorical"},
        "random_state" : {"choices": [SEED], "type": "categorical"},
    }
}

In [5]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective(
    trial,
    hyperparams,
    X_train,
    Y_train,
    X_val,
    Y_val
    ):
    params = {}
    for k, v in hyperparams.items():
        if "choices" in v:
            params[k] = trial.suggest_categorical(k, v["choices"])
        elif v["type"] == "int":
            params[k] = trial.suggest_int(k, v["low"], v["high"])
        elif v["type"] == "float":
            params[k] = trial.suggest_float(k, v["low"], v["high"], log=v.get("log", False))

    model = LGBMClassifier(**params)
    model.fit(X_train, Y_train)
    Y_val_pred = model.predict(X_val)
    score = balanced_accuracy_score(Y_val, Y_val_pred)
    return score

## German

In [6]:
os.makedirs("../models/german", exist_ok=True)

In [44]:
df = pd.read_csv("../data/german.csv")
X = df.drop("GoodCustomer", axis=1)
Y = df["GoodCustomer"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=TEST_RATIO, random_state=SEED, shuffle=True)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=VAL_RATIO, random_state=SEED, shuffle=True)

# reset index
X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_val = Y_val.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

In [116]:
y_tensor

NameError: name 'y_tensor' is not defined

In [120]:
m = MLPClassifier(hidden_layer_sizes=[30, 30, 30], class_weight="balanced", learning_rate_init = 1e-3)
m.fit(X_train.values, Y_train.values)

torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([2, 1]) torch.Size([2, 1])
Epoch 1/100, Loss: 3.4256
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 1]) torch.Size([32, 1])
torch.Size([32, 

In [122]:
Y_train_pred = m.predict(X_train.values)
Y_test_pred = m.predict(X_test.values)
print(f"Score  training: {balanced_accuracy_score(Y_train, Y_train_pred):.3f} test: {balanced_accuracy_score(Y_test, Y_test_pred):.3f}")

Score  training: 0.500 test: 0.500


In [128]:
m.predict_proba(X_train.values)

array([[0.326266  , 0.673734  ],
       [0.41550994, 0.58449006],
       [0.41093546, 0.58906454],
       [0.36990875, 0.63009125],
       [0.3386584 , 0.6613416 ],
       [0.34866148, 0.6513385 ],
       [0.26180965, 0.73819035],
       [0.3600437 , 0.6399563 ],
       [0.40489727, 0.5951027 ],
       [0.42173642, 0.5782636 ],
       [0.39685512, 0.6031449 ],
       [0.36995435, 0.63004565],
       [0.3516733 , 0.6483267 ],
       [0.35003078, 0.6499692 ],
       [0.3783098 , 0.6216902 ],
       [0.4070995 , 0.5929005 ],
       [0.3848133 , 0.6151867 ],
       [0.39831066, 0.60168934],
       [0.39491612, 0.6050839 ],
       [0.3309278 , 0.6690722 ],
       [0.34663463, 0.6533654 ],
       [0.35596   , 0.64404   ],
       [0.3530137 , 0.6469863 ],
       [0.3682636 , 0.6317364 ],
       [0.4127286 , 0.5872714 ],
       [0.42555773, 0.57444227],
       [0.26472014, 0.73527986],
       [0.3415625 , 0.6584375 ],
       [0.38347012, 0.6165299 ],
       [0.39144236, 0.60855764],
       [0.

In [130]:
import shap

def model_predict(X):
    return m.predict_proba(X)[:, 1]

X100 = X_train.sample(100)

explainer = shap.Explainer(model_predict, masker = custom_masker)
shap_values = explainer.shap_values(X100.values)

In [131]:
sorted(list(zip(X_train.columns, shap_values.mean(axis = 0))), key = lambda x: -abs(x[1]))

[('Age', 0.10426285851746797),
 ('LoanAmount', 0.024221112355589866),
 ('LoanRateAsPercentOfIncome', 0.0070533562824130055),
 ('LoanDuration', 0.005927009098231792),
 ('YearsAtCurrentHome', 0.004446406066417694),
 ('OwnsHouse', 0.002984185516834259),
 ('NumberOfLiableIndividuals', 0.0021282975748181344),
 ('is_male', 0.0020160606130957604),
 ('MissedPayments', 0.001996358521282673),
 ('Single', 0.0012087083607912063),
 ('YearsAtCurrentJob_geq_4', 0.0007239880412817001),
 ('NumberOfOtherLoansAtBank', 0.0006569245085120201),
 ('CheckingAccountBalance_geq_0', 0.0006222424656152725),
 ('CriticalAccountOrLoansElsewhere', 0.0005663938447833061),
 ('HasTelephone', 0.0005438289418816567),
 ('JobClassIsSkilled', 0.0004725356772542),
 ('YearsAtCurrentJob_lt_1', -0.00032519057393074033),
 ('RentsHouse', -0.00030915863811969757),
 ('CheckingAccountBalance_geq_200', 0.00021936610341072082),
 ('SavingsAccountBalance_geq_500', 0.00019009385257959366),
 ('OtherLoansAtBank', 0.00018474001437425612),
 (

In [132]:
explainer = shap.DeepExplainer(m.model, data=torch.Tensor(X_train[:100].values))

In [140]:
shap_values = explainer.shap_values(torch.Tensor(X_train.values[:100]))

shap_values.shape

(100, 27, 1)

In [158]:
study = optuna.create_study(
    direction="maximize",
)

study.optimize(
    lambda trial: objective(trial, hyperparam_spaces["LGBMClassifier"], X_train, Y_train, X_val, Y_val),
    n_trials=50,
    n_jobs=1,
    show_progress_bar=True,
)

params = study.best_params
model = LGBMClassifier(**params)
model.fit(X_train, Y_train)
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)
print(f"Score  training: {balanced_accuracy_score(Y_train, Y_train_pred):.3f} test: {balanced_accuracy_score(Y_test, Y_test_pred):.3f}")

joblib.dump(model, "../models/german/LGBMClassifier.pkl")

  0%|          | 0/50 [00:00<?, ?it/s]

Score  training: 0.854 test: 0.575


['../models/german/LGBMClassifier.pkl']

In [159]:
outlier_detection = IsolationForest(ndim=1, sample_size=256, max_depth=8, ntrees=100, missing_action="divide")
outlier_detection.fit(X_train);
outlier_detection = OutlierWrap(outlier_detection, 0.6)
joblib.dump(outlier_detection, "../models/german/IsolationForest.pkl")

outlier_detection = IsolationForest(ndim=1, sample_size=256, max_depth=8, ntrees=100, missing_action="divide")
outlier_detection.fit(X_test);
outlier_detection = OutlierWrap(outlier_detection, 0.6)
joblib.dump(outlier_detection, "../models/german/IsolationForest_test.pkl")

['../models/german/IsolationForest_test.pkl']

## German small

In [96]:
os.makedirs("../models/german_small", exist_ok=True)

In [99]:
df = pd.read_csv("../data/german.csv")
X = df.drop("GoodCustomer", axis=1)
Y = df["GoodCustomer"]

X = X[["LoanAmount", "LoanDuration", "OwnsHouse", "is_male"]]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=TEST_RATIO, random_state=SEED, shuffle=True)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=VAL_RATIO, random_state=SEED, shuffle=True)

In [100]:
study = optuna.create_study(
    direction="maximize",
)

study.optimize(
    lambda trial: objective(trial, hyperparam_spaces["LGBMClassifier"], X_train, Y_train, X_val, Y_val),
    n_trials=50,
    n_jobs=1,
    show_progress_bar=True,
)

params = study.best_params
model = LGBMClassifier(**params)
model.fit(X_train, Y_train)
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)
print(f"Score  training: {balanced_accuracy_score(Y_train, Y_train_pred):.3f} test: {balanced_accuracy_score(Y_test, Y_test_pred):.3f}")

joblib.dump(model, "../models/german_small/LGBMClassifier.pkl")

  0%|          | 0/50 [00:00<?, ?it/s]

Score  training: 0.634 test: 0.546


['../models/german_small/LGBMClassifier.pkl']

In [101]:
outlier_detection = IsolationForest(ndim=1, sample_size=256, max_depth=8, ntrees=100, missing_action="divide")
outlier_detection.fit(X_train);
outlier_detection = OutlierWrap(outlier_detection, 0.6)
joblib.dump(outlier_detection, "../models/german_small/IsolationForest.pkl")

outlier_detection = IsolationForest(ndim=1, sample_size=256, max_depth=8, ntrees=100, missing_action="divide")
outlier_detection.fit(X_test);
outlier_detection = OutlierWrap(outlier_detection, 0.6)
joblib.dump(outlier_detection, "../models/german_small/IsolationForest_test.pkl")

['../models/german_small/IsolationForest_test.pkl']

## Taiwan

In [227]:
os.makedirs("../models/taiwan", exist_ok=True)

In [228]:
df = pd.read_csv("../data/taiwan.csv")
X = df.drop("NoDefaultNextMonth", axis=1)
X = X.astype(int)
Y = df["NoDefaultNextMonth"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=TEST_RATIO, random_state=SEED, shuffle=True)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=VAL_RATIO, random_state=SEED, shuffle=True)

In [229]:
study = optuna.create_study(
    direction="maximize",
)

study.optimize(
    lambda trial: objective(trial, hyperparam_spaces["LGBMClassifier"], X_train, Y_train, X_val, Y_val),
    n_trials=50,
    n_jobs=1,
    show_progress_bar=True,
)

params = study.best_params
model = LGBMClassifier(**params)
model.fit(X_train, Y_train)
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)
print(f"Score  training: {balanced_accuracy_score(Y_train, Y_train_pred):.3f} test: {balanced_accuracy_score(Y_test, Y_test_pred):.3f}")

joblib.dump(model, "../models/taiwan/LGBMClassifier.pkl")

  0%|          | 0/50 [00:00<?, ?it/s]

Score  training: 0.669 test: 0.631


['../models/taiwan/LGBMClassifier.pkl']

In [230]:
outlier_detection = IsolationForest(ndim=1, sample_size=256, max_depth=8, ntrees=100, missing_action="divide")
outlier_detection.fit(X_train);
outlier_detection = OutlierWrap(outlier_detection, 0.6)
joblib.dump(outlier_detection, "../models/taiwan/IsolationForest.pkl")

outlier_detection = IsolationForest(ndim=1, sample_size=256, max_depth=8, ntrees=100, missing_action="divide")
outlier_detection.fit(X_test);
outlier_detection = OutlierWrap(outlier_detection, 0.6)
joblib.dump(outlier_detection, "../models/taiwan/IsolationForest_test.pkl")

['../models/taiwan/IsolationForest_test.pkl']