In [1]:
import pickle
import sys

import pandas as pd
from os.path import join
import os
import numpy as np

In [2]:
import lightgbm as lgb
from lightgbm import LGBMClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, cohen_kappa_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [110]:
# Load the preprocessed feature table and target, then hold out 12% for testing.
data_folder = "../data/preprocessed/"
features_path = join(data_folder, "X.pk.zip")
target_path = join(data_folder, "Y.pk.zip")
X = pd.read_pickle(features_path)
y = pd.read_pickle(target_path)

# The estimators below are fed plain NumPy arrays; the target is flattened to 1-D.
X_mat = X.values
y_vec = y.values.flatten()
X_train, X_test, y_train, y_test = train_test_split(
    X_mat,
    y_vec,
    test_size=0.12,
    random_state=123,  # fixed seed so the split is the same on every run
)


In [111]:
# Report the feature count and the number of samples per class in each split.
# print() is used rather than sys.stdout.write() because write() returns the
# number of characters written, which the notebook would echo as a stray
# Out[] value when it is the last expression of the cell.
print(f"The number of features: {X_mat.shape[1]}")
# "TEST " carries a trailing space so both rows line up.
for split_name, split_labels in (("TRAIN", y_train), ("TEST ", y_test)):
    class_counts = ", ".join(
        f"{cls}: {np.sum(split_labels == cls)}" for cls in (0, 1, 2)
    )
    print(f"{split_name} {class_counts}")

The number of features: 195
TRAIN 0: 1802, 1: 2487, 2: 57936
TEST  0: 239, 1: 348, 2: 7899


30

# Model

## scikit-learn-style estimators

In [112]:
# Candidate models, keyed by the label that the evaluation loop prints.
# The decision trees get a fixed random_state (same seed as the train/test
# split): scikit-learn permutes features at every split, so an unseeded tree
# can differ between runs and the reported scores would not be reproducible.
classifiers = {
    "Tree w": DecisionTreeClassifier(
        class_weight='balanced',
        random_state=123,
    ),
    "Tree cw": DecisionTreeClassifier(
        criterion='log_loss',
        class_weight='balanced',
        random_state=123,
    ),
    # Alternative tried with hand-picked class weights:
    # "LGBMC": LGBMClassifier(class_weight={0: 5, 1: 5, 2: 1}),
    "LGBMC": LGBMClassifier(),
    "LGBMC bo": LGBMClassifier(
        class_weight='balanced',
        objective='multiclass',
    ),
}

In [113]:
# Fit every candidate behind a standard-scaling step and report, on the test
# split, Cohen's kappa ("score") and the per-class F1 scores ("all_score").
# Note: the single StandardScaler instance is shared by all pipelines and is
# re-fit on X_train each time a pipeline is fitted.
preprocessor = StandardScaler()
for cla, classifier in classifiers.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ])
    model = pipeline.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # scikit-learn metrics expect (y_true, y_pred); the original call had the
    # two arguments swapped for f1_score.
    per_class_f1 = f1_score(y_test, predictions, average=None)
    kappa = cohen_kappa_score(y_test, predictions)
    print(f'Model:{cla}; score:{kappa}; all_score:{per_class_f1}')

Model:Tree w; score:0.0035355118963036336; all_score:[0.03396226 0.04260985 0.92498885]
Model:Tree cw; score:0.020508496789827135; all_score:[0.06118547 0.04109589 0.92830333]
Model:LGBMC; score:0.007278791126420936; all_score:[0.01626016 0.         0.96409818]
Model:LGBMC bo; score:0.03782731570286835; all_score:[0.08225617 0.07469414 0.86010434]


## Native LightGBM API (`lgb.train`)

In [114]:
# Wrap the splits in LightGBM Dataset objects for the native training API.
train_data = lgb.Dataset(X_train, label=y_train)
# A validation Dataset must be tied to the training Dataset via `reference`
# so that it is binned with the same feature discretisation.
validation_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [116]:
def kappa_metric(dy_true, dy_pred):
    """Custom evaluation metric: Cohen's kappa on the arg-max class.

    Parameters
    ----------
    dy_true : array-like of shape (n_samples,)
        True class labels.
    dy_pred : array-like
        Per-class scores, either 2-D with shape (n_samples, n_classes) or a
        flat 1-D array grouped by class first (all scores of class 0, then
        class 1, ...), which is reshaped to (n_samples, n_classes).

    Returns
    -------
    tuple
        ``(metric_name, value, is_higher_better)``.

    Notes
    -----
    The arguments are in (y_true, y_pred) order. ``lgb.train(feval=...)``
    calls its metric with (preds, Dataset) instead, so this function cannot
    be passed there unchanged.
    """
    metric_name = 'kappa'
    y_true = np.asarray(dy_true)
    scores = np.asarray(dy_pred)
    if scores.ndim == 1:
        # Flat class-major layout: one block of n_samples scores per class.
        scores = scores.reshape(-1, len(y_true)).T
    preds = scores.argmax(axis=1)
    # Score the predicted labels, not the raw per-class scores.
    value = cohen_kappa_score(y_true, preds)
    is_higher_better = True
    return metric_name, value, is_higher_better

# Empty container for evaluation history; not used by the cells shown here.
evals_result = {}


def evalerror(preds, dtrain):
    """Custom evaluation metric: weighted F1 on the arg-max class.

    Parameters
    ----------
    preds : array-like
        Per-class scores, either 2-D with shape (n_samples, n_classes) or a
        flat 1-D array grouped by class first (all scores of class 0, then
        class 1, ...), which is reshaped to (n_samples, n_classes).
    dtrain : object exposing ``get_label()``
        Dataset whose ``get_label()`` returns the true labels.

    Returns
    -------
    tuple
        ``('f1_score', value, True)``; the last item marks the metric as
        higher-is-better.
    """
    labels = dtrain.get_label()
    scores = np.asarray(preds)
    if scores.ndim == 1:
        # Flat class-major layout: one block of n_samples scores per class.
        scores = scores.reshape(-1, len(labels)).T
    predicted = scores.argmax(axis=1)
    f_score = f1_score(labels, predicted, average='weighted')
    return 'f1_score', f_score, True

In [22]:
# y_train

In [123]:
# Training parameters for the native LightGBM API.
params = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',       # gradient-boosted decision trees
    'objective': 'multiclassova',  # multi-class target
    'metric': 'multi_logloss',     # metric for the multi-class task
    # 'max_depth': 10,
    'num_class': 3,
    'is_unbalance': True,
}

In [124]:
# Train for a fixed number of boosting rounds; no validation set or custom
# metric is attached (an earlier attempt passed feval=kappa_metric).
num_round = 800
bst = lgb.train(
    params,
    train_data,
    num_boost_round=num_round,
)

[LightGBM] [Info] Number of positive: 1802, number of negative: 60423
[LightGBM] [Info] Number of positive: 2487, number of negative: 59738
[LightGBM] [Info] Number of positive: 57936, number of negative: 4289
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18927
[LightGBM] [Info] Number of data points in the train set: 62225, number of used features: 188
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.028959 -> initscore=-3.512473
[LightGBM] [Info] Start training from score -3.512473
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.039968 -> initscore=-3.178891
[LightGBM] [Info] Start training from score -3.178891
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.931073 -> initscore=2.603285
[LightGBM] [Info] Start training from score 2.603285


In [125]:
# Score the held-out split: take the highest-scoring class of each prediction
# row and compare with the true labels using Cohen's kappa.
ypred = bst.predict(X_test)
# Vectorised arg-max over the class axis instead of a per-row Python loop.
ypr = np.argmax(ypred, axis=1)
cohen_kappa_score(y_test, ypr)

0.04987254963898424