In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.ensemble import BalancedRandomForestClassifier

from sklearn.utils.class_weight import compute_class_weight

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix

In [4]:
import warnings
warnings.filterwarnings(action='ignore')

In [5]:
random_seed=1225 #10, 1225
np.random.seed(random_seed) 

In [6]:
def perf(y, y_hat):
    print("ACC: ", round(accuracy_score(y, y_hat), 4))
    print("F1: ", round(f1_score(y, y_hat), 4))
    print("ROC AUC: ", round(roc_auc_score(y, y_hat), 4))
    print("PR: ", round(precision_score(y, y_hat), 4))

In [7]:
data_folder = '../data'

with open(os.path.join(data_folder, 'train_x.npy'), 'rb') as f:
    train_x = np.load(f)
with open(os.path.join(data_folder, 'train_y.npy'), 'rb') as f:
    train_y = np.load(f)
with open(os.path.join(data_folder, 'width_y.npy'), 'rb') as f:
    width_y = np.load(f)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid, width_yt, width_yv = train_test_split(train_x, train_y, width_y, stratify=train_y, test_size=0.4)
X_valid, X_test, y_valid, y_test, width_yv, width_ytt = train_test_split(X_valid, y_valid, width_yv, stratify=y_valid, test_size=0.2)

In [9]:
X_train_reshape = X_train.reshape(X_train.shape[0], -1)
X_valid_reshape = X_valid.reshape(X_valid.shape[0], -1)
X_test_reshape = X_test.reshape(X_test.shape[0], -1)

y_train_reshape = y_train
y_valid_reshape = y_valid
y_test_reshape = y_test

In [10]:
sm = SMOTE(random_state=random_seed)
X_train_upresampled, y_train_upresampled = sm.fit_resample(X_train_reshape, y_train_reshape)

In [11]:
rus = RandomUnderSampler(random_state=random_seed)
X_train_downresampled, y_train_downresampled = rus.fit_resample(X_train_reshape, y_train_reshape)

In [13]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
                                        class_weight = "balanced",
                                        classes = np.unique(y_train_reshape),
                                        y = y_train_reshape                                                    
                                    )
class_weights = dict(zip(np.unique(y_train_reshape), class_weights))


In [14]:
from sklearn.linear_model import LogisticRegression
clf_up = LogisticRegression(random_state=random_seed).fit(X_train_upresampled, y_train_upresampled)
clf_down = LogisticRegression(random_state=random_seed).fit(X_train_downresampled, y_train_downresampled)
clf_weight = LogisticRegression(random_state=random_seed, class_weight=class_weights).fit(X_train_reshape, y_train_reshape)

y_pred_up = clf_up.predict(X_test_reshape)
y_pred_down = clf_down.predict(X_test_reshape)
y_pred_weight = clf_weight.predict(X_test_reshape)

print("---Oversampling Logistic Regression---")
perf(y_test_reshape, y_pred_up)

print("---Undersampling Logistic Regression---")
perf(y_test_reshape, y_pred_down)

print("---Weighted-matrix Logistic Regression---")
perf(y_test_reshape, y_pred_weight)

---Oversampling Logistic Regression---
ACC:  0.8143
F1:  0.3408
ROC AUC:  0.6097
PR:  0.3706
---Undersampling Logistic Regression---
ACC:  0.7636
F1:  0.501
ROC AUC:  0.7702
PR:  0.369
---Weighted-matrix Logistic Regression---
ACC:  0.8107
F1:  0.3236
ROC AUC:  0.6002
PR:  0.3546


In [16]:
from sklearn.ensemble import RandomForestClassifier

clf_up = RandomForestClassifier(random_state=random_seed).fit(X_train_upresampled, y_train_upresampled)
clf_down = RandomForestClassifier(random_state=random_seed).fit(X_train_downresampled, y_train_downresampled)
clf_weight = RandomForestClassifier(random_state=random_seed, class_weight=class_weights).fit(X_train_reshape, y_train_reshape)

y_pred_up = clf_up.predict(X_test_reshape)
y_pred_down = clf_down.predict(X_test_reshape)
y_pred_weight = clf_weight.predict(X_test_reshape)

print("---Oversampling RandomForestClassifier---")
perf(y_test_reshape, y_pred_up)

print("---Undersampling RandomForestClassifier---")
perf(y_test_reshape, y_pred_down)

print("---Weighted-matrix RandomForestClassifier---")
perf(y_test_reshape, y_pred_weight)

---Oversampling RandomForestClassifier---
ACC:  0.8351
F1:  0.4709
ROC AUC:  0.6903
PR:  0.4602
---Undersampling RandomForestClassifier---
ACC:  0.779
F1:  0.5564
ROC AUC:  0.833
PR:  0.4005
---Weighted-matrix RandomForestClassifier---
ACC:  0.8433
F1:  0.2445
ROC AUC:  0.5657
PR:  0.459


In [17]:
from xgboost import XGBClassifier
from sklearn.utils import class_weight
class_weights = list(class_weight.compute_sample_weight(
                                        class_weight = "balanced",
                                        #classes = np.unique(y_train_reshape),
                                        y = y_train_reshape                                                    
                                    ))

clf_up = XGBClassifier(random_state=random_seed).fit(X_train_upresampled, y_train_upresampled, eval_set=[(X_valid_reshape, y_valid_reshape)])
clf_down = XGBClassifier(random_state=random_seed).fit(X_train_downresampled, y_train_downresampled, eval_set=[(X_valid_reshape, y_valid_reshape)])
clf_weight = XGBClassifier(random_state=random_seed).fit(X_train_reshape, y_train_reshape, eval_set=[(X_valid_reshape, y_valid_reshape)], sample_weight=class_weights)

y_pred_up = clf_up.predict(X_test_reshape)
y_pred_down = clf_down.predict(X_test_reshape)
y_pred_weight = clf_weight.predict(X_test_reshape)

print("---Oversampling XGBClassifier---")
perf(y_test_reshape, y_pred_up)

print("---Undersampling XGBClassifier---")
perf(y_test_reshape, y_pred_down)

print("---Weighted-matrix XGBClassifier---")
perf(y_test_reshape, y_pred_weight)

[0]	validation_0-logloss:0.55592
[1]	validation_0-logloss:0.48309
[2]	validation_0-logloss:0.43683
[3]	validation_0-logloss:0.40460
[4]	validation_0-logloss:0.38494
[5]	validation_0-logloss:0.37130
[6]	validation_0-logloss:0.36288
[7]	validation_0-logloss:0.35749
[8]	validation_0-logloss:0.35272
[9]	validation_0-logloss:0.34908
[10]	validation_0-logloss:0.34674
[11]	validation_0-logloss:0.34424
[12]	validation_0-logloss:0.34246
[13]	validation_0-logloss:0.34255
[14]	validation_0-logloss:0.34322
[15]	validation_0-logloss:0.34249
[16]	validation_0-logloss:0.34268
[17]	validation_0-logloss:0.34269
[18]	validation_0-logloss:0.34343
[19]	validation_0-logloss:0.34325
[20]	validation_0-logloss:0.34604
[21]	validation_0-logloss:0.34613
[22]	validation_0-logloss:0.34605
[23]	validation_0-logloss:0.34680
[24]	validation_0-logloss:0.34799
[25]	validation_0-logloss:0.34737
[26]	validation_0-logloss:0.34759
[27]	validation_0-logloss:0.34820
[28]	validation_0-logloss:0.34788
[29]	validation_0-loglos