In [7]:
import pandas as pd
import numpy as np
from io import StringIO
from classes import BalancedUndersamplingShuffle, balanced_sampling
from helpers import plot_learning_curve, plot_validation_curve, fit_and_score_pipeline, exp_range, \
    fit_and_score_iteratively, plot_validation_curve_with_undersampling
from sklearn.model_selection import ShuffleSplit, StratifiedKFold
from sklearn.metrics import check_scoring
from sklearn.model_selection._validation import _fit_and_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.metrics import fbeta_score, make_scorer
from sklearn.ensemble import AdaBoostClassifier

In [2]:
from scipy.io import arff

def _load_year(path):
    """Read one ARFF file and return it as ``(matrix, meta)``.

    Parameters
    ----------
    path : str
        Path of the ``.arff`` file to read.

    Returns
    -------
    matrix : numpy.ndarray
        The records returned by ``arff.loadarff``, converted through
        ``tolist()`` into a plain float32 array.
    meta : object
        The metadata object returned by ``arff.loadarff``, unchanged.

    The shape of ``matrix`` is printed as a quick sanity check.
    """
    with open(path, 'r') as handle:
        records, meta = arff.loadarff(handle)
    # loadarff has already materialised every record, so the conversion
    # does not need the file to remain open.
    matrix = np.asarray(records.tolist(), dtype=np.float32)
    print(matrix.shape)
    return matrix, meta


# One file per year; the module-level names are kept because the
# concatenation cell refers to data1 .. data5 individually.
data1, meta1 = _load_year('1year.arff')
data2, meta2 = _load_year('2year.arff')
data3, meta3 = _load_year('3year.arff')
data4, meta4 = _load_year('4year.arff')
data5, meta5 = _load_year('5year.arff')


(7027, 65)
(10173, 65)
(10503, 65)
(9792, 65)
(5910, 65)


In [3]:
# Stack the five yearly matrices row-wise into one array.
data = np.concatenate((data1, data2, data3, data4, data5), axis=0)

# Overwrite every NaN entry with 0, in place.
nan_mask = np.isnan(data)
data[nan_mask] = 0

# Every column except the last goes to X; the last column is y.
X = data[:, :-1]
y = data[:, -1]

# Recode label 0 as -1. `y` is a basic slice (a view) of `data`, so this
# assignment also rewrites the last column of `data`.
zero_labels = (y == 0)
y[zero_labels] = -1

print(f'X.shape={X.shape}, y.shape={y.shape}')

X.shape=(43405, 64), y.shape=(43405,)


# Naive fit

In [4]:
# Baseline run on the full data set: a single fit-and-score iteration.
classifier = AdaBoostClassifier(
    random_state=0,
    n_estimators=100,
)
res = fit_and_score_iteratively(
    classifier,
    X,
    y,
    None,  # undersampling argument left as None, i.e. no undersampling requested
    iterations=1,
)
print(res)


[0.95346337 0.22506394 0.56774194 0.14035088 0.56747274]


In [5]:
# Same classifier settings as the baseline, but with an undersampling
# argument of 1 (1:1 per the original note) and 10 iterations.
classifier = AdaBoostClassifier(
    random_state=0,
    n_estimators=100,
)
res = fit_and_score_iteratively(
    classifier,
    X,
    y,
    1,  # undersampling argument
    iterations=10,
)
print(res)


a=0.0, r=1, gamma=19.758010521281683
a=0.0, r=1, gamma=19.758010521281683
a=0.0, r=1, gamma=19.758010521281683
a=0.0, r=1, gamma=19.758010521281683
a=0.0, r=1, gamma=19.758010521281683
a=0.0, r=1, gamma=19.758010521281683
a=0.0, r=1, gamma=19.758010521281683
a=0.0, r=1, gamma=19.758010521281683
a=0.0, r=1, gamma=19.758010521281683
a=0.0, r=1, gamma=19.758010521281683
[0.80119522 0.80207418 0.79794027 0.80637959 0.80119935]


# Model Tuning

In [6]:
# Optimize the number of weak classifiers (n_estimators), with undersampling.

# Hold out 30% of the rows. stratify=y keeps the class proportions of y in
# both partitions. The split is done exactly once: a second, non-stratified
# call used to follow and silently replaced this one, so every later cell
# was working on a non-stratified X_train / y_train.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# Candidate n_estimators values produced by the project helper exp_range.
param_range = list(exp_range(1, 100, 4, 2))
scoring = ['accuracy', 'f1', 'recall', 'precision']
cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)

classifier = AdaBoostClassifier(random_state=0)

# NOTE: the second return value is bound to the name `plt`, as in the
# original cell; the learning-curve cell reuses that name.
results_store, plt = plot_validation_curve_with_undersampling(
    classifier, X_train, y_train,
    param_name='n_estimators', param_range=param_range,
    fit_params=None, error_score='raise',
    cv=cv, scoring=scoring, n_jobs=8,
    iterations=5, undersampling_ratio=1,
    x_axis_is_log=True)


NameError: name 'plot_validation_curve_with_undersampling' is not defined

In [None]:
# Validation curve over n_estimators (number of weak classifiers),
# scored on recall with 10-fold stratified cross-validation.
classifier = AdaBoostClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
scoring = 'recall'

# Candidate values come from the project helper exp_range(80, 100, 4, 2).
param_range = list(exp_range(80, 100, 4, 2))

train_scores_80_100, test_scores_80_100 = plot_validation_curve(
    classifier,
    X_train,
    y_train,
    param_name='n_estimators',
    param_range=param_range,
    cv=cv,
    scoring=scoring,
    n_jobs=8,
)

In [None]:
# Validation curve over n_estimators (the parameter actually swept here),
# scored on f1 with 10-fold stratified cross-validation.
results = []
classifier = AdaBoostClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
scoring = 'f1'

# Candidate values come from the project helper exp_range(10, 80, 4, 2).
param_range = list(exp_range(10, 80, 4, 2))

train_scores, test_scores = plot_validation_curve(
    classifier,
    X_train,
    y_train,
    param_name='n_estimators',
    param_range=param_range,
    cv=cv,
    scoring=scoring,
    n_jobs=8,
)

In [None]:
# Validation curve over n_estimators (the parameter actually swept here),
# scored on recall with 10-fold stratified cross-validation.
results = []
classifier = AdaBoostClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
scoring = 'recall'

# Candidate values come from the project helper exp_range(80, 200, 5, 2).
param_range = list(exp_range(80, 200, 5, 2))

train_scores, test_scores = plot_validation_curve(
    classifier,
    X_train,
    y_train,
    param_name='n_estimators',
    param_range=param_range,
    cv=cv,
    scoring=scoring,
    n_jobs=8,
)

# Learning Curve

In [None]:
# Learning curve for a 1000-estimator AdaBoost model, scored on recall
# with 10-fold stratified cross-validation.
title = 'Adaboost Learning Curve'
classifier = AdaBoostClassifier(n_estimators=1000, random_state=0)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Ten evenly spaced training-set fractions from 0.1 to 1.0.
plt = plot_learning_curve(
    classifier,
    title,
    X_train,
    y_train,
    axes=None,
    ylim=None,
    cv=cv,
    n_jobs=None,
    train_sizes=np.linspace(.1, 1.0, 10),
    scoring='recall',
)