In [1]:
import pandas as pd
import numpy as np

#for modeling
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
import xgboost as xgb
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense, Dropout
#from tensorflow.keras.optimizers import Adam
#from tensorflow.keras.regularizers import l1, l2

#for concept drift detection
from skmultiflow.drift_detection.adwin import ADWIN
from skmultiflow.drift_detection import DDM
from skmultiflow.drift_detection.eddm import EDDM
from skmultiflow.drift_detection.hddm_a import HDDM_A
from skmultiflow.drift_detection.hddm_w import HDDM_W
from skmultiflow.drift_detection import KSWIN
from skmultiflow.drift_detection import PageHinkley

Right now the data has been combined for all years 1999-2018, and split in the get_data_and_processing notebook. I still need to determine how to structure this data and present in a sequential order. 

+ Do I setup models for 1999-2001 and then structure the data for the remaining years and train and test incrementally?
  + If I go this route, do I use the full data set ordered by year with no need to do cross validation or train_test_split since the testing will be done sequentially?
+ In this current form, since all years are shuffled together, the below drift detection is picking up some drifts, especially for the ADWIN algorithm.

In [2]:
X_train = pd.read_csv('data/X_train.csv', index_col=0)
y_train = pd.read_csv('data/y_train.csv', index_col=0)
X_test = pd.read_csv('data/X_test.csv', index_col=0)
y_test = pd.read_csv('data/y_test.csv', index_col=0)

Models to Run:

+ Logistic Regression
+ Support Vector Machines
+ Random Forest
+ XGBoost
+ Neural Networks

Check if there are others that should be run

https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html

In [3]:
X_train.describe()

Unnamed: 0,Age,LDL Chol,HDL Chol,Chloride,Total Chol,GGT,AST,Hemoglobin,Weight,BMI,ALP,ALT
count,77396.0,77396.0,77396.0,77396.0,77396.0,77396.0,77396.0,77396.0,77396.0,77396.0,77396.0,77396.0
mean,32.580637,108.698611,53.270547,103.346497,183.948457,26.755135,24.945263,13.810239,64.07973,25.471295,87.849703,23.830325
std,24.586185,20.226025,11.142197,2.38167,37.06897,33.109661,13.970365,1.394973,29.22481,7.25119,46.227507,17.867682
min,1.0,9.0,6.0,70.0,59.0,2.0,6.0,5.9,6.4,11.49,7.0,2.0
25%,11.0,108.710285,50.0,103.0,161.0,15.0,21.0,12.9,47.4,20.3,68.0,17.0
50%,26.0,108.710285,53.277988,103.345424,183.944825,26.740553,24.945995,13.806845,64.2,25.494282,87.745839,23.851419
75%,53.0,108.710285,53.277988,104.0,199.0,26.740553,24.945995,14.6,82.2,29.2,87.745839,23.851419
max,85.0,629.0,226.0,120.0,813.0,2274.0,1672.0,19.9,371.0,130.21,1378.0,1997.0


In [4]:
#reshape y_train to be a column vector
y_train = y_train.values.ravel()

In [5]:
scaler = StandardScaler()

In [6]:
#Logistic Regression
logistic = LogisticRegression(max_iter=1000)
pipe = Pipeline([('scaler', scaler),('logistic', logistic)])
param_grid = {'logistic__C': np.logspace(-3, 3, 7)}
logistic_model = GridSearchCV(pipe, param_grid, cv=5)
logistic_model.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('logistic',
                                        LogisticRegression(max_iter=1000))]),
             param_grid={'logistic__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])})

In [7]:
#Logistic Regression 
print(logistic_model.best_params_)
print(logistic_model.best_score_)

#performance
print('Train set MSE performance: ' + str(round(mean_squared_error(y_train,logistic_model.predict(X_train)),4))) #Train set prediction and performance
print('Test set MSE performance: ' + str(round(mean_squared_error(y_test,logistic_model.predict(X_test)),4))) #Test set prediction and performance

#Predictions
logistic_y_pred = logistic_model.predict(X_test)

{'logistic__C': 0.01}
0.9187167184747927
Train set MSE performance: 0.0813
Test set MSE performance: 0.0822


In [8]:
#Support Vector Machines
svm = SVC(max_iter=1000)
pipe = Pipeline([('scaler', scaler),('svm', svm)])
param_grid = {'svm__C': np.logspace(-3, 3, 7)}
svm_model = GridSearchCV(pipe, param_grid, cv=5)
svm_model.fit(X_train, y_train)



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('svm', SVC(max_iter=1000))]),
             param_grid={'svm__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])})

In [9]:
#Support Vector Machines
print(svm_model.best_params_)
print(svm_model.best_score_)
print(svm_model.best_estimator_)

#performance
print('Train set performance: ' + str(round(mean_squared_error(y_train,svm_model.predict(X_train)),4))) #Train set prediction and performance
print('Test set performance: ' + str(round(mean_squared_error(y_test,svm_model.predict(X_test)),4))) #Test set prediction and performance

#Predictions
svm_y_pred = svm_model.predict(X_test)

{'svm__C': 0.001}
0.7503106584514854
Pipeline(steps=[('scaler', StandardScaler()),
                ('svm', SVC(C=0.001, max_iter=1000))])
Train set performance: 0.3315
Test set performance: 0.3331


In [10]:
#Random Forest
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
pipe = Pipeline([('scaler', scaler),('rf', rf)])
param_grid = {'rf__max_depth': [3, 5, 7], 'rf__n_estimators': [100, 200, 300]}
rf_model = GridSearchCV(pipe, param_grid, cv=5)
rf_model.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('rf',
                                        RandomForestClassifier(max_depth=5,
                                                               random_state=0))]),
             param_grid={'rf__max_depth': [3, 5, 7],
                         'rf__n_estimators': [100, 200, 300]})

In [11]:
#Random Forest
print(rf_model.best_params_)
print(rf_model.best_score_)
print(rf_model.best_estimator_)

#performance
print('Train set performance: ' + str(round(mean_squared_error(y_train,rf_model.predict(X_train)),4))) #Train set prediction and performance
print('Test set performance: ' + str(round(mean_squared_error(y_test,rf_model.predict(X_test)),4))) #Test set prediction and performance

#Predictions
rf_y_pred = rf_model.predict(X_test)

{'rf__max_depth': 7, 'rf__n_estimators': 100}
0.9190268193650045
Pipeline(steps=[('scaler', StandardScaler()),
                ('rf', RandomForestClassifier(max_depth=7, random_state=0))])
Train set performance: 0.0788
Test set performance: 0.0811


In [12]:
#XGBoost
xgb = xgb.XGBClassifier(n_estimators=100, max_depth=5, random_state=0)
pipe = Pipeline([('scaler', scaler),('xgb', xgb)])
param_grid = {'xgb__max_depth': [3, 5, 7], 'xgb__n_estimators': [100, 200, 300]}
xgb_model = GridSearchCV(pipe, param_grid, cv=5)
xgb_model.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('xgb',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      callbacks=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      early_stopping_rounds=None,
                                                      enable_categorical=False,
                                                      eval_metric=None,
                                                      gamma=None, gpu_id=None,
                                                      grow_policy=None,
                                       

In [13]:
#XGBoost
print(xgb_model.best_params_)
print(xgb_model.best_score_)
print(xgb_model.best_estimator_)

#performance
print('Train set performance: ' + str(round(mean_squared_error(y_train,xgb_model.predict(X_train)),4))) #Train set prediction and performance
print('Test set performance: ' + str(round(mean_squared_error(y_test,xgb_model.predict(X_test)),4))) #Test set prediction and performance

#Predictions
xgb_y_pred = xgb_model.predict(X_test)

{'xgb__max_depth': 3, 'xgb__n_estimators': 100}
0.921675525046604
Pipeline(steps=[('scaler', StandardScaler()),
                ('xgb',
                 XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               gamma=0, gpu_id=-1, grow_policy='depthwise',
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.300000012, max_bin=256,
                               max_cat_to_onehot=4, max_delta_step=0,
                               max_depth=3, max_leaves=0, min_child_weight=1,
                               missing=nan, monotone_constraints='()',
                               n_estimators=100, n_jobs=0, num_parallel_tree=1,
                              

Add Neural Network Model Here

## Implement Concept Drift models

### Scikit-Multiflow

https://scikit-multiflow.readthedocs.io/en/stable/api/api.html#module-skmultiflow.drift_detection

In this Library, detection methods include:
+ ADWIN - Adaptive Windowing
+ DDM - Drift Detection Method
+ EDDM - Early Drift Detection Method
+ HDDM_A - Drift Detection Method based on Hoeffding's bounds with moving average-test
+ HDDM_W - Drift Detection Method based on Hoeffding's bounds with moving weighted average-test
+ KSWIN - Kolmogorov-Smirnov Windowing method for concept drift detection
+ PageHinkley - Page-Hinkley method for concept drift detection

add_element(self, value)[source]
    Add a new element to the sample window.
    
    Apart from adding the element value to the window, by inserting it in the correct bucket, it will also update the relevant statistics, in this case the total sum of all values, the window width and the total variance.
    
    Parameters
    value: int or float (a numeric value)
    Notes
    
    The value parameter can be any numeric value relevant to the analysis of concept change. For the learners in this framework we are using either 0’s or 1’s, that are interpreted as follows: 0: Means the learners prediction was wrong 1: Means the learners prediction was correct
    
    This function should be used at every new sample analysed.

In [29]:
#creating lists of elements for each model with 0 meaning the learner prediction was wrong and 1 meaning the learner predictions was correct
#These lists will be fed into the concept drift detection algorithms
logistic_predicted_stream = []
for i in range(len(logistic_y_pred)):
    if logistic_y_pred[i] == y_test.values.ravel()[i]:
        logistic_predicted_stream.append(1)
    else:
        logistic_predicted_stream.append(0)

svm_predicted_stream = []
for i in range(len(svm_y_pred)):
    if svm_y_pred[i] == y_test.values.ravel()[i]:
        svm_predicted_stream.append(1)
    else:
        svm_predicted_stream.append(0)

rf_predicted_stream = []
for i in range(len(rf_y_pred)):
    if rf_y_pred[i] == y_test.values.ravel()[i]:
        rf_predicted_stream.append(1)
    else:
        rf_predicted_stream.append(0)

xgb_predicted_stream = []
for i in range(len(xgb_y_pred)):
    if xgb_y_pred[i] == y_test.values.ravel()[i]:
        xgb_predicted_stream.append(1)
    else:
        xgb_predicted_stream.append(0)

In [31]:
#Adwin
adwin = ADWIN()

for i in range(len(logistic_predicted_stream)):
    adwin.add_element(logistic_predicted_stream[i])
    if adwin.detected_change():
        print("Change detected in data: " + str(logistic_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Logistic Regression')

for i in range(len(svm_predicted_stream)):
    adwin.add_element(svm_predicted_stream[i])
    if adwin.detected_change():
        adwin.get_change()
        print("Change detected in data: " + str(svm_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Support Vector Machines')

for i in range(len(rf_predicted_stream)):
    adwin.add_element(rf_predicted_stream[i])
    if adwin.detected_change():
        print("Change detected in data: " + str(rf_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Random Forest')

for i in range(len(xgb_predicted_stream)):
    adwin.add_element(xgb_predicted_stream[i])
    if adwin.detected_change():
        print("Change detected in data: " + str(xgb_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: XGBoost')

Change detected in data: 0 - at index: 330 for model: Support Vector Machines
Change detected in data: 0 - at index: 362 for model: Support Vector Machines
Change detected in data: 1 - at index: 426 for model: Support Vector Machines
Change detected in data: 1 - at index: 522 for model: Support Vector Machines
Change detected in data: 1 - at index: 682 for model: Support Vector Machines
Change detected in data: 1 - at index: 778 for model: Support Vector Machines
Change detected in data: 1 - at index: 874 for model: Support Vector Machines
Change detected in data: 1 - at index: 1162 for model: Support Vector Machines
Change detected in data: 0 - at index: 2698 for model: Support Vector Machines
Change detected in data: 1 - at index: 725 for model: Random Forest
Change detected in data: 1 - at index: 757 for model: Random Forest
Change detected in data: 1 - at index: 789 for model: Random Forest
Change detected in data: 1 - at index: 949 for model: Random Forest
Change detected in data:

In [32]:
#DDM
ddm = DDM()

for i in range(len(logistic_predicted_stream)):
    ddm.add_element(logistic_predicted_stream[i])
    if ddm.detected_change():
        print("Change detected in data: " + str(logistic_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Logistic Regression')

for i in range(len(svm_predicted_stream)):
    ddm.add_element(svm_predicted_stream[i])
    if ddm.detected_change():
        print("Change detected in data: " + str(svm_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Support Vector Machines')

for i in range(len(rf_predicted_stream)):
    ddm.add_element(rf_predicted_stream[i])
    if ddm.detected_change():
        print("Change detected in data: " + str(rf_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Random Forest')

for i in range(len(xgb_predicted_stream)):
    ddm.add_element(xgb_predicted_stream[i])
    if ddm.detected_change():
        print("Change detected in data: " + str(xgb_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: XGBoost')

Change detected in data: 1 - at index: 1256 for model: Random Forest


In [33]:
#EDDM
eddm = EDDM()

for i in range(len(logistic_predicted_stream)):
    eddm.add_element(logistic_predicted_stream[i])
    if eddm.detected_change():
        print("Change detected in data: " + str(logistic_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Logistic Regression')

for i in range(len(svm_predicted_stream)):
    eddm.add_element(svm_predicted_stream[i])
    if eddm.detected_change():
        print("Change detected in data: " + str(svm_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Support Vector Machines')

for i in range(len(rf_predicted_stream)):
    eddm.add_element(rf_predicted_stream[i])
    if eddm.detected_change():
        print("Change detected in data: " + str(rf_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Random Forest')

for i in range(len(xgb_predicted_stream)):
    eddm.add_element(xgb_predicted_stream[i])
    if eddm.detected_change():
        print("Change detected in data: " + str(xgb_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: XGBoost')

Change detected in data: 1 - at index: 68 for model: Logistic Regression
Change detected in data: 1 - at index: 914 for model: XGBoost


In [34]:
#HDDM_A
hddm_a = HDDM_A()

for i in range(len(logistic_predicted_stream)):
    hddm_a.add_element(logistic_predicted_stream[i])
    if hddm_a.detected_change():
        print("Change detected in data: " + str(logistic_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Logistic Regression')

for i in range(len(svm_predicted_stream)):
    hddm_a.add_element(svm_predicted_stream[i])
    if hddm_a.detected_change():
        print("Change detected in data: " + str(svm_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Support Vector Machines')

for i in range(len(rf_predicted_stream)):
    hddm_a.add_element(rf_predicted_stream[i])
    if hddm_a.detected_change():
        print("Change detected in data: " + str(rf_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Random Forest')

for i in range(len(xgb_predicted_stream)):
    hddm_a.add_element(xgb_predicted_stream[i])
    if hddm_a.detected_change():
        print("Change detected in data: " + str(xgb_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: XGBoost')

Change detected in data: 1 - at index: 59 for model: Random Forest


In [35]:
#HDDM_W
hddm_w = HDDM_W()

for i in range(len(logistic_predicted_stream)):
    hddm_w.add_element(logistic_predicted_stream[i])
    if hddm_w.detected_change():
        print("Change detected in data: " + str(logistic_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Logistic Regression')

for i in range(len(svm_predicted_stream)):
    hddm_w.add_element(svm_predicted_stream[i])
    if hddm_w.detected_change():
        print("Change detected in data: " + str(svm_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Support Vector Machines')

for i in range(len(rf_predicted_stream)):
    hddm_w.add_element(rf_predicted_stream[i])
    if hddm_w.detected_change():
        print("Change detected in data: " + str(rf_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Random Forest')

for i in range(len(xgb_predicted_stream)):
    hddm_w.add_element(xgb_predicted_stream[i])
    if hddm_w.detected_change():
        print("Change detected in data: " + str(xgb_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: XGBoost')

Change detected in data: 1 - at index: 726 for model: Support Vector Machines
Change detected in data: 1 - at index: 7373 for model: Support Vector Machines
Change detected in data: 1 - at index: 11727 for model: Support Vector Machines
Change detected in data: 1 - at index: 14282 for model: Support Vector Machines
Change detected in data: 1 - at index: 15281 for model: Support Vector Machines
Change detected in data: 1 - at index: 17592 for model: Support Vector Machines
Change detected in data: 1 - at index: 19 for model: Random Forest


In [36]:
#KSWIN
kswin = KSWIN()

for i in range(len(logistic_predicted_stream)):
    kswin.add_element(logistic_predicted_stream[i])
    if kswin.detected_change():
        print("Change detected in data: " + str(logistic_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Logistic Regression')

for i in range(len(svm_predicted_stream)):
    kswin.add_element(svm_predicted_stream[i])
    if kswin.detected_change():
        print("Change detected in data: " + str(svm_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Support Vector Machines')

for i in range(len(rf_predicted_stream)):
    kswin.add_element(rf_predicted_stream[i])
    if kswin.detected_change():
        print("Change detected in data: " + str(rf_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Random Forest')

for i in range(len(xgb_predicted_stream)):
    kswin.add_element(xgb_predicted_stream[i])
    if kswin.detected_change():
        print("Change detected in data: " + str(xgb_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: XGBoost')



Change detected in data: 1 - at index: 3221 for model: Support Vector Machines
Change detected in data: 1 - at index: 3996 for model: Support Vector Machines
Change detected in data: 1 - at index: 5204 for model: Support Vector Machines
Change detected in data: 0 - at index: 8433 for model: Support Vector Machines
Change detected in data: 0 - at index: 11523 for model: Support Vector Machines
Change detected in data: 1 - at index: 11891 for model: Support Vector Machines
Change detected in data: 1 - at index: 12063 for model: Support Vector Machines
Change detected in data: 1 - at index: 17230 for model: Support Vector Machines
Change detected in data: 1 - at index: 15 for model: Random Forest
Change detected in data: 1 - at index: 13105 for model: Random Forest


In [37]:
#Page Hinkley
page_hinkley = PageHinkley()

for i in range(len(logistic_predicted_stream)):
    page_hinkley.add_element(logistic_predicted_stream[i])
    if page_hinkley.detected_change():
        print("Change detected in data: " + str(logistic_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Logistic Regression')

for i in range(len(svm_predicted_stream)):
    page_hinkley.add_element(svm_predicted_stream[i])
    if page_hinkley.detected_change():
        print("Change detected in data: " + str(svm_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Support Vector Machines')

for i in range(len(rf_predicted_stream)):
    page_hinkley.add_element(rf_predicted_stream[i])
    if page_hinkley.detected_change():
        print("Change detected in data: " + str(rf_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: Random Forest')

for i in range(len(xgb_predicted_stream)):
    page_hinkley.add_element(xgb_predicted_stream[i])
    if page_hinkley.detected_change():
        print("Change detected in data: " + str(xgb_predicted_stream[i]) + ' - at index: ' + str(i) + ' for model: XGBoost')

Change detected in data: 1 - at index: 346 for model: Random Forest
