In [39]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Tree Based Models

from operator import itemgetter
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor

# Linear Regression Based Models

from sklearn.linear_model import ElasticNet, HuberRegressor, LinearRegression, Ridge, Lasso, SGDRegressor, PassiveAggressiveRegressor

# Other types

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

# Metrics

from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Parallel

from dask.distributed import Client
from sklearn.externals.joblib import parallel_backend
from dask_ml.model_selection import GridSearchCV

# Helper functions

from feature_selection_helper import norm_cols 

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline


In [8]:
stk = pd.read_csv("../data/train_processed_upsampled2.csv")
stk.head()

Unnamed: 0,stroke_in_2018,average_blood_sugar,BMI,age_corr,high_BP_0,high_BP_1,smoker_status_active_smoker,smoker_status_non_smoker,smoker_status_quit,married_0,...,very_high_BMI,child,adult,senior,elderly,low_sugar,medium_sugar,high_sugar,very_high_sugar,has_smoked
0,0,71.67,36.6,61.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0,107.95,30.4,30.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0,76.49,42.1,51.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0,113.98,57.3,54.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0,70.6,26.7,27.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [9]:
X = stk.drop(labels="stroke_in_2018", axis=1).reset_index(drop=True)
y = stk["stroke_in_2018"].reset_index(drop=True)

In [10]:
X_norm = X.copy()
norm_cols(X_norm, ["average_blood_sugar", "BMI", "age_corr"])

X_norm.head()

Unnamed: 0,average_blood_sugar,BMI,age_corr,high_BP_0,high_BP_1,smoker_status_active_smoker,smoker_status_non_smoker,smoker_status_quit,married_0,married_1,...,very_high_BMI,child,adult,senior,elderly,low_sugar,medium_sugar,high_sugar,very_high_sugar,has_smoked
0,0.051133,0.274914,0.707317,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.204829,0.203895,0.329268,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.071553,0.337915,0.585366,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.230375,0.512027,0.621951,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0466,0.161512,0.292683,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [44]:
client = Client()
kf = StratifiedKFold(n_splits=5, random_state=99)

def get_cv_score(clf, X, y, kf):
    cv_scores = []
    with parallel_backend("dask"):
        for _, (train_index, test_index) in enumerate(kf.split(X, y)):
            print(_)
            train_X, train_y = X.loc[train_index], y[train_index] 
            test_X, test_y = X.loc[test_index], y[test_index]

            print(clf)
            clf.fit(train_X, train_y)
            y_pred = clf.predict(test_X)
            score = roc_auc_score(test_y, y_pred)
            
            print(score)
            cv_scores.append(score)
        return np.mean(cv_scores)

In [48]:
tree_model_selected_features = [
    'senior',
    'medium_BMI',
    'high_BMI',
    'low_sugar',
    'very_high_BMI',
    'elderly',
    'job_status_corr_business_owner',
    'job_status_corr_government',
    'medium_sugar',
    'age_corr',
    'smoker_status_non_smoker',
    'job_status_corr_private_sector',
    'high_sugar',
    'BMI',
    'average_blood_sugar'
]


gb_parameters = {
    "learning_rate": np.arange(0.01, 0.20, 0.05),
    "n_estimators": np.arange(30, 200, 20),
    "max_features": np.arange(1, len(tree_model_selected_features)+1)
}
_gb = GradientBoostingRegressor()
_gb_cv_clf = GridSearchCV(_gb, gb_parameters, cv=kf, scoring="roc_auc")
_gb_cv_clf.fit(X.loc[:, tree_model_selected_features], y)

GridSearchCV(cache_cv=True,
             cv=StratifiedKFold(n_splits=5, random_state=99, shuffle=False),
             error_score='raise',
             estimator=GradientBoostingRegressor(alpha=0.9,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_sp...
                                                 random_state=None,
                                                 subsample=1.0, tol=0.0001,
      

In [49]:
_gb_cv_clf.best_params_

{'learning_rate': 0.16000000000000003, 'max_features': 14, 'n_estimators': 190}

In [None]:
get_cv_score(_gb_cv_clf.best_estimator_, X_norm.loc[:, tree_model_selected_features], y, kf)

In [45]:
other_models = {
    "knn" : KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree'),
#     "svm" : svm.SVC(gamma='scale', probability=True),
    "log" : LogisticRegression(random_state=0, solver='lbfgs', multi_class='auto')
}

tree_based_models = {
    "dt": DecisionTreeRegressor(),
    "rf": RandomForestRegressor(),
    "ada": AdaBoostRegressor(),
    "gb": GradientBoostingRegressor(),
    "et": ExtraTreesRegressor(),
    "xgb": XGBRegressor()
}

linear_based_models = {
    "en": ElasticNet(),
    "hr": HuberRegressor(),
    "lr": LinearRegression(),
    "ridge": Ridge(),
    "lasso": Lasso(),
    "sgd": SGDRegressor(), 
    "pa": PassiveAggressiveRegressor()
}

estimators = [
    ('knn', KNeighborsRegressor(n_neighbors=3, algorithm='ball_tree')),
    ('logreg', LogisticRegression(random_state=0, solver='lbfgs', multi_class='auto')),
#     ('ridge', Ridge()),
    ('sgreg', SGDRegressor()), 
    ('lr', LinearRegression()),
    ('xgb', GradientBoostingRegressor()),
    ('et', ExtraTreesRegressor()),
    
]
ensemble_hard = VotingClassifier(estimators, voting='hard')
ensemble_soft = VotingClassifier(estimators, voting='soft')

In [46]:
get_cv_score(ensemble_hard, X_norm, y, kf)

0
VotingClassifier(estimators=[('knn',
                              KNeighborsRegressor(algorithm='ball_tree',
                                                  leaf_size=30,
                                                  metric='minkowski',
                                                  metric_params=None,
                                                  n_jobs=None, n_neighbors=3,
                                                  p=2, weights='uniform')),
                             ('logreg',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l...
                              ExtraTreesR

TypeError: Cannot cast array data from dtype('float64') to dtype('int64') according to the rule 'safe'

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.3)

In [None]:
ensemble_hard.fit(train_X, train_y)
ensemble_hard.score(X_test, y_test)