In [1]:
%load_ext nb_black
# The nb_black extension must be installed before the line above can run;
# if it is missing, install it once with: pip install nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import metrics as m
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE  # doctest: +NORMALIZE_WHITESPACE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from mlxtend.feature_selection import ColumnSelector
from sklearn import model_selection
from mlxtend.classifier import StackingClassifier
from thundersvm import SVC as svmgpu
from mlxtend.feature_selection import ColumnSelector
from mlxtend.classifier import StackingCVClassifier

import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Seed used by the sampling cells that build the train/test split.
# NOTE(review): the SMOTENC cell further down reassigns `randomseed` to 42, so
# the resampler and the estimators defined after it are seeded with 42, not 7.
# Confirm that using two different seeds is intentional.
randomseed = 7
np.random.seed(randomseed)

Using TensorFlow backend.


<IPython.core.display.Javascript object>

# 1. Read the dataset

In [3]:
# Feature table; "Unnamed: 0" is the key used later to merge in the target.
x_original = pd.read_csv("../dataset/XLable_onlyDiabeticRemoved.txt")

# Keep the merge key, the candidate model features and the diagnosis /
# medication flags (FIELD_15/16/17/22/23/24) that a later cell filters on.
# No CLASS column is derived from this table: the label is computed from the
# target file and attached through the merge, so a CLASS computed here would
# only be thrown away by this selection.
x_original = x_original[
    [
        "Unnamed: 0",
        "L100800",
        "L104600",
        "L103000",
        "S000300",
        "L101700",
        "L100700",
        "FIELD_33",
        "FIELD_38",
        "FIELD_40",
        "FIELD_31",
        "SEX",
        "AGE",
        "FIELD_16",
        "FIELD_23",
        "FIELD_15",
        "FIELD_22",
        "FIELD_17",
        "FIELD_24",
    ]
]

print(x_original.shape)

(185843, 19)


<IPython.core.display.Javascript object>

In [4]:
y_original = pd.read_csv("../dataset/TargetLable_onlyDiabeticRemoved.txt")

# A record without an L100800 value has no usable label. Every comparison
# against NaN is False, so np.select would fall through to its default and
# silently label such a record as class 0; drop those records instead.
y_original = y_original.dropna(subset=["L100800"])

# Discretise L100800 into the three target classes at the 100 and 126 cut-offs:
#   0 -> value < 100, 1 -> 100 <= value < 126, 2 -> value >= 126
conditions = [
    (y_original.L100800 < 100),
    (y_original.L100800 >= 100) & (y_original.L100800 < 126),
    (y_original.L100800 >= 126),
]
choices = [0, 1, 2]
# The three conditions are exhaustive once missing values are removed, so the
# default is never used; it is kept only to preserve the integer result dtype.
y_original = y_original.assign(CLASS=np.select(conditions, choices, default=0))

# Only the merge key and the label are needed downstream.
y_original = y_original[["Unnamed: 0", "CLASS"]]

print(y_original.shape)

(185843, 2)


<IPython.core.display.Javascript object>

In [5]:
# Attach the CLASS label to every feature row. The inner join keeps only the
# records whose "Unnamed: 0" key appears in both tables.
join_key = "Unnamed: 0"
data = x_original.merge(
    y_original, how="inner", left_on=join_key, right_on=join_key
)

<IPython.core.display.Javascript object>

In [6]:
# Filter the data set: remove every record whose flag equals 1 for any of the
# conditions below. Records with a missing flag are kept (NaN != 1 is True).
exclusion_flags = (
    "FIELD_16",  # diagnosed for diabetes
    "FIELD_23",  # on medication for diabetes
    "FIELD_15",  # diagnosed for high blood pressure
    "FIELD_22",  # on medication for high blood pressure
    "FIELD_17",  # diagnosed for hyperlipidemia
    "FIELD_24",  # on medication for hyperlipidemia
)
for flag in exclusion_flags:
    data = data[data[flag] != 1]

print(data.shape)

(169024, 20)


<IPython.core.display.Javascript object>

In [7]:
# Columns that feed the models. CLASS is kept last because later cells split
# features from the label by position (everything but the last column).
model_columns = [
    "L100800",
    "L104600",
    "L103000",
    "S000300",
    "L101700",
    "L100700",
    "FIELD_33",
    "FIELD_38",
    "FIELD_40",
    "FIELD_31",
    "SEX",
    "AGE",
    "CLASS",
]
# Discard every record that has a missing value in any of the kept columns.
data = data.loc[:, model_columns].dropna()
print(data.shape)
data.head()

(56542, 13)


Unnamed: 0,L100800,L104600,L103000,S000300,L101700,L100700,FIELD_33,FIELD_38,FIELD_40,FIELD_31,SEX,AGE,CLASS
2,78.0,5.28,41.0,20.2,15.0,3.8,1.0,2.0,1.0,0.0,1.0,46.0,0
5,90.0,5.74,50.0,25.5,12.0,3.4,1.0,0.0,1.0,0.0,1.0,52.0,0
10,86.0,5.83,45.0,21.2,17.0,3.9,1.0,0.0,1.0,1.0,1.0,37.0,0
11,86.0,4.73,54.0,22.0,30.0,4.2,1.0,2.0,3.0,0.0,1.0,39.0,0
20,87.0,5.6,340.0,24.6,26.0,4.7,1.0,0.0,2.0,0.0,1.0,59.0,0


<IPython.core.display.Javascript object>

# 2. Downsample the majority class and upsample the minority class

In [8]:
# One frame per CLASS value: 0 -> normal, 1 -> prediabetic, 2 -> diabetic.
normal, prediabetic, diabetic = (data[data.CLASS == label] for label in (0, 1, 2))

# Class sizes, printed from the smallest class label group to the largest.
print(len(diabetic), len(prediabetic), len(normal))

1045 17331 38166


<IPython.core.display.Javascript object>

In [9]:
# Hold out a class-balanced test set: the same number of randomly drawn
# records from each class.
test_per_class = 200
diabetic_test = diabetic.sample(n=test_per_class, random_state=randomseed)
prediabetic_test = prediabetic.sample(n=test_per_class, random_state=randomseed)
normal_test = normal.sample(n=test_per_class, random_state=randomseed)
test = pd.concat([diabetic_test, prediabetic_test, normal_test])

# Whatever was not held out remains available for training.
diabetic_train = diabetic.drop(index=diabetic_test.index)
prediabetic_train = prediabetic.drop(index=prediabetic_test.index)

# Downsample the normal class to the size of the prediabetic training set.
normal_remaining = normal.drop(index=normal_test.index)
normal_train = normal_remaining.sample(
    n=len(prediabetic_train), random_state=randomseed
)

# diabetic_train appears twice on purpose of the original code: every diabetic
# training record is duplicated before the later resampling step.
# NOTE(review): exact duplicates are each other's nearest neighbours in the
# SMOTENC step that follows; confirm the duplication is intended.
train = pd.concat([diabetic_train, diabetic_train, prediabetic_train, normal_train])

<IPython.core.display.Javascript object>

In [10]:
# CLASS is the last column of both frames: everything before it is a feature.
xtrain, ytrain = train.iloc[:, :-1], train.iloc[:, -1]
xtest, ytest = test.iloc[:, :-1], test.iloc[:, -1]

<IPython.core.display.Javascript object>

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column minimum and maximum from the training features only,
# then apply that same scaling to both splits, so that no statistic of the
# test set takes part in fitting.
scaler = MinMaxScaler().fit(xtrain)
xtrain = scaler.transform(xtrain)
xtest = scaler.transform(xtest)

<IPython.core.display.Javascript object>

In [12]:
from imblearn.over_sampling import SMOTE, SMOTENC  # doctest: +NORMALIZE_WHITESPACE

# NOTE(review): this overwrites the seed set at the top of the notebook (7).
# The resampler below and every estimator defined afterwards use 42, while the
# train/test sampling above used 7. Confirm this is intentional.
randomseed = 42

# Positions 6-10 of the feature matrix (FIELD_33, FIELD_38, FIELD_40,
# FIELD_31, SEX) are declared categorical to the resampler.
categorical_columns = [6, 7, 8, 9, 10]

# "minority" resamples only the smallest class.
sm = SMOTENC(
    categorical_features=categorical_columns,
    sampling_strategy="minority",
    random_state=randomseed,
)
X_res, y_res = sm.fit_resample(xtrain, ytrain)

print("Resampled dataset shape %s" % Counter(y_res))
# Per-class counts for labels 0, 1 and 2, in that order.
print(*((y_res == label).sum() for label in (0, 1, 2)))
print(X_res.shape, y_res.shape)

# From here on the training data is the resampled set.
xtrain, ytrain = X_res, y_res

Resampled dataset shape Counter({2: 17131, 1: 17131, 0: 17131})
17131 17131 17131
(51393, 12) (51393,)


<IPython.core.display.Javascript object>

# 3. Generate the classifier models based on the selected 10 features

# 3.1 Stacking classifier (features: 10)

In [13]:
# Ten candidate feature subsets. Each tuple holds 10 positional column indices
# into the 12-column feature matrix, whose columns follow the selection order
# used earlier (0 = L100800, 1 = L104600, ..., 10 = SEX, 11 = AGE).
# Only the first five subsets are consumed by the model-building loops that
# follow (they iterate over range(5)).
top10colscomb = [
    (0, 1, 2, 3, 4, 5, 6, 9, 10, 11),
    (0, 1, 3, 4, 6, 7, 8, 9, 10, 11),
    (0, 1, 2, 3, 4, 5, 6, 7, 10, 11),
    (0, 1, 2, 3, 4, 5, 6, 8, 10, 11),
    (0, 1, 2, 3, 5, 6, 8, 9, 10, 11),
    (0, 1, 3, 4, 5, 6, 8, 9, 10, 11),
    (0, 1, 3, 4, 5, 6, 7, 9, 10, 11),
    (0, 1, 2, 3, 4, 5, 8, 9, 10, 11),
    (0, 1, 2, 3, 4, 5, 6, 7, 9, 11),
    (0, 1, 2, 3, 4, 6, 7, 9, 10, 11),
]

<IPython.core.display.Javascript object>

In [14]:
# Models used in the 12 features set, re-declared here with every
# hyper-parameter spelled out. Each parameter set lives in its own dict so the
# four configurations can be compared side by side.

# Random forest.
rf_params = dict(
    bootstrap=True,
    class_weight=None,
    criterion="gini",
    max_depth=12,
    max_features="auto",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=10,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=None,
    oob_score=False,
    random_state=randomseed,
    verbose=0,
    warm_start=False,
)
rfclf = RandomForestClassifier(**rf_params)

# Gradient-boosted trees with a multi-class probability objective.
xgb_params = dict(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective="multi:softprob",
    random_state=randomseed,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=1,
    verbosity=1,
)
xgbclf = xgb.XGBClassifier(**xgb_params)

# Linear SVC from thundersvm (gpu_id=0), without probability estimates.
svm_gpu_params = dict(
    C=1000,
    cache_size=None,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=False,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)
svcclf = svmgpu(**svm_gpu_params)

# scikit-learn counterpart of the SVC above: same kernel and C, but with
# probability estimates enabled and without the thundersvm-only arguments
# (gpu_id, max_mem_size, n_jobs).
svm_cpu_params = dict(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    kernel="linear",
    max_iter=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)
scv_model_cpu = SVC(**svm_cpu_params)


<IPython.core.display.Javascript object>

In [15]:
# Generate the models from the selected column subsets and the ML classifiers.


def _subset_pipeline(subset_index, classifier):
    """Return a pipeline that keeps feature subset `subset_index` of
    `top10colscomb` and then applies `classifier`."""
    return make_pipeline(ColumnSelector(cols=top10colscomb[subset_index]), classifier)


# Unnamed pipelines for the stacking classifier: for each of the first five
# subsets, a random forest, an XGBoost model and the thundersvm SVC, in that order.
weakmodles = [
    _subset_pipeline(i, clf) for i in range(5) for clf in (rfclf, xgbclf, svcclf)
]

# (name, pipeline) pairs for the voting classifier. Same layout as above, but
# the SVC slot uses the scikit-learn model instead of the thundersvm one.
estimators = [
    (prefix + str(i), _subset_pipeline(i, clf))
    for i in range(5)
    for prefix, clf in (
        ("rf_model_52", rfclf),
        ("xgb_model_52", xgbclf),
        ("scv_model_cpu_52", scv_model_cpu),
    )
]

<IPython.core.display.Javascript object>

In [16]:
# Sanity check: 5 feature subsets x 3 classifiers should give 15 pipelines.
len(weakmodles)

15

<IPython.core.display.Javascript object>

In [17]:
# Add the stacking classifier: the pipelines in `weakmodles` are the first-level
# classifiers and an XGBoost model (multi:softmax objective) is the
# meta-classifier that combines their outputs.
sclf = StackingCVClassifier(
    classifiers=weakmodles,
    verbose=2,
    meta_classifier=xgb.XGBClassifier(
        objective="multi:softmax", random_state=randomseed
    ),
    # Use the shared notebook seed rather than a separate hard-coded literal so
    # that every estimator in this section is controlled from one place.
    # `randomseed` is 42 at this point, so the fitted model is unchanged.
    random_state=randomseed,
)

<IPython.core.display.Javascript object>

In [18]:
# Fit the stacked ensemble on the resampled training data. This is a slow
# cell: in the logged run that follows, each thundersvm SVC pipeline alone
# took roughly one to three minutes.
sclf.fit(xtrain, ytrain)

Fitting 15 classifiers...
Fitting classifier1: pipeline (1/15)
Pipeline(memory=None,
         steps=[('columnselector',
                 ColumnSelector(cols=(0, 1, 2, 3, 4, 5, 6, 9, 10, 11),
                                drop_axis=False)),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=12,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=10,
                                        min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=F

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier2: pipeline (2/15)
Pipeline(memory=None,
         steps=[('columnselector',
                 ColumnSelector(cols=(0, 1, 2, 3, 4, 5, 6, 9, 10, 11),
                                drop_axis=False)),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, learning_rate=0.1,
                               max_delta_step=0, max_depth=3,
                               min_child_weight=1, missing=None,
                               n_estimators=100, n_jobs=1, nthread=None,
                               objective='multi:softprob', random_state=42,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               seed=None, silent=None, subsample=1,
                               verbosity=1))],
         verbose=False)


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier3: pipeline (3/15)
Pipeline(memory=None,
         steps=[('columnselector',
                 ColumnSelector(cols=(0, 1, 2, 3, 4, 5, 6, 9, 10, 11),
                                drop_axis=False)),
                ('svc',
                 SVC(C=1000, cache_size=None, class_weight={}, coef0=0.0,
                     decision_function_shape='ovo', degree=3, gamma=0.1,
                     gpu_id=0, kernel='linear', max_iter=-1, max_mem_size=-1,
                     n_jobs=-1, probability=False, random_state=None,
                     shrinking=False, tol=0.001, verbose=False))],
         verbose=False)


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   56.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier4: pipeline (4/15)
Pipeline(memory=None,
         steps=[('columnselector',
                 ColumnSelector(cols=(0, 1, 3, 4, 6, 7, 8, 9, 10, 11),
                                drop_axis=False)),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=12,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=10,
                                        min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=42,
   

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier5: pipeline (5/15)
Pipeline(memory=None,
         steps=[('columnselector',
                 ColumnSelector(cols=(0, 1, 3, 4, 6, 7, 8, 9, 10, 11),
                                drop_axis=False)),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, learning_rate=0.1,
                               max_delta_step=0, max_depth=3,
                               min_child_weight=1, missing=None,
                               n_estimators=100, n_jobs=1, nthread=None,
                               objective='multi:softprob', random_state=42,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               seed=None, silent=None, subsample=1,
                               verbosity=1))],
         verbose=False)


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier6: pipeline (6/15)
Pipeline(memory=None,
         steps=[('columnselector',
                 ColumnSelector(cols=(0, 1, 3, 4, 6, 7, 8, 9, 10, 11),
                                drop_axis=False)),
                ('svc',
                 SVC(C=1000, cache_size=None, class_weight={}, coef0=0.0,
                     decision_function_shape='ovo', degree=3, gamma=0.1,
                     gpu_id=0, kernel='linear', max_iter=-1, max_mem_size=-1,
                     n_jobs=-1, probability=False, random_state=None,
                     shrinking=False, tol=0.001, verbose=False))],
         verbose=False)


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier7: pipeline (7/15)
Pipeline(memory=None,
         steps=[('columnselector',
                 ColumnSelector(cols=(0, 1, 2, 3, 4, 5, 6, 7, 10, 11),
                                drop_axis=False)),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=12,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=10,
                                        min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=42,
   

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier8: pipeline (8/15)
Pipeline(memory=None,
         steps=[('columnselector',
                 ColumnSelector(cols=(0, 1, 2, 3, 4, 5, 6, 7, 10, 11),
                                drop_axis=False)),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, learning_rate=0.1,
                               max_delta_step=0, max_depth=3,
                               min_child_weight=1, missing=None,
                               n_estimators=100, n_jobs=1, nthread=None,
                               objective='multi:softprob', random_state=42,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               seed=None, silent=None, subsample=1,
                               verbosity=1))],
         verbose=False)


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier9: pipeline (9/15)
Pipeline(memory=None,
         steps=[('columnselector',
                 ColumnSelector(cols=(0, 1, 2, 3, 4, 5, 6, 7, 10, 11),
                                drop_axis=False)),
                ('svc',
                 SVC(C=1000, cache_size=None, class_weight={}, coef0=0.0,
                     decision_function_shape='ovo', degree=3, gamma=0.1,
                     gpu_id=0, kernel='linear', max_iter=-1, max_mem_size=-1,
                     n_jobs=-1, probability=False, random_state=None,
                     shrinking=False, tol=0.001, verbose=False))],
         verbose=False)


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   42.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier10: pipeline (10/15)
Pipeline(memory=None,
         steps=[('columnselector',
                 ColumnSelector(cols=(0, 1, 2, 3, 4, 5, 6, 8, 10, 11),
                                drop_axis=False)),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=12,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=10,
                                        min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=42,
 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier11: pipeline (11/15)
Pipeline(memory=None,
         steps=[('columnselector',
                 ColumnSelector(cols=(0, 1, 2, 3, 4, 5, 6, 8, 10, 11),
                                drop_axis=False)),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, learning_rate=0.1,
                               max_delta_step=0, max_depth=3,
                               min_child_weight=1, missing=None,
                               n_estimators=100, n_jobs=1, nthread=None,
                               objective='multi:softprob', random_state=42,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               seed=None, silent=None, subsample=1,
                               verbosity=1))],
         verbose=False)


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier12: pipeline (12/15)
Pipeline(memory=None,
         steps=[('columnselector',
                 ColumnSelector(cols=(0, 1, 2, 3, 4, 5, 6, 8, 10, 11),
                                drop_axis=False)),
                ('svc',
                 SVC(C=1000, cache_size=None, class_weight={}, coef0=0.0,
                     decision_function_shape='ovo', degree=3, gamma=0.1,
                     gpu_id=0, kernel='linear', max_iter=-1, max_mem_size=-1,
                     n_jobs=-1, probability=False, random_state=None,
                     shrinking=False, tol=0.001, verbose=False))],
         verbose=False)


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   47.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier13: pipeline (13/15)
Pipeline(memory=None,
         steps=[('columnselector',
                 ColumnSelector(cols=(0, 1, 2, 3, 5, 6, 8, 9, 10, 11),
                                drop_axis=False)),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=12,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=10,
                                        min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=42,
 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier14: pipeline (14/15)
Pipeline(memory=None,
         steps=[('columnselector',
                 ColumnSelector(cols=(0, 1, 2, 3, 5, 6, 8, 9, 10, 11),
                                drop_axis=False)),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, learning_rate=0.1,
                               max_delta_step=0, max_depth=3,
                               min_child_weight=1, missing=None,
                               n_estimators=100, n_jobs=1, nthread=None,
                               objective='multi:softprob', random_state=42,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               seed=None, silent=None, subsample=1,
                               verbosity=1))],
         verbose=False)


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting classifier15: pipeline (15/15)
Pipeline(memory=None,
         steps=[('columnselector',
                 ColumnSelector(cols=(0, 1, 2, 3, 5, 6, 8, 9, 10, 11),
                                drop_axis=False)),
                ('svc',
                 SVC(C=1000, cache_size=None, class_weight={}, coef0=0.0,
                     decision_function_shape='ovo', degree=3, gamma=0.1,
                     gpu_id=0, kernel='linear', max_iter=-1, max_mem_size=-1,
                     n_jobs=-1, probability=False, random_state=None,
                     shrinking=False, tol=0.001, verbose=False))],
         verbose=False)


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.7min finished


StackingCVClassifier(classifiers=[Pipeline(memory=None,
                                           steps=[('columnselector',
                                                   ColumnSelector(cols=(0, 1, 2,
                                                                        3, 4, 5,
                                                                        6, 9,
                                                                        10,
                                                                        11),
                                                                  drop_axis=False)),
                                                  ('randomforestclassifier',
                                                   RandomForestClassifier(bootstrap=True,
                                                                          class_weight=None,
                                                                          criterion='gini',
                                             

<IPython.core.display.Javascript object>

In [19]:
# Score the stacked ensemble on the held-out, class-balanced test set.
ypred = sclf.predict(xtest)
stack_accuracy = m.accuracy_score(ytest, ypred)
print("Accuracy = ", stack_accuracy)

Accuracy =  0.7416666666666667


<IPython.core.display.Javascript object>

In [20]:
# Confusion matrix of the latest predictions: rows are the true classes and
# columns the predicted classes, both ordered by label (0, 1, 2).
class_names = ["Normal", "Prediabetes", "diabetes"]
confmatrx = pd.DataFrame(
    data=m.confusion_matrix(ytest, ypred), index=class_names, columns=class_names
)
confmatrx.head()

Unnamed: 0,Normal,Prediabetes,diabetes
Normal,150,50,0
Prediabetes,50,134,16
diabetes,4,35,161


<IPython.core.display.Javascript object>

In [21]:
# Per-class precision, recall and F1 for the predictions currently in `ypred`.
print(m.classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.74      0.75      0.74       200
           1       0.61      0.67      0.64       200
           2       0.91      0.81      0.85       200

    accuracy                           0.74       600
   macro avg       0.75      0.74      0.75       600
weighted avg       0.75      0.74      0.75       600



<IPython.core.display.Javascript object>

In [22]:
# First-level classifiers held by the fitted stacking model, so that each one
# can be evaluated on its own in the next cell.
stackedmodels = sclf.clfs_

<IPython.core.display.Javascript object>

In [23]:
# Evaluate every first-level model on the test set: accuracy, confusion matrix
# and classification report, followed by the model's position in the list.
class_names = ["Normal", "Prediabetes", "diabetes"]
for i, fitted_model in enumerate(stackedmodels):
    # Note: this overwrites the ensemble predictions stored in `ypred`.
    ypred = fitted_model.predict(xtest)
    print("Accuracy = ", m.accuracy_score(ytest, ypred))

    confmatrx = pd.DataFrame(
        data=m.confusion_matrix(ytest, ypred),
        index=class_names,
        columns=class_names,
    )
    print(confmatrx.head())

    print(m.classification_report(ytest, ypred))

    print(i)

Accuracy =  0.7383333333333333
             Normal  Prediabetes  diabetes
Normal          149           51         0
Prediabetes      54          128        18
diabetes          4           30       166
              precision    recall  f1-score   support

           0       0.72      0.74      0.73       200
           1       0.61      0.64      0.63       200
           2       0.90      0.83      0.86       200

    accuracy                           0.74       600
   macro avg       0.74      0.74      0.74       600
weighted avg       0.74      0.74      0.74       600

0
Accuracy =  0.745
             Normal  Prediabetes  diabetes
Normal          153           47         0
Prediabetes      55          119        26
diabetes          4           21       175
              precision    recall  f1-score   support

           0       0.72      0.77      0.74       200
           1       0.64      0.59      0.61       200
           2       0.87      0.88      0.87       200

    ac

<IPython.core.display.Javascript object>

# 3.2 Voting classifier

In [17]:
# Hard-voting ensemble over the named pipelines in `estimators` (these use the
# scikit-learn SVC, not the thundersvm one used for stacking).
votingclf = VotingClassifier(estimators=estimators, voting="hard")

<IPython.core.display.Javascript object>

In [18]:
# Fit every pipeline of the voting ensemble on the resampled training data.
votingclf.fit(xtrain, ytrain)

VotingClassifier(estimators=[('rf_model_520',
                              Pipeline(memory=None,
                                       steps=[('columnselector',
                                               ColumnSelector(cols=(0, 1, 2, 3,
                                                                    4, 5, 6, 9,
                                                                    10, 11),
                                                              drop_axis=False)),
                                              ('randomforestclassifier',
                                               RandomForestClassifier(bootstrap=True,
                                                                      class_weight=None,
                                                                      criterion='gini',
                                                                      max_depth=12,
                                                                      max_features='auto',
        

<IPython.core.display.Javascript object>

In [19]:
# Score the voting ensemble on the held-out test set.
ypred = votingclf.predict(xtest.astype(float))
voting_accuracy = m.accuracy_score(ytest, ypred)

print("accuracy \t", voting_accuracy)

# Rows are the true classes, columns the predicted classes (label order 0, 1, 2).
class_names = ["Normal", "Prediabetes", "diabetes"]
confmatrx = pd.DataFrame(
    data=m.confusion_matrix(ytest, ypred), index=class_names, columns=class_names
)
confmatrx.head()



accuracy 	 0.745


Unnamed: 0,Normal,Prediabetes,diabetes
Normal,150,49,1
Prediabetes,52,123,25
diabetes,4,22,174


<IPython.core.display.Javascript object>

In [20]:

# Per-class precision, recall and F1 of the voting ensemble; called through
# the `m` alias of sklearn.metrics like the other evaluation cells.
print(m.classification_report(ytest, ypred))


              precision    recall  f1-score   support

           0       0.73      0.75      0.74       200
           1       0.63      0.61      0.62       200
           2       0.87      0.87      0.87       200

    accuracy                           0.74       600
   macro avg       0.74      0.74      0.74       600
weighted avg       0.74      0.74      0.74       600



<IPython.core.display.Javascript object>

In [22]:

# Report test-set metrics for every fitted base estimator inside the ensemble.
# NOTE: the loop rebinds `ypred` and `confmatrx`, so after this cell they hold
# the results of the last base estimator rather than those of the ensemble.
stackedmodels = votingclf.estimators_
class_labels = ["Normal", "Prediabetes", "diabetes"]

for i, fitted_member in enumerate(stackedmodels):
    ypred = fitted_member.predict(xtest)

    # The accuracy is printed in two formats to keep the original output layout.
    member_accuracy = m.accuracy_score(ytest, ypred)
    print(type(fitted_member), "accuracy \t", member_accuracy)
    print("Accuracy = ", member_accuracy)

    confmatrx = pd.DataFrame(
        data=m.confusion_matrix(ytest, ypred),
        index=class_labels,
        columns=class_labels,
    )
    print(confmatrx.head())

    print(m.classification_report(ytest, ypred))

    # Position of the estimator that was just reported.
    print(i)

<class 'sklearn.pipeline.Pipeline'> accuracy 	 0.7383333333333333
Accuracy =  0.7383333333333333
             Normal  Prediabetes  diabetes
Normal          149           51         0
Prediabetes      54          128        18
diabetes          4           30       166
              precision    recall  f1-score   support

           0       0.72      0.74      0.73       200
           1       0.61      0.64      0.63       200
           2       0.90      0.83      0.86       200

    accuracy                           0.74       600
   macro avg       0.74      0.74      0.74       600
weighted avg       0.74      0.74      0.74       600

0
<class 'sklearn.pipeline.Pipeline'> accuracy 	 0.745
Accuracy =  0.745
             Normal  Prediabetes  diabetes
Normal          153           47         0
Prediabetes      55          119        26
diabetes          4           21       175
              precision    recall  f1-score   support

           0       0.72      0.77      0.74       

<class 'sklearn.pipeline.Pipeline'> accuracy 	 0.7383333333333333
Accuracy =  0.7383333333333333
             Normal  Prediabetes  diabetes
Normal          150           49         1
Prediabetes      54          119        27
diabetes          4           22       174
              precision    recall  f1-score   support

           0       0.72      0.75      0.74       200
           1       0.63      0.59      0.61       200
           2       0.86      0.87      0.87       200

    accuracy                           0.74       600
   macro avg       0.74      0.74      0.74       600
weighted avg       0.74      0.74      0.74       600

14


<IPython.core.display.Javascript object>

# 3.3 Soft voting classifier

In [23]:
# Rebuild the ensemble from the same `estimators` list, this time with soft voting.
# NOTE: this rebinds `votingclf`, replacing the previously fitted ensemble.
votingclf = VotingClassifier(
    voting="soft",
    estimators=estimators,
)

<IPython.core.display.Javascript object>

In [24]:
# Fit the soft-voting ensemble on the training split; the returned fitted
# estimator is displayed as the cell output.
votingclf.fit(X=xtrain, y=ytrain)

VotingClassifier(estimators=[('rf_model_520',
                              Pipeline(memory=None,
                                       steps=[('columnselector',
                                               ColumnSelector(cols=(0, 1, 2, 3,
                                                                    4, 5, 6, 9,
                                                                    10, 11),
                                                              drop_axis=False)),
                                              ('randomforestclassifier',
                                               RandomForestClassifier(bootstrap=True,
                                                                      class_weight=None,
                                                                      criterion='gini',
                                                                      max_depth=12,
                                                                      max_features='auto',
        

<IPython.core.display.Javascript object>

In [25]:
# Score the soft-voting ensemble on the test split (features cast to float first).
ypred = votingclf.predict(xtest.astype(float))

soft_accuracy = m.accuracy_score(ytest, ypred)
print("accuracy \t", soft_accuracy)

# Same class names on both axes of the confusion matrix
# (sklearn convention: rows are true classes, columns are predicted classes).
outcome_names = ["Normal", "Prediabetes", "diabetes"]
confmatrx = pd.DataFrame(
    m.confusion_matrix(ytest, ypred),
    index=outcome_names,
    columns=outcome_names,
)
confmatrx.head()



accuracy 	 0.745


Unnamed: 0,Normal,Prediabetes,diabetes
Normal,150,49,1
Prediabetes,52,124,24
diabetes,4,23,173


<IPython.core.display.Javascript object>

In [26]:

# Precision / recall / f1 per class for the predictions currently held in `ypred`.
print(classification_report(y_true=ytest, y_pred=ypred))


              precision    recall  f1-score   support

           0       0.73      0.75      0.74       200
           1       0.63      0.62      0.63       200
           2       0.87      0.86      0.87       200

    accuracy                           0.74       600
   macro avg       0.74      0.75      0.74       600
weighted avg       0.74      0.74      0.74       600



<IPython.core.display.Javascript object>

In [27]:
# Walk through the base estimators fitted inside the soft-voting ensemble and
# print test-set metrics for each one.
# NOTE: `ypred` and `confmatrx` are overwritten on every iteration, so once the
# loop finishes they describe the last base estimator, not the ensemble.
stackedmodels = votingclf.estimators_
outcome_names = ["Normal", "Prediabetes", "diabetes"]

for i, base_pipeline in enumerate(stackedmodels):
    ypred = base_pipeline.predict(xtest)
    base_accuracy = m.accuracy_score(ytest, ypred)

    # Two accuracy lines are emitted to match the original output format.
    print(type(base_pipeline), "accuracy \t", base_accuracy)
    print("Accuracy = ", base_accuracy)

    confmatrx = pd.DataFrame(
        m.confusion_matrix(ytest, ypred),
        index=outcome_names,
        columns=outcome_names,
    )
    print(confmatrx.head())

    print(m.classification_report(ytest, ypred))

    # Index of the estimator whose metrics were just printed.
    print(i)

<class 'sklearn.pipeline.Pipeline'> accuracy 	 0.7383333333333333
Accuracy =  0.7383333333333333
             Normal  Prediabetes  diabetes
Normal          149           51         0
Prediabetes      54          128        18
diabetes          4           30       166
              precision    recall  f1-score   support

           0       0.72      0.74      0.73       200
           1       0.61      0.64      0.63       200
           2       0.90      0.83      0.86       200

    accuracy                           0.74       600
   macro avg       0.74      0.74      0.74       600
weighted avg       0.74      0.74      0.74       600

0
<class 'sklearn.pipeline.Pipeline'> accuracy 	 0.745
Accuracy =  0.745
             Normal  Prediabetes  diabetes
Normal          153           47         0
Prediabetes      55          119        26
diabetes          4           21       175
              precision    recall  f1-score   support

           0       0.72      0.77      0.74       

<class 'sklearn.pipeline.Pipeline'> accuracy 	 0.7383333333333333
Accuracy =  0.7383333333333333
             Normal  Prediabetes  diabetes
Normal          150           49         1
Prediabetes      54          119        27
diabetes          4           22       174
              precision    recall  f1-score   support

           0       0.72      0.75      0.74       200
           1       0.63      0.59      0.61       200
           2       0.86      0.87      0.87       200

    accuracy                           0.74       600
   macro avg       0.74      0.74      0.74       600
weighted avg       0.74      0.74      0.74       600

14


<IPython.core.display.Javascript object>