In [34]:
%load_ext nb_black
# !pip install nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [35]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import metrics as m
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE  # doctest: +NORMALIZE_WHITESPACE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from mlxtend.feature_selection import ColumnSelector
from sklearn import model_selection
from mlxtend.classifier import StackingClassifier
from thundersvm import SVC as svmgpu

import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Notebook-wide seed: passed as `random_state` to the samplers and models below
# and used to seed NumPy's global generator. A later cell rebinds it to 42.
randomseed = 7
np.random.seed(randomseed)

<IPython.core.display.Javascript object>

# 1. Read the dataset

In [36]:
# Load the feature table.
x_original = pd.read_csv("../dataset/XLable_onlyDiabeticRemoved.txt")

# Keep the row identifier ("Unnamed: 0", used as the merge key), the candidate
# predictors, and the FIELD_* questionnaire flags that the filtering cell needs.
# No CLASS column is derived from this frame: the label comes from the target
# file, and a CLASS computed here would be discarded by this selection anyway.
x_original = x_original[
    [
        "Unnamed: 0",
        "L100800",
        "L104600",
        "S000300",
        "AGE",
        "SEX",
        "FIELD_16",
        "FIELD_23",
        "FIELD_15",
        "FIELD_22",
        "FIELD_17",
        "FIELD_24",
    ]
]

<IPython.core.display.Javascript object>

In [37]:
# Load the target table and turn the L100800 reading into a 3-level label.
y_original = pd.read_csv("../dataset/TargetLable_onlyDiabeticRemoved.txt")

# A row without an L100800 value has no label. The three conditions below cover
# every non-missing value, so the `default` branch of np.select could only ever
# fire for NaN — which used to label such rows as class 0 (Normal). Drop them
# up front instead of inventing a label.
y_original = y_original.dropna(subset=["L100800"])

conditions = [
    (y_original.L100800 < 100),
    (y_original.L100800 >= 100) & (y_original.L100800 < 126),
    (y_original.L100800 >= 126),
]

# 0 = Normal, 1 = Prediabetes, 2 = diabetes (names used by the confusion
# matrices further down).
choices = [0, 1, 2]

# `assign` returns a new frame, so no column is written onto the filtered view.
y_original = y_original.assign(CLASS=np.select(conditions, choices, default=0))

# Only the merge key and the label are needed downstream.
y_original = y_original[["Unnamed: 0", "CLASS"]]

print(y_original.shape)

(185843, 2)


<IPython.core.display.Javascript object>

In [38]:
# Attach the label to each feature row; both frames share the "Unnamed: 0"
# identifier, and only identifiers present in both are kept.
data = x_original.merge(y_original, how="inner", on="Unnamed: 0")

<IPython.core.display.Javascript object>

In [39]:
# Filter the data set: drop every row where any of these questionnaire flags
# equals 1. Rows where a flag is missing are kept at this stage.
exclusion_flags = [
    "FIELD_16",  # diagnosed for diabetes
    "FIELD_23",  # on medication for diabetes
    "FIELD_15",  # diagnosed for high blood pressure
    "FIELD_22",  # on medication for high blood pressure
    "FIELD_17",  # diagnosed for hyperlipidemia
    "FIELD_24",  # on medication for hyperlipidemia
]
not_flagged = (data[exclusion_flags] != 1).all(axis=1)
data = data[not_flagged]

print(data.shape)

(169024, 13)


<IPython.core.display.Javascript object>

In [40]:
# Remove every row with a missing value in ANY column still present. This runs
# before the column subset below, so gaps in the FIELD_* flags and the row
# identifier also remove rows, not just gaps in the model features.
data = data.dropna()
print(data.shape)
# Keep the five predictors plus the label; CLASS must stay last because the
# feature/label split further down slices by position.
data = data[["L100800", "L104600", "S000300", "AGE", "SEX", "CLASS"]]
data.head()

(60035, 13)


Unnamed: 0,L100800,L104600,S000300,AGE,SEX,CLASS
2,78.0,5.28,20.2,46.0,1.0,0
5,90.0,5.74,25.5,52.0,1.0,0
10,86.0,5.83,21.2,37.0,1.0,0
11,86.0,4.73,22.0,39.0,1.0,0
20,87.0,5.6,24.6,59.0,1.0,0


<IPython.core.display.Javascript object>

# 2. Downsample the majority class and upsample the minority

In [41]:
# One frame per label value: 0 = normal, 1 = prediabetic, 2 = diabetic.
normal, prediabetic, diabetic = (data[data.CLASS == label] for label in (0, 1, 2))

# Class sizes, printed rarest first.
print(len(diabetic), len(prediabetic), len(normal))

1156 18708 40171


<IPython.core.display.Javascript object>

In [42]:
# Balanced hold-out set: 200 rows drawn from each class with the shared seed.
diabetic_test, prediabetic_test, normal_test = (
    group.sample(n=200, random_state=randomseed)
    for group in (diabetic, prediabetic, normal)
)
test = pd.concat([diabetic_test, prediabetic_test, normal_test])

# Everything that was not held out is available for training.
diabetic_train = diabetic.drop(diabetic_test.index)
prediabetic_train = prediabetic.drop(prediabetic_test.index)

# Down-sample the normal class to the size of the prediabetic training set.
normal_remaining = normal.drop(normal_test.index)
normal_train = normal_remaining.sample(
    n=len(prediabetic_train), random_state=randomseed
)

# The diabetic rows are included twice, i.e. duplicated once, before any
# synthetic over-sampling is applied.
train = pd.concat([diabetic_train] * 2 + [prediabetic_train, normal_train])

<IPython.core.display.Javascript object>

In [43]:
# Positional split: every column but the last is a feature, the last is the label.
xtrain, ytrain = train.iloc[:, :-1], train.iloc[:, -1]
xtest, ytest = test.iloc[:, :-1], test.iloc[:, -1]

<IPython.core.display.Javascript object>

In [44]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max on the training rows only, then apply that same
# mapping to both splits so no test-set statistics leak into the scaling.
scaler = MinMaxScaler().fit(xtrain)
xtrain = scaler.transform(xtrain)
xtest = scaler.transform(xtest)

<IPython.core.display.Javascript object>

In [45]:
from imblearn.over_sampling import SMOTE, SMOTENC  # doctest: +NORMALIZE_WHITESPACE

# NOTE: this rebinds the notebook-wide seed (7 at the top of the notebook);
# every estimator created after this cell therefore uses 42.
randomseed = 42

# Over-sample only the smallest class. Feature column 4 is declared categorical
# (SEX, given the column order selected when the data frame was trimmed).
sm = SMOTENC(
    categorical_features=[4],
    sampling_strategy="minority",
    random_state=randomseed,
)
X_res, y_res = sm.fit_resample(xtrain, ytrain)

# Report the class balance and the array shapes after resampling.
print("Resampled dataset shape %s" % Counter(y_res))
print(*(y_res[y_res == label].shape[0] for label in (0, 1, 2)))
print(X_res.shape, y_res.shape)

# From here on the training data is the resampled set.
xtrain, ytrain = X_res, y_res

Resampled dataset shape Counter({2: 18508, 1: 18508, 0: 18508})
18508 18508 18508
(55524, 5) (55524,)


<IPython.core.display.Javascript object>

# 3. Generate the classifier models based on the selected features

# Models

In [46]:
# Base learners shared by the stacking and voting ensembles below.

# Random forest. `max_features="sqrt"` is what "auto" meant for classifiers;
# "auto" was deprecated in scikit-learn 1.1 and removed in 1.3, so spelling it
# out keeps the cell runnable on current releases with the same behaviour.
rf_model = RandomForestClassifier(
    random_state=randomseed,
    n_estimators=100,
    max_depth=12,
    min_samples_split=2,
    min_samples_leaf=10,
    max_features="sqrt",
)

# Gradient-boosted trees. The target has three classes, so declare the
# multi-class objective instead of "binary:logistic".
# NOTE(review): the sklearn wrapper is understood to switch to a multi-class
# objective on its own when it sees more than two classes — confirm for the
# installed xgboost version.
xgb_model = xgb.XGBClassifier(objective="multi:softprob", random_state=randomseed)

# Hyper-parameters common to the GPU (thundersvm) and CPU (scikit-learn) SVMs,
# kept in one place so the two models cannot drift apart.
svm_shared_params = dict(
    C=70,
    cache_size=200,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovr",
    degree=3,
    gamma=0.001,
    kernel="linear",
    max_iter=-1,
    probability=True,
    random_state=42,
    shrinking=True,
    tol=0.001,
    verbose=False,
)

# GPU SVM: the shared settings plus the thundersvm-only device/resource options.
scv_model = svmgpu(gpu_id=0, max_mem_size=-1, n_jobs=-1, **svm_shared_params)

# CPU SVM with the identical shared settings.
scv_model_cpu = SVC(**svm_shared_params)

<IPython.core.display.Javascript object>

# 4. Generate Stacking Classifier

In [47]:
# Stack the three base learners under a random-forest meta-classifier.
# The meta-classifier now gets an explicit random_state like every other
# estimator in the notebook, so its result no longer depends on how much of
# NumPy's global random stream earlier cells happened to consume.
# NOTE(review): the recorded stacked confusion matrix is identical to the
# random forest's own; presumably the meta-learner is fitted on in-sample base
# predictions — confirm against the mlxtend docs and consider a
# cross-validated stacker.
sclf = StackingClassifier(
    classifiers=[rf_model, xgb_model, scv_model],
    meta_classifier=RandomForestClassifier(
        n_estimators=500, random_state=randomseed
    ),
    verbose=2,
)


<IPython.core.display.Javascript object>

In [48]:
# Train the stacking ensemble on the resampled training set; verbose=2 on the
# ensemble makes it log each base classifier as it is fitted.
sclf.fit(xtrain, ytrain)

Fitting 3 classifiers...
Fitting classifier1: randomforestclassifier (1/3)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=12, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
Fitting classifier2: xgbclassifier (2/3)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos

StackingClassifier(average_probas=False,
                   classifiers=[RandomForestClassifier(bootstrap=True,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=12,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=10,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       n_estimators=100,
                                                

<IPython.core.display.Javascript object>

In [49]:
# Score the stacking ensemble on the balanced hold-out set.
ypred = sclf.predict(xtest)
print("accuracy \t", m.accuracy_score(ytest, ypred))

# Confusion matrix with readable class names: rows are the true class,
# columns the predicted class.
class_names = ["Normal", "Prediabetes", "diabetes"]
confmatrx = pd.DataFrame(
    m.confusion_matrix(ytest, ypred), index=class_names, columns=class_names
)
confmatrx.head()

accuracy 	 0.735


Unnamed: 0,Normal,Prediabetes,diabetes
Normal,135,65,0
Prediabetes,51,136,13
diabetes,2,28,170


<IPython.core.display.Javascript object>

In [50]:
# Per-class precision / recall / F1 for the stacking ensemble's hold-out
# predictions (`ypred` as assigned by the preceding evaluation cell).
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.72      0.68      0.70       200
           1       0.59      0.68      0.63       200
           2       0.93      0.85      0.89       200

    accuracy                           0.73       600
   macro avg       0.75      0.73      0.74       600
weighted avg       0.75      0.73      0.74       600



<IPython.core.display.Javascript object>

# Compute the performance of each model of the sclf

In [51]:
# for i in range(len(sclf.clfs_)):
#     ypred = sclf.clfs_[i].predict((xtest))
#     print(type(sclf.clfs_[i]), "accuracy \t", m.accuracy_score(ytest, ypred))

<IPython.core.display.Javascript object>

In [52]:
# Evaluate each fitted base learner of the stacking ensemble on its own.
stackedmodels = sclf.clfs_
for i, base_model in enumerate(stackedmodels):
    ypred = base_model.predict(xtest)
    accuracy = m.accuracy_score(ytest, ypred)

    # The accuracy is reported twice: once tagged with the model type, once plain.
    print(type(base_model), "accuracy \t", accuracy)
    print("Accuracy = ", accuracy)

    class_names = ["Normal", "Prediabetes", "diabetes"]
    confmatrx = pd.DataFrame(
        m.confusion_matrix(ytest, ypred), index=class_names, columns=class_names
    )
    print(confmatrx.head())

    print(m.classification_report(ytest, ypred))

    # Position of the model inside the ensemble.
    print(i)

<class 'sklearn.ensemble.forest.RandomForestClassifier'> accuracy 	 0.735
Accuracy =  0.735
             Normal  Prediabetes  diabetes
Normal          135           65         0
Prediabetes      51          136        13
diabetes          2           28       170
              precision    recall  f1-score   support

           0       0.72      0.68      0.70       200
           1       0.59      0.68      0.63       200
           2       0.93      0.85      0.89       200

    accuracy                           0.73       600
   macro avg       0.75      0.73      0.74       600
weighted avg       0.75      0.73      0.74       600

0
<class 'xgboost.sklearn.XGBClassifier'> accuracy 	 0.7233333333333334
Accuracy =  0.7233333333333334
             Normal  Prediabetes  diabetes
Normal          135           64         1
Prediabetes      56          122        22
diabetes          1           22       177
              precision    recall  f1-score   support

           0       0.70  

<IPython.core.display.Javascript object>

# 5. Generate Voting Classifier

In [53]:
# Majority-vote ensemble over the random forest, XGBoost and the CPU SVM.
hard_voting_members = [
    ("rf_model", rf_model),
    ("xgb_model", xgb_model),
    ("scv_model", scv_model_cpu),
]
votingclf = VotingClassifier(estimators=hard_voting_members, voting="hard")

<IPython.core.display.Javascript object>

In [54]:
# Fit the hard-voting ensemble on the resampled training set.
votingclf.fit(xtrain, ytrain)

VotingClassifier(estimators=[('rf_model',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=12,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=10,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=100,
                                                     n_jobs=None,
     

<IPython.core.display.Javascript object>

In [55]:
# Score the hard-voting ensemble on the hold-out set (features cast to float).
ypred = votingclf.predict(xtest.astype(float))
print("accuracy \t", m.accuracy_score(ytest, ypred))

# Rows are the true class, columns the predicted class.
class_names = ["Normal", "Prediabetes", "diabetes"]
confmatrx = pd.DataFrame(
    m.confusion_matrix(ytest, ypred), index=class_names, columns=class_names
)
confmatrx.head()

accuracy 	 0.7233333333333334


Unnamed: 0,Normal,Prediabetes,diabetes
Normal,135,64,1
Prediabetes,55,123,22
diabetes,2,22,176


<IPython.core.display.Javascript object>

In [56]:
# Per-class precision / recall / F1 for the hard-voting ensemble's hold-out
# predictions (`ypred` as assigned by the preceding evaluation cell).
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.70      0.68      0.69       200
           1       0.59      0.61      0.60       200
           2       0.88      0.88      0.88       200

    accuracy                           0.72       600
   macro avg       0.73      0.72      0.72       600
weighted avg       0.73      0.72      0.72       600



<IPython.core.display.Javascript object>

In [57]:
# Evaluate each fitted member of the hard-voting ensemble on its own.
stackedmodels = votingclf.estimators_
for i in range(len(stackedmodels)):
    ypred = stackedmodels[i].predict(xtest)
    accuracy = m.accuracy_score(ytest, ypred)

    # Label the result with the type of the model actually being scored. This
    # used to read the type from the stacking ensemble (sclf.clfs_), whose third
    # member is a different SVM class, and required `sclf` to be fitted first.
    print(type(stackedmodels[i]), "accuracy \t", accuracy)
    print("Accuracy = ", accuracy)

    # Rows are the true class, columns the predicted class.
    confmatrx = pd.DataFrame(
        m.confusion_matrix(ytest, ypred),
        columns=["Normal", "Prediabetes", "diabetes"],
        index=["Normal", "Prediabetes", "diabetes"],
    )
    print(confmatrx.head())

    print(m.classification_report(ytest, ypred))

    # Position of the model inside the ensemble.
    print(i)

<class 'sklearn.ensemble.forest.RandomForestClassifier'> accuracy 	 0.735
Accuracy =  0.735
             Normal  Prediabetes  diabetes
Normal          135           65         0
Prediabetes      51          136        13
diabetes          2           28       170
              precision    recall  f1-score   support

           0       0.72      0.68      0.70       200
           1       0.59      0.68      0.63       200
           2       0.93      0.85      0.89       200

    accuracy                           0.73       600
   macro avg       0.75      0.73      0.74       600
weighted avg       0.75      0.73      0.74       600

0
<class 'xgboost.sklearn.XGBClassifier'> accuracy 	 0.7233333333333334
Accuracy =  0.7233333333333334
             Normal  Prediabetes  diabetes
Normal          135           64         1
Prediabetes      56          122        22
diabetes          1           22       177
              precision    recall  f1-score   support

           0       0.70  

<IPython.core.display.Javascript object>

# 6. Generate Voting Classifier (soft voting)

In [58]:

# CPU SVM used by the soft-voting ensemble. Compared with `scv_model_cpu` it
# uses C=1000, gamma=0.1, one-vs-one decision shape, no shrinking, and takes its
# seed from `randomseed` instead of a literal.
scv_model_cpu_withProb = SVC(
    kernel="linear",
    C=1000,
    gamma=0.1,
    degree=3,
    coef0=0.0,
    shrinking=False,
    probability=True,
    tol=0.001,
    cache_size=200,
    class_weight={},
    max_iter=-1,
    decision_function_shape="ovo",
    random_state=randomseed,
    verbose=False,
)


<IPython.core.display.Javascript object>

In [59]:
# Soft-voting ensemble over the random forest, XGBoost and the SVM defined for
# this section.
soft_voting_members = [
    ("rf_model", rf_model),
    ("xgb_model", xgb_model),
    ("scv_model", scv_model_cpu_withProb),
]
votingclf2 = VotingClassifier(estimators=soft_voting_members, voting="soft")

<IPython.core.display.Javascript object>

In [60]:
# Fit the soft-voting ensemble on the resampled training set.
votingclf2.fit(xtrain, ytrain)

VotingClassifier(estimators=[('rf_model',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=12,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=10,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=100,
                                                     n_jobs=None,
     

<IPython.core.display.Javascript object>

In [61]:
# Score the soft-voting ensemble on the hold-out set (features cast to float).
ypred = votingclf2.predict(xtest.astype(float))
print("accuracy \t", m.accuracy_score(ytest, ypred))

# Rows are the true class, columns the predicted class.
class_names = ["Normal", "Prediabetes", "diabetes"]
confmatrx = pd.DataFrame(
    m.confusion_matrix(ytest, ypred), index=class_names, columns=class_names
)
confmatrx.head()

accuracy 	 0.7233333333333334


Unnamed: 0,Normal,Prediabetes,diabetes
Normal,136,64,0
Prediabetes,55,126,19
diabetes,1,27,172


<IPython.core.display.Javascript object>

In [62]:
# Per-class precision / recall / F1 for the soft-voting ensemble's hold-out
# predictions (`ypred` as assigned by the preceding evaluation cell).
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.71      0.68      0.69       200
           1       0.58      0.63      0.60       200
           2       0.90      0.86      0.88       200

    accuracy                           0.72       600
   macro avg       0.73      0.72      0.73       600
weighted avg       0.73      0.72      0.73       600



<IPython.core.display.Javascript object>

In [63]:
# Evaluate each fitted member of the soft-voting ensemble on its own.
stackedmodels = votingclf2.estimators_
for i in range(len(stackedmodels)):
    ypred = stackedmodels[i].predict(xtest)
    accuracy = m.accuracy_score(ytest, ypred)

    # Label the result with the type of the model actually being scored. This
    # used to read the type from the stacking ensemble (sclf.clfs_), whose third
    # member is a different SVM class, and required `sclf` to be fitted first.
    print(type(stackedmodels[i]), "accuracy \t", accuracy)
    print("Accuracy = ", accuracy)

    # Rows are the true class, columns the predicted class.
    confmatrx = pd.DataFrame(
        m.confusion_matrix(ytest, ypred),
        columns=["Normal", "Prediabetes", "diabetes"],
        index=["Normal", "Prediabetes", "diabetes"],
    )
    print(confmatrx.head())

    print(m.classification_report(ytest, ypred))

    # Position of the model inside the ensemble.
    print(i)

<class 'sklearn.ensemble.forest.RandomForestClassifier'> accuracy 	 0.735
Accuracy =  0.735
             Normal  Prediabetes  diabetes
Normal          135           65         0
Prediabetes      51          136        13
diabetes          2           28       170
              precision    recall  f1-score   support

           0       0.72      0.68      0.70       200
           1       0.59      0.68      0.63       200
           2       0.93      0.85      0.89       200

    accuracy                           0.73       600
   macro avg       0.75      0.73      0.74       600
weighted avg       0.75      0.73      0.74       600

0
<class 'xgboost.sklearn.XGBClassifier'> accuracy 	 0.7233333333333334
Accuracy =  0.7233333333333334
             Normal  Prediabetes  diabetes
Normal          135           64         1
Prediabetes      56          122        22
diabetes          1           22       177
              precision    recall  f1-score   support

           0       0.70  

<IPython.core.display.Javascript object>