In [None]:
%load_ext nb_black
# !pip install nb_black

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import metrics as m
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE  # doctest: +NORMALIZE_WHITESPACE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from mlxtend.feature_selection import ColumnSelector
from sklearn import model_selection
from mlxtend.classifier import StackingClassifier
from thundersvm import SVC as svmgpu

import warnings

warnings.filterwarnings("ignore")
randomseed = 7
np.random.seed(randomseed)

Using TensorFlow backend.


<IPython.core.display.Javascript object>

# 1. Read the dataset

In [3]:
x_original = pd.read_csv("../dataset/XLable_onlyDiabeticRemoved.txt")

conditions = [
    (x_original.L100800 < 100),
    (x_original.L100800 >= 100) & (x_original.L100800 < 126),
    (x_original.L100800 >= 126),
]
choices = [0, 1, 2]
x_original["CLASS"] = np.select(conditions, choices, default=0)
x_original = x_original[
    [
        "Unnamed: 0",
        "L100800",
        "L104600",
        "L103000",
        "S000300",
        "L101700",
        "L100700",
        "FIELD_33",
        "FIELD_38",
        "FIELD_40",
        "FIELD_31",
        "SEX",
        "AGE",  #'CLASS',
        "FIELD_16",
        "FIELD_23",
        "FIELD_15",
        "FIELD_22",
        "FIELD_17",
        "FIELD_24",
    ]
]

print(x_original.shape)

(185843, 19)


<IPython.core.display.Javascript object>

In [4]:
y_original = pd.read_csv("../dataset/TargetLable_onlyDiabeticRemoved.txt")

conditions = [
    (y_original.L100800 < 100),
    (y_original.L100800 >= 100) & (y_original.L100800 < 126),
    (y_original.L100800 >= 126),
]

choices = [0, 1, 2]
y_original["CLASS"] = np.select(conditions, choices, default=0)

y_original = y_original[["Unnamed: 0", "CLASS"]]

print(y_original.shape)

(185843, 2)


<IPython.core.display.Javascript object>

In [5]:
data = pd.merge(
    x_original, y_original, how="inner", left_on="Unnamed: 0", right_on="Unnamed: 0"
)

<IPython.core.display.Javascript object>

In [6]:
# filter the data set
data = data[data.FIELD_16 != 1]  # exclude people who are diagnosed for (diabetes)
data = data[data.FIELD_23 != 1]  # exclude people who are on medication for diabetes

data = data[
    data.FIELD_15 != 1
]  # exclude people who are diagnosed for (high blood pressure)
data = data[
    data.FIELD_22 != 1
]  # exclude people who are on medication for high blood pressure

data = data[data.FIELD_17 != 1]  # exclude people who are diagnosed for hyperlipidemia
data = data[
    data.FIELD_24 != 1
]  # exclude people who are on medication for hyperlipidemia

print(data.shape)

(169024, 20)


<IPython.core.display.Javascript object>

In [7]:
data = data[
    [
        "L100800",
        "L104600",
        "L103000",
        "S000300",
        "L101700",
        "L100700",
        "FIELD_33",
        "FIELD_38",
        "FIELD_40",
        "FIELD_31",
        "SEX",
        "AGE",
        "CLASS",
    ]
]
data = data.dropna()
print(data.shape)
data.head()

(56542, 13)


Unnamed: 0,L100800,L104600,L103000,S000300,L101700,L100700,FIELD_33,FIELD_38,FIELD_40,FIELD_31,SEX,AGE,CLASS
2,78.0,5.28,41.0,20.2,15.0,3.8,1.0,2.0,1.0,0.0,1.0,46.0,0
5,90.0,5.74,50.0,25.5,12.0,3.4,1.0,0.0,1.0,0.0,1.0,52.0,0
10,86.0,5.83,45.0,21.2,17.0,3.9,1.0,0.0,1.0,1.0,1.0,37.0,0
11,86.0,4.73,54.0,22.0,30.0,4.2,1.0,2.0,3.0,0.0,1.0,39.0,0
20,87.0,5.6,340.0,24.6,26.0,4.7,1.0,0.0,2.0,0.0,1.0,59.0,0


<IPython.core.display.Javascript object>

# 2. Downsample the majority class and upsample the minority

In [8]:
diabetic = data[data.CLASS == 2]
prediabetic = data[data.CLASS == 1]
normal = data[data.CLASS == 0]

print(diabetic.shape[0], prediabetic.shape[0], normal.shape[0])

1045 17331 38166


<IPython.core.display.Javascript object>

In [9]:
diabetic_test = diabetic.sample(200, random_state=randomseed)
prediabetic_test = prediabetic.sample(200, random_state=randomseed)
normal_test = normal.sample(200, random_state=randomseed)
test = pd.concat([diabetic_test, prediabetic_test, normal_test])

diabetic_train = diabetic.drop(diabetic_test.index)
prediabetic_train = prediabetic.drop(prediabetic_test.index)
# .sample(
#     10 * diabetic_train.shape[0], random_state=randomseed
# )
normal_train = normal.drop(normal_test.index).sample(
    prediabetic_train.shape[0],
    random_state=randomseed
    #     10 * diabetic_train.shape[0], random_state=randomseed
)
train = pd.concat([diabetic_train, diabetic_train, prediabetic_train, normal_train])

<IPython.core.display.Javascript object>

In [10]:
xtrain = train.iloc[:, :-1]
ytrain = train.iloc[:, -1]
xtest = test.iloc[:, :-1]
ytest = test.iloc[:, -1]

<IPython.core.display.Javascript object>

In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

<IPython.core.display.Javascript object>

In [12]:
from imblearn.over_sampling import SMOTE, SMOTENC  # doctest: +NORMALIZE_WHITESPACE

randomseed = 42

sm = SMOTENC(
    random_state=randomseed,
    categorical_features=[6, 7, 8, 9, 10],
    sampling_strategy="minority",
)
X_res, y_res = sm.fit_resample(xtrain, ytrain)

print("Resampled dataset shape %s" % Counter(y_res))
print(
    y_res[y_res == 0].shape[0], y_res[y_res == 1].shape[0], y_res[y_res == 2].shape[0]
)
print(X_res.shape, y_res.shape)

xtrain = X_res
ytrain = y_res

Resampled dataset shape Counter({2: 17131, 1: 17131, 0: 17131})
17131 17131 17131
(51393, 12) (51393,)


<IPython.core.display.Javascript object>

# 3. Generate the classifier models based on the selected  features

# Models

In [13]:
# generate the models based on the selected columns list and the ml classifiers

weakmodles = []
estimators = []

<IPython.core.display.Javascript object>

# 3.1. 12 features

In [14]:
rf_model_12 = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion="gini",
    max_depth=10,
    max_features="log2",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=2,
    min_samples_split=12,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=-1,
    oob_score=False,
    random_state=randomseed,
    verbose=1,
    warm_start=False,
)

xgb_model_12 = xgb.XGBClassifier(objective="binary:logistic", random_state=randomseed)

scv_model_12 = svmgpu(
    C=100,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=False,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

scv_model_cpu_12 = SVC(
    C=100,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    #     gpu_id=0,
    kernel="linear",
    max_iter=-1,
    #     max_mem_size=-1,
    #     n_jobs=-1,
    probability=False,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

weakmodles.append(rf_model_12)
weakmodles.append(xgb_model_12)
weakmodles.append(scv_model_12)

estimators.append(("rf_model_12", rf_model_12))
estimators.append(("xgb_model_12", xgb_model_12))
estimators.append(("scv_model_12", scv_model_cpu_12))

# estimators = [
#     ("rf_model_12", rf_model_12),
#     ("xgb_model_12", xgb_model_12),
#     ("scv_model_12", scv_model_12),
# ]

<IPython.core.display.Javascript object>

# 3.2. 5 features

In [15]:
rf_model_5 = RandomForestClassifier(
    random_state=randomseed,
    n_estimators=100,
    max_depth=12,
    min_samples_split=2,
    min_samples_leaf=10,
    max_features="auto",
)

xgb_model_5 = xgb.XGBClassifier(objective="binary:logistic", random_state=randomseed)

scv_model_5 = svmgpu(
    C=70,
    cache_size=200,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovr",
    degree=3,
    gamma=0.001,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=True,
    random_state=42,
    shrinking=True,
    tol=0.001,
    verbose=False,
)


scv_model_cpu_5 = SVC(
    C=70,
    cache_size=200,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovr",
    degree=3,
    gamma=0.001,
    #     gpu_id=0,
    kernel="linear",
    max_iter=-1,
    #     max_mem_size=-1,
    #     n_jobs=-1,
    probability=True,
    random_state=42,
    shrinking=True,
    tol=0.001,
    verbose=False,
)


weakmodles.append(rf_model_5)
weakmodles.append(xgb_model_5)
weakmodles.append(scv_model_5)

estimators.append(("rf_model_5", rf_model_5))
estimators.append(("xgb_model_5", xgb_model_5))
estimators.append(("scv_model_5", scv_model_cpu_5))


<IPython.core.display.Javascript object>

# 3.3. 10 Features

In [16]:
top10colscomb = [
    (0, 1, 2, 3, 4, 5, 6, 9, 10, 11),
    (0, 1, 3, 4, 6, 7, 8, 9, 10, 11),
    (0, 1, 2, 3, 4, 5, 6, 7, 10, 11),
    (0, 1, 2, 3, 4, 5, 6, 8, 10, 11),
    (0, 1, 2, 3, 5, 6, 8, 9, 10, 11),
    (0, 1, 3, 4, 5, 6, 8, 9, 10, 11),
    (0, 1, 3, 4, 5, 6, 7, 9, 10, 11),
    (0, 1, 2, 3, 4, 5, 8, 9, 10, 11),
    (0, 1, 2, 3, 4, 5, 6, 7, 9, 11),
    (0, 1, 2, 3, 4, 6, 7, 9, 10, 11),
]

# modles used in the 12 features set
rf_model_10 = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion="gini",
    max_depth=12,
    max_features="auto",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=10,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=None,
    oob_score=False,
    random_state=42,
    verbose=0,
    warm_start=False,
)

xgb_model_10 = xgb.XGBClassifier(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective="multi:softprob",
    random_state=42,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=1,
    verbosity=1,
)

scv_model_10 = svmgpu(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=False,
    random_state=None,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

scv_model_cpu_10 = SVC(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,  # gpu_id=0,
    kernel="linear",
    max_iter=-1,  # max_mem_size=-1, n_jobs=-1,
    probability=True,
    random_state=None,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

# generate the models based on the selected columns list and the ml classifiers
for i in range(5):
    weakmodles.append(make_pipeline(ColumnSelector(cols=top10colscomb[i]), rf_model_10))
    weakmodles.append(
        make_pipeline(ColumnSelector(cols=top10colscomb[i]), xgb_model_10)
    )
    weakmodles.append(
        make_pipeline(ColumnSelector(cols=top10colscomb[i]), scv_model_10)
    )

for i in range(5):
    estimators.append(
        (
            (
                "rf_model_10"str(i),
                (make_pipeline(ColumnSelector(cols=top10colscomb[i]), rf_model_10)),
            )
        )
    )
    estimators.append(
        (
            "xgb_model_10"str(i),
            (make_pipeline(ColumnSelector(cols=top10colscomb[i]), xgb_model_10)),
        )
    )
    estimators.append(
        (
            "scv_model_cpu_10"str(i),
            (make_pipeline(ColumnSelector(cols=top10colscomb[i]), scv_model_cpu_10)),
        )
    )

SyntaxError: invalid syntax (<ipython-input-16-f4dc9eca7c8d>, line 112)

ERROR:root:Cannot parse: 112:29:                 "rf_model_10"str(i),
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\lab_black.py", line 218, in format_cell
    formatted_code = _format_code(cell)
  File "C:\ProgramData\Anaconda3\lib\site-packages\lab_black.py", line 29, in _format_code
    return format_str(src_contents=code, mode=FileMode())
  File "C:\ProgramData\Anaconda3\lib\site-packages\black.py", line 669, in format_str
    src_node = lib2to3_parse(src_contents.lstrip(), mode.target_versions)
  File "C:\ProgramData\Anaconda3\lib\site-packages\black.py", line 758, in lib2to3_parse
    raise exc from None
black.InvalidInput: Cannot parse: 112:29:                 "rf_model_10"str(i),


# 3.4. 9 Features

In [None]:
topcols9comb = [
    (0, 1, 2, 3, 5, 6, 8, 10, 11),
    (0, 1, 2, 3, 4, 6, 9, 10, 11),
    (0, 1, 2, 3, 4, 5, 6, 9, 11),
    (0, 1, 3, 4, 6, 7, 9, 10, 11),
    (0, 1, 3, 4, 5, 8, 9, 10, 11),
    (0, 1, 3, 4, 5, 6, 9, 10, 11),
    (0, 1, 3, 4, 5, 6, 8, 10, 11),
    (0, 1, 3, 4, 6, 7, 8, 10, 11),
    (0, 1, 3, 4, 5, 6, 7, 8, 11),
    (0, 1, 3, 5, 6, 8, 9, 10, 11),
]

# modles used in the 12 features set
rf_model_9 = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion="gini",
    max_depth=12,
    max_features="auto",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=10,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=None,
    oob_score=False,
    random_state=randomseed,
    verbose=0,
    warm_start=False,
)

xgb_model_9 = xgb.XGBClassifier(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective="multi:softprob",
    random_state=randomseed,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=1,
    verbosity=1,
)

scv_model_9 = svmgpu(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=False,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

scv_model_cpu_9 = SVC(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,  # gpu_id=0,
    kernel="linear",
    max_iter=-1,  # max_mem_size=-1, n_jobs=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)


# generate the models based on the selected columns list and the ml classifiers
for i in range(5):
    weakmodles.append(make_pipeline(ColumnSelector(cols=topcols9comb[i]), rf_model_9))
    weakmodles.append(make_pipeline(ColumnSelector(cols=topcols9comb[i]), xgb_model_9))
    weakmodles.append(make_pipeline(ColumnSelector(cols=topcols9comb[i]), scv_model_9))

for i in range(5):
    estimators.append(
        (
            (
                "rf_model_9"+str(i),
                (make_pipeline(ColumnSelector(cols=topcols9comb[i]), rf_model_9)),
            )
        )
    )
    estimators.append(
        (
            "xgb_model_9"+str(i),
            (make_pipeline(ColumnSelector(cols=topcols9comb[i]), xgb_model_9)),
        )
    )
    estimators.append(
        (
            "scv_model_cpu_9"+str(i),
            (make_pipeline(ColumnSelector(cols=topcols9comb[i]), scv_model_cpu_9)),
        )
    )

# 3.5. 8 Features

In [None]:
top8colscomb = [
    (0, 1, 2, 3, 5, 6, 10, 11),
    (0, 1, 2, 3, 4, 6, 10, 11),
    (0, 1, 2, 3, 4, 9, 10, 11),
    (0, 1, 2, 3, 5, 9, 10, 11),
    (0, 1, 3, 4, 5, 6, 9, 11),
    (0, 1, 3, 4, 6, 9, 10, 11),
    (0, 1, 3, 4, 8, 9, 10, 11),
    (0, 1, 2, 6, 8, 9, 10, 11),
    (0, 1, 2, 3, 5, 8, 10, 11),
    (0, 1, 3, 4, 5, 6, 10, 11),
]

# modles used in the 12 features set
rf_model_8 = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion="gini",
    max_depth=12,
    max_features="auto",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=10,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=None,
    oob_score=False,
    random_state=randomseed,
    verbose=0,
    warm_start=False,
)

xgb_model_8 = xgb.XGBClassifier(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective="multi:softprob",
    random_state=randomseed,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=1,
    verbosity=1,
)

scv_model_8 = svmgpu(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=False,
    random_state=None,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

scv_model_cpu_8 = SVC(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,  # gpu_id=0,
    kernel="linear",
    max_iter=-1,  # max_mem_size=-1, n_jobs=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

# generate the models based on the selected columns list and the ml classifiers
for i in range(5):
    weakmodles.append(make_pipeline(ColumnSelector(cols=top8colscomb[i]), rf_model_8))
    weakmodles.append(make_pipeline(ColumnSelector(cols=top8colscomb[i]), xgb_model_8))
    weakmodles.append(make_pipeline(ColumnSelector(cols=top8colscomb[i]), scv_model_8))

for i in range(5):
    estimators.append(
        (
            (
                "rf_model_8",
                (make_pipeline(ColumnSelector(cols=top8colscomb[i]), rf_model_8)),
            )
        )
    )
    estimators.append(
        (
            "xgb_model_8",
            (make_pipeline(ColumnSelector(cols=top8colscomb[i]), xgb_model_8)),
        )
    )
    estimators.append(
        (
            "scv_model_cpu_8",
            (make_pipeline(ColumnSelector(cols=top8colscomb[i]), scv_model_cpu_8)),
        )
    )

# 3.6 7 Features

In [None]:
top7colscomb = [
    (0, 1, 3, 5, 6, 10, 11),
    (0, 1, 3, 4, 5, 10, 11),
    (0, 1, 2, 3, 4, 10, 11),
    (0, 1, 2, 3, 5, 10, 11),
    (0, 1, 3, 6, 8, 10, 11),
    (0, 1, 6, 8, 9, 10, 11),
    (0, 1, 6, 7, 8, 9, 10),
    (0, 1, 2, 3, 8, 10, 11),
    (0, 1, 2, 6, 8, 10, 11),
    (0, 1, 3, 8, 9, 10, 11),
]


rf_model_7 = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion="gini",
    max_depth=12,
    max_features="auto",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=10,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=None,
    oob_score=False,
    random_state=randomseed,
    verbose=0,
    warm_start=False,
)

xgb_model_7 = xgb.XGBClassifier(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective="multi:softprob",
    random_state=randomseed,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=1,
    verbosity=1,
)

scv_model_7 = svmgpu(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=False,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

scv_model_cpu_7 = SVC(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,  # gpu_id=0,
    kernel="linear",
    max_iter=-1,  # max_mem_size=-1, n_jobs=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)


# generate the models based on the selected columns list and the ml classifiers
for i in range(5):
    weakmodles.append(make_pipeline(ColumnSelector(cols=top7colscomb[i]), rf_model_7))
    weakmodles.append(make_pipeline(ColumnSelector(cols=top7colscomb[i]), xgb_model_7))
    weakmodles.append(make_pipeline(ColumnSelector(cols=top7colscomb[i]), scv_model_7))

for i in range(5):
    estimators.append(
        (
            (
                "rf_model_7",
                (make_pipeline(ColumnSelector(cols=top7colscomb[i]), rf_model_7)),
            )
        )
    )
    estimators.append(
        (
            "xgb_model_7",
            (make_pipeline(ColumnSelector(cols=top7colscomb[i]), xgb_model_7)),
        )
    )
    estimators.append(
        (
            "scv_model_cpu_7",
            (make_pipeline(ColumnSelector(cols=top7colscomb[i]), scv_model_cpu_7)),
        )
    )

# 3.7. 6 Features

In [None]:
top6colscomb = [
    (0, 1, 8, 9, 10),
    (0, 1, 6, 8, 10),
    (0, 1, 7, 9, 10),
    (0, 1, 2, 8, 10),
    (0, 1, 5, 8, 10),
    (0, 1, 5, 6, 11),
    (0, 1, 7, 8, 10),
    (0, 1, 9, 10, 11),
    (0, 1, 5, 10, 11),
    (0, 1, 2, 5, 11),
]

# modles used in the 12 features set
rf_model_6 = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion="gini",
    max_depth=12,
    max_features="auto",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=10,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=None,
    oob_score=False,
    random_state=randomseed,
    verbose=0,
    warm_start=False,
)

xgb_model_6 = xgb.XGBClassifier(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective="multi:softprob",
    random_state=randomseed,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=1,
    verbosity=1,
)

scv_model_6 = svmgpu(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=False,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

scv_model_cpu_6 = SVC(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,  # gpu_id=0,
    kernel="linear",
    max_iter=-1,  # max_mem_size=-1, n_jobs=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)


# generate the models based on the selected columns list and the ml classifiers
for i in range(5):
    weakmodles.append(make_pipeline(ColumnSelector(cols=top6colscomb[i]), rf_model_6))
    weakmodles.append(make_pipeline(ColumnSelector(cols=top6colscomb[i]), xgb_model_6))
    weakmodles.append(make_pipeline(ColumnSelector(cols=top6colscomb[i]), scv_model_6))

for i in range(5):
    estimators.append(
        (
            (
                "rf_model_6",
                (make_pipeline(ColumnSelector(cols=top7colscomb[i]), rf_model_6)),
            )
        )
    )
    estimators.append(
        (
            "xgb_model_6",
            (make_pipeline(ColumnSelector(cols=top7colscomb[i]), xgb_model_6)),
        )
    )
    estimators.append(
        (
            "scv_model_cpu_6",
            (make_pipeline(ColumnSelector(cols=top7colscomb[i]), scv_model_cpu_6)),
        )
    )

# 3.8. 5 Features

In [None]:
top5colscomb = [
    (0, 1, 8, 9, 10),
    (0, 1, 6, 8, 10),
    (0, 1, 7, 9, 10),
    (0, 1, 2, 8, 10),
    (0, 1, 5, 8, 10),
    (0, 1, 5, 6, 11),
    (0, 1, 7, 8, 10),
    (0, 1, 9, 10, 11),
    (0, 1, 5, 10, 11),
    (0, 1, 2, 5, 11),
]


rf_model_5_2 = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion="gini",
    max_depth=12,
    max_features="auto",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=10,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=None,
    oob_score=False,
    random_state=randomseed,
    verbose=0,
    warm_start=False,
)

xgb_model_5_2 = xgb.XGBClassifier(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective="multi:softprob",
    random_state=randomseed,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=1,
    verbosity=1,
)

scv_model_5_2 = svmgpu(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=False,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

scv_model_cpu_5_2 = SVC(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    #     gpu_id=0,
    kernel="linear",
    max_iter=-1,
    # max_mem_size=-1,   n_jobs=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)


# generate the models based on the selected columns list and the ml classifiers
for i in range(5):
    weakmodles.append(make_pipeline(ColumnSelector(cols=top5colscomb[i]), rf_model_5_2))
    weakmodles.append(
        make_pipeline(ColumnSelector(cols=top5colscomb[i]), xgb_model_5_2)
    )
    weakmodles.append(
        make_pipeline(ColumnSelector(cols=top5colscomb[i]), scv_model_5_2)
    )

for i in range(5):
    estimators.append(
        (
            (
                "rf_model_52" + str(i),
                (make_pipeline(ColumnSelector(cols=top5colscomb[i]), rf_model_5_2)),
            )
        )
    )
    estimators.append(
        (
            "xgb_model_52" + str(i),
            (make_pipeline(ColumnSelector(cols=top5colscomb[i]), xgb_model_5_2)),
        )
    )
    estimators.append(
        (
            "scv_model_cpu_52" + str(i),
            (make_pipeline(ColumnSelector(cols=top5colscomb[i]), scv_model_cpu_5_2)),
        )
    )

In [None]:
len(weakmodles)

In [None]:
len(estimators)

In [None]:
from itertools import groupby

a = [
    "rf_model_5",
    "xgb_model_5",
    "scv_model_5",
    
    "rf_model_5_2",
    "xgb_model_5_2",
    "scv_model_cpu_5_2",
    
    "rf_model_5_2",
    "xgb_model_5_2",
    "scv_model_cpu_5_2",
    "rf_model_5_2",
    "xgb_model_5_2",
    "scv_model_cpu_5_2",
    "rf_model_5_2",
    "xgb_model_5_2",
    "scv_model_cpu_5_2",
    "rf_model_5_2",
    "xgb_model_5_2",
    "scv_model_cpu_5_2",
]
[len(list(group)) for key, group in groupby(a)]

In [None]:
np.unique(a)

In [None]:
estimators

# 4. Generate Stacking Classifier

In [None]:
sclf = StackingClassifier(classifiers=weakmodles,verbose=2,
                           meta_classifier=RandomForestClassifier(n_estimators=500))


In [None]:
sclf.fit(xtrain, ytrain)

In [None]:
ypred = sclf.predict((xtest))

print("accuracy \t", m.accuracy_score(ytest, ypred))
confmatrx = pd.DataFrame(
    m.confusion_matrix(ytest, ypred),
    columns=["Normal", "Prediabetes", "diabetes"],
    index=["Normal", "Prediabetes", "diabetes"],
)
confmatrx.head()

In [None]:
print(classification_report(ytest, ypred))

# Compute the performance of each model of the sclf

In [None]:
# for i in range(len(sclf.clfs_)):
#     ypred = sclf.clfs_[i].predict((xtest))
#     print(type(sclf.clfs_[i]), "accuracy \t", m.accuracy_score(ytest, ypred))

In [None]:
stackedmodels = sclf.clfs_
for i in range(len(stackedmodels)):
    ypred = stackedmodels[i].predict(xtest)
    print(type(sclf.clfs_[i]), "accuracy \t", m.accuracy_score(ytest, ypred))
    print("Accuracy = ", m.accuracy_score(ytest, ypred))

    confmatrx = pd.DataFrame(
        m.confusion_matrix(ytest, ypred),
        columns=["Normal", "Prediabetes", "diabetes"],
        index=["Normal", "Prediabetes", "diabetes"],
    )
    print(confmatrx.head())

    print(m.classification_report(ytest, ypred))

    print(i)

# 5. Generate Votting Classifer

In [None]:
votingclf = VotingClassifier(estimators=estimators, voting="hard")

In [None]:
votingclf.fit(xtrain, ytrain)

In [None]:
ypred = votingclf.predict(xtest.astype(float))

print("accuracy \t", m.accuracy_score(ytest, ypred))
confmatrx = pd.DataFrame(
    m.confusion_matrix(ytest, ypred),
    columns=["Normal", "Prediabetes", "diabetes"],
    index=["Normal", "Prediabetes", "diabetes"],
)
confmatrx.head()

In [None]:
print(classification_report(ytest, ypred))

In [None]:
stackedmodels = votingclf.estimators_
for i in range(len(stackedmodels)):
    ypred = stackedmodels[i].predict(xtest)
    print(type(sclf.clfs_[i]), "accuracy \t", m.accuracy_score(ytest, ypred))
    print("Accuracy = ", m.accuracy_score(ytest, ypred))

    confmatrx = pd.DataFrame(
        m.confusion_matrix(ytest, ypred),
        columns=["Normal", "Prediabetes", "diabetes"],
        index=["Normal", "Prediabetes", "diabetes"],
    )
    print(confmatrx.head())

    print(m.classification_report(ytest, ypred))

    print(i)

# 6  Generate Votting Classifer soft voting

In [None]:

scv_model_cpu_withProb = SVC(
  C=100,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
#     gpu_id=0,
    kernel="linear",
    max_iter=-1,
#     max_mem_size=-1,
#     n_jobs=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)


In [None]:
votingclf2 = VotingClassifier(
    estimators=[
        ("rf_model", rf_model),
        ("xgb_model", xgb_model),
        ("scv_model", scv_model_cpu_withProb),
    ],
    voting="soft",
)

In [None]:
votingclf2.fit(xtrain, ytrain)

In [None]:
ypred = votingclf2.predict(xtest.astype(float))

print("accuracy \t", m.accuracy_score(ytest, ypred))
confmatrx = pd.DataFrame(
    m.confusion_matrix(ytest, ypred),
    columns=["Normal", "Prediabetes", "diabetes"],
    index=["Normal", "Prediabetes", "diabetes"],
)
confmatrx.head()

In [None]:
print(classification_report(ytest, ypred))

In [None]:
stackedmodels = votingclf2.estimators_
for i in range(len(stackedmodels)):
    ypred = stackedmodels[i].predict(xtest)
    print(type(sclf.clfs_[i]), "accuracy \t", m.accuracy_score(ytest, ypred))
    print("Accuracy = ", m.accuracy_score(ytest, ypred))

    confmatrx = pd.DataFrame(
        m.confusion_matrix(ytest, ypred),
        columns=["Normal", "Prediabetes", "diabetes"],
        index=["Normal", "Prediabetes", "diabetes"],
    )
    print(confmatrx.head())

    print(m.classification_report(ytest, ypred))

    print(i)