In [1]:
%load_ext nb_black
# !pip install nb_black

import warnings

warnings.filterwarnings("ignore")

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import metrics as m
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE  # doctest: +NORMALIZE_WHITESPACE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from mlxtend.feature_selection import ColumnSelector
from sklearn import model_selection
from mlxtend.classifier import StackingClassifier
from thundersvm import SVC as svmgpu

randomseed = 7
np.random.seed(randomseed)

Using TensorFlow backend.


<IPython.core.display.Javascript object>

# 1. Read the dataset

In [3]:
x_original = pd.read_csv("../../../dataset/XLable_onlyDiabeticRemoved.txt")

conditions = [
    (x_original.L100800 < 100),
    (x_original.L100800 >= 100) & (x_original.L100800 < 126),
    (x_original.L100800 >= 126),
]
choices = [0, 1, 2]
x_original["CLASS"] = np.select(conditions, choices, default=0)
x_original = x_original[
    [
        "Unnamed: 0",
        "L100800",
        "L104600",
        "L103000",
        "S000300",
        "L101700",
        "L100700",
        "FIELD_33",
        "FIELD_38",
        "FIELD_40",
        "FIELD_31",
        "SEX",
        "AGE",  #'CLASS',
        "FIELD_16",
        "FIELD_23",
        "FIELD_15",
        "FIELD_22",
        "FIELD_17",
        "FIELD_24",
    ]
]

print(x_original.shape)

(185843, 19)


<IPython.core.display.Javascript object>

In [4]:
y_original = pd.read_csv("../../../dataset/TargetLable_onlyDiabeticRemoved.txt")

conditions = [
    (y_original.L100800 < 100),
    (y_original.L100800 >= 100) & (y_original.L100800 < 126),
    (y_original.L100800 >= 126),
]

choices = [0, 1, 2]
y_original["CLASS"] = np.select(conditions, choices, default=0)

y_original = y_original[["Unnamed: 0", "CLASS"]]

print(y_original.shape)

(185843, 2)


<IPython.core.display.Javascript object>

In [5]:
data = pd.merge(
    x_original, y_original, how="inner", left_on="Unnamed: 0", right_on="Unnamed: 0"
)

<IPython.core.display.Javascript object>

In [6]:
# filter the data set
data = data[data.FIELD_16 != 1]  # exclude people who are diagnosed for (diabetes)
data = data[data.FIELD_23 != 1]  # exclude people who are on medication for diabetes

data = data[
    data.FIELD_15 != 1
]  # exclude people who are diagnosed for (high blood pressure)
data = data[
    data.FIELD_22 != 1
]  # exclude people who are on medication for high blood pressure

data = data[data.FIELD_17 != 1]  # exclude people who are diagnosed for hyperlipidemia
data = data[
    data.FIELD_24 != 1
]  # exclude people who are on medication for hyperlipidemia

print(data.shape)

(169024, 20)


<IPython.core.display.Javascript object>

In [7]:
data = data[
    [
        "L100800",
        "L104600",
        "L103000",
        "S000300",
        "L101700",
        "L100700",
        "FIELD_33",
        "FIELD_38",
        "FIELD_40",
        "FIELD_31",
        "SEX",
        "AGE",
        "CLASS",
    ]
]
data = data.dropna()
print(data.shape)
data.head()

(56542, 13)


Unnamed: 0,L100800,L104600,L103000,S000300,L101700,L100700,FIELD_33,FIELD_38,FIELD_40,FIELD_31,SEX,AGE,CLASS
2,78.0,5.28,41.0,20.2,15.0,3.8,1.0,2.0,1.0,0.0,1.0,46.0,0
5,90.0,5.74,50.0,25.5,12.0,3.4,1.0,0.0,1.0,0.0,1.0,52.0,0
10,86.0,5.83,45.0,21.2,17.0,3.9,1.0,0.0,1.0,1.0,1.0,37.0,0
11,86.0,4.73,54.0,22.0,30.0,4.2,1.0,2.0,3.0,0.0,1.0,39.0,0
20,87.0,5.6,340.0,24.6,26.0,4.7,1.0,0.0,2.0,0.0,1.0,59.0,0


<IPython.core.display.Javascript object>

# 2. Downsample the majority class and upsample the minority

In [8]:
diabetic = data[data.CLASS == 2]
prediabetic = data[data.CLASS == 1]
normal = data[data.CLASS == 0]

print(diabetic.shape[0], prediabetic.shape[0], normal.shape[0])

1045 17331 38166


<IPython.core.display.Javascript object>

In [9]:
diabetic_test = diabetic.sample(200, random_state=randomseed)
prediabetic_test = prediabetic.sample(200, random_state=randomseed)
normal_test = normal.sample(200, random_state=randomseed)
test = pd.concat([diabetic_test, prediabetic_test, normal_test])

diabetic_train = diabetic.drop(diabetic_test.index)
prediabetic_train = prediabetic.drop(prediabetic_test.index)
# .sample(
#     10 * diabetic_train.shape[0], random_state=randomseed
# )
normal_train = normal.drop(normal_test.index).sample(
    prediabetic_train.shape[0],
    random_state=randomseed
    #     10 * diabetic_train.shape[0], random_state=randomseed
)
train = pd.concat([diabetic_train, diabetic_train, prediabetic_train, normal_train])

<IPython.core.display.Javascript object>

In [10]:
xtrain = train.iloc[:, :-1]
ytrain = train.iloc[:, -1]
xtest = test.iloc[:, :-1]
ytest = test.iloc[:, -1]

<IPython.core.display.Javascript object>

In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

<IPython.core.display.Javascript object>

In [12]:
from imblearn.over_sampling import SMOTE, SMOTENC  # doctest: +NORMALIZE_WHITESPACE

randomseed = 42

sm = SMOTENC(
    random_state=randomseed,
    categorical_features=[6, 7, 8, 9, 10],
    sampling_strategy="minority",
)
X_res, y_res = sm.fit_resample(xtrain, ytrain)

print("Resampled dataset shape %s" % Counter(y_res))
print(
    y_res[y_res == 0].shape[0], y_res[y_res == 1].shape[0], y_res[y_res == 2].shape[0]
)
print(X_res.shape, y_res.shape)

xtrain = X_res
ytrain = y_res

Resampled dataset shape Counter({2: 17131, 1: 17131, 0: 17131})
17131 17131 17131
(51393, 12) (51393,)


<IPython.core.display.Javascript object>

# 3. Generate the classifier models based on the selected  features

# Models

In [13]:
# generate the models based on the selected columns list and the ml classifiers
cols = []
weakmodles = []
estimators = []

<IPython.core.display.Javascript object>

# 3.1. 12 features

In [14]:
rf_model_12 = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion="gini",
    max_depth=10,
    max_features="log2",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=2,
    min_samples_split=12,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=-1,
    oob_score=False,
    random_state=randomseed,
    verbose=0,
    warm_start=False,
)

xgb_model_12 = xgb.XGBClassifier(objective="binary:logistic", random_state=randomseed)

scv_model_12 = svmgpu(
    C=100,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

scv_model_cpu_12 = SVC(
    C=100,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    #     gpu_id=0,
    kernel="linear",
    max_iter=-1,
    #     max_mem_size=-1,
    #     n_jobs=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

cols.append(np.arange(0, 12))

# weakmodles.append(rf_model_12)
# weakmodles.append(xgb_model_12)
# weakmodles.append(scv_model_12)

weakmodles.append(make_pipeline(ColumnSelector(cols=(np.arange(0, 12))), rf_model_12))
weakmodles.append(make_pipeline(ColumnSelector(cols=(np.arange(0, 12))), xgb_model_12))
weakmodles.append(make_pipeline(ColumnSelector(cols=(np.arange(0, 12))), scv_model_12))


estimators.append(("rf_model_12", rf_model_12))
estimators.append(("xgb_model_12", xgb_model_12))
estimators.append(("scv_model_12", scv_model_cpu_12))

# estimators = [
#     ("rf_model_12", rf_model_12),
#     ("xgb_model_12", xgb_model_12),
#     ("scv_model_12", scv_model_12),
# ]

<IPython.core.display.Javascript object>

# 3.2. 5 features

In [15]:
rf_model_5 = RandomForestClassifier(
    random_state=randomseed,
    n_estimators=100,
    max_depth=12,
    min_samples_split=2,
    min_samples_leaf=10,
    max_features="auto",
)

xgb_model_5 = xgb.XGBClassifier(objective="binary:logistic", random_state=randomseed)

scv_model_5 = svmgpu(
    C=70,
    cache_size=200,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovr",
    degree=3,
    gamma=0.001,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=True,
    random_state=42,
    shrinking=True,
    tol=0.001,
    verbose=False,
)


scv_model_cpu_5 = SVC(
    C=70,
    cache_size=200,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovr",
    degree=3,
    gamma=0.001,
    #     gpu_id=0,
    kernel="linear",
    max_iter=-1,
    #     max_mem_size=-1,
    #     n_jobs=-1,
    probability=True,
    random_state=42,
    shrinking=True,
    tol=0.001,
    verbose=False,
)


# weakmodles.append(rf_model_5)
# weakmodles.append(xgb_model_5)
# weakmodles.append(scv_model_5)


weakmodles.append(make_pipeline(ColumnSelector(cols=[0,1,3,10,11]), rf_model_5))
weakmodles.append(make_pipeline(ColumnSelector(cols=[0,1,3,10,11]), xgb_model_5))
weakmodles.append(make_pipeline(ColumnSelector(cols=[0,1,3,10,11]), scv_model_5))
    

cols.append([0,1,3,10,11])
cols.append([0,1,3,10,11])
cols.append([0,1,3,10,11])

estimators.append(("rf_model_5", rf_model_5))
estimators.append(("xgb_model_5", xgb_model_5))
estimators.append(("scv_model_5", scv_model_cpu_5))


<IPython.core.display.Javascript object>

# 3.3. 10 Features

In [16]:
top10colscomb = [
    (0, 1, 2, 3, 4, 5, 6, 9, 10, 11),
    (0, 1, 3, 4, 6, 7, 8, 9, 10, 11),
    (0, 1, 2, 3, 4, 5, 6, 7, 10, 11),
    (0, 1, 2, 3, 4, 5, 6, 8, 10, 11),
    (0, 1, 2, 3, 5, 6, 8, 9, 10, 11),
    (0, 1, 3, 4, 5, 6, 8, 9, 10, 11),
    (0, 1, 3, 4, 5, 6, 7, 9, 10, 11),
    (0, 1, 2, 3, 4, 5, 8, 9, 10, 11),
    (0, 1, 2, 3, 4, 5, 6, 7, 9, 11),
    (0, 1, 2, 3, 4, 6, 7, 9, 10, 11),
]

# modles used in the 12 features set
rf_model_10 = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion="gini",
    max_depth=12,
    max_features="auto",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=10,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=None,
    oob_score=False,
    random_state=42,
    verbose=0,
    warm_start=False,
)

xgb_model_10 = xgb.XGBClassifier(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective="binary:logistic",
    random_state=42,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=1,
    verbosity=1,
)

scv_model_10 = svmgpu(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=True,
    random_state=None,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

scv_model_cpu_10 = SVC(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,  # gpu_id=0,
    kernel="linear",
    max_iter=-1,  # max_mem_size=-1, n_jobs=-1,
    probability=True,
    random_state=None,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

# generate the models based on the selected columns list and the ml classifiers
for i in range(5):

    #     weakmodles.append(rf_model_10)
    #     weakmodles.append(xgb_model_10)
    #     weakmodles.append(scv_model_10)

    weakmodles.append(make_pipeline(ColumnSelector(cols=top10colscomb[i]), rf_model_10))
    weakmodles.append(
        make_pipeline(ColumnSelector(cols=top10colscomb[i]), xgb_model_10)
    )
    weakmodles.append(
        make_pipeline(ColumnSelector(cols=top10colscomb[i]), scv_model_10)
    )

for i in range(5):
    estimators.append(
        (
            (
                "rf_model_10" + str(i),
                (make_pipeline(ColumnSelector(cols=top10colscomb[i]), rf_model_10)),
            )
        )
    )
    estimators.append(
        (
            "xgb_model_10" + str(i),
            (make_pipeline(ColumnSelector(cols=top10colscomb[i]), xgb_model_10)),
        )
    )
    estimators.append(
        (
            "scv_model_cpu_10" + str(i),
            (make_pipeline(ColumnSelector(cols=top10colscomb[i]), scv_model_cpu_10)),
        )
    )

cols = cols + top10colscomb

<IPython.core.display.Javascript object>

# 3.4. 9 Features

In [17]:
topcols9comb = [
    (0, 1, 2, 3, 5, 6, 8, 10, 11),
    (0, 1, 2, 3, 4, 6, 9, 10, 11),
    (0, 1, 2, 3, 4, 5, 6, 9, 11),
    (0, 1, 3, 4, 6, 7, 9, 10, 11),
    (0, 1, 3, 4, 5, 8, 9, 10, 11),
    (0, 1, 3, 4, 5, 6, 9, 10, 11),
    (0, 1, 3, 4, 5, 6, 8, 10, 11),
    (0, 1, 3, 4, 6, 7, 8, 10, 11),
    (0, 1, 3, 4, 5, 6, 7, 8, 11),
    (0, 1, 3, 5, 6, 8, 9, 10, 11),
]

# modles used in the 12 features set
rf_model_9 = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion="gini",
    max_depth=12,
    max_features="auto",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=10,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=None,
    oob_score=False,
    random_state=randomseed,
    verbose=0,
    warm_start=False,
)

xgb_model_9 = xgb.XGBClassifier(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective="binary:logistic",
    random_state=randomseed,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=1,
    verbosity=1,
)

scv_model_9 = svmgpu(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

scv_model_cpu_9 = SVC(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,  # gpu_id=0,
    kernel="linear",
    max_iter=-1,  # max_mem_size=-1, n_jobs=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)


# generate the models based on the selected columns list and the ml classifiers
for i in range(5):
    #     weakmodles.append(rf_model_9)
    #     weakmodles.append(xgb_model_9)
    #     weakmodles.append(scv_model_9)

    weakmodles.append(make_pipeline(ColumnSelector(cols=topcols9comb[i]), rf_model_9))
    weakmodles.append(make_pipeline(ColumnSelector(cols=topcols9comb[i]), xgb_model_9))
    weakmodles.append(make_pipeline(ColumnSelector(cols=topcols9comb[i]), scv_model_9))


for i in range(5):
    estimators.append(
        (
            (
                "rf_model_9" + str(i),
                (make_pipeline(ColumnSelector(cols=topcols9comb[i]), rf_model_9)),
            )
        )
    )
    estimators.append(
        (
            "xgb_model_9" + str(i),
            (make_pipeline(ColumnSelector(cols=topcols9comb[i]), xgb_model_9)),
        )
    )
    estimators.append(
        (
            "scv_model_cpu_9" + str(i),
            (make_pipeline(ColumnSelector(cols=topcols9comb[i]), scv_model_cpu_9)),
        )
    )

cols = cols + topcols9comb

<IPython.core.display.Javascript object>

# 3.5. 8 Features

In [18]:
top8colscomb = [
    (0, 1, 2, 3, 5, 6, 10, 11),
    (0, 1, 2, 3, 4, 6, 10, 11),
    (0, 1, 2, 3, 4, 9, 10, 11),
    (0, 1, 2, 3, 5, 9, 10, 11),
    (0, 1, 3, 4, 5, 6, 9, 11),
    (0, 1, 3, 4, 6, 9, 10, 11),
    (0, 1, 3, 4, 8, 9, 10, 11),
    (0, 1, 2, 6, 8, 9, 10, 11),
    (0, 1, 2, 3, 5, 8, 10, 11),
    (0, 1, 3, 4, 5, 6, 10, 11),
]

cols = cols + top8colscomb

# modles used in the 12 features set
rf_model_8 = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion="gini",
    max_depth=12,
    max_features="auto",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=10,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=None,
    oob_score=False,
    random_state=randomseed,
    verbose=0,
    warm_start=False,
)

xgb_model_8 = xgb.XGBClassifier(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective="binary:logistic",
    random_state=randomseed,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=1,
    verbosity=1,
)

scv_model_8 = svmgpu(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=True,
    random_state=None,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

scv_model_cpu_8 = SVC(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,  # gpu_id=0,
    kernel="linear",
    max_iter=-1,  # max_mem_size=-1, n_jobs=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

# generate the models based on the selected columns list and the ml classifiers
for i in range(5):
    weakmodles.append(make_pipeline(ColumnSelector(cols=top8colscomb[i]), rf_model_8))
    weakmodles.append(make_pipeline(ColumnSelector(cols=top8colscomb[i]), xgb_model_8))
    weakmodles.append(make_pipeline(ColumnSelector(cols=top8colscomb[i]), scv_model_8))

#     weakmodles.append(rf_model_8)
#     weakmodles.append(xgb_model_8)
#     weakmodles.append(scv_model_8)

for i in range(5):
    estimators.append(
        (
            (
                "rf_model_8" + str(i),
                (make_pipeline(ColumnSelector(cols=top8colscomb[i]), rf_model_8)),
            )
        )
    )
    estimators.append(
        (
            "xgb_model_8" + str(i),
            (make_pipeline(ColumnSelector(cols=top8colscomb[i]), xgb_model_8)),
        )
    )
    estimators.append(
        (
            "scv_model_cpu_8" + str(i),
            (make_pipeline(ColumnSelector(cols=top8colscomb[i]), scv_model_cpu_8)),
        )
    )

<IPython.core.display.Javascript object>

# 3.6 7 Features

In [19]:
top7colscomb = [
    (0, 1, 3, 5, 6, 10, 11),
    (0, 1, 3, 4, 5, 10, 11),
    (0, 1, 2, 3, 4, 10, 11),
    (0, 1, 2, 3, 5, 10, 11),
    (0, 1, 3, 6, 8, 10, 11),
    (0, 1, 6, 8, 9, 10, 11),
    (0, 1, 6, 7, 8, 9, 10),
    (0, 1, 2, 3, 8, 10, 11),
    (0, 1, 2, 6, 8, 10, 11),
    (0, 1, 3, 8, 9, 10, 11),
]

cols = cols + top7colscomb

rf_model_7 = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion="gini",
    max_depth=12,
    max_features="auto",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=10,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=None,
    oob_score=False,
    random_state=randomseed,
    verbose=0,
    warm_start=False,
)

xgb_model_7 = xgb.XGBClassifier(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective="binary:logistic",
    random_state=randomseed,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=1,
    verbosity=1,
)

scv_model_7 = svmgpu(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

scv_model_cpu_7 = SVC(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,  # gpu_id=0,
    kernel="linear",
    max_iter=-1,  # max_mem_size=-1, n_jobs=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)


# generate the models based on the selected columns list and the ml classifiers
for i in range(5):
    weakmodles.append(make_pipeline(ColumnSelector(cols=top7colscomb[i]), rf_model_7))
    weakmodles.append(make_pipeline(ColumnSelector(cols=top7colscomb[i]), xgb_model_7))
    weakmodles.append(make_pipeline(ColumnSelector(cols=top7colscomb[i]), scv_model_7))

#     weakmodles.append(rf_model_7)
#     weakmodles.append(xgb_model_7)
#     weakmodles.append(scv_model_7)


for i in range(5):
    estimators.append(
        (
            (
                "rf_model_7" + str(i),
                (make_pipeline(ColumnSelector(cols=top7colscomb[i]), rf_model_7)),
            )
        )
    )
    estimators.append(
        (
            "xgb_model_7" + str(i),
            (make_pipeline(ColumnSelector(cols=top7colscomb[i]), xgb_model_7)),
        )
    )
    estimators.append(
        (
            "scv_model_cpu_7" + str(i),
            (make_pipeline(ColumnSelector(cols=top7colscomb[i]), scv_model_cpu_7)),
        )
    )

<IPython.core.display.Javascript object>

# 3.7. 6 Features

In [20]:
top6colscomb = [
    (0, 1, 8, 9, 10),
    (0, 1, 6, 8, 10),
    (0, 1, 7, 9, 10),
    (0, 1, 2, 8, 10),
    (0, 1, 5, 8, 10),
    (0, 1, 5, 6, 11),
    (0, 1, 7, 8, 10),
    (0, 1, 9, 10, 11),
    (0, 1, 5, 10, 11),
    (0, 1, 2, 5, 11),
]

cols = cols + top6colscomb

# modles used in the 12 features set
rf_model_6 = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion="gini",
    max_depth=12,
    max_features="auto",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=10,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=None,
    oob_score=False,
    random_state=randomseed,
    verbose=0,
    warm_start=False,
)

xgb_model_6 = xgb.XGBClassifier(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective="binary:logistic",
    random_state=randomseed,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=1,
    verbosity=1,
)

scv_model_6 = svmgpu(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

scv_model_cpu_6 = SVC(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,  # gpu_id=0,
    kernel="linear",
    max_iter=-1,  # max_mem_size=-1, n_jobs=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)


# generate the models based on the selected columns list and the ml classifiers
for i in range(5):
    weakmodles.append(make_pipeline(ColumnSelector(cols=top6colscomb[i]), rf_model_6))
    weakmodles.append(make_pipeline(ColumnSelector(cols=top6colscomb[i]), xgb_model_6))
    weakmodles.append(make_pipeline(ColumnSelector(cols=top6colscomb[i]), scv_model_6))

#     weakmodles.append(rf_model_6)
#     weakmodles.append(xgb_model_6)
#     weakmodles.append(scv_model_6)


for i in range(5):
    estimators.append(
        (
            (
                "rf_model_6" + str(i),
                (make_pipeline(ColumnSelector(cols=top7colscomb[i]), rf_model_6)),
            )
        )
    )
    estimators.append(
        (
            "xgb_model_6" + str(i),
            (make_pipeline(ColumnSelector(cols=top7colscomb[i]), xgb_model_6)),
        )
    )
    estimators.append(
        (
            "scv_model_cpu_6" + str(i),
            (make_pipeline(ColumnSelector(cols=top7colscomb[i]), scv_model_cpu_6)),
        )
    )

<IPython.core.display.Javascript object>

# 3.8. 5 Features

In [21]:
top5colscomb = [
    (0, 1, 8, 9, 10),
    (0, 1, 6, 8, 10),
    (0, 1, 7, 9, 10),
    (0, 1, 2, 8, 10),
    (0, 1, 5, 8, 10),
    (0, 1, 5, 6, 11),
    (0, 1, 7, 8, 10),
    (0, 1, 9, 10, 11),
    (0, 1, 5, 10, 11),
    (0, 1, 2, 5, 11),
]

cols = cols + top5colscomb

rf_model_5_2 = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion="gini",
    max_depth=12,
    max_features="auto",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    min_samples_leaf=10,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=None,
    oob_score=False,
    random_state=randomseed,
    verbose=0,
    warm_start=False,
)

xgb_model_5_2 = xgb.XGBClassifier(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective="binary:logistic",
    random_state=randomseed,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=1,
    verbosity=1,
)

scv_model_5_2 = svmgpu(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    gpu_id=0,
    kernel="linear",
    max_iter=-1,
    max_mem_size=-1,
    n_jobs=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)

scv_model_cpu_5_2 = SVC(
    C=1000,
    cache_size=100,
    class_weight={},
    coef0=0.0,
    decision_function_shape="ovo",
    degree=3,
    gamma=0.1,
    #     gpu_id=0,
    kernel="linear",
    max_iter=-1,
    # max_mem_size=-1,   n_jobs=-1,
    probability=True,
    random_state=randomseed,
    shrinking=False,
    tol=0.001,
    verbose=False,
)


# generate the models based on the selected columns list and the ml classifiers
for i in range(5):

    #     weakmodles.append(rf_model_5_2)
    #     weakmodles.append(xgb_model_5_2)
    #     weakmodles.append(scv_model_5_2)

    weakmodles.append(make_pipeline(ColumnSelector(cols=top5colscomb[i]), rf_model_5_2))
    weakmodles.append(
        make_pipeline(ColumnSelector(cols=top5colscomb[i]), xgb_model_5_2)
    )
    weakmodles.append(
        make_pipeline(ColumnSelector(cols=top5colscomb[i]), scv_model_5_2)
    )

for i in range(5):
    estimators.append(
        (
            (
                "rf_model_52" + str(i),
                (make_pipeline(ColumnSelector(cols=top5colscomb[i]), rf_model_5_2)),
            )
        )
    )
    estimators.append(
        (
            "xgb_model_52" + str(i),
            (make_pipeline(ColumnSelector(cols=top5colscomb[i]), xgb_model_5_2)),
        )
    )
    estimators.append(
        (
            "scv_model_cpu_52" + str(i),
            (make_pipeline(ColumnSelector(cols=top5colscomb[i]), scv_model_cpu_5_2)),
        )
    )

<IPython.core.display.Javascript object>

In [22]:
len(weakmodles)

96

<IPython.core.display.Javascript object>

In [23]:
cols = np.array(cols)

<IPython.core.display.Javascript object>

OVO
====

In [24]:
def swapcolumns(trainval, testval, coldindexval):
    trainval[trainval != coldindexval] = 5
    testval[testval != coldindexval] = 5

    trainval[trainval == coldindexval] = 0
    trainval[trainval == 5] = 1

    testval[testval == coldindexval] = 0
    testval[testval == 5] = 1

    return trainval, testval


xtrain_original = xtrain
xtest_original = xtest
ytrain_original = ytrain
ytest_original = ytest


<IPython.core.display.Javascript object>

In [25]:
# Class 0
# ===========================
ytrain = ytrain_original.copy()
ytest = ytest_original.copy()
ytrain, ytest = swapcolumns(ytrain, ytest, 0)
# =================================================

<IPython.core.display.Javascript object>

In [26]:
clf = []
acc = []
finalacc = []
ypredproba_all = []
ypredconfprob_all = []

<IPython.core.display.Javascript object>

In [27]:
for i, classifier in enumerate(weakmodles):
    print(i)
    rf = classifier
    rf.fit(xtrain, ytrain)
    rfpred = rf.predict(xtest)
    print(m.f1_score(ytest, rfpred, average="weighted"))

    clf.append(rf)
    acc.append(m.f1_score(ytest, rfpred, average="weighted"))
    ypredproba_all.append(rf.predict_proba(xtest))

    confmat = m.confusion_matrix(ytest, rfpred)
    confsumh = np.sum(confmat, axis=1)
    propconfmat = confmat.copy()
    for i in range(propconfmat.shape[0]):
        propconfmat[i] = 100 * propconfmat[i] / confsumh[i]
    ypredconfprob_all.append(propconfmat.ravel() / 100)

0
0.8109031491384433
1
0.8173286512555166
2
0.813098557883834
3
0.820855614973262
4
0.8224131502820027
5
0.8180147058823528
6
0.8177421014716476
7
0.8194980248489727
8
0.8198337449871191
9
0.8126673652346025
10
0.8140073540056169
11
0.8164661514354765
12
0.811320289783648
13
0.8137884140172812
14
0.813371854648779
15
0.8157743895448816
16
0.8210558814448904
17
0.8167345239343867
18
0.8104632071247327
19
0.8173286512555166
20
0.8247549019607844
21
0.8102345415778252
22
0.8188834536842923
23
0.8232013385387618
24
0.8157743895448816
25
0.820855614973262
26
0.8198337449871191
27
0.8126673652346025
28
0.8175381263616558
29
0.8146446078431372
30
0.8111145415251619
31
0.8155602143023526
32
0.8164661514354765
33
0.8126673652346025
34
0.8226142307973668
35
0.8182836628432482
36
0.807585660526837
37
0.8188834536842923
38
0.8229368734240726
39
0.819298608943112
40
0.8226142307973668
41
0.8182836628432482
42
0.8175381263616558
43
0.8226142307973668
44
0.819564242441674
45
0.8093517433803475
46
0.8

<IPython.core.display.Javascript object>

In [28]:
weakmodles[i]

Pipeline(memory=None,
         steps=[('columnselector',
                 ColumnSelector(cols=array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),
                                drop_axis=False)),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, learning_rate=0.1,
                               max_delta_step=0, max_depth=3,
                               min_child_weight=1, missing=None,
                               n_estimators=100, n_jobs=1, nthread=None,
                               objective='binary:logistic', random_state=42,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               seed=None, silent=None, subsample=1,
                               verbosity=1))],
         verbose=False)

<IPython.core.display.Javascript object>

In [29]:
np.array(ypredconfprob_all)[0].ravel()

pc1=0
pnc1=0

tempval = []
# for i in range(len(weakmodles)):
for i in range(4):
    pc1 += (    ypredproba_all[i][:, 0] * ypredconfprob_all[i][0]
        + ypredproba_all[i][:, 1] * ypredconfprob_all[i][1]
    )
    pnc1 += (
        ypredproba_all[i][:, 0] * ypredconfprob_all[i][2]
        + ypredproba_all[i][:, 1] * ypredconfprob_all[i][3]
    )


<IPython.core.display.Javascript object>

In [30]:
temp = np.zeros((ytest_original.shape[0], 2))
temp[:, 0] = pc1
temp[:, 1] = pnc1
m.accuracy_score(ytest, np.argmax(temp, axis=1))

0.8133333333333334

<IPython.core.display.Javascript object>

In [33]:
m.confusion_matrix(ytest, np.argmax(temp, axis=1)).ravel()

array([151,  49,  63, 337], dtype=int64)

<IPython.core.display.Javascript object>

In [35]:
pd.DataFrame(temp).to_csv("class_1.txt", index=False)
pd.DataFrame(m.confusion_matrix(ytest, np.argmax(temp, axis=1)).ravel()).to_csv(
    "confusion_matrix_class_1.txt", index=False
)

<IPython.core.display.Javascript object>

In [31]:
# pd.DataFrame(acc).to_csv("stackingdatatestresult/_acc.txt", index=False)
# pd.DataFrame(ytest).to_csv("stackingdatatestresult/_ytest.txt", index=False)


# pd.DataFrame(np.array(ypredproba_all)[:, :, 0]).to_csv(
#     "stackingdatatestresult/_ypredproba_all_class_0.txt", index=False
# )

# pd.DataFrame(np.array(ypredproba_all)[:, :, 1]).to_csv(
#     "stackingdatatestresult/_ypredproba_all_class_1.txt", index=False
# )

# pd.DataFrame(np.array(ypredproba_all)[:, :, 2]).to_csv(
#     "stackingdatatestresult/_ypredproba_all_class_2.txt", index=False
# )

# pd.DataFrame(tempval).to_csv("stackingdatatestresult/_confmatrix.txt", index=False)

<IPython.core.display.Javascript object>

In [32]:
# np.std(acc)

<IPython.core.display.Javascript object>