In [1]:
# Load the nb_black IPython extension for this kernel session
# (the install command is kept below for reference).
%load_ext nb_black
# !pip install nb_black
import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used further down — consider narrowing the filter while debugging.
warnings.filterwarnings("ignore")

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import metrics as m
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE  # doctest: +NORMALIZE_WHITESPACE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from mlxtend.feature_selection import ColumnSelector
from sklearn import model_selection
from mlxtend.classifier import StackingClassifier
from thundersvm import SVC as svmgpu
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE, SMOTENC  # doctest: +NORMALIZE_WHITESPACE
from multiscorer import MultiScorer
from sklearn.model_selection import cross_val_score
from numpy import average
import xgboost as xgb
from sklearn import metrics as m
import itertools

In [None]:
# Notebook-wide seed: seeds NumPy's global RNG here and is passed as
# `random_state` by the sampling cell further down.
randomseed = 7
np.random.seed(seed=randomseed)

1.Read the data set
===

In [None]:
# Read the feature table and keep only the columns the notebook needs:
# the join key ("Unnamed: 0"), the candidate predictors, and the FIELD_* flags
# that the filtering cell uses to exclude diagnosed / medicated people.
x_original = pd.read_csv("../../../../../dataset/XLable_onlyDiabeticRemoved.txt")

# No CLASS column is derived from this table: the selection below never kept
# one, and the label used for modelling is built from the target table instead.
x_columns = [
    "Unnamed: 0",
    "L100800",
    "L104600",
    "L103000",
    "S000300",
    "L101700",
    "L100700",
    "FIELD_33",
    "FIELD_38",
    "FIELD_40",
    "FIELD_31",
    "SEX",
    "AGE",
    "FIELD_16",
    "FIELD_23",
    "FIELD_15",
    "FIELD_22",
    "FIELD_17",
    "FIELD_24",
]
x_original = x_original[x_columns]

print(x_original.shape)

In [None]:
# Read the target table and turn its L100800 value into a three-way CLASS label.
y_original = pd.read_csv("../../../../../dataset/TargetLable_onlyDiabeticRemoved.txt")

# A missing L100800 compares False against every threshold below, so np.select
# would fall through to `default=0` and silently label the row as class 0.
# Because the raw column is dropped at the end of this cell, the later
# `dropna()` could never catch it — remove such rows here instead.
y_original = y_original.dropna(subset=["L100800"])

# Label thresholds: < 100 -> 0, [100, 126) -> 1, >= 126 -> 2.
conditions = [
    (y_original.L100800 < 100),
    (y_original.L100800 >= 100) & (y_original.L100800 < 126),
    (y_original.L100800 >= 126),
]

choices = [0, 1, 2]
y_original["CLASS"] = np.select(conditions, choices, default=0)

# Keep only the join key and the label.
y_original = y_original[["Unnamed: 0", "CLASS"]]

print(y_original.shape)

In [None]:
# Inner-join features and labels on the shared row identifier, so only ids
# present in both tables survive.
JOIN_KEY = "Unnamed: 0"
data = x_original.merge(
    y_original, how="inner", left_on=JOIN_KEY, right_on=JOIN_KEY
)

In [None]:
# Filter the data set: a value of 1 in any of these flag columns removes the row.
exclusion_flags = {
    "FIELD_16": "diagnosed for diabetes",
    "FIELD_23": "on medication for diabetes",
    "FIELD_15": "diagnosed for high blood pressure",
    "FIELD_22": "on medication for high blood pressure",
    "FIELD_17": "diagnosed for hyperlipidemia",
    "FIELD_24": "on medication for hyperlipidemia",
}
for flag_column in exclusion_flags:
    data = data[data[flag_column] != 1]

print(data.shape)

# Keep the predictors plus the label (CLASS stays last; the modelling cells
# read the label with iloc[:, -1]) and drop rows with any missing value.
model_columns = [
    "L100800",
    "L104600",
    "L103000",
    "S000300",
    "L101700",
    "L100700",
    "FIELD_33",
    "FIELD_38",
    "FIELD_40",
    "FIELD_31",
    "SEX",
    "AGE",
    "CLASS",
]
data = data[model_columns].dropna()
print(data.shape)
data.head()

2.Downsample the majority class and upsample the minority
===

In [None]:
# Split the filtered rows by label: 2 -> diabetic, 1 -> prediabetic, 0 -> normal.
diabetic, prediabetic, normal = (
    data[data.CLASS == label] for label in (2, 1, 0)
)

# Class sizes, in the same order.
print(len(diabetic), len(prediabetic), len(normal))

In [None]:
# Balanced hold-out set: 200 rows sampled from each class.
diabetic_test = diabetic.sample(n=200, random_state=randomseed)
prediabetic_test = prediabetic.sample(n=200, random_state=randomseed)
normal_test = normal.sample(n=200, random_state=randomseed)
test = pd.concat([diabetic_test, prediabetic_test, normal_test])

# Everything not held out is available for training.
diabetic_train = diabetic.drop(index=diabetic_test.index)
prediabetic_train = prediabetic.drop(index=prediabetic_test.index)

# Down-sample the remaining normal rows to the size of the prediabetic training set.
normal_remaining = normal.drop(index=normal_test.index)
normal_train = normal_remaining.sample(
    n=len(prediabetic_train), random_state=randomseed
)

# The diabetic training rows are included twice (up-sampling by duplication).
train = pd.concat([diabetic_train] * 2 + [prediabetic_train, normal_train])

In [None]:
# MultiScorer registry: metric name -> (metric function, kwargs for that function).
# The three averaged metrics all receive average="weighted"; each entry gets
# its own kwargs dict.
scorer = MultiScorer(
    {
        "accuracy_score": (m.accuracy_score, {}),
        **{
            metric_name: (getattr(m, metric_name), {"average": "weighted"})
            for metric_name in ("precision_score", "recall_score", "f1_score")
        },
    }
)



In [None]:
# features

3.Generate the classifier models 
===

In [None]:
# Candidate feature subsets, given as column positions used with
# train.iloc[:, f] / test.iloc[:, f] in the modelling cells.
features = [
    [0, 1, 8, 9, 10],
    [0, 1, 6, 8, 10],
    [0, 1, 7, 9, 10],
    [0, 1, 2, 8, 10],
    [0, 1, 5, 8, 10],
    [0, 1, 5, 6, 11],
    [0, 1, 7, 8, 10],
    [0, 1, 9, 10, 11],
    [0, 1, 5, 10, 11],
    [0, 1, 2, 5, 11],
]

# Random-forest results, one entry per feature subset.
# Hold-out test metrics:
RF_acc, RF_pre, RF_rec, RF_f1 = [], [], [], []
# Cross-validation means:
RF_acc_cv, RF_pre_cv, RF_rec_cv, RF_f1_cv = [], [], [], []
# Cross-validation standard deviations:
RF_acc_cv_std, RF_pre_cv_std, RF_rec_cv_std, RF_f1_cv_std = [], [], [], []

In [None]:
# Random forest: train and evaluate once per candidate feature subset.
#
# The seed for SMOTE and the forest lives under its own name so that this cell
# does not overwrite the notebook-wide `randomseed` read by the train/test
# split cell (re-running that cell afterwards would otherwise change the split).
model_seed = 42

# Metrics collected during cross-validation, as scikit-learn scorer names.
cv_scoring = ["accuracy", "precision_weighted", "recall_weighted", "f1_weighted"]

for f in features:
    print(f)
    f = list(f)
    xtrain = train.iloc[:, f]
    xtest = test.iloc[:, f]

    # The label is the last column of both frames.
    ytest = test.iloc[:, -1]
    ytrain = train.iloc[:, -1]

    # Fit the scaler on the training rows only, then reuse it for the test rows.
    scaler = MinMaxScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)

    # Over-sample only the smallest class of the training set.
    sm = SMOTE(random_state=model_seed, sampling_strategy="minority")

    X_res, y_res = sm.fit_resample(xtrain, ytrain)
    xtrain = X_res
    ytrain = y_res

    rf_12 = RandomForestClassifier(
        random_state=model_seed,
        n_estimators=100,
        max_depth=12,
        min_samples_split=2,
        min_samples_leaf=10,
        # "sqrt" is what "auto" meant for classifiers; "auto" itself is
        # deprecated / rejected by newer scikit-learn releases.
        max_features="sqrt",
    )
    rf_12.fit(xtrain, ytrain)

    # Hold-out test metrics.
    ypred = rf_12.predict(xtest)

    RF_acc.append(m.accuracy_score(ytest, ypred))
    RF_pre.append(m.precision_score(ytest, ypred, average="weighted"))
    RF_rec.append(m.recall_score(ytest, ypred, average="weighted"))
    RF_f1.append(m.f1_score(ytest, ypred, average="weighted"))

    # 10-fold CV on the training data; one fit per fold, all four metrics scored.
    # NOTE(review): the folds are cut from the already scaled, SMOTE-resampled
    # matrix (and `train` contains the diabetic rows twice), so a validation
    # fold can hold copies or synthetic neighbours of training rows. Treat the
    # CV figures as optimistic; resampling inside each fold would avoid this.
    cv_results = model_selection.cross_validate(
        rf_12, xtrain, ytrain, cv=10, scoring=cv_scoring
    )
    scores = cv_results["test_accuracy"]

    RF_acc_cv.append(scores.mean())
    RF_acc_cv_std.append(scores.std())
    RF_pre_cv.append(cv_results["test_precision_weighted"].mean())
    RF_pre_cv_std.append(cv_results["test_precision_weighted"].std())
    RF_rec_cv.append(cv_results["test_recall_weighted"].mean())
    RF_rec_cv_std.append(cv_results["test_recall_weighted"].std())
    RF_f1_cv.append(cv_results["test_f1_weighted"].mean())
    RF_f1_cv_std.append(cv_results["test_f1_weighted"].std())

In [None]:
# Number of cross-validation accuracy entries collected so far
# (the random-forest loop appends one per feature subset).
len(RF_acc_cv)

In [None]:
# XGBoost results, one entry per feature subset.
# Hold-out test metrics:
XGB_acc, XGB_pre, XGB_rec, XGB_f1 = [], [], [], []
# Cross-validation means:
XGB_acc_cv, XGB_pre_cv, XGB_rec_cv, XGB_f1_cv = [], [], [], []
# Cross-validation standard deviations:
XGB_acc_cv_std, XGB_pre_cv_std, XGB_rec_cv_std, XGB_f1_cv_std = [], [], [], []

In [None]:
# XGBoost: train and evaluate once per candidate feature subset.
#
# The seed for SMOTE and the booster lives under its own name so that this cell
# does not overwrite the notebook-wide `randomseed` read by the train/test
# split cell (re-running that cell afterwards would otherwise change the split).
model_seed = 42

# Metrics collected during cross-validation, as scikit-learn scorer names.
cv_scoring = ["accuracy", "precision_weighted", "recall_weighted", "f1_weighted"]

for f in features:
    print(f)
    f = list(f)
    xtrain = train.iloc[:, f]
    xtest = test.iloc[:, f]

    # The label is the last column of both frames.
    ytest = test.iloc[:, -1]
    ytrain = train.iloc[:, -1]

    # Fit the scaler on the training rows only, then reuse it for the test rows.
    scaler = MinMaxScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)

    # Over-sample only the smallest class of the training set.
    sm = SMOTE(random_state=model_seed, sampling_strategy="minority")

    X_res, y_res = sm.fit_resample(xtrain, ytrain)
    xtrain = X_res
    ytrain = y_res

    # The target has three classes (0 / 1 / 2), so ask for the multiclass
    # objective explicitly instead of "binary:logistic".
    # NOTE(review): XGBClassifier appears to switch to a multiclass objective on
    # its own when it sees more than two classes — confirm for the installed
    # version; either way the explicit setting matches the problem.
    xgb_model_12 = xgb.XGBClassifier(
        objective="multi:softprob", random_state=model_seed
    )
    xgb_model_12.fit(xtrain, ytrain)

    # Hold-out test metrics.
    ypred = xgb_model_12.predict(xtest)

    XGB_acc.append(m.accuracy_score(ytest, ypred))
    XGB_pre.append(m.precision_score(ytest, ypred, average="weighted"))
    XGB_rec.append(m.recall_score(ytest, ypred, average="weighted"))
    XGB_f1.append(m.f1_score(ytest, ypred, average="weighted"))

    # 10-fold CV on the training data; one fit per fold, all four metrics scored.
    # NOTE(review): the folds are cut from the already scaled, SMOTE-resampled
    # matrix (and `train` contains the diabetic rows twice), so a validation
    # fold can hold copies or synthetic neighbours of training rows. Treat the
    # CV figures as optimistic; resampling inside each fold would avoid this.
    cv_results = model_selection.cross_validate(
        xgb_model_12, xtrain, ytrain, cv=10, scoring=cv_scoring
    )
    scores = cv_results["test_accuracy"]

    XGB_acc_cv.append(scores.mean())
    XGB_acc_cv_std.append(scores.std())
    XGB_pre_cv.append(cv_results["test_precision_weighted"].mean())
    XGB_pre_cv_std.append(cv_results["test_precision_weighted"].std())
    XGB_rec_cv.append(cv_results["test_recall_weighted"].mean())
    XGB_rec_cv_std.append(cv_results["test_recall_weighted"].std())
    XGB_f1_cv.append(cv_results["test_f1_weighted"].mean())
    XGB_f1_cv_std.append(cv_results["test_f1_weighted"].std())

In [None]:
# SVM results, one entry per feature subset.
# Hold-out test metrics:
SVM_acc, SVM_pre, SVM_rec, SVM_f1 = [], [], [], []
# Cross-validation means:
SVM_acc_cv, SVM_pre_cv, SVM_rec_cv, SVM_f1_cv = [], [], [], []
# Cross-validation standard deviations:
SVM_acc_cv_std, SVM_pre_cv_std, SVM_rec_cv_std, SVM_f1_cv_std = [], [], [], []

In [None]:
# SVM (thundersvm): train and evaluate once per candidate feature subset.
#
# The seed for SMOTE lives under its own name so that this cell does not
# overwrite the notebook-wide `randomseed` read by the train/test split cell
# (re-running that cell afterwards would otherwise change the split).
model_seed = 42

# Metrics collected during cross-validation, as scikit-learn scorer names.
cv_scoring = ["accuracy", "precision_weighted", "recall_weighted", "f1_weighted"]

for f in features:
    print(f)
    f = list(f)
    xtrain = train.iloc[:, f]
    xtest = test.iloc[:, f]

    # The label is the last column of both frames.
    ytest = test.iloc[:, -1]
    ytrain = train.iloc[:, -1]

    # Fit the scaler on the training rows only, then reuse it for the test rows.
    scaler = MinMaxScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)

    # Over-sample only the smallest class of the training set.
    sm = SMOTE(random_state=model_seed, sampling_strategy="minority")

    X_res, y_res = sm.fit_resample(xtrain, ytrain)
    xtrain = X_res
    ytrain = y_res

    # Linear-kernel SVM; every setting below is passed through unchanged.
    scv_12 = svmgpu(
        C=100,
        cache_size=None,
        class_weight={},
        coef0=0.0,
        decision_function_shape="ovo",
        degree=3,
        gamma=0.1,
        gpu_id=0,
        kernel="linear",
        max_iter=-1,
        max_mem_size=-1,
        n_jobs=-1,
        probability=False,
        random_state=None,
        shrinking=False,
        tol=0.001,
        verbose=False,
    )

    scv_12.fit(xtrain, ytrain)

    # Hold-out test metrics.
    ypred = scv_12.predict(xtest)

    SVM_acc.append(m.accuracy_score(ytest, ypred))
    SVM_pre.append(m.precision_score(ytest, ypred, average="weighted"))
    SVM_rec.append(m.recall_score(ytest, ypred, average="weighted"))
    SVM_f1.append(m.f1_score(ytest, ypred, average="weighted"))

    # 10-fold CV on the training data; one fit per fold, all four metrics scored.
    # NOTE(review): the folds are cut from the already scaled, SMOTE-resampled
    # matrix (and `train` contains the diabetic rows twice), so a validation
    # fold can hold copies or synthetic neighbours of training rows. Treat the
    # CV figures as optimistic; resampling inside each fold would avoid this.
    cv_results = model_selection.cross_validate(
        scv_12, xtrain, ytrain, cv=10, scoring=cv_scoring
    )
    scores = cv_results["test_accuracy"]

    SVM_acc_cv.append(scores.mean())
    SVM_acc_cv_std.append(scores.std())
    SVM_pre_cv.append(cv_results["test_precision_weighted"].mean())
    SVM_pre_cv_std.append(cv_results["test_precision_weighted"].std())
    SVM_rec_cv.append(cv_results["test_recall_weighted"].mean())
    SVM_rec_cv_std.append(cv_results["test_recall_weighted"].std())
    SVM_f1_cv.append(cv_results["test_f1_weighted"].mean())
    SVM_f1_cv_std.append(cv_results["test_f1_weighted"].std())

In [None]:
# Collect each model's per-feature-subset metrics into a table (one row per
# subset, one column per metric) and write it to CSV.
# NOTE(review): the output file names start with a space; kept exactly as-is
# in case other code reads these paths — confirm whether that is intended.

svm_metrics = {
    "SVM_acc": SVM_acc,
    "SVM_pre": SVM_pre,
    "SVM_rec": SVM_rec,
    "SVM_f1": SVM_f1,
    "SVM_acc_cv": SVM_acc_cv,
    "SVM_acc_cv_std": SVM_acc_cv_std,
}
SVM_Result = pd.DataFrame(
    np.array(list(svm_metrics.values())).T, columns=list(svm_metrics)
)
SVM_Result.to_csv(" SVM_Result.csv", index=False)

xgb_metrics = {
    "XGB_acc": XGB_acc,
    "XGB_pre": XGB_pre,
    "XGB_rec": XGB_rec,
    "XGB_f1": XGB_f1,
    "XGB_acc_cv": XGB_acc_cv,
    "XGB_acc_cv_std": XGB_acc_cv_std,
}
XGB_Result = pd.DataFrame(
    np.array(list(xgb_metrics.values())).T, columns=list(xgb_metrics)
)
XGB_Result.to_csv(" XGB_Result.csv", index=False)

rf_metrics = {
    "RF_acc": RF_acc,
    "RF_pre": RF_pre,
    "RF_rec": RF_rec,
    "RF_f1": RF_f1,
    "RF_acc_cv": RF_acc_cv,
    "RF_acc_cv_std": RF_acc_cv_std,
}
RF_Result2 = pd.DataFrame(
    data=np.array(list(rf_metrics.values())).T, columns=list(rf_metrics)
)
RF_Result2.to_csv(" RF_Result.csv", index=False)