## <b>DATA</b>

In [1]:
from ucimlrepo import fetch_ucirepo 
from joblib import dump, load
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
  # fetch dataset 
cdc_diabetes_health_indicators = fetch_ucirepo(id=891) 
  
# data (as pandas dataframes) 
X = cdc_diabetes_health_indicators.data.features 
y = cdc_diabetes_health_indicators.data.targets 

In [3]:
# diabetes features
X.head(5)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1,1,1,40,1,0,0,0,0,1,...,1,0,5,18,15,1,0,9,4,3
1,0,0,0,25,1,0,0,1,0,0,...,0,1,3,0,0,0,0,7,6,1
2,1,1,1,28,0,0,0,0,1,0,...,1,1,5,30,30,1,0,9,4,8
3,1,0,1,27,0,0,0,1,1,1,...,1,0,2,0,0,0,0,11,3,6
4,1,1,1,24,0,0,0,1,1,1,...,1,0,2,3,0,0,0,11,5,4


In [4]:
# diabetes targets
y.head(5)

Unnamed: 0,Diabetes_binary
0,0
1,0
2,0
3,0
4,0


In [5]:
# normalizing

X_n = X.drop(['PhysHlth', 'MentHlth', 'Education'], axis=1)

scaler = StandardScaler()
X_n[['BMI', 'Age', 'GenHlth', 'Income']] = scaler.fit_transform(X[['BMI', 'Age', 'GenHlth', 'Income']])


#keep y as raw labels
y_n = y.copy()

In [6]:
X_n

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,Age,Income
0,1,1,1,1.757936,1,0,0,0,0,1,0,1,0,2.329121,1,0,0.316900,-1.474487
1,0,0,0,-0.511806,1,0,0,1,0,0,0,0,1,0.457294,0,0,-0.337933,-2.440138
2,1,1,1,-0.057858,0,0,0,0,1,0,0,1,1,2.329121,1,0,0.316900,0.939638
3,1,0,1,-0.209174,0,0,0,1,1,1,0,1,0,-0.478619,0,0,0.971733,-0.026012
4,1,1,1,-0.663122,0,0,0,1,1,1,0,1,0,-0.478619,0,0,0.971733,-0.991662
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,1,1,1,2.514516,0,0,0,0,1,1,0,1,0,0.457294,0,1,-0.992766,0.456813
253676,1,1,1,-1.571019,0,0,0,0,0,0,0,1,0,1.393207,1,0,0.971733,-0.991662
253677,0,0,1,-0.057858,0,0,0,1,1,0,0,1,0,-1.414532,0,0,-1.975015,-1.957312
253678,1,0,1,-0.814438,0,0,0,0,1,1,0,1,0,0.457294,0,1,-0.337933,-2.440138


In [7]:
y_n["Diabetes_binary"] = y_n["Diabetes_binary"].astype(float)

y_n.head()

Unnamed: 0,Diabetes_binary
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


## <b>Gradient Boosting (XGBoost)</b>


In [8]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE

In [9]:
# split data
y_ravel = np.array(y_n).ravel()
X_train, X_test, y_train, y_test = train_test_split(X_n, y_ravel, test_size=0.2, random_state=42)

# experiment with class weights or SMOTE
# SMOTE - oversample minority class
smote = SMOTE(random_state=42, k_neighbors=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [10]:
#base model
xgb_base = XGBClassifier(eval_metric='logloss',
                         objective='binary:logistic',
                         reg_alpha = 0.5,
                         reg_lambda=5,
                         random_state=42)

# tune hyperparameters with gridsearch
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3,7, 10]
}

grid_search = GridSearchCV(
    estimator= xgb_base,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    n_jobs=4  #parallel process
) 

grid_search.fit(X_train_smote, y_train_smote)

print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC:", grid_search.best_score_)

#consider class-specific thresholds
#precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba[:, class_index])
#optimal_threshold = thresholds[np.argmax(precision * recall)]

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200}
Best ROC-AUC: 0.956124644490177


In [11]:
# train on 3 splits
for test_size in [0.2,0.5, 0.8]:
    X_train, X_test, y_train, y_test = train_test_split(X_n, y_ravel, test_size=test_size, random_state=42)
    
    # create model with optimal params
    xgb_opt = XGBClassifier(
        eval_metric='mlogloss',
        **grid_search.best_params_,
        random_state=42
    )
    
    xgb_opt.fit(X_train, y_train)
    
    y_pred_proba = xgb_opt.predict_proba(X_test)[:,1]
    
    y_pred_class = xgb_opt.predict(X_test)
    
    print("Train="+str(round(1-test_size,2)))
    print(classification_report(y_test, y_pred_class,zero_division=0))
    

    try:
        roc_auc = roc_auc_score(y_test, y_pred_proba)  # One-vs-rest
        print(f"ROC-AUC: {roc_auc:.4f}")
    except ValueError as e:
        print(f"Could not calculate ROC-AUC: {e}")
        
    print(" ")
    print(" ")


Train=0.8
              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93     43739
         1.0       0.54      0.18      0.27      6997

    accuracy                           0.87     50736
   macro avg       0.71      0.58      0.60     50736
weighted avg       0.83      0.87      0.84     50736

ROC-AUC: 0.8251
 
 
Train=0.5
              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92    109241
         1.0       0.52      0.17      0.26     17599

    accuracy                           0.86    126840
   macro avg       0.70      0.57      0.59    126840
weighted avg       0.83      0.86      0.83    126840

ROC-AUC: 0.8178
 
 
Train=0.2
              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92    174653
         1.0       0.49      0.19      0.27     28291

    accuracy                           0.86    202944
   macro avg       0.68      0.58      0.60    202944
weig

In [12]:
# save the model

dump(xgb_opt, "trained/xgb_opt_CDC.joblib")

['trained/xgb_opt_CDC.joblib']

## <b>NEURALNET</b>

In [13]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, BatchNormalization, Input
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import EarlyStopping
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score,classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize

In [14]:
# splitting NORMED data

y_ravel = np.array(y_n).ravel()

X_train, X_test, y_train, y_test = train_test_split(X_n, y_ravel, test_size=0.2, random_state=42)

# Oversample the minority classes
smote = SMOTE(random_state=42, k_neighbors=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [15]:
def create_nn(learning_rate=0.001):
    model = Sequential([Input(shape=(18,)),
                        Dense(64, activation="relu"),
                        Dense(32, activation="relu"), 
                        Dense(1, activation="sigmoid")]) # output layer
    
    model.compile(optimizer = Adam(learning_rate=learning_rate),
                  loss = "binary_crossentropy",
                  metrics = ["accuracy"])
                  
    return model

In [16]:
# Keras wrapper -> use gridsearchcv with neuralnet

nn = KerasClassifier(model=create_nn)

In [17]:
# Gridsearchcv

param_grid = {
    'model__learning_rate': [0.001, 0.01],
    'batch_size': [64, 128, 256]
}

grid_search = GridSearchCV(
    estimator=nn,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5
) 

# early stop
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

grid_search.fit(
    X_train_smote,
    y_train_smote,
    epochs=200, 
    callbacks=[early_stop]
)

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

Epoch 1/200
[1m4365/4365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 740us/step - accuracy: 0.7488 - loss: 0.5086
Epoch 2/200
[1m4365/4365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 737us/step - accuracy: 0.7579 - loss: 0.4939
Epoch 3/200
[1m4365/4365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 740us/step - accuracy: 0.7599 - loss: 0.4920
Epoch 4/200
[1m4365/4365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 736us/step - accuracy: 0.7614 - loss: 0.4886
Epoch 5/200
[1m4365/4365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 735us/step - accuracy: 0.7622 - loss: 0.4874
Epoch 6/200
[1m4365/4365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 735us/step - accuracy: 0.7645 - loss: 0.4848
Epoch 7/200
[1m4365/4365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 737us/step - accuracy: 0.7640 - loss: 0.4835
Epoch 8/200
[1m4365/4365[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 744us/step - accuracy: 0.7657 - loss: 0.4814


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[1m5457/5457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 783us/step - accuracy: 0.7683 - loss: 0.4766
Epoch 17/200
[1m5457/5457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 770us/step - accuracy: 0.7687 - loss: 0.4764
Epoch 18/200
[1m5457/5457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 768us/step - accuracy: 0.7762 - loss: 0.4658
Epoch 38/200
[1m5457/5457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 776us/step - accuracy: 0.7755 - loss: 0.4646
Epoch 39/200
[1m5457/5457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 768us/step - accuracy: 0.7773 - loss: 0.4655
Epoch 40/200
[1m5457/5457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 776us/step - accuracy: 0.7762 - loss: 0.4659
Epoch 41/200
[1m5457/5457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 773us/step - accuracy: 0.7756 - loss: 0.4653
Epoch 42/200
[1m5457/5457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 775us/step - accuracy: 0.7761 - loss: 0.4650
Epoch

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[1m5457/5457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 787us/step - accuracy: 0.7830 - loss: 0.4538
Epoch 159/200
[1m5457/5457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 769us/step - accuracy: 0.7846 - loss: 0.4525
Epoch 160/200
[1m5457/5457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 780us/step - accuracy: 0.7839 - loss: 0.4539
Epoch 161/200
[1m5457/5457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 771us/step - accuracy: 0.7831 - loss: 0.4538
Epoch 162/200
[1m5457/5457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 772us/step - accuracy: 0.7837 - loss: 0.4526
Epoch 163/200
[1m5457/5457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 769us/step - accuracy: 0.7841 - loss: 0.4529
Epoch 164/200
[1m5457/5457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 770us/step - accuracy: 0.7832 - loss: 0.4537
Epoch 165/200
[1m5457/5457[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 769us/step - accuracy: 0.7845 - loss: 0.452

In [18]:

# train on 3 splits
for test_size in [0.2,0.5, 0.8]:
    X_train, X_test, y_train, y_test = train_test_split(X_n, y_ravel, test_size=test_size, random_state=42)
    
    # create model with optimal learning rate
    nn_opt = create_nn(
        learning_rate=grid_search.best_params_['model__learning_rate']
    )
    
    nn_opt.fit(X_train, y_train,
               batch_size=grid_search.best_params_['batch_size'],
               epochs=200,
               validation_split=0.2,
               callbacks=[early_stop])

    y_pred_proba = nn_opt.predict(X_test)
    y_pred_class = np.argmax(y_pred_proba, axis=1) 
      
    print("Train="+str(round(1-test_size)))
    print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba, multi_class='ovr'):.4f}")
    print(" ")
    print(" ")

           

Epoch 1/200
[1m2537/2537[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 958us/step - accuracy: 0.8624 - loss: 0.3311 - val_accuracy: 0.8661 - val_loss: 0.3142
Epoch 2/200
[1m2537/2537[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 924us/step - accuracy: 0.8653 - loss: 0.3155 - val_accuracy: 0.8668 - val_loss: 0.3132
Epoch 3/200
[1m2537/2537[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 927us/step - accuracy: 0.8665 - loss: 0.3153 - val_accuracy: 0.8670 - val_loss: 0.3131
Epoch 4/200
[1m2537/2537[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 923us/step - accuracy: 0.8679 - loss: 0.3130 - val_accuracy: 0.8669 - val_loss: 0.3119
Epoch 5/200
[1m2537/2537[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 948us/step - accuracy: 0.8656 - loss: 0.3139 - val_accuracy: 0.8659 - val_loss: 0.3158
Epoch 6/200
[1m2537/2537[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 991us/step - accuracy: 0.8675 - loss: 0.3128 - val_accuracy: 0.8660 - val_loss: 0.312

In [19]:
# save the model

dump(nn_opt, "trained/nn_opt_CDC.joblib")

['trained/nn_opt_CDC.joblib']

## <b>RandomForest</b>

In [20]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE

In [21]:
# Split 80/20 train/test
y_ravel = np.array(y_n).ravel()

X_train, X_test, y_train, y_test = train_test_split(X_n, y_ravel, test_size=0.2, random_state=42)

# Oversample the minority classes
smote = SMOTE(random_state=42,  k_neighbors=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [22]:
# gridsearchcv
param_grid = {
    'n_estimators': [50,100,200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42), 
    param_grid, 
    cv=5, 
    scoring='roc_auc',
    verbose=1,
    n_jobs=4
)

grid_search.fit(X_train_smote, y_train_smote)
print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC:", grid_search.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Best ROC-AUC: 0.9597264794063696


In [23]:
# Check most 'influential' feature to predict the target
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.10250572, 0.03900913, 0.00626016, 0.23048239, 0.0174048 ,
       0.00723347, 0.01591477, 0.01619847, 0.01688682, 0.01424869,
       0.01340387, 0.00668094, 0.00929185, 0.16383855, 0.02929138,
       0.01736452, 0.19334173, 0.10064275])

In [24]:
# Assuming X_n is a Pandas DataFrame
feature_names = X_n.columns

# Rank features by importance
ranked_indices = np.argsort(feature_importances)[::-1]
ranked_importances = feature_importances[ranked_indices]
ranked_feature_names = feature_names[ranked_indices]

# Print ranked feature names and their importances
ranked_features_df = pd.DataFrame({
    "Feature Name": ranked_feature_names,
    "Importance": ranked_importances
})

ranked_features_df

Unnamed: 0,Feature Name,Importance
0,BMI,0.230482
1,Age,0.193342
2,GenHlth,0.163839
3,HighBP,0.102506
4,Income,0.100643
5,HighChol,0.039009
6,DiffWalk,0.029291
7,Smoker,0.017405
8,Sex,0.017365
9,Fruits,0.016887


In [25]:
#Train model with best parameters on 3 different splits

for test_size in [0.2,0.5, 0.8]:
    X_train, X_test, y_train, y_test = train_test_split(X_n, y_ravel, test_size=test_size, random_state=42)    
    
    rf_opt = RandomForestClassifier(**grid_search.best_params_, random_state=42)
    
    rf_opt.fit(X_train, y_train)
    
    y_pred = rf_opt.predict(X_test)
    y_pred_proba = rf_opt.predict_proba(X_test)[:, 1]
    
    print("Train="+str(round(1-test_size,2)))
    
    print(classification_report(y_test, y_pred, zero_division=0))
    print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print(" ")
    print(" ")

Train=0.8
              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     43739
         1.0       0.49      0.17      0.25      6997

    accuracy                           0.86     50736
   macro avg       0.69      0.57      0.59     50736
weighted avg       0.83      0.86      0.83     50736

ROC-AUC: 0.8145
 
 
Train=0.5
              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92    109241
         1.0       0.50      0.18      0.26     17599

    accuracy                           0.86    126840
   macro avg       0.69      0.57      0.59    126840
weighted avg       0.83      0.86      0.83    126840

ROC-AUC: 0.8084
 
 
Train=0.2
              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92    174653
         1.0       0.49      0.17      0.26     28291

    accuracy                           0.86    202944
   macro avg       0.68      0.57      0.59    202944
weig

In [26]:
# save the model

dump(rf_opt, "trained/rf_opt_CDC.joblib")

['trained/rf_opt_CDC.joblib']

## <b>SGDClassifier (SVM approximation)</b>

In [27]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, classification_report


In [28]:
# Split 80/20 train/test

# splitting NORMED data
X_train, X_test, y_train, y_test = train_test_split(X_n, y_ravel, test_size=0.2, random_state=42)

# Oversample the minority class
smote = SMOTE(random_state=42,k_neighbors=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [29]:
# base model
sgd = SGDClassifier(
    learning_rate='optimal',
    penalty='elasticnet',
    max_iter=200,
    random_state=42
)

#gridsearch
param_grid = {
    'alpha' : [1e-5, 1e-3, 1e-1],
    'l1_ratio': [0.15,0.5, 0.85],
    'tol' : [1e-5, 1e-6],
    'loss': ['log_loss']
}

grid_search = GridSearchCV(
    estimator=sgd,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    n_jobs=4
)

grid_search.fit(X_train_smote, y_train_smote)

# Best parameters and evaluation
print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC:", grid_search.best_score_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters: {'alpha': 0.001, 'l1_ratio': 0.15, 'loss': 'log_loss', 'tol': 1e-06}
Best ROC-AUC: 0.8224965657369475


In [30]:
for test_size in [0.2,0.5, 0.8]:
    X_train, X_test, y_train, y_test = train_test_split(X_n, y_ravel, test_size=test_size, shuffle=True, random_state=42)    
    
    sgd_opt = SGDClassifier(
        **grid_search.best_params_, 
        learning_rate='optimal',
        penalty='elasticnet',
        max_iter=1000,
        random_state=42
    )
    
    sgd_opt.fit(X_train, y_train)
    
    y_pred = sgd_opt.predict(X_test)
    y_pred_proba = sgd_opt.predict_proba(X_test)[:,1]
    
    print("Train="+str(round(1 - test_size, 2)))
    print(classification_report(y_test, y_pred, zero_division=0))
    print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print(" ")
    print(" ")

Train=0.8
              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93     43739
         1.0       0.55      0.15      0.24      6997

    accuracy                           0.87     50736
   macro avg       0.71      0.57      0.58     50736
weighted avg       0.83      0.87      0.83     50736

ROC-AUC: 0.8258
 
 
Train=0.5
              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93    109241
         1.0       0.53      0.15      0.24     17599

    accuracy                           0.86    126840
   macro avg       0.71      0.57      0.58    126840
weighted avg       0.83      0.86      0.83    126840

ROC-AUC: 0.8232
 
 
Train=0.2
              precision    recall  f1-score   support

         0.0       0.88      0.98      0.92    174653
         1.0       0.53      0.17      0.25     28291

    accuracy                           0.86    202944
   macro avg       0.70      0.57      0.59    202944
weig

In [31]:
# save the model

dump(sgd_opt, "trained/sgd_opt_CDC.joblib")

['trained/sgd_opt_CDC.joblib']

## <b>Ensemble (Meta-model = SVM)</b>

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score

In [33]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X_n, y_ravel, test_size=0.2, random_state=42)    

# all pretrained models
base_models = [sgd_opt, rf_opt, nn_opt, xgb_opt]

# make meta features
meta_trainl = []
meta_testl = []

for model in base_models:
    if hasattr(model, "predict_proba"):
        meta_trainl.append(model.predict_proba(X_train)[:,1])
        meta_testl.append(model.predict_proba(X_test)[:,1])
    else:
        meta_trainl.append(model.predict(X_train).ravel())
        meta_testl.append(model.predict(X_test).ravel())    #NN doesnt have predict proba

meta_train = np.array(meta_trainl).T
meta_test = np.array(meta_testl).T

[1m6342/6342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 411us/step
[1m1586/1586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 423us/step


In [35]:
svm_meta = SVC(kernel='rbf', probability=True, random_state=42)

param_grid = {
    'C' : [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1]
}

grid_search = GridSearchCV(
    estimator=svm_meta,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,  # Inner cross-validation
    verbose=1,
    n_jobs=-1
)

# Fit the meta-model using GridSearchCV
grid_search.fit(meta_train, y_train)

# Best parameters and evaluation
print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC:", grid_search.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Parameters: {'C': 0.1, 'gamma': 0.01}
Best ROC-AUC: 0.8396255323288571


In [38]:
for test_size in [0.2,0.5, 0.8]:
    # split data
    X_train, X_test, y_train, y_test = train_test_split(X_n, y_ravel, test_size=test_size, random_state=42,shuffle=True)    

    meta_trainl=[]
    meta_testl=[]
    
    for model in base_models:
        if hasattr(model, "predict_proba"):
            meta_trainl.append(model.predict_proba(X_train)[:,1])
            meta_testl.append(model.predict_proba(X_test)[:,1])
        else:
            meta_trainl.append(model.predict(X_train).ravel())
            meta_testl.append(model.predict(X_test).ravel())    #NN doesnt have predict proba
    
    meta_train = np.array(meta_trainl).T
    meta_test = np.array(meta_testl).T
    
    svm_meta_opt = SVC(
        **grid_search.best_params_, 
        probability=True,
        random_state=42
    )
    
    svm_meta_opt.fit(meta_train, y_train)
    
    y_pred = svm_meta_opt.predict(meta_test)
    y_pred_proba = svm_meta_opt.predict_proba(meta_test)[:,1]
    
    print("Train="+str(round(1 - test_size, 2)))
    print(classification_report(y_test, y_pred, zero_division=0))
    print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
    print(" ")
    print(" ")

[1m6342/6342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 405us/step
[1m1586/1586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 409us/step
Train=0.8
              precision    recall  f1-score   support

         0.0       0.86      1.00      0.93     43739
         1.0       0.57      0.03      0.05      6997

    accuracy                           0.86     50736
   macro avg       0.72      0.51      0.49     50736
weighted avg       0.82      0.86      0.81     50736

ROC-AUC: 0.8064
 
 
[1m3964/3964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 411us/step
[1m3964/3964[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 408us/step
Train=0.5
              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92    109241
         1.0       0.49      0.16      0.24     17599

    accuracy                           0.86    126840
   macro avg       0.68      0.57      0.58    126840
weighted avg       0.82      0.86      0.83  

In [39]:
# save the model

dump(svm_meta_opt, "trained/svm_meta_opt_CDC.joblib")

['trained/svm_meta_opt_CDC.joblib']