# Classification - Adaptive Boosting (Best Hyperparameter)

In [27]:
# Import packages
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from imblearn.over_sampling import SMOTE

Normalised - Without outlier

In [28]:
df_no_outlier = pd.read_csv('../Final_Data_Set/Original Dataset without Outliers Normalized.csv')
df_no_outlier

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_encoded,gender_encoded
0,1.713008,0,1,-0.286437,1.134061,0.126046,0,-0.633042,-0.841116
1,0.560337,0,0,0.072849,1.134061,-1.523079,0,-0.633042,-0.841116
2,-0.592335,0,0,0.072849,0.232946,0.620784,0,-0.633042,1.188683
3,-0.237667,0,0,-0.579938,-0.467921,0.538328,0,1.579675,-0.841116
4,1.535674,1,1,-1.138266,-0.668169,0.538328,0,1.579675,1.188683
...,...,...,...,...,...,...,...,...,...
96303,1.713008,0,0,0.072849,0.733566,-1.248225,0,-0.633042,-0.841116
96304,-1.745006,0,0,-1.605507,1.033937,-0.973371,0,-0.633042,-0.841116
96305,1.092339,0,0,0.158875,0.232946,0.538328,0,1.579675,1.188683
96306,-0.769669,0,0,1.439149,-1.469161,-0.973371,0,-0.633042,-0.841116


In [29]:
# Split the dataset into features and target variable
X = df_no_outlier.drop(columns=['diabetes'])
y = df_no_outlier['diabetes']

In [30]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
# Use SMOTE for oversampling
# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the training dataset
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [32]:
# Initialize AdaBoost Classifier
adaboost_clf = AdaBoostClassifier()

In [33]:
# Define hyperparameters grid
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of weak learners
    'learning_rate': [0.1, 0.5, 1.0]  # Learning rate of the boosting process
}

# Perform Grid Search
grid_search = GridSearchCV(estimator=adaboost_clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train_resampled, y_train_resampled)

# Get Best Parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)



Best Hyperparameters: {'learning_rate': 1.0, 'n_estimators': 200}


In [34]:
# Train Model with Best Parameters
ada_classifier = AdaBoostClassifier(**best_params, random_state=42)
ada_classifier.fit(X_train_resampled, y_train_resampled)



In [35]:
# Evaluate the Gradient Boosting classifier
y_pred_ada = ada_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_ada)
precision = precision_score(y_test, y_pred_ada)
recall = recall_score(y_test, y_pred_ada)
f1 = f1_score(y_test, y_pred_ada)
roc_auc = roc_auc_score(y_test, y_pred_ada)

In [36]:
print("Adaptive Boosting Evaluation Report (Normalised w/o outlier):")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("ROC AUC:", roc_auc)

Adaptive Boosting Evaluation Report (Normalised w/o outlier):
Accuracy: 0.9509915896583948
Precision: 0.5663956639566395
Recall: 0.7333333333333333
F1-score: 0.6391437308868502
ROC AUC: 0.8490085715336791


Normalised - With Outliers

In [37]:
# We will be using the new_df_without_outliers_copy_smote_resampled.xlsx
df_outlier = pd.read_csv('../Final_Data_Set/Original Dataset with Outliers Included Normalized.csv')
df_outlier

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_encoded,gender_encoded
0,1.692577,0,1,-0.321051,1.001692,0.047709,0,-0.640425,-0.841175
1,0.537899,0,0,-0.000114,1.001692,-1.426157,0,-0.640425,-0.841175
2,-0.616779,0,0,-0.000114,0.161089,0.489869,0,-0.640425,1.188813
3,-0.261494,0,0,-0.583225,-0.492714,0.416175,0,1.561464,-0.841175
4,1.514935,1,1,-1.081957,-0.679515,0.416175,0,1.561464,1.188813
...,...,...,...,...,...,...,...,...,...
99977,1.692577,0,0,-0.000114,0.628091,-1.180513,0,-0.640425,-0.841175
99978,-1.771458,0,0,-1.499326,0.908292,-0.934869,0,-0.640425,-0.841175
99979,1.070828,0,0,0.076730,0.161089,0.416175,0,1.561464,1.188813
99980,-0.794422,0,0,1.220350,-1.426718,-0.934869,0,-0.640425,-0.841175


In [38]:
# Split the dataset into features and target variable
X = df_outlier.drop(columns=['diabetes'])
y = df_outlier['diabetes']

In [39]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
# Use SMOTE for oversampling
# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the training dataset
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [41]:
# Initialize AdaBoost Classifier
adaboost_clf = AdaBoostClassifier()

In [42]:
# Define hyperparameters grid
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of weak learners
    'learning_rate': [0.1, 0.5, 1.0]  # Learning rate of the boosting process
}

# Perform Grid Search
grid_search = GridSearchCV(estimator=adaboost_clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train_resampled, y_train_resampled)

# Get Best Parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)



In [None]:
# Train Model with Best Parameters
ada_classifier = AdaBoostClassifier(**best_params, random_state=42)
ada_classifier.fit(X_train_resampled, y_train_resampled)

AdaBoostClassifier(n_estimators=200, random_state=42)

In [None]:
# Evaluate the Adaptive Boosting classifier
y_pred_ada = ada_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_ada)
precision = precision_score(y_test, y_pred_ada)
recall = recall_score(y_test, y_pred_ada)
f1 = f1_score(y_test, y_pred_ada)
roc_auc = roc_auc_score(y_test, y_pred_ada)

In [None]:
print("Adaptive Boosting Evaluation Report (Normalised outliers only):")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("ROC AUC:", roc_auc)

Adaptive Boosting Evaluation Report (Normalised outliers only):
Accuracy: 0.960894134120118
Precision: 0.7766016713091922
Recall: 0.7853521126760563
F1-score: 0.7809523809523808
ROC AUC: 0.8816728733723822


Not Normalised - Without Outliers

In [None]:
df_without_outlier_notnorm = pd.read_csv('../Final_Data_Set/Original Dataset without Outliers.csv')
df_without_outlier_notnorm

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_encoded,gender_encoded
0,80.0,0,1,25.19,6.6,140,0,-0.247356,-0.128959
1,54.0,0,0,27.32,6.6,80,0,-0.247356,-0.128959
2,28.0,0,0,27.32,5.7,158,0,-0.247356,0.160772
3,36.0,0,0,23.45,5.0,155,0,0.452953,-0.128959
4,76.0,1,1,20.14,4.8,155,0,0.452953,0.160772
...,...,...,...,...,...,...,...,...,...
96303,80.0,0,0,27.32,6.2,90,0,-0.247356,-0.128959
96304,2.0,0,0,17.37,6.5,100,0,-0.247356,-0.128959
96305,66.0,0,0,27.83,5.7,155,0,0.452953,0.160772
96306,24.0,0,0,35.42,4.0,100,0,-0.247356,-0.128959


In [None]:
# Split the dataset into features and target variable
X = df_without_outlier_notnorm.drop(columns=['diabetes'])
y = df_without_outlier_notnorm['diabetes']

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Use SMOTE for oversampling
# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the training dataset
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Initialize AdaBoost Classifier
adaboost_clf = AdaBoostClassifier()

In [None]:
# Define hyperparameters grid
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of weak learners
    'learning_rate': [0.1, 0.5, 1.0]  # Learning rate of the boosting process
}

# Perform Grid Search
grid_search = GridSearchCV(estimator=adaboost_clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train_resampled, y_train_resampled)

# Get Best Parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)





KeyboardInterrupt: 

In [None]:
# Train Model with Best Parameters
ada_classifier = AdaBoostClassifier(**best_params, random_state=42)
ada_classifier.fit(X_train_resampled, y_train_resampled)

AdaBoostClassifier(n_estimators=200, random_state=42)

In [None]:
# Evaluate the Adaptive Boosting classifier
y_pred_ada = ada_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_ada)
precision = precision_score(y_test, y_pred_ada)
recall = recall_score(y_test, y_pred_ada)
f1 = f1_score(y_test, y_pred_ada)
roc_auc = roc_auc_score(y_test, y_pred_ada)

In [None]:
print("Adaptive Boosting Evaluation Report (Not Normalised w/o outlier):")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("ROC AUC:", roc_auc)

Adaptive Boosting Evaluation Report (Not Normalised w/o outlier):
Accuracy: 0.9692659121586543
Precision: 0.8595800524934383
Recall: 0.5745614035087719
F1-score: 0.6887486855941115
ROC AUC: 0.7843284889743395


Not Normalised - With Outliers

In [None]:
df_outlier_notnorm = pd.read_csv('C:\wamp64\www\IS424-Data-Mining\Final_Data_Set\Original Dataset with Outliers Included.csv')
df_outlier_notnorm

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_encoded,gender_encoded
0,80.0,0,1,25.19,6.6,140,0,-0.246527,-0.119227
1,54.0,0,0,27.32,6.6,80,0,-0.246527,-0.119227
2,28.0,0,0,27.32,5.7,158,0,-0.246527,0.150651
3,36.0,0,0,23.45,5.0,155,0,0.450465,-0.119227
4,76.0,1,1,20.14,4.8,155,0,0.450465,0.150651
...,...,...,...,...,...,...,...,...,...
99977,80.0,0,0,27.32,6.2,90,0,-0.246527,-0.119227
99978,2.0,0,0,17.37,6.5,100,0,-0.246527,-0.119227
99979,66.0,0,0,27.83,5.7,155,0,0.450465,0.150651
99980,24.0,0,0,35.42,4.0,100,0,-0.246527,-0.119227


In [None]:
# Split the dataset into features and target variable
X = df_outlier_notnorm.drop(columns=['diabetes'])
y = df_outlier_notnorm['diabetes']

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Use SMOTE for oversampling
# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the training dataset
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Initialize AdaBoost Classifier
adaboost_clf = AdaBoostClassifier()

In [None]:
# Define hyperparameters grid
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of weak learners
    'learning_rate': [0.1, 0.5, 1.0]  # Learning rate of the boosting process
}

# Perform Grid Search
grid_search = GridSearchCV(estimator=adaboost_clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train_resampled, y_train_resampled)

# Get Best Parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'learning_rate': 1.0, 'n_estimators': 200}


In [None]:
# Train Model with Best Parameters
ada_classifier = AdaBoostClassifier(**best_params, random_state=42)
ada_classifier.fit(X_train_resampled, y_train_resampled)

AdaBoostClassifier(n_estimators=200, random_state=42)

In [None]:
# Evaluate the Adaptive Boosting classifier
y_pred_ada = ada_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_ada)
precision = precision_score(y_test, y_pred_ada)
recall = recall_score(y_test, y_pred_ada)
f1 = f1_score(y_test, y_pred_ada)
roc_auc = roc_auc_score(y_test, y_pred_ada)


In [None]:
print("Adaptive Boosting Evaluation Report (Not Normalised outliers only):")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("ROC AUC:", roc_auc)

Adaptive Boosting Evaluation Report (Not Normalised outliers only):
Accuracy: 0.9703455518327749
Precision: 0.9484066767830045
Recall: 0.704225352112676
F1-score: 0.808276753960556
ROC AUC: 0.850246799643211
