# Classification - Adaptive Boosting

In [18]:
# Import packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import matplotlib.pyplot as plt

Normalised - Without outlier

In [19]:
# We will be using the new_df_without_outliers_copy_smote_resampled.xlsx
df_without_outlier = pd.read_excel('C:\wamp64\www\IS424-Data-Mining\Data_Set\\new_df_without_outliers_copy_smote_resampled.xlsx')
df_without_outlier

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,smoking_history_encoded,gender_encoded,diabetes
0,1.349487,0,1,-0.580455,0.629251,-0.317312,0.263730,-0.974068,0
1,0.149555,0,0,-0.241118,0.629251,-1.649552,-1.579747,-0.974068,0
2,-1.050377,0,0,-0.241118,-0.272192,0.082360,0.263730,1.211318,0
3,-0.681167,0,0,-0.857661,-0.973313,0.015748,0.331039,-0.974068,0
4,1.164882,1,1,-1.384988,-1.173634,0.015748,0.331039,1.211318,0
...,...,...,...,...,...,...,...,...,...
181139,1.349487,0,0,-0.261522,0.979109,-0.628168,1.103451,1.211318,1
181140,1.349487,0,0,-0.241118,-0.172031,0.015748,0.624467,-0.399610,1
181141,-0.727318,0,0,-0.241061,1.025419,0.104564,0.763605,1.211318,1
181142,0.334160,0,0,-0.241118,0.045409,0.015748,-1.579747,1.211318,1


In [20]:
# Split the dataset into features and target variable
X = df_without_outlier.drop(columns=['diabetes'])
y = df_without_outlier['diabetes']

In [21]:
feature_names = X.columns

# Train AdaBoost classifier
adaboost = AdaBoostClassifier(n_estimators=50, random_state=42)
adaboost.fit(X, y)

# Compute feature importance scores
feature_importance = np.mean([est.feature_importances_ for est in adaboost.estimators_], axis=0)

# Normalize feature importance scores
feature_importance_normalized = feature_importance / np.sum(feature_importance)

# Sort feature importance scores and corresponding feature names in descending order
sorted_indices = np.argsort(feature_importance_normalized)[::-1]
sorted_importance = feature_importance_normalized[sorted_indices]
sorted_features = np.array(feature_names)[sorted_indices]

print("Feature Importance (Normalised - Without Outlier):")

# Print feature importance in descending order
for feature, importance in zip(sorted_features, sorted_importance):
    print("{}: {:.4f}".format(feature, importance))

Feature Importance (Normalised - Without Outlier):
HbA1c_level: 0.4000
smoking_history_encoded: 0.2400
gender_encoded: 0.2000
age: 0.0800
blood_glucose_level: 0.0400
bmi: 0.0400
heart_disease: 0.0000
hypertension: 0.0000


Normalised - Outlier Only

In [22]:
# We will be using the new_df_without_outliers_copy_smote_resampled.xlsx
df_outlier = pd.read_excel('C:\wamp64\www\IS424-Data-Mining\Data_Set\\new_df_outliers_only_copy_smote_resampled.xlsx')
df_outlier

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,smoking_history_encoded,gender_encoded,diabetes
0,0.174516,0,0,0.930849,-0.357602,-1.154539,1.806645,-0.777719,0
1,-1.018012,0,0,1.073804,-0.233789,0.206884,-0.280453,-0.777719,0
2,0.949660,0,0,1.656370,1.375782,-0.405756,-0.280453,-0.777719,1
3,-0.779506,0,0,1.006045,-0.048069,-0.746112,-0.280453,1.417943,0
4,-1.256518,0,0,0.553214,-0.357602,-1.154539,-1.553924,-0.777719,0
...,...,...,...,...,...,...,...,...,...
6163,0.449954,0,0,0.397771,-0.410728,-0.364914,-0.280453,-0.777719,0
6164,-0.874241,0,0,0.341732,-1.100480,-0.786955,-1.030285,-0.777719,0
6165,-2.108582,0,0,0.585866,-0.225281,-1.399595,-1.009532,-0.777719,0
6166,-0.640560,0,0,0.629500,-0.108843,-0.405756,-1.133316,-0.052526,0


In [23]:
# Split the dataset into features and target variable
X = df_outlier.drop(columns=['diabetes'])
y = df_outlier['diabetes']

In [24]:
feature_names = X.columns

# Train AdaBoost classifier
adaboost = AdaBoostClassifier(n_estimators=50, random_state=42)
adaboost.fit(X, y)

# Compute feature importance scores
feature_importance = np.mean([est.feature_importances_ for est in adaboost.estimators_], axis=0)

# Normalize feature importance scores
feature_importance_normalized = feature_importance / np.sum(feature_importance)

# Sort feature importance scores and corresponding feature names in descending order
sorted_indices = np.argsort(feature_importance_normalized)[::-1]
sorted_importance = feature_importance_normalized[sorted_indices]
sorted_features = np.array(feature_names)[sorted_indices]

print("Feature Importance (Normalised - Outlier Only):")

# Print feature importance in descending order
for feature, importance in zip(sorted_features, sorted_importance):
    print("{}: {:.4f}".format(feature, importance))

Feature Importance (Normalised - Outlier Only):
smoking_history_encoded: 0.3200
gender_encoded: 0.1800
blood_glucose_level: 0.1400
age: 0.1200
HbA1c_level: 0.1000
bmi: 0.1000
heart_disease: 0.0200
hypertension: 0.0200


Not Normalised - Without Outliers

In [25]:
df_without_outlier_notnorm = pd.read_csv('C:\wamp64\www\IS424-Data-Mining\Data_Set\\new_df_without_outliers_copy_smote_resampled_noNormalised.csv')
df_without_outlier_notnorm

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,smoking_history_encoded,gender_encoded,diabetes
0,80.0,0,1,25.190000,6.600000,140,0.131757,-0.128959,0
1,54.0,0,0,27.320000,6.600000,80,-0.797024,-0.128959,0
2,28.0,0,0,27.320000,5.700000,158,0.131757,0.160772,0
3,36.0,0,0,23.450000,5.000000,155,0.165669,-0.128959,0
4,76.0,1,1,20.140000,4.800000,155,0.165669,0.160772,0
...,...,...,...,...,...,...,...,...,...
181139,80.0,0,0,27.191924,6.949298,126,0.554826,0.160772,1
181140,80.0,0,0,27.320000,5.800000,155,0.313504,-0.052799,1
181141,35.0,0,0,27.320357,6.995535,159,0.383605,0.160772,1
181142,58.0,0,0,27.320000,6.017092,155,-0.797024,0.160772,1


In [26]:
# Split the dataset into features and target variable
X = df_without_outlier_notnorm.drop(columns=['diabetes'])
y = df_without_outlier_notnorm['diabetes']

In [27]:
feature_names = X.columns

# Train AdaBoost classifier
adaboost = AdaBoostClassifier(n_estimators=50, random_state=42)
adaboost.fit(X, y)

# Compute feature importance scores
feature_importance = np.mean([est.feature_importances_ for est in adaboost.estimators_], axis=0)

# Normalize feature importance scores
feature_importance_normalized = feature_importance / np.sum(feature_importance)

# Sort feature importance scores and corresponding feature names in descending order
sorted_indices = np.argsort(feature_importance_normalized)[::-1]
sorted_importance = feature_importance_normalized[sorted_indices]
sorted_features = np.array(feature_names)[sorted_indices]

print("Feature Importance (Without Outliers):")

# Print feature importance in descending order
for feature, importance in zip(sorted_features, sorted_importance):
    print("{}: {:.4f}".format(feature, importance))

Feature Importance (Without Outliers):
HbA1c_level: 0.4000
smoking_history_encoded: 0.2400
gender_encoded: 0.2000
age: 0.0800
blood_glucose_level: 0.0400
bmi: 0.0400
heart_disease: 0.0000
hypertension: 0.0000


Not Normalised - Outliers Only 

In [28]:
df_outlier_notnorm = pd.read_csv('C:\wamp64\www\IS424-Data-Mining\Data_Set\\new_df_outliers_only_copy_smote_resampled_noNormalised.csv')
df_outlier_notnorm

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,smoking_history_encoded,gender_encoded,diabetes
0,54.000000,0,0,54.700000,6.000000,100,0.310171,-0.310005,0
1,34.000000,0,0,56.430000,6.200000,200,-0.052540,-0.310005,0
2,67.000000,0,0,63.480000,8.800000,155,-0.052540,-0.310005,1
3,38.000000,0,0,55.610000,6.500000,130,-0.052540,0.561670,0
4,30.000000,0,0,50.130000,6.000000,100,-0.273853,-0.310005,0
...,...,...,...,...,...,...,...,...,...
6163,58.619390,0,0,48.248887,5.914183,158,-0.052540,-0.310005,0
6164,36.411190,0,0,47.570729,4.800000,127,-0.182851,-0.310005,0
6165,15.709946,0,0,50.525147,6.213743,82,-0.179245,-0.310005,0
6166,40.330285,0,0,51.053191,6.401829,155,-0.200757,-0.022104,0


In [29]:
# Split the dataset into features and target variable
X = df_outlier_notnorm.drop(columns=['diabetes'])
y = df_outlier_notnorm['diabetes']

In [30]:
feature_names = X.columns

# Train AdaBoost classifier
adaboost = AdaBoostClassifier(n_estimators=50, random_state=42)
adaboost.fit(X, y)

# Compute feature importance scores
feature_importance = np.mean([est.feature_importances_ for est in adaboost.estimators_], axis=0)

# Normalize feature importance scores
feature_importance_normalized = feature_importance / np.sum(feature_importance)

# Sort feature importance scores and corresponding feature names in descending order
sorted_indices = np.argsort(feature_importance_normalized)[::-1]
sorted_importance = feature_importance_normalized[sorted_indices]
sorted_features = np.array(feature_names)[sorted_indices]

print("Feature Importance (Outliers Only):")

# Print feature importance in descending order
for feature, importance in zip(sorted_features, sorted_importance):
    print("{}: {:.4f}".format(feature, importance))

Feature Importance (Outliers Only):
smoking_history_encoded: 0.3200
gender_encoded: 0.1800
blood_glucose_level: 0.1400
age: 0.1200
HbA1c_level: 0.1000
bmi: 0.1000
heart_disease: 0.0200
hypertension: 0.0200
