# Classification - Adaptive Boosting

In [1]:
# Import packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Normalised - Without Outliers

In [3]:
# Read the normalised CSV stored under the "without Outliers" file name.
# The frame is left as the cell's final expression so the notebook renders it.
df_no_outlier = pd.read_csv(
    filepath_or_buffer='../Final_Data_Set/Original Dataset without Outliers Normalized.csv',
)
df_no_outlier

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_encoded,gender_encoded
0,1.713008,0,1,-0.286437,1.134061,0.126046,0,-0.633042,-0.841116
1,0.560337,0,0,0.072849,1.134061,-1.523079,0,-0.633042,-0.841116
2,-0.592335,0,0,0.072849,0.232946,0.620784,0,-0.633042,1.188683
3,-0.237667,0,0,-0.579938,-0.467921,0.538328,0,1.579675,-0.841116
4,1.535674,1,1,-1.138266,-0.668169,0.538328,0,1.579675,1.188683
...,...,...,...,...,...,...,...,...,...
96303,1.713008,0,0,0.072849,0.733566,-1.248225,0,-0.633042,-0.841116
96304,-1.745006,0,0,-1.605507,1.033937,-0.973371,0,-0.633042,-0.841116
96305,1.092339,0,0,0.158875,0.232946,0.538328,0,1.579675,1.188683
96306,-0.769669,0,0,1.439149,-1.469161,-0.973371,0,-0.633042,-0.841116


In [7]:
# Separate the label from the predictors:
#   y -> the 'diabetes' column (target variable)
#   X -> every remaining column of df_no_outlier (features)
y = df_no_outlier['diabetes']
X = df_no_outlier.drop(columns='diabetes')

In [8]:
feature_names = X.columns

# Train AdaBoost classifier on the full feature matrix (no hold-out split is
# made here; the fitted model is only used to rank features).
adaboost = AdaBoostClassifier(n_estimators=50, random_state=42)
adaboost.fit(X, y)

# Compute feature importance scores.
# Use the ensemble's own `feature_importances_`, which averages the per-stage
# importances weighted by each stage's boosting weight (`estimator_weights_`).
# A plain mean over `adaboost.estimators_` only agrees with it when every
# stage carries the same weight, so it can misreport the ranking otherwise.
feature_importance = np.asarray(adaboost.feature_importances_, dtype=float)

# Normalize feature importance scores so they sum to 1.
# Guard the division: a total of 0 would otherwise turn every score into NaN.
importance_total = feature_importance.sum()
if importance_total > 0:
    feature_importance_normalized = feature_importance / importance_total
else:
    feature_importance_normalized = feature_importance

# Sort feature importance scores and corresponding feature names in descending order
sorted_indices = np.argsort(feature_importance_normalized)[::-1]
sorted_importance = feature_importance_normalized[sorted_indices]
sorted_features = np.array(feature_names)[sorted_indices]

print("Feature Importance (Normalised - Without Outlier):")

# Print feature importance in descending order
for feature, importance in zip(sorted_features, sorted_importance):
    print("{}: {:.4f}".format(feature, importance))



Feature Importance (Normalised - Without Outlier):
blood_glucose_level: 0.5200
age: 0.2000
bmi: 0.1600
HbA1c_level: 0.0400
gender_encoded: 0.0200
smoking_history_encoded: 0.0200
heart_disease: 0.0200
hypertension: 0.0200


Normalised - With Outliers

In [12]:
# Read the normalised CSV stored under the "with Outliers Included" file name.
# The frame is left as the cell's final expression so the notebook renders it.
df_outlier = pd.read_csv(
    filepath_or_buffer='../Final_Data_Set/Original Dataset with Outliers Included Normalized.csv',
)
df_outlier

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,smoking_history_encoded,gender_encoded
0,1.692577,0,1,-0.321051,1.001692,0.047709,0,-0.640425,-0.841175
1,0.537899,0,0,-0.000114,1.001692,-1.426157,0,-0.640425,-0.841175
2,-0.616779,0,0,-0.000114,0.161089,0.489869,0,-0.640425,1.188813
3,-0.261494,0,0,-0.583225,-0.492714,0.416175,0,1.561464,-0.841175
4,1.514935,1,1,-1.081957,-0.679515,0.416175,0,1.561464,1.188813
...,...,...,...,...,...,...,...,...,...
99977,1.692577,0,0,-0.000114,0.628091,-1.180513,0,-0.640425,-0.841175
99978,-1.771458,0,0,-1.499326,0.908292,-0.934869,0,-0.640425,-0.841175
99979,1.070828,0,0,0.076730,0.161089,0.416175,0,1.561464,1.188813
99980,-0.794422,0,0,1.220350,-1.426718,-0.934869,0,-0.640425,-0.841175


In [13]:
# Separate the label from the predictors:
#   y -> the 'diabetes' column (target variable)
#   X -> every remaining column of df_outlier (features)
y = df_outlier['diabetes']
X = df_outlier.drop(columns='diabetes')

In [14]:
feature_names = X.columns

# Train AdaBoost classifier on the full feature matrix (no hold-out split is
# made here; the fitted model is only used to rank features).
adaboost = AdaBoostClassifier(n_estimators=50, random_state=42)
adaboost.fit(X, y)

# Compute feature importance scores.
# Use the ensemble's own `feature_importances_`, which averages the per-stage
# importances weighted by each stage's boosting weight (`estimator_weights_`).
# A plain mean over `adaboost.estimators_` only agrees with it when every
# stage carries the same weight, so it can misreport the ranking otherwise.
feature_importance = np.asarray(adaboost.feature_importances_, dtype=float)

# Normalize feature importance scores so they sum to 1.
# Guard the division: a total of 0 would otherwise turn every score into NaN.
importance_total = feature_importance.sum()
if importance_total > 0:
    feature_importance_normalized = feature_importance / importance_total
else:
    feature_importance_normalized = feature_importance

# Sort feature importance scores and corresponding feature names in descending order
sorted_indices = np.argsort(feature_importance_normalized)[::-1]
sorted_importance = feature_importance_normalized[sorted_indices]
sorted_features = np.array(feature_names)[sorted_indices]

print("Feature Importance (Normalised - With Outliers):")

# Print feature importance in descending order
for feature, importance in zip(sorted_features, sorted_importance):
    print("{}: {:.4f}".format(feature, importance))



Feature Importance (Normalised - With Outliers):
blood_glucose_level: 0.5200
age: 0.2000
bmi: 0.1600
HbA1c_level: 0.0400
gender_encoded: 0.0200
smoking_history_encoded: 0.0200
heart_disease: 0.0200
hypertension: 0.0200
