# Classification - Ensemble Methods

- An ensemble method is a machine learning technique that combines the predictions of multiple individual models to produce a stronger predictive model.
- The idea behind ensemble methods is to leverage the diversity of different models to improve overall predictive performance and robustness.
- Ensemble methods are widely used in machine learning because they often yield better results than any single model.

In [8]:
# Import packages
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

## Without Outliers

In [9]:
# Load the SMOTE-resampled dataset with outliers removed
# (new_df_without_outliers_copy_smote_resampled.xlsx).
# The path is a raw string so that Windows backslashes such as "\w", "\I" and
# "\D" are taken literally instead of being parsed as (invalid) escape
# sequences; the resulting path is exactly the same as before.
# NOTE(review): this is an absolute, machine-specific location - adjust it when
# running the notebook elsewhere.
df_without_outlier = pd.read_excel(
    r'C:\wamp64\www\IS424-Data-Mining\Data_Set\new_df_without_outliers_copy_smote_resampled.xlsx'
)
df_without_outlier

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,smoking_history_encoded,gender_encoded,diabetes
0,1.349487,0,1,-0.580455,0.629251,-0.317312,0.263730,-0.974068,0
1,0.149555,0,0,-0.241118,0.629251,-1.649552,-1.579747,-0.974068,0
2,-1.050377,0,0,-0.241118,-0.272192,0.082360,0.263730,1.211318,0
3,-0.681167,0,0,-0.857661,-0.973313,0.015748,0.331039,-0.974068,0
4,1.164882,1,1,-1.384988,-1.173634,0.015748,0.331039,1.211318,0
...,...,...,...,...,...,...,...,...,...
181139,1.349487,0,0,-0.261522,0.979109,-0.628168,1.103451,1.211318,1
181140,1.349487,0,0,-0.241118,-0.172031,0.015748,0.624467,-0.399610,1
181141,-0.727318,0,0,-0.241061,1.025419,0.104564,0.763605,1.211318,1
181142,0.334160,0,0,-0.241118,0.045409,0.015748,-1.579747,1.211318,1


In [10]:
# Split the dataset into features (X) and target variable (y).
# Besides the target, 'Unnamed: 0' is removed from the features: in the frame
# displayed above it is simply the row number stored in the spreadsheet, not a
# patient attribute. Leaving it in would let the model learn from row order
# (the last rows shown all carry diabetes == 1, so row position may well
# correlate with the label - presumably because resampled rows were appended).
# errors='ignore' keeps the cell working if that column is not present.
X = (
    df_without_outlier
    .drop(columns=['diabetes'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
y = df_without_outlier['diabetes']

In [11]:
# Hold out 20% of the rows for testing; the remaining 80% is used for training.
# random_state is pinned to 42 for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Build a Gradient Boosting classifier (library defaults, random_state=42)
# and fit it on the training split. The fit call is left as the last
# expression so the fitted estimator is displayed as the cell output.
gb_classifier = GradientBoostingClassifier(
    random_state=42,
)
gb_classifier.fit(X=X_train, y=y_train)

GradientBoostingClassifier(random_state=42)

In [13]:
# Predict labels for the held-out test split, then print a heading followed by
# the per-class precision / recall / f1 report on separate lines.
y_pred_gb = gb_classifier.predict(X_test)
print(
    "Gradient Boosting Classifier Report:",
    classification_report(y_test, y_pred_gb),
    sep="\n",
)

Gradient Boosting Classifier Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     18125
           1       0.99      0.96      0.97     18104

    accuracy                           0.97     36229
   macro avg       0.97      0.97      0.97     36229
weighted avg       0.97      0.97      0.97     36229



## Outliers Only

In [14]:
# Load the SMOTE-resampled dataset that contains only the outlier rows
# (new_df_outliers_only_copy_smote_resampled.xlsx).
# The path is a raw string so that Windows backslashes such as "\w", "\I" and
# "\D" are taken literally instead of being parsed as (invalid) escape
# sequences; the resulting path is exactly the same as before.
# NOTE(review): this is an absolute, machine-specific location - adjust it when
# running the notebook elsewhere.
df_outlier = pd.read_excel(
    r'C:\wamp64\www\IS424-Data-Mining\Data_Set\new_df_outliers_only_copy_smote_resampled.xlsx'
)
df_outlier

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,smoking_history_encoded,gender_encoded,diabetes
0,0.174516,0,0,0.930849,-0.357602,-1.154539,1.806645,-0.777719,0
1,-1.018012,0,0,1.073804,-0.233789,0.206884,-0.280453,-0.777719,0
2,0.949660,0,0,1.656370,1.375782,-0.405756,-0.280453,-0.777719,1
3,-0.779506,0,0,1.006045,-0.048069,-0.746112,-0.280453,1.417943,0
4,-1.256518,0,0,0.553214,-0.357602,-1.154539,-1.553924,-0.777719,0
...,...,...,...,...,...,...,...,...,...
6163,0.449954,0,0,0.397771,-0.410728,-0.364914,-0.280453,-0.777719,0
6164,-0.874241,0,0,0.341732,-1.100480,-0.786955,-1.030285,-0.777719,0
6165,-2.108582,0,0,0.585866,-0.225281,-1.399595,-1.009532,-0.777719,0
6166,-0.640560,0,0,0.629500,-0.108843,-0.405756,-1.133316,-0.052526,0


In [19]:
# Split the dataset into features (X) and target variable (y).
# Besides the target, 'Unnamed: 0' is removed from the features: in the frame
# displayed above it is simply the row number stored in the spreadsheet, not a
# patient attribute, so it should not be offered to the model as a predictor.
# errors='ignore' keeps the cell working if that column is not present.
X = (
    df_outlier
    .drop(columns=['diabetes'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
y = df_outlier['diabetes']

In [21]:
# Hold out 20% of the rows for testing; the remaining 80% is used for training.
# random_state is pinned to 42 for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Build a Gradient Boosting classifier (library defaults, random_state=42)
# and fit it on the training split. The fit call is left as the last
# expression so the fitted estimator is displayed as the cell output.
gb_classifier = GradientBoostingClassifier(
    random_state=42,
)
gb_classifier.fit(X=X_train, y=y_train)

GradientBoostingClassifier(random_state=42)

In [23]:
# Predict labels for the held-out test split, then print a heading followed by
# the per-class precision / recall / f1 report on separate lines.
y_pred_gb = gb_classifier.predict(X_test)
print(
    "Gradient Boosting Classifier Report:",
    classification_report(y_test, y_pred_gb),
    sep="\n",
)

Gradient Boosting Classifier Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       609
           1       1.00      0.97      0.98       625

    accuracy                           0.98      1234
   macro avg       0.98      0.98      0.98      1234
weighted avg       0.98      0.98      0.98      1234

