### Exercise 3: SUPERVISED LEARNING

In [1]:
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.utils import resample
import time
import warnings

warnings.filterwarnings("ignore")

##### Load the Indian Liver Patient Dataset (UCI Machine Learning Repository). Preview the first 5 rows to see the structure of the dataset.

In [2]:
ilpd = fetch_ucirepo(id=225)
features = ilpd.data.features
targets = ilpd.data.targets
ilpd_df = pd.concat([features, targets], axis=1, ignore_index=False)
ilpd_df.head(5)

Unnamed: 0,Age,Gender,TB,DB,Alkphos,Sgpt,Sgot,TP,ALB,A/G Ratio,Selector
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [3]:
ilpd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Age        583 non-null    int64  
 1   Gender     583 non-null    object 
 2   TB         583 non-null    float64
 3   DB         583 non-null    float64
 4   Alkphos    583 non-null    int64  
 5   Sgpt       583 non-null    int64  
 6   Sgot       583 non-null    int64  
 7   TP         583 non-null    float64
 8   ALB        583 non-null    float64
 9   A/G Ratio  579 non-null    float64
 10  Selector   583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [4]:
ilpd_df.describe()

Unnamed: 0,Age,TB,DB,Alkphos,Sgpt,Sgot,TP,ALB,A/G Ratio,Selector
count,583.0,583.0,583.0,583.0,583.0,583.0,583.0,583.0,579.0,583.0
mean,44.746141,3.298799,1.486106,290.576329,80.713551,109.910806,6.48319,3.141852,0.947064,1.286449
std,16.189833,6.209522,2.808498,242.937989,182.620356,288.918529,1.085451,0.795519,0.319592,0.45249
min,4.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,1.0
25%,33.0,0.8,0.2,175.5,23.0,25.0,5.8,2.6,0.7,1.0
50%,45.0,1.0,0.3,208.0,35.0,42.0,6.6,3.1,0.93,1.0
75%,58.0,2.6,1.3,298.0,60.5,87.0,7.2,3.8,1.1,2.0
max,90.0,75.0,19.7,2110.0,2000.0,4929.0,9.6,5.5,2.8,2.0


In [5]:
ilpd_df.isnull().sum()

Age          0
Gender       0
TB           0
DB           0
Alkphos      0
Sgpt         0
Sgot         0
TP           0
ALB          0
A/G Ratio    4
Selector     0
dtype: int64

##### Prepare the data for training such as handling missing values.

In [6]:
ilpd_df = ilpd_df.dropna(how='any')

##### Split the data into training and testing sets. Create and train models for both Random Forest and Gradient Boosting (including AdaBoost, GBoost, and XGBoost).

In [7]:
label_encoder = LabelEncoder()
ilpd_df['Gender'] = label_encoder.fit_transform(ilpd_df['Gender'])
ilpd_df['Selector'] = label_encoder.fit_transform(ilpd_df['Selector'])

X = ilpd_df.drop('Selector', axis=1)
y = ilpd_df['Selector']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
t0 = time.time()
rf.fit(X_train, y_train)
print("RandomForestClassifier Training Time:", time.time()-t0)

ada = AdaBoostClassifier(n_estimators=100, learning_rate=0.9, random_state=42)
t1 = time.time()
ada.fit(X_train, y_train)
print("AdaBoostClassifier Training Time:", time.time()-t1)

gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
t2 = time.time()
gb.fit(X_train, y_train)
print("GradientBoostingClassifier Training Time:", time.time()-t2)

xgb = XGBClassifier(n_estimators=100, random_state=42)
t3 = time.time()
xgb.fit(X_train, y_train)
print("XGBClassifier Training Time:", time.time()-t3)

RandomForestClassifier Training Time: 0.1151578426361084
AdaBoostClassifier Training Time: 0.12972378730773926
GradientBoostingClassifier Training Time: 0.11032629013061523
XGBClassifier Training Time: 1.435539960861206


##### Evaluate the performance of each model using Accuracy, Precision, Recall, F1-Score, and ROC-AUC.

In [8]:
def evaluate_model(model, X_test, y_test):
    start_time = time.time()

    y_test_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred)
    recall = recall_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:,1])
    
    print(f'{type(model).__name__} Prediction Time: {time.time() - start_time}')
    
    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC-AUC': roc_auc
    }

rf_results = evaluate_model(rf, X_test, y_test)
ada_results = evaluate_model(ada, X_test, y_test)
gb_results = evaluate_model(gb, X_test, y_test)
xgb_results = evaluate_model(xgb, X_test, y_test)

results_df = pd.DataFrame([rf_results, ada_results, gb_results, xgb_results], index=['Random Forest', 'AdaBoost', ' GBoost', 'XGBoost'])
print(results_df)

RandomForestClassifier Prediction Time: 0.013006925582885742
AdaBoostClassifier Prediction Time: 0.06339883804321289
GradientBoostingClassifier Prediction Time: 0.0011665821075439453
XGBClassifier Prediction Time: 0.022515535354614258
               Accuracy  Precision    Recall  F1 Score   ROC-AUC
Random Forest  0.655172   0.571429  0.279070  0.375000  0.753903
AdaBoost       0.689655   0.606061  0.465116  0.526316  0.726346
 GBoost        0.689655   0.640000  0.372093  0.470588  0.725709
XGBoost        0.672414   0.631579  0.279070  0.387097  0.732080


In [9]:
for model in (rf, ada, gb, xgb):
    n_samples = 30
    bias_train = []
    variance_test = []
    
    for _ in range(n_samples):
        X_resampled, y_resampled = resample(X, y, random_state=42)
        model.fit(X_resampled, y_resampled)
        
        y_train_pred = model.predict(X_resampled)
        y_test_pred = model.predict(X_test)
        
        bias_train.append(np.mean(np.abs(y_train_pred - y_resampled)))
        variance_test.append(np.var(y_test_pred))
    
    mean_bias_train = np.mean(bias_train)
    mean_variance_test = np.mean(variance_test)
    
    print(f'{type(model).__name__} Bias: {mean_bias_train}, Variance: {mean_variance_test}')

RandomForestClassifier Bias: 0.0, Variance: 0.19582342449464932
AdaBoostClassifier Bias: 0.10362694300518133, Variance: 0.19976218787158143
GradientBoostingClassifier Bias: 0.04663212435233159, Variance: 0.1917360285374555
XGBClassifier Bias: 0.0, Variance: 0.20719381688466118
