In [19]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import numpy as np


In [20]:
data= pd.read_csv("data/diabetes.csv", sep = ",")
#Import Data Set from previous homework, 80% version worked better.
test_data= pd.read_csv("data/test/clean_test 80percent.csv", sep = ",")
train_data= pd.read_csv("data/train/clean_train 80percent.csv", sep = ",")

In [21]:
train_data.head()

Unnamed: 0,Pregnancies,Glucose,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,9.0,123.0,94.0,33.1,0.374,40.0,0
1,0.0,95.0,92.0,36.5,0.33,26.0,0
2,7.0,150.0,342.0,34.7,0.718,42.0,0
3,2.0,100.0,90.0,32.9,0.867,28.0,1
4,1.0,80.0,60.0,30.0,0.527,22.0,0


XGBoost Classification

In [22]:
X_train = train_data.drop(["Outcome"], axis = 1)
y_train = train_data["Outcome"]
X_test = test_data.drop(["Outcome"], axis = 1)
y_test = test_data["Outcome"]

In [23]:
model = GradientBoostingClassifier(n_estimators=5, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [24]:

model = XGBClassifier(n_estimators = 200, learning_rate = 0.908, random_state = 42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred
report=classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.82      0.95      0.88        44
           1       0.87      0.59      0.70        22

    accuracy                           0.83        66
   macro avg       0.85      0.77      0.79        66
weighted avg       0.84      0.83      0.82        66



Perform Grid Search for the best combination of hyper parameters

In [25]:
#Perform grid search
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3]
}

# Set up the search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, 
                           scoring='accuracy', cv=5, verbose=1, n_jobs=-1)

# Fit the search
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0.3, 'max_depth': 5, 'min_child_weight': 3, 'subsample': 0.8}
Best Score: 0.8181422351233671


Perform Random Search for the best combination of hyper parameters

In [26]:
#Perform random search
param_dist = {
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3]
}

# Set up the Randomized Search
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, 
                                   n_iter=50, scoring='accuracy', cv=5, verbose=1, n_jobs=-1, random_state=42)

# Fit the Randomized Search
random_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'subsample': 1.0, 'min_child_weight': 5, 'max_depth': 3, 'gamma': 0.3, 'colsample_bytree': 0.6}
Best Score: 0.8032656023222062


Compare Grid with Random Search

In [27]:
print(f"Grid score {grid_search.best_score_} and Random Score {random_search.best_score_}")

if grid_search.best_score_ > random_search.best_score_:
    print ("Use Grid Search Result")
else:  print("Use Random Search Result")
    

Grid score 0.8181422351233671 and Random Score 0.8032656023222062
Use Grid Search Result


In [28]:
best_params = grid_search.best_params_

model = XGBClassifier(
    n_estimators=200,
    max_depth=best_params['max_depth'],
    min_child_weight=best_params['min_child_weight'],
    subsample=best_params['subsample'],
    random_state=42
)
model.fit(X_train, y_train)

In [29]:
y_pred = model.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0])

In [30]:
from sklearn.metrics import classification_report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.83      0.98      0.90        44
           1       0.93      0.59      0.72        22

    accuracy                           0.85        66
   macro avg       0.88      0.78      0.81        66
weighted avg       0.86      0.85      0.84        66



In [31]:
from pickle import dump

dump(model, open("model/boosting_classifier_nestimators-20_learnrate-0.001_42.sav", "wb"))

Stacking - 66% - Score deteriorated

In [32]:


X_train_base, X_train_meta, y_train_base, y_train_meta = train_test_split(X_train, y_train, test_size=0.5, random_state=42)

#Train the base models with gradient and classifier
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train_base, y_train_base)

gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train_base, y_train_base)

#Predictions from the base models
xgb_predictions_train = xgb_model.predict(X_train_meta).reshape(-1, 1)  # reshape for stacking
gb_predictions_train = gb_model.predict(X_train_meta).reshape(-1, 1)    # reshape for stacking

# Stack the predictions 
meta_features_train = np.hstack((xgb_predictions_train, gb_predictions_train))

#Train the meta-classifier using the stacked predictions
meta_model = LogisticRegression(random_state=42)
meta_model.fit(meta_features_train, y_train_meta)

# Test set
xgb_predictions_test = xgb_model.predict(X_test).reshape(-1, 1)
gb_predictions_test = gb_model.predict(X_test).reshape(-1, 1)

# Stack
meta_features_test = np.hstack((xgb_predictions_test, gb_predictions_test))

# Final predictions using the meta-classifier
final_predictions = meta_model.predict(meta_features_test)

report = classification_report(y_test, final_predictions)
print(report)


              precision    recall  f1-score   support

           0       0.67      1.00      0.80        44
           1       0.00      0.00      0.00        22

    accuracy                           0.67        66
   macro avg       0.33      0.50      0.40        66
weighted avg       0.44      0.67      0.53        66



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
