<a href="https://www.kaggle.com/code/larsmagnusson/itf31519-workshop-07-10?scriptVersionId=145074077" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load dataset ('quality' is used as the target column below)
dataset = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')

from sklearn.model_selection import train_test_split

# Split into training and test data (75/25). The test size is spelled out so
# the code matches this comment, and random_state is fixed so that every run
# of the notebook produces the same split and therefore comparable results.
train_X, test_X, train_y, test_y = train_test_split(
    dataset.drop('quality', axis=1),  # features: every column except the target
    dataset['quality'],               # target
    test_size=0.25,
    random_state=42,
)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# Create a dictionary with hyperparameter values we want to test
gb_param = {'learning_rate': [0.025, 0.05, 0.1, 0.15, 0.20], # The "size" of each boosting step
            'n_estimators': [50, 100, 150, 200]} # The number of gradient boosted trees to include in the ensemble

# Create a grid search object for finding the best hyperparameter values
# for a gradient boosting classifier and fit (train) it on the training data.
# random_state makes the boosted models reproducible between runs, and
# n_jobs=-1 evaluates the candidates in parallel on all available cores
# (this only affects run time, not the scores).
gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_param, n_jobs=-1)
gb_grid.fit(train_X, train_y)

# IMPORTANT: in addition to performing the grid search to find the best
# values, gb_grid also retrains a model using the best values found. This is
# controlled by the refit parameter in GridSearchCV. gb_grid can be used to
# score and predict using the retrained model.

In [None]:
# Turn the cross-validation results of the gradient boosting grid search into
# a pandas dataframe and display it
gb_cv_results = gb_grid.cv_results_
gb_results = pd.DataFrame(gb_cv_results)
gb_results

In [None]:
# Debug aid: inspect the learning rate recorded for each row of the results table
gb_results.loc[:, 'param_learning_rate']

In [None]:
# Debug aid: inspect the mean test score recorded for each row of the results
# table. These are the values plotted on the y axis.
gb_results.loc[:, 'mean_test_score']

In [None]:
import matplotlib.pyplot as plt

# Plot the values directly: one point per grid-search candidate, with the
# number of estimators on the x axis and the mean test score on the y axis.
# The axes are labelled so it is clear which hyperparameter is being shown.
plt.scatter(gb_results['param_n_estimators'], gb_results['mean_test_score'])
plt.xlabel('n_estimators')
plt.ylabel('Mean cross-validated score')
plt.title('GradientBoosting: score of every grid-search candidate')
plt.show()

In [None]:
# Find the average score for each value of the learning_rate and n_estimators
# hyperparameters. Note that we could potentially select the best score
# (instead of the mean) for each value as well.

# Find the x values directly from the param values. Could be fetched from the
# cv_results as well
x_n_estimators = gb_param['n_estimators']
x_learning_rate = gb_param['learning_rate']

# The grid search evaluates the candidates with the parameter names in sorted
# order and the last name varying fastest, so the flat list of scores is laid
# out as one row per learning_rate and one column per n_estimators. The shape
# is derived from the grid itself instead of being hard-coded, so the reshape
# stays correct (or fails loudly) if the grid above is changed.
gb_mean_test_scores = np.array(gb_results['mean_test_score']).reshape(
    len(x_learning_rate), len(x_n_estimators))
# Averaging across each row (axis=1) gives one mean per learning_rate value
gb_learning_rate_means = np.mean(gb_mean_test_scores, axis=1)
# Averaging down each column (axis=0) gives one mean per n_estimators value
gb_n_estimator_means = np.mean(gb_mean_test_scores, axis=0)
print(gb_n_estimator_means)
print(gb_learning_rate_means)


In [None]:
# Plot the average performance for each n_estimator and learning_rate value tested.
# Each panel is labelled so the two hyperparameters cannot be mixed up.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

ax[0].plot(x_learning_rate, gb_learning_rate_means)
ax[0].set_xlabel('learning_rate')
ax[0].set_ylabel('Mean cross-validated score')
ax[0].set_title('GradientBoosting: learning_rate')

ax[1].plot(x_n_estimators, gb_n_estimator_means)
ax[1].set_xlabel('n_estimators')
ax[1].set_ylabel('Mean cross-validated score')
ax[1].set_title('GradientBoosting: n_estimators')

fig.tight_layout()
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create a dictionary with hyperparameter values we want to test
rf_param = {'max_depth': [4, 6, 8, 10, 12],      # The maximum depth of each decision tree
            'n_estimators': [50, 100, 150, 200]} # The number of decision trees to include in the ensemble

# Create a grid search object for finding the best hyperparameter values
# for a random forest classifier and fit (train) it on the training data.
# random_state makes the forests reproducible between runs, and n_jobs=-1
# evaluates the candidates in parallel on all available cores (this only
# affects run time, not the scores).
rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_param, n_jobs=-1)
rf_grid.fit(train_X, train_y)

In [None]:
# Put the cross-validation results of the random forest grid search into a
# pandas dataframe and display it
rf_cv_results = rf_grid.cv_results_
rf_results = pd.DataFrame(rf_cv_results)
rf_results

In [None]:
# Find the average score for each value of the random forest hyperparameters.
# We have several results for each max_depth (one per n_estimators value) and
# vice versa, so we need to combine them. We'll be using the mean, but other
# methods could illustrate the results more accurately.
rf_x_n_estimators = rf_param['n_estimators']

# The flat list of scores is laid out as one row per max_depth and one column
# per n_estimators (parameter names in sorted order, last one varying
# fastest). The shape comes from the grid itself rather than a hard-coded 4.
rf_mean_test_scores = np.array(rf_results['mean_test_score']).reshape(
    len(rf_param['max_depth']), len(rf_x_n_estimators))

# Averaging across each row (axis=1) gives one mean per max_depth value
rf_max_depth_means = np.mean(rf_mean_test_scores, axis=1)
# Averaging down each column (axis=0) gives one mean per n_estimators value
rf_n_estimators_means = np.mean(rf_mean_test_scores, axis=0)

# Compare the two models over n_estimators. Each curve uses the x values from
# its own parameter grid, so the plot stays correct if the grids ever differ.
plt.plot(x_n_estimators, gb_n_estimator_means, label="GradientBoosting")
plt.plot(rf_x_n_estimators, rf_n_estimators_means, label="RandomForest")
plt.xlabel('n_estimators')
plt.ylabel('Mean cross-validated score')
plt.legend()
plt.show()

In [None]:
x_max_depth = rf_param['max_depth']
# Plot the average performance for each n_estimator and max_depth value tested.
# The n_estimators axis is read from the random forest grid itself, and each
# panel is labelled so the two hyperparameters cannot be mixed up.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

ax[0].plot(x_max_depth, rf_max_depth_means)
ax[0].set_xlabel('max_depth')
ax[0].set_ylabel('Mean cross-validated score')
ax[0].set_title('RandomForest: max_depth')

ax[1].plot(rf_param['n_estimators'], rf_n_estimators_means)
ax[1].set_xlabel('n_estimators')
ax[1].set_ylabel('Mean cross-validated score')
ax[1].set_title('RandomForest: n_estimators')

fig.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Show the confusion matrices of the two tuned models on the held-out test
# data side by side. Each panel gets a title, because otherwise the two
# matrices cannot be told apart.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

ConfusionMatrixDisplay.from_estimator(gb_grid, test_X, test_y, ax=ax[0])
ax[0].set_title('GradientBoosting')

ConfusionMatrixDisplay.from_estimator(rf_grid, test_X, test_y, ax=ax[1])
ax[1].set_title('RandomForest')

fig.tight_layout()
plt.show()