## Setup and Imports

In [1]:
# Install the required version of scikit-learn
!pip install numpy==1.23.5 scikit-learn==1.2.2

# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine, load_diabetes
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, mean_squared_error



## Graded Questions 1-3: Wine Dataset with DecisionTreeClassifier

Load Dataset and Split

In [2]:
# Fetch the Wine dataset and pull out the feature matrix and the class labels
wine = load_wine()
X_wine = wine.data
y_wine = wine.target

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible
wine_split = train_test_split(X_wine, y_wine, test_size=0.3, random_state=1)
X_train_wine, X_test_wine, y_train_wine, y_test_wine = wine_split

Hyperparameter Tuning with GridSearchCV

In [3]:
# Base estimator whose hyperparameters are tuned below (seeded for reproducibility)
dt_classifier = DecisionTreeClassifier(random_state=1)

# Candidate values for each hyperparameter; every combination is evaluated
param_grid = {
    'criterion': ['entropy', 'gini'],
    'splitter': ['random', 'best'],
    'min_samples_leaf': [2, 4, 6, 8, 10],
    'max_depth': [3, 4, 5, 6],
}

# Exhaustive 4-fold cross-validated search, ranked by accuracy.
# fit() returns the search object itself, so it can be chained onto the constructor.
grid_search = GridSearchCV(
    estimator=dt_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=4,
).fit(X_train_wine, y_train_wine)

# Winning parameter combination, plus the accuracy on the held-out test set.
# Note: best_score is the test-set score, not the cross-validation score.
best_params = grid_search.best_params_
best_score = grid_search.score(X_test_wine, y_test_wine)

print("Best max_depth:", best_params['max_depth'])
print("Best min_samples_leaf:", best_params['min_samples_leaf'])
print("Test set score:", best_score)

Best max_depth: 4
Best min_samples_leaf: 2
Test set score: 0.9074074074074074


## Graded Questions 4-7: Diabetes Dataset with DecisionTreeRegressor

Load Dataset and Split

In [4]:
# Fetch the Diabetes dataset and pull out the feature matrix and the regression target
diabetes = load_diabetes()
X_diabetes = diabetes.data
y_diabetes = diabetes.target

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible
diabetes_split = train_test_split(
    X_diabetes, y_diabetes, test_size=0.3, random_state=1
)
X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = diabetes_split

Train DecisionTreeRegressor

In [5]:
# Configure the regressor and fit it on the training split in one step
# (fit() returns the estimator itself, so dt_regressor is the fitted model).
dt_regressor = DecisionTreeRegressor(
    criterion='squared_error',
    splitter='random',
    max_leaf_nodes=10,
    random_state=1,
).fit(X_train_diabetes, y_train_diabetes)

# Score the fitted model on the data it was trained on and on the held-out data
train_score = dt_regressor.score(X_train_diabetes, y_train_diabetes)
test_score = dt_regressor.score(X_test_diabetes, y_test_diabetes)

print("Training set score:", train_score)
print("Testing set score:", test_score)

Training set score: 0.5032060624566737
Testing set score: 0.218771375137222


Squared Error at Root Node

In [6]:
# Before any split, every training sample is predicted as the training-target mean,
# so the squared error at the root is the MSE of that constant prediction.
root_prediction = np.full(len(y_train_diabetes), np.mean(y_train_diabetes))
mse_root = mean_squared_error(y_train_diabetes, root_prediction)
print("Squared error at root node:", mse_root)

Squared error at root node: 6302.895487060253


Ratio of Samples in Left and Right Child Nodes

In [7]:
# Compare how many training samples fall into the root's left and right children.
#
# The children are looked up through the fitted tree's structure arrays instead of
# assuming they live at node ids 1 and 2. That numbering only holds when the tree is
# grown best-first (which sklearn does when max_leaf_nodes is set); with depth-first
# growth, node 2 is a child of node 1 whenever node 1 is itself split.
fitted_tree = dt_regressor.tree_
left_child_id = fitted_tree.children_left[0]    # node 0 is the root
right_child_id = fitted_tree.children_right[0]

# sklearn marks "no child" with -1; indexing with it would silently read the last node
if left_child_id == -1 or right_child_id == -1:
    raise ValueError("Root node is a leaf; it has no child nodes to compare.")

n_samples_left = fitted_tree.n_node_samples[left_child_id]    # Left child
n_samples_right = fitted_tree.n_node_samples[right_child_id]  # Right child
ratio = n_samples_left / n_samples_right

print("Ratio of samples in left to right child node:", ratio)

Ratio of samples in left to right child node: 0.1839080459770115


## Graded Questions 8-10: Diabetes Dataset with Hyperparameter Tuning

Hyperparameter Tuning with GridSearchCV

In [8]:
# Base regressor with a fixed depth limit; the remaining hyperparameters are tuned below
dt_regressor_tuned = DecisionTreeRegressor(max_depth=4, random_state=1)

# Candidate values for each hyperparameter; every combination is evaluated
param_grid_tuned = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'splitter': ['random', 'best'],
    'max_features': [4, 5, 6, 'sqrt', 'log2'],
    'ccp_alpha': [0.001, 0.01, 0.02, 0.05],
}

# Exhaustive 4-fold cross-validated search, ranked by negative mean squared error
# (higher is better, so values closer to zero win).
# fit() returns the search object itself, so it can be chained onto the constructor.
grid_search_tuned = GridSearchCV(
    estimator=dt_regressor_tuned,
    param_grid=param_grid_tuned,
    scoring='neg_mean_squared_error',
    cv=4,
).fit(X_train_diabetes, y_train_diabetes)

# Winning parameter combination, plus the score on the held-out test set.
# The score uses the search's scoring metric, so it is the *negative* MSE.
best_params_tuned = grid_search_tuned.best_params_
best_score_tuned = grid_search_tuned.score(X_test_diabetes, y_test_diabetes)

print("Best ccp_alpha:", best_params_tuned['ccp_alpha'])
print("Best max_features:", best_params_tuned['max_features'])
print("Test set score:", best_score_tuned)

Best ccp_alpha: 0.001
Best max_features: 5
Test set score: -3888.1571845947697
