# HW19

### Author: Joseph Wong

## Import the Packages

In [1]:
# NOTE - This list of package imports is getting long
# In a professional setting you would only want to 
#      import what you need!
# I had chatGPT break the packages into groups here

# ============================================================
# Basic packages
# ============================================================
import os                             # For file and directory operations
import numpy as np                    # For numerical computing and arrays
import pandas as pd                   # For data manipulation and analysis

# ============================================================
# Visualization packages
# ============================================================
import matplotlib.pyplot as plt        # Static 2D plotting
import seaborn as sns                  # Statistical data visualization built on matplotlib

# Interactive visualization with Plotly
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default = 'colab'        # Set renderer for interactive output in Colab or notebooks

# ============================================================
# Scikit-learn: Core utilities for model building and evaluation
# ============================================================
from sklearn.model_selection import train_test_split    # Train/test data splitting
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler  # Feature transformations and scaling
from sklearn.metrics import (                            # Model evaluation metrics
    mean_squared_error, r2_score, accuracy_score, 
    precision_score, recall_score, confusion_matrix, 
    classification_report
)

# ============================================================
# Scikit-learn: Linear and polynomial models
# ============================================================
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor       # For KNN

# ============================================================
# Scikit-learn: Synthetic dataset generators
# ============================================================
from sklearn.datasets import make_classification, make_regression

# ============================================================
# Scikit-learn: Naive Bayes models
# ============================================================
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

# ============================================================
# Scikit-learn: Decision Trees
# ============================================================
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree

# ============================================================
# Text Processing Packages and Code
# ============================================================
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from nltk.corpus import stopwords
from nltk import PorterStemmer as Stemmer


# ============================================================
# Dimensionality Reduction
# ============================================================
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


# ============================================================
# Scikit-learn: Cross-Validation and Parameter Searches
# ============================================================

from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# ============================================================
# Scikit-learn: Defining model pipelines
# ============================================================
from sklearn.pipeline import Pipeline

## Import the Data

In [2]:
import kagglehub

# Download the latest version of the dataset
path = kagglehub.dataset_download("amldvvs/avocado-ripeness-classification-dataset")

print("Path to dataset files:", path)

# Note this downloads three files. os.listdir() returns names in arbitrary
# order, so sort the CSV names to make the choice of file deterministic.
csv_files = sorted(f for f in os.listdir(path) if f.lower().endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {path}")

# We will use the first CSV file (alphabetical order).
file = os.path.join(path, csv_files[0])
df = pd.read_csv(file)
df

Path to dataset files: C:\Users\josee\.cache\kagglehub\datasets\amldvvs\avocado-ripeness-classification-dataset\versions\1


Unnamed: 0,firmness,hue,saturation,brightness,color_category,sound_db,weight_g,size_cm3,ripeness
0,14.5,19,40,26,black,34,175,261,ripe
1,71.7,53,69,75,green,69,206,185,pre-conditioned
2,88.5,60,94,46,dark green,79,220,143,hard
3,93.8,105,87,41,dark green,75,299,140,hard
4,42.5,303,58,32,purple,63,200,227,breaking
...,...,...,...,...,...,...,...,...,...
245,94.1,83,80,58,dark green,72,254,134,hard
246,21.6,17,36,19,black,47,182,240,firm-ripe
247,14.0,4,40,17,black,37,188,274,ripe
248,61.5,63,87,75,green,65,261,162,pre-conditioned


**Exploratory data analysis has already been completed in previous homeworks.**

## Test Train Validate Split

In [3]:
# Predictor columns and the regression target column
x_cols = ['hue', 'saturation', 'brightness', 'sound_db', 'weight_g', 'size_cm3']
y_cols = ['firmness']
X, y = df[x_cols], df[y_cols]

# One seed shared by both splits
seed = 16

# Stage 1: set aside 20% of the rows as a hold-out pool
X_train, X_test_full, y_train, y_test_full = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

# Stage 2: halve the hold-out pool into a test set and a validation set
X_test, X_val, y_test, y_val = train_test_split(
    X_test_full, y_test_full, test_size=0.5, random_state=seed
)

# Resulting proportions: 80% train / 10% validation / 10% test
print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")

Train: 200, Validation: 25, Test: 25


## Pipeline, Cross Validation, Hyperparameter Tuning

In [4]:
# Scaling and the tree are chained in one pipeline, so the search refits the
# scaler together with the model on each cross-validation training split.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', DecisionTreeRegressor(random_state=seed))
])

# Candidate values; the 'model__' prefix routes each parameter to the 'model' step
param_dist = {
    'model__max_depth': np.arange(2, 30, 1),
    'model__ccp_alpha': np.arange(0, 0.05, 0.0001)
}

# random_state fixes which n_iter candidates are sampled; without it the
# best parameters (and the model built from them) change on every run.
rand_search_pipe = RandomizedSearchCV(
    pipe,
    param_dist,
    n_iter=10,
    cv=10,
    scoring='neg_mean_squared_error',
    random_state=seed
)

# not X_train_sc because the pipeline scales it automatically
rand_search_pipe.fit(X_train, y_train)

print("Best parameters:", rand_search_pipe.best_params_)
print("Best CV score:", rand_search_pipe.best_score_)

Best parameters: {'model__max_depth': np.int64(3), 'model__ccp_alpha': np.float64(0.025)}
Best CV score: -26.359382281843562


We use a randomized search because it runs more efficiently than an exhaustive grid search. In the run shown above, the optimized max depth is 3 and the optimized ccp alpha is 0.025. Since neither value lies on the border of its search range, we do not need to alter the ranges. We use the negative mean squared error because sklearn always maximizes the score, so an error metric has to be negated.

## Normalize the Data

In [5]:
# Fit the scaler (mu and sigma) on the training data only, so the validation
# and test sets are transformed with the training statistics
scaler = StandardScaler().fit(X_train)

# Apply the fitted scaler to each split
X_train_sc = scaler.transform(X_train)
X_val_sc = scaler.transform(X_val)
X_test_sc = scaler.transform(X_test)

## Final Model

In [6]:
# Index best_params_ directly: .get() would silently return None for a missing
# key, and max_depth=None would fit a tree with no depth limit.
best_params = rand_search_pipe.best_params_
depth = best_params['model__max_depth']
alpha = best_params['model__ccp_alpha']

# Refit a standalone tree with the tuned hyperparameters on the scaled training data
model = DecisionTreeRegressor(random_state=seed, max_depth=depth, ccp_alpha=alpha)
model.fit(X_train_sc, y_train)

# cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=seed)
scores = cross_val_score(model, X_train_sc, y_train, cv=kf, scoring='neg_mean_squared_error')
# scores are negated MSE values, so flip the sign before taking the square root
rmse_scores = np.sqrt(-scores)
print("RMSE for each fold:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())
# standard deviation of the target, as a baseline for judging the RMSE
print(f"\nSTD: {df['firmness'].std():.3f}")

# evaluate test set
y_pred_test = model.predict(X_test_sc)
mse_test = mean_squared_error(y_test, y_pred_test)
# RMSE is in the same units as the CV RMSE and the STD printed above
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred_test)

print("\nTesting Data:")
print(f"MSE: {mse_test:.3f}")
print(f"RMSE: {rmse_test:.3f}")
print(f"R²: {r2_test:.3f}")

RMSE for each fold: [5.40629813 5.61928555 5.39131688 4.96031329 5.73473663]
Mean RMSE: 5.422390095251031

STD: 27.278

Testing Data:
MSE: 17.899
R²: 0.973


Using cross validation, the mean root mean squared error is around 5.4224, which is fairly low considering the standard deviation of firmness is 27.278. This suggests our model predicts relatively close to the actual values. Furthermore, when analyzing the model's performance with the testing set, the R² value is 0.973 and the mean squared error is 17.899. So, the model explains a large amount of the variance in the data and predicts with fairly high accuracy.