In [7]:
##############################################
# Programmer: Hannah Horn
# Class: CPSC 322-01, Fall 2024
# Programming Assignment #5
# 10/28/24
# I did not attempt the bonus
# 
# Description: This Jupyter notebook calculates the accuracy and error rates of the different classifiers
#               based on random sub-sampling, k-fold cross validation, and the bootstrap method. It then
#               displays confusion matrices based on the k-fold cross validation results
##############################################

In [8]:
# some useful mysklearn package import statements and reloads
import importlib

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyKNeighborsClassifier, MyDummyClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

# Part 2: 🚗 Auto Classification 🚗

### Step 1 Train/Test Sets: Random Sub-sampling

This section will calculate the accuracy and error rate for the kNN and Dummy classifiers for predicting DOE mpg ratings using number of cylinders, weight, and acceleration attributes. 

This step uses `random sub-sampling` with k = 10 and a 2:1 train/test split (`test_size=0.33`).

In [9]:
# Step 1: predictive accuracy and error rate via random sub-sampling
# (k=10, with test_size=0.33 giving the 2:1 train/test split).

# --- k-nearest neighbors ---
X, y = myutils.prepare_data_random_subsamples()
classifier = MyKNeighborsClassifier()
knn_accuracy, knn_error_rate = myutils.random_subsample(
    X, y, classifier, k=10, test_size=0.33
)

# --- dummy baseline ---
# The data is prepared again and `classifier` is rebound before the second run.
X, y = myutils.prepare_data_random_subsamples()
classifier = MyDummyClassifier()
dummy_accuracy, dummy_error_rate = myutils.random_subsample(
    X, y, classifier, k=10, test_size=0.33
)

# --- report: one line per entry, values rounded to two decimals ---
print(
    "===========================================",
    "STEP 1: Predictive Accuracy and Error Rate",
    "===========================================",
    "Random Subsample (k=10, 2:1 Train/Test)",
    f"k Nearest Neighbors Classifier: accuracy = {knn_accuracy:.2f}, error rate = {knn_error_rate:.2f}",
    f"Dummy Classifier: accuracy = {dummy_accuracy:.2f}, error rate = {dummy_error_rate:.2f}",
    sep="\n",
)



STEP 1: Predictive Accuracy and Error Rate
Random Subsample (k=10, 2:1 Train/Test)
k Nearest Neighbors Classifier: accuracy = 0.40, error rate = 0.60
Dummy Classifier: accuracy = 0.18, error rate = 0.82


### Step 2 Train/Test Sets: Cross Validation

This section will calculate the accuracy and error rate for the kNN and Dummy classifiers for predicting DOE mpg ratings using number of cylinders, weight, and acceleration attributes. 

Instead of random sub-sampling, we are using `k-fold cross validation` with k = 10. 

In [10]:
# Step 2: predictive accuracy and error rate from 10-fold cross validation.
X, y = myutils.prepare_data_random_subsamples()

# --- k-nearest neighbors ---
knn_classifier = MyKNeighborsClassifier()
knn_predictions = myutils.cross_val_predict(
    X, y, knn_classifier, n_splits=10
)
knn_accuracy = myevaluation.accuracy_score(y, knn_predictions)
knn_error_rate = 1 - knn_accuracy  # error rate is the complement of accuracy

# --- dummy baseline ---
dummy_classifier = MyDummyClassifier()
dummy_predictions = myutils.cross_val_predict(
    X, y, dummy_classifier, n_splits=10
)
dummy_accuracy = myevaluation.accuracy_score(y, dummy_predictions)
dummy_error_rate = 1 - dummy_accuracy

# --- report: one line per entry, values rounded to two decimals ---
print(
    "========================================================",
    "STEP 2: Predictive Accuracy with K-Fold Cross-Validation",
    "========================================================",
    "10-Fold Cross Validation",
    f"K-Nearest Neighbors Classifier: accuracy = {knn_accuracy:.2f}, error rate = {knn_error_rate:.2f}",
    f"Dummy Classifier: accuracy = {dummy_accuracy:.2f}, error rate = {dummy_error_rate:.2f}",
    sep="\n",
)


STEP 2: Predictive Accuracy with K-Fold Cross-Validation
10-Fold Cross Validation
K-Nearest Neighbors Classifier: accuracy = 0.39, error rate = 0.61
Dummy Classifier: accuracy = 0.20, error rate = 0.80


### Step 3 Train/Test Sets: Bootstrap Method

This section will calculate the accuracy and error rate for the kNN and Dummy classifiers for predicting DOE mpg ratings using number of cylinders, weight, and acceleration attributes. 

This step uses the `bootstrap method` with k = 10.

In [11]:
# Step 3: predictive accuracy and error rate from the bootstrap method (k=10).
X, y = myutils.prepare_data_random_subsamples()

# --- k-nearest neighbors ---
knn_classifier = MyKNeighborsClassifier()
knn_accuracy, knn_error_rate = myutils.bootstrap_method(
    X, y, knn_classifier, k=10
)

# --- dummy baseline ---
dummy_classifier = MyDummyClassifier()
dummy_accuracy, dummy_error_rate = myutils.bootstrap_method(
    X, y, dummy_classifier, k=10
)

# --- report: one line per entry, values rounded to two decimals ---
print(
    "==============================================",
    "STEP 3: Predictive Accuracy with Bootstrapping",
    "==============================================",
    "k=10 Bootstrap Method",
    f"k-Nearest Neighbors Classifier: accuracy = {knn_accuracy:.2f}, error rate = {knn_error_rate:.2f}",
    f"Dummy Classifier: accuracy = {dummy_accuracy:.2f}, error rate = {dummy_error_rate:.2f}",
    sep="\n",
)



STEP 3: Predictive Accuracy with Bootstrapping
k=10 Bootstrap Method
k-Nearest Neighbors Classifier: accuracy = 0.35, error rate = 0.65
Dummy Classifier: accuracy = 0.21, error rate = 0.79


### Step 4 Confusion Matrices

This step will create confusion matrices for each classifier (kNN and Dummy) based on the 10-fold cross validation results.

In [12]:
from tabulate import tabulate

# Step 4: confusion matrices for both classifiers from 10-fold cross validation.
# NOTE: this cell does not prepare its own data; it uses the X and y left in the
# namespace by an earlier cell, so the notebook must be run top to bottom.

# define the classifiers
knn_classifier = MyKNeighborsClassifier()
dummy_classifier = MyDummyClassifier()

# get predictions based on 10-fold cross validation results
knn_predictions = myutils.cross_val_predict(X, y, knn_classifier, n_splits=10)
dummy_predictions = myutils.cross_val_predict(X, y, dummy_classifier, n_splits=10)

# Unique target labels, explicitly sorted. A set has no guaranteed iteration
# order, so list(set(y)) could scramble the rows/columns of the matrix; sorting
# keeps the table in a stable ascending order on every run.
labels = sorted(set(y))

# build one confusion matrix per classifier, sharing the same label order
knn_confusion = myevaluation.confusion_matrix(y, knn_predictions, labels)
dummy_confusion = myevaluation.confusion_matrix(y, dummy_predictions, labels)

# Print the confusion matrix with the MPG Ranking column
print("K-Nearest Neighbors Confusion Matrix:")
myutils.print_confusion_matrix_with_metrics(knn_confusion, labels)

print("\nDummy Classifier Confusion Matrix:")
myutils.print_confusion_matrix_with_metrics(dummy_confusion, labels)


K-Nearest Neighbors Confusion Matrix:
  MPG Ranking    1    2    3    4    5    6    7    8    9    Total    Recognition (%)
-------------  ---  ---  ---  ---  ---  ---  ---  ---  ---  -------  -----------------
            1   15    4    6    3    0    0    0    0    0       28               53.6
            2    5    0    9    2    0    0    0    0    0       16                0.0
            3    6    5    8   11    5    0    0    0    0       35               22.9
            4    1    2    6   26   15    1    0    0    0       51               51.0
            5    0    0    2   19   15    8    3    0    0       47               31.9
            6    0    0    0    2    7   21    4    1    0       35               60.0
            7    0    0    0    1    3    5    9    7    2       27               33.3
            8    0    0    0    0    0    5    6    8    0       19               42.1
            9    0    0    0    0    0    1    1    0    0        2                0.0

Dumm