<a href="https://colab.research.google.com/github/jcdevaney/audioUnderstandingSp2025/blob/main/week6/metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries and Data

In [1]:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix

# Pin scikit-learn to 1.4; presumably because a later cell imports
# `root_mean_squared_error` from sklearn.metrics — TODO confirm the pin is still needed.
# NOTE(review): the recorded output shows this replacing an already-installed 1.6.1.
!pip install -U scikit-learn==1.4

Collecting scikit-learn==1.4
  Downloading scikit_learn-1.4.0-1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.4.0-1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed scikit-learn-1.4.0


# Import Data

## Iris Dataset for Classification

In [2]:
# import Iris dataset
from sklearn.datasets import load_iris
iris = load_iris()

## California Housing Data for Regression

In [3]:
# Import the California Housing data used in Chapter 2 of the HOML book

import os
import tarfile
import urllib
import urllib.request  # `import urllib` alone does not load the `request` submodule

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the gzipped tar archive to download.
    housing_path : str
        Directory that receives ``housing.tgz`` and the extracted files.
        It is created if it does not already exist.

    Returns
    -------
    None
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: refuse members that would
            # escape the target directory or create special files.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older interpreters without extraction filters.
            housing_tgz.extractall(path=housing_path)

In [4]:
# Download and unpack the housing archive using the default URL and target directory
fetch_housing_data()

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return the parsed table."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [5]:
# Load the housing table, then separate the regression target from the predictors
housing = load_housing_data()
y = housing['median_house_value']
# Predictors: every column except the target, 'ocean_proximity' and 'total_bedrooms'
X = housing.drop(columns=['median_house_value', 'ocean_proximity', 'total_bedrooms'])

# Testing/training split

In [6]:
# Use a stratified split for the iris dataset

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the splitter yields exactly one (train, test) pair of index arrays
train_index, test_index = next(split.split(iris.data, iris.target))
X_iris_train, X_iris_test = iris.data[train_index], iris.data[test_index]
y_iris_train, y_iris_test = iris.target[train_index], iris.target[test_index]

In [7]:
# Use a regular (non-stratified) random split for the housing dataset

from sklearn.model_selection import train_test_split

# Partition features and target together into a training and a test set;
# no test_size is passed, so the library default applies
X_train, X_test, y_housing_train, y_housing_test = train_test_split(
    X,
    y,
    stratify=None,
    random_state=0,
)

In [8]:
# Use a mean imputer to replace missing values
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
# Fit on the training data only, so the imputation statistics never see the test set
imp_mean.fit(X_train)
# apply the fitted imputer to both the training and the testing data
X_housing_train = imp_mean.transform(X_train)
X_housing_test = imp_mean.transform(X_test)

# Metrics

## Classification

In [9]:
# import the KNN classifier and fit it to the iris training data
from sklearn.neighbors import KNeighborsClassifier
# fit() returns the estimator itself, so construction and fitting can be chained
knn = KNeighborsClassifier(n_neighbors=1).fit(X_iris_train, y_iris_train)

# predicted class labels for the held-out iris samples
y_iris_pred = knn.predict(X_iris_test)

In [10]:
# confusion matrix for the iris test set (rows: true class, columns: predicted class)
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y_iris_test, y_pred=y_iris_pred)

array([[10,  0,  0],
       [ 0, 10,  0],
       [ 0,  1,  9]])

In [11]:
# per-class precision on the iris test set (average=None returns one score per class)
from sklearn.metrics import precision_score
precision_score(y_true=y_iris_test, y_pred=y_iris_pred, average=None)

array([1.        , 0.90909091, 1.        ])

In [12]:
# per-class recall on the iris test set (average=None returns one score per class)
from sklearn.metrics import recall_score
recall_score(y_true=y_iris_test, y_pred=y_iris_pred, average=None)

array([1. , 1. , 0.9])

In [13]:
# calculate overall and class-wise F1-score on iris data
from sklearn.metrics import f1_score

# One score is printed per averaging mode, in this order:
#   'micro'    - computed globally by counting the total true positives,
#                false negatives and false positives.
#   'macro'    - computed per label, then averaged without weights;
#                does not take label imbalance into account.
#   'weighted' - computed per label, then averaged weighted by support (the
#                number of true instances for each label); accounts for label
#                imbalance and can give an F-score that is not between
#                precision and recall.
#   None       - class-wise, no averaging.
for averaging in ('micro', 'macro', 'weighted', None):
    print(f1_score(y_iris_test, y_iris_pred, average=averaging))

0.9666666666666667
0.9665831244778613
0.9665831244778613
[1.         0.95238095 0.94736842]


In [14]:
# calculate the AUC score on iris data; the three-class problem is scored
# with the multi_class='ovr' (one-vs-rest) setting
from sklearn.metrics import roc_auc_score
iris_test_proba = knn.predict_proba(X_iris_test)
roc_auc_score(y_iris_test, iris_test_proba, multi_class='ovr')

0.975

## Regression

In [15]:
from sklearn.linear_model import LinearRegression

# create the linear model, then fit it to the housing training set
linreg = LinearRegression()
linreg.fit(X_housing_train, y_housing_train)

# predictions for the held-out housing samples
y_housing_pred = linreg.predict(X_housing_test)

In [16]:
# root mean squared error (RMSE) of the linear model on the housing test set
from sklearn.metrics import root_mean_squared_error

root_mean_squared_error(y_true=y_housing_test, y_pred=y_housing_pred)

70639.47198102309

In [17]:
# calculate MAE on housing data
from sklearn.metrics import mean_absolute_error

# Score the existing y_housing_pred predictions; the model does not need to
# predict on the test set a second time.
mean_absolute_error(y_housing_test, y_housing_pred)

51604.687324832186