# W5 Practicals - Performance

### Aims:
* To gain some practical experience in evaluating supervised machine learning
models.
* To produce some assessable work for this subject.

### Procedure:
In Prac W2 we applied k-NN and decision tree models to simple classification and regression datasets. The evaluation of the models was based on a single 70/30 split of the data into training and test data. In this Prac we will look more closely at the evaluation of these models.
> Select one of the questions (3 – 6) from Prac W2 and revisit it for this prac.

> On blackboard you will find a [link](https://docs.google.com/spreadsheets/d/1HAIDBp9ofIeEp5_braHBnwJ-heN8qdaxn5qk3q1c0vo/edit?usp=sharing) to a Google spreadsheet. Go there and enter your answers for Q2 – Q5.

In [204]:
# Common Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

In [205]:
# ANSI color
class color:
  GREEN_BOLD = '\033[1;32m'
  YELLOW_BOLD = '\033[1;33m'
  BLUE_BOLD = '\033[1;34m'
  WHITE_BOLD = '\033[1;37m'
  END = '\033[0m'

### Q1: Repeat Q2 from Prac W2 10 times, saving the 10 resulting training and test sets.

In [206]:
# Specific Imports
import os
import datetime

In [207]:
def load_and_split_data(CSV_file, test_size=0.3, random_state=42):
  """
  Loads a CSV file and splits it into training and test sets.

  Parameters:
  CSV_file (str): The path to the CSV file.
  test_size (float): The proportion of the data to include in the test split.
  random_state (int): The seed used by the random number generator.

  Returns:
  tuple: A tuple containing the training and test sets.
  """
  df = pd.read_csv(CSV_file, header=None)
  X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values
  return train_test_split(X, y, test_size=test_size, random_state=random_state)

def save_split_data(data_splits, iteration, headers, save_dir="datasets", sep=""):
    """
    Saves the combined X_train, y_train, X_test, and y_test for one iteration to CSV files.
    The files are named X{iteration}_train.csv and X{iteration}_test.csv.

    Parameters:
    data_splits (tuple): A tuple containing the training and test sets.
    iteration (int): The iteration number.
    headers (list): A list of column headers for the CSV files.
    save_dir (str): The directory where the CSV files will be saved.
    """
    if not os.path.exists(save_dir):
      os.makedirs(save_dir)

    train_dir = os.path.join(save_dir, 'train')
    test_dir = os.path.join(save_dir, 'test')

    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    X_train, X_test, y_train, y_test = data_splits

    def save_data(X, y, filename):
        combined_data = pd.DataFrame(X)
        combined_data['target'] = y
        combined_data.columns = headers
        combined_data.to_csv(filename, index=False)

    save_data(X_train, y_train, os.path.join(train_dir, f"X{iteration}.csv"))
    save_data(X_test, y_test, os.path.join(test_dir, f"X{iteration}.csv"))

    print(f"{sep*2}Data saved as {color.WHITE_BOLD}'{save_dir}/train/X{iteration}.csv'{color.END} and {color.WHITE_BOLD}'test/X{iteration}.csv'{color.END}")

def q1(test_percent=30, sep=""):
  for type in ["classification", "regression"]:
    print(f"{color.YELLOW_BOLD}{sep}{type.title()}{color.END}")
    if type == "classification":
        CSV = 'w3classif.csv'
        headers = ["feature1", "feature2", "target"]
    elif type == "regression":
        CSV = 'w3regr.csv'
        headers = ["Feature1", "Feature2"]
    else:
        exit

    for i in range(1,11):
        data_splits = load_and_split_data(CSV, test_size=(test_percent/100), random_state=None)  # random_state=None for variability
        save_split_data(data_splits, i, headers, f"dataset/{type}", sep)
q1()

[1;33mClassification[0m
Data saved as [1;37m'dataset/classification/train/X1.csv'[0m and [1;37m'test/X1.csv'[0m
Data saved as [1;37m'dataset/classification/train/X2.csv'[0m and [1;37m'test/X2.csv'[0m
Data saved as [1;37m'dataset/classification/train/X3.csv'[0m and [1;37m'test/X3.csv'[0m
Data saved as [1;37m'dataset/classification/train/X4.csv'[0m and [1;37m'test/X4.csv'[0m
Data saved as [1;37m'dataset/classification/train/X5.csv'[0m and [1;37m'test/X5.csv'[0m
Data saved as [1;37m'dataset/classification/train/X6.csv'[0m and [1;37m'test/X6.csv'[0m
Data saved as [1;37m'dataset/classification/train/X7.csv'[0m and [1;37m'test/X7.csv'[0m
Data saved as [1;37m'dataset/classification/train/X8.csv'[0m and [1;37m'test/X8.csv'[0m
Data saved as [1;37m'dataset/classification/train/X9.csv'[0m and [1;37m'test/X9.csv'[0m
Data saved as [1;37m'dataset/classification/train/X10.csv'[0m and [1;37m'test/X10.csv'[0m
[1;33mRegression[0m
Data saved as [1;37m'dataset/

### Q2: Calculate the training and test set errors over all of the datasets from Q1 and calculate the average training and test errors over the 10 trials. Are the averages lower or higher than the values you found in Prac W3 (or alternatively compare with the values for the first of your 10 runs)?


In [208]:
# Specfic Imports
from sklearn.metrics import accuracy_score, mean_squared_error

In [209]:
def knn_error(model, X, y):
    model.fit(X, y)
    y_pred = model.predict(X)
    if isinstance(model, KNeighborsClassifier): error = 1 - accuracy_score(y, y_pred)
    elif isinstance(model, KNeighborsRegressor): error = mean_squared_error(y, y_pred)
    else: exit

    return error

def q2(sep=""):
  for type in ["classification", "regression"]:
    print(f"{color.YELLOW_BOLD}{sep}{type.title()}{color.END}")

    if type == "classification": model_knn = KNeighborsClassifier(n_neighbors=5)
    elif type == "regression": model_knn = KNeighborsRegressor(n_neighbors=5)
    else: exit

    for data_type in ["train", "test"]:
      errors = []
      for i in range(1, 11):
        X = pd.read_csv(f'dataset/{type}/{data_type}/X{i}.csv')
        y = X.iloc[:, -1].values
        error = knn_error(model_knn, X, y)
        errors.append(error)
      avg_error = np.mean(errors)
    print(f"""{color.BLUE_BOLD}{sep*2}Errors:{color.END}
    {sep*2}Average of {type.title()} {data_type.title()} Error (KNN): {color.WHITE_BOLD}{avg_error}{color.END}""")

q2()

[1;33mClassification[0m
[1;34mErrors:[0m
    Average of Classification Test Error (KNN): [1;37m0.015833333333333355[0m
[1;33mRegression[0m
[1;34mErrors:[0m
    Average of Regression Test Error (KNN): [1;37m132.21048721743227[0m


### Question 3: Repeat Q1 and Q2 but use a different split – try 50/50 or 90/10. Compare your average error values with those you found in Q2.

In [210]:
def q1_q2(test_percent):
  print(f"{color.GREEN_BOLD}Q1 & Q2 for {100-test_percent}/{test_percent} {color.END}")
  q1(test_percent, "\t")
  q2("\t")
  print("\n")

q1_q2(50)
q1_q2(10)

[1;32mQ1 & Q2 for 50/50 [0m
[1;33m	Classification[0m
		Data saved as [1;37m'dataset/classification/train/X1.csv'[0m and [1;37m'test/X1.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X2.csv'[0m and [1;37m'test/X2.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X3.csv'[0m and [1;37m'test/X3.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X4.csv'[0m and [1;37m'test/X4.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X5.csv'[0m and [1;37m'test/X5.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X6.csv'[0m and [1;37m'test/X6.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X7.csv'[0m and [1;37m'test/X7.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X8.csv'[0m and [1;37m'test/X8.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X9.csv'[0m and [1;37m'test/X9.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X10.csv'[0m and [1;37m'test/X10.csv'[0m


### Q4: Calculate the sample standard deviation of your training and test set error values over the 10 trials from Q2 and Q3. What do you observe?

In [211]:
def q4_helper(sep=""):
  for type in ["classification", "regression"]:
    print(f"{color.YELLOW_BOLD}{sep}{type.title()}{color.END}")

    if type == "classification": model_knn = KNeighborsClassifier(n_neighbors=5)
    elif type == "regression": model_knn = KNeighborsRegressor(n_neighbors=5)
    else: exit

    for data_type in ["train", "test"]:
      errors = []
      for i in range(1, 11):
        X = pd.read_csv(f'dataset/{type}/{data_type}/X{i}.csv')
        y = X.iloc[:, -1].values
        error = knn_error(model_knn, X, y)
        errors.append(error)
      avg_error = np.mean(errors)
      std_error = np.std(errors, ddof=1)

    print(f"""{color.BLUE_BOLD}{sep*2}Errors:{color.END}
    {sep*2}Average of {type.title()} {data_type.title()} Error (KNN): {color.WHITE_BOLD}{avg_error}{color.END}
    {sep*2}Standard Devation of {type.title()} {data_type.title()} Error (KNN): {color.WHITE_BOLD}{std_error}{color.END}""")

def q4(test_percent):
  print(f"{color.GREEN_BOLD}Q1 & Q2 for {100-test_percent}/{test_percent} {color.END}")
  q1(test_percent, "\t")
  q4_helper("\t")
  print("\n")

q4(50)
q4(10)

[1;32mQ1 & Q2 for 50/50 [0m
[1;33m	Classification[0m
		Data saved as [1;37m'dataset/classification/train/X1.csv'[0m and [1;37m'test/X1.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X2.csv'[0m and [1;37m'test/X2.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X3.csv'[0m and [1;37m'test/X3.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X4.csv'[0m and [1;37m'test/X4.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X5.csv'[0m and [1;37m'test/X5.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X6.csv'[0m and [1;37m'test/X6.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X7.csv'[0m and [1;37m'test/X7.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X8.csv'[0m and [1;37m'test/X8.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X9.csv'[0m and [1;37m'test/X9.csv'[0m
		Data saved as [1;37m'dataset/classification/train/X10.csv'[0m and [1;37m'test/X10.csv'[0m


### Q5: Perform 10-fold cross validation using your model and the (original) dataset (use existing Matlab or python functions to do this). What are the mean and standard devations of the cross-validation error?

In [212]:
# Specific Imports
from sklearn.model_selection import cross_val_score

In [224]:
def q5_helper(sep=""):
  for type in ["classification", "regression"]:
    print(f"{color.YELLOW_BOLD}{sep}{type.title()}{color.END}")

    if type == "classification": model_knn = KNeighborsClassifier(n_neighbors=5)
    elif type == "regression": model_knn = KNeighborsRegressor(n_neighbors=5)
    else: exit

    for data_type in ["train", "test"]:
      cv_scores = []
      for i in range(1, 1):
        X = pd.read_csv(f'dataset/{type}/{data_type}/X{i}.csv')
        y = X.iloc[:, -1].values
        cv_score = cross_val_score(model_knn, X, y, cv=10, scoring='accuracy')
        cv_scores.append(cv_score)
      avg_error = np.mean(cv_scores)
      std_error = np.std(cv_scores, ddof=1)

    print(f"""{color.BLUE_BOLD}{sep*2}Errors:{color.END}
    {sep*2}Average of {type.title()} {data_type.title()} Error (KNN): {color.WHITE_BOLD}{avg_error}{color.END}
    {sep*2}Standard Devation of {type.title()} {data_type.title()} Error (KNN): {color.WHITE_BOLD}{std_error}{color.END}""")

q5_helper()

[1;33mClassification[0m
[1;34mErrors:[0m
    Average of Classification Test Error (KNN): [1;37mnan[0m
    Standard Devation of Classification Test Error (KNN): [1;37mnan[0m
[1;33mRegression[0m
[1;34mErrors:[0m
    Average of Regression Test Error (KNN): [1;37mnan[0m
    Standard Devation of Regression Test Error (KNN): [1;37mnan[0m


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
