# Import Modules

In [None]:
import cv2 as cv
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from skimage.feature import hog
from skimage import exposure
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

from IPython.display import Markdown

# Import caching utility functions from local module

In [None]:
# Autoreload module when changes are made
%load_ext autoreload
%autoreload 2
import caching_utils

# Loading Images

In [None]:
# Maps the class name found at the start of each image file name to the
# integer label used by the classifiers.
class_label_encoding = {
    'SPOILED': 0,
    'HALF': 1,
    'FRESH': 2
}

In [None]:
def load_images(file_path, output_x, output_y):
    """
    Load every labelled image in a directory into the given output lists.

    The class name is the part of the file name before the first '-'.
    Each image is converted to RGB float32, resized to 128x128 and scaled
    to the [0, 1] range.

    Parameters:
    - file_path: Directory containing the image files
    - output_x: List the processed images are appended to
    - output_y: List the encoded class labels are appended to

    Raises:
    - ValueError: If a file cannot be decoded as an image
    - KeyError: If a file name prefix is not a key of class_label_encoding
    """
    for file_name in os.listdir(file_path):
        class_name = file_name.split('-')[0]
        # The annotation file lives next to the images; it has no '-' so the
        # whole file name ends up as the "class name".
        if class_name == '_classes.csv':
            continue

        # os.path.join works whether or not file_path ends with a separator.
        full_path = os.path.join(file_path, file_name)
        img = cv.imread(full_path)
        if img is None:
            # cv.imread signals failure by returning None instead of raising.
            raise ValueError(f"Could not read image file: {full_path}")

        img = cv.cvtColor(img.astype('float32'), cv.COLOR_BGR2RGB)
        img = cv.resize(img, (128, 128), interpolation=cv.INTER_AREA)
        img /= 255
        output_x.append(img)
        output_y.append(class_label_encoding[class_name])

In [None]:
# Containers filled in place by load_images.
train_x = []
train_y = []
test_x = []
test_y = []

# The 'valid' directory is used as the test split throughout this notebook.
load_images('data/train/', train_x, train_y)
load_images('data/valid/', test_x, test_y)

In [None]:
# Display the first loaded training image array as the cell output.
train_x[0]

# Feature Extraction

## Color Histogram
Jason

In [None]:
def extract_color_histogram(image, bins=32):
    """
    Extract color histogram features from an image.

    One histogram is computed per channel, min-max normalized to [0, 1],
    and the three histograms are concatenated in channel order.

    Parameters:
    - image: The input image (RGB, pixel values scaled to [0, 1])
    - bins: Number of bins for each channel histogram

    Returns:
    - histogram_features: 1-D array of length 3 * bins
    """
    # cv.calcHist treats the upper range boundary as exclusive, so a range of
    # [0, 1] silently drops fully saturated pixels (value exactly 1.0).
    # Use the next representable float32 above 1.0 so they land in the last bin.
    upper_bound = float(np.nextafter(np.float32(1.0), np.float32(2.0)))

    channel_histograms = []
    for channel in range(3):
        hist = cv.calcHist([image], [channel], None, [bins], [0, upper_bound])
        # Scale each channel's histogram independently to the [0, 1] range.
        cv.normalize(hist, hist, 0, 1, cv.NORM_MINMAX)
        channel_histograms.append(hist.flatten())

    return np.concatenate(channel_histograms)

In [None]:
def plot_color_histogram(image, bins=32, title="Color Histogram"):
    """
    Show an image next to the histogram of each of its color channels.

    Parameters:
    - image: The input image (should be in RGB format)
    - bins: Number of bins for the histogram
    - title: Title for the plot

    Returns:
    - features: The flattened histogram features produced by
      extract_color_histogram (the plot is displayed as a side effect)
    """
    fig, ax = plt.subplots(1, 4, figsize=(16, 4))

    # Leftmost panel: the image itself
    image_axis = ax[0]
    image_axis.imshow(image)
    image_axis.set_title('Original Image')
    image_axis.axis('off')

    # The feature vector is the three channel histograms back to back,
    # so it can be viewed as three (bins, 1) columns.
    features = extract_color_histogram(image, bins)
    per_channel = features.reshape(3, bins, 1)

    # Remaining panels: one histogram per channel
    channel_styles = (('r', 'Red'), ('g', 'Green'), ('b', 'Blue'))
    for axis, hist, (line_color, channel_name) in zip(ax[1:], per_channel, channel_styles):
        axis.plot(hist, color=line_color)
        axis.set_xlim([0, bins])
        axis.set_title(f'{channel_name} Histogram')
        axis.set_xlabel('Bins')
        axis.set_ylabel('# of Pixels')
        axis.grid(True, alpha=0.3)

    plt.suptitle(title, fontsize=16)
    plt.tight_layout()
    plt.show()

    return features

In [None]:
# Plot the per-channel color histograms of one sample image.
image = train_x[0]  # Get the first image
plot_color_histogram(image, bins=32, title="Meat Sample Color Histogram")
# NOTE(review): presumably here so the cell does not echo the returned
# feature array as its output — confirm.
print()

In [None]:
def load_feature_hist(use_cached=True) -> tuple[np.ndarray, np.ndarray]:
    """
    Get color histogram features for the training and test images.

    Each split is read from the feature cache when available; otherwise it
    is recomputed from train_x / test_x and written back to the cache.

    Parameters:
    - use_cached: When False, features are always recomputed and re-saved

    Returns:
    - tuple (train_features, test_features)
    """
    train, train_hit = caching_utils.attempt_load_feature_from_cache("raw_hist_train.csv")
    if not (train_hit and use_cached):  # Cache miss or forced refresh
        train = np.array([extract_color_histogram(sample) for sample in train_x])
        caching_utils.save_feature_to_cache("raw_hist_train.csv", train)

    test, test_hit = caching_utils.attempt_load_feature_from_cache("raw_hist_test.csv")
    if not (test_hit and use_cached):
        test = np.array([extract_color_histogram(sample) for sample in test_x])
        caching_utils.save_feature_to_cache("raw_hist_test.csv", test)

    return (train, test)
    

## Local Binary Pattern
Aiden

In [None]:
def get_pixel(img, center, x, y):
    """
    Gets Local Binary Patterns values for pixel adjacent to the selected one.

    Neighbours that fall outside the image (on any side) count as 0.

    Parameters:
    - img: The image containing the pixels
    - center: The value of the pixel Local Binary Patterns is being applied to
    - x: The x coordinate of the adjacent pixel (first index into img)
    - y: The y coordinate of the adjacent pixel (second index into img)

    Returns:
    - new_value: 1 if the adjacent pixel is >= center, otherwise 0
    """
    # Negative indices do not raise; they wrap around to the opposite edge.
    # Reject them explicitly so top/left border pixels are not compared
    # against pixels from the bottom/right of the image.
    if x < 0 or y < 0:
        return 0

    try:
        neighbour = img[x][y]
    except IndexError:
        # Neighbour lies past the bottom/right edge of the image.
        return 0

    return 1 if neighbour >= center else 0
   
# Compute the LBP code of a single pixel
def lbp_calculated_pixel(img, x, y):
    """
    Apply the Local Binary Patterns to a single pixel.

    Parameters:
    - img: The image containing the pixel
    - x: The x coordinate of the selected pixel
    - y: The y coordinate of the selected pixel

    Returns:
    - val: The output value for the selected pixel after applying Local Binary Patterns
    """
    center = img[x][y]

    # Neighbour offsets, clockwise starting at the top-left corner.
    # The neighbour at position k contributes 2**k to the code.
    neighbour_offsets = (
        (-1, -1),  # top_left
        (-1, 0),   # top
        (-1, 1),   # top_right
        (0, 1),    # right
        (1, 1),    # bottom_right
        (1, 0),    # bottom
        (1, -1),   # bottom_left
        (0, -1),   # left
    )

    code = 0
    for bit, (dx, dy) in enumerate(neighbour_offsets):
        code += get_pixel(img, center, x + dx, y + dy) << bit

    return code


def lbp_output(img_bgr):
    """
    Apply the Local Binary Patterns filter to an image.

    Parameters:
    - img_bgr: The 3-channel image to be transformed

    Returns:
    - image_lbp: float32 array of the same height and width holding the
      Local Binary Patterns code of every pixel
    """
    rows, cols, _ = img_bgr.shape

    # NOTE(review): load_images converts images to RGB, but the conversion
    # below assumes BGR channel order — confirm which order is intended.
    gray = cv.cvtColor(img_bgr, cv.COLOR_BGR2GRAY)

    # One LBP code per pixel, visited in row-major order
    lbp = np.zeros((rows, cols), np.float32)
    for row, col in np.ndindex(rows, cols):
        lbp[row, col] = lbp_calculated_pixel(gray, row, col)

    return lbp

In [None]:
# Show the first training image next to its Local Binary Patterns output.
img_bgr = train_x[0]
img_lbp = lbp_output(img_bgr)
  
plt.imshow(img_bgr) 
plt.show()
   
# The LBP result is single-channel, so render it with a gray colormap.
plt.imshow(img_lbp, cmap ="gray")
plt.show()

In [None]:
def save_images_lbp(imgs, labels, train_test='train'):
    """
    Save images with the Local Binary Patterns filter applied and unique file names based on label.

    Files are written to data/lbp/<train_test>/ as <LABEL>-<index>-lbp.jpg.
    The output directory is created if it does not exist.

    Parameters:
    - imgs: List of images to be transformed
    - labels: List of integer labels for each image (0=SPOILED, 1=HALF, 2=FRESH)
    - train_test: The string value either 'train' or 'test' which determines the output directory

    Raises:
    - IOError: If an image could not be written
    """
    label_text = ['SPOILED', 'HALF', 'FRESH']

    output_dir = f'data/lbp/{train_test}'
    # cv.imwrite does not create missing directories; it just reports failure.
    os.makedirs(output_dir, exist_ok=True)

    for index, img in enumerate(imgs):
        lbp_image = lbp_output(img)
        filename = f'{output_dir}/{label_text[labels[index]]}-{index}-lbp.jpg'
        # cv.imwrite returns False on failure instead of raising.
        if not cv.imwrite(filename, lbp_image):
            raise IOError(f'Could not write LBP image: {filename}')

In [None]:
# save_images_lbp(train_x, train_y, train_test='train')

In [None]:
# save_images_lbp(test_x, test_y, train_test='test')

In [None]:
def load_feature_lbp(use_cached=True) -> tuple[np.ndarray, np.ndarray]:
    """
    Get flattened Local Binary Patterns features for the training and test images.

    Each split is read from the feature cache when available; otherwise it
    is recomputed from train_x / test_x and written back to the cache.

    Parameters:
    - use_cached: When False, features are always recomputed and re-saved

    Returns:
    - tuple (train_features, test_features)
    """
    train, train_hit = caching_utils.attempt_load_feature_from_cache("raw_lbp_train.csv")
    if not (train_hit and use_cached):  # Cache miss or forced refresh
        train = np.array([lbp_output(sample).flatten() for sample in train_x])
        caching_utils.save_feature_to_cache("raw_lbp_train.csv", train)

    test, test_hit = caching_utils.attempt_load_feature_from_cache("raw_lbp_test.csv")
    if not (test_hit and use_cached):
        test = np.array([lbp_output(sample).flatten() for sample in test_x])
        caching_utils.save_feature_to_cache("raw_lbp_test.csv", test)

    return (train, test)

## Histograms of Oriented Gradients
Fiona

In [None]:
# Adapted from here: https://scikit-image.org/docs/stable/auto_examples/features_detection/plot_hog.html
def make_hog(image, visualize=False):
    """
    Compute Histogram of Oriented Gradients features for an image.

    Parameters:
    - image: The input image with the color channels on the last axis
    - visualize: When True, also plot the image next to its HOG rendering

    Returns:
    - features: The HOG feature vector
    """
    # Only ask skimage for the HOG rendering when it will actually be shown;
    # building it is extra work and does not affect the feature vector.
    result = hog(
            image,
            orientations=16,
            pixels_per_cell=(8, 8),
            cells_per_block=(1, 1),
            visualize=bool(visualize),
            channel_axis=-1
        )

    if not visualize:
        # With visualize=False hog returns the feature vector alone.
        return result

    features, hog_image = result

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4), sharex=True, sharey=True)

    ax1.axis('off')
    ax1.imshow(image, cmap=plt.cm.gray)
    ax1.set_title('Input image')

    # Stretch the intensities so the gradient strokes are visible.
    hog_image_rescaled = exposure.rescale_intensity(hog_image, in_range=(0, 10))

    ax2.axis('off')
    ax2.imshow(hog_image_rescaled, cmap=plt.cm.gray)
    ax2.set_title('Histogram of Oriented Gradients')
    plt.show()
    return features

In [None]:
# Plot the HOG rendering of the second training image.
make_hog(train_x[1], True)

In [None]:
def load_feature_hog(use_cached=True) -> tuple[np.ndarray, np.ndarray]:
    """
    Get Histogram of Oriented Gradients features for the training and test images.

    Each split is read from the feature cache when available; otherwise it
    is recomputed from train_x / test_x and written back to the cache.

    Parameters:
    - use_cached: When False, features are always recomputed and re-saved

    Returns:
    - tuple (train_features, test_features)
    """
    train, train_hit = caching_utils.attempt_load_feature_from_cache("raw_hog_train.csv")
    if not (train_hit and use_cached):  # Cache miss or forced refresh
        train = np.array([make_hog(sample) for sample in train_x])
        caching_utils.save_feature_to_cache("raw_hog_train.csv", train)

    test, test_hit = caching_utils.attempt_load_feature_from_cache("raw_hog_test.csv")
    if not (test_hit and use_cached):
        test = np.array([make_hog(sample) for sample in test_x])
        caching_utils.save_feature_to_cache("raw_hog_test.csv", test)

    return (train, test)

## Load extracted features from cache

In [None]:
# Color histogram features (read from cache when available).
train_features_hist, test_features_hist = load_feature_hist()

In [None]:
# Local Binary Patterns features (read from cache when available).
train_features_lbp, test_features_lbp = load_feature_lbp()

In [None]:
# Histogram of Oriented Gradients features (read from cache when available).
train_features_hog, test_features_hog = load_feature_hog()

# Convert features to dataframe and apply PCA

In [None]:
def transform_features_to_dataframe(train_features, train_labels, test_features, test_labels,
                                    pca_components=2) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Reduce extracted features with PCA and package them as pandas DataFrames.

    The PCA is fitted on the training features only and then applied to
    both splits. Each DataFrame has columns pca_1..pca_N plus a 'label' column.

    Parameters:
    - train_features: Extracted features for training (numpy arrays)
    - train_labels: Labels for training
    - test_features: Extracted features for testing (numpy arrays)
    - test_labels: Labels for testing
    - pca_components: Number of PCA components to keep (default=2)

    Returns:
    - tuple (train_dataframe, test_dataframe)
    """
    # Fit on the training split only so the test split stays unseen
    pca = PCA(n_components=pca_components).fit(train_features)
    component_columns = [f'pca_{index}' for index in range(1, pca_components + 1)]

    def _project(features, labels):
        # Project one split onto the fitted components and attach its labels
        frame = pd.DataFrame(pca.transform(features), columns=component_columns)
        frame['label'] = labels
        return frame

    train_df = _project(train_features, train_labels)
    test_df = _project(test_features, test_labels)

    return (train_df, test_df)

"""
Example usage:
train, test = transform_features_to_dataframe(x_train, y_train, x_test, y_test)
"""

In [None]:
def make_dataframe(train_features: np.ndarray, train_labels: np.ndarray,
                   test_features: np.ndarray, test_labels: np.ndarray,
                   name: str, pca_components: int=2, use_cached=True) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Get the PCA-reduced train/test DataFrames for a feature set, using the cache when possible.

    The cache entries are keyed by the feature name and the number of PCA
    components. Both splits are rebuilt and re-saved unless both are found
    in the cache and use_cached is True.

    Parameters:
    - train_features: Extracted features for training
    - train_labels: Labels for training
    - test_features: Extracted features for testing
    - test_labels: Labels for testing
    - name: Name of the feature set, used in the cache key
    - pca_components: Number of PCA components to keep (default=2)
    - use_cached: When False, the DataFrames are always rebuilt

    Returns:
    - tuple (train_dataframe, test_dataframe)
    """
    cache_key = f"{name}_pca_{pca_components}"

    train, train_hit = caching_utils.attempt_load_dataframe(f"{cache_key}_train")
    test, test_hit = caching_utils.attempt_load_dataframe(f"{cache_key}_test")

    # Fast path: both splits were cached and the caller accepts cached data
    if train_hit and test_hit and use_cached:
        return (train, test)

    train, test = transform_features_to_dataframe(
        train_features, train_labels, test_features, test_labels,
        pca_components=pca_components,
    )
    caching_utils.save_dataframe_to_cache(f"{cache_key}_train", train)
    caching_utils.save_dataframe_to_cache(f"{cache_key}_test", test)

    return (train, test)

In [None]:
def get_pca_cols(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return the feature columns of a DataFrame, i.e. everything except 'label'.

    Parameters:
    - df: DataFrame containing a 'label' column

    Returns:
    - DataFrame with the 'label' column left out
    """
    feature_columns = df.drop(columns=['label']).columns
    return df[feature_columns]

In [None]:
# Color histogram features reduced to 16 PCA components.
train_df_hist, test_df_hist = make_dataframe(train_features_hist, train_y, test_features_hist, test_y, "hist", pca_components=16)

In [None]:
# Local Binary Patterns features reduced to 2 PCA components.
train_df_lbp, test_df_lbp = make_dataframe(train_features_lbp, train_y, test_features_lbp, test_y, "lbp", pca_components=2)

In [None]:
# HOG features reduced to 27 PCA components.
train_df_hog, test_df_hog = make_dataframe(train_features_hog, train_y, test_features_hog, test_y, "hog", pca_components=27)

# Classification

## Decision Tree
Jason

In [None]:
def train_decision_tree(x_train_tree, y_train_tree, x_test_tree, y_test_tree, max_depth=5, 
                   min_samples_split=2, criterion='gini', show_tree=True, feature_names=None):
    """
    Train a decision tree classifier on any type of features and report its performance.

    Prints the accuracy and classification report, plots the confusion
    matrix and, optionally, the fitted tree.

    Parameters:
    - x_train_tree: Training features
    - y_train_tree: Training labels (0=SPOILED, 1=HALF, 2=FRESH)
    - x_test_tree: Test features
    - y_test_tree: Test labels
    - max_depth: Maximum depth of the decision tree
    - min_samples_split: The minimum number of samples required to split an internal node
    - criterion: The function to measure the quality of a split ('gini' or 'entropy')
    - show_tree: Whether to visualize the decision tree
    - feature_names: Names of features (will be auto-generated if None)

    Returns:
    - dt_classifier: Trained decision tree classifier
    - accuracy: Classification accuracy on test set
    - report: Classification report
    - conf_matrix: Confusion matrix (always 3x3, rows/columns in class order)
    """

    # Create a list of class names
    class_names = ['SPOILED', 'HALF', 'FRESH']
    # Integer label of each class name, in the same order. Passing these
    # explicitly keeps the report and matrix well-formed even when a class
    # is missing from the test labels and predictions.
    class_labels = list(range(len(class_names)))

    # Create and train a Decision Tree classifier
    dt_classifier = DecisionTreeClassifier(max_depth=max_depth,
                                           min_samples_split=min_samples_split,
                                           criterion=criterion,
                                           random_state=42)
    dt_classifier.fit(x_train_tree, y_train_tree)

    # Make predictions
    predictions = dt_classifier.predict(x_test_tree)

    # Evaluate the model
    accuracy = accuracy_score(y_test_tree, predictions)
    report = classification_report(y_test_tree, predictions,
                                   labels=class_labels, target_names=class_names)
    conf_matrix = confusion_matrix(y_test_tree, predictions, labels=class_labels)

    # Print results
    print(f"Decision Tree Accuracy: {accuracy:.4f}")
    print("Criterion:", criterion)
    print("Min Samples Split:", min_samples_split)
    print("\nClassification Report:")
    print(report)

    # Display confusion matrix
    plt.figure(figsize=(8, 6))
    plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)

    # Add text annotations in the confusion matrix; use white text on dark
    # cells so the counts stay readable.
    thresh = conf_matrix.max() / 2
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            plt.text(j, i, format(conf_matrix[i, j], 'd'),
                     horizontalalignment="center",
                     color="white" if conf_matrix[i, j] > thresh else "black")

    # Set the axis labels before tight_layout so they are included in the layout.
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()

    # Show decision tree if requested
    if show_tree:
        # Create feature names if not provided
        if feature_names is None:
            # np.shape also works when the features are given as nested lists.
            feature_names = [f"Feature_{i}" for i in range(np.shape(x_train_tree)[1])]

        plt.figure(figsize=(15, 10))
        plot_tree(dt_classifier,
                  feature_names=feature_names,
                  class_names=class_names,
                  filled=True,
                  rounded=True,
                  fontsize=8)
        plt.title(f"Decision Tree (criterion={criterion}, min_samples_split={min_samples_split})")
        plt.tight_layout()
        plt.show()

    return dt_classifier, accuracy, report, conf_matrix

## Decision Tree Tests

In [None]:
def run_decision_tree_on_pca_dataframes(train_dfs, test_dfs, train_labels, test_labels, 
                              feature_names=None, max_depth=3, 
                              min_samples_split=2, criterion='gini'):
    """
    Train and evaluate one decision tree per PCA-transformed feature set.

    Parameters:
    - train_dfs: Dictionary of training dataframes {'name': dataframe}
    - test_dfs: Dictionary of testing dataframes {'name': dataframe}
    - train_labels: Training labels
    - test_labels: Testing labels
    - feature_names: Optional dictionary of feature names for each dataframe
    - max_depth: Maximum depth for decision tree
    - min_samples_split: The minimum number of samples required to split an internal node
    - criterion: The function to measure the quality of a split ('gini' or 'entropy')

    Returns:
    - results: Dictionary keyed by feature set name; each entry holds the
      trained 'model', its 'accuracy', 'report' and 'confusion_matrix'
    """
    results = {}

    for name, train_df in train_dfs.items():
        print(f"\n--- Decision Tree Classification for {name} features ---")
        print(f"Parameters: max_depth={max_depth}, min_samples_split={min_samples_split}, criterion={criterion}")

        test_df = test_dfs[name]

        # Strip the label column (if the training frame has one) from both splits
        if 'label' in train_df.columns:
            train_part = train_df.drop('label', axis=1)
            test_part = test_df.drop('label', axis=1)
        else:
            train_part, test_part = train_df, test_df

        # Caller-supplied feature names win; otherwise fall back to the column names
        if feature_names and name in feature_names:
            names = feature_names[name]
        else:
            names = train_part.columns.tolist()

        model, accuracy, report, conf_matrix = train_decision_tree(
            x_train_tree=train_part.values,
            y_train_tree=train_labels,
            x_test_tree=test_part.values,
            y_test_tree=test_labels,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            criterion=criterion,
            show_tree=True,
            feature_names=names
        )

        results[name] = dict(
            model=model,
            accuracy=accuracy,
            report=report,
            confusion_matrix=conf_matrix,
        )

    # Side-by-side accuracy summary
    print("\n--- Comparison of Decision Tree Results ---")
    for name, summary in results.items():
        print(f"{name}: Accuracy = {summary['accuracy']:.4f}")

    return results

In [None]:
def plot_parameter_performance(parameter_values, accuracies, parameter_name, criteria=None):
    """
    Plot test accuracy against the values of one decision tree parameter.

    Parameters:
    - parameter_values: List of parameter values tested
    - accuracies: Accuracy for each parameter value; when criteria is given,
      a list with one such list per criterion
    - parameter_name: Name of the parameter being tested
    - criteria: Optional list of criterion names if testing both gini and entropy
    """
    plt.figure(figsize=(10, 6))

    if not criteria:
        # Single curve, no legend needed
        plt.plot(parameter_values, accuracies, marker='o')
    else:
        # One labelled curve per criterion
        for index, criterion in enumerate(criteria):
            series = accuracies[index]
            plt.plot(parameter_values, series, marker='o', label=f'Criterion: {criterion}')
        plt.legend()

    plt.xlabel(parameter_name)
    plt.ylabel('Test Accuracy')
    plt.title(f'Decision Tree Performance vs {parameter_name}')
    plt.grid(True, alpha=0.3)
    plt.show()

## Random Forest
Aiden

In [None]:
def train_random_forest(x_train_forest, y_train_forest, x_test_forest, y_test_forest, n_estimators=100, criterion='gini', max_depth=None,
                        min_samples_split=2, min_samples_leaf=1, max_features='sqrt'):
    """
    Fit a Random Forest classifier and score it on the test data.

    Precision, recall and F1 are macro-averaged. The forest is seeded with
    random_state=42.

    Parameters:
    - x_train_forest: Training data
    - y_train_forest: Training labels
    - x_test_forest: Test data
    - y_test_forest: Test labels
    - n_estimators: The number of trees in the forest
    - criterion: The function to measure the quality of a split
    - max_depth: Maximum depth of the tree
    - min_samples_split: The minimum number of samples required to split an internal node
    - min_samples_leaf: The minimum number of samples required to be at a leaf node
    - max_features: The number of features to consider when looking for the best split

    Returns:
    - accuracy: The accuracy score for the model's predictions
    - precision: The precision score for the model's predictions
    - recall: The recall score for the model's predictions
    - f1: The f1 score for the model's predictions
    - confusion: The confusion matrix for the model's predictions
    """
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
    )
    forest.fit(x_train_forest, y_train_forest)

    predicted = forest.predict(x_test_forest)

    # Every class counts equally in the averaged metrics
    macro = {'average': 'macro'}
    return (
        accuracy_score(y_test_forest, predicted),
        precision_score(y_test_forest, predicted, **macro),
        recall_score(y_test_forest, predicted, **macro),
        f1_score(y_test_forest, predicted, **macro),
        confusion_matrix(y_test_forest, predicted),
    )

## Random Forest Tests

In [None]:
def test_rf(rf_train_x, rf_train_y, rf_test_x, rf_test_y):
    """
    Grid-search the Random Forest hyperparameters and collect the metrics of every run.

    Parameters:
    - rf_train_x: Training data
    - rf_train_y: Training labels
    - rf_test_x: Test data
    - rf_test_y: Test labels

    Returns:
    - pd.DataFrame(all_tests): One row per combination with its hyperparameter values and metrics
    - confusion_matrices: List of confusion matrices, in the same order as the rows
    """
    # Hyperparameter grid
    grid_n_estimators = [20,100,500]
    grid_criterion = ['gini', 'entropy', 'log_loss']
    grid_max_depth = [10, 20, 100]
    grid_min_samples_split = [2]
    grid_min_samples_leaf = [1]
    grid_max_features = ['sqrt','log2',None]

    # Column name -> values collected so far; key order defines the column order
    column_names = ['n_estimators', 'criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf',
                    'max_features', 'accuracy', 'precision', 'recall', 'f1']
    all_tests = {column: [] for column in column_names}
    confusion_matrices = []

    for n_estimators in grid_n_estimators:
        for criterion in grid_criterion:
            for max_depth in grid_max_depth:
                for min_samples_split in grid_min_samples_split:
                    for min_samples_leaf in grid_min_samples_leaf:
                        for max_features in grid_max_features:
                            settings = (n_estimators, criterion, max_depth, min_samples_split,
                                        min_samples_leaf, max_features)
                            *scores, confusion = train_random_forest(
                                rf_train_x, rf_train_y, rf_test_x, rf_test_y, *settings)

                            # Settings followed by scores line up with the column order
                            for column, value in zip(all_tests, (*settings, *scores)):
                                all_tests[column].append(value)
                            confusion_matrices.append(confusion)

    return pd.DataFrame(all_tests), confusion_matrices

In [None]:
# Color histogram features reduced to 16 PCA components (Random Forest input).
train_df_hist, test_df_hist = make_dataframe(train_features_hist, train_y, test_features_hist, test_y, "hist", pca_components=16)

In [None]:
# Local Binary Patterns features reduced to 50 PCA components (Random Forest input).
train_df_lbp, test_df_lbp = make_dataframe(train_features_lbp, train_y, test_features_lbp, test_y, "lbp", pca_components=50)

In [None]:
# HOG features reduced to 27 PCA components (Random Forest input).
train_df_hog, test_df_hog = make_dataframe(train_features_hog, train_y, test_features_hog, test_y, "hog", pca_components=27)

In [None]:
# Split every dataframe into features and labels, keyed by feature set name
# ('hist', 'lbp', 'hog').
train_x_rf = {}
test_x_rf = {}
train_y_rf = {}
test_y_rf = {}
# Each entry is (dataframe, split name, feature set name).
for data in [(train_df_hist, 'train', 'hist'), (test_df_hist, 'test', 'hist'), (train_df_lbp, 'train', 'lbp'), (test_df_lbp, 'test', 'lbp'),
           (train_df_hog, 'train', 'hog'), (test_df_hog, 'test', 'hog')]:
    if data[1] == 'train':
        train_x_rf[data[2]] = data[0].drop('label', axis=1)
        train_y_rf[data[2]] = data[0]['label']
    else:
        test_x_rf[data[2]] = data[0].drop('label', axis=1)
        test_y_rf[data[2]] = data[0]['label']

In [None]:
# Run the Random Forest hyperparameter tests on every feature set.
# results[<name>] holds the metrics dataframe and results[<name>_conf] the
# matching list of confusion matrices.
results = {}
for df in ['hist','lbp','hog']:  # df is the feature set name here, not a dataframe
    result_frame, result_conf = test_rf(train_x_rf[df], train_y_rf[df], test_x_rf[df], test_y_rf[df])
    results[df] = result_frame
    results[f'{df}_conf'] = result_conf

In [None]:
# to_string() prints every row of the HOG results instead of a truncated view.
print(results['hog'].to_string())

In [None]:
# Given the loop order in test_rf, index 57 is the run with n_estimators=500,
# criterion='gini', max_depth=20, max_features='sqrt'.
# NOTE(review): presumably picked as the best HOG run — confirm against the table above.
matrix_results = results['hog_conf'][57]

In [None]:
# Plot the selected confusion matrix with the class names as tick labels
# (listed in label order: 0=SPOILED, 1=HALF, 2=FRESH).
confusion_matrix_display = ConfusionMatrixDisplay(confusion_matrix = matrix_results, display_labels = ['SPOILED', 'HALF', 'FRESH'])

confusion_matrix_display.plot()
plt.show()

## Knn
Fiona

In [None]:
# I Liked Jason's implementation of the docstring and report visualization in train_decision_tree
# so I modified my existing implementation to work similarly.
def train_knn(x_train: pd.DataFrame, y_train: pd.DataFrame, x_test: pd.DataFrame, y_test: pd.DataFrame,
              n_neighbors=5, weights='uniform', p=2, report=False) -> tuple[
                  KNeighborsClassifier, float, float, float, float,
                  np.ndarray]:
    """
    Fit a Knn classifier and score it on the test data.
    Precision, recall and F1-score are macro-averaged.

    Parameters:
    - x_train: Training features.
    - y_train: Training labels.
    - x_test: Testing features.
    - y_test: Testing labels.

    - n_neighbors: int - Number of neighbors to use, default 5.
    - weights: 'uniform' | 'distance' - Weight function to use, default 'uniform'.
    - p: int - Power parameter, default 2.

    - report: bool - Print the metrics and plot the confusion matrix, default False.

    Returns:
    - knn: Trained Knn classifier.
    - accuracy: Classification accuracy.
    - precision: Classifier precision.
    - recall: Model recall.
    - f1-score: Model F1-score.
    - conf_matrix: Confusion matrix.
    """
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, p=p)
    classifier.fit(x_train, y_train)

    predicted = classifier.predict(x_test)

    accuracy = accuracy_score(y_test, predicted)
    macro = {'average': 'macro'}
    precision = precision_score(y_test, predicted, **macro)
    recall = recall_score(y_test, predicted, **macro)
    f1 = f1_score(y_test, predicted, **macro)

    conf_matrix = confusion_matrix(y_test, predicted)

    # Nothing to print or plot: hand back the results straight away
    if not report:
        return (classifier, accuracy, precision, recall, f1, conf_matrix)

    class_names = ['SPOILED', 'HALF', 'FRESH']

    print("Knn Classifier performance:")
    print(f"N-neighbors: {n_neighbors}")
    print(f"Weight type: {weights}")
    print(f"p: {p}\n")

    print(f"Accuracy {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

    plt.figure(figsize=(8, 6))
    plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Purples)
    plt.title('Confusion Matrix (Knn)')
    plt.colorbar()
    tick_positions = np.arange(len(class_names))
    plt.xticks(tick_positions, class_names, rotation=45)
    plt.yticks(tick_positions, class_names)

    # Write the count into every cell; dark cells get white text
    cutoff = conf_matrix.max() / 2
    for (row, col), count in np.ndenumerate(conf_matrix):
        text_color = "white" if count > cutoff else "black"
        plt.text(col, row, format(count, 'd'),
                 horizontalalignment="center",
                 color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

    return (classifier, accuracy, precision, recall, f1, conf_matrix)

## Knn Tests

In [None]:
def test_knn(x_train: pd.DataFrame, y_train: pd.DataFrame,
             x_test: pd.DataFrame, y_test: pd.DataFrame, *,
             all_n_neighbors=(5, 10, 25, 50, 100, 250),
             all_weights=('uniform', 'distance'),
             all_p=(1, 2, 3, 5, 7, 10)) -> tuple[
                 pd.DataFrame,
                 list[np.ndarray]
             ]:
    """
    Test different permutations of hyperparameters for the Knn classifier.

    Every combination of `all_n_neighbors` x `all_weights` x `all_p` is
    trained and evaluated once with train_knn (reporting disabled).

    Parameters:
    - x_train: Training features.
    - y_train: Training labels.
    - x_test: Testing features.
    - y_test: Testing labels.
    - all_n_neighbors: Iterable of n_neighbors values to try (keyword-only).
    - all_weights: Iterable of weight types to try (keyword-only).
    - all_p: Iterable of power parameters to try (keyword-only).

    Returns:
    - all_tests: Dataframe containing all hyperparameter values and their
      corresponding performance metrics, one row per combination.
    - confusion_matrices: List of confusion matrices for all tests. Entry i
      belongs to row i of the dataframe.
    """
    # Materialise the inner grids so that one-shot iterables (e.g. generators)
    # survive being iterated once per outer-loop value.
    all_weights = list(all_weights)
    all_p = list(all_p)

    # Column-wise storage: every list grows by one entry per tested combination.
    all_tests = {'n_neighbors': [], 'weight': [], 'p': [],
                 'accuracy': [], 'precision': [], 'recall': [], 'f1': []}
    confusion_matrices = []

    for n_neighbors in all_n_neighbors:
        for weight in all_weights:
            for p in all_p:
                # The fitted classifier itself is not needed here, only its scores.
                _, accuracy, precision, recall, f1, conf_matrix = train_knn(
                    x_train, y_train, x_test, y_test,
                    n_neighbors=n_neighbors, weights=weight, p=p
                )

                all_tests['n_neighbors'].append(n_neighbors)
                all_tests['weight'].append(weight)
                all_tests['p'].append(p)
                all_tests['accuracy'].append(accuracy)
                all_tests['precision'].append(precision)
                all_tests['recall'].append(recall)
                all_tests['f1'].append(f1)
                confusion_matrices.append(conf_matrix)

    return (pd.DataFrame.from_dict(all_tests), confusion_matrices)

# Neural Network
Aiden

In [None]:
def train_neural_network(x_train_mlp, y_train_mlp, x_test_mlp, y_test_mlp, hidden_layer_sizes, max_iter=100, activation='relu'):
    """
    Fit an MLP classifier on the training split and score it on the test split.

    Parameters:
    - x_train_mlp: Training data
    - y_train_mlp: Training labels
    - x_test_mlp: Test data
    - y_test_mlp: Test labels
    - hidden_layer_sizes: Tuple with one entry per hidden layer giving that layer's size
    - max_iter: Value passed to MLPClassifier as max_iter, default 100
    - activation: Activation function passed to MLPClassifier, default 'relu'

    Returns:
    - accuracy: The accuracy score for the model's predictions
    - precision: The macro-averaged precision score for the model's predictions
    - recall: The macro-averaged recall score for the model's predictions
    - f1: The macro-averaged f1 score for the model's predictions
    - confusion: The confusion matrix for the model's predictions
    """
    classifier = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        max_iter=max_iter,
        activation=activation,
        random_state=42,  # fixed seed, same for every call
    )
    classifier.fit(x_train_mlp, y_train_mlp)

    # Score the fitted model on the held-out data.
    predictions = classifier.predict(x_test_mlp)

    accuracy = accuracy_score(y_test_mlp, predictions)
    # The three class-wise metrics all use the same macro averaging.
    precision, recall, f1 = (
        score(y_test_mlp, predictions, average='macro')
        for score in (precision_score, recall_score, f1_score)
    )
    confusion = confusion_matrix(y_test_mlp, predictions)

    return accuracy, precision, recall, f1, confusion

## Neural Network Tests

In [None]:
def test_nn(nn_train_x, nn_train_y, nn_test_x, nn_test_y):
    """
    Evaluate the MLP classifier on every combination of a fixed hyperparameter grid.

    Parameters:
    - nn_train_x: Training data
    - nn_train_y: Training labels
    - nn_test_x: Test data
    - nn_test_y: Test labels

    Returns:
    - pd.DataFrame(all_tests): Dataframe with one row per test holding the hyperparameter values and the resulting metrics
    - confusion_matrices: List of confusion matrices, in the same order as the dataframe rows
    """
    # Hyperparameter tuning values
    all_hidden_layer_sizes = [(200,100), (200,50,20)]
    all_max_iter = [10000]
    all_activation = ['relu', 'logistic']

    # Flatten the grid; hidden layer sizes vary slowest, activation fastest.
    grid = [
        (layers, iterations, activation_name)
        for layers in all_hidden_layer_sizes
        for iterations in all_max_iter
        for activation_name in all_activation
    ]

    all_tests = {'hidden_layer_sizes': [], 'max_iter': [], 'activation': [],
                 'accuracy': [], 'precision': [], 'recall': [], 'f1': []}
    confusion_matrices = []

    for layers, iterations, activation_name in grid:
        accuracy, precision, recall, f1, confusion = train_neural_network(
            nn_train_x, nn_train_y, nn_test_x, nn_test_y,
            layers, iterations, activation_name,
        )

        # Record the settings next to the scores they produced.
        all_tests['hidden_layer_sizes'].append(layers)
        all_tests['max_iter'].append(iterations)
        all_tests['activation'].append(activation_name)
        all_tests['accuracy'].append(accuracy)
        all_tests['precision'].append(precision)
        all_tests['recall'].append(recall)
        all_tests['f1'].append(f1)
        confusion_matrices.append(confusion)

    return pd.DataFrame(all_tests), confusion_matrices

In [None]:
# Split every feature dataframe into its feature columns (everything except
# 'label') and its label column, stored per split and keyed by feature type.
train_x_nn, train_y_nn = {}, {}
test_x_nn, test_y_nn = {}, {}

for data in [
    (train_df_hist, 'train', 'hist'), (test_df_hist, 'test', 'hist'),
    (train_df_lbp, 'train', 'lbp'), (test_df_lbp, 'test', 'lbp'),
    (train_df_hog, 'train', 'hog'), (test_df_hog, 'test', 'hog'),
]:
    nn_source_frame, nn_split_name, nn_feature_key = data
    nn_features = nn_source_frame.drop('label', axis=1)
    nn_labels = nn_source_frame['label']

    if nn_split_name == 'train':
        train_x_nn[nn_feature_key] = nn_features
        train_y_nn[nn_feature_key] = nn_labels
    else:
        test_x_nn[nn_feature_key] = nn_features
        test_y_nn[nn_feature_key] = nn_labels

In [None]:
# Run the MLP hyperparameter tests on each feature type. The result table is
# stored under the feature key and its confusion matrices under '<key>_conf'.
results_nn = {}
for df in ('hist', 'lbp', 'hog'):
    result_frame, result_conf = test_nn(
        train_x_nn[df], train_y_nn[df],
        test_x_nn[df], test_y_nn[df],
    )
    results_nn.update({df: result_frame, f'{df}_conf': result_conf})

In [None]:
# Print the MLP result table for the HOG features as plain text.
print(results_nn['hog'].to_string())

In [None]:
# Select the confusion matrix of the fourth HOG test (index 3). With the grid
# order used in test_nn this is hidden_layer_sizes=(200, 50, 20) with the
# 'logistic' activation; update the index if that grid changes.
matrix_results = results_nn['hog_conf'][3]

In [None]:
# Draw the selected confusion matrix with the class names on both axes.
confusion_matrix_display = ConfusionMatrixDisplay(
    confusion_matrix=matrix_results,
    display_labels=['SPOILED', 'HALF', 'FRESH'],
)
confusion_matrix_display.plot()
plt.show()

# Parameter Tuning

## Parameter Tuning for Decision Tree

In [None]:
# Feature dataframes to evaluate, keyed by the feature-extraction method.
train_dfs = {
    'Histogram': train_df_hist,
    'LBP': train_df_lbp,
    'HOG': train_df_hog
}

test_dfs = {
    'Histogram': test_df_hist,
    'LBP': test_df_lbp,
    'HOG': test_df_hog
}

# Define feature names for better visualization
pca_components = 16
feature_names = {
    'Histogram': [f'Histogram PCA {i+1}' for i in range(pca_components)],
    'LBP': [f'LBP PCA {i+1}' for i in range(pca_components)],
    'HOG': [f'HOG PCA {i+1}' for i in range(pca_components)]
}

# Candidate min_samples_split values, tested once per split criterion.
min_samples_splits_to_test = [2, 15, 50]
gini_accuracies = []
entropy_accuracies = []

# One shared loop for both criteria; each criterion appends to its own list,
# so gini_accuracies / entropy_accuracies line up with min_samples_splits_to_test.
accuracies_by_criterion = {
    'gini': gini_accuracies,
    'entropy': entropy_accuracies,
}

for criterion, criterion_accuracies in accuracies_by_criterion.items():
    print(f"\n--- Testing different min_samples_split values with {criterion.capitalize()} criterion ---")
    for min_samples in min_samples_splits_to_test:
        print(f"\nTesting min_samples_split = {min_samples} with criterion = '{criterion}'")
        results = run_decision_tree_on_pca_dataframes(
            train_dfs=train_dfs,
            test_dfs=test_dfs,
            train_labels=train_y,
            test_labels=test_y,
            feature_names=feature_names,
            max_depth=3,
            min_samples_split=min_samples,
            criterion=criterion
        )
        # Store average accuracy across all feature types
        avg_accuracy = sum(result['accuracy'] for result in results.values()) / len(results)
        criterion_accuracies.append(avg_accuracy)

# Visualize the parameter tuning results
plot_parameter_performance(
    min_samples_splits_to_test,
    [gini_accuracies, entropy_accuracies],
    "Min Samples Split",
    criteria=['gini', 'entropy']
)

# Find best parameters (the first maximum wins on ties)
max_gini_idx = gini_accuracies.index(max(gini_accuracies))
max_entropy_idx = entropy_accuracies.index(max(entropy_accuracies))
best_gini = min_samples_splits_to_test[max_gini_idx]
best_entropy = min_samples_splits_to_test[max_entropy_idx]

print("\n--- Best parameters found ---")
print(f"Best min_samples_split for gini: {best_gini} (accuracy: {max(gini_accuracies):.4f})")
print(f"Best min_samples_split for entropy: {best_entropy} (accuracy: {max(entropy_accuracies):.4f})")

# Final run with best parameters; gini is preferred when both criteria tie.
print("\n--- Final run with best parameters ---")
if max(gini_accuracies) >= max(entropy_accuracies):
    best_criterion = 'gini'
    best_min_samples = best_gini
else:
    best_criterion = 'entropy'
    best_min_samples = best_entropy

# Actually perform the announced final run with the selected settings.
print(f"Using criterion = '{best_criterion}' with min_samples_split = {best_min_samples}")
final_results = run_decision_tree_on_pca_dataframes(
    train_dfs=train_dfs,
    test_dfs=test_dfs,
    train_labels=train_y,
    test_labels=test_y,
    feature_names=feature_names,
    max_depth=3,
    min_samples_split=best_min_samples,
    criterion=best_criterion
)

## Parameter Tuning for Knn

In [None]:
def tune_knn(metric='accuracy'):
    """
    Run the Knn hyperparameter tests on every feature set and show the best runs.

    test_knn is executed for the color histogram, LBP and HOG dataframes.
    For each feature set the three best rows ranked by `metric` are displayed,
    followed by one confusion matrix plot per displayed row.

    Parameters:
    - metric: Column of the test_knn results used for ranking, default 'accuracy'.
    """
    class_names = ['SPOILED', 'HALF', 'FRESH']

    def find_best(tests: pd.DataFrame, matrices: np.ndarray, title: str, n=3):
        # Rank all runs by the chosen metric and keep the n best.
        best_runs = tests.sort_values(metric, ascending=False).head(n)
        display(Markdown(f"## {title}"))
        display(best_runs)

        tick_positions = np.arange(len(class_names))
        for run_id, params in best_runs.iterrows():
            # The row label is the position of the run in the matrices list.
            matrix = matrices[run_id]

            # Visualize confusion matrix.
            plt.figure(figsize=(8, 6))
            plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Purples)
            plt.suptitle(f"Knn Confusion Matrix ({title})")
            plt.title(f"n_neighbors={params['n_neighbors']}  weight={params['weight']}  p={params['p']}", fontsize='medium')
            plt.colorbar()
            plt.xticks(tick_positions, class_names, rotation=45)
            plt.yticks(tick_positions, class_names)

            # Use white text on cells above half the largest count, black otherwise.
            cutoff = matrix.max() / 2
            for (true_idx, pred_idx), count in np.ndenumerate(matrix):
                text_color = "white" if count > cutoff else "black"
                plt.text(pred_idx, true_idx, format(count, 'd'),
                         horizontalalignment="center",
                         color=text_color)

            plt.tight_layout()
            plt.ylabel('True label')
            plt.xlabel('Predicted label')
            plt.show()

    feature_sets = [
        ("Color Histograms", train_df_hist, test_df_hist),
        ("Local Binary Patterns", train_df_lbp, test_df_lbp),
        ("Histograms of Oriented Gradients", train_df_hog, test_df_hog),
    ]

    # Finish every grid search before anything is displayed.
    searches = [
        (heading, *test_knn(get_pca_cols(train_frame), train_frame['label'],
                            get_pca_cols(test_frame), test_frame['label']))
        for heading, train_frame, test_frame in feature_sets
    ]

    for heading, tests, matrices in searches:
        find_best(tests, matrices, heading, 3)

In [None]:
# Run the Knn tuning and show, for each feature set, the runs ranked best by accuracy.
tune_knn(metric='accuracy')