### Here we will create patches from the annotated coco data 
1) Load in the coco annotations and labels
2) load in the images that are labeled
3) create patches from the images using the coco labels and save to a new file
4) Create a new csv of the images, labels

# Import Necessary Libraries and Load Data

In [None]:
# Load in the coco annotations
import cv2
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd

# Image analysis
from scipy.stats import skew
from scipy.stats import kurtosis

# Using scikit-image to compute the HOG features
from skimage.io import imread
from skimage.transform import resize
from skimage.feature import hog

# Use SIFT to extract features from the images
# Use K means clustering to cluster the images into 3 or 7 clusters
from sklearn.cluster import KMeans

################################### Machine Learning ###################################

# Use a SVM classifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import scipy.stats as stats

# Save the model
import joblib

# Train a decision tree classifier
from sklearn.tree import DecisionTreeClassifier
# Print the decision tree
from sklearn import tree

# Computing the kappa score 
from sklearn.metrics import cohen_kappa_score

# Calculate the AUROC 
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

# Create a confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Define refit strategy
from sklearn.metrics import make_scorer, precision_score, recall_score

In [None]:
################################### State Your Paths ###################################

# Directory holding the image patches; the file names from the 'patch_name'
# column of the split CSVs are appended directly to this string.
patches_image_directory = "your_path_here/All_Patches_resized/"
# Directory with the train/dev split CSVs; the feature CSVs and the SIFT
# cluster centers are also written to and read from here.
split_data_folder = "your_path_here/Train_test_develop_split_combined_background/"
# Directory the prototypical (average) capsid images are loaded from.
model_checkpoints = "your_path_here/Model_Checkpoints/"
# NOTE: paths are joined by plain string concatenation below, so each one must
# keep its trailing "/".

In [None]:
################################# Load in the Real Data #################################

def _load_patches(patch_names, image_directory, size=(224, 224)):
    """Read every named patch from ``image_directory`` and resize it to ``size``.

    cv2.imread reports a missing or unreadable file by returning None instead
    of raising, which would otherwise only surface as an opaque cv2.resize
    error. The offending path is reported explicitly here.

    Raises:
        FileNotFoundError: if a patch cannot be read.
    """
    patches = []
    for patch_name in patch_names:
        patch_path = f"{image_directory}{patch_name}"
        patch = cv2.imread(patch_path)
        if patch is None:
            raise FileNotFoundError(f"Could not read image patch: {patch_path}")
        patches.append(cv2.resize(patch, size))
    return patches


train_names_labels_df = pd.read_csv(f"{split_data_folder}tra_val_names_labels_df.csv")
test_names_labels_df = pd.read_csv(f"{split_data_folder}dev_names_labels_df.csv")

# Get the lists of file names and labels, then load the images resized to 224x224
train_names = train_names_labels_df['patch_name'].tolist()
train_labels = train_names_labels_df['label'].tolist()
train_images = _load_patches(train_names, patches_image_directory)

# Same for the testing data (read from the dev split CSV)
test_names = test_names_labels_df['patch_name'].tolist()
test_labels = test_names_labels_df['label'].tolist()
test_images = _load_patches(test_names, patches_image_directory)

In [None]:
# Show the distinct label values that occur in the training labels
print(np.unique(train_labels))

In [None]:
# Print the shape of one test image (index 14) as a sanity check on the resize.
# Raises IndexError if there are fewer than 15 test images.
print(f"test images shape: {test_images[14].shape}")

In [None]:
# Display the number of patches in the training and testing data
print(f"There are {len(train_labels)} training patches.")
print(f"There are {len(test_labels)} testing patches.")

# Data Visualization

In [None]:
# Define a function to plot the images
def plot_images(images, labels, num_images, capsid_type):
    """Show up to ``num_images`` patches of one class on a two-row grid.

    Args:
        images: Sequence of image arrays accepted by ``plt.imshow``.
        labels: Sequence of class labels, parallel to ``images``.
        num_images: Maximum number of patches to draw.
        capsid_type: Class label to display (1-6). Any other value selects
            label 7, which keeps the original fall-through behaviour.
    """
    # Labels 1-6 are matched directly; everything else falls back to class 7.
    wanted_label = capsid_type if capsid_type in (1, 2, 3, 4, 5, 6) else 7
    indices = [i for i, label in enumerate(labels) if label == wanted_label]

    plt.figure(figsize=(20, 1.5))

    # Ceil division keeps every subplot position inside the 2 x n_cols grid.
    # Floor division overflowed the grid for odd counts and produced zero
    # columns for a single image. Even counts give the same layout as before.
    n_cols = max(1, -(-num_images // 2))

    for position, index in enumerate(indices[:max(num_images, 0)], start=1):
        plt.subplot(2, n_cols, position)  # Two rows of images
        plt.grid(False)
        # Remove the axis ticks
        plt.xticks([])
        plt.yticks([])
        plt.imshow(images[index], cmap="gray")

    plt.show()

# Plot up to 50 full capsids (label 1)
plot_images(train_images, train_labels, 50, 1)
# Plot up to 50 partial capsids (label 2)
plot_images(train_images, train_labels, 50, 2)
# Plot up to 50 empty capsids (label 3)
plot_images(train_images, train_labels, 50, 3)

# Plot up to 50 aggregation patches (label 4)
plot_images(train_images, train_labels, 50, 4)
# Plot up to 50 ice patches (label 5)
plot_images(train_images, train_labels, 50, 5)
# Plot up to 50 broken capsids (label 6)
plot_images(train_images, train_labels, 50, 6)
# Plot up to 50 background annotations (label 7)
plot_images(train_images, train_labels, 50, 7)


In [None]:
# Split the training images into one list per class label
# (1 = full, 2 = partial, 3 = empty, 4 = aggregation, 5 = ice, 6 = broken, 7 = background)
full_images = [image for image, label in zip(train_images, train_labels) if label == 1]
partial_images = [image for image, label in zip(train_images, train_labels) if label == 2]
empty_images = [image for image, label in zip(train_images, train_labels) if label == 3]

aggregation_images = [image for image, label in zip(train_images, train_labels) if label == 4]
ice_images = [image for image, label in zip(train_images, train_labels) if label == 5]
broken_images = [image for image, label in zip(train_images, train_labels) if label == 6]
background_images = [image for image, label in zip(train_images, train_labels) if label == 7]

In [None]:
# Count the training images of every class, report the counts, and chart them

# Category names in label order (1-7), used as the bar labels
labels = ['Full', 'Partial', 'Empty', 'Aggregation', 'Ice', 'Broken', 'Background']

# Number of images in each category, in the same order as `labels`
class_image_lists = [full_images, partial_images, empty_images,
                     aggregation_images, ice_images, broken_images, background_images]
num_images = [len(class_images) for class_images in class_image_lists]

# Print the length of each list
class_descriptions = ['full capsid', 'partial capsid', 'empty capsid',
                      'aggregation', 'ice', 'broken', 'background']
for class_description, class_count in zip(class_descriptions, num_images):
    print(f"There are {class_count} {class_description} images.")

# Create a bar chart of the number of images in each category.
# NOTE(review): only three colours are supplied for seven bars; presumably
# matplotlib recycles them, so later bars repeat the capsid colours - confirm.
plt.bar(labels, num_images, color=['lightgreen', 'seagreen', 'skyblue'])
plt.title('Number of Images in Each Category')
plt.xlabel('Category')
plt.ylabel('Number of Images')
plt.show()


In [None]:
# Print how many instances of each class are in the testing and rw testing data
print(f"There are {(test_labels).count(1)} full capsid images in the testing data.")
print(f"There are {(test_labels).count(2)} partial capsid images in the testing data.")
print(f"There are {(test_labels).count(3)} empty capsid images in the testing data.")

print(f"There are {(test_labels).count(4)} aggregation images in the testing data.")
print(f"There are {(test_labels).count(5)} ice images in the testing data.")
print(f"There are {(test_labels).count(6)} broken images in the testing data.")
print(f"There are {(test_labels).count(7)} background images in the testing data.")

In [None]:
# Here we are creating a plot that shows the Standard Deviation vs the Mean of each patch.
# Every patch contributes one point: x = mean intensity, y = intensity standard deviation.
################################ capsids ####################################

# convert the capsid image lists to numpy arrays (the debris lists below stay plain lists)
empty_images = np.array(empty_images)
full_images = np.array(full_images)
partial_images = np.array(partial_images)

# Take the mean and standard deviation of each image
# (naming: first letter(s) = class, m = mean, s = standard deviation)
em = [x.mean() for x in empty_images]
es = [x.std() for x in empty_images]

fm = [x.mean() for x in full_images]
fs = [x.std() for x in full_images]

pm = [x.mean() for x in partial_images]
ps = [x.std() for x in partial_images]

################################ Debris ###################################

# trailing underscore because `as` and `is` are Python keywords
am = [x.mean() for x in aggregation_images]
as_ = [x.std() for x in aggregation_images]

im = [x.mean() for x in ice_images]
is_ = [x.std() for x in ice_images]

bm = [x.mean() for x in broken_images]
bs = [x.std() for x in broken_images]

bam = [x.mean() for x in background_images]
bas = [x.std() for x in background_images]

###########################################################################

# plot the mean against the standard deviation, one colour per class
plt.plot(em, es, 'o', color='black', alpha=0.5)
plt.plot(fm, fs, 'o', color='red', alpha=0.5)
plt.plot(pm, ps, 'o', color='blue', alpha=0.5)

plt.plot(am, as_, 'o', color='green', alpha=0.5)
plt.plot(im, is_, 'o', color='purple', alpha=0.5)
plt.plot(bm, bs, 'o', color='orange', alpha=0.5)
plt.plot(bam, bas, 'o', color='yellow', alpha=0.5)

plt.xlabel('Mean')
plt.ylabel('Standard Deviation')
# plt.legend(['Empty', 'Full', 'Partial'])
# The legend entries must stay in the same order as the plt.plot calls above
plt.legend(['Empty', 'Full', 'Partial', 'Aggregation', 'Ice', 'Broken', 'Background'])
plt.title('Standard Deviation vs the Mean')
plt.show()

In [None]:
########################## For Images With a Background ####################################

# Overlay one histogram of per-patch mean intensity for every class
histogram_groups = (
    (empty_images, 'empty'),
    (full_images, 'full'),
    (partial_images, 'partial'),
    (aggregation_images, 'aggregation'),
    (ice_images, 'ice'),
    (broken_images, 'broken'),
    (background_images, 'background'),
)
for class_images, class_name in histogram_groups:
    plt.hist([image.mean() for image in class_images], label=class_name, alpha=0.2)

plt.xlabel('Mean')
plt.ylabel('Frequency')
plt.title('Histogram of the Mean')
plt.legend()

In [None]:
# Bounds of the central third of a patch, taken from the first empty-capsid image
# (assuming a square and that all images are the same size)
side_length = empty_images[0].shape[0]
left_center_coords = side_length // 3
right_center_coords = 2 * side_length // 3
center = slice(left_center_coords, right_center_coords)

# Histogram of the mean intensity using only the inside of the capsid
center_groups = ((empty_images, 'empty'), (full_images, 'filled'), (partial_images, 'partial'))
for class_images, class_name in center_groups:
    plt.hist([image[center, center].mean() for image in class_images], label=class_name, alpha=0.2)
plt.xlabel('Mean')
plt.ylabel('Frequency')
plt.title('Histogram of the Mean: Center Patch')
plt.legend();

# Create Prototypical Capsids

In [None]:
# Here I will create the average prototypical capsids (full, partial, and empty) by averaging
# the images in the synthetic dataset. You will need to run those cells alone, without the other real data.
# Then you can uncomment the following code to create the average images.

# # Convert the list of images to a numpy array
# full_images_array = np.array(full_images)
# partial_images_array = np.array(partial_images)
# empty_images_array = np.array(empty_images)

# # # Convert the images to float type for accurate averaging
# full_images_float = full_images_array.astype(np.float32)
# partial_images_float = partial_images_array.astype(np.float32)
# empty_images_float = empty_images_array.astype(np.float32)

# # Compute the average images
# avg_filled = np.mean(full_images_float, axis=0)
# avg_partial = np.mean(partial_images_float, axis=0)
# avg_empty = np.mean(empty_images_float, axis=0)

# # Clip the values to [0, 255] range and convert back to uint8 for display
# avg_filled = np.clip(avg_filled, 0, 255).astype(np.uint8)
# avg_partial = np.clip(avg_partial, 0, 255).astype(np.uint8)
# avg_empty = np.clip(avg_empty, 0, 255).astype(np.uint8)

# ################################### Save the Prototypical Patches ###################################

# # Save the average images
# cv2.imwrite(f"{model_checkpoints}avg_filled.png", avg_filled)
# cv2.imwrite(f"{model_checkpoints}avg_partial.png", avg_partial)
# cv2.imwrite(f"{model_checkpoints}avg_empty.png", avg_empty)

################################### Load the Prototypical Patches ###################################

# Read each saved average image and bring it to the 224x224 patch size
avg_filled = cv2.resize(cv2.imread(f"{model_checkpoints}avg_filled.png"), (224, 224))
avg_partial = cv2.resize(cv2.imread(f"{model_checkpoints}avg_partial.png"), (224, 224))
avg_empty = cv2.resize(cv2.imread(f"{model_checkpoints}avg_empty.png"), (224, 224))

######################################### Show the Images #########################################

plt.rcParams.update({'font.size': 10})

# One row of three panels: filled, partial, and empty prototypes
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
fig.set_size_inches(10, 10)

prototype_panels = (
    (ax1, avg_filled, 'Average Filled Capsid'),
    (ax2, avg_partial, 'Average Partial Capsid'),
    (ax3, avg_empty, 'Average Empty Capsid'),
)
for panel, prototype, panel_title in prototype_panels:
    panel.imshow(prototype, cmap='gray')
    panel.axis('off')
    panel.set_title(panel_title)
plt.show()

In [None]:
# Print the shape of the average filled image (after the resize to 224x224)
print(f"The shape of the average filled image is: {avg_filled.shape}")

In [None]:
# Using the similarity to the prototypical capsid to classify the capsids
data = train_images
labels  = train_labels

# The patches and prototypes come from cv2 as unsigned 8-bit arrays, and
# uint8 - uint8 wraps around modulo 256 instead of going negative. Convert to
# float before subtracting so the Frobenius norm measures the true difference.
filled_reference = avg_filled.astype(np.float64)
partial_reference = avg_partial.astype(np.float64)
empty_reference = avg_empty.astype(np.float64)

f1 = [np.linalg.norm(x.astype(np.float64) - filled_reference) for x in data]
f2 = [np.linalg.norm(x.astype(np.float64) - partial_reference) for x in data]
f3 = [np.linalg.norm(x.astype(np.float64) - empty_reference) for x in data]

# Colour the points by label and lower the opacity so overlapping points stay visible
plt.scatter(f1, f3, c=labels, alpha=0.2);

# plt.xlabel('label')
plt.xlabel('Distance From Average Filled Capsid')
plt.ylabel('Distance From Average Empty Capsid')
plt.title('Similarity to the Prototypical Capsid: Frobenius Norm of Pixelwise Difference');

# Use the Patches to Create Features

This paper gives a good comparison of SIFT and HOG:

Srinivas, D., & Hanumaji, K. (2019). Analysis of various image feature extraction methods against noisy image: SIFT, SURF and HOG. J Eng Sci, 10(2), 32-36.

## Image Analysis Feature Extraction

In [None]:
########################## For Images With a Background ####################################

# Compute features on the patches and assemble the features into a csv with the associated labels.
# Features: mean, sd, median, min, max, range, skewness, kurtosis, and mode of the whole patch and of
# the inner patch, plus the similarity to avg_filled, avg_partial, and avg_empty (whole and inner patch).
# This is done for the training images and for the testing images.

# Bounds of the central third of a patch; compute_features reads these as globals
left_center_coords = (test_images[0].shape[0] // 3) # assuming a square and that all images are the same size
right_center_coords = (2 * test_images[0].shape[0] // 3)

# Central third of each prototypical capsid, compared against the inner patch of every image
avg_full_inner_patch = avg_filled[left_center_coords:right_center_coords, left_center_coords:right_center_coords]
avg_partial_inner_patch = avg_partial[left_center_coords:right_center_coords, left_center_coords:right_center_coords]
avg_empty_inner_patch = avg_empty[left_center_coords:right_center_coords, left_center_coords:right_center_coords]

################################### Function to Compute Features #################################
def _zscore(values):
    """Standardize a sequence to zero mean and unit (population) standard deviation.

    A constant column is returned as all zeros rather than the NaNs a plain
    division by a zero standard deviation would give. Empty input is returned
    unchanged.
    """
    column = np.asarray(values, dtype=np.float64)
    if column.size == 0:
        return column
    spread = column.std()
    if spread == 0:
        return np.zeros_like(column)
    return (column - column.mean()) / spread


def _scale_by_max(values):
    """Divide a sequence by its maximum; an all-zero column stays all zeros."""
    column = np.asarray(values, dtype=np.float64)
    if column.size == 0:
        return column
    peak = column.max()
    if peak == 0:
        return np.zeros_like(column)
    return column / peak


def _intensity_stats(pixels):
    """Return (mean, sd, median, min, max, range, skewness, kurtosis, mode) of one patch."""
    flat = pixels.flatten()
    lowest = float(np.min(pixels))
    highest = float(np.max(pixels))
    return (
        np.mean(pixels),
        np.std(pixels),
        np.median(pixels),
        lowest,
        highest,
        highest - lowest,
        skew(flat),
        kurtosis(flat),
        # most frequent intensity value (bincount needs non-negative integers)
        np.argmax(np.bincount(flat)),
    )


def compute_features(image_list, labels):
    """Build a table of normalized intensity and prototype-similarity features.

    For every patch the nine intensity statistics are computed on the whole
    patch and on its inner patch, followed by the Frobenius distance to each
    prototypical capsid (inner patch first, then whole patch).

    Reads the module-level names ``left_center_coords``, ``right_center_coords``,
    ``avg_filled``, ``avg_partial``, ``avg_empty`` and the three
    ``avg_*_inner_patch`` arrays.

    Args:
        image_list: Sequence of integer-valued image arrays (one per patch).
        labels: Sequence of labels, one per patch.

    Returns:
        pandas.DataFrame with 24 feature columns plus a ``label`` column.
        ``min`` and ``max`` are divided by their column maximum; every other
        feature is z-scored.

    NOTE: the normalization statistics come from ``image_list`` itself, so
    calling this separately on the training and testing images standardizes
    each set with its own mean and standard deviation.
    """
    inner = slice(left_center_coords, right_center_coords)

    # Float copies of the prototypes: subtracting uint8 arrays wraps around
    # modulo 256, which corrupted every distance feature.
    references = {
        'full': np.asarray(avg_filled, dtype=np.float64),
        'partial': np.asarray(avg_partial, dtype=np.float64),
        'empty': np.asarray(avg_empty, dtype=np.float64),
    }
    inner_references = {
        'full': np.asarray(avg_full_inner_patch, dtype=np.float64),
        'partial': np.asarray(avg_partial_inner_patch, dtype=np.float64),
        'empty': np.asarray(avg_empty_inner_patch, dtype=np.float64),
    }

    stat_names = ('mean', 'sd', 'median', 'min', 'max', 'range',
                  'skewness', 'kurtosis', 'mode')
    inner_stat_names = tuple(f"{name}_inner" for name in stat_names)

    # Insertion order of this dict fixes the column order of the result.
    raw = {name: [] for name in stat_names + inner_stat_names}
    for kind in references:
        raw[f"sim_to_avg_{kind}_inner"] = []
    for kind in references:
        raw[f"sim_to_avg_{kind}"] = []

    # Iterate over the patches
    for patch in image_list:
        inner_patch = patch[inner, inner]
        for name, value in zip(stat_names, _intensity_stats(patch)):
            raw[name].append(value)
        for name, value in zip(inner_stat_names, _intensity_stats(inner_patch)):
            raw[name].append(value)

        patch_float = patch.astype(np.float64)
        inner_float = patch_float[inner, inner]
        for kind, reference in inner_references.items():
            raw[f"sim_to_avg_{kind}_inner"].append(np.linalg.norm(inner_float - reference))
        for kind, reference in references.items():
            raw[f"sim_to_avg_{kind}"].append(np.linalg.norm(patch_float - reference))

    # Normalize each statistic: whole-patch min/max by their maximum, the rest by z-score
    max_scaled = {'min', 'max'}
    features = pd.DataFrame({
        name: _scale_by_max(values) if name in max_scaled else _zscore(values)
        for name, values in raw.items()
    })

    # add the labels to the dataframe
    features['label'] = labels

    return features

################################# Compute Features for Training #######################################

# Compute features for the training data
# (each call normalizes the features using the statistics of the image list it is given)
train_img_analysis_features_df = compute_features(train_images, train_labels)

# Compute features for the testing data
test_img_analysis_features_df = compute_features(test_images, test_labels)

In [None]:
####################################### Save the All Combined Data #######################################
# Save the training feature df to a csv (without the index column)
train_img_analysis_features_df.to_csv(f"{split_data_folder}train_img_analysis_features_df.csv", index=False)

# Save the testing feature df to a csv (without the index column)
test_img_analysis_features_df.to_csv(f"{split_data_folder}test_img_analysis_features_df.csv", index=False)

In [None]:
# Load the saved image-analysis feature tables back in, so the feature
# computation above does not have to be re-run

train_img_analysis_features_df = pd.read_csv(f"{split_data_folder}train_img_analysis_features_df.csv")
test_img_analysis_features_df = pd.read_csv(f"{split_data_folder}test_img_analysis_features_df.csv")

In [None]:
# Summarize every column of the testing feature table: keep only the mean,
# standard deviation, minimum, and maximum rows of describe()
summary_rows = ['mean', 'std', 'min', 'max']
feature_stats = test_img_analysis_features_df.describe().loc[summary_rows]

# Wrap the summary in its own df and display it
feature_stats_df = pd.DataFrame(feature_stats)
feature_stats_df


## HOG Feature Extraction

#### First we will get the feature descriptors for one image using HOG

Some of the following code comes from a tutorial at the following link:
https://towardsdatascience.com/hog-histogram-of-oriented-gradients-67ecd887675f

In [None]:
# Compute one HOG descriptor per training image
train_hog_features = []
for image in train_images:
    # visualize=True makes hog return (descriptor, visualization image); only the descriptor is kept.
    # channel_axis=-1: presumably marks the last axis as the colour channels - confirm against skimage docs.
    fd, hog_image = hog(image, orientations=9, pixels_per_cell=(8, 8), cells_per_block=(2, 2), visualize=True, channel_axis=-1)
    train_hog_features.append(fd)

In [None]:
# Also get the features for the test data (same HOG parameters as for the training data)
test_hog_features = []
for image in test_images:
    # visualize=True makes hog return (descriptor, visualization image); only the descriptor is kept
    fd, hog_image = hog(image, orientations=9, pixels_per_cell=(8, 8), cells_per_block=(2, 2), visualize=True, channel_axis=-1)
    test_hog_features.append(fd)

In [None]:
# Put the train HOG descriptors in a df (one row per image) and attach the labels
train_hog_features_df = pd.DataFrame(train_hog_features)
train_hog_features_df['label'] = train_labels

# Same for the test HOG descriptors
test_hog_features_df = pd.DataFrame(test_hog_features)
test_hog_features_df['label'] = test_labels

############################## Save the features ########################################

# Write both tables as csv files (without the index column)
train_hog_features_df.to_csv(f"{split_data_folder}train_hog_features_df.csv", index=False)
test_hog_features_df.to_csv(f"{split_data_folder}test_hog_features_df.csv", index=False)

In [None]:
# Load the saved HOG feature csvs back in, so the HOG computation above does not have to be re-run
train_hog_features_df = pd.read_csv(f"{split_data_folder}train_hog_features_df.csv")
test_hog_features_df = pd.read_csv(f"{split_data_folder}test_hog_features_df.csv")

## SIFT Feature Extraction


#### First we will create a library of visual words. This is done by getting the feature descriptors for all of the images, combining them into one np array, and clustering them down to 200 visual words.

Some of the code in this section came from this blog post: https://liverungrow.medium.com/sift-bag-of-features-svm-for-classification-b5f775d8e55f

In [None]:
# Create a SIFT object (also used later by compute_visual_words_histogram)
sift = cv2.SIFT_create(contrastThreshold=0.01, edgeThreshold=30)
sift_features = []
sift_keypoints = []

# Compute the SIFT features for each image; both lists stay parallel to train_images
for image in train_images:
    keypoints, descriptors = sift.detectAndCompute(image, None)
    sift_features.append(descriptors)
    sift_keypoints.append(keypoints)

# Concatenate the descriptors along the first axis. An image without any
# keypoints yields None instead of an array, which np.concatenate cannot
# handle, so those entries are skipped here.
sift_features_concat = np.concatenate(
    [descriptors for descriptors in sift_features if descriptors is not None], axis=0
)

# Draw the keypoints on one image. The same index must be used for the image
# and for its keypoints (previously image 5 was drawn with the keypoints of image 0).
vis_index = 5  # Replace with the index of the image you want to visualize
img = np.array(train_images[vis_index]).astype(np.uint8)
img_with_keypoints = cv2.drawKeypoints(img, sift_keypoints[vis_index], img.copy())
plt.imshow(img_with_keypoints)
plt.show()

# Now, sift_features_concat contains all descriptors in a single array
print(sift_features_concat.shape)


In [None]:
# Using the Bag of Visual Words technique to be able to USE these SIFT features:
# cluster all training descriptors; each cluster center becomes one visual word.
# This takes about 8 minutes to run

print('Performing K-means clustering...')
# Perform K-means clustering (random_state fixed so the vocabulary is reproducible)
k = 200  # number of clusters/ visual words
sift_kmeans = KMeans(n_clusters=k, random_state=0)
sift_kmeans.fit(sift_features_concat)
print('K-means clustering complete.')

# get the cluster centers (one row per visual word)
cluster_centers = sift_kmeans.cluster_centers_

In [None]:
# Save the cluster centers (the visual-word vocabulary) so the clustering does not have to be re-run
np.save(f"{split_data_folder}sift_kmeans_cluster_centers.npy", cluster_centers)

In [None]:
# Load the saved cluster centers (the visual-word vocabulary) back in
cluster_centers = np.load(f"{split_data_folder}sift_kmeans_cluster_centers.npy")

#### Now that we have our visual words we can generate the keypoints for every image and assign visual words to each image.

This is done by generating the SIFT descriptors for each image and assessing how close each descriptor is to each of the 200 visual words (cluster centers). Each descriptor is assigned to its nearest visual word, so we can determine which of the 200 visual words are actually present in each image. We can then build a histogram that records how many times each cluster was used.

In [None]:
# For each of the images in the dataset, compute the distance to each of the cluster centers and compile the results into a histogram

def compute_visual_words_histogram(cluster_centers, X):
    """Build a normalized bag-of-visual-words histogram for every image.

    Each SIFT descriptor of an image is assigned to its nearest cluster center
    (Euclidean distance). The per-image counts are divided by their sum so
    every histogram adds up to 1.

    Uses the module-level ``sift`` object to detect and describe keypoints.

    Args:
        cluster_centers: Array with one row per visual word.
        X: Iterable of images.

    Returns:
        Tuple ``(visual_words_histograms_df, visual_words_histograms)``: the
        histograms as a DataFrame and as a numpy array, one row per image.
        An image with no descriptors gets an all-zero histogram.
    """
    centers = np.asarray(cluster_centers)
    n_words = len(centers)
    histograms = []

    for image in X:
        # Only the descriptors are needed; the keypoints are discarded
        _, descriptors = sift.detectAndCompute(image, None)

        # An image without keypoints returns None descriptors, which used to
        # crash on the indexing below. Record an empty histogram instead.
        if descriptors is None or len(descriptors) == 0:
            histograms.append(np.zeros(n_words))
            continue

        # Euclidean distance from each descriptor to each cluster center
        distances = np.linalg.norm(descriptors[:, None] - centers[None], axis=-1)

        # Nearest visual word for each descriptor
        bin_assignments = np.argmin(distances, axis=-1)

        # Count how often each visual word occurs, then normalize
        visual_words = np.bincount(bin_assignments, minlength=n_words).astype(np.float64)
        histograms.append(visual_words / visual_words.sum())

    # Convert the list of histograms to a numpy array and to a df
    visual_words_histograms = np.array(histograms)
    visual_words_histograms_df = pd.DataFrame(visual_words_histograms)

    return visual_words_histograms_df, visual_words_histograms


In [None]:
# Compute the visual words histograms for the training and testing data

####################################### Training Data #######################################
# Compute the visual words histograms for the training data
# (the raw array is kept as well; it is used for the histogram plot of the first image)
train_sift_features_df, visual_words_histograms = compute_visual_words_histogram(cluster_centers, train_images)
# Add the labels to the df
train_sift_features_df['label'] = train_labels

####################################### Testing Data ########################################
# Compute the visual words histograms for the testing data (the raw array is not needed here)
test_sift_features_df, _    = compute_visual_words_histogram(cluster_centers, test_images)
# Add the labels to the df
test_sift_features_df['label'] = test_labels

##################################### Save the real data #####################################
# Write both tables as csv files (without the index column)
train_sift_features_df.to_csv(f"{split_data_folder}train_sift_features_df.csv", index=False)
test_sift_features_df.to_csv(f"{split_data_folder}test_sift_features_df.csv", index=False)

In [None]:
# Load the saved SIFT bag-of-visual-words csvs back in, so the feature computation does not have to be re-run
train_sift_features_df = pd.read_csv(f"{split_data_folder}train_sift_features_df.csv")
test_sift_features_df = pd.read_csv(f"{split_data_folder}test_sift_features_df.csv")

In [None]:
# Visualize the histogram of visual words for the first training image.
# compute_visual_words_histogram divides each histogram by its sum, so the bar
# heights are fractions of the image's descriptors rather than raw counts.
first_histogram = visual_words_histograms[0]
plt.bar(np.arange(len(first_histogram)), first_histogram)
plt.xlabel('Visual Word Index')
plt.ylabel('Normalized Frequency')
plt.title('Histogram of Visual Words')
plt.show()

## Basic CV Filter Banks Feature Extraction

In [None]:
# Use a filter bank to create features. Each filter is one feature. Use max pooling so that each filter is only one feature.

####################################### Filter Bank #######################################

def sobel(image):
    """Return the Sobel response of ``image`` as 64-bit floats.

    The derivative order is 1 in both x and y (the last two arguments).
    """
    return cv2.Sobel(image, cv2.CV_64F, 1, 1)

def laplacian(image):
    """Return the Laplacian response of ``image`` as 64-bit floats."""
    return cv2.Laplacian(image, cv2.CV_64F)

def gaussian_blur(image):
    """Return ``image`` smoothed with a 3x3 Gaussian kernel.

    A sigma of 0 is passed, leaving the choice of sigma to OpenCV.
    """
    kernel_size = (3, 3)
    return cv2.GaussianBlur(image, kernel_size, 0)

def prewitt(image):
    """Return ``image`` filtered with the horizontal Prewitt kernel.

    NOTE(review): ddepth=-1 presumably keeps the input depth, so a uint8 input
    would be clipped to [0, 255], unlike sobel/laplacian which request
    CV_64F - confirm this is intended.
    """
    prewitt_kernel = np.array([[-1, 0, 1], [-1, 0, 1], [-1, 0, 1]])
    return cv2.filter2D(image, -1, prewitt_kernel)

def scharr(image):
    """Return ``image`` filtered with the horizontal Scharr kernel.

    NOTE(review): ddepth=-1 presumably keeps the input depth, so a uint8 input
    would be clipped to [0, 255] - confirm this is intended.
    """
    scharr_kernel = np.array([[-3, 0, 3], [-10, 0, 10], [-3, 0, 3]])
    return cv2.filter2D(image, -1, scharr_kernel)

def roberts(image):
    """Return ``image`` filtered with a 2x2 Roberts cross kernel.

    NOTE(review): ddepth=-1 presumably keeps the input depth, so a uint8 input
    would be clipped to [0, 255] - confirm this is intended.
    """
    roberts_kernel = np.array([[-1, 0], [0, 1]])
    return cv2.filter2D(image, -1, roberts_kernel)

def max_pool(image):
    """Return the maximum of each cell of a 3x3 grid laid over ``image``.

    ``image`` must be three-dimensional (height, width, channels). The maximum
    of each cell is taken over all of its pixels and channels. The nine values
    are returned row by row: top-left, top-center, top-right, middle-left, ...,
    bottom-right.
    """
    n_rows, n_cols, _ = image.shape

    # Cell boundaries at one third and two thirds of each dimension
    row_edges = (0, n_rows // 3, 2 * n_rows // 3, n_rows)
    col_edges = (0, n_cols // 3, 2 * n_cols // 3, n_cols)

    pooled = []
    for top, bottom in zip(row_edges, row_edges[1:]):
        for left, right in zip(col_edges, col_edges[1:]):
            pooled.append(np.max(image[top:bottom, left:right]))
    return pooled

####################################### Extracting Features #######################################

# filter_bank function
def filter_bank(image):
    """Build the filter-bank feature vector for a single image.

    Every filter in the bank is applied to ``image`` and its response is
    reduced with ``max_pool``; the pooled values are concatenated in bank
    order (sobel, laplacian, gaussian_blur, prewitt, scharr, roberts).

    Returns:
        A NumPy array holding the pooled values of all six filters.
    """
    bank = (sobel, laplacian, gaussian_blur, prewitt, scharr, roberts)
    pooled = [
        value
        for apply_filter in bank
        for value in max_pool(apply_filter(image))
    ]
    return np.array(pooled)

# here we will loop through the images and apply the filter bank to each image
def cv_features(image_list):
    """Run ``filter_bank`` on every image and collect the feature vectors.

    Args:
        image_list: iterable of images accepted by ``filter_bank``.

    Returns:
        A list with one feature vector per image, in input order.
    """
    return [filter_bank(picture) for picture in image_list]

In [None]:
############################## Filter-bank feature tables ##############################

# Column names follow the order in which filter_bank emits values: for each
# filter, the nine max-pooled grid cells in row-major order
# (t/m/b = top/middle/bottom row, l/c/r = left/center/right column).
# Built once so the train and test tables cannot drift apart.
cv_filter_names = ['sobel', 'laplacian', 'gaussian_blur', 'prewitt', 'scharr', 'roberts']
cv_grid_cells = ['tl', 'tc', 'tr', 'ml', 'mc', 'mr', 'bl', 'bc', 'br']
cv_feature_columns = [
    f"{filter_name}_{cell}"
    for filter_name in cv_filter_names
    for cell in cv_grid_cells
]

############################## Feature Extraction for the train data ############################

# Apply the filter bank to the training images and tabulate the result
train_features = cv_features(train_images)
train_features_df = pd.DataFrame(train_features, columns=cv_feature_columns)

# add the labels to the df
train_features_df['label'] = train_labels

############################## Features Extraction for the test data ############################

# Apply the filter bank to the testing images and tabulate the result
test_features = cv_features(test_images)
test_features_df = pd.DataFrame(test_features, columns=cv_feature_columns)

# add the labels to the df
test_features_df['label'] = test_labels

######################################## Save the real data #######################################
train_features_df.to_csv(f"{split_data_folder}train_cv_features_df.csv", index=False)
test_features_df.to_csv(f"{split_data_folder}test_cv_features_df.csv", index=False)

In [None]:
# Read the filter-bank feature tables (train first, then test) back from disk
train_cv_features_df, test_cv_features_df = map(
    pd.read_csv,
    (
        f"{split_data_folder}train_cv_features_df.csv",
        f"{split_data_folder}test_cv_features_df.csv",
    ),
)

## Combine Image Analysis and Filter Bank Feature Extraction

In [None]:
# Build one feature table per split by placing the image-analysis columns and
# the filter-bank (CV) columns side by side.

# Strip the label column so each table contributes feature columns only
train_img_analysis_features_df_nolabels = train_img_analysis_features_df.drop(columns='label')
test_img_analysis_features_df_nolabels = test_img_analysis_features_df.drop(columns='label')
train_cv_features_df_nolabels = train_cv_features_df.drop(columns='label')
test_cv_features_df_nolabels = test_cv_features_df.drop(columns='label')

# Column-wise join (axis=1), then re-attach the labels taken from the
# image-analysis table as the last column
train_combined_features_df = pd.concat(
    [train_img_analysis_features_df_nolabels, train_cv_features_df_nolabels],
    axis=1,
).assign(label=train_img_analysis_features_df['label'])
test_combined_features_df = pd.concat(
    [test_img_analysis_features_df_nolabels, test_cv_features_df_nolabels],
    axis=1,
).assign(label=test_img_analysis_features_df['label'])

# Persist both combined tables without the index column
train_combined_features_df.to_csv(
    f"{split_data_folder}train_combined_features_df.csv", index=False
)
test_combined_features_df.to_csv(
    f"{split_data_folder}test_combined_features_df.csv", index=False
)

In [None]:
# Read the combined feature tables (train first, then test) back from disk
train_combined_features_df, test_combined_features_df = map(
    pd.read_csv,
    (
        f"{split_data_folder}train_combined_features_df.csv",
        f"{split_data_folder}test_combined_features_df.csv",
    ),
)

# ML Using the Features we created in the Previous Section

## First we define functions for:

0. Cross Validation Strategy
1. SVM
2. Decision Tree Classification
3. Kmeans Clustering

In [None]:
# The following code is inspired by the following tutorial: 
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py

# Scorers handed to the hyper-parameter searches: macro-averaged precision and
# recall, both created with zero_division=0
scoring = {
    metric_name: make_scorer(metric_fn, average='macro', zero_division=0)
    for metric_name, metric_fn in (
        ('precision', precision_score),
        ('recall', recall_score),
    )
}

# First we will define our cross validation strategy
def refit_strategy(cv_results):
    """Pick the index of the candidate to refit from a search's ``cv_results``.

    Candidates whose ``mean_test_precision`` is strictly above 0.60 are kept
    and the one with the highest ``mean_test_recall`` among them wins. When no
    candidate clears the precision bar, the highest recall over all candidates
    is used instead.

    Args:
        cv_results: mapping of column name to per-candidate values; must
            contain ``mean_test_precision`` and ``mean_test_recall``.

    Returns:
        The row index of the selected candidate.
    """
    min_precision = 0.60
    results = pd.DataFrame(cv_results)

    # Rows keep their original index labels after filtering, so idxmax below
    # still refers to positions in the unfiltered results
    candidates = results.loc[results['mean_test_precision'] > min_precision]
    if candidates.empty:
        candidates = results

    return candidates['mean_test_recall'].idxmax()

### SVM

In [None]:
# Tune an SVM on the training table, save it, then print the test metrics and
# plot the confusion matrix (the function itself returns nothing)
def svm_classifier(train_features_df, test_features_df, model_checkpoints, model_name,
                   class_labels=(1, 2, 3, 4, 5, 6, 7),
                   class_names=('Full', 'Partial', 'Empty', 'Agg', 'Ice', 'Broken', 'Backgrd')):
    """Fit an SVM via randomized search, save it, and report test metrics.

    Args:
        train_features_df: training table; every column except ``label`` is
            used as a feature.
        test_features_df: test table with the same layout.
        model_checkpoints: path prefix the model file name is appended to
            (plain string concatenation, so it should end with a separator).
        model_name: file stem; the fitted search is written to
            ``f"{model_checkpoints}{model_name}.pkl"``.
        class_labels: label values in the order used for the confusion matrix
            and for binarizing the labels for AUROC. Pass ``(1, 2, 3)`` for the
            three-class setting.
        class_names: display name of each entry of ``class_labels``. The third
            entry is treated as the "empty capsid" class when reporting
            sensitivity.

    Returns:
        None. Accuracy, Cohen's kappa, empty-capsid sensitivity and AUROC are
        printed and the confusion matrix is plotted.
    """
    class_labels = list(class_labels)
    class_names = list(class_names)

    # Separate features from labels
    X_train = train_features_df.drop(columns=['label'])
    y_train = train_features_df['label']

    X_test = test_features_df.drop(columns=['label'])
    y_test = test_features_df['label']

    ############################# Random Search ##############################
    # Sampling ranges: stats.uniform(loc, scale) draws from [loc, loc + scale].
    # (A GridSearchCV over a fixed grid can be used here instead.)
    parameter_distribution = {'kernel': ['rbf', 'linear'], 'C': stats.uniform(1, 1000), 'gamma': stats.uniform(0.0001, 0.1)}

    n_iter_search = 20
    model = RandomizedSearchCV(SVC(), param_distributions=parameter_distribution, n_iter=n_iter_search, scoring=scoring, refit=refit_strategy)
    model.fit(X_train, y_train)

    ############################## Save the Model ##############################
    # this tutorial was used for saving the model:
    # https://saturncloud.io/blog/sklearn-how-to-save-a-model-created-from-a-pipeline-and-gridsearchcv-using-joblib-or-pickle/
    # The saved search object can be restored later with joblib.load(model_save_path).
    model_save_path = f"{model_checkpoints}{model_name}.pkl"
    joblib.dump(model, model_save_path)

    ###########################################################################

    print(f"Best Estimator: {model.best_estimator_}")
    y_pred = model.predict(X_test)

    # Compute the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"The accuracy of the SVM classifier is {accuracy}")

    # Compute the kappa score
    kappa = cohen_kappa_score(y_test, y_pred)
    print(f"The kappa score of the SVM classifier is {kappa}")

    # One confusion matrix for everything below. Passing labels explicitly
    # pins row/column i to class_labels[i] even when a class is missing from
    # the test split, so row 2 is always the empty-capsid class and the matrix
    # always matches the display labels.
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    # Sensitivity for empty capsids = correct empties / all true empties
    empty_total = cm[2].sum()
    empty_sensitivity = cm[2, 2] / empty_total if empty_total else float('nan')
    print(f"The sensitivity for empty capsids is {empty_sensitivity}")

    # Binarize the true labels (one indicator column per class)
    y_test_bin = label_binarize(y_test, classes=class_labels)
    # SVC decision scores are used as the ranking scores for ROC-AUC
    decision_scores = model.decision_function(X_test)
    auroc = roc_auc_score(y_test_bin, decision_scores, multi_class='ovr')
    print(f"The AUROC of the SVM classifier is {auroc}")

    # Plot the confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap = 'Blues')
    plt.show()

### Decision Tree Classification

In [None]:
# Tune a decision tree on the training table, save it, then print the test
# metrics and plot the confusion matrix (the function itself returns nothing)
def decision_tree_classifier(train_features_df, test_features_df, model_checkpoints, model_name, print_tree = False,
                             class_labels=(1, 2, 3, 4, 5, 6, 7),
                             class_names=('Full', 'Partial', 'Empty', 'Agg', 'Ice', 'Broken', 'Backgrd')):
    """Fit a decision tree via randomized search, save it, and report metrics.

    Args:
        train_features_df: training table; every column except ``label`` is
            used as a feature.
        test_features_df: test table with the same layout.
        model_checkpoints: path prefix the model file name is appended to
            (plain string concatenation, so it should end with a separator).
        model_name: file stem; the fitted search is written to
            ``f"{model_checkpoints}{model_name}.pkl"``.
        print_tree: when true, also draw the selected tree.
        class_labels: label values in the order used for the confusion matrix
            and for binarizing the labels for AUROC. Pass ``(1, 2, 3)`` for the
            three-class setting.
        class_names: display name of each entry of ``class_labels``. The third
            entry is treated as the "empty capsid" class when reporting
            sensitivity.

    Returns:
        None. Accuracy, Cohen's kappa, empty-capsid sensitivity and AUROC are
        printed and the confusion matrix (and optionally the tree) is plotted.
    """
    class_labels = list(class_labels)
    class_names = list(class_names)

    # Separate features from labels
    X_train = train_features_df.drop(columns=['label'])
    y_train = train_features_df['label']

    X_test = test_features_df.drop(columns=['label'])
    y_test = test_features_df['label']

    ############################## Random Search ##############################
    # specify parameters and distributions to sample from
    parameter_distribution = {
        'criterion': ['gini', 'entropy', 'log_loss'],
        'max_depth': stats.randint(1, 100),
        'min_samples_split': stats.randint(2, 10),
        'min_samples_leaf': stats.randint(1, 10),
        'max_features': stats.randint(1, 10)
    }

    n_iter_search = 200
    model = RandomizedSearchCV(DecisionTreeClassifier(), param_distributions=parameter_distribution, n_iter=n_iter_search, scoring=scoring, refit=refit_strategy)

    model.fit(X_train, y_train)

    ################################# Save the Model ##################################

    # The saved search object can be restored later with joblib.load(model_save_path)
    model_save_path = f"{model_checkpoints}{model_name}.pkl"
    joblib.dump(model, model_save_path)

    ####################### Make Predictions on the Test Data ########################

    y_pred = model.predict(X_test)

    print(f"Best Estimator: {model.best_estimator_}")

    # Compute the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"The accuracy of the decision tree classifier is {accuracy}")

    # Compute the kappa score
    kappa = cohen_kappa_score(y_test, y_pred)
    print(f"The kappa score of the decision tree classifier is {kappa}")

    # One confusion matrix for everything below. Passing labels explicitly
    # pins row/column i to class_labels[i] even when a class is missing from
    # the test split, so row 2 is always the empty-capsid class and the matrix
    # always matches the display labels.
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    # Sensitivity for empty capsids = correct empties / all true empties
    empty_total = cm[2].sum()
    empty_sensitivity = cm[2, 2] / empty_total if empty_total else float('nan')
    print(f"The sensitivity for empty capsids is {empty_sensitivity}")

    # Binarize the true labels (one indicator column per class)
    y_test_bin = label_binarize(y_test, classes=class_labels)
    # Predict probabilities for each class
    y_pred_prob = model.predict_proba(X_test)
    # Calculate ROC-AUC scores for each class
    auroc = roc_auc_score(y_test_bin, y_pred_prob, multi_class='ovr')
    print(f"The AUROC of the decision tree classifier is {auroc}")

    # Plot the confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap = 'Blues')
    plt.show()

    if print_tree:
        # plot_tree needs the fitted tree itself, not the search wrapper
        fitted_tree = model.best_estimator_
        # plot_tree expects one name per fitted class, in ascending class
        # order; look names up by label so a class absent from training
        # cannot shift the remaining names.
        name_by_label = dict(zip(class_labels, class_names))
        tree_class_names = [name_by_label.get(label, str(label)) for label in fitted_tree.classes_]

        plt.figure(figsize=(20, 20))
        tree.plot_tree(fitted_tree, filled=True, feature_names=list(X_train.columns), class_names=tree_class_names)
        plt.show()

### Kmeans Clustering

In [None]:
# Cluster the training table with KMeans, label each cluster by majority vote,
# then print the test metrics and plot the confusion matrix (the function
# itself returns nothing)
def kmeans_clustering(train_features_df, test_features_df, model_checkpoints, model_name,
                      n_clusters=3,
                      class_labels=(1, 2, 3, 4, 5, 6, 7),
                      class_names=('Full', 'Partial', 'Empty', 'Agg', 'Ice', 'Broken', 'Backgrd')):
    """Fit KMeans via randomized search and evaluate it as a classifier.

    Each cluster is assigned the most frequent training label among its
    members; test samples are then predicted as the label of their cluster.

    Args:
        train_features_df: training table; every column except ``label`` is
            used as a feature. Labels must be non-negative integers.
        test_features_df: test table with the same layout.
        model_checkpoints: path prefix the file names are appended to
            (plain string concatenation, so it should end with a separator).
        model_name: file stem; the fitted search is written to
            ``f"{model_checkpoints}{model_name}.pkl"`` and the cluster-to-label
            mapping to ``f"{model_checkpoints}{model_name}_cluster_label_mapping.pkl"``.
        n_clusters: number of clusters to fit. Usually set to the number of
            classes (e.g. 7 for the seven-class setting).
        class_labels: label values in the order used for the confusion matrix
            and for binarizing labels for AUROC.
        class_names: display name of each entry of ``class_labels``. The third
            entry is treated as the "empty capsid" class when reporting
            sensitivity.

    Returns:
        None. Accuracy, Cohen's kappa, empty-capsid sensitivity and AUROC are
        printed and the confusion matrix is plotted.
    """
    class_labels = list(class_labels)
    class_names = list(class_names)

    # Separate features from labels
    X_train = train_features_df.drop(columns=['label'])
    y_train = train_features_df['label']

    X_test = test_features_df.drop(columns=['label'])
    y_test = test_features_df['label']

    ################################ Random Search ##############################
    # specify parameters and distributions to sample from
    # (A GridSearchCV over a fixed grid can be used here instead.)
    parameter_distribution = {
        'n_clusters': [n_clusters],
        'init': ['k-means++', 'random'],
        'algorithm': ['lloyd', 'elkan'],
        'n_init': stats.randint(10, 20)
    }
    n_iter_search = 20
    # NOTE(review): the precision/recall scorers compare KMeans' raw cluster
    # ids with the class labels, so candidates are ranked by an arbitrary
    # cluster-id/label correspondence - confirm this is the intended criterion.
    model = RandomizedSearchCV(KMeans(), param_distributions=parameter_distribution, n_iter=n_iter_search, scoring=scoring, refit=refit_strategy, verbose=3)
    model.fit(X_train, y_train)

    print(f"Best Estimator: {model.best_estimator_}")

    ################################# Save the Model ##################################

    # The saved search object can be restored later with joblib.load(model_save_path)
    model_save_path = f"{model_checkpoints}{model_name}.pkl"
    joblib.dump(model, model_save_path)

    ########################### Assigning Each Cluster ###############################

    # Cluster ids are shifted by one so they start at 1
    cluster_assignments = model.predict(X_train)+1

    # Pair every training sample's cluster with its true label
    cluster_assignments_df = pd.DataFrame({'cluster': cluster_assignments, 'label': y_train})

    # For each cluster, find the majority vote label (ties go to the smallest label)
    majority_vote_labels = cluster_assignments_df.groupby('cluster')['label'].agg(lambda x: np.bincount(x).argmax())

    # Create a mapping dictionary from cluster to majority vote label
    cluster_label_mapping = dict(zip(majority_vote_labels.index, majority_vote_labels.values))

    ############################ Save the Cluster Mapping #############################

    # The mapping can be restored later with joblib.load(cluster_label_mapping_save_path)
    cluster_label_mapping_save_path = f"{model_checkpoints}{model_name}_cluster_label_mapping.pkl"
    joblib.dump(cluster_label_mapping, cluster_label_mapping_save_path)

    ####################### Make Predictions on the Test Data ########################

    # Get the cluster assignments for the test data
    cluster_assignments_test = model.predict(X_test)+1

    # Map the cluster assignments to the majority vote labels
    cluster_assignments_test_df = pd.DataFrame({'cluster': cluster_assignments_test, 'label': y_test})
    cluster_assignments_test_df['majority_label'] = cluster_assignments_test_df['cluster'].map(cluster_label_mapping)

    # Make predictions
    y_pred = cluster_assignments_test_df['majority_label']

    ############################# Calculate the Metrics ##############################

    # Calculate the accuracy of the clustering
    accuracy = accuracy_score(y_test, y_pred)
    print(f"The accuracy of the clustering is {accuracy}")

    # Calculate the kappa score of the clustering
    kappa_cluster = cohen_kappa_score(y_test, y_pred)
    print(f"The kappa score of the clustering is {kappa_cluster}")

    # One confusion matrix for everything below. Passing labels explicitly
    # pins row/column i to class_labels[i] even when a class is never
    # predicted or is missing from the test split, so row 2 is always the
    # empty-capsid class and the matrix always matches the display labels.
    cm_cluster_assignments = confusion_matrix(y_test, y_pred, labels=class_labels)

    # Sensitivity for empty capsids = correct empties / all true empties
    empty_total = cm_cluster_assignments[2].sum()
    empty_sensitivity = cm_cluster_assignments[2, 2] / empty_total if empty_total else float('nan')
    print(f"The sensitivity for empty capsids is {empty_sensitivity}")

    # AUROC is computed from the hard predictions (binarized), since the
    # majority-vote mapping yields labels rather than scores
    y_test_bin = label_binarize(y_test, classes=class_labels)
    y_pred_bin = label_binarize(y_pred, classes=class_labels)
    auroc = roc_auc_score(y_test_bin, y_pred_bin, multi_class='ovr')
    print(f"The AUROC of the clustering is {auroc}")

    # Plot the confusion matrix
    disp_cluster_assignments = ConfusionMatrixDisplay(confusion_matrix=cm_cluster_assignments, display_labels=class_names)

    disp_cluster_assignments.plot(cmap = 'Blues')
    plt.show()

## Image analysis ML models

In [None]:
# Image-analysis features -> SVM (saved under the name "svm_model_img_an")
svm_classifier(
    train_features_df=train_img_analysis_features_df,
    test_features_df=test_img_analysis_features_df,
    model_checkpoints=model_checkpoints,
    model_name="svm_model_img_an",
)


In [None]:
# Image-analysis features -> decision tree (saved under the name "decision_tree_model_img_an")
decision_tree_classifier(
    train_features_df=train_img_analysis_features_df,
    test_features_df=test_img_analysis_features_df,
    model_checkpoints=model_checkpoints,
    model_name="decision_tree_model_img_an",
)

In [None]:
# Image-analysis features -> KMeans clustering (saved under the name "kmeans_model_img_an")
kmeans_clustering(
    train_features_df=train_img_analysis_features_df,
    test_features_df=test_img_analysis_features_df,
    model_checkpoints=model_checkpoints,
    model_name="kmeans_model_img_an",
)


## HOG ML models

In [None]:
# HOG features -> SVM (saved under the name "svm_model_hog")
svm_classifier(
    train_features_df=train_hog_features_df,
    test_features_df=test_hog_features_df,
    model_checkpoints=model_checkpoints,
    model_name="svm_model_hog",
)

In [None]:
# HOG features -> decision tree (saved under the name "decision_tree_model_hog")
decision_tree_classifier(
    train_features_df=train_hog_features_df,
    test_features_df=test_hog_features_df,
    model_checkpoints=model_checkpoints,
    model_name="decision_tree_model_hog",
)

In [None]:
# HOG features -> KMeans clustering (saved under the name "kmeans_model_hog")
kmeans_clustering(
    train_features_df=train_hog_features_df,
    test_features_df=test_hog_features_df,
    model_checkpoints=model_checkpoints,
    model_name="kmeans_model_hog",
)

## SIFT ML models

Here is the original paper that introduced this method:

Lowe, D. G. (2004). Distinctive image features from scale-invariant keypoints. International journal of computer vision, 60, 91-110.

In [None]:
# SIFT features -> SVM (saved under the name "svm_model_sift")
svm_classifier(
    train_features_df=train_sift_features_df,
    test_features_df=test_sift_features_df,
    model_checkpoints=model_checkpoints,
    model_name="svm_model_sift",
)

In [None]:
# SIFT features -> decision tree (saved under the name "decision_tree_model_sift")
decision_tree_classifier(
    train_features_df=train_sift_features_df,
    test_features_df=test_sift_features_df,
    model_checkpoints=model_checkpoints,
    model_name="decision_tree_model_sift",
)

In [None]:
# SIFT features -> KMeans clustering (saved under the name "kmeans_model_sift").
# The stray trailing ';' was dropped: kmeans_clustering returns nothing, so
# there is no cell output to suppress, and no other driver cell uses one.
kmeans_clustering(
    train_features_df=train_sift_features_df,
    test_features_df=test_sift_features_df,
    model_checkpoints=model_checkpoints,
    model_name="kmeans_model_sift",
)

## CV Filter Bank ML Models

In [None]:
# Filter-bank (CV) features -> SVM (saved under the name "svm_model_cv")
svm_classifier(
    train_features_df=train_cv_features_df,
    test_features_df=test_cv_features_df,
    model_checkpoints=model_checkpoints,
    model_name="svm_model_cv",
)

In [None]:
# Filter-bank (CV) features -> decision tree (saved under the name "decision_tree_model_cv")
decision_tree_classifier(
    train_features_df=train_cv_features_df,
    test_features_df=test_cv_features_df,
    model_checkpoints=model_checkpoints,
    model_name="decision_tree_model_cv",
)

In [None]:
# Filter-bank (CV) features -> KMeans clustering (saved under the name "kmeans_model_cv")
kmeans_clustering(
    train_features_df=train_cv_features_df,
    test_features_df=test_cv_features_df,
    model_checkpoints=model_checkpoints,
    model_name="kmeans_model_cv",
)

## Combined Features ML Models

In [None]:
# Combined features -> SVM (saved under the name "svm_model_combined")
svm_classifier(
    train_features_df=train_combined_features_df,
    test_features_df=test_combined_features_df,
    model_checkpoints=model_checkpoints,
    model_name="svm_model_combined",
)

In [None]:
# Combined features -> decision tree (saved under the name "decision_tree_model_combined")
decision_tree_classifier(
    train_features_df=train_combined_features_df,
    test_features_df=test_combined_features_df,
    model_checkpoints=model_checkpoints,
    model_name="decision_tree_model_combined",
)

In [None]:
# Combined features -> KMeans clustering (saved under the name "kmeans_model_combined")
kmeans_clustering(
    train_features_df=train_combined_features_df,
    test_features_df=test_combined_features_df,
    model_checkpoints=model_checkpoints,
    model_name="kmeans_model_combined",
)