## 1 - Import and load packages

In [1]:
# INSTRUCTIONS TO USER: Ensure you have run `pip install -r requirements.txt` as stated in the README
import h5py
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import seaborn
import tensorflow as tf
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
from tensorflow.keras import layers, models


%matplotlib inline
plt.rcParams['figure.figsize'] = (5.0, 4.0)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

%load_ext autoreload
%autoreload 2

np.random.seed(1)

## 2 - Load and format data

In [2]:
def load_data_from_hfile(file, photos_name):
    """Load flattened images, numeric labels, and class names from an HDF5 file.

    The file is looked up recursively under the current working directory and
    the first match is used.

    Args:
        file: File name (or glob pattern) searched for under ``Path.cwd()``.
        photos_name: Name of the HDF5 dataset inside the file that holds the images.

    Returns:
        A tuple ``(images, labels, names)``:
        - ``images``: the image dataset reshaped to 2D (one row per image),
        - ``labels``: the ``'numeric_labels_photos'`` dataset as a NumPy array,
        - ``names``: the ``'fruit_names_legend'`` dataset decoded from UTF-8.

    Raises:
        FileNotFoundError: If nothing under the working directory matches ``file``.
    """
    # Walk the directory tree only once and stop at the first hit.
    hfile_path = next(Path.cwd().rglob(file), None)
    if hfile_path is None:
        # Fail loudly here: returning None would only surface later as an
        # unrelated "cannot unpack" TypeError at the call site.
        raise FileNotFoundError(f"{file} not found or does not exist.")

    # The context manager closes the file; no explicit close() is needed.
    with h5py.File(hfile_path, 'r') as hfile:
        images_unshape = np.array(hfile[photos_name][:])
        labels = np.array(hfile['numeric_labels_photos'][:])
        names = np.array([item.decode('utf-8') for item in hfile['fruit_names_legend'][:]])

    # Flatten every image to a single row: (n_images, n_features).
    images = images_unshape.reshape(images_unshape.shape[0], -1)
    return images, labels, names

# Training split
train_images, train_labels, train_names = load_data_from_hfile(
    "train_fruits_NEWEST_dataset.h5",
    'train_fruits_photos',
)

# Validation split
val_images, val_labels, val_names = load_data_from_hfile(
    "validation_fruits_NEWEST_dataset.h5",
    'validation_fruits_photos',
)

# Testing split
test_images, test_labels, test_names = load_data_from_hfile(
    "test_fruits_NEWEST_dataset.h5",
    'test_fruits_photos',
)

## 3- Design the Support Vector Machine (SVM)

### 3.1- Tune first SVM to find the best parameters

In [3]:
# NOTE: this search is expensive. The original single-process run over the full
# 64-candidate grid was reported to take ~800 minutes, so only run this cell
# when the SVM actually needs to be re-tuned.

# Tune the SVM with a halving grid search, selecting on accuracy.
# scikit-learn's SVC only uses `gamma` for the 'rbf', 'poly' and 'sigmoid'
# kernels, so the linear kernel gets its own sub-grid without `gamma`;
# crossing it with `gamma` would just refit identical models.
svm_c_values = [0.1, 1, 10, 100]
svm_gamma_values = [1, 0.1, 0.01, 0.001]
param_grid = [
    {'kernel': ['linear'], 'C': svm_c_values},
    {'kernel': ['poly', 'rbf', 'sigmoid'], 'C': svm_c_values, 'gamma': svm_gamma_values},
]

svm_grid = SVC(decision_function_shape='ovo')

grid_search = HalvingGridSearchCV(
    svm_grid,
    param_grid,
    scoring='accuracy',
    random_state=1,   # fixed seed so the resource subsampling does not depend on prior RNG use
    n_jobs=-1,        # evaluate candidates in parallel on all available cores
)
grid_search.fit(train_images, train_labels)

# Print the best parameters and score
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
best_index = grid_search.best_index_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Estimator:", best_estimator)
print("Best Index:", best_index)
print("Best Score:", best_score)

KeyboardInterrupt: 

### 3.2- Train and validate second SVM using the identified best parameters for one iteration

In [None]:
# Fit the final SVM on the training set using the best parameters from 3.1.
# NOTE(review): per the scikit-learn docs, `gamma` only applies to the
# 'rbf', 'poly' and 'sigmoid' kernels, so it has no effect with kernel='linear'.
svm_final = SVC(
    C=10,
    kernel='linear',
    gamma=0.1,
    decision_function_shape='ovo',
)
svm_final.fit(train_images, train_labels)

In [None]:
# Purpose of evaluating on the validation set: to use as a comparison measure
# against the test-set results.

# Predict on the validation set. The results are kept under validation-specific
# names so they can never be mistaken for (or silently reused as) the test-set
# predictions when cells are run out of order.
svm_val_pred = svm_final.predict(val_images)

# Print accuracy and classification report
svm_val_acc = accuracy_score(val_labels, svm_val_pred)
print(f"Accuracy: {svm_val_acc:.2f}")

print("Classification Report:")
print(classification_report(val_labels, svm_val_pred))

### 3.3- Test the final SVM

In [9]:
# Run the fitted SVM over the held-out test images
svm_final_pred = svm_final.predict(X=test_images)

### 3.4- Evaluate the results

In [None]:
# Overall accuracy of the SVM on the test set
svm_final_acc = accuracy_score(y_true=test_labels, y_pred=svm_final_pred)
print(f"Accuracy: {svm_final_acc:.2f}")

# Per-class precision / recall / F1 on the test set
svm_test_report = classification_report(y_true=test_labels, y_pred=svm_final_pred)
print("Classification Report:", svm_test_report, sep="\n")

In [None]:
# Confusion matrix of true vs. predicted test labels for the SVM
conf_matrix = confusion_matrix(y_true=test_labels, y_pred=svm_final_pred)

# Render the matrix as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
seaborn.heatmap(conf_matrix, annot=True, fmt='g', cmap='YlGnBu', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('SVM Confusion Matrix')
plt.show()

## 4- Design the Convolutional Neural Network (CNN)

### 4.1- Define CNN

In [12]:
# Per-image shape the CNN consumes
cnn_target_shape = (100, 100, 3)

# Turn each flattened row back into an image; -1 lets NumPy infer the image count
cnn_batch_shape = (-1, *cnn_target_shape)
train_images_reshaped = train_images.reshape(cnn_batch_shape)
val_images_reshaped = val_images.reshape(cnn_batch_shape)
test_images_reshaped = test_images.reshape(cnn_batch_shape)

In [13]:
# Seed TensorFlow's global RNG before any layer is created: np.random.seed does
# not cover TensorFlow, so without this the weight initialisation (and hence the
# reported metrics) changes on every run.
tf.random.set_seed(1)

# Build the neural network model: two conv/pool stages followed by a dense head
cnn = models.Sequential([
    # Input shape reuses cnn_target_shape so it always matches the reshaped images
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=cnn_target_shape),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    # One softmax output per class
    # NOTE(review): 36 is presumably the number of fruit classes — confirm against len(train_names)
    layers.Dense(36, activation='softmax'),
])

# Compile with the best parameters from fine-tunings
# (sparse categorical cross-entropy takes the integer labels directly)
cnn.compile(optimizer='adamax',
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy'])

### 4.2- Train and validate the CNN using the identified best parameters and epoch=7

In [None]:
# Fit the CNN for 7 epochs, reporting validation metrics after each epoch
cnn.fit(
    x=train_images_reshaped,
    y=train_labels,
    epochs=7,
    validation_data=(val_images_reshaped, val_labels),
)

### 4.3- Test with test set

In [None]:
# Loss and accuracy of the trained CNN on the test set
test_loss, test_acc = cnn.evaluate(x=test_images_reshaped, y=test_labels)

# Per-class scores for every test image, then the highest-scoring class index
cnn_test_pred = cnn.predict(test_images_reshaped)
cnn_pred_classes = cnn_test_pred.argmax(axis=1)

### 4.4- Evaluate the results

In [None]:
# Test accuracy as returned by cnn.evaluate
print(f'Accuracy: {test_acc}')

# Per-class precision / recall / F1 for the CNN on the test set
classification_rep = classification_report(y_true=test_labels, y_pred=cnn_pred_classes)
print("Classification Report:", classification_rep, sep="\n")

In [None]:
# Confusion matrix of true vs. predicted test labels for the CNN
conf_matrix = confusion_matrix(y_true=test_labels, y_pred=cnn_pred_classes)

# Render the matrix as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
seaborn.heatmap(conf_matrix, annot=True, fmt='g', cmap='YlGnBu', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('CNN Confusion Matrix')
plt.show()