<h1> Training Classifiers and performance assessment</h1>

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import RandomizedSearchCV
import sys, os

np.random.seed(42)

  from numpy.core.umath_tests import inner1d


### Retrieving and preprocessing of data

In [6]:
from ipynb.fs.defs.feature_extraction_and_exploratory_data_analysis import get_preprocessed_feature_frame
from ipynb.fs.defs.transfer_learning_cnn import get_recall, get_precision

In [None]:
# Preprocessing data:
#  • Seperating data into train, validation and test data.
#     - 'training_data' is data used for hypertuning
#
#  • We will use 'validation_data' as our own test data, to evaluate the final models.
#     - 'validation_data' will be held aside until all hypertuning of both preprocessing parameters and model parameters is done.
#     - 'test_data' refers to the unlabeled images, which we will use our best model to predict labels for, and submitting these predictions.

# Preprocessing variables below is arbitrary, and only represents a default setting for our preprocessing parameters
downsample, k, desc_limit = False, 2, 1000

# This specifies the train/val split
validation_size = 0.1

training_data, validation_data, test_data = get_preprocessed_feature_frame(k=k, desc_limit=desc_limit, downsample=downsample, validation_size=validation_size)

In [None]:
print(f"Training data shape: {training_data.shape}")
print(f"Validation data shape: {validation_data.shape}")
print(f"Test data shape: {test_data.shape}")

In [3]:
#Fitting the PCA algorithm with our data, did not give promising results, therefore not using in final version of classification

def show_variance_explained_pca(data):
    """
    Showing cumulative variance explained by the principle components after performing pca on input dataframe
    """
    pca = PCA().fit(data)
    #Plotting the Cumulative Summation of the Explained Variance
    plt.figure()
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('Number of Components')
    plt.ylabel('Variance (%)') #for each component
    plt.title('Explained Variance')
    plt.show()
    
def fit_pca(pca, train_x, val_x):
    """
    Returning the transformed datasets using the provided pca
    Fitting using only training data, transforms both
    """
    train_x_pca = pca.fit_transform(train_x)
    val_x_pca = pca.transform(val_x)    
    return train_x_pca, test_x_pca

## Performing 5-fold cross validation to find optimal preprocessing parameters

In [4]:
def rmse(actual_y, predicted_y):
    """
    The root mean square error between the prediction and the ground truth
    """
    return np.sqrt(np.sum((actual_y - predicted_y)**2)/len(predicted_y))


def compute_CV_error_acc_rec_prec(model, X_train, Y_train):
    '''
    Split the training data into 5 subsets.
    For each subset:
        - Fit a model holding out that subset
        - Compute the RMSE on that subset (the validation set)

    @param models: List of sklearn models with fit and predict functions 
    @param X_train: X training data
    @param Y_train: Y training data
    
    @return: The average RMSE, Accuracy, Recall and Precision of these 5 folds.
    '''
    kf = KFold(n_splits=5, random_state=42)
    validation_errors = []
    validation_accuracies = []
    validation_recall = []
    validation_precision = []
    
    for train_idx, valid_idx in kf.split(X_train):
        
        # Split the data
        split_X_train, split_X_valid = np.take(X_train, train_idx, axis=0), np.take(X_train, valid_idx, axis=0)
        split_Y_train, split_Y_valid = np.take(Y_train, train_idx, axis=0), np.take(Y_train, valid_idx, axis=0)
        
        # Fit the model on the training split
        model.fit(split_X_train, split_Y_train)
        
        # Compute the RMSE on the validation split
        preds = model.predict(split_X_valid)
        
        # Computing metrics
        error = rmse(split_Y_valid, preds)
        acc = accuracy_score(split_Y_valid, preds)
        recall = np.mean(get_recall(preds, split_Y_valid))
        precision = np.mean(get_precision(preds, split_Y_valid))
        
        validation_errors.append(error)
        validation_accuracies.append(acc)
        validation_recall.append(recall)
        validation_precision.append(precision)
        
    return np.mean(validation_errors), np.mean(validation_accuracies), np.mean(validation_recall), np.mean(validation_precision)

In [5]:
# Cross-validation
def compute_multimodel_CV_rmse_and_acc(models, training_df, k, down_sample, desc_limit):
    '''
    Split the training data into 5 subsets.
    For each subset:
        - Fit a model holding out that subset
        - Compute the RMSE on that subset (the validation set)
        
    You should be fitting 5 models total.

    @param models: List of sklearn models with fit and predict functions 
    @param training_df: Training dataframe: Training data
    @param k: K for KMeans
    @param down_sample: Whether or not to down-sample
    @param desc_limit: Descriptor limit
    
    @return: The average RMSE, Accuracy, Recall and Precision of these 5 folds.
    '''
    kf = KFold(n_splits=5, random_state=42)
    validation_accuracies = [[] for _ in range(len(models))]
    
    for train_idx, valid_idx in kf.split(training_df):
        
        # Split the data
        split_train = training_df.iloc[train_idx]
        split_valid = training_df.iloc[valid_idx]
        
        train, _, val = get_preprocessed_feature_frame(split_train, split_valid, validation_size=0.0, k=k, desc_limit=desc_limit)
        
        train_x, train_y = train.drop(columns=['Label', 'Image_Id', 'Scaled_Image', 'Image']), train['Label']
        val_x, val_y = val.drop(columns=['Label', 'Image_Id', 'Scaled_Image', 'Image']), val['Label']
        
        
        scaler = MinMaxScaler()
        train_x_scaled = scaler.fit_transform(train_x)
        val_x_scaled = scaler.transform(val_x)
        
        for i, model in enumerate(models):
            model.fit(train_x_scaled, train_y)
            preds = model.predict(val_x_scaled)

            acc = accuracy_score(preds, val_y)
            validation_accuracies[i].append(acc)
    
    for i in range(len(models)):
        print(type(models[i]))
        print(f"Cross validation mean accuracy: {np.mean(validation_accuracies[i])}")
        
    return validation_accuracies

### Grid search for finding preprocessing parameters

In [6]:
def grid_search_preprocessing_parameters(train_data):
    """
    Performing gridsearch with five-fold cross-validation to find optimal preprocessing for all classifiers.
    """
    accs = []
    for k in [10, 50, 100]:
        for down_sample in [True, False]:
            for descriptor_limit in [5000, 10000, 12000, 30000]:
                
                print(f'**** k={k}, down_sample={down_sample}, decriptor_limit={descriptor_limit} ****')
                
                
                models = [LogisticRegression(max_iter=1000), 
                          KNeighborsClassifier(10, weights='distance'),
                          DecisionTreeClassifier(),
                          RandomForestClassifier(n_estimators=800),
                          SVC(kernel='rbf',C=10, gamma=0.01)]
                data = train_data.copy()[['Image_Id', 'Image', 'Label']]
                accs.append(compute_multimodel_CV_rmse_and_acc(models, data, k, down_sample, descriptor_limit))
    
    return accs
                
                
#grid_search_preprocessing_parameters(training_data)

### Results from grid-search for optimal preprocessing

After performing grid search to find optimal preprocessing parameters on base models (without tuned hyper parameters), we have the following results:

| Model               | K   | Under-sampling | Descriptor limit | Achieved CV Acc |
|---------------------|-----|----------------|------------------|-----------------|
| Logistic Regression | 100 | True           | 12,000           | 0.440           |
| k-NN                | 100 | False          | 12,000           | 0.334           |
| Decision Tree       | 100 | False          | 5,000            | 0.301           |
| Random Forest       | 100 | False          | 12,000           | 0.466           |
| SVM                 | 100 | False          | 5,000            | 0.423           |

## Tuning hyperparameters for classifiers

In [7]:
def get_optimal_hyperparameters(base_model, random_grid, train_x, train_y):
    """
    Performing a randomized grid search with 3-fold cross validation to find optimal hyperparameters for given model.
    Random grid contains the list of parameters to try.
    """
    model_random = RandomizedSearchCV(estimator = base_model, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=1, random_state=42, n_jobs = -1)
    model_random.fit(train_x, train_y)
    return model_random
    

In [8]:
def get_train_val_data(k, desc_limit, downsample):
    """
    Returning training and validation data, given a set of preprocessing parameters.
    The validation data will always contain the same images.
    """
    train_data, val_data, _ = get_preprocessed_feature_frame(k=k, desc_limit=desc_limit, downsample=downsample, validation_size=0.1)
    train_x, train_y = train_data.drop(columns=['Label', 'Image_Id', 'Scaled_Image', 'Image']), train_data['Label']
    val_x, val_y = val_data.drop(columns=['Label', 'Image_Id', 'Scaled_Image', 'Image']), val_data['Label']
    
    scaler = StandardScaler()
    train_x_scaled = scaler.fit_transform(train_x)
    val_x_scaled = scaler.transform(val_x)
    
    return train_x_scaled, train_y, val_x_scaled, val_y

### Logistic Regression

Using preprocessing parameters:
* K: 100
* Down-sample: True
* Descriptor limit: 12,000

In [9]:
train_x, train_y, val_x, val_y = get_train_val_data(k=100, desc_limit=12000, downsample=True)

[INFO] Beginning preprocessing part one
[INFO] Reading training images
	 [100.0 %] Fetching label 'zebra'                 
[INFO] Reading testing images
	 [100.0 %] Fetching 716 images'
[INFO] Downsampling training images..
[INFO] Denoising images
[INFO] Trimming images
[INFO] Scaling images
[INFO] Calculating mean and standard deviation for HOG feature
[INFO] Calculating mean of Harris Corner response map
[INFO] Adding features: size, aspect_ratio, red-, green-, blue- and gray-intensity
[INFO] Adding color histogram features
[INFO] Adding stride features
[INFO] Denoising images
[INFO] Trimming images
[INFO] Scaling images
[INFO] Calculating mean and standard deviation for HOG feature
[INFO] Calculating mean of Harris Corner response map
[INFO] Adding features: size, aspect_ratio, red-, green-, blue- and gray-intensity
[INFO] Adding color histogram features
[INFO] Adding stride features
[INFO] Denoising images
[INFO] Trimming images
[INFO] Scaling images
[INFO] Calculating mean and sta

##### Random Grid Search

In [10]:
penalty = ['l2']
solver = ['lbfgs', 'newton-cg', 'sag', 'saga']
C = np.logspace(-4, 4, 20)
multi_class = ['multinomial', 'ovr']
class_weight = [None, 'balanced']

# Create the random grid
random_grid = {'penalty': penalty,
               'solver': solver,
               'C': C,
               'multi_class': multi_class,
               'class_weight': class_weight}

base_model = LogisticRegression()
logreg_optimal = get_optimal_hyperparameters(base_model, random_grid, train_x, train_y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   34.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.2min finished


In [11]:
print(f'Optimal parameters for model:\n{logreg_optimal.best_params_}')
print(f'Mean cross-validated score of the best_estimator: {logreg_optimal.best_score_}')

Optimal parameters for model:
{'solver': 'saga', 'penalty': 'l2', 'multi_class': 'multinomial', 'class_weight': None, 'C': 29.763514416313132}
Mean cross-validated score of the best_estimator: 0.44076655052264807


##### Evaluating optimal model with  5-fold cross validation

In [12]:
logreg_optimal_model = logreg_optimal.best_estimator_
error, acc, recall, prec = compute_CV_error_acc_rec_prec(logreg_optimal_model, train_x, train_y)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}")
print(f"Recall: {recall}")
print(f"Precision: {prec}")

Mean RMSE: 6.071572810656048
Accuracy: 0.43901651794190244
Recall: 0.43155606007208547
Precision: 0.418413309187222


##### Evaluating optimal model on separate validation ("test") set

In [13]:
logreg_optimal_model.fit(train_x, train_y)
preds = logreg_optimal_model.predict(val_x)
acc = accuracy_score(preds, val_y)
recall = np.mean(get_recall(preds, val_y))
precision = np.mean(get_precision(preds, val_y))

print(f"Test accuracy: {acc}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")

Test accuracy: 0.3984375
Recall: 0.37735750295717374
Precision: 0.3948733203352865


### K-nearest Neighbors

Using preprocessing parameters:
* K: 100
* Down-sample: False
* Descriptor limit: 12,000

In [14]:
train_x, train_y, val_x, val_y = get_train_val_data(k=100, desc_limit=12000, downsample=False)

[INFO] Beginning preprocessing part one
[INFO] Reading training images
	 [100.0 %] Fetching label 'zebra'                 
[INFO] Reading testing images
	 [100.0 %] Fetching 716 images'
[INFO] Denoising images
[INFO] Trimming images
[INFO] Scaling images
[INFO] Calculating mean and standard deviation for HOG feature
[INFO] Calculating mean of Harris Corner response map
[INFO] Adding features: size, aspect_ratio, red-, green-, blue- and gray-intensity
[INFO] Adding color histogram features
[INFO] Adding stride features
[INFO] Denoising images
[INFO] Trimming images
[INFO] Scaling images
[INFO] Calculating mean and standard deviation for HOG feature
[INFO] Calculating mean of Harris Corner response map
[INFO] Adding features: size, aspect_ratio, red-, green-, blue- and gray-intensity
[INFO] Adding color histogram features
[INFO] Adding stride features
[INFO] Denoising images
[INFO] Trimming images
[INFO] Scaling images
[INFO] Calculating mean and standard deviation for HOG feature
[INFO]

##### Random Grid Search

In [15]:
n_neighbors=[i for i in range(1, 30, 2)]
weights = ['distance', 'uniform']
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
p = [i for i in range(1, 5)]

# Create the random grid
random_grid = {'n_neighbors': n_neighbors,
               'weights': weights,
               'algorithm': algorithm,
               'p': p}

base_model = KNeighborsClassifier()
knn_optimal = get_optimal_hyperparameters(base_model, random_grid, train_x, train_y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   44.8s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.8min finished


In [16]:
print(f'Optimal parameters for model:\n{knn_optimal.best_params_}')
print(f'Mean cross-validated score of the best_estimator: {knn_optimal.best_score_}')

Optimal parameters for model:
{'weights': 'distance', 'p': 1, 'n_neighbors': 27, 'algorithm': 'auto'}
Mean cross-validated score of the best_estimator: 0.38296296296296295


##### Evaluating optimal model with  5-fold cross validation

In [17]:
knn_optimal_model = knn_optimal.best_estimator_
error, acc, recall, prec = compute_CV_error_acc_rec_prec(knn_optimal_model, train_x, train_y)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}")
print(f"Recall: {recall}")
print(f"Precision: {prec}")

Mean RMSE: 6.104989021357785
Accuracy: 0.37851851851851853
Recall: 0.32909701414022774
Precision: 0.3955269805386118


##### Evaluating optimal model on separate validation ("test") set

In [18]:
knn_optimal_model.fit(train_x, train_y)
preds = knn_optimal_model.predict(val_x)
acc = accuracy_score(preds, val_y)
recall = np.mean(get_recall(preds, val_y))
precision = np.mean(get_precision(preds, val_y))

print(f"Test accuracy: {acc}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")

Test accuracy: 0.33112582781456956
Recall: 0.3097817454772094
Precision: 0.34377966839951635


### Classification Tree

Using preprocessing parameters:
* K: 100
* Down-sample: False
* Descriptor limit: 5,000

In [19]:
train_x, train_y, val_x, val_y = get_train_val_data(k=100, desc_limit=5000, downsample=False)

[INFO] Beginning preprocessing part one
[INFO] Reading training images
	 [100.0 %] Fetching label 'zebra'                 
[INFO] Reading testing images
	 [100.0 %] Fetching 716 images'
[INFO] Denoising images
[INFO] Trimming images
[INFO] Scaling images
[INFO] Calculating mean and standard deviation for HOG feature
[INFO] Calculating mean of Harris Corner response map
[INFO] Adding features: size, aspect_ratio, red-, green-, blue- and gray-intensity
[INFO] Adding color histogram features
[INFO] Adding stride features
[INFO] Denoising images
[INFO] Trimming images
[INFO] Scaling images
[INFO] Calculating mean and standard deviation for HOG feature
[INFO] Calculating mean of Harris Corner response map
[INFO] Adding features: size, aspect_ratio, red-, green-, blue- and gray-intensity
[INFO] Adding color histogram features
[INFO] Adding stride features
[INFO] Denoising images
[INFO] Trimming images
[INFO] Scaling images
[INFO] Calculating mean and standard deviation for HOG feature
[INFO]

##### Random Grid Search

In [25]:
criterion=['gini', 'entropy']
splitter = ['best', 'random']
max_features = ['auto', 'sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

# Create the random grid
random_grid = {'criterion': criterion,
               'splitter': splitter,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

base_model = DecisionTreeClassifier()
dt_optimal = get_optimal_hyperparameters(base_model, random_grid, train_x, train_y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    3.4s finished


In [26]:
print(f'Optimal parameters for model:\n{dt_optimal.best_params_}')
print(f'Mean cross-validated score of the best_estimator: {dt_optimal.best_score_}')

Optimal parameters for model:
{'splitter': 'best', 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None, 'criterion': 'gini'}
Mean cross-validated score of the best_estimator: 0.2659259259259259


##### Evaluating optimal model with  5-fold cross validation

In [28]:
dt_optimal_model = dt_optimal.best_estimator_
error, acc, recall, prec = compute_CV_error_acc_rec_prec(dt_optimal_model, train_x, train_y)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}")
print(f"Recall: {recall}")
print(f"Precision: {prec}")

Mean RMSE: 6.93302199941984
Accuracy: 0.23851851851851852
Recall: 0.20386586244493232
Precision: 0.2094802008063556


##### Evaluating optimal model on separate validation ("test") set

In [29]:
dt_optimal_model.fit(train_x, train_y)
preds = dt_optimal_model.predict(val_x)
acc = accuracy_score(preds, val_y)
recall = np.mean(get_recall(preds, val_y))
precision = np.mean(get_precision(preds, val_y))

print(f"Test accuracy: {acc}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")

Test accuracy: 0.25165562913907286
Recall: 0.2577579359298161
Precision: 0.22700396792187982


### Random Forest

Using preprocessing parameters:
* K: 100
* Down-sample: False
* Descriptor limit: 12,000

In [30]:
train_x, train_y, val_x, val_y = get_train_val_data(k=100, desc_limit=12000, downsample=False)

[INFO] Beginning preprocessing part one
[INFO] Reading training images
	 [100.0 %] Fetching label 'zebra'                 
[INFO] Reading testing images
	 [100.0 %] Fetching 716 images'
[INFO] Denoising images
[INFO] Trimming images
[INFO] Scaling images
[INFO] Calculating mean and standard deviation for HOG feature
[INFO] Calculating mean of Harris Corner response map
[INFO] Adding features: size, aspect_ratio, red-, green-, blue- and gray-intensity
[INFO] Adding color histogram features
[INFO] Adding stride features
[INFO] Denoising images
[INFO] Trimming images
[INFO] Scaling images
[INFO] Calculating mean and standard deviation for HOG feature
[INFO] Calculating mean of Harris Corner response map
[INFO] Adding features: size, aspect_ratio, red-, green-, blue- and gray-intensity
[INFO] Adding color histogram features
[INFO] Adding stride features
[INFO] Denoising images
[INFO] Trimming images
[INFO] Scaling images
[INFO] Calculating mean and standard deviation for HOG feature
[INFO]

##### Random Grid Search

In [31]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)] 
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] 
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

base_model = RandomForestClassifier()
rf_optimal = get_optimal_hyperparameters(base_model, random_grid, train_x, train_y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 16.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 19.1min finished


In [32]:
print(f'Optimal parameters for model:\n{rf_optimal.best_params_}')
print(f'Mean cross-validated score of the best_estimator: {rf_optimal.best_score_}')

Optimal parameters for model:
{'n_estimators': 800, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': False}
Mean cross-validated score of the best_estimator: 0.48148148148148145


##### Evaluating optimal model with  5-fold cross validation

In [33]:
rf_optimal_model = rf_optimal.best_estimator_
error, acc, recall, prec = compute_CV_error_acc_rec_prec(rf_optimal_model, train_x, train_y)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}")
print(f"Recall: {recall}")
print(f"Precision: {prec}")

Mean RMSE: 5.430012725761354
Accuracy: 0.47703703703703704
Recall: 0.43209351177628885
Precision: 0.4864141379478788


##### Evaluating optimal model on separate validation ("test") set

In [34]:
rf_optimal_model.fit(train_x, train_y)
preds = rf_optimal_model.predict(val_x)
acc = accuracy_score(preds, val_y)
recall = np.mean(get_recall(preds, val_y))
precision = np.mean(get_precision(preds, val_y))

print(f"Test accuracy: {acc}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")

Test accuracy: 0.5231788079470199
Recall: 0.5106746021792027
Precision: 0.5226237144380881


### Support Vector Machine

Using preprocessing parameters:
* K: 100
* Down-sample: False
* Descriptor limit: 5,000

In [35]:
train_x, train_y, val_x, val_y = get_train_val_data(k=100, desc_limit=5000, downsample=False)

[INFO] Beginning preprocessing part one
[INFO] Reading training images
	 [100.0 %] Fetching label 'zebra'                 
[INFO] Reading testing images
	 [100.0 %] Fetching 716 images'
[INFO] Denoising images
[INFO] Trimming images
[INFO] Scaling images
[INFO] Calculating mean and standard deviation for HOG feature
[INFO] Calculating mean of Harris Corner response map
[INFO] Adding features: size, aspect_ratio, red-, green-, blue- and gray-intensity
[INFO] Adding color histogram features
[INFO] Adding stride features
[INFO] Denoising images
[INFO] Trimming images
[INFO] Scaling images
[INFO] Calculating mean and standard deviation for HOG feature
[INFO] Calculating mean of Harris Corner response map
[INFO] Adding features: size, aspect_ratio, red-, green-, blue- and gray-intensity
[INFO] Adding color histogram features
[INFO] Adding stride features
[INFO] Denoising images
[INFO] Trimming images
[INFO] Scaling images
[INFO] Calculating mean and standard deviation for HOG feature
[INFO]

##### Random Grid Search

In [36]:
C = np.logspace(-4, 4, 20)
kernel = ['rbf', 'linear', 'poly', 'sigmoid'] 
gamma = ['scale', 'auto', 0.0001, 0.001, 0.01, 0.1, 1]
shrinking = [True, False]
class_weight = [None,'balanced']
decision_function_shape = ['ovo', 'ovr']

# Create the random grid
random_grid = {'C': C,
               'kernel': kernel,
               'gamma': gamma,
               'shrinking': shrinking,
               'class_weight': class_weight,
               'decision_function_shape': decision_function_shape}

base_model = SVC()
svc_optimal = get_optimal_hyperparameters(base_model, random_grid, train_x, train_y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   51.0s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.4min finished


In [37]:
print(f'Optimal parameters for model:\n{svc_optimal.best_params_}')
print(f'Mean cross-validated score of the best_estimator: {svc_optimal.best_score_}')

Optimal parameters for model:
{'shrinking': False, 'kernel': 'linear', 'gamma': 'auto', 'decision_function_shape': 'ovo', 'class_weight': None, 'C': 0.004832930238571752}
Mean cross-validated score of the best_estimator: 0.45185185185185184


##### Evaluating optimal model with  5-fold cross validation

In [38]:
svc_optimal_model = svc_optimal.best_estimator_
error, acc, recall, prec = compute_CV_error_acc_rec_prec(svc_optimal_model, train_x, train_y)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}")
print(f"Recall: {recall}")
print(f"Precision: {prec}")

Mean RMSE: 5.743792808403207
Accuracy: 0.43851851851851853
Recall: 0.3949599648185278
Precision: 0.4400294290263521


##### Evaluating optimal model on separate validation ("test") set

In [39]:
svc_optimal_model.fit(train_x, train_y)
preds = svc_optimal_model.predict(val_x)
acc = accuracy_score(preds, val_y)
recall = np.mean(get_recall(preds, val_y))
precision = np.mean(get_precision(preds, val_y))

print(f"Test accuracy: {acc}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")

Test accuracy: 0.45695364238410596
Recall: 0.45248015787922197
Precision: 0.4513556610752848


# Predicting provided test set

Here we are predicting the unlabeled testing set, provided with the assignment.
We will be predicting using our RandomForest model with tuned hyperparameters, which performed best with an cross-validation accuracy of $0.477$ and test accuracy of $0.523$.

In [40]:
# Using preprocessing parameters that, found through cross-validation, is most optimal for our best performing model
k, desc_limit, downsample, validation_size = 100, 12000, False, 0.0
training_data, _, test_data = get_preprocessed_feature_frame(k=k, desc_limit=desc_limit, downsample=downsample, validation_size=validation_size,  data_augmentation=False)

[INFO] Beginning preprocessing part one
[INFO] Reading training images
	 [100.0 %] Fetching label 'zebra'                 
[INFO] Reading testing images
	 [100.0 %] Fetching 716 images'
[INFO] Denoising images
[INFO] Trimming images
[INFO] Scaling images
[INFO] Calculating mean and standard deviation for HOG feature
[INFO] Calculating mean of Harris Corner response map
[INFO] Adding features: size, aspect_ratio, red-, green-, blue- and gray-intensity
[INFO] Adding color histogram features
[INFO] Adding stride features
[INFO] Denoising images
[INFO] Trimming images
[INFO] Scaling images
[INFO] Calculating mean and standard deviation for HOG feature
[INFO] Calculating mean of Harris Corner response map
[INFO] Adding features: size, aspect_ratio, red-, green-, blue- and gray-intensity
[INFO] Adding color histogram features
[INFO] Adding stride features
[INFO] Getting descriptors
[INFO] Fitting KMeans with k=100 to training descriptors
[INFO] Adding cluster features
[INFO] Done preprocessi

In [41]:
print(f"Training set shape: {training_data.shape}")

(1501, 676)

In [42]:
train_x, train_y = training_data.drop(columns=['Label', 'Image_Id', 'Scaled_Image', 'Image']), training_data['Label']
test_x = test_data.drop(columns=['Image_Id', 'Scaled_Image', 'Image'])

scaler = StandardScaler()
train_x_scaled = scaler.fit_transform(train_x)
test_x_scaled = scaler.transform(test_x)

In [61]:
model = rf_optimal_model
model.fit(train_x_scaled, train_y)

preds = model.predict(test_x_scaled)

In [63]:
prediction_df = pd.DataFrame(preds, columns=["Label"])
prediction_df.to_csv('predictions.csv', header=False, index=False)

prediction_df.head(10)

Unnamed: 0,Label
0,14
1,8
2,14
3,3
4,3
5,6
6,14
7,9
8,14
9,9
