
<h1> DS200A Computer Vision Assignment</h1>

<h2>  Part Three: Classifier training and performance assessment. </h2>	

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import KFold

# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): presumably meant to make train_test_split and the estimators that
# are created without a random_state repeatable between runs — confirm.
np.random.seed(42)

  from numpy.core.umath_tests import inner1d


#### Retrieving and preprocessing the training data

In [2]:
from ipynb.fs.defs.GradProject_NB2 import preprocess_part_one, preprocess_part_two

In [None]:
# Stage 1: load the training data, then hold out 10% of it as a validation split.
training_data = preprocess_part_one()
train_part, val_part = train_test_split(training_data, test_size=0.1)

# Stage 2: settings forwarded to preprocess_part_two (imported from GradProject_NB2).
k = 10
desc_limit = 10000
train, val = preprocess_part_two(train_part, val_part, k, desc_limit)

# Separate the 'Label' column (target) from the remaining columns (features).
train_y = train['Label']
train_x = train.drop(columns=['Label'])
val_y = val['Label']
val_x = val.drop(columns=['Label'])


[INFO] Beginning preprocessing part one
[INFO] Reading images
	- Fetching label 'airplanes'
	- Fetching label 'bear'
	- Fetching label 'blimp'
		- Gray image ('blimp_0022.jpg') was loaded, converting to RGB
	- Fetching label 'comet'
		- Gray image ('comet_0006.jpg') was loaded, converting to RGB
		- Gray image ('comet_0011.jpg') was loaded, converting to RGB
		- Gray image ('comet_0013.jpg') was loaded, converting to RGB
		- Gray image ('comet_0021.jpg') was loaded, converting to RGB
		- Gray image ('comet_0036.jpg') was loaded, converting to RGB
		- Gray image ('comet_0038.jpg') was loaded, converting to RGB
		- Gray image ('comet_0041.jpg') was loaded, converting to RGB
		- Gray image ('comet_0049.jpg') was loaded, converting to RGB
		- Gray image ('comet_0052.jpg') was loaded, converting to RGB
		- Gray image ('comet_0053.jpg') was loaded, converting to RGB
		- Gray image ('comet_0057.jpg') was loaded, converting to RGB
		- Gray image ('comet_0058.jpg') was loaded, converting to RGB

#### Scaling the data

In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both the training and the validation features.
scaler = MinMaxScaler().fit(train_x)
train_x_scaled = scaler.transform(train_x)
val_x_scaled = scaler.transform(val_x)

In [None]:
# Cross-validation
def rmse(actual_y, predicted_y):
    """
    Root mean square error between ground-truth values and predictions.

    Args:
        actual_y: ground-truth values (must support subtraction with predicted_y)
        predicted_y: predicted values; its length is used as the divisor

    Return:
        sqrt(sum((actual_y - predicted_y)^2) / len(predicted_y))
    """
    residuals = actual_y - predicted_y
    mean_squared_error = np.sum(residuals ** 2) / len(predicted_y)
    return np.sqrt(mean_squared_error)

def compute_CV_rmse_and_acc(model, X_train, Y_train, n_splits=5):
    '''
    Estimate a model's RMSE and accuracy with K-fold cross-validation.

    The rows are split into `n_splits` folds with KFold's default settings
    (no shuffling). For each fold the model is fitted on the remaining folds
    and scored on the held-out fold, so `n_splits` fits are performed in total.

    Note: `model` itself is fitted in place (no copy is made), so after this
    call it holds the fit from the last fold.

    Args:
        model: an sklearn model with fit and predict functions
        X_train (data_frame or array): Training features, one row per sample
        Y_train (data_frame or array): Labels; they are subtracted from the
            predictions by rmse(), so they must be numeric
        n_splits (int): Number of cross-validation folds (default 5)

    Return:
        (mean RMSE, mean accuracy) over the `n_splits` held-out folds.
    '''
    kf = KFold(n_splits=n_splits)
    validation_errors = []
    validation_accuracies = []

    for train_idx, valid_idx in kf.split(X_train):

        # Positional row selection; np.take is used so the same code path
        # serves whichever container type the caller passes in
        split_X_train = np.take(X_train, train_idx, axis=0)
        split_X_valid = np.take(X_train, valid_idx, axis=0)
        split_Y_train = np.take(Y_train, train_idx, axis=0)
        split_Y_valid = np.take(Y_train, valid_idx, axis=0)

        # Fit the model on the training split
        model.fit(split_X_train, split_Y_train)

        # Score the held-out split with both metrics
        preds = model.predict(split_X_valid)
        validation_errors.append(rmse(split_Y_valid, preds))
        validation_accuracies.append(accuracy_score(split_Y_valid, preds))

    return np.mean(validation_errors), np.mean(validation_accuracies)

#### Performing 5-fold cross validation to find optimal preprocessing parameters

In [None]:
import sys, os

# Stream that was installed as sys.stdout before blockPrint() silenced output;
# None while output is not blocked.
_saved_stdout = None

# Disable
def blockPrint():
    """Silence print() by redirecting sys.stdout to os.devnull.

    The stream being replaced is remembered so enablePrint() can restore exactly
    that stream. Calling this while output is already blocked does nothing, so
    no additional devnull handle is opened.
    """
    global _saved_stdout
    if _saved_stdout is None:
        _saved_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

# Restore
def enablePrint():
    """Undo blockPrint(): restore the previous sys.stdout and close devnull.

    Restores the stream saved by blockPrint() rather than sys.__stdout__, because
    the active sys.stdout may be a replacement stream (e.g. one installed by a
    notebook kernel) and sys.__stdout__ would not bring that back.
    Does nothing when output is not currently blocked.
    """
    global _saved_stdout
    if _saved_stdout is not None:
        devnull_handle = sys.stdout
        sys.stdout = _saved_stdout
        _saved_stdout = None
        devnull_handle.close()
def grid_search_preprocessing_parameters():
    """Print classifier scores for every combination of preprocessing settings.

    Iterates over all combinations of `k`, `down_sample` and `descriptor_limit`.
    For each combination the data is re-preprocessed, split 90/10 into training
    and validation parts, min-max scaled (scaler fitted on the training part
    only), and five classifiers are scored. For each classifier the function
    prints its type, the 5-fold cross-validation RMSE and accuracy on the scaled
    training part, and the accuracy on the held-out validation part.

    Cross-validation uses the scaled features, i.e. the same representation the
    model is fitted and tested on, so the printed numbers are comparable.

    NOTE(review): the meaning of the first two positional arguments passed to
    preprocess_part_one (None, True) is not visible here — confirm against
    GradProject_NB2.

    Return:
        None; results are reported through print().
    """
    # Factories rather than instances, so every parameter combination starts
    # from a fresh, unfitted estimator.
    model_factories = [
        lambda: LogisticRegression(max_iter=500),
        lambda: KNeighborsClassifier(10, weights='distance'),
        lambda: DecisionTreeClassifier(),
        lambda: RandomForestClassifier(n_estimators=800),
        lambda: SVC(kernel='rbf',C=10, gamma=0.01),
    ]

    for k in [10, 100, 200, 500]:
        for down_sample in [True, False]:
            for descriptor_limit in [6000, 10000, 30000]:
                training_data = preprocess_part_one(None, True, down_sample)
                print('-'*50)
                train_part, val_part = train_test_split(training_data, test_size=0.1)
                train, val = preprocess_part_two(train_part, val_part, k, descriptor_limit)

                train_x, train_y = train.drop(columns=['Label']), train['Label']
                val_x, val_y = val.drop(columns=['Label']), val['Label']

                scaler = MinMaxScaler()
                train_x_scaled = scaler.fit_transform(train_x)
                val_x_scaled = scaler.transform(val_x)

                print(f'**** k={k}, down_sample={down_sample}, decriptor_limit={descriptor_limit}****')

                for make_model in model_factories:
                    model = make_model()

                    # Predict the held-out part before cross-validating:
                    # compute_CV_rmse_and_acc re-fits `model` in place.
                    model.fit(train_x_scaled, train_y)
                    preds = model.predict(val_x_scaled)
                    print(type(model))

                    error, acc = compute_CV_rmse_and_acc(model, train_x_scaled, train_y)
                    print(f"Cross validation mean error: {error}")
                    print(f"Cross validation mean accuracy: {acc}")
                    print(f"Test accuracy: {accuracy_score(val_y, preds)}\n")

            

### Logistic Regression
##### Performing 5-fold cross-validation to decide on hyperparameters

In [None]:
# Multinomial logistic regression with an L2 penalty, solved with L-BFGS.
model = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=1000,
)

# Cross-validated RMSE and accuracy on the scaled training features.
error, acc = compute_CV_rmse_and_acc(model, train_x_scaled, train_y)
print(f"Mean RMSE: {error}", f"Accuracy: {acc}\n", sep="\n")

### K-nearest Neighbors
##### Predicting training set with 5-fold cross validation

In [None]:
# 10 nearest neighbours, with votes weighted by distance.
model = KNeighborsClassifier(weights='distance', n_neighbors=10)

# Cross-validated RMSE and accuracy on the scaled training features.
error, acc = compute_CV_rmse_and_acc(model, train_x_scaled, train_y)
print(f"Mean RMSE: {error}", f"Accuracy: {acc}\n", sep="\n")

### Classification Tree

In [None]:
# Single decision tree; random_state is fixed so repeated runs build the same tree.
model = DecisionTreeClassifier(
    random_state=42,
)

# Cross-validated RMSE and accuracy on the scaled training features.
error, acc = compute_CV_rmse_and_acc(model, train_x_scaled, train_y)
print(f"Mean RMSE: {error}", f"Accuracy: {acc}\n", sep="\n")

### Random Forest
##### Predicting training set with 5-fold cross validation

In [None]:
# Forest of 800 trees with a fixed random_state.
model = RandomForestClassifier(
    random_state=42,
    n_estimators=800,
)

# Cross-validated RMSE and accuracy on the scaled training features.
error, acc = compute_CV_rmse_and_acc(model, train_x_scaled, train_y)
print(f"Mean RMSE: {error}", f"Accuracy: {acc}\n", sep="\n")

In [None]:
# Rank the features by importance (largest first) and list the top 50.
# NOTE(review): `model` was last fitted inside compute_CV_rmse_and_acc, so these
# importances appear to come from the final cross-validation fold's fit rather
# than from a fit on the full training set — confirm that is intended.
s = model.feature_importances_
index_importance_sorted = sorted(range(len(s)), key=s.__getitem__, reverse=True)
top_index = index_importance_sorted[:50]

print("\nMost importance features:")
for index in top_index:
    feature_name = train_x.columns[index]
    print(f"Feature name: {feature_name}, Importance={s[index]}")

#### Performing a K-fold randomized search to find optimal parameters for Random Forest

In [None]:
# Show the parameters of the current model as a reference point for the search space.
print(model.get_params())

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest: 10 evenly spaced values from 200 to 2000
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' is not listed: for classifiers it means the same as 'sqrt', so it only
# duplicated that option, and newer scikit-learn releases no longer accept it.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (unlimited depth)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid: parameter name -> list of candidate values
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores.
# The cells below read rf_random.best_params_ and rf_random.best_estimator_, so
# the search has to be created and fitted here; otherwise they raise NameError
# on a fresh kernel.
# NOTE: this is expensive: 100 candidates x 3 folds = 300 forest fits, plus the
# final refit of the best candidate.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_x, train_y)

In [None]:
# Display the hyperparameter combination selected by the randomized search.
# `rf_random` must already exist as a fitted search object when this cell runs.
rf_random.best_params_

In [None]:
def evaluate(model, test_features, test_labels):
    """Print and return the accuracy of a fitted model on a labelled data set.

    Args:
        model: a fitted estimator exposing predict()
        test_features: features to predict on
        test_labels: ground-truth labels for test_features

    Return:
        The accuracy as a fraction; the printed value is the same number
        expressed as a percentage with two decimals.
    """
    preds = model.predict(test_features)
    # accuracy_score takes (y_true, y_pred): ground truth first, as in the
    # other accuracy_score calls in this notebook.
    accuracy = accuracy_score(test_labels, preds)
    print('Accuracy = {:0.2f}%.'.format(100*accuracy))
    return accuracy

# Baseline: a 10-tree forest fitted on the unscaled training features.
base_model = RandomForestClassifier(random_state=42, n_estimators=10)
base_model.fit(train_x, train_y)
base_accuracy = evaluate(base_model, val_x, val_y)

# Best estimator found by the randomized search, scored on the same validation split.
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, val_x, val_y)

# Change in accuracy relative to the baseline, in percent.
relative_gain = 100 * (random_accuracy - base_accuracy) / base_accuracy
print('Improvement of {:0.2f}%.'.format(relative_gain))


### Support Vector Machine

In [None]:
# RBF-kernel SVM (C=10, gamma=0.01) with a one-vs-one decision function shape.
model = SVC(
    kernel='rbf',
    C=10,
    gamma=0.01,
    decision_function_shape='ovo',
)

# Cross-validated RMSE and accuracy on the scaled training features.
error, acc = compute_CV_rmse_and_acc(model, train_x_scaled, train_y)
print(f"Mean RMSE: {error}", f"Accuracy: {acc}\n", sep="\n")

### Sources

* https://en.wikipedia.org/wiki/Logistic_regression
* https://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest
* https://scikit-learn.org/stable/modules/tree.html#tree
* https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
* https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
* https://en.wikipedia.org/wiki/Random_forest
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
* https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74