
<h1> DS200A Computer Vision Assignment</h1>

<h2>  Part Three: Classifier training and performance assessment. </h2>	

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import KFold
np.random.seed(42)

  from numpy.core.umath_tests import inner1d


In [2]:
#from google.colab import drive
#drive.mount('/content/drive', force_remount=True)

#### Retrieving and preprocessing of the training data

In [5]:
from ipynb.fs.defs.GradProject_NB2 import feature_frame
from ipynb.fs.defs.GradProject_NB4 import get_precision, get_recall, get_accuracy

In [195]:
data = feature_frame()
y = data['Label']
x = data.drop(columns=['Label', 'Image_Id'])

Fetching label '.DS_Store'
Fetching label 'airplanes'
Fetching label 'bear'
Fetching label 'blimp'
	Gray image ('blimp_0022.jpg') was loaded, converting to RGB
Fetching label 'comet'
	Gray image ('comet_0006.jpg') was loaded, converting to RGB
	Gray image ('comet_0011.jpg') was loaded, converting to RGB
	Gray image ('comet_0013.jpg') was loaded, converting to RGB
	Gray image ('comet_0021.jpg') was loaded, converting to RGB
	Gray image ('comet_0036.jpg') was loaded, converting to RGB
	Gray image ('comet_0038.jpg') was loaded, converting to RGB
	Gray image ('comet_0041.jpg') was loaded, converting to RGB
	Gray image ('comet_0049.jpg') was loaded, converting to RGB
	Gray image ('comet_0052.jpg') was loaded, converting to RGB
	Gray image ('comet_0053.jpg') was loaded, converting to RGB
	Gray image ('comet_0057.jpg') was loaded, converting to RGB
	Gray image ('comet_0058.jpg') was loaded, converting to RGB
Fetching label 'crab'
	Gray image ('crab_0045.jpg') was loaded, converting to RGB
Fet

#### Splitting training data

In [240]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Min-Max scaling data
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [287]:
# Cross-validation
def rmse(actual_y, predicted_y):
    """
    The root mean square error between the prediction and the ground truth
    """
    return np.sqrt(np.sum((actual_y - predicted_y)**2)/len(predicted_y))

def compute_CV_rmse_and_acc(model, X_train, Y_train):
    '''
    Split the training data into 5 subsets.
    For each subset, 
        fit a model holding out that subset
        compute the MSE on that subset (the validation set)
    You should be fitting 5 models total.
    Return the average MSE of these 5 folds.

    Args:
        model: an sklearn model with fit and predict functions 
        X_train (data_frame): Training data
        Y_train (data_frame): Label 

    Return:
        the average validation error and accuracy for the 5 splits.
    '''
    kf = KFold(n_splits=5)
    validation_errors = []
    validation_accuracies = []
    
    for train_idx, valid_idx in kf.split(X_train):
        
        # Split the data
        split_X_train, split_X_valid = np.take(X_train, train_idx, axis=0), np.take(X_train, valid_idx, axis=0)
        split_Y_train, split_Y_valid = np.take(Y_train, train_idx, axis=0), np.take(Y_train, valid_idx, axis=0)
        
        # Fit the model on the training split
        model.fit(split_X_train, split_Y_train)
        
        # Compute the RMSE on the validation split
        preds = model.predict(split_X_valid)
        error = rmse(split_Y_valid, preds)
        acc = accuracy_score(split_Y_valid, preds)
        
        validation_errors.append(error)
        validation_accuracies.append(acc)
        
    return np.mean(validation_errors), np.mean(validation_accuracies)

### Logistic Regression
##### Fitting model to training set with 5-fold cross validation

In [290]:
model = LogisticRegression(multi_class='multinomial', solver= 'lbfgs', penalty='l2', max_iter=1000)

error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

Mean RMSE: 6.661046578704598
Accuracy: 0.3475



##### Predicting test set

In [291]:
model = LogisticRegression(multi_class='multinomial', solver= 'newton-cg')

model.fit(x_train_scaled, y_train)
preds = model.predict(x_test_scaled)
print(f"Mean RMSE: {rmse(y_test, preds)}")
print(f"Test accuracy: {accuracy_score(y_test, preds)}\n")

Mean RMSE: 6.2955977869473685
Test accuracy: 0.3388704318936877



### K-nearest Neighbors
##### Predicting training set with 5-fold cross validation

In [292]:
model = KNeighborsClassifier(n_neighbors=10, weights='distance')

error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

Mean RMSE: 6.859133726178034
Accuracy: 0.2891666666666667



##### Predicting test set

In [293]:
model = KNeighborsClassifier(10, weights='distance')

model.fit(x_train_scaled, y_train)
preds = model.predict(x_test_scaled)
print(f"Mean RMSE: {rmse(y_test, preds)}")
print(f"Test accuracy: {accuracy_score(y_test, preds)}\n")

Mean RMSE: 6.592807920686736
Test accuracy: 0.27906976744186046



### Classification Tree
##### Predicting training set with 5-fold cross validation

In [294]:
model = DecisionTreeClassifier()

error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

Mean RMSE: 6.895212250220446
Accuracy: 0.225



##### Predicting test set

In [295]:
model = DecisionTreeClassifier()

model.fit(x_train_scaled, y_train)
preds = model.predict(x_test_scaled)
print(f"Mean RMSE: {rmse(y_test, preds)}")
print(f"Test accuracy: {accuracy_score(y_test, preds)}\n")

Mean RMSE: 6.521622609580688
Test accuracy: 0.3089700996677741



### Random Forest
##### Predicting training set with 5-fold cross validation

In [296]:
model = RandomForestClassifier(n_estimators=500)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

s = model.feature_importances_
index_importance_sorted = sorted(range(len(s)), key=lambda k: s[k], reverse=True)
top_index = index_importance_sorted[:15]

print("\nMost importance features:")
for index in top_index:
    print(f"Feature name: {x_train.columns[index]}, Importance={s[index]}")

Mean RMSE: 6.934267455506964
Accuracy: 0.07416666666666667


Most importance features:
Feature name: stride_feat_54, Importance=0.004889143313085988
Feature name: stride_feat_73, Importance=0.004629313685160276
Feature name: stride_feat_74, Importance=0.004492257667117635
Feature name: stride_feat_49, Importance=0.004385448282853304
Feature name: stride_feat_28, Importance=0.004384583985446212
Feature name: stride_feat_97, Importance=0.00436587231331786
Feature name: AspectRatio, Importance=0.0043648847306119185
Feature name: stride_feat_72, Importance=0.004358104816712034
Feature name: stride_feat_91, Importance=0.004302289851897213
Feature name: stride_feat_108, Importance=0.004215192398012875
Feature name: stride_feat_56, Importance=0.004200022683300562
Feature name: stride_feat_111, Importance=0.004192143923326466
Feature name: stride_feat_124, Importance=0.0041783881997675566
Feature name: stride_feat_140, Importance=0.0041588178861251416
Feature name: stride_feat_50, Importance=0

##### Predicting test set

In [297]:
model = RandomForestClassifier(n_estimators=500)

model.fit(x_train_scaled, y_train)
preds = model.predict(x_test_scaled)
print(f"Mean RMSE: {rmse(y_test, preds)}")
print(f"Test accuracy: {accuracy_score(y_test, preds)}\n")

Mean RMSE: 6.772771359915209
Test accuracy: 0.07641196013289037



### Support Vector Machine
##### Predicting training set with 5-fold cross validation

In [303]:
model = SVC(kernel='rbf',C=10, gamma=0.01, decision_function_shape='ovo')

error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

Mean RMSE: 6.522211213377022
Accuracy: 0.08416666666666667



##### Predicting test set

In [299]:
model = SVC(kernel='rbf',C=10, gamma=0.01)

model.fit(x_train_scaled, y_train)
preds = model.predict(x_test_scaled)
print(f"Mean RMSE: {rmse(y_test, preds)}")
print(f"Test accuracy: {accuracy_score(y_test, preds)}\n")

Mean RMSE: 6.4853527787200465
Test accuracy: 0.0664451827242525



### Sources

* https://en.wikipedia.org/wiki/Logistic_regression
* https://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest
* https://scikit-learn.org/stable/modules/tree.html#tree
* https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
* https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
* https://en.wikipedia.org/wiki/Random_forest
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
* 