
<h1> DS200A Computer Vision Assignment</h1>

<h2>  Part Three: Classifier training and performance assessment. </h2>	

In [190]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import KFold

In [191]:
#from google.colab import drive
#drive.mount('/content/drive', force_remount=True)

#### Retrieving and preprocessing of the training data

In [192]:
from ipynb.fs.defs.GradProject_NB2 import feature_frame

In [193]:
data = feature_frame()
y = data['Label']
x = data.drop(columns=['Label', 'Image_Id'])

Fetching label '.DS_Store'
Fetching label 'airplanes'
Fetching label 'bear'
Fetching label 'blimp'
	Gray image ('blimp_0022.jpg') was loaded, converting to RGB
Fetching label 'comet'
	Gray image ('comet_0006.jpg') was loaded, converting to RGB
	Gray image ('comet_0011.jpg') was loaded, converting to RGB
	Gray image ('comet_0013.jpg') was loaded, converting to RGB
	Gray image ('comet_0021.jpg') was loaded, converting to RGB
	Gray image ('comet_0036.jpg') was loaded, converting to RGB
	Gray image ('comet_0038.jpg') was loaded, converting to RGB
	Gray image ('comet_0041.jpg') was loaded, converting to RGB
	Gray image ('comet_0049.jpg') was loaded, converting to RGB
	Gray image ('comet_0052.jpg') was loaded, converting to RGB
	Gray image ('comet_0053.jpg') was loaded, converting to RGB
	Gray image ('comet_0057.jpg') was loaded, converting to RGB
	Gray image ('comet_0058.jpg') was loaded, converting to RGB
Fetching label 'crab'
	Gray image ('crab_0045.jpg') was loaded, converting to RGB
Fet

#### Splitting training data

In [178]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Min-Max scaling data
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [179]:
# Cross-validation
def rmse(actual_y, predicted_y):
    """
    The root mean square error between the prediction and the ground truth
    """
    return np.sqrt(np.sum((actual_y - predicted_y)**2)/len(predicted_y))

def compute_CV_rmse_and_acc(model, X_train, Y_train):
    '''
    Split the training data into 5 subsets.
    For each subset, 
        fit a model holding out that subset
        compute the MSE on that subset (the validation set)
    You should be fitting 5 models total.
    Return the average MSE of these 5 folds.

    Args:
        model: an sklearn model with fit and predict functions 
        X_train (data_frame): Training data
        Y_train (data_frame): Label 

    Return:
        the average validation error and accuracy for the 5 splits.
    '''
    kf = KFold(n_splits=5)
    validation_errors = []
    validation_accuracies = []
    
    for train_idx, valid_idx in kf.split(X_train):
        # Split the data
        train_start, train_end = train_idx[0], train_idx[-1] + 1
        valid_start, valid_end = valid_idx[0], valid_idx[-1] + 1
        split_X_train, split_X_valid = X_train[train_start : train_end], X_train[valid_start : valid_end]
        split_Y_train, split_Y_valid = Y_train[train_start : train_end], Y_train[valid_start : valid_end]
        
        # Fit the model on the training split
        model.fit(split_X_train, split_Y_train)
        
        # Compute the RMSE on the validation split
        preds = model.predict(split_X_valid)
        error = rmse(split_Y_valid, preds)
        acc = accuracy_score(split_Y_valid, preds)
        

        validation_errors.append(error)
        validation_accuracies.append(acc)
        
    return np.mean(validation_errors), np.mean(validation_accuracies)

### Logistic Regression
##### Predicting training set with 5-fold cross validation

In [180]:
model = LogisticRegression()

error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

Mean RMSE: 5.505754635889411
Accuracy: 0.5183333333333333



##### Predicting test set

In [181]:
model = LogisticRegression()

model.fit(x_train_scaled, y_train)
preds = model.predict(x_test_scaled)
print(f"Mean RMSE: {rmse(y_test, preds)}")
print(f"Test accuracy: {accuracy_score(y_test, preds)}\n")

Mean RMSE: 6.45016675464989
Test accuracy: 0.33554817275747506



### K-nearest Neighbors
##### Predicting training set with 5-fold cross validation

In [182]:
model = KNeighborsClassifier(10)

error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

Mean RMSE: 6.320125824005414
Accuracy: 0.35000000000000003



##### Predicting test set

In [183]:
model = KNeighborsClassifier(10)

model.fit(x_train_scaled, y_train)
preds = model.predict(x_test_scaled)
print(f"Mean RMSE: {rmse(y_test, preds)}")
print(f"Test accuracy: {accuracy_score(y_test, preds)}\n")

Mean RMSE: 6.581459915394167
Test accuracy: 0.3089700996677741



### Classification Tree
##### Predicting training set with 5-fold cross validation

In [184]:
model = DecisionTreeClassifier()

error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

Mean RMSE: 2.774773958756314
Accuracy: 0.7183333333333334



##### Predicting test set

In [185]:
model = DecisionTreeClassifier()

model.fit(x_train_scaled, y_train)
preds = model.predict(x_test_scaled)
print(f"Mean RMSE: {rmse(y_test, preds)}")
print(f"Test accuracy: {accuracy_score(y_test, preds)}\n")

Mean RMSE: 6.759268277054552
Test accuracy: 0.3222591362126246



### Random Forest
##### Predicting training set with 5-fold cross validation

In [186]:
model = RandomForestClassifier(n_estimators=500)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

s = model.feature_importances_
index_importance_sorted = sorted(range(len(s)), key=lambda k: s[k], reverse=True)
top_index = index_importance_sorted[:15]

print("\nMost importance features:")
for index in top_index:
    print(f"Feature name: {x_train.columns[index]}, Importance={s[index]}")

Mean RMSE: 2.6478857138544614
Accuracy: 0.63


Most importance features:
Feature name: stride_feat_82, Importance=0.004475233905201441
Feature name: stride_feat_98, Importance=0.0044110373631888284
Feature name: Size, Importance=0.0044079652966999405
Feature name: stride_feat_83, Importance=0.00439145596284775
Feature name: stride_feat_79, Importance=0.004363195609889434
Feature name: hist_feat_219, Importance=0.00430107449362292
Feature name: stride_feat_31, Importance=0.0042943020638679325
Feature name: hist_feat_73, Importance=0.0042777163898319245
Feature name: stride_feat_74, Importance=0.004277082026873717
Feature name: hist_feat_64, Importance=0.004251118392282519
Feature name: hist_feat_137, Importance=0.004242460895966857
Feature name: hist_feat_210, Importance=0.004228456778548464
Feature name: AspectRatio, Importance=0.004223952335062468
Feature name: stride_feat_11, Importance=0.0041973900153284265
Feature name: stride_feat_22, Importance=0.004172921464809999


##### Predicting test set

In [187]:
model = RandomForestClassifier(n_estimators=500)

model.fit(x_train_scaled, y_train)
preds = model.predict(x_test_scaled)
print(f"Mean RMSE: {rmse(y_test, preds)}")
print(f"Test accuracy: {accuracy_score(y_test, preds)}\n")

Mean RMSE: 6.872111292846068
Test accuracy: 0.08305647840531562



### Support Vector Machine
##### Predicting training set with 5-fold cross validation

In [188]:
model = SVC(kernel='rbf',C=10, gamma=0.01)

error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

Mean RMSE: 5.764366245701693
Accuracy: 0.16833333333333333



##### Predicting test set

In [189]:
model = SVC(kernel='rbf',C=10, gamma=0.01)

model.fit(x_train_scaled, y_train)
preds = model.predict(x_test_scaled)
print(f"Mean RMSE: {rmse(y_test, preds)}")
print(f"Test accuracy: {accuracy_score(y_test, preds)}\n")

Mean RMSE: 6.192912034611709
Test accuracy: 0.06976744186046512

