
<h1> DS200A Computer Vision Assignment</h1>

<h2>  Part Three: Classifier training and performance assessment. </h2>	

In [139]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import numpy as np

In [140]:
#from google.colab import drive
#drive.mount('/content/drive', force_remount=True)

#### Retrieving and preprocessing of the training data

In [141]:
from ipynb.fs.defs.GradProject_NB2 import feature_frame

In [142]:
data = feature_frame()
y = data['Label']
x = data.drop(columns=['Label', 'Image_Id'])

Fetching label '.DS_Store'
Fetching label 'airplanes'
Fetching label 'bear'
Fetching label 'blimp'
	Gray image ('blimp_0022.jpg') was loaded, converting to RGB
Fetching label 'comet'
	Gray image ('comet_0006.jpg') was loaded, converting to RGB
	Gray image ('comet_0011.jpg') was loaded, converting to RGB
	Gray image ('comet_0013.jpg') was loaded, converting to RGB
	Gray image ('comet_0021.jpg') was loaded, converting to RGB
	Gray image ('comet_0036.jpg') was loaded, converting to RGB
	Gray image ('comet_0038.jpg') was loaded, converting to RGB
	Gray image ('comet_0041.jpg') was loaded, converting to RGB
	Gray image ('comet_0049.jpg') was loaded, converting to RGB
	Gray image ('comet_0052.jpg') was loaded, converting to RGB
	Gray image ('comet_0053.jpg') was loaded, converting to RGB
	Gray image ('comet_0057.jpg') was loaded, converting to RGB
	Gray image ('comet_0058.jpg') was loaded, converting to RGB
Fetching label 'crab'
	Gray image ('crab_0045.jpg') was loaded, converting to RGB
Fet

#### Splitting training data

In [143]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Min-Max scaling data
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [144]:
# Cross-validation
from sklearn.model_selection import KFold

def rmse(actual_y, predicted_y):
    """
    The root mean square error between the prediction and the ground truth
    """
    return np.sqrt(np.sum((actual_y - predicted_y)**2)/len(predicted_y))

def compute_CV_rmse_and_acc(model, X_train, Y_train):
    '''
    Split the training data into 5 subsets.
    For each subset, 
        fit a model holding out that subset
        compute the MSE on that subset (the validation set)
    You should be fitting 5 models total.
    Return the average MSE of these 5 folds.

    Args:
        model: an sklearn model with fit and predict functions 
        X_train (data_frame): Training data
        Y_train (data_frame): Label 

    Return:
        the average validation error and accuracy for the 5 splits.
    '''
    kf = KFold(n_splits=5)
    validation_errors = []
    validation_accuracies = []
    
    for train_idx, valid_idx in kf.split(X_train):
        # Split the data
        train_start, train_end = train_idx[0], train_idx[-1] + 1
        valid_start, valid_end = valid_idx[0], valid_idx[-1] + 1
        split_X_train, split_X_valid = X_train[train_start : train_end], X_train[valid_start : valid_end]
        split_Y_train, split_Y_valid = Y_train[train_start : train_end], Y_train[valid_start : valid_end]
        
        # Fit the model on the training split
        model.fit(split_X_train, split_Y_train)
        
        # Compute the RMSE on the validation split
        preds = model.predict(split_X_valid)
        error = rmse(split_Y_valid, preds)
        acc = accuracy_score(split_Y_valid, preds)
        

        validation_errors.append(error)
        validation_accuracies.append(acc)
        
    return np.mean(validation_errors), np.mean(validation_accuracies)

### Logistic Regression
##### Predicting training set with 5-fold cross validation

In [145]:
model = LogisticRegression()

error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

Mean RMSE: 5.630524885561728
Accuracy: 0.5175000000000001



##### Predicting test set

In [146]:
model = LogisticRegression()

model.fit(x_train_scaled, y_train)
preds = model.predict(x_test_scaled)
print(f"Mean RMSE: {rmse(y_test, preds)}")
print(f"Test accuracy: {accuracy_score(y_test, preds)}\n")

Mean RMSE: 6.426688539317535
Accuracy: 0.3754152823920266



### K-nearest Neighbors
##### Predicting training set with 5-fold cross validation

In [147]:
model = KNeighborsClassifier(10)

error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

Mean RMSE: 6.410702659867566
Accuracy: 0.35833333333333334



##### Predicting test set

In [148]:
model = KNeighborsClassifier(10)

model.fit(x_train_scaled, y_train)
preds = model.predict(x_test_scaled)
print(f"Mean RMSE: {rmse(y_test, preds)}")
print(f"Test accuracy: {accuracy_score(y_test, preds)}\n")

Mean RMSE: 6.108106916194884
Accuracy: 0.292358803986711



### Classification Tree
##### Predicting training set with 5-fold cross validation

In [149]:
model = DecisionTreeClassifier()

error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

Mean RMSE: 2.5471683325482872
Accuracy: 0.715



##### Predicting test set

In [150]:
model = DecisionTreeClassifier()

model.fit(x_train_scaled, y_train)
preds = model.predict(x_test_scaled)
print(f"Mean RMSE: {rmse(y_test, preds)}")
print(f"Test accuracy: {accuracy_score(y_test, preds)}\n")

Mean RMSE: 6.209787580426559
Accuracy: 0.29900332225913623



### Random Forest
##### Predicting training set with 5-fold cross validation

In [None]:
model = RandomForestClassifier(n_estimators=500)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

s = model.feature_importances_
index_importance_sorted = sorted(range(len(s)), key=lambda k: s[k], reverse=True)
top_index = index_importance_sorted[:15]

print("\nMost importance features:")
for index in top_index:
    print(f"Feature name: {x_train.columns[index]}, Importance={s[index]}")

##### Predicting test set

In [None]:
model = RandomForestClassifier(n_estimators=500)

model.fit(x_train_scaled, y_train)
preds = model.predict(x_test_scaled)
print(f"Mean RMSE: {rmse(y_test, preds)}")
print(f"Test accuracy: {accuracy_score(y_test, preds)}\n")

### Support Vector Machine
##### Predicting training set with 5-fold cross validation

In [None]:
model = SVC(kernel='rbf',C=10, gamma=0.01)

error, acc = compute_CV_rmse_and_acc(model, x_train_scaled, y_train)
print(f"Mean RMSE: {error}")
print(f"Accuracy: {acc}\n")

##### Predicting test set

In [None]:
model = RandomForestClassifier(n_estimators=500)

model.fit(x_train_scaled, y_train)
preds = model.predict(x_test_scaled)
print(f"Mean RMSE: {rmse(y_test, preds)}")
print(f"Test accuracy: {accuracy_score(y_test, preds)}\n")