
<h1> DS200A Computer Vision Assignment</h1>

<h2>  Part Three: Training Models </h2>	

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import ray
ray.init(include_webui=False, num_cpus=4)

Process STDOUT and STDERR is being redirected to /tmp/raylogs/.
Waiting for redis server at 127.0.0.1:39807 to respond...
Waiting for redis server at 127.0.0.1:52179 to respond...
Starting the Plasma object store with 6.00 GB memory.
Starting local scheduler with the following resources: {'CPU': 4, 'GPU': 0}.


{'local_scheduler_socket_names': ['/tmp/scheduler15368597'],
 'node_ip_address': '10.142.156.36',
 'object_store_addresses': [ObjectStoreAddress(name='/tmp/plasma_store81168882', manager_name='/tmp/plasma_manager92512614', manager_port=64178)],
 'raylet_socket_names': [],
 'redis_address': '10.142.156.36:39807',
 'webui_url': ''}

In [2]:
def split(df, label='class'):
    """
    Given input df, splits the data into a training and test sets with given labels
    returns X_train, X_valid, Y_train, Y_valid 
    """
    train, valid = train_test_split(df, test_size=0.2, random_state=42, shuffle=True, stratify=df[label])
    X_train, Y_train = train.drop(columns=label, axis=1, inplace=False), train[label]
    X_valid, Y_valid = valid.drop(columns=label, axis=1, inplace=False), valid[label]
    return X_train, X_valid, Y_train, Y_valid

def accuracy(actual, pred):
    """
    Calculate the accuracy percentage of the predicted values
    """
    return accuracy_score(actual, pred)

##  Train models using all of the following methods below. Be sure to drop the actual image column, and the encoding	Take note of the differences in accuracy, and methods.


In [3]:
train_df_original = pd.read_pickle("./train_df.pkl")
train_df = train_df_original.drop(columns="filename", axis=1, inplace=False)
X_train, X_valid, Y_train, Y_valid = split(train_df)

### Logistic Regression

In [4]:
def train_test_logistic_regression(X_train, Y_train, X_valid, Y_valid, solver='liblinear'):
    """
    Given training and validation data, return the classification accuracy of a logistic regression 
    model. Defaults use l2 penalty and 'liblinear' solver.
    """
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression(solver = solver,).fit(X_train, Y_train)
    predictions = clf.predict(X_valid)
    return accuracy_score(predictions, Y_valid)

In [5]:
train_test_logistic_regression(X_train, Y_train, X_valid, Y_valid)

0.38870431893687707

### K-nearest Neighbors

In [6]:
def train_test_knn(X_train, Y_train, X_val, Y_val, nearest_neighbor=10):
    """
    Given training and validation data, return the classification accuracy of a K-Nearest-Neighbours 
    model. Defaults use 10 neighbours.
    """
    from sklearn.neighbors import KNeighborsClassifier
    clf = KNeighborsClassifier(n_neighbors = nearest_neighbor).fit(X_train, Y_train)
    predictions = clf.predict(X_valid)
    return accuracy(predictions, Y_valid)

# using parallelization to speed up the training
@ray.remote
def train_test_knn_remote(X_train, Y_train, X_val, Y_val, nearest_neighbors=10):
    return train_test_knn(X_train, Y_train, X_val, Y_val, nearest_neighbors)

In [7]:
param_list = []
optimal = []
for nearest_neighbor in range(25,0,-1):   # tuning num of nearest neghbour classifiers 
    acc = train_test_knn_remote.remote(X_train, Y_train, X_valid, Y_valid, nearest_neighbor)
    param_list.append(nearest_neighbor)
    optimal.append(acc)

optimal = ray.get(optimal)
optimal = np.column_stack((param_list, optimal))

# Display top 5 results along with params sorted on accuracy
display(sorted(optimal, key=lambda x:x[1], reverse=True)[:5])

[array([20.        ,  0.32890365]),
 array([10.       ,  0.3255814]),
 array([21.        ,  0.32225914]),
 array([16.        ,  0.31893688]),
 array([14.        ,  0.31893688])]

### Random Forest

In [8]:
def train_test_random_forest(X_train, Y_train, X_valid, Y_valid, num_classifiers=100, depth=5):
    """
    Given training and validation data, return the classification accuracy of a random forest 
    model. Defaults use num_classifiers=100 and depth=5.
    """
    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(n_estimators=num_classifiers, max_depth=depth).fit(X_train, Y_train)
    predictions = clf.predict(X_valid)
    return accuracy_score(Y_valid, predictions)

# using parallelization to speed up the training
@ray.remote
def train_test_random_forest_remote(X_train, Y_train, X_valid, Y_valid, num_classifiers=100, depth=5):
    return train_test_random_forest(X_train, Y_train, X_valid, Y_valid, num_classifiers, depth)

In [9]:
n_list = []
d_list = []
optimal = []
for n in range(500, 49, -10):   # tuning the number of classifiers
    for d in range(3, 11):      # tuning the depth of each classifier
        acc = train_test_random_forest_remote.remote(X_train, Y_train, X_valid, Y_valid, n, d)
        optimal.append(acc)
        n_list.append(n)
        d_list.append(d)
        
optimal = ray.get(optimal)
optimal = np.column_stack((n_list, d_list, optimal))

# Display top 5 results along with params sorted on accuracy
display(sorted(optimal, key=lambda x:x[2], reverse=True)[:5])  ##

[array([60.        , 10.        ,  0.41196013]),
 array([290.        ,  10.        ,   0.40199336]),
 array([200.        ,  10.        ,   0.40199336]),
 array([140.        ,   9.        ,   0.40199336]),
 array([4.40000000e+02, 9.00000000e+00, 3.98671096e-01])]

### Support Vector Machine

In [10]:
def train_test_svm(X_train, Y_train, X_valid, Y_valid, C=10):
    """
    Given training and validation data, return the classification accuracy of a support vector 
    machine model. Defaults use penalty param C=10 for training.
    """
    from sklearn.svm import LinearSVC
    clf = LinearSVC(C = 10).fit(X_train, Y_train)
    predictions = clf.predict(X_valid)
    return accuracy_score(Y_valid, predictions)

# using parallelization to speed up the training
@ray.remote
def train_test_svm_remote(X_train, Y_train, X_valid, Y_valid, C=10):
    return train_test_svm(X_train, Y_train, X_valid, Y_valid, C)

In [11]:
param_list = []
optimal = []

for c in range(5,11):    # tuning penalty param c
    acc=train_test_svm_remote.remote(X_train, Y_train, X_valid, Y_valid, C=c)
    optimal.append(acc)
    param_list.append(c)
    
optimal = ray.get(optimal)
optimal = np.column_stack((param_list, optimal))

# Display top 5 results along with params sorted on accuracy
display(sorted(optimal, key=lambda x:x[1], reverse=True)[:5])

[array([7.        , 0.26910299]),
 array([9.        , 0.21262458]),
 array([10.        ,  0.18272425]),
 array([8.        , 0.11960133]),
 array([6.        , 0.11295681])]