In [None]:
class simple_KNeighborsClassifier(object):
    """Brute-force k-nearest-neighbours classifier for binary 0/1 labels.

    Distances are Euclidean (``np.linalg.norm`` of the difference between a
    query sample and each training sample). The predicted class is decided by
    a majority vote among the ``k`` closest training samples: any non-zero
    label counts as a vote for class 1, and an exact tie resolves to class 1.
    """

    def __init__(self, k):
        """Store ``k``, the number of neighbours consulted per prediction."""
        self.k = k

    def fit(self, X_train, Y_train):
        """Keep references to the training samples and their labels.

        No computation happens here; all the work is done in ``predict``.
        """
        self.X_train = X_train
        self.Y_train = Y_train

    def predict(self, X_pred):
        """Predict a 0/1 label for every sample in ``X_pred``.

        Parameters
        ----------
        X_pred : indexable collection of samples, each compatible with
            subtraction from a training sample.

        Returns
        -------
        numpy.ndarray
            One predicted label (0 or 1) per entry of ``X_pred``.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 or larger than the number of training
            samples.
        """
        n_train = len(self.X_train)
        if not 1 <= self.k <= n_train:
            raise ValueError(
                "k must be between 1 and the number of training samples "
                "({}), got {}".format(n_train, self.k))

        Y_pred = []
        # Scratch buffer reused for every query sample.
        distances = np.zeros(n_train)

        for i in range(len(X_pred)):
            for j in range(n_train):
                distances[j] = np.linalg.norm(X_pred[i] - self.X_train[j])

            # Partitioning on position k-1 places the k smallest distances in
            # the first k slots; using k-1 (not k) keeps k == n_train valid.
            nearest = np.argpartition(distances, self.k - 1)[:self.k]
            labels = [int(self.Y_train[index]) for index in nearest]

            # Majority vote; a tie (num_ones == k / 2) goes to class 1.
            num_ones = np.count_nonzero(labels)
            Y_pred.append(1 if 2 * num_ones >= len(labels) else 0)

        return np.array(Y_pred)
    

In [5]:
import time
import scipy.io as sio
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

def Report(classifier, xTest, yTest):
    """Print prediction timing and quality metrics for a fitted classifier.

    The classifier must expose ``predict`` and ``score``. The function prints,
    in order: the wall-clock time of one ``predict`` call on ``xTest``, the
    value of ``classifier.score(xTest, yTest)``, the F1 score of the
    predictions, and a 4-digit classification report. Nothing is returned.
    """
    started = time.time()
    predictions = classifier.predict(xTest)
    prediction_seconds = time.time() - started

    print("Model prediction time (s): ", prediction_seconds, sep="")

    # score() is evaluated by the classifier itself, independently of the
    # predictions timed above.
    print("Model accuracy: ", classifier.score(xTest, yTest), sep="")

    print("f1 accuracy: ", f1_score(yTest, predictions), sep="")

    print("Classification Report:")
    print('\n')
    print(classification_report(yTest, predictions, digits=4))

    # Blank lines separating this report from whatever is printed next.
    print("\n\n\n")
    
def draw_heatmap_linear(acc, acc_desc, C_list):
    """Show an annotated accuracy heatmap whose rows are labelled with ``C_list``.

    Parameters
    ----------
    acc : data forwarded unchanged to ``sns.heatmap`` (presumably a 2-D array
        with one row per entry of ``C_list`` -- confirm against callers).
    acc_desc : str
        Text prefixed to the plot title.
    C_list : sequence
        Tick labels for the y axis.
    """
    plt.figure(figsize=(2, 4))
    heatmap_axes = sns.heatmap(
        acc,
        annot=True,
        fmt='.3f',
        yticklabels=C_list,
        xticklabels=[],
    )
    colorbar = heatmap_axes.collections[0].colorbar
    colorbar.set_label("accuracy")
    heatmap_axes.set_ylabel('$C$')
    plt.title(acc_desc + ' w.r.t $C$')
    # The style call comes after the heatmap has already been drawn.
    sns.set_style("whitegrid", {'axes.grid': False})
    plt.show()
    
def simple_cross_validation(X_train_val, Y_train_val, k, fold):
    """Estimate k-NN accuracy with contiguous, unshuffled cross-validation.

    The samples are cut into ``fold`` consecutive slices. Each slice serves
    once as the validation set while a ``simple_KNeighborsClassifier(k)`` is
    fitted on the remaining samples.

    Parameters
    ----------
    X_train_val : numpy.ndarray
        2-D sample matrix, one row per sample.
    Y_train_val : numpy.ndarray
        Labels, one per sample. A column vector of shape (n, 1) is accepted
        and flattened.
    k : int
        Number of neighbours for the classifier.
    fold : int
        Number of cross-validation folds.

    Returns
    -------
    tuple of float
        ``(mean validation accuracy, mean training accuracy)`` over all folds.
    """
    val_acc_list = []
    train_acc_list = []

    # Flatten the labels once: with an (n, 1) column, comparing against the
    # (n,) prediction vector would broadcast to an (n, n) matrix and yield
    # meaningless accuracies.
    labels = np.ravel(Y_train_val)

    length = X_train_val.shape[0]
    subset_size = length / fold
    all_data = np.arange(length)  # loop-invariant index of every sample

    for i in range(fold):
        # Rounding the fractional boundaries spreads any remainder over folds.
        validation_set = np.arange(int(np.round(i * subset_size)),
                                   int(np.round((i + 1) * subset_size)))
        training_set = np.setdiff1d(all_data, validation_set)

        train_X = X_train_val[training_set, :]
        validation_X = X_train_val[validation_set, :]
        train_Y = labels[training_set]
        validation_Y = labels[validation_set]

        classifier = simple_KNeighborsClassifier(k)
        classifier.fit(train_X, train_Y)
        train_pred = classifier.predict(train_X)
        val_pred = classifier.predict(validation_X)

        train_acc = np.count_nonzero(train_pred == train_Y) / train_Y.shape[0]
        train_acc_list.append(train_acc)
        val_acc = np.count_nonzero(val_pred == validation_Y) / validation_Y.shape[0]
        val_acc_list.append(val_acc)

    return sum(val_acc_list) / len(val_acc_list), \
           sum(train_acc_list) / len(train_acc_list)
    
def simple_GridSearchCV_fit(X_train_val, Y_train_val, k_list, fold):
    """Cross-validate every candidate ``k`` and collect the mean accuracies.

    Calls ``simple_cross_validation`` once per entry of ``k_list``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(validation accuracies, training accuracies)``, each with one entry
        per candidate in ``k_list`` and in the same order.
    """
    n_candidates = len(k_list)
    val_acc_array = np.zeros(n_candidates)
    train_acc_array = np.zeros(n_candidates)

    for slot, candidate_k in enumerate(k_list):
        mean_val_acc, mean_train_acc = simple_cross_validation(
            X_train_val, Y_train_val, candidate_k, fold)
        val_acc_array[slot] = mean_val_acc
        train_acc_array[slot] = mean_train_acc

    return val_acc_array, train_acc_array
        
def draw_heatmap_knn(acc, acc_desc, k_list):
    """Show an annotated accuracy heatmap whose rows are labelled with ``k_list``.

    Parameters
    ----------
    acc : data forwarded unchanged to ``sns.heatmap`` (presumably a 2-D array
        with one row per entry of ``k_list`` -- confirm against callers).
    acc_desc : str
        Text prefixed to the plot title.
    k_list : sequence
        Tick labels for the y axis.
    """
    plt.figure(figsize=(2, 4))
    heatmap_axes = sns.heatmap(
        acc,
        annot=True,
        fmt='.3f',
        yticklabels=k_list,
        xticklabels=[],
    )
    colorbar = heatmap_axes.collections[0].colorbar
    colorbar.set_label("accuracy")
    heatmap_axes.set_ylabel('$k$')
    plt.title(acc_desc + ' w.r.t $k$')
    # The style call comes after the heatmap has already been drawn.
    sns.set_style("whitegrid", {'axes.grid': False})
    plt.show()

In [6]:
# Value passed as n_jobs to the GridSearchCV instances built in the model
# helpers below (KNN, RandomForest, SVMlin, SVMrbf, BoostedDecisionTree).
CPU_CORES = 32
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm as svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC
import xgboost as xgb
import time
    
def KNN(xTrain, xTest, yTrain, yTest):
    """Grid-search a k-nearest-neighbours classifier and report on the test set.

    Tries every odd ``n_neighbors`` from 1 to 23 with ``cv=5``, prints the
    winning value and the wall-clock fit time, then passes the fitted search
    object to ``Report`` together with the test data.
    """
    print("KNN:")
    odd_neighbor_counts = list(np.arange(1, 24, 2))  # 1, 3, ..., 23
    search = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid={'n_neighbors': odd_neighbor_counts},
        cv=5,
        n_jobs=CPU_CORES,
    )

    fit_started = time.time()
    search.fit(xTrain, yTrain)
    fit_seconds = time.time() - fit_started

    print("Best K: ", search.best_params_['n_neighbors'], sep="")
    print("Training time (s):", fit_seconds, sep="")
    Report(search, xTest, yTest)

def RandomForest(xTrain, xTest, yTrain, yTest):
    """Grid-search ``max_features`` for a 200-tree random forest and report.

    Candidate ``max_features`` values are taken from a fixed list and kept
    only when strictly smaller than the length of the first training sample.
    Prints the best value and the wall-clock fit time, then passes the fitted
    search object to ``Report`` together with the test data.
    """
    print("Random Forest:")
    forest = RandomForestClassifier(n_estimators=200)

    first_sample_length = len(xTrain[0])
    feature_candidates = [
        count for count in (1, 2, 4, 6, 8, 12, 16, 20)
        if count < first_sample_length
    ]
    search = GridSearchCV(
        estimator=forest,
        param_grid={'max_features': feature_candidates},
        cv=5,
        n_jobs=CPU_CORES,
    )

    fit_started = time.time()
    search.fit(xTrain, yTrain)
    fit_seconds = time.time() - fit_started

    print("Best MaxFeatures: ", search.best_params_['max_features'], sep="")
    print("Training Time (s): ", fit_seconds, sep="")
    Report(search, xTest, yTest)
    
def SVMlin(xTrain, xTest, yTrain, yTest):
    """Grid-search ``C`` for a linear-kernel SVC and report on the test set.

    Runs a ``cv=5`` grid search over a fixed list of ``C`` values, prints the
    best ``C`` and the wall-clock fit time, then passes the fitted search
    object to ``Report`` together with the test data.
    """
    print("SVM w/Linear kernel:")

    linear_svc = svm.SVC(
        C=1.0,
        cache_size=200,
        class_weight=None,
        coef0=0.0,
        gamma='auto',
        kernel='linear',
        max_iter=-1,
        probability=False,
        random_state=None,
        shrinking=True,
        tol=0.001,
        verbose=False,
    )
    c_candidates = [0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 6]

    search = GridSearchCV(
        linear_svc,
        param_grid={'C': c_candidates},
        cv=5,
        n_jobs=CPU_CORES,
    )

    fit_started = time.time()
    search.fit(xTrain, yTrain)
    fit_seconds = time.time() - fit_started

    print("Best C: ", search.best_params_['C'], sep="")
    print("Training time (s):", fit_seconds, sep="")
    Report(search, xTest, yTest)
    
def SVMrbf(xTrain, xTest, yTrain, yTest):
    """Grid-search ``C`` and ``gamma`` for an RBF-kernel SVC and report.

    Runs a ``cv=5`` grid search over the Cartesian product of the ``C`` and
    ``gamma`` lists below, prints the best pair and the wall-clock fit time,
    then passes the fitted search object to ``Report`` with the test data.
    """
    print("SVM w/ RBF Kernel:")

    classifier = svm.SVC(kernel='rbf')

    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 6],
        'gamma': [500000, 20000, 5000, 200, 50, 2, 0.5, 0.125, 0.01],
    }

    CV = GridSearchCV(classifier, param_grid=param_grid, cv=5, n_jobs=CPU_CORES)
    tme = time.time()
    CV.fit(xTrain, yTrain)
    timeEnd = time.time() - tme
    print("Best C: " + str(CV.best_params_['C']))
    print("Best gamma: " + str(CV.best_params_['gamma']))
    print("Training time (s):" + str(timeEnd))
    Report(CV, xTest, yTest)
    
def BoostedDecisionTree(xTrain, xTest, yTrain, yTest):
    """Grid-search ``n_estimators`` for an AdaBoost classifier and report.

    Tries powers of two from 2 to 2048 with ``cv=5``, prints the best value
    and the wall-clock fit time, then passes the fitted search object to
    ``Report`` together with the test data.
    """
    print("Decision Tree:")

    estimator_counts = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
    booster = AdaBoostClassifier()
    search = GridSearchCV(
        estimator=booster,
        param_grid={'n_estimators': estimator_counts},
        cv=5,
        n_jobs=CPU_CORES,
    )

    fit_started = time.time()
    search.fit(xTrain, yTrain)
    fit_seconds = time.time() - fit_started

    print("Best nEstimators: ", search.best_params_['n_estimators'], sep="")
    print("Training time (s):", fit_seconds, sep="")
    Report(search, xTest, yTest)

def XGBoost(xTrain, xTest, yTrain, yTest):
    """Grid-search ``n_estimators`` for an XGBoost classifier and report.

    All four inputs are first converted to NumPy arrays. The search tries
    powers of two from 2 to 2048 with ``cv=5``, prints the best value and the
    wall-clock fit time, then passes the fitted search object to ``Report``
    together with the test data.
    """
    xTrain = np.array(xTrain)
    xTest = np.array(xTest)
    yTrain = np.array(yTrain)
    yTest = np.array(yTest)

    print("XGBoost:")
    param_grid = {'n_estimators': [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]}

    clf = xgb.XGBClassifier()
    CV = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
    t = time.time()
    CV.fit(xTrain, yTrain)
    elapsed_time = time.time() - t
    print("Best nEstimators: " + str(CV.best_params_['n_estimators']))
    print("Training time (s):" + str(elapsed_time))
    Report(CV, xTest, yTest)
    
    
def NeuralNets(xTrain, xTest, yTrain, yTest):
    """Grid-search the initial learning rate of an MLP classifier and report.

    The network has one hidden layer of 640 units and is trained with SGD,
    a constant learning-rate schedule, early stopping, and Nesterov momentum
    disabled. The search uses ``cv=5``; the best learning rate (3 decimals)
    and the wall-clock fit time are printed, then the fitted search object is
    passed to ``Report`` together with the test data.
    """
    print("Neural Nets(ANN):")

    # Note: 10e-4 equals 1e-3, so these candidates are 0.001, 0.01, 0.1, 1.0.
    learning_rates = [10e-4, 10e-3, 10e-2, 10e-1]

    network = MLPClassifier(
        hidden_layer_sizes=(640,),
        solver='sgd',
        early_stopping=True,
        nesterovs_momentum=False,
        learning_rate='constant',
    )
    search = GridSearchCV(
        estimator=network,
        param_grid={'learning_rate_init': learning_rates},
        cv=5,
    )

    fit_started = time.time()
    search.fit(xTrain, yTrain)
    fit_seconds = time.time() - fit_started

    best_rate = search.best_params_['learning_rate_init']
    print("Best LearningRate: " + "{0:.3f}".format(best_rate))
    print("Training time(s):", fit_seconds, sep="")
    Report(search, xTest, yTest)