In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import bokeh
from bokeh.plotting import figure, output_file, show, ColumnDataSource
# import bokeh.charts.utils
import bokeh.io
import bokeh.models
from bokeh.models import HoverTool
import bokeh.palettes
import bokeh.plotting
import random
from random import sample
from sklearn import svm, neighbors
from sklearn.neighbors import KNeighborsRegressor
from sklearn import linear_model


from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV


import multiprocessing as mp


In [None]:
# Preprocessing / normalization was done in Excel beforehand (Xnew = (X - mean) / std).
# preprocessing.scale() is the in-Python route if it ever needs to be redone here.
# Only the first 18 columns of the CSV are kept.
Data = pd.read_csv('songs4.csv').iloc[:, :18]
#Data = Data.drop(Data[(Data.time_signature > 5)].index)
Data.head()

In [None]:
# check number of rows (count() reports the number of non-null entries per column)
Data.count()

In [None]:
# Divide into testing and training
# Features are every column except the target. `columns=` replaces the positional
# axis argument (`drop('valence', 1)`), which pandas 2.0 no longer accepts.
x = Data.drop(columns='valence')
y = Data.valence
# Seeded (random_state=0, as in the later splits of this notebook) so that a
# Restart & Run All reproduces the same partition.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.20, random_state=0)
# Re-attach the target to the training features and keep the first 5000 rows
# for the correlation / SVD cells that follow.
train = xtrain.assign(valence=ytrain)
train = train[0:5000]
train.head()

In [None]:
# Create correlation matrix: pairwise correlation between the columns of `train`
M = train.corr()

In [None]:
# Display the column-by-column correlation matrix as a table
M

In [None]:
# correlation matrix rendered as a heat map with a colour scale
corr_fig, corr_ax = plt.subplots()
corr_img = corr_ax.imshow(M)
corr_fig.colorbar(corr_img, ax=corr_ax)
plt.show()

In [None]:
# SVD using numpy function
# U / VT hold the left / right singular vectors of M, E the singular values.
U, E, VT = np.linalg.svd(M)

In [None]:
# Plot the singular values of M in the order returned by np.linalg.svd
plt.plot(E)
plt.show()

In [None]:
# Scale the first two left singular vectors by their singular values
P = U[:, :2] @ np.diag(E[:2])

In [None]:
# Scatter the two columns of P against each other ('o' = markers only, no lines)
plt.plot(P[:,0], P[:,1],'o')
plt.show()

In [None]:
# Build a features-by-songs matrix: one column per song, one row per feature.
# `valence` is the regression target, so it is removed first. Leaving it in
# lets the song-to-song correlation matrix (and the principal components that
# are later used as model inputs) encode the value being predicted.
N = train.drop(columns='valence').T
# The first row of the transpose holds the song IDs; use them as column labels...
N.columns = N.iloc[0]
# ...and drop that row so only feature rows remain.
N = N.drop('ID')
N = N.astype(float)

In [None]:
# Takes 10-15 minutes with all of the data
# Pairwise correlation between the columns of N. NOTE: N is overwritten in place,
# so re-running this cell on its own computes the correlation of the correlation
# matrix; re-run the cell that builds N first.
N = N.corr()
N.head()

In [None]:
# Heat map of how the songs correlate with each other; there is a fair amount of
# uniqueness among songs
song_fig, song_ax = plt.subplots()
song_img = song_ax.imshow(N)
song_fig.colorbar(song_img, ax=song_ax)
plt.show()

In [None]:
# SVD of the correlation matrix N; this overwrites U, E and VT from the SVD of M above
U, E, VT = np.linalg.svd(N)

In [None]:
# Most of the variance can be explained using the first 8 or so components
# (only the first 20 singular values are plotted)
plt.plot(E[:20])
plt.show()

In [None]:
# First two left singular vectors of N, each scaled by its singular value
P = U[:, :2] @ np.diag(E[:2])
print(P)

In [None]:
# plot first two principal components to get an idea of the shape of the data
_tools_to_show = 'box_zoom,pan,save,hover,reset,tap,wheel_zoom'
p = figure(
    plot_width=400,
    plot_height=400,
    title=None,
    tools=_tools_to_show,
)

# one small, mostly transparent navy marker per row of P
p.circle(x=P[:, 0], y=P[:, 1], size=4, color="navy", alpha=0.2)

# show the results
show(p)

In [None]:
# SVM on PCA results

In [None]:
# Further divide training and testing based on principal components.
# Project onto every component (U scaled by E), then keep the first 18 columns;
# change the slice to use a different number of components.
P = (U @ np.diag(E))[:, :18]
PCA_xtrain, PCA_xtest, PCA_ytrain, PCA_ytest = train_test_split(
    P,
    train['valence'],
    test_size=0.2,
    random_state=0,
)

In [None]:
# Support-vector regressor with a linear kernel (despite the `_clf` suffix, SVR is a regressor)
SVM_clf = svm.SVR(kernel='linear')

In [None]:
# Fit the SVR on the PCA training split
SVM_clf.fit(PCA_xtrain, PCA_ytrain)

In [None]:
# testing error: mean squared error of the SVR on the held-out PCA split
SVM_test_ypreds = SVM_clf.predict(PCA_xtest)
SVM_test_MSE = np.mean(np.square(SVM_test_ypreds - PCA_ytest))
SVM_test_MSE

In [None]:
# score() of the fitted SVR on the held-out PCA split
SVM_clf.score(PCA_xtest, PCA_ytest)

In [None]:
# score() of the fitted SVR on the PCA training split, for comparison with the held-out score
SVM_clf.score(PCA_xtrain, PCA_ytrain)

In [None]:
# training error
SVM_train_ypreds = SVM_clf.predict(PCA_xtrain)
# Kept under its own name: assigning this to SVM_test_MSE silently replaced the
# test error computed earlier with the training error.
SVM_train_MSE = np.mean((SVM_train_ypreds - PCA_ytrain)**2)
SVM_train_MSE

In [None]:
# Random Forest - Harry draft 1
# Sweep the number of trees and record the held-out score for each setting.
RF_ests = []
RF_results = []
n_estimators_range = range(50, 600, 50)

for n_estimators in n_estimators_range:
    RF_clf = RandomForestRegressor(n_estimators=n_estimators, n_jobs=-1, random_state = 0)
    RF_clf.fit(PCA_xtrain, PCA_ytrain)
    RF_ests.append(n_estimators)
    # For a regressor, score() is the R^2 on the given data; that is the value
    # plotted on the "Accuracy" axis below.
    RF_results.append(RF_clf.score(PCA_xtest, PCA_ytest))

# Pass the toolbar string defined in this cell rather than `_tools_to_show`,
# which only exists if an earlier plotting cell has already been run.
tools_to_show = 'box_zoom,pan,save,hover,reset,tap,wheel_zoom'
p_est_res = figure(plot_width=400,
                   plot_height=400,
                   title="Number of Trees vs Accuracy",
                   tools=tools_to_show,
                   x_axis_label="Number of Trees",
                   y_axis_label="Accuracy")
# add a circle renderer with a size, color, and alpha
p_est_res.circle(RF_ests, RF_results, size=10, color="green", alpha=0.5)

# show the results
show(p_est_res)

In [None]:
# Random forest - Harry draft 2
# Sweep max_features at a fixed 400 trees and record the held-out score for each setting.
RF_feats = []
RF_feats_results = []
max_features_range = range(12, 19)
for max_features in max_features_range:
    RF_clf = RandomForestRegressor(n_estimators=400,
                                   max_features=max_features,
                                   random_state=0,
                                   n_jobs=-1)
    RF_clf.fit(PCA_xtrain, PCA_ytrain)
    RF_feats.append(max_features)
    # For a regressor, score() is the R^2 on the given data.
    RF_feats_results.append(RF_clf.score(PCA_xtest, PCA_ytest))
# Pass the toolbar string defined in this cell rather than `_tools_to_show`,
# which only exists if an earlier plotting cell has already been run.
tools_to_show = 'box_zoom,pan,save,hover,reset,tap,wheel_zoom'
p_est_res = figure(plot_width=400,
                   plot_height=400,
                   title="Max Features vs Accuracy",
                   tools=tools_to_show,
                   x_axis_label="Max Features",
                   y_axis_label="Accuracy")
# add a circle renderer with a size, color, and alpha
p_est_res.circle(RF_feats, RF_feats_results, size=10, color="green", alpha=0.5)
# show the results
show(p_est_res)

In [None]:
# Make copies of data without ID
# Split the (full) training set again into a train / validation pair. Seeded with
# random_state=0 like the other derived splits in this notebook so the
# KNN / Lasso numbers below can be reproduced.
xtrain_heir, xtest_heir, ytrain_heir, ytest_heir = train_test_split(
    xtrain, ytrain, test_size=0.20, random_state=0)
# Drop the first column (the ID) from both feature frames.
xtrain_heir = xtrain_heir.iloc[:, 1:]
xtest_heir  = xtest_heir.iloc[:, 1:]

In [None]:
# K-Nearest-Neighbors (on original data)
# Grid over the neighbour count and the weighting scheme; prints train/test MSE
# and the test score for every combination.
print("(n, weights):")
for n in [1, 5, 10, 25, 50, 100, 250, 500]:
    for weights in ['uniform', 'distance']:
        KNN_clf = KNeighborsRegressor(n_neighbors=n, weights=weights, n_jobs=-1)
        KNN_clf.fit(xtrain_heir, ytrain_heir)

        # Training error
        KNN_train_ypreds = KNN_clf.predict(xtrain_heir)
        KNN_train_MSE = np.mean((KNN_train_ypreds - ytrain_heir) ** 2)

        # Testing error
        KNN_test_ypreds = KNN_clf.predict(xtest_heir)
        KNN_test_MSE = np.mean((KNN_test_ypreds - ytest_heir) ** 2)
        
        print("({}, {}) -> Train({:.3f}) Test({:.3f})".format(n, weights, KNN_train_MSE, KNN_test_MSE))
        # score(X, y) must be given the true targets. Passing the model's own
        # predictions as y compares the predictions with themselves and always
        # yields exactly 1.000.
        print("Score: {:.3f}".format(KNN_clf.score(xtest_heir, ytest_heir)))
    
"""
Values reported are MSE. Higher = worse
(n, weights):
(1, uniform) -> Train(0.000) Test(0.824)
(1, distance) -> Train(0.000) Test(0.824)
(5, uniform) -> Train(0.424) Test(0.644)
(5, distance) -> Train(0.000) Test(0.536)
(10, uniform) -> Train(0.505) Test(0.616)
(10, distance) -> Train(0.000) Test(0.489)
(25, uniform) -> Train(0.568) Test(0.607)
(25, distance) -> Train(0.000) Test(0.467)
(50, uniform) -> Train(0.598) Test(0.614)
(50, distance) -> Train(0.000) Test(0.466)
(100, uniform) -> Train(0.623) Test(0.626)
(100, distance) -> Train(0.000) Test(0.471)
(250, uniform) -> Train(0.654) Test(0.648)
(250, distance) -> Train(0.000) Test(0.485)
(500, uniform) -> Train(0.681) Test(0.673)
(500, distance) -> Train(0.000) Test(0.501)

Earlier runs always printed a score of exactly 1.000 because score() was given the model's own test predictions
as the target; it is now called with ytest_heir.
From the above, we can't really say anything specific except that distance as a metric seems far more useful than 
uniform.
"""


In [None]:
# Lasso (on its own)
alpha = 0.01

# Fit Lasso and, for comparison, an ordinary linear regression on the same split
lasso_model = linear_model.Lasso(alpha=alpha).fit(xtrain_heir, ytrain_heir)
linreg_model = linear_model.LinearRegression().fit(xtrain_heir, ytrain_heir)

# Lasso: predictions, then training / testing error
lasso_train_ypreds = lasso_model.predict(xtrain_heir)
lasso_test_ypreds = lasso_model.predict(xtest_heir)
lasso_train_MSE = np.mean((lasso_train_ypreds - ytrain_heir) ** 2)
lasso_test_MSE = np.mean((lasso_test_ypreds - ytest_heir) ** 2)

# Linreg: predictions, then training / testing error
linreg_train_ypreds = linreg_model.predict(xtrain_heir)
linreg_test_ypreds = linreg_model.predict(xtest_heir)
linreg_train_MSE = np.mean((linreg_train_ypreds - ytrain_heir) ** 2)
linreg_test_MSE = np.mean((linreg_test_ypreds - ytest_heir) ** 2)

# Output
print("alpha: {}".format(alpha))
print("training error (MSE): {}".format(lasso_train_MSE))
print("testing  error (MSE): {}".format(lasso_test_MSE))
print("training error linreg: ", linreg_train_MSE)
print("testing error linreg: ", linreg_test_MSE)

# Split into new testing/training dataset:
# every row of xtrain_heir is multiplied element-wise by the Lasso coefficients
# and shifted by the intercept, then the result is split again.
scaler = lambda row: np.multiply(lasso_model.coef_, row) + lasso_model.intercept_
#scaler = lambda row: row
lasso_x = np.apply_along_axis(scaler, 1, xtrain_heir)
lasso_xtrain, lasso_xtest, lasso_ytrain, lasso_ytest = train_test_split(
    lasso_x, ytrain_heir, test_size=0.2, random_state=0)



# Findings (linreg on lasso):
# Values reported are MSE. Higher = worse
# Alpha=1.00: Train(1.002) Test(0.992) lmao
# Alpha=0.10: Train(0.725) Test(0.711)
# Alpha=0.01: Train(0.660) Test(0.650)
# Basic linreg: Train(0.657) Test(0.649)
# Conclusions - useless on its own

In [None]:
# Random Forest on LASSO. Shouldn't really do anything different from PCA (probably)
RF_lasso_clf = RandomForestRegressor()

# specify parameters and distributions to sample from
RF_lasso_parameters_rand = {
    "n_estimators": sp_randint(10, 60),
    "bootstrap": [True, False],
}

# run randomized search
# Accuracy should be comparable to grid search, but runs much much faster
n_iter_search = 20
# random_state=0 fixes which parameter combinations are sampled, matching the
# PCA->LASSO random-forest search in this notebook.
RF_lasso_random_search = RandomizedSearchCV(RF_lasso_clf, param_distributions=RF_lasso_parameters_rand,
                                   n_iter=n_iter_search,
                                   n_jobs=-1,
                                   random_state=0)

RF_lasso_random_search.fit(lasso_xtrain, lasso_ytrain)

RF_lasso_predicted = RF_lasso_random_search.predict(lasso_xtest)

print("LASSO with random forest")
RF_lasso_random_search.score(lasso_xtest, lasso_ytest)

# FINDINGS (RF on lasso):
# Values reported are score. Higher = better
# Alpha = 1.00 -> -0.000195
# Alpha = 0.10 -> 0.5323 
# Alpha = 0.01 -> 0.5261
# No Lasso -> 0.5326
# Conclusion - way worse than PCA on its own. Shouldn't use it solo

In [None]:
# K-Nearest-Neighbors on LASSO
# Distance-weighted KNN over a range of neighbour counts on the Lasso-scaled split.
print("(n, weights):")
weights = 'distance'
for n in [1, 2, 3, 5, 10, 25, 50, 100, 250]:
    KNN_lasso_clf = KNeighborsRegressor(
        n_neighbors=n, weights=weights, n_jobs=-1
    ).fit(lasso_xtrain, lasso_ytrain)

    # Predictions on both halves of the split
    KNN_lasso_train_ypreds = KNN_lasso_clf.predict(lasso_xtrain)
    KNN_lasso_test_ypreds = KNN_lasso_clf.predict(lasso_xtest)

    # Training / testing error (MSE)
    KNN_lasso_train_MSE = np.mean((KNN_lasso_train_ypreds - lasso_ytrain) ** 2)
    KNN_lasso_test_MSE = np.mean((KNN_lasso_test_ypreds - lasso_ytest) ** 2)

    print("({}, {}) -> Train({:.3f}) Test({:.3f})".format(
        n, weights, KNN_lasso_train_MSE, KNN_lasso_test_MSE))
    
"""
Findings (KNN on lasso):
Values reported are MSE - higher = worse
Alpha = 1.00:
Failed (Unbelievably high error, >1.5 and took 3 minutes to run n=1)

Alpha = 0.10:
(n, weights):
(1, distance) -> Train(0.000) Test(1.026)
(2, distance) -> Train(0.000) Test(0.795)
(3, distance) -> Train(0.000) Test(0.717)
(5, distance) -> Train(0.000) Test(0.648)
(10, distance) -> Train(0.000) Test(0.594)
(25, distance) -> Train(0.000) Test(0.558)
(50, distance) -> Train(0.000) Test(0.548)
(100, distance) -> Train(0.000) Test(0.547)
(250, distance) -> Train(0.000) Test(0.550)


Alpha = 0.01:
(n, weights):
(1, distance) -> Train(0.000) Test(0.901)
(2, distance) -> Train(0.000) Test(0.707)
(3, distance) -> Train(0.000) Test(0.641)
(5, distance) -> Train(0.000) Test(0.577)
(10, distance) -> Train(0.000) Test(0.525)
(25, distance) -> Train(0.000) Test(0.504)
(50, distance) -> Train(0.000) Test(0.502)
(100, distance) -> Train(0.000) Test(0.506)
(250, distance) -> Train(0.000) Test(0.514)

"""

In [None]:
# SVM on LASSO:
# Linear-kernel SVR trained and evaluated on the Lasso-scaled split.
print("alpha: {}".format(alpha))
SVM_lasso_clf = svm.SVR(kernel='linear').fit(lasso_xtrain, lasso_ytrain)

# Predictions on both halves of the split
SVM_lasso_train_ypreds = SVM_lasso_clf.predict(lasso_xtrain)
SVM_lasso_test_ypreds = SVM_lasso_clf.predict(lasso_xtest)

# Training / testing error (MSE)
SVM_lasso_train_MSE = np.mean((SVM_lasso_train_ypreds - lasso_ytrain) ** 2)
SVM_lasso_test_MSE = np.mean((SVM_lasso_test_ypreds - lasso_ytest) ** 2)
print("Error MSE: Train({:.3f}) Test({:.3f})".format(SVM_lasso_train_MSE, SVM_lasso_test_MSE))

"""
Results:
(Note: results given in MSE. Higher = worse)
Alpha = 1.0:
Error MSE: Train(0.999) Test(1.016) 

Alpha = 0.1:
Error MSE: Train(0.704) Test(0.724)

Alpha = 0.01:
Error MSE: Train(0.660) Test(0.672)

Conclusion: It's garbo

"""

In [None]:
# Lasso on PCA
alpha = 0.01

# Fit Lasso and, for comparison, an ordinary linear regression on the PCA split
PCA_lasso_model = linear_model.Lasso(alpha=alpha).fit(PCA_xtrain, PCA_ytrain)
PCA_linreg_model = linear_model.LinearRegression().fit(PCA_xtrain, PCA_ytrain)

# Lasso: predictions, then training / testing error
PCA_lasso_train_ypreds = PCA_lasso_model.predict(PCA_xtrain)
PCA_lasso_test_ypreds = PCA_lasso_model.predict(PCA_xtest)
PCA_lasso_train_MSE = np.mean((PCA_lasso_train_ypreds - PCA_ytrain) ** 2)
PCA_lasso_test_MSE = np.mean((PCA_lasso_test_ypreds - PCA_ytest) ** 2)

# Linreg: predictions, then training / testing error
PCA_linreg_train_ypreds = PCA_linreg_model.predict(PCA_xtrain)
PCA_linreg_test_ypreds = PCA_linreg_model.predict(PCA_xtest)
PCA_linreg_train_MSE = np.mean((PCA_linreg_train_ypreds - PCA_ytrain) ** 2)
PCA_linreg_test_MSE = np.mean((PCA_linreg_test_ypreds - PCA_ytest) ** 2)

# Output
print("alpha: {}".format(alpha))
print("training error (MSE): {}".format(PCA_lasso_train_MSE))
print("testing  error (MSE): {}".format(PCA_lasso_test_MSE))
print("training error linreg: ", PCA_linreg_train_MSE)
print("testing error linreg: ", PCA_linreg_test_MSE)

# Split into new testing/training dataset:
# every row of PCA_xtrain is multiplied element-wise by the Lasso coefficients
# and shifted by the intercept, then the result is split again.
scaler = lambda row: np.multiply(PCA_lasso_model.coef_, row) + PCA_lasso_model.intercept_
PCA_lasso_x = np.apply_along_axis(scaler, 1, PCA_xtrain)
PCA_lasso_xtrain, PCA_lasso_xtest, PCA_lasso_ytrain, PCA_lasso_ytest = train_test_split(
    PCA_lasso_x, PCA_ytrain, test_size=0.2, random_state=0)

# FINDINGS (linreg on PCA->Lasso)
# Values reported are MSE: Higher = worse
# Alpha = 1.00-> Train(0.2810) Test(0.2913)
# Alpha = 0.10-> Train(0.0881) Test(0.0971)
# Alpha = 0.01-> Train(0.0670) Test(0.0737)
# No Lasso (linreg normally): Train(0.064) Test(0.071)
# Conclusion: Lasso on PCA doesn't help for just linear regression

In [None]:
# SVM on PCA->LASSO:
# Linear-kernel SVR trained and evaluated on the PCA->Lasso split.
print("alpha: {}".format(alpha))
SVM_PCA_lasso_clf = svm.SVR(kernel='linear').fit(PCA_lasso_xtrain, PCA_lasso_ytrain)

# Predictions on both halves of the split
SVM_PCA_lasso_train_ypreds = SVM_PCA_lasso_clf.predict(PCA_lasso_xtrain)
SVM_PCA_lasso_test_ypreds = SVM_PCA_lasso_clf.predict(PCA_lasso_xtest)

# Training / testing error (MSE)
SVM_PCA_lasso_train_MSE = np.mean((SVM_PCA_lasso_train_ypreds - PCA_lasso_ytrain) ** 2)
SVM_PCA_lasso_test_MSE = np.mean((SVM_PCA_lasso_test_ypreds - PCA_lasso_ytest) ** 2)
print("Error MSE: Train({:.3f}) Test({:.3f})".format(SVM_PCA_lasso_train_MSE, SVM_PCA_lasso_test_MSE))

"""
Results:
Alpha = 1.00
Error MSE: Train(0.177) Test(0.177)

Alpha = 0.10
Train(0.086) Test(0.075)

Alpha = 0.01:
Train(0.069) Test(0.064) <--- WOW -- equiv to a score of 0.936
"""

In [None]:
# Advanced SVM on PCA->LASSO utilizing  different values of C
C_range = 10. ** np.arange(4, 7)
C_MSE = []

# HERE: PCALSVMC is PCA->Lasso->SVM with C param
def computePCALSVMC_MSE(C):
    """Fit an RBF-kernel SVR with penalty C on the PCA->Lasso training split.

    Returns the mean squared error of its predictions on PCA_lasso_xtest.
    """
    PCALSVMC_clf = svm.SVR(kernel='rbf', C = C)
    PCALSVMC_clf.fit(PCA_lasso_xtrain, PCA_lasso_ytrain)
    PCALSVMC_test_ypreds = PCALSVMC_clf.predict(PCA_lasso_xtest)
    PCALSVMC_test_MSE = np.mean((PCALSVMC_test_ypreds - PCA_lasso_ytest)**2)
    return PCALSVMC_test_MSE

# `pool` was never created anywhere in the notebook, so build the worker pool
# here; the context manager shuts the workers down once map() has returned.
# NOTE(review): workers need to see this cell's function and data, which works
# with the fork start method; confirm on platforms that default to spawn.
with mp.Pool() as pool:
    C_MSE = pool.map(computePCALSVMC_MSE, C_range)
    
print("alpha: {}".format(alpha))
for c, mse in zip(C_range, C_MSE):
    print("C({}) MSE({})".format(c, mse))
    
"""
alpha: 0.01
C(0.001) MSE(0.9956695791988903)
C(0.01) MSE(0.9923254768785487)
C(0.1) MSE(0.9594169761856932)
C(1.0) MSE(0.6872493268512234)
C(10.0) MSE(0.1496952613657234)
C(100.0) MSE(0.08604381918340415)
C(1000.0) MSE(0.0787053108083247)
C(10000.0) MSE(0.07034445994490687)
C(50000) MSE(0.059558527959229696)
C(100000.0) MSE(0.054695510721515204)
C(150000) MSE(0.05171920548369242)
C(1000000.0) MSE(0.0736315107354647)
C(10000000.0) MSE(4.197551598806395)
C(100000000.0) MSE(356.09536694427385)

POST DROP TO "REAL" COMPONENTS:
C(0.01) MSE(0.9923254768785487)
C(0.1) MSE(0.9594169761856932)
C(1.0) MSE(0.6872493268512234)
C(10.0) MSE(0.1496952613657234)
C(100.0) MSE(0.08604381918340415)
C(1000.0) MSE(0.0787053108083247)
C(10000.0) MSE(0.07034445994490687)
C(100000.0) MSE(0.054695510721515204)
C(1000000.0) MSE(0.0736315107354647)


"""

In [None]:
# Plot values of C vs MSE
_tools_to_show = 'box_zoom,pan,save,hover,reset,tap,wheel_zoom'
p_C_MSE = figure(
    plot_width=400,
    plot_height=400,
    title=None,
    tools=_tools_to_show,
)

# one green marker per (C, test MSE) pair from the sweep
p_C_MSE.circle(x=C_range, y=C_MSE, size=10, color="green", alpha=0.5)

# show the results
show(p_C_MSE)

In [None]:
# Using our optimal value of C, we cross validate to find the optimal value of gamma
from itertools import chain
# Materialised as a list: a bare chain() iterator is exhausted by the first pass
# over it, which left nothing for the print loop below to zip against.
gamma_range = list(chain(10. ** np.arange(-5, 1), [0.2, 0.3, 0.4]))
def computePCALSVMC_MSE_GAMMA(g):
    """Fit an RBF-kernel SVR (C = 150000) with kernel width g on the PCA->Lasso
    training split.

    Returns the mean squared error of its predictions on PCA_lasso_xtest.
    """
    PCALSVMC_clf = svm.SVR(kernel='rbf', C = 150000, gamma = g)
    PCALSVMC_clf.fit(PCA_lasso_xtrain, PCA_lasso_ytrain)
    PCALSVMC_test_ypreds = PCALSVMC_clf.predict(PCA_lasso_xtest)
    PCALSVMC_test_MSE = np.mean((PCALSVMC_test_ypreds - PCA_lasso_ytest)**2)
    # Return the value so pool.map() can collect it. Appending to a global list
    # from inside the worker fails (the list did not exist yet) and would not
    # reach the parent process anyway.
    return PCALSVMC_test_MSE


# `pool` was never created anywhere in the notebook, so build the worker pool
# here; the context manager shuts the workers down once map() has returned.
# NOTE(review): workers need to see this cell's function and data, which works
# with the fork start method; confirm on platforms that default to spawn.
with mp.Pool() as pool:
    gamma_MSE = pool.map(computePCALSVMC_MSE_GAMMA, gamma_range)

print("alpha: {}".format(alpha))
print("C = 150000")
for g, mse in zip(gamma_range, gamma_MSE):
    print("gamma({}) MSE({})".format(g, mse))
   


'''
C = 15000
gamma(1e-05) MSE(0.08045146016190116)
gamma(0.0001) MSE(0.06162189690329871)
gamma(0.001) MSE(0.028257963289700597)
gamma(0.01) MSE(0.009621058122301473)
gamma(0.1) MSE(0.008151309634909475)
gamma(0.2) MSE(0.009688701255743915)
gamma(0.3) MSE(0.009684018266092493)
gamma(0.4) MSE(0.009733114479588486)
gamma(1.0) MSE(0.010803416697807116)

'''

In [None]:
# Re-print the gamma sweep results: one line per (gamma, MSE) pair, stopping at
# the shorter of gamma_range and gamma_MSE
for c, mse in zip(gamma_range, gamma_MSE):
    print("gamma({}) MSE({})".format(c, mse)) 

In [None]:
# Random forest on PCA->LASSO:
RF_PCA_lasso_clf = RandomForestRegressor()

# search space: tree count drawn from sp_randint(10, 60), bootstrap on or off
RF_PCA_lasso_parameters_rand = {
    "n_estimators": sp_randint(10, 60),
    "bootstrap": [True, False]
}

# run randomized search
# Accuracy should be comparable to grid search, but runs much much faster
n_iter_search = 20
RF_PCA_lasso_random_search = RandomizedSearchCV(
    RF_PCA_lasso_clf,
    param_distributions=RF_PCA_lasso_parameters_rand,
    n_iter=n_iter_search,
    n_jobs=-1,
    random_state=0,
)

RF_PCA_lasso_random_search.fit(PCA_lasso_xtrain, PCA_lasso_ytrain)

RF_PCA_lasso_predicted = RF_PCA_lasso_random_search.predict(PCA_lasso_xtest)

print("PCA->LASSO with random forest")
print("alpha = {}".format(alpha))
RF_PCA_lasso_random_search.score(PCA_lasso_xtest, PCA_lasso_ytest)

# FINDINGS (RF on PCA->lasso):
# Values reported are score. Higher = better
# Alpha = 1.00 -> 0.852
# Alpha = 0.10 -> 0.911
# Alpha = 0.01 -> 0.910
# No lasso (taken from above) -> 0.914
# Conclusion: It isn't the worst thing in the world, but is ultimately probably worse than RF on PCA without lasso
# Note that unless seeded, results vary fairly significantly from run to run.


In [None]:
# K-Nearest-Neighbors on PCA->LASSO
# Distance-weighted KNN over a range of neighbour counts on the PCA->Lasso split.
print("alpha = {}".format(alpha))
print("(n, weights):")
weights = 'distance'
for n in [1, 2, 3, 5, 10, 25, 50, 100, 250]:
    KNN_PCA_lasso_clf = KNeighborsRegressor(
        n_neighbors=n, weights=weights, n_jobs=-1
    ).fit(PCA_lasso_xtrain, PCA_lasso_ytrain)

    # Predictions on both halves of the split
    KNN_PCA_lasso_train_ypreds = KNN_PCA_lasso_clf.predict(PCA_lasso_xtrain)
    KNN_PCA_lasso_test_ypreds = KNN_PCA_lasso_clf.predict(PCA_lasso_xtest)

    # Training / testing error (MSE)
    KNN_PCA_lasso_train_MSE = np.mean((KNN_PCA_lasso_train_ypreds - PCA_lasso_ytrain) ** 2)
    KNN_PCA_lasso_test_MSE = np.mean((KNN_PCA_lasso_test_ypreds - PCA_lasso_ytest) ** 2)

    print("({}, {}) -> Train({:.3f}) Test({:.3f})".format(
        n, weights, KNN_PCA_lasso_train_MSE, KNN_PCA_lasso_test_MSE))
    
"""
Findings (KNN on PCA->lasso):
Values reported are MSE - higher = worse
Alpha = 1.00:
(n, weights):
(1, distance) -> Train(0.000) Test(0.261)
(2, distance) -> Train(0.000) Test(0.194)
(3, distance) -> Train(0.000) Test(0.171)
(5, distance) -> Train(0.000) Test(0.153)
(10, distance) -> Train(0.000) Test(0.143)
(25, distance) -> Train(0.000) Test(0.145)
(50, distance) -> Train(0.000) Test(0.152)
(100, distance) -> Train(0.000) Test(0.170)
(250, distance) -> Train(0.000) Test(0.213)

Alpha = 0.10:
(n, weights):
(1, distance) -> Train(0.000) Test(0.153)
(2, distance) -> Train(0.000) Test(0.105)
(3, distance) -> Train(0.000) Test(0.088)
(5, distance) -> Train(0.000) Test(0.077)
(10, distance) -> Train(0.000) Test(0.078)
(25, distance) -> Train(0.000) Test(0.084)
(50, distance) -> Train(0.000) Test(0.099)
(100, distance) -> Train(0.000) Test(0.121)
(250, distance) -> Train(0.000) Test(0.165)


Alpha = 0.01:
(n, weights):
(1, distance) -> Train(0.000) Test(0.141)
(2, distance) -> Train(0.000) Test(0.096)
(3, distance) -> Train(0.000) Test(0.081)
(5, distance) -> Train(0.000) Test(0.073)
(10, distance) -> Train(0.000) Test(0.075)
(25, distance) -> Train(0.000) Test(0.083)
(50, distance) -> Train(0.000) Test(0.098)
(100, distance) -> Train(0.000) Test(0.120)
(250, distance) -> Train(0.000) Test(0.162)

alpha = 0.001
(n, weights):
(1, distance) -> Train(0.000) Test(0.140)
(2, distance) -> Train(0.000) Test(0.097)
(3, distance) -> Train(0.000) Test(0.081)
(5, distance) -> Train(0.000) Test(0.072)
(10, distance) -> Train(0.000) Test(0.073)
(25, distance) -> Train(0.000) Test(0.083)
(50, distance) -> Train(0.000) Test(0.098)
(100, distance) -> Train(0.000) Test(0.120)
(250, distance) -> Train(0.000) Test(0.162)

"""

In [None]:
# K-Nearest-Neighbors on PCA (probably should've done this one first)
# No Lasso step is involved here, so alpha is not printed: it does not affect
# these numbers, and printing it made this cell depend on a Lasso cell having run.
print("(n, weights):")
weights = 'distance'
for n in [1, 2, 3, 5, 10, 25, 50, 100, 250]:
    KNN_PCA_clf = KNeighborsRegressor(n_neighbors=n, weights=weights, n_jobs=-1)
    KNN_PCA_clf.fit(PCA_xtrain, PCA_ytrain)

    # Training error
    KNN_PCA_train_ypreds = KNN_PCA_clf.predict(PCA_xtrain)
    KNN_PCA_train_MSE = np.mean((KNN_PCA_train_ypreds - PCA_ytrain) ** 2)

    # Testing error
    KNN_PCA_test_ypreds = KNN_PCA_clf.predict(PCA_xtest)
    KNN_PCA_test_MSE = np.mean((KNN_PCA_test_ypreds - PCA_ytest) ** 2)

    print("({}, {}) -> Train({:.3f}) Test({:.3f})".format(n, weights, KNN_PCA_train_MSE, KNN_PCA_test_MSE))
    
"""
Findings (KNN on PCA):
Values reported are MSE - higher = worse
(n, weights):
(1, distance) -> Train(0.000) Test(0.205)
(2, distance) -> Train(0.000) Test(0.141)
(3, distance) -> Train(0.000) Test(0.119)
(5, distance) -> Train(0.000) Test(0.111)
(10, distance) -> Train(0.000) Test(0.110)
(25, distance) -> Train(0.000) Test(0.133)
(50, distance) -> Train(0.000) Test(0.163)
(100, distance) -> Train(0.000) Test(0.199)
(250, distance) -> Train(0.000) Test(0.266)

Note: Seems to run noticeably slower than lasso 
"""

In [None]:
# Sanity check: (rows, columns) of the PCA training features
PCA_xtrain.shape