In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
import bokeh
from bokeh.plotting import figure, output_file, show, ColumnDataSource
# import bokeh.charts.utils
import bokeh.io
import bokeh.models
from bokeh.models import HoverTool
import bokeh.palettes
import bokeh.plotting
import random
from random import sample
from sklearn import svm, neighbors


from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Features were already standardised in Excel: Xnew = (X - mean) / std.
# To standardise in Python instead, preprocessing.scale() can be used.
# Optional filter, currently disabled:
#Data = Data.drop(Data[(Data.time_signature > 5)].index)
# Load the songs table and keep only its first 18 columns.
Data = pd.read_csv('songs4.csv').iloc[:, :18]
Data.head()

In [None]:
# Check the number of rows.
# DataFrame.count() reports the number of non-null entries per column, so a
# value equals the row count only for columns that have no missing data.
Data.count()

In [None]:
# Divide into testing and training.
# `valence` is the prediction target; every other column is kept as a feature.
# `axis` is passed by keyword: pandas 2.0 made it keyword-only in
# DataFrame.drop, so the positional form `drop('valence', 1)` raises TypeError.
x = Data.drop('valence', axis=1)
y = Data.valence
# Fixed seed so the split (and everything derived from it) is identical after
# a kernel restart, matching the seeded PCA split further down.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.20, random_state=0)
# Re-attach the target to the training features and keep at most the first
# 5000 rows (positional slice) -- presumably to bound the cost of the
# song-by-song correlation computed later in the notebook.
train = xtrain.assign(valence=ytrain)
train = train.iloc[:5000]
train.head()

In [None]:
# Create correlation matrix: pairwise correlation between the columns of `train`
# (the feature columns plus valence).
# NOTE(review): the ID column is still part of `train` at this point, so if it
# is numeric it shows up in this matrix too -- confirm whether that is intended.
M = train.corr()

In [None]:
# Display the column-by-column correlation matrix as a table.
M

In [None]:
# Heat-map of the column-by-column correlation matrix M
fig, ax = plt.subplots()
heatmap = ax.imshow(M)
fig.colorbar(heatmap, ax=ax)
plt.show()

In [None]:
# SVD using numpy function.
# U holds the left singular vectors, E the singular values (numpy returns them
# sorted in descending order) and VT the transposed right singular vectors.
U, E, VT = np.linalg.svd(M)

In [None]:
# Singular values of the column correlation matrix, largest first
fig, ax = plt.subplots()
ax.plot(E)
plt.show()

In [None]:
# Coordinates along the two leading components: the first two left singular
# vectors, each scaled by its singular value.
leading_vectors = U[:, :2]
leading_scale = np.diag(E[:2])
P = leading_vectors.dot(leading_scale)

In [None]:
# Scatter of the two leading components, one marker per row of P
fig, ax = plt.subplots()
ax.plot(P[:, 0], P[:, 1], 'o')
plt.show()

In [None]:
# Build a features-by-songs table so that DataFrame.corr(), which correlates
# columns, can compare songs with each other in the next cell.
# The target column is removed first: the song-to-song correlations feed the
# PCA features that are later used to predict valence, so keeping valence in
# here would leak the target into the predictors.
N = train.drop('valence', axis=1).T
# Use the first row of the transposed table as the column labels and then
# remove the 'ID' row from the data (assumes ID is the first column of `train`).
N.columns = N.iloc[0]
N = N.drop('ID')
# The transpose of a mixed-dtype frame is object-typed; corr() needs numbers.
N = N.astype(float)

In [None]:
# Takes 10-15 minutes with all of the data.
# corr() correlates columns, and the columns of N are songs, so the result is
# a song-by-song correlation matrix.
# NOTE: this cell overwrites N with its own correlation matrix, so running it
# a second time without re-running the previous cell gives a different result.
N = N.corr()
N.head()

In [None]:
# Heat-map of how songs correlate with each other; per the original analysis,
# there is a fair amount of uniqueness among songs.
fig, ax = plt.subplots()
heatmap = ax.imshow(N)
fig.colorbar(heatmap, ax=ax)
plt.show()

In [None]:
# SVD of the song-by-song correlation matrix. This rebinds U, E and VT, which
# previously held the decomposition of the column correlation matrix M.
U, E, VT = np.linalg.svd(N)

In [None]:
# First ten singular values; per the original analysis, most of the variance
# can be explained using the first 8 or so components.
fig, ax = plt.subplots()
ax.plot(E[:10])
plt.show()

In [None]:
# Song coordinates along the two leading components: the first two left
# singular vectors, each scaled by its singular value.
leading_vectors = U[:, :2]
leading_scale = np.diag(E[:2])
P = leading_vectors.dot(leading_scale)
print(P)

In [None]:
# Interactive scatter of the first two principal components, to get an idea of
# the shape of the data.
# NOTE(review): `plot_width` / `plot_height` were removed in Bokeh 3.0 in favour
# of `width` / `height` -- confirm which Bokeh version this notebook targets.
_tools_to_show = 'box_zoom,pan,save,hover,reset,tap,wheel_zoom'
p = figure(
    plot_width=400,
    plot_height=400,
    title=None,
    tools=_tools_to_show,
)

# One small, mostly transparent navy circle per row of P
first_component = P[:, 0]
second_component = P[:, 1]
p.circle(first_component, second_component, size=4, color="navy", alpha=0.2)

# Render the figure
show(p)

In [None]:
# SVM on PCA results

In [None]:
# Further divide training and testing based on principal components.
# Every component is kept here; to restrict the model to the leading k
# components, slice both factors, e.g. U[:, :k] * E[:k].
# Broadcasting scales column j of U by E[j], which is the same matrix as
# np.dot(U, np.diag(E)) but avoids building a dense diagonal matrix and running
# a full matrix product over it.
P = U * E
PCA_xtrain, PCA_xtest, PCA_ytrain, PCA_ytest = train_test_split(P, train['valence'], test_size = 0.2, random_state = 0)

In [None]:
# Support vector regressor with a linear kernel. Despite the `_clf` suffix,
# SVR is a regression estimator, not a classifier.
SVM_clf = svm.SVR(kernel='linear')

In [None]:
# Fit the regressor on the training part of the PCA split.
SVM_clf.fit(PCA_xtrain, PCA_ytrain)

In [None]:
# Testing error: mean squared error on the held-out part of the PCA split
SVM_test_ypreds = SVM_clf.predict(PCA_xtest)
SVM_test_sq_err = (SVM_test_ypreds - PCA_ytest) ** 2
SVM_test_MSE = np.mean(SVM_test_sq_err)
SVM_test_MSE

In [None]:
# Score on the held-out split. For scikit-learn regressors, score() returns the
# coefficient of determination (R^2), not an accuracy.
SVM_clf.score(PCA_xtest, PCA_ytest)

In [None]:
# R^2 on the data the model was fitted on, for comparison with the held-out
# score.
SVM_clf.score(PCA_xtrain, PCA_ytrain)

In [None]:
# Training error: mean squared error on the data the model was fitted on.
SVM_train_ypreds = SVM_clf.predict(PCA_xtrain)
# Stored under its own name so that the held-out error kept in SVM_test_MSE
# is not overwritten by the training error.
SVM_train_MSE = np.mean((SVM_train_ypreds - PCA_ytrain)**2)
SVM_train_MSE

In [None]:
# Random Forest
# Seeded so that tree construction is repeatable between runs.
RF_clf = RandomForestRegressor(random_state=0)

# specify parameters and distributions to sample from
parameters_rand = {
    "n_estimators": sp_randint(10, 60),  # integers drawn from [10, 60)
    "bootstrap": [True, False],
}

# run randomized search
# Scores should be comparable to grid search, but it runs much faster because
# only n_iter_search parameter combinations are sampled and evaluated.
n_iter_search = 20
# random_state fixes which parameter combinations are sampled, so the search
# gives the same result after a kernel restart (consistent with the seeded
# PCA train/test split).
random_search = RandomizedSearchCV(RF_clf, param_distributions=parameters_rand,
                                   n_iter=n_iter_search,
                                   n_jobs=-1,
                                   random_state=0)

random_search.fit(PCA_xtrain, PCA_ytrain)

# Predictions of the refitted best estimator on the held-out split
predicted = random_search.predict(PCA_xtest)

print("PCA with random forest")
# Regression score (R^2) of the best estimator on the held-out split
random_search.score(PCA_xtest, PCA_ytest)

In [None]:
# Lasso (on its own): fitted directly on the original feature columns of the
# full xtrain/xtest split rather than on the PCA output.
from sklearn import linear_model

# Positional slice that skips the first column of each split (the song ID,
# going by the `_no_id` naming).
xtrain_no_id = xtrain.iloc[:, 1:]
xtest_no_id  = xtest.iloc[:, 1:]

# Fitted models, keyed by the alpha they were trained with
lasso_models = {}

for alpha in (0.01, 0.05, 0.1, 0.2, 0.5, 0.7, 1.0):
    # Train one model per regularisation strength
    lasso_model = linear_model.Lasso(alpha=alpha).fit(xtrain_no_id, ytrain)

    # Mean squared error on the data the model was fitted on
    lasso_train_ypreds = lasso_model.predict(xtrain_no_id)
    lasso_train_MSE = np.mean(np.square(lasso_train_ypreds - ytrain))

    # Mean squared error on the held-out data
    lasso_test_ypreds = lasso_model.predict(xtest_no_id)
    lasso_test_MSE = np.mean(np.square(lasso_test_ypreds - ytest))

    # Keep the fitted model for later inspection
    lasso_models[alpha] = lasso_model

    # Report both errors for this alpha
    print("alpha: {}".format(alpha))
    print("training error: {}".format(lasso_train_MSE))
    print("testing  error: {}".format(lasso_test_MSE))
