In [5]:
import csv
import numpy as np
import os
import time
import torch
from collections import OrderedDict
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression, BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from torch.autograd import Variable
from sklearn.feature_selection import mutual_info_regression

In [6]:
# Seed for the row shuffle so the train/test partition built from this
# ordering is the same on every Restart & Run All.
SHUFFLE_SEED = 0

hdrs = None
data = None

# Parse the song-feature CSV: the first row holds the column names, every
# following row is one record. `newline=""` is what the csv module expects
# so that quoted fields containing line breaks are read correctly.
with open("data/random_song_features.csv", "r", newline="") as f:
    all_rows = list(csv.reader(f))
    hdrs = all_rows[0]
    data = all_rows[1:]

# Everything is still text at this point (csv.reader yields strings).
data = np.array(data)

# Shuffle rows in place with a private generator: reproducible, and it does
# not disturb NumPy's global random state used elsewhere.
np.random.RandomState(SHUFFLE_SEED).shuffle(data)

pop_index = hdrs.index('popularity')

# The first two columns are set aside as `all_id`; the remaining columns are
# the numeric table. Because two leading columns are dropped, the target
# column sits at `pop_index - 2` inside the trimmed array.
all_id = data[:, :2]
data = data[:, 2:]
y_data = data[:, pop_index - 2]
x_data = np.delete(data, pop_index - 2, 1)

# `np.float` was only an alias of the builtin `float` and no longer exists in
# current NumPy releases; the builtin gives the same float64 result.
y_data = y_data.astype(float)
x_data = x_data.astype(float)

In [7]:
# Hold out the last 10% of the (already shuffled) rows for testing; the
# leading 90% is used for fitting.
N_ROWS = x_data.shape[0]
N_TRAIN = int(0.9 * N_ROWS)
N_TEST = N_ROWS - N_TRAIN

x_train, x_test = x_data[:N_TRAIN], x_data[N_TRAIN:]
y_train, y_test = y_data[:N_TRAIN], y_data[N_TRAIN:]

# Report both partition sizes.
split_summary = "Train: {}\nTest:  {}".format(N_TRAIN, N_ROWS - N_TRAIN)
print(split_summary)

Train: 39060
Test:  4340


In [19]:
from sklearn.linear_model import \
    LinearRegression, LogisticRegression, BayesianRidge, ElasticNet, Lasso
from sklearn.ensemble import \
    RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

# Output directory for figures and the metrics CSV. `exist_ok=True` makes the
# call idempotent without a separate (racy) existence check.
os.makedirs('results', exist_ok=True)

# Worker count handed to the ensemble models that accept `n_jobs`.
N_JOBS = 8

# Models fitted once with their default hyper-parameters.
regressors = {
    'linear':   LinearRegression(),
    'bayesian': BayesianRidge(),
    'mlp':      MLPRegressor(),
    'linSVR':   LinearSVR(),
    'elastic':  ElasticNet(),
    'lasso':    Lasso()
}

# Ensemble families swept over several ensemble sizes. Each entry maps a
# short key to (legend label, estimator class). The "fast" families are
# constructed with `n_jobs`; the "slow" ones are constructed without it.
fast_regressors = {
    'forest': ('Random Forest', RandomForestRegressor),
    'bagger': ('Bagging', BaggingRegressor),
    'extrat': ('Extra Trees', ExtraTreesRegressor)
}
slow_regressors = {
    'gboost': ('Gradient Boosting', GradientBoostingRegressor),
    'aboost': ('AdaBoost Boosting', AdaBoostRegressor)
}
fast_estimators = [1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
slow_estimators = [1, 2, 3, 4, 5, 10, 20]

# Register one model per (family, ensemble size) under the key "<family>-<n>".
# Insertion order is preserved, so it is also the fitting/reporting order.
for key, (_, regressor_cls) in fast_regressors.items():
    for i in fast_estimators:
        regressors['{}-{}'.format(key, i)] = regressor_cls(n_jobs=N_JOBS, n_estimators=i)
for key, (_, regressor_cls) in slow_regressors.items():
    for i in slow_estimators:
        regressors['{}-{}'.format(key, i)] = regressor_cls(n_estimators=i)

# NOTE: from here on `data` is rebound to a dict (model name -> metrics); it
# no longer refers to the feature array loaded earlier in the notebook.
data = {}

# Fit every registered model on the training split and report how long each
# fit took. perf_counter() is monotonic, so the interval cannot be skewed by
# system clock adjustments.
for name, regressor in regressors.items():
    print("Starting on regressor {}     ".format(name), end='\t')
    start_time = time.perf_counter()
    regressor.fit(x_train, y_train)
    end_time = time.perf_counter()
    print("Fit time: {:.2f}".format(end_time - start_time))

Starting on regressor linear	Fit time: 0.03
Starting on regressor bayesian	Fit time: 0.05
Starting on regressor mlp	Fit time: 2.02
Starting on regressor linSVR	Fit time: 9.24
Starting on regressor elastic	



Fit time: 1.67
Starting on regressor lasso	Fit time: 1.65
Starting on regressor forest-1	Fit time: 0.84
Starting on regressor forest-2	Fit time: 0.93
Starting on regressor forest-3	Fit time: 0.93
Starting on regressor forest-4	Fit time: 0.93
Starting on regressor forest-5	Fit time: 1.04
Starting on regressor forest-10	Fit time: 2.16
Starting on regressor forest-20	Fit time: 3.32
Starting on regressor forest-30	Fit time: 4.74
Starting on regressor forest-40	Fit time: 6.69
Starting on regressor forest-50	Fit time: 8.04
Starting on regressor forest-100	Fit time: 15.24
Starting on regressor bagger-1	Fit time: 0.80
Starting on regressor bagger-2	Fit time: 1.15
Starting on regressor bagger-3	Fit time: 1.15
Starting on regressor bagger-4	Fit time: 1.25
Starting on regressor bagger-5	Fit time: 1.37
Starting on regressor bagger-10	Fit time: 2.50
Starting on regressor bagger-20	Fit time: 4.09
Starting on regressor bagger-30	Fit time: 6.05
Starting on regressor bagger-40	Fit time: 8.02
Starting o

In [29]:
def _plot_metric_by_estimators(results, families, estimator_counts,
                               metric_idx, ylabel, title, out_path):
    """Plot one metric against ensemble size, one line per model family.

    Args:
        results: dict mapping "<family>-<n>" to [average error, RMSE].
        families: dict mapping a family key to (legend label, estimator class).
        estimator_counts: ensemble sizes that were fitted for these families.
        metric_idx: position of the metric inside each results entry
            (0 = average error, 1 = RMSE).
        ylabel: y-axis label.
        title: figure title.
        out_path: file the figure is saved to.
    """
    plt.clf()
    for key, (label, _) in families.items():
        series = [
            results['{}-{}'.format(key, n)][metric_idx]
            for n in estimator_counts
        ]
        plt.plot(estimator_counts, series, label=label)
    plt.xlabel('# Estimators')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.savefig(out_path)


# Score every fitted model on the held-out split, save a histogram of its
# prediction errors and remember [average error, RMSE] in `data`.
for name, regressor in regressors.items():
    print("Analyzing results for regressor {}...  ".format(name), end='\t')
    pred = regressor.predict(x_test)

    # Plain NumPy arithmetic: the previous torch round-trip indexed the
    # result of a reduction with `[0]`, which fails on PyTorch >= 0.4 where
    # reductions return 0-dimensional tensors. Staying in NumPy also keeps
    # full float64 precision.
    errs = pred - y_test
    avg_err = float(np.abs(errs).mean())        # mean absolute error
    rmse = float(np.sqrt(np.mean(errs ** 2)))   # root mean squared error
    data[name] = [avg_err, rmse]

    plt.clf()
    plt.hist(errs, bins=100)
    plt.xlabel("Popularity Prediction Error")
    plt.ylabel("Count")
    plt.title(
        'Prediction Errors for Model \"{}\"'
        '\nAverage Error: {:4.2f}, Test RMSE: {:4.2f}'.format(
        name, avg_err, rmse
    ))
    plt.savefig("results/v2-split90-{}.png".format(name))
    print("Average Error: {:4.2f}, Test RMSE: {:4.2f}".format(
        avg_err, rmse
    ))

# Persist the metrics table. `newline=''` lets the csv module control line
# endings itself, which avoids blank lines between rows on Windows.
with open("results/v2-split90-data.csv", "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Model', 'Average Error', 'RMSE'])
    for model, metrics in data.items():
        writer.writerow([model] + metrics)

# One figure per (family group, metric), saved in this order:
# fast-err, fast-rmse, slow-err, slow-rmse.
for families, counts, tag, group in (
    (fast_regressors, fast_estimators, 'fast', 'Parallelizable'),
    (slow_regressors, slow_estimators, 'slow', 'Non-Parallelizable'),
):
    for metric_idx, ylabel, suffix in (
        (0, 'Average Prediction Error', 'err'),
        (1, 'RMSE', 'rmse'),
    ):
        _plot_metric_by_estimators(
            data, families, counts, metric_idx, ylabel,
            '{} vs. # Estimators for {} Regressors'.format(ylabel, group),
            'results/{}-{}.png'.format(tag, suffix),
        )

print("Done analyzing results!")

Analyzing results for regressor linear...  	Average Error: 8.22, Test RMSE: 11.01
Analyzing results for regressor bayesian...  	Average Error: 8.22, Test RMSE: 11.01
Analyzing results for regressor mlp...  	Average Error: 11.49, Test RMSE: 15.07
Analyzing results for regressor linSVR...  	Average Error: 20.12, Test RMSE: 28.99
Analyzing results for regressor elastic...  	Average Error: 8.24, Test RMSE: 11.07
Analyzing results for regressor lasso...  	Average Error: 8.24, Test RMSE: 11.07
Analyzing results for regressor forest-1...  	Average Error: 10.64, Test RMSE: 14.38
Analyzing results for regressor forest-2...  	Average Error: 9.20, Test RMSE: 12.28
Analyzing results for regressor forest-3...  	Average Error: 8.64, Test RMSE: 11.51
Analyzing results for regressor forest-4...  	Average Error: 8.45, Test RMSE: 11.15
Analyzing results for regressor forest-5...  	Average Error: 8.24, Test RMSE: 10.93
Analyzing results for regressor forest-10...  	Average Error: 7.78, Test RMSE: 10.42
A

In [70]:
# Popularity value at or above which a song is labelled as the positive
# class (1); everything below it is the negative class (0).
POPULARITY_THRESHOLD = 80

print("Logistic Regression: Classification")
lr = LogisticRegression(n_jobs=8, tol=10 ** -10)

# Fit on binary labels derived from the popularity target. `np.int` was only
# an alias of the builtin `int` and no longer exists in current NumPy
# releases; the builtin produces the same integer labels.
start_time = time.time()
lr.fit(x_train, (y_train >= POPULARITY_THRESHOLD).astype(int))
end_time = time.time()
print("Fit time: {:.2f}".format(end_time - start_time))

pred = lr.predict(x_test)

# Number of test rows predicted as the positive class.
print(int(pred.sum()))

# Among test rows whose true popularity is below the threshold (`total`),
# count how many the classifier also labelled 0 (`correct`).
below_threshold = y_test < POPULARITY_THRESHOLD
total = int(below_threshold.sum())
correct = int((pred[below_threshold] == 0).sum())
print(total, correct)

Fit time: 2.12


In [27]:
# Names of the columns that make up x_data: the header row without its first
# two entries and without the 'popularity' target column.
features = list(hdrs[2:])
features.remove('popularity')

# Estimated mutual information between each feature column and the target,
# then the column indices ordered from highest to lowest score.
mi = mutual_info_regression(x_data, y_data)
feature_ranking = np.flip(np.argsort(mi))

# Cell output: feature names from highest to lowest score.
np.take(np.array(features), feature_ranking)

array(['artist_popularity', 'release_date', 'album_length', 'loudness',
       'instrumentalness', 'duration', 'num_segments', 'acousticness',
       'start_of_fade_out', 'num_tatums', 'energy', 'num_beats',
       'num_sections', 'track_number', 'num_bars', 'danceability',
       'valence', 'explicit', 'speechiness', 'time_signature_confidence',
       'end_of_fade_in', 'mode_confidence', 'tempo', 'time_signature',
       'liveness', 'tempo_confidence', 'mode', 'key', 'key_confidence'],
      dtype='<U25')

In [3]:
def table(csv):
    """Print the contents of a CSV file as a LaTeX ``tabular`` environment.

    Every non-blank record becomes one table row followed by ``\\hline``.
    The column specification (``|l|l|...|``) is sized from the first
    non-blank record. Cells are not LaTeX-escaped.

    Args:
        csv: Path of the CSV file to render. (The name shadows the ``csv``
            module inside this function; it is kept so existing keyword
            callers continue to work.)

    Output:
        Writes the LaTeX source to stdout. Nothing is printed when the file
        contains no records, so an unmatched ``\\end{tabular}`` is never
        emitted.
    """
    # Local alias because the parameter hides the module-level `csv` name.
    import csv as csv_module

    # newline='' is required for csv.reader to handle embedded line breaks.
    with open(csv, newline='') as f:
        opened = False
        # csv.reader (rather than split(',')) keeps quoted fields that
        # contain commas in a single cell.
        for row in csv_module.reader(f):
            cells = [cell.strip() for cell in row]
            # Skip blank lines instead of emitting an empty table row.
            if cells in ([], ['']):
                continue
            if not opened:
                print("\\begin{tabular}{|" + "l|" * len(cells) + "}\\hline")
                opened = True
            print(" & ".join(cells) + " \\\\\\hline")
        if opened:
            print("\\end{tabular}")
    
# table("results_43400/v2-split90-data.csv")

\begin{tabular}{|l|l|l|}\hline
Model & Average Error & RMSE \\\hline
linear & 8.16702 & 10.9666643759 \\\hline
bayesian & 8.16673 & 10.9671216724 \\\hline
forest-1 & 10.468 & 14.2349686265 \\\hline
forest-2 & 9.26406 & 12.2648950167 \\\hline
forest-3 & 8.42197 & 11.2565322817 \\\hline
forest-4 & 8.20438 & 10.9301958824 \\\hline
forest-5 & 8.04336 & 10.6330513003 \\\hline
forest-10 & 7.62878 & 10.1286369129 \\\hline
forest-20 & 7.42177 & 9.91521117932 \\\hline
forest-30 & 7.3393 & 9.82849583391 \\\hline
forest-40 & 7.29246 & 9.76124764798 \\\hline
forest-50 & 7.29152 & 9.73650031398 \\\hline
forest-100 & 7.25652 & 9.72042142821 \\\hline
forest-200 & 7.20697 & 9.63950109076 \\\hline
forest-300 & 7.19834 & 9.6407047649 \\\hline
\end{tabular}
