In [4]:
import csv
import numpy as np
import os
import time
import torch
from collections import OrderedDict
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression, BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from torch.autograd import Variable
from sklearn.feature_selection import mutual_info_regression

In [2]:
hdrs = None
data = None

# Parse the song-feature table: the first CSV row is the header, every
# following row is one record. newline="" is what the csv module requires
# so that it can do its own newline handling inside quoted fields.
with open("data/random_song_features.csv", "r", newline="") as f:
    reader = csv.reader(f)
    all_rows = list(reader)
    hdrs = all_rows[0]
    data = all_rows[1:]

data = np.array(data)
# Shuffle rows in place so the later positional train/test split is random.
# NOTE(review): the shuffle is unseeded, so the split differs on every run.
np.random.shuffle(data)
pop_index = hdrs.index('popularity')
# The target column is addressed as `pop_index - 2` once the two leading
# columns are stripped; a target inside those two columns would silently
# become a negative index and select the wrong column.
if pop_index < 2:
    raise ValueError(
        "'popularity' must not be one of the first two columns "
        "(found at index {})".format(pop_index)
    )
# First two columns are set aside (presumably identifiers -- confirm against
# the CSV); everything else is either a feature or the target.
all_id = data[:,:2]
data = data[:,2:]
y_data = data[:,pop_index - 2]
x_data = np.delete(data, pop_index - 2, 1)
# `np.float` was only an alias for the builtin float and was removed in
# NumPy 1.24; the builtin yields the same float64 arrays.
y_data = y_data.astype(float)
x_data = x_data.astype(float)

In [3]:
# Positional 90/10 split: the leading 90% of rows train the models and the
# remaining rows are held out for testing.
N_ROWS = x_data.shape[0]
N_TRAIN = int(0.9 * N_ROWS)
N_TEST = N_ROWS - N_TRAIN
x_train, x_test = x_data[:N_TRAIN], x_data[N_TRAIN:]
y_train, y_test = y_data[:N_TRAIN], y_data[N_TRAIN:]
print("Train: {}\nTest:  {}".format(N_TRAIN, N_TEST))

Train: 39060
Test:  4340


In [5]:
from sklearn.linear_model import \
    LinearRegression, LogisticRegression, BayesianRidge, ElasticNet, Lasso
from sklearn.ensemble import \
    RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR

# Output directory used by the cells that save plots and CSV summaries.
if not os.path.exists('results'):
    os.makedirs('results')

# Candidate models, keyed by the short name used in file names and reports.
# Each is constructed with library defaults apart from n_jobs where given.
regressors = dict(
    linear=LinearRegression(),
    bayesian=BayesianRidge(),
    mlp=MLPRegressor(),
    linSVR=LinearSVR(),
    adaboost=AdaBoostRegressor(),
    bagging=BaggingRegressor(n_jobs=8),
    extra=ExtraTreesRegressor(n_jobs=8),
    gradient=GradientBoostingRegressor(),
    elastic=ElasticNet(),
    lasso=Lasso(),
)

# One random forest per ensemble size, appended after the models above so
# the iteration (and therefore reporting) order is unchanged.
estimators = [1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 100, 200, 300]
regressors.update({
    "forest-{}".format(n_trees): RandomForestRegressor(n_estimators=n_trees, n_jobs=8)
    for n_trees in estimators
})

# Per-model metrics are collected into this dict by the evaluation cell.
data = {}
for name, regressor in regressors.items():
    print("Starting on regressor {}".format(name), end='\t')
    start_time = time.time()
    regressor.fit(x_train, y_train)
    elapsed = time.time() - start_time
    print("Fit time: {:.2f}".format(elapsed))

Starting on regressor linear	Fit time: 0.04
Starting on regressor bayesian	Fit time: 0.06
Starting on regressor forest-1	Fit time: 0.94
Starting on regressor forest-2	Fit time: 0.92
Starting on regressor forest-3	Fit time: 0.93
Starting on regressor forest-4	Fit time: 1.03
Starting on regressor forest-5	Fit time: 1.03
Starting on regressor forest-10	Fit time: 2.08
Starting on regressor forest-20	Fit time: 3.31
Starting on regressor forest-30	Fit time: 4.96
Starting on regressor forest-40	Fit time: 7.29
Starting on regressor forest-50	Fit time: 9.56
Starting on regressor forest-100	Fit time: 16.33
Starting on regressor forest-200	Fit time: 35.43
Starting on regressor forest-300	Fit time: 56.60


In [65]:
# Score every fitted regressor on the held-out split. For each model this
# records [mean absolute error, RMSE] in `data`, saves a histogram of the
# signed prediction errors, and finally writes all metrics to one CSV.
for name, regressor in regressors.items():
    print("Analyzing results for regressor {}...".format(name), end='\t')
    pred = regressor.predict(x_test)
    # Plain NumPy arithmetic: the previous torch version indexed the result
    # of a full reduction with `.numpy()[0]`, which fails on PyTorch >= 0.4
    # where such reductions are 0-dimensional.
    errs = pred - y_test
    avg_err = float(np.abs(errs).mean())
    # Mean over the test predictions themselves, so this does not rely on a
    # row count computed in another cell.
    rmse = float(np.sqrt(np.mean(errs ** 2)))
    data[name] = [avg_err, rmse]

    # Reuse the current figure; clearing it keeps histograms from stacking.
    plt.clf()
    plt.hist(errs, bins=100)
    plt.xlabel("Popularity Prediction Error")
    plt.ylabel("Count")
    plt.title(
        'Prediction Errors for Model \"{}\"'
        '\nAverage Error: {:4.2f}, Test RMSE: {:4.2f}'.format(
        name, avg_err, rmse
    ))
    plt.savefig("results/v2-split90-{}.png".format(name))
    print("Average Error: {:4.2f}, Test RMSE: {:4.2f}".format(
        avg_err, rmse
    ))

# newline="" lets csv.writer control line endings (otherwise blank rows
# appear on platforms that translate "\n"); the file is only written, so
# plain "w" is sufficient.
with open("results/v2-split90-data.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Model', 'Average Error', 'RMSE'])
    for model in data:
        writer.writerow([model] + data[model])

# Disabled: scatter plots of the random-forest metrics against ensemble size.
# plt.clf()
# plt.scatter(estimators, list(map(
#     lambda e: data['forest-{}'.format(e)][0], estimators
# )))
# plt.title('Random Forest Regressor Average Prediction Error vs. # Estimators')
# plt.xlabel('# Estimators')
# plt.ylabel('Average Prediction Error')
# plt.savefig('results/v2-split90-rfr-err.png')

# plt.clf()
# plt.scatter(estimators, list(map(
#     lambda e: data['forest-{}'.format(e)][1], estimators
# )))
# plt.title('Random Forest Regressor RMSE vs. # Estimators')
# plt.xlabel('# Estimators')
# plt.ylabel('RMSE')
# plt.savefig('results/v2-split90-rfr-rmse.png')

print("Done analyzing results!")

Analyzing results for regressor linear...	Average Error: 8.44, Test RMSE: 11.31
Analyzing results for regressor bayesian...	Average Error: 8.44, Test RMSE: 11.31
Analyzing results for regressor forest-1...	Average Error: 10.47, Test RMSE: 14.17
Done analyzing results!


In [70]:
print("Logistic Regression: Classification")
# Popularity cut-off that turns the regression target into a binary label:
# 1 when popularity >= threshold, 0 otherwise.
POPULARITY_THRESHOLD = 80
lr = LogisticRegression(n_jobs=8, tol=10 ** -10)
start_time = time.time()
# `np.int` was only an alias for the builtin int and was removed in
# NumPy 1.24; the builtin gives the same integer labels.
lr.fit(x_train, (y_train >= POPULARITY_THRESHOLD).astype(int))
end_time = time.time()
print("Fit time: {:.2f}".format(end_time - start_time))

pred = lr.predict(x_test)
# Number of test rows predicted as the positive class.
print(pred.sum())
# Among test rows whose true popularity is below the threshold, count how
# many were (correctly) predicted as class 0.
below_threshold = y_test < POPULARITY_THRESHOLD
total = int(np.count_nonzero(below_threshold))
correct = int(np.count_nonzero(below_threshold & (pred == 0)))
print(total, correct)

Fit time: 2.12


In [27]:
# Estimate the mutual information between every feature column and the
# target, then order the column indices from highest to lowest score.
mi = mutual_info_regression(x_data, y_data)
feature_ranking = np.flip(np.argsort(mi))
# Rebuild the column names of x_data: the header minus its two leading
# entries and minus the target column.
features = list(hdrs[2:])
features.remove('popularity')
# Final expression of the cell: feature names in ranked order.
np.array(features)[feature_ranking]

array(['artist_popularity', 'release_date', 'album_length', 'loudness',
       'instrumentalness', 'duration', 'num_segments', 'acousticness',
       'start_of_fade_out', 'num_tatums', 'energy', 'num_beats',
       'num_sections', 'track_number', 'num_bars', 'danceability',
       'valence', 'explicit', 'speechiness', 'time_signature_confidence',
       'end_of_fade_in', 'mode_confidence', 'tempo', 'time_signature',
       'liveness', 'tempo_confidence', 'mode', 'key', 'key_confidence'],
      dtype='<U25')