In [4]:
import csv
import numpy as np
import os
import time
import torch
from collections import OrderedDict
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from torch.autograd import Variable
from sklearn.feature_selection import mutual_info_regression

In [2]:
hdrs = None
data = None

# parse data from song features
with open("data/random_song_features.csv", "r") as f:
    reader = csv.reader(f)
    all_rows = list(reader)
    hdrs = all_rows[0]
    data = all_rows[1:]
    
data = np.array(data)
pop_index = hdrs.index('popularity')
all_id = data[:,:2]
data = data[:,2:]
y_data = data[:,pop_index - 2]
x_data = np.delete(data, pop_index - 2, 1)
y_data = y_data.astype(np.float)
x_data = x_data.astype(np.float)

In [3]:
N_ROWS = len(x_data)
N_TRAIN = int(0.9 * N_ROWS)
N_TEST = N_ROWS - N_TRAIN
x_train = x_data[:N_TRAIN]
y_train = y_data[:N_TRAIN]
x_test = x_data[N_TRAIN:]
y_test = y_data[N_TRAIN:]
print("Train: {}\nTest:  {}".format(N_TRAIN, N_ROWS - N_TRAIN))

Train: 39145
Test:  4350


In [77]:
# N_STEPS, INCREMENTS, LR = 20000, 10, 0.001
# loss_fn = torch.nn.MSELoss()
# net     = torch.nn.Sequential(OrderedDict([
#     ('lin1', torch.nn.Linear(N_FEATURES, 1)),
# #     ('relu', torch.nn.ReLU()),
# #     ('lin2', torch.nn.Linear(5, 1))
# ]))
# optim   = torch.optim.Adam(net.parameters(), lr=LR)

# losses = []
# last_time = time.time()
# for step in range(N_STEPS):
#     pred = net.forward(x_train)
#     loss = loss_fn(pred, y_train)
#     if (step + 1) % (N_STEPS // INCREMENTS) == 0:
#         curr_time = time.time()
#         elapsed = curr_time - last_time
#         last_time = curr_time
#         print("Done with batch {:02d}/{:02d}, elapsed time {:04.2f}; loss is {:.2f} ".format(
#             (step + 1) // (N_STEPS // INCREMENTS), INCREMENTS, elapsed, loss.data.numpy()[0]
#         ))
#     losses.append(loss.data.numpy()[0])
#     optim.zero_grad()
#     loss.backward()
#     optim.step()

In [88]:
if not os.path.exists('results'):
    os.makedirs('results')

regressors = {
    'linear': LinearRegression()
}
estimators = [1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 100, 200, 300]
for i in estimators:
    regressors["forest-{}".format(i)] = RandomForestRegressor(n_estimators=i)

data = {}
for name, regressor in regressors.items():
    print("Starting on regressor {}".format(name), end='\t\t')
    regressor.fit(x_train, y_train)
    pred = regressor.predict(x_test)
    errs = Variable(torch.Tensor(pred - y_test))
    loss = errs.pow(2).sum()
    avg_err = errs.abs().mean().data.numpy()[0]
    rmse = np.sqrt(loss.data.numpy()[0] / N_TEST)
    data[name] = [avg_err, rmse]
    
    plt.clf()
    plt.hist(errs.data.numpy(), bins=100)
    plt.xlabel("Popularity Prediction Error")
    plt.ylabel("Count")
    plt.title(
        'Prediction Errors for Model \"{}\"'
        '\nAverage Error: {:4.2f}, Test RMSE: {:4.2f}'.format(
        name, avg_err, rmse
    ))
    plt.savefig("results/v2-split90-{}.png".format(name))
    print("Average Error: {:4.2f}, Test RMSE: {:4.2f}".format(
        avg_err, rmse
    ))

with open("results/v2-split90-data.csv", "w+") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Model', 'Average Error', 'RMSE'])
    for model in data:
        writer.writerow([model] + data[model])

plt.clf()
plt.scatter(estimators, list(map(
    lambda e: data['forest-{}'.format(e)][0], estimators
)))
plt.title('Random Forest Regressor Average Prediction Error vs. # Estimators')
plt.xlabel('# Estimators')
plt.ylabel('Average Prediction Error')
plt.savefig('results/v2-split90-rfr-err.png')

plt.clf()
plt.scatter(estimators, list(map(
    lambda e: data['forest-{}'.format(e)][1], estimators
)))
plt.title('Random Forest Regressor RMSE vs. # Estimators')
plt.xlabel('# Estimators')
plt.ylabel('RMSE')
plt.savefig('results/v2-split90-rfr-rmse.png')

print("\nDone running models!")

Starting on regressor linear		Average Error: 8.26, Test RMSE: 11.41
Starting on regressor forest-1		Average Error: 11.33, Test RMSE: 15.29
Starting on regressor forest-2		Average Error: 9.76, Test RMSE: 13.17
Starting on regressor forest-3		Average Error: 9.25, Test RMSE: 12.44
Starting on regressor forest-4		Average Error: 8.83, Test RMSE: 11.85
Starting on regressor forest-5		Average Error: 8.69, Test RMSE: 11.78
Starting on regressor forest-10		Average Error: 8.32, Test RMSE: 11.31
Starting on regressor forest-20		Average Error: 8.08, Test RMSE: 11.03
Starting on regressor forest-30		Average Error: 7.97, Test RMSE: 10.91
Starting on regressor forest-40		Average Error: 7.95, Test RMSE: 10.87
Starting on regressor forest-50		Average Error: 7.94, Test RMSE: 10.88
Starting on regressor forest-100		Average Error: 7.87, Test RMSE: 10.80
Starting on regressor forest-200		Average Error: 7.83, Test RMSE: 10.75
Starting on regressor forest-300		Average Error: 7.83, Test RMSE: 10.76

Done runn

In [5]:
mi = mutual_info_regression(x_data, y_data)

In [27]:
feature_ranking = np.argsort(mi)[::-1]
features = hdrs[2:]
features.pop(features.index('popularity'))
np.array(features)[feature_ranking]

array(['artist_popularity', 'release_date', 'album_length', 'loudness',
       'instrumentalness', 'duration', 'num_segments', 'acousticness',
       'start_of_fade_out', 'num_tatums', 'energy', 'num_beats',
       'num_sections', 'track_number', 'num_bars', 'danceability',
       'valence', 'explicit', 'speechiness', 'time_signature_confidence',
       'end_of_fade_in', 'mode_confidence', 'tempo', 'time_signature',
       'liveness', 'tempo_confidence', 'mode', 'key', 'key_confidence'],
      dtype='<U25')