In [5]:
import csv
import numpy as np
import os
import time
import torch
from collections import OrderedDict
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression, BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from torch.autograd import Variable
from sklearn.feature_selection import mutual_info_regression

In [6]:
# Load the song-feature table: the first CSV row is the header, the rest are
# samples. The first two columns are kept aside as identifiers (`all_id`);
# 'popularity' is the regression target and every remaining column is a feature.
with open("data/random_song_features.csv", "r", newline="") as f:
    all_rows = list(csv.reader(f))

hdrs = all_rows[0]
data = np.array(all_rows[1:])

# NOTE(review): the shuffle is unseeded, so the train/test split built from
# this order differs on every run.
np.random.shuffle(data)

pop_index = hdrs.index('popularity')
if pop_index < 2:
    # A negative offset below would silently select the wrong column.
    raise ValueError("'popularity' must not be one of the two leading id columns")

all_id = data[:, :2]
data = data[:, 2:]

# Column offsets shift by two once the id columns are removed.
# `np.float` was removed in NumPy 1.24; it was an alias of the builtin `float`.
y_data = data[:, pop_index - 2].astype(float)
x_data = np.delete(data, pop_index - 2, 1).astype(float)

In [7]:
# Hold out the last 10% of the (already shuffled) rows as the test set.
N_ROWS = x_data.shape[0]
N_TRAIN = int(0.9 * N_ROWS)
N_TEST = N_ROWS - N_TRAIN

x_train, x_test = x_data[:N_TRAIN], x_data[N_TRAIN:]
y_train, y_test = y_data[:N_TRAIN], y_data[N_TRAIN:]

print("Train: {}\nTest:  {}".format(N_TRAIN, N_TEST))

Train: 39060
Test:  4340


In [19]:
from sklearn.linear_model import \
    LinearRegression, LogisticRegression, BayesianRidge, ElasticNet, Lasso
from sklearn.ensemble import \
    RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

# Output directory for every figure/CSV written by the analysis cells.
# `exist_ok=True` avoids the check-then-create race of a separate exists() test.
os.makedirs('results', exist_ok=True)

# Worker count handed to the ensemble regressors that accept `n_jobs`.
N_JOBS = 8

# name -> fitted estimator instance; filled in below.
regressors = {}

# Models instantiated once with their default hyper-parameters.
random_regressors = {
    'linear':   LinearRegression,
    'bayesian': BayesianRidge,
    'mlp':      MLPRegressor,
    'linSVR':   LinearSVR,
    'elastic':  ElasticNet,
    'lasso':    Lasso
}
# key -> (display name, class) for ensembles built with `n_jobs`.
fast_regressors = {
    'forest': ('Random Forest', RandomForestRegressor),
    'bagger': ('Bagging', BaggingRegressor),
    'extrat': ('Extra Trees', ExtraTreesRegressor)
}
# key -> (display name, class) for ensembles built without `n_jobs`.
slow_regressors = {
    'gboost': ('Gradient Boosting', GradientBoostingRegressor),
    'aboost': ('AdaBoost Boosting', AdaBoostRegressor)
}
# Values of `n_estimators` swept for each ensemble family.
fast_estimators = [1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
slow_estimators = [1, 2, 3, 4, 5, 10, 20]

for key, regressor_cls in random_regressors.items():
    regressors[key] = regressor_cls()
# Ensemble variants are registered as '<key>-<n_estimators>'.
for key, (_, regressor_cls) in fast_regressors.items():
    for i in fast_estimators:
        regressors['{}-{}'.format(key, i)] = regressor_cls(n_jobs=N_JOBS, n_estimators=i)
for key, (_, regressor_cls) in slow_regressors.items():
    for i in slow_estimators:
        regressors['{}-{}'.format(key, i)] = regressor_cls(n_estimators=i)

# NOTE: `data` is rebound here from the feature matrix to an (initially empty)
# results dict that the error-analysis cell fills in.
data = {}
# name -> wall-clock fit time in seconds, kept so timings need not be copied
# by hand from the printed log.
fit_times = {}
for name, regressor in regressors.items():
    print("Starting on regressor {}     ".format(name), end='\t')
    start_time = time.time()
    regressor.fit(x_train, y_train)
    fit_times[name] = time.time() - start_time
    print("Fit time: {:.2f}".format(fit_times[name]))

Starting on regressor linear	Fit time: 0.03
Starting on regressor bayesian	Fit time: 0.05
Starting on regressor mlp	Fit time: 2.02
Starting on regressor linSVR	Fit time: 9.24
Starting on regressor elastic	



Fit time: 1.67
Starting on regressor lasso	Fit time: 1.65
Starting on regressor forest-1	Fit time: 0.84
Starting on regressor forest-2	Fit time: 0.93
Starting on regressor forest-3	Fit time: 0.93
Starting on regressor forest-4	Fit time: 0.93
Starting on regressor forest-5	Fit time: 1.04
Starting on regressor forest-10	Fit time: 2.16
Starting on regressor forest-20	Fit time: 3.32
Starting on regressor forest-30	Fit time: 4.74
Starting on regressor forest-40	Fit time: 6.69
Starting on regressor forest-50	Fit time: 8.04
Starting on regressor forest-100	Fit time: 15.24
Starting on regressor bagger-1	Fit time: 0.80
Starting on regressor bagger-2	Fit time: 1.15
Starting on regressor bagger-3	Fit time: 1.15
Starting on regressor bagger-4	Fit time: 1.25
Starting on regressor bagger-5	Fit time: 1.37
Starting on regressor bagger-10	Fit time: 2.50
Starting on regressor bagger-20	Fit time: 4.09
Starting on regressor bagger-30	Fit time: 6.05
Starting on regressor bagger-40	Fit time: 8.02
Starting o

In [92]:
# Evaluate every fitted regressor on the held-out set: mean absolute error
# ("L1") and root-mean-squared error ("L2"), plus one error histogram per model.
for name, regressor in regressors.items():
    print("Analyzing results for regressor {}...  ".format(name), end='\t')
    pred = regressor.predict(x_test)
    # Plain NumPy arithmetic: the previous torch `Variable` round-trip indexed
    # a 0-dim result with `[0]`, which raises IndexError on PyTorch >= 0.4.
    errs = pred - y_test
    # Builtin floats so the csv writer emits plain numbers.
    avg_err = float(np.abs(errs).mean())
    rmse = float(np.sqrt(np.square(errs).mean()))
    data[name] = [avg_err, rmse]

    plt.clf()
    plt.hist(errs, bins=100)
    plt.xlabel("Popularity Prediction Error")
    plt.ylabel("Count")
    plt.title(
        'Prediction Errors for Model \"{}\"'
        '\nAverage Error: {:4.2f}, Test RMSE: {:4.2f}'.format(
        name, avg_err, rmse
    ))
    plt.savefig("results/v2-split90-{}.png".format(name))
    print("Average Error: {:4.2f}, Test RMSE: {:4.2f}".format(
        avg_err, rmse
    ))

# newline='' is what the csv module requires of file objects; without it,
# platforms that translate '\n' end up with blank rows between records.
with open("results/v2-split90-data.csv", "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Model', 'Average Error', 'RMSE'])
    for model in data:
        writer.writerow([model] + data[model])

# One figure per ensemble family: both error norms against n_estimators.
# All L1 curves are drawn before the L2 curves, which fixes the legend order.
for models, estimators, kind, path in (
    (fast_regressors, fast_estimators, 'Parallelizable', 'results/norm_fast.png'),
    (slow_regressors, slow_estimators, 'Non-Parallelizable', 'results/norm_slow.png'),
):
    plt.clf()
    for column, suffix in ((0, "-L1"), (1, "-L2")):
        for key in models:
            plt.plot(
                estimators,
                [data['{}-{}'.format(key, e)][column] for e in estimators],
                label=models[key][0] + suffix,
            )
    plt.xlabel('# Estimators')
    plt.ylabel('Norm')
    plt.title('L1/L2 Norms vs. # Estimators for {} Regressors'.format(kind))
    plt.legend()
    plt.savefig(path, bbox_inches='tight', pad_inches=0.2)

print("Done analyzing results!")

Done analyzing results!


In [86]:
# Pick one representative per model family: every default-parameter model plus
# the largest-ensemble variant of each swept family. Names are unique, so a
# list built in dict order (instead of a set) keeps the chart/CSV order
# identical from run to run even though the sort key below has many ties.
final_regressors = list(random_regressors)
final_regressors += ['{}-{}'.format(model, fast_estimators[-1]) for model in fast_regressors]
final_regressors += ['{}-{}'.format(model, slow_estimators[-1]) for model in slow_regressors]
# Names ending in '0' are ordered by their last three characters, the others
# by their first three.
final_regressors.sort(key=lambda model: model[-3:] if model[-1] == '0' else model[:3])

# A song counts as "popular" when its popularity reaches the threshold.
threshold = 50
is_popular = y_test >= threshold
popular = int(is_popular.sum())
unpopular = len(y_test) - popular

# model -> (correctly labelled unpopular, correctly labelled popular)
class_results = {}
for model in final_regressors:
    print("Analyzing model {} as classifier...".format(model))
    pred = regressors[model].predict(x_test)
    p_correct = int(np.sum(is_popular & (pred >= threshold)))
    u_correct = int(np.sum((y_test < threshold) & (pred < threshold)))
    class_results[model] = (u_correct, p_correct)
    print("\tGot {}/{} unpopular and {}/{} popular songs correct!".format(
        u_correct, unpopular, p_correct, popular
    ))

# model -> (unpopular, popular, overall) fraction correct; computed once and
# shared by the bar chart and the CSV.
accuracy = {
    model: (
        u_correct / unpopular,
        p_correct / popular,
        (u_correct + p_correct) / (unpopular + popular),
    )
    for model, (u_correct, p_correct) in class_results.items()
}

plt.clf()
plt.figure(figsize=(6,4))
index = np.arange(len(final_regressors))
bar_width = 0.25
opacity = 0.8

# Three side-by-side bars per model, one per accuracy column.
bar_specs = (
    ('b', '% Unpopular Correct'),
    ('g', '% Popular Correct'),
    ('r', '% Overall Correct'),
)
for column, (color, label) in enumerate(bar_specs):
    plt.bar(
        index + bar_width * column,
        [accuracy[model][column] for model in final_regressors],
        bar_width, alpha=opacity, color=color, label=label
    )

plt.xlabel('Model')
plt.ylabel('% Correct')
plt.gca().set_ylim([0, 1])
plt.title('% Unpopular/Popular Labels Correct vs. Model')
plt.xticks(index + bar_width, tuple(final_regressors), rotation=90)
plt.legend(loc=4)

plt.tight_layout()
plt.savefig('results/correct.png')

# newline='' is what the csv module requires of file objects; without it,
# platforms that translate '\n' end up with blank rows between records.
with open("results/correct.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Model', 'Unpopular % Correct', 'Popular % Correct', 'Overall % Correct'])
    for model in final_regressors:
        writer.writerow([model] + ["{:.3f}".format(value) for value in accuracy[model]])

Analyzing model aboost-20 as classifier...
	Got 2405/2780 unpopular and 1208/1560 popular songs correct!
Analyzing model gboost-20 as classifier...
	Got 2580/2780 unpopular and 989/1560 popular songs correct!
Analyzing model extrat-100 as classifier...
	Got 2515/2780 unpopular and 1109/1560 popular songs correct!
Analyzing model bagger-100 as classifier...
	Got 2514/2780 unpopular and 1110/1560 popular songs correct!
Analyzing model forest-100 as classifier...
	Got 2506/2780 unpopular and 1109/1560 popular songs correct!
Analyzing model bayesian as classifier...
	Got 2550/2780 unpopular and 1032/1560 popular songs correct!
Analyzing model elastic as classifier...
	Got 2558/2780 unpopular and 1017/1560 popular songs correct!
Analyzing model lasso as classifier...
	Got 2559/2780 unpopular and 1019/1560 popular songs correct!
Analyzing model linear as classifier...
	Got 2548/2780 unpopular and 1030/1560 popular songs correct!
Analyzing model linSVR as classifier...
	Got 1187/2780 unpopula

In [70]:
# Binary classifier: label 1 when popularity reaches LOGISTIC_THRESHOLD.
LOGISTIC_THRESHOLD = 80

print("Logistic Regression: Classification")
lr = LogisticRegression(n_jobs=8, tol=10 ** -10)
start_time = time.time()
# `np.int` was removed in NumPy 1.24; it was an alias of the builtin `int`.
lr.fit(x_train, (y_train >= LOGISTIC_THRESHOLD).astype(int))
end_time = time.time()
print("Fit time: {:.2f}".format(end_time - start_time))

pred = lr.predict(x_test)
# Number of test songs predicted as class 1.
print(sum(pred))

# Among test songs below the threshold: how many there are, and how many of
# them were predicted as class 0.
is_negative = y_test < LOGISTIC_THRESHOLD
total = int(is_negative.sum())
correct = int((is_negative & (pred == 0)).sum())
print(total, correct)

Fit time: 2.12


In [27]:
# Rank the feature columns by their estimated mutual information with the
# popularity target, highest first.
mi = mutual_info_regression(x_data, y_data)
feature_ranking = np.flip(np.argsort(mi))

# Feature names line up with x_data's columns: the header minus the two
# leading id columns and minus the target column.
features = hdrs[2:]
features.remove('popularity')
np.asarray(features)[feature_ranking]

array(['artist_popularity', 'release_date', 'album_length', 'loudness',
       'instrumentalness', 'duration', 'num_segments', 'acousticness',
       'start_of_fade_out', 'num_tatums', 'energy', 'num_beats',
       'num_sections', 'track_number', 'num_bars', 'danceability',
       'valence', 'explicit', 'speechiness', 'time_signature_confidence',
       'end_of_fade_in', 'mode_confidence', 'tempo', 'time_signature',
       'liveness', 'tempo_confidence', 'mode', 'key', 'key_confidence'],
      dtype='<U25')

In [89]:
def table(csv):
    """Print the CSV file at path ``csv`` as a LaTeX ``tabular`` environment.

    The column count is taken from the first non-blank row; every column is
    left-aligned and separated by vertical rules, and each row is followed by
    ``\\hline``. Fields are parsed with the ``csv`` module, so quoted fields
    that contain commas stay in a single cell. Blank lines are skipped, and a
    file without any rows prints nothing (rather than a lone
    ``\\end{tabular}``).

    Cell text is emitted verbatim: LaTeX special characters such as ``%`` or
    ``&`` are not escaped.

    Args:
        csv: Path of the CSV file to render.
    """
    # The parameter name shadows the ``csv`` module inside this function, so
    # the module is imported locally under a different name.
    import csv as csv_module

    with open(csv) as f:
        # Lines are stripped first so surrounding whitespace never ends up in
        # the first or last cell.
        rows = [row for row in csv_module.reader(line.strip() for line in f) if row]

    if not rows:
        return

    n = len(rows[0])
    # n columns -> "|l|l|...|": n 'l' specifiers fenced by n + 1 bars.
    print("\\begin{tabular}{" + "l".join("|" * (n + 1)) + "}\\hline")
    for tokens in rows:
        print(" & ".join(tokens) + " \\\\\\hline")
    print("\\end{tabular}")
    
# Render the training-time CSV as a LaTeX table for copy/paste.
# NOTE(review): this reads from "results_full/", while the cells in this
# notebook write to "results/"; presumably the file was prepared separately — confirm.
table("results_full/train_times_other.csv")

\begin{tabular}{|l|l|}\hline
Model & Training Time (s) \\\hline
mlp & 2.02 \\\hline
linSVR & 9.24 \\\hline
elastic & 1.67 \\\hline
lasso & 1.65 \\\hline
\end{tabular}


In [70]:
# Fit-time log, one "Starting on regressor <name> Fit time: <seconds>" line per
# model. The text appears to be pasted from the printed output of the training
# cell (with tabs turned into spaces); it is parsed into a name -> seconds
# mapping by the cell that builds `times`.
timings = '''
Starting on regressor linear Fit time: 0.03
Starting on regressor bayesian Fit time: 0.05
Starting on regressor mlp Fit time: 2.02
Starting on regressor linSVR Fit time: 9.24
Starting on regressor elastic Fit time: 1.67
Starting on regressor lasso Fit time: 1.65
Starting on regressor forest-1 Fit time: 0.84
Starting on regressor forest-2 Fit time: 0.93
Starting on regressor forest-3 Fit time: 0.93
Starting on regressor forest-4 Fit time: 0.93
Starting on regressor forest-5 Fit time: 1.04
Starting on regressor forest-10 Fit time: 2.16
Starting on regressor forest-20 Fit time: 3.32
Starting on regressor forest-30 Fit time: 4.74
Starting on regressor forest-40 Fit time: 6.69
Starting on regressor forest-50 Fit time: 8.04
Starting on regressor forest-100 Fit time: 15.24
Starting on regressor bagger-1 Fit time: 0.80
Starting on regressor bagger-2 Fit time: 1.15
Starting on regressor bagger-3 Fit time: 1.15
Starting on regressor bagger-4 Fit time: 1.25
Starting on regressor bagger-5 Fit time: 1.37
Starting on regressor bagger-10 Fit time: 2.50
Starting on regressor bagger-20 Fit time: 4.09
Starting on regressor bagger-30 Fit time: 6.05
Starting on regressor bagger-40 Fit time: 8.02
Starting on regressor bagger-50 Fit time: 11.62
Starting on regressor bagger-100 Fit time: 19.53
Starting on regressor extrat-1 Fit time: 0.32
Starting on regressor extrat-2 Fit time: 0.32
Starting on regressor extrat-3 Fit time: 0.32
Starting on regressor extrat-4 Fit time: 0.32
Starting on regressor extrat-5 Fit time: 0.42
Starting on regressor extrat-10 Fit time: 0.73
Starting on regressor extrat-20 Fit time: 1.15
Starting on regressor extrat-30 Fit time: 1.76
Starting on regressor extrat-40 Fit time: 2.27
Starting on regressor extrat-50 Fit time: 2.90
Starting on regressor extrat-100 Fit time: 5.48
Starting on regressor gboost-1  Fit time: 0.18
Starting on regressor gboost-2  Fit time: 0.28
Starting on regressor gboost-3  Fit time: 0.38
Starting on regressor gboost-4  Fit time: 0.45
Starting on regressor gboost-5  Fit time: 0.54
Starting on regressor gboost-10 Fit time: 0.96
Starting on regressor gboost-20 Fit time: 1.78
Starting on regressor aboost-1  Fit time: 1.18
Starting on regressor aboost-2  Fit time: 2.14
Starting on regressor aboost-3  Fit time: 3.15
Starting on regressor aboost-4  Fit time: 4.26
Starting on regressor aboost-5  Fit time: 5.30
Starting on regressor aboost-10 Fit time: 10.28
Starting on regressor aboost-20 Fit time: 19.77
'''

In [82]:
# Parse the pasted fit log into {model name: seconds}. Each non-blank line
# reads "Starting on regressor <name> Fit time: <seconds>"; splitting on any
# run of whitespace keeps the name at index 3 even on the lines that are
# padded with two spaces.
times = {}
for line in timings.splitlines():
    fields = line.split()
    if not fields:
        continue
    times[fields[3]] = float(fields[-1])

# newline='' is what the csv module requires of file objects; without it,
# platforms that translate '\n' end up with blank rows between records.
with open("results/train_times.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Model', 'Training Time (s)'])
    for regressor in regressors:
        writer.writerow([regressor, times[regressor]])

# One figure per ensemble family: training time against n_estimators.
for models, estimators, kind, path in (
    (fast_regressors, fast_estimators, 'Parallelizable', 'results/train_time_fast.png'),
    (slow_regressors, slow_estimators, 'Non-Parallelizable', 'results/train_time_slow.png'),
):
    plt.clf()
    for key in models:
        plt.plot(
            estimators,
            [times['{}-{}'.format(key, e)] for e in estimators],
            label=models[key][0],
        )
    plt.xlabel('# Estimators')
    plt.ylabel('Train Time (s)')
    plt.title('Train Time vs. # Estimators for {} Regressors'.format(kind))
    plt.legend()
    plt.savefig(path)

In [83]:
# Class balance of the test set: counts of songs below / at-or-above the
# popularity threshold, as computed in the classifier-analysis cell.
print(unpopular, popular)

2780 1560
