In [42]:
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# Imports for logistic Regression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

import os
from tqdm import tqdm_notebook

import warnings
warnings.filterwarnings('ignore')

In [2]:
%reload_ext google.cloud.bigquery

In [3]:
%%bigquery
SELECT
    COUNT(DISTINCT(unique_pid))
FROM
    playlist_songs.playlists;


Unnamed: 0,f0_
0,909100


In [4]:
%%bigquery
SELECT
    COUNT(unique_pid)
FROM
    playlist_songs.playlists;

Unnamed: 0,f0_
0,66346428


1. Get all tracks from random list of 1000 playlists. The limiting dimension is the playlist.
2. Train test split the data

In [77]:
%%bigquery playlist_tracks
SELECT
    ps.unique_pid, 
    ps.artist_name, 
    ps.track_name, 
    ps.album_name, 
    th.trackid,
    th.danceability,
    th.energy,
    th.key,
    th.loudness,
    th.mode,
    th.speechiness,
    th.acousticness,
    th.instrumentalness,
    th.liveness,
    th.valence,
    th.tempo,
    th.duration_ms,
    th.time_signature
FROM
    (SELECT 
        unique_pid
    FROM
        (SELECT
            unique_pid
        FROM
            playlist_songs.playlists
        GROUP BY
            unique_pid)
    WHERE
    RAND() < 500/909100 ) as ps_id
    INNER JOIN
    playlist_songs.playlists as ps
    ON ps_id.unique_pid = ps.unique_pid
    INNER JOIN
    playlist_songs.tracks_headers as th
    ON th.trackid = ps.track_uri;

In [78]:
playlist_tracks.head(1)

Unnamed: 0,unique_pid,artist_name,track_name,album_name,trackid,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,92959,Survivor,Eye of the Tiger,Eye Of The Tiger,spotify:track:2HHtWyy5CgaQbC7XSoOb0e,0.815,0.438,0,-14.522,0,0.0346,0.216,0.000466,0.0787,0.552,108.965,243773,4


In [79]:
playlist_analysis = playlist_tracks[['unique_pid', 'trackid','danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
            'duration_ms', 'time_signature']]

playlist_song_info = playlist_tracks[['trackid', 'artist_name', 'track_name', 'album_name']]

In [80]:
# create column of ones for data spread procedure later
playlist_analysis.loc[:, 'one'] = 1
# data spread procedure - group by track and features one hot encode playlist assignments
playlist_analysis = playlist_analysis.drop_duplicates(subset=['unique_pid', 'trackid'])

#create train and test (stratified by playlist)
train, test = train_test_split(playlist_analysis, train_size=0.8, stratify=playlist_analysis.unique_pid)

In [81]:
playlist_train_one_hot = train.pivot(index="trackid", columns='unique_pid', values="one")
playlist_train_one_hot = playlist_train_one_hot.fillna(value=0)

In [82]:
playlist_test_one_hot = test.pivot(index="trackid", columns='unique_pid', values="one")
playlist_test_one_hot = playlist_test_one_hot.fillna(value=0)

In [83]:
# get histogram of songs associated with # playlists
playlist_train_one_hot.loc[:, "sum"] = playlist_train_one_hot.sum(axis = 1)

In [84]:
playlist_train_one_hot.head()

unique_pid,222,373,4127,4181,5220,6212,7909,10953,11499,12857,...,984648,986338,988530,988958,990368,994789,995522,995870,999112,sum
trackid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
spotify:track:000xQL6tZNLJzIrtIgxqSl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
spotify:track:004z7UbwGrprGG1JTmNgCt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
spotify:track:005GaX6hvgeTFnR9FvejTE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
spotify:track:007Cm8jbhOP7ofnHEwSr6s,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
spotify:track:00AvOVhsUi1gOCnHxTFw7i,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [85]:
playlist_distribution = playlist_train_one_hot.groupby("sum")["sum"].count()

In [86]:
playlist_distribution

sum
1.0     14964
2.0      2163
3.0       810
4.0       448
5.0       262
6.0       176
7.0       123
8.0        82
9.0        59
10.0       47
11.0       28
12.0       21
13.0       16
14.0        7
15.0        8
16.0       11
17.0        5
18.0        4
19.0        1
25.0        1
26.0        1
Name: sum, dtype: int64

In [87]:
# get all playlist numbers
playlist_numbers = [i for i in playlist_train_one_hot.columns if i not in ['trackid', 'sum']]
# merge features with one-hot list
playlist_features = playlist_analysis[['trackid','danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
            'duration_ms', 'time_signature']]
playlist_train_one_hot = playlist_train_one_hot.drop("sum", axis = 1)

In [88]:
playlist_train_one_hot_w_features = playlist_train_one_hot.merge(playlist_features, on="trackid", how="inner")
playlist_test_one_hot_w_features = playlist_test_one_hot.merge(playlist_features, on="trackid", how="inner")

In [89]:
y_train = playlist_train_one_hot_w_features[playlist_numbers]
y_test = playlist_test_one_hot_w_features[playlist_numbers]
X_train = playlist_train_one_hot_w_features[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
            'duration_ms', 'time_signature']]
X_test = playlist_test_one_hot_w_features[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
            'duration_ms', 'time_signature']]

print(X_train.shape)
print(y_train.shape)
X_test = X_test.fillna(-1) #if na then fill with -1
X_train = X_train.fillna(-1) #if na then fill with -1

(33527, 13)
(33527, 506)


In [90]:
# convert to np arrays
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [91]:
#create mask for smaller X_train data
mask = np.random.choice([False, True], len(X_train), p=[0.75, 0.25])
X_train_compressed = X_train[mask][:]
X_train_compressed.shape
y_train_compressed = y_train[mask][:]
y_train_compressed.shape

(8401, 506)

In [23]:
# Ultimately, while we did not use normalization on predictors, because this 
# significantly slowed down the logistic regression to a process that would have 
# taken 20 hours to run; this was unfeasible.

def normalize_predictors(x_df, y_df, scaler):
    """
    Standardizes the predictors to vary between 0 and 1 to account for differences
    in scale and variability.
    
    The scaler passed in should be from the standardized data set only.
    
    Returns new x dataframe, along with y, whose index has been re-set to match
    the x df since it was run through a standardizer. 
    """
    cols_to_scale = x_df.columns
    scaled_df = scaler.fit_transform(x_df)
    
    normalized_x = pd.DataFrame(scaled_df, columns=cols_to_scale)
    normalized_x.reset_index(drop=True, inplace=True)

    # Also reset the Y-train index so that it matches the indices produced by normalization
    y_df.reset_index(drop=True, inplace=True)

    return (normalized_x, y_df)

In [97]:
# Normalize the data and get it into a dataframe formatting so I can see what I'm working with
X_train_df = pd.DataFrame(X_train, columns=['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
            'duration_ms', 'time_signature'])

X_test_df = pd.DataFrame(X_test, columns=['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
            'duration_ms', 'time_signature'])

y_train_df = pd.DataFrame(y_train, columns=playlists)
y_test_df = pd.DataFrame(y_test, columns=playlists)
print(X_test_df.shape)

scaler = MinMaxScaler().fit(X_train_df)
X_train_df, y_train_df = normalize_predictors(X_train_df, y_train_df, scaler)

display(X_train_df.head())

# Now, we must also normalize our test values, using the scalar from the X_train
X_test_df, y_test_df = normalize_predictors(X_test_df, y_test_df, scaler)
display(X_test_df.head())

(6634, 13)


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0.563395,0.636637,0.545455,0.861971,1.0,0.029393,0.007159,0.002057,0.217435,0.153,0.56813,0.124257,0.8
1,0.702454,0.753754,0.0,0.888983,1.0,0.02887,0.01506,0.001013,0.264529,0.423,0.545531,0.091036,0.8
2,0.791411,0.756757,0.090909,0.896694,1.0,0.04069,0.04247,0.0,0.305611,0.931,0.654652,0.103832,0.8
3,0.612474,0.777778,0.363636,0.900981,1.0,0.032008,0.001315,0.000217,0.124248,0.563,0.536396,0.074313,0.8
4,0.537832,0.893894,0.909091,0.878501,0.0,0.437238,0.170683,0.0,0.186373,0.44,0.793385,0.09646,0.8


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0.499289,0.721577,0.636364,0.791849,1.0,0.040425,0.32028,0.0,0.203759,0.592489,0.733075,0.133823,0.8
1,0.435881,0.867322,0.636364,0.769177,1.0,0.012224,0.001294,1.4e-05,0.104616,0.236562,0.332856,0.112438,0.8
2,0.62392,0.337615,0.636364,0.627388,1.0,0.035385,0.76004,0.0,0.061138,0.473847,0.39765,0.153836,0.8
3,0.595496,0.615033,0.181818,0.807508,1.0,0.006541,0.227911,4e-06,0.145926,0.38306,0.489391,0.115451,0.8
4,0.516781,0.697454,0.272727,0.780455,0.0,0.028737,0.032629,3.6e-05,0.107715,0.303621,0.565716,0.132965,0.8


In [105]:
from IPython.display import HTML
HTML('''<script>
code_show_err=false; 
function code_toggle_err() {
 if (code_show_err){
 $('div.output_stderr').hide();
 } else {
 $('div.output_stderr').show();
 }
 code_show_err = !code_show_err
} 
$( document ).ready(code_toggle_err);
</script>
To toggle on/off output_stderr, click <a href="javascript:code_toggle_err()">here</a>.''')

In [40]:
# We'll store the regression model for each playlist, along with its accuracy score as tuple
playlist_regression_models = []

with tqdm_notebook(total=len(playlists)) as pbar:
    for playlist_index in playlists:
        logistic_reg = LogisticRegressionCV(max_iter=1000, penalty="l1", solver="liblinear", cv=3)
        logistic_reg.fit(X_train, y_train[:, playlist_index])
        test_predictions = logistic_reg.predict(X_test)

        test_accuracy = accuracy_score(y_test, test_predictions)
        test_f1 = f1_score(y_test, test_predictions)
    
        print(f"Training model for playlist: {playlist_index} with test accuracy score of: {test_accuracy} and an F1 of {test_f1}")
        model = (regression_l1, test_accuracy, f1)
        playlist_regression_models.append(model)
        pbar.update(1)

HBox(children=(IntProgress(value=0, max=538), HTML(value='')))




NameError: name 'fit' is not defined