In [8]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score

from matplotlib import pyplot as plt
import seaborn as sns

import itertools
import random

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util

%matplotlib inline

## Pull Spotify Data

In [2]:
CLIENT_ID="810e2eec841546269c49f338f1be189a"
CLIENT_SECRET="764290d0ab0f4f4484b0d4c7b701bdee"

token = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)

cache_token = token.get_access_token()
sp = spotipy.Spotify(cache_token)

In [3]:
sample_playlist = sp.user_playlist("joycex99", "0yWeSBDVEwGPLMH7EXI3tX")

In [4]:
#Get (song id, song name, popularity) out of playlists
def extract_songs(playlist):
    tracks = playlist["tracks"]
    info = [(item["track"]["id"], item["track"]["name"], item["track"]["popularity"]) for item in tracks["items"]]
    while tracks["next"]:
        tracks = sp.next(tracks)
        info.extend([(item["track"]["id"], item["track"]["name"], item["track"]["popularity"]) for item in tracks["items"]])
    return info

In [5]:
# Get (song_id, song_name, popularity) for num_tracks songs in given year
# Goes through songs sequentially to get tracks
def tracks_from_year(year, num_tracks):
    tracks = sp.search(q='year:' + str(year), type='track', offset=0, limit=50)
    print("Number of tracks in {}: {}".format(year, tracks['tracks']['total']))
    info = [(item["id"], item["name"], item["popularity"]) for item in tracks["tracks"]["items"]]
    while tracks["tracks"]["next"] and len(info) < num_tracks:
        tracks = sp.next(tracks["tracks"])
        info.extend([(item["id"], item["name"], item["popularity"]) for item in tracks["tracks"]["items"]])
        if len(info) % 1000 == 0:
            print("Retrieved {} songs".format(len(info)))
    return info[:num_tracks]

In [6]:
# Get (song_id, song_name, popularity) for num_tracks songs in a given year
# Selects those songs randomly from the given year's top 10,000
def random_tracks_from_year(year, num_tracks):
    tracks = sp.search(q='year:' + str(year), type='track')
    print("Number of tracks in {}: {}".format(year, tracks['tracks']['total']))
    max_track = min(tracks['tracks']['total'], 9999) # Spotify limits offset to 9999
    infos = []
    for _ in range(num_tracks):
        track_num = random.randint(1, max_track) # May repeat songs
        result = sp.search(q='year:' + str(year), type='track', offset=track_num, limit=1)
        track = result['tracks']['items'][0]
        info = (track['id'], track['name'], track['popularity'])
        infos.append(info)
    return infos

In [7]:
songs = tracks_from_year(2017, 10000)
print("Songs we use: {}".format(len(songs)))
print(songs[:10])
print(songs[-10:])

Number of tracks in 2017: 7294622
Retrieved 1000 songs
Retrieved 2000 songs
Retrieved 3000 songs
Retrieved 4000 songs
Retrieved 5000 songs
Retrieved 6000 songs
Retrieved 7000 songs
Retrieved 8000 songs
Retrieved 9000 songs
Retrieved 10000 songs
Songs we use: 10000
[('7m9OqQk4RVRkw9JJdeAw96', 'Jocelyn Flores', 91), ('7AQim7LbvFVZJE3O8TYgf2', 'Fuck Love (feat. Trippie Redd)', 89), ('3GVkPk8mqxz0itaAriG1L7', 'Everybody Dies In Their Nightmares', 87), ('7sO5G9EABYOXQKNPNiE9NR', 'Ric Flair Drip (& Metro Boomin)', 85), ('7GX5flRQZVHRAGd6B4TmDO', 'XO TOUR Llif3', 87), ('7oJrVhiTPZGBXWPwhxjy9S', 'Lust', 84), ('5Z3GHaZ6ec9bsiI5BenrbY', 'Young Dumb & Broke', 86), ('41zXlQxzTi6cGAjpOXyLYH', 'idontwannabeyouanymore', 85), ('40oKW22ZNNkEdZLJTScaQI', 'Roll In Peace (feat. XXXTENTACION)', 83), ('7KXjTSCq5nL1LoYtL7XAwS', 'HUMBLE.', 85)]
[('5KLQlI6pfbyrkmj9KmAGnI', 'Little Drummer Boy - Live', 43), ('5XFRB4DDlyZf2h8ralYfqh', 'Evil - From "Descendants: Wicked World"', 44), ('1JDeGdwrfrtMAXUqD8ggVv', 'Mi

## Feature Extraction

In [9]:
sp.audio_features(songs[0][0])

[{'danceability': 0.872,
  'energy': 0.391,
  'key': 0,
  'loudness': -9.144,
  'mode': 0,
  'speechiness': 0.242,
  'acousticness': 0.469,
  'instrumentalness': 4.13e-06,
  'liveness': 0.297,
  'valence': 0.437,
  'tempo': 134.021,
  'type': 'audio_features',
  'id': '7m9OqQk4RVRkw9JJdeAw96',
  'uri': 'spotify:track:7m9OqQk4RVRkw9JJdeAw96',
  'track_href': 'https://api.spotify.com/v1/tracks/7m9OqQk4RVRkw9JJdeAw96',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/7m9OqQk4RVRkw9JJdeAw96',
  'duration_ms': 119133,
  'time_signature': 4}]

In [10]:
''' Returns a list of each track's audio features'''
def features(songs):
    all_feats = []
    ids = [song[0] for song in songs]
    # Spotify's audio_features can only return 50 songs at once
    for i in range(0, len(songs), 50):
        track_feats = sp.audio_features(ids[i:i+50])
        for j in range(len(track_feats)):
            track_feats[j]['name'] = songs[i+j][1]
            track_feats[j]['popularity'] = songs[i+j][2]
        all_feats.extend(track_feats)
    return all_feats

In [11]:
feats = features(songs)
feats[100]

{'danceability': 0.638,
 'energy': 0.367,
 'key': 4,
 'loudness': -7.906,
 'mode': 1,
 'speechiness': 0.0266,
 'acousticness': 0.84,
 'instrumentalness': 0,
 'liveness': 0.0999,
 'valence': 0.219,
 'tempo': 71.957,
 'type': 'audio_features',
 'id': '2VIPU72L6tIR4w8J8ap6Kp',
 'uri': 'spotify:track:2VIPU72L6tIR4w8J8ap6Kp',
 'track_href': 'https://api.spotify.com/v1/tracks/2VIPU72L6tIR4w8J8ap6Kp',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/2VIPU72L6tIR4w8J8ap6Kp',
 'duration_ms': 217120,
 'time_signature': 4,
 'name': 'Mercy',
 'popularity': 75}

In [12]:
data = pd.DataFrame(feats)
data = data.sample(frac=1)
data.head()

Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,name,popularity,speechiness,tempo,time_signature,track_href,type,uri,valence
7375,0.804,https://api.spotify.com/v1/audio-analysis/3uxy...,0.537,200663,0.6,3uxyOfvxzjhzMKAOU5S6TZ,0.0,11,0.121,-7.528,0,Slow Down,45,0.411,84.349,4,https://api.spotify.com/v1/tracks/3uxyOfvxzjhz...,audio_features,spotify:track:3uxyOfvxzjhzMKAOU5S6TZ,0.258
4541,0.00436,https://api.spotify.com/v1/audio-analysis/1C1O...,0.64,185947,0.497,1C1OBpwB6tRViTiWE2ZgNR,0.0,10,0.0858,-7.111,0,DNA.,51,0.355,139.924,4,https://api.spotify.com/v1/tracks/1C1OBpwB6tRV...,audio_features,spotify:track:1C1OBpwB6tRViTiWE2ZgNR,0.424
8671,0.188,https://api.spotify.com/v1/audio-analysis/1SuV...,0.502,195880,0.901,1SuVJpFhGQOSBAB6VUpHYH,4.9e-05,7,0.28,-4.467,1,The Boy With the Thorn In His Side - 2017 Master,54,0.0459,119.022,4,https://api.spotify.com/v1/tracks/1SuVJpFhGQOS...,audio_features,spotify:track:1SuVJpFhGQOSBAB6VUpHYH,0.821
599,0.75,https://api.spotify.com/v1/audio-analysis/2tgQ...,0.536,224947,0.574,2tgQaL85WoRfgEa4hFQgrE,0.00277,7,0.105,-6.705,1,Feathered Indians,65,0.0254,87.194,4,https://api.spotify.com/v1/tracks/2tgQaL85WoRf...,audio_features,spotify:track:2tgQaL85WoRfgEa4hFQgrE,0.877
3862,0.939,https://api.spotify.com/v1/audio-analysis/0QbT...,0.782,84315,0.127,0QbTsXbzGOnxKW5PQFR39f,0.781,7,0.125,-20.939,1,summer nights.,61,0.668,83.989,4,https://api.spotify.com/v1/tracks/0QbTsXbzGOnx...,audio_features,spotify:track:0QbTsXbzGOnxKW5PQFR39f,0.39


In [13]:
features = ["acousticness", "danceability", "energy", 
            "loudness", "speechiness", "tempo", "valence"]

In [14]:
train, test = train_test_split(data, test_size=0.15)
x_train, y_train = train[features], train["popularity"]
x_test, y_test = test[features], test["popularity"]
print("Training size: {}, Test size: {}".format(len(train), len(test)))

Training size: 8500, Test size: 1500


## Baseline (Linear Regression)

In [17]:
lin_mod = LinearRegression()
lin_mod.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [18]:
test_preds = lin_mod.predict(x_test)
print("Mean Squared Error: {}".format(mean_squared_error(test_preds, y_test)))
print("Variance Score: {}".format(r2_score(test_preds, y_test)))

Mean Squared Error: 67.99115776685458
Variance Score: -112.49328819396099


In [19]:
coeffs = dict(zip(features, lin_mod.coef_))
print("Model coefficients: {}".format(coeffs))

Model coefficients: {'acousticness': 0.15630271998958647, 'danceability': 4.043510358512564, 'energy': -1.0842767017857033, 'loudness': 0.03864707700303794, 'speechiness': -3.3099628374622294, 'tempo': -0.004667247329210432, 'valence': -0.7306622754990316}


## SVM 

In [22]:
svr = SVR(gamma='scale')
svr.fit(x_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [23]:
test_preds = svr.predict(x_test)
print("SVM Mean Squared Error: {}".format(mean_squared_error(test_preds, y_test)))
print("SVM Variance Score: {}".format(r2_score(test_preds, y_test)))

SVM Mean Squared Error: 69.73567112611396
SVM Variance Score: -134.34559581928067
