In [13]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from matplotlib import pyplot as plt
import seaborn as sns

import itertools
import random

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util

%matplotlib inline

In [2]:
CLIENT_ID="810e2eec841546269c49f338f1be189a"
CLIENT_SECRET="764290d0ab0f4f4484b0d4c7b701bdee"

token = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)

cache_token = token.get_access_token()
sp = spotipy.Spotify(cache_token)

In [3]:
sample_playlist = sp.user_playlist("joycex99", "0yWeSBDVEwGPLMH7EXI3tX")

In [4]:
#Get (song id, song name, popularity) out of playlists
def extract_songs(playlist):
    tracks = playlist["tracks"]
    info = [(item["track"]["id"], item["track"]["name"], item["track"]["popularity"]) for item in tracks["items"]]
    while tracks["next"]:
        tracks = sp.next(tracks)
        info.extend([(item["track"]["id"], item["track"]["name"], item["track"]["popularity"]) for item in tracks["items"]])
    return info

In [86]:
# Get (song_id, song_name, popularity) for num_tracks songs in given year
# Goes through songs sequentially to get tracks
def tracks_from_year(year, num_tracks):
    tracks = sp.search(q='year:' + str(year), type='track', offset=0, limit=50)
    print("Number of tracks in {}: {}".format(year, tracks['tracks']['total']))
    info = [(item["id"], item["name"], item["popularity"]) for item in tracks["tracks"]["items"]]
    while tracks["tracks"]["next"] and len(info) < num_tracks:
        tracks = sp.next(tracks["tracks"])
        info.extend([(item["id"], item["name"], item["popularity"]) for item in tracks["tracks"]["items"]])
        if len(info) % 1000 == 0:
            print("Retrieved {} songs".format(len(info)))
    return info[:num_tracks]

In [87]:
# Get (song_id, song_name, popularity) for num_tracks songs in a given year
# Selects those songs randomly from the given year's top 10,000
def random_tracks_from_year(year, num_tracks):
    tracks = sp.search(q='year:' + str(year), type='track')
    print("Number of tracks in {}: {}".format(year, tracks['tracks']['total']))
    max_track = min(tracks['tracks']['total'], 9999) # Spotify limits offset to 9999
    infos = []
    for _ in range(num_tracks):
        track_num = random.randint(1, max_track) # May repeat songs
        result = sp.search(q='year:' + str(year), type='track', offset=track_num, limit=1)
        track = result['tracks']['items'][0]
        info = (track['id'], track['name'], track['popularity'])
        infos.append(info)
    return infos

In [88]:
songs = tracks_from_year(2017, 20000)
print("Songs we use: {}".format(len(songs)))
print(songs[:10])
print(songs[-10:])

Number of tracks in 2017: 7455835
Retrieved 1000 songs
Retrieved 2000 songs
Retrieved 3000 songs
Retrieved 4000 songs
Retrieved 5000 songs
Retrieved 6000 songs
Retrieved 7000 songs
Retrieved 8000 songs
Retrieved 9000 songs
Retrieved 10000 songs


SpotifyException: http status: 404, code:-1 - https://api.spotify.com/v1/search?query=year%3A2017&type=track&offset=10000&limit=50:
 Not found.

In [6]:
sp.audio_features(songs[0][0])

[{'danceability': 0.539,
  'energy': 0.855,
  'key': 0,
  'loudness': -4.987,
  'mode': 1,
  'speechiness': 0.0583,
  'acousticness': 0.0469,
  'instrumentalness': 9.5e-05,
  'liveness': 0.52,
  'valence': 0.346,
  'tempo': 138.028,
  'type': 'audio_features',
  'id': '3ftfaNstpkuUIMxlYOl1WX',
  'uri': 'spotify:track:3ftfaNstpkuUIMxlYOl1WX',
  'track_href': 'https://api.spotify.com/v1/tracks/3ftfaNstpkuUIMxlYOl1WX',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/3ftfaNstpkuUIMxlYOl1WX',
  'duration_ms': 234572,
  'time_signature': 4}]

In [7]:
''' Returns a list of each track's audio features'''
def features(songs):
    all_feats = []
    ids = [song[0] for song in songs]
    # Spotify's audio_features can only return 50 songs at once
    for i in range(0, len(songs), 50):
        track_feats = sp.audio_features(ids[i:i+50])
        for j in range(len(track_feats)):
            track_feats[j]['name'] = songs[i+j][1]
            track_feats[j]['popularity'] = songs[i+j][2]
        all_feats.extend(track_feats)
    return all_feats

In [8]:
feats = features(songs)
feats[100]

{'danceability': 0.59,
 'energy': 0.871,
 'key': 11,
 'loudness': -3.353,
 'mode': 0,
 'speechiness': 0.0891,
 'acousticness': 0.148,
 'instrumentalness': 0,
 'liveness': 0.0739,
 'valence': 0.304,
 'tempo': 134.956,
 'type': 'audio_features',
 'id': '1839RWzPRZCVgvamu4sHSN',
 'uri': 'spotify:track:1839RWzPRZCVgvamu4sHSN',
 'track_href': 'https://api.spotify.com/v1/tracks/1839RWzPRZCVgvamu4sHSN',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/1839RWzPRZCVgvamu4sHSN',
 'duration_ms': 191331,
 'time_signature': 4,
 'name': 'All My Love (feat. Conor Maynard)',
 'popularity': 70}

## Training

In [98]:
data = pd.DataFrame(feats)
data = data.sample(frac=1)
data.head()

Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,name,popularity,speechiness,tempo,time_signature,track_href,type,uri,valence
554,0.0293,https://api.spotify.com/v1/audio-analysis/3B54...,0.726,233902,0.769,3B54sVLJ402zGa6Xm4YGNe,0.0101,6,0.104,-5.043,1,Unforgettable,86,0.123,97.985,4,https://api.spotify.com/v1/tracks/3B54sVLJ402z...,audio_features,spotify:track:3B54sVLJ402zGa6Xm4YGNe,0.733
367,0.111,https://api.spotify.com/v1/audio-analysis/0RUX...,0.448,255893,0.885,0RUXHlYhA057fCOe2vqIqu,4e-06,9,0.0476,-3.755,1,Ships In The Night,55,0.0562,178.151,4,https://api.spotify.com/v1/tracks/0RUXHlYhA057...,audio_features,spotify:track:0RUXHlYhA057fCOe2vqIqu,0.134
341,0.253,https://api.spotify.com/v1/audio-analysis/6uBh...,0.69,210091,0.622,6uBhi9gBXWjanegOb2Phh0,0.0,5,0.116,-5.025,0,Stay (with Alessia Cara),84,0.0622,102.04,4,https://api.spotify.com/v1/tracks/6uBhi9gBXWja...,audio_features,spotify:track:6uBhi9gBXWjanegOb2Phh0,0.544
321,0.0144,https://api.spotify.com/v1/audio-analysis/0fYV...,0.526,195200,0.862,0fYVliAYKHuPmECRs1pbRf,0.0597,2,0.229,-6.003,1,Renegades,76,0.0905,90.052,4,https://api.spotify.com/v1/tracks/0fYVliAYKHuP...,audio_features,spotify:track:0fYVliAYKHuPmECRs1pbRf,0.528
654,0.000339,https://api.spotify.com/v1/audio-analysis/3GnL...,0.905,177604,0.596,3GnLo84IkdSWCPYt6tnLll,2.1e-05,1,0.0897,-7.496,0,HUMBLE.,12,0.118,149.996,4,https://api.spotify.com/v1/tracks/3GnLo84IkdSW...,audio_features,spotify:track:3GnLo84IkdSWCPYt6tnLll,0.422


In [99]:
features = ["acousticness", "danceability", "energy", 
            "loudness", "speechiness", "tempo", "valence"]

In [100]:
train, test = train_test_split(data, test_size=0.15)
x_train, y_train = train[features], train["popularity"]
x_test, y_test = test[features], test["popularity"]
print("Training size: {}, Test size: {}".format(len(train), len(test)))

Training size: 577, Test size: 102


In [103]:
lin_mod = LinearRegression()
lin_mod.fit(x_train, y_train)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)