In [98]:
import itertools
import os
import random

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score

from matplotlib import pyplot as plt
import seaborn as sns

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util

%matplotlib inline

In [2]:
# Spotify API credentials are read from the environment so that no secret is
# stored in the notebook. Export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET
# before starting the kernel; a missing variable raises KeyError here.
# NOTE(review): the key pair that used to be hardcoded in this cell has been
# shared with the notebook and should be rotated in the Spotify dashboard.
CLIENT_ID = os.environ["SPOTIPY_CLIENT_ID"]
CLIENT_SECRET = os.environ["SPOTIPY_CLIENT_SECRET"]

token = SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)

# Kept so that any cell still referencing `cache_token` continues to work.
cache_token = token.get_access_token()

# Give the client the credentials manager itself rather than a one-shot token
# string, so it can obtain a fresh token when the current one expires during
# long scraping runs.
sp = spotipy.Spotify(client_credentials_manager=token)

In [3]:
# Fetch one playlist owned by user "joycex99" to experiment with.
sample_playlist = sp.user_playlist(
    user="joycex99",
    playlist_id="0yWeSBDVEwGPLMH7EXI3tX",
)

In [4]:
#Get (song id, song name, popularity) out of playlists
def extract_songs(playlist):
    """Collect ``(id, name, popularity)`` for every track of a playlist.

    Walks every page of ``playlist["tracks"]``, following ``"next"`` links
    through the module-level ``sp`` client.

    Args:
        playlist: Playlist dict with a paged ``"tracks"`` entry; each element
            of its ``"items"`` list is a dict with a ``"track"`` entry.

    Returns:
        List of ``(track_id, track_name, popularity)`` tuples in page order.
        Items whose ``"track"`` is missing or ``None`` are skipped; previously
        such an item raised ``TypeError``.
    """
    info = []
    tracks = playlist["tracks"]
    while tracks:
        for item in tracks["items"]:
            track = item.get("track")
            if track is None:
                # Placeholder entry without track data; nothing to extract.
                continue
            info.append((track["id"], track["name"], track["popularity"]))
        # Only touch the API when the current page advertises a successor.
        tracks = sp.next(tracks) if tracks["next"] else None
    return info

In [86]:
# Get (song_id, song_name, popularity) for num_tracks songs in given year
# Goes through songs sequentially to get tracks
def tracks_from_year(year, num_tracks):
    """Return the first ``num_tracks`` search hits for ``year``.

    Runs a ``year:<year>`` track search through the module-level ``sp`` client
    in pages of 50 and keeps following the ``"next"`` link until enough songs
    have been gathered or the results run out. Prints the reported total and a
    progress line whenever the running count is a multiple of 1000.

    Args:
        year: Release year to search for (converted with ``str``).
        num_tracks: Maximum number of songs to return.

    Returns:
        List of at most ``num_tracks`` ``(id, name, popularity)`` tuples, in
        the order the search returned them.
    """
    def as_triples(page_items):
        return [(entry["id"], entry["name"], entry["popularity"]) for entry in page_items]

    page = sp.search(q='year:' + str(year), type='track', offset=0, limit=50)["tracks"]
    print("Number of tracks in {}: {}".format(year, page['total']))

    collected = as_triples(page["items"])
    while page["next"] and len(collected) < num_tracks:
        page = sp.next(page)["tracks"]
        collected += as_triples(page["items"])
        if not len(collected) % 1000:
            print("Retrieved {} songs".format(len(collected)))

    # The last page may overshoot the requested amount.
    return collected[:num_tracks]

In [87]:
# Get (song_id, song_name, popularity) for num_tracks songs in a given year
# Selects those songs randomly from the given year's top 10,000
def random_tracks_from_year(year, num_tracks):
    """Sample songs at random search offsets for ``year``.

    Each sample is one ``year:<year>`` track search with ``limit=1`` at a
    uniformly drawn offset, issued through the module-level ``sp`` client.
    Offsets are zero-based and drawn from ``0 .. min(total, 10000) - 1``, so
    the first result can be selected and the offset never points past the last
    available result. The previous ``randint(1, max_track)`` could hit an empty
    page and raise ``IndexError``.

    Args:
        year: Release year to search for (converted with ``str``).
        num_tracks: Number of draws to make. Draws are independent, so the
            same song may appear more than once.

    Returns:
        List of ``(id, name, popularity)`` tuples. Empty when the search
        reports no tracks; may be shorter than ``num_tracks`` if a draw comes
        back with no item.
    """
    query = 'year:' + str(year)
    # Only the total is needed from this first request, so fetch one item.
    total = sp.search(q=query, type='track', limit=1)['tracks']['total']
    print("Number of tracks in {}: {}".format(year, total))
    if total <= 0:
        return []

    max_offset = min(total, 10000) - 1  # Spotify limits offset to 9999
    infos = []
    for _ in range(num_tracks):
        offset = random.randint(0, max_offset)  # May repeat songs
        result = sp.search(q=query, type='track', offset=offset, limit=1)
        items = result['tracks']['items']
        if not items:
            # Reported total was larger than what is retrievable at this offset.
            continue
        track = items[0]
        infos.append((track['id'], track['name'], track['popularity']))
    return infos

In [89]:
# Pull up to 10,000 search results for 2017, then show how many were kept
# along with the first and last ten entries.
songs = tracks_from_year(year=2017, num_tracks=10000)
print("Songs we use: {}".format(len(songs)))
print(songs[:10], songs[-10:], sep="\n")

Number of tracks in 2017: 7461170
Retrieved 1000 songs
Retrieved 2000 songs
Retrieved 3000 songs
Retrieved 4000 songs
Retrieved 5000 songs
Retrieved 6000 songs
Retrieved 7000 songs
Retrieved 8000 songs
Retrieved 9000 songs
Retrieved 10000 songs
Songs we use: 10000
[('7m9OqQk4RVRkw9JJdeAw96', 'Jocelyn Flores', 93), ('7AQim7LbvFVZJE3O8TYgf2', 'Fuck Love (feat. Trippie Redd)', 91), ('3GVkPk8mqxz0itaAriG1L7', 'Everybody Dies In Their Nightmares', 89), ('0OgGn1ofaj55l2PcihQQGV', 'Drew Barrymore', 86), ('7sO5G9EABYOXQKNPNiE9NR', 'Ric Flair Drip (& Metro Boomin)', 87), ('7GX5flRQZVHRAGd6B4TmDO', 'XO TOUR Llif3', 88), ('7oJrVhiTPZGBXWPwhxjy9S', 'Lust', 85), ('5Z3GHaZ6ec9bsiI5BenrbY', 'Young Dumb & Broke', 87), ('0At2qAoaVjIwWNAqrscXli', "Boo'd Up", 85), ('21RzyxY3EFaxVy6K4RqaU9', 'Body', 86)]
[('6BPO4rLBfsAJsdgxG4ufFx', 'The Lamp Is High', 50), ('6S1HT3w4F9rWrw8sqQuyR1', 'Потрачу', 57), ('7FQzzDpesa2sgsXkGMBRqz', 'Where We Go', 46), ('7LXCM5l6oRq0BXfgXTZz0S', 'El Fifty', 47), ('7zkXulA0S5cja2A

In [90]:
# Look at the raw audio-features response for the first retrieved song.
first_song_id = songs[0][0]
sp.audio_features(first_song_id)

[{'danceability': 0.872,
  'energy': 0.391,
  'key': 0,
  'loudness': -9.144,
  'mode': 0,
  'speechiness': 0.242,
  'acousticness': 0.469,
  'instrumentalness': 4.13e-06,
  'liveness': 0.297,
  'valence': 0.437,
  'tempo': 134.021,
  'type': 'audio_features',
  'id': '7m9OqQk4RVRkw9JJdeAw96',
  'uri': 'spotify:track:7m9OqQk4RVRkw9JJdeAw96',
  'track_href': 'https://api.spotify.com/v1/tracks/7m9OqQk4RVRkw9JJdeAw96',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/7m9OqQk4RVRkw9JJdeAw96',
  'duration_ms': 119133,
  'time_signature': 4}]

In [91]:
def features(songs, batch_size=50):
    """Return a list of each track's audio features.

    Song ids are sent to ``sp.audio_features`` (module-level client) in chunks
    of ``batch_size``, and every returned feature dict is tagged with the
    song's ``'name'`` and ``'popularity'``.

    Args:
        songs: Sequence of ``(song_id, song_name, popularity)`` tuples.
        batch_size: Number of ids per request. Defaults to 50, the chunk size
            this notebook has always used.

    Returns:
        List of feature dicts, in input order. Songs for which the response
        holds ``None`` instead of a dict are left out; previously such an
        entry raised ``TypeError``.
    """
    all_feats = []
    for start in range(0, len(songs), batch_size):
        batch = songs[start:start + batch_size]
        # Treat a missing response as an empty page rather than crashing.
        track_feats = sp.audio_features([song[0] for song in batch]) or []
        # Results are positional, so pair each one with the song it was requested for.
        for song, feat in zip(batch, track_feats):
            if feat is None:
                continue
            feat['name'] = song[1]
            feat['popularity'] = song[2]
            all_feats.append(feat)
    return all_feats

In [92]:
# Fetch the audio features for every retrieved song, then spot-check one entry.
feats = features(songs=songs)
feats[100]

{'danceability': 0.684,
 'energy': 0.619,
 'key': 10,
 'loudness': -7.005,
 'mode': 1,
 'speechiness': 0.0386,
 'acousticness': 0.0716,
 'instrumentalness': 0,
 'liveness': 0.122,
 'valence': 0.284,
 'tempo': 125.046,
 'type': 'audio_features',
 'id': '65fpYBrI8o2cfrwf2US4gq',
 'uri': 'spotify:track:65fpYBrI8o2cfrwf2US4gq',
 'track_href': 'https://api.spotify.com/v1/tracks/65fpYBrI8o2cfrwf2US4gq',
 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/65fpYBrI8o2cfrwf2US4gq',
 'duration_ms': 217440,
 'time_signature': 4,
 'name': 'Rewrite The Stars',
 'popularity': 83}

## Training

In [93]:
# Build the feature table and shuffle its rows. The shuffle is seeded so that
# Restart & Run All reproduces the same row order (and therefore the same
# preview below) on every run.
data = pd.DataFrame(feats).sample(frac=1, random_state=42)
data.head()

Unnamed: 0,acousticness,analysis_url,danceability,duration_ms,energy,id,instrumentalness,key,liveness,loudness,mode,name,popularity,speechiness,tempo,time_signature,track_href,type,uri,valence
6695,0.0435,https://api.spotify.com/v1/audio-analysis/1aeY...,0.28,260068,0.545,1aeYve33HMLMBEhB3WVHNa,0.00273,4,0.0573,-7.732,0,Gallows,56,0.0529,146.098,4,https://api.spotify.com/v1/tracks/1aeYve33HMLM...,audio_features,spotify:track:1aeYve33HMLMBEhB3WVHNa,0.0395
9414,0.00689,https://api.spotify.com/v1/audio-analysis/6Xtr...,0.485,225813,0.948,6XtrMjkNfRb0XjmlzefbJq,0.0,0,0.218,-2.68,1,No Future,50,0.131,162.966,4,https://api.spotify.com/v1/tracks/6XtrMjkNfRb0...,audio_features,spotify:track:6XtrMjkNfRb0XjmlzefbJq,0.481
5026,0.00619,https://api.spotify.com/v1/audio-analysis/5ZRY...,0.75,211875,0.548,5ZRYax9cJzFlxc961BVGEH,0.0,9,0.156,-6.288,1,The Race - Remix,53,0.358,160.084,4,https://api.spotify.com/v1/tracks/5ZRYax9cJzFl...,audio_features,spotify:track:5ZRYax9cJzFlxc961BVGEH,0.377
6510,0.972,https://api.spotify.com/v1/audio-analysis/08h8...,0.193,208040,0.0329,08h8ICqh9extUqzotddLRQ,0.929,7,0.0978,-28.228,1,Saint-Saëns / Transc. Vidal: Le carnaval des a...,56,0.046,82.165,3,https://api.spotify.com/v1/tracks/08h8ICqh9ext...,audio_features,spotify:track:08h8ICqh9extUqzotddLRQ,0.0366
4337,0.27,https://api.spotify.com/v1/audio-analysis/3JvD...,0.664,195493,0.788,3JvDAV3iV1KH0EmnfdU8kf,0.0,5,0.289,-4.607,1,Antecedentes De Culpa,64,0.124,150.04,4,https://api.spotify.com/v1/tracks/3JvDAV3iV1KH...,audio_features,spotify:track:3JvDAV3iV1KH0EmnfdU8kf,0.876


In [94]:
# Audio-feature columns used as model inputs when predicting "popularity".
# NOTE(review): this rebinds the name `features`, which shadows the
# `features()` function defined earlier in the notebook. Re-running the cell
# that calls `features(songs)` after this one fails until the function cell is
# executed again.
features = ["acousticness", "danceability", "energy", 
            "loudness", "speechiness", "tempo", "valence"]

In [95]:
# Hold out 15% of the songs for evaluation. The split is seeded so the same
# rows land in train/test on every run and the reported error is reproducible.
train, test = train_test_split(data, test_size=0.15, random_state=42)
x_train, y_train = train[features], train["popularity"]
x_test, y_test = test[features], test["popularity"]
print("Training size: {}, Test size: {}".format(len(train), len(test)))

Training size: 8500, Test size: 1500


In [96]:
# Ordinary least-squares fit of popularity on the selected audio features.
lin_mod = LinearRegression()
lin_mod.fit(X=x_train, y=y_train)

  linalg.lstsq(X, y)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [99]:
# Score the model on the held-out songs. scikit-learn metrics take the ground
# truth first and the predictions second; MSE is symmetric, so the printed
# value is unchanged, but the documented order avoids mistakes if this call is
# copied for an asymmetric metric such as r2_score.
test_preds = lin_mod.predict(x_test)
print("Mean Squared Error: {}".format(mean_squared_error(y_test, test_preds)))

Mean Squared Error: 61.10799458156196
