In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import random
import string
import pandas as pd
import numpy as np

#Spotipy documentation: https://spotipy.readthedocs.io/en/master/

In [2]:
#Fill in the client ID and client secret of your Spotify API application here
SPOTIPY_CLIENT_ID = ''
SPOTIPY_CLIENT_SECRET = ''
#REDIRECT_URI and scope are defined here but not passed to the auth manager below
REDIRECT_URI = 'http://localhost:7000/callback'
scope = "user-library-read"

#Client-credentials authentication, using spotipy's MemoryCacheHandler as cache handler
cache_handler = spotipy.cache_handler.MemoryCacheHandler()
auth_manager = SpotifyClientCredentials(
    client_id = SPOTIPY_CLIENT_ID,
    client_secret = SPOTIPY_CLIENT_SECRET,
    cache_handler = cache_handler,
)

#Client object used for every Spotify request in this notebook
sp = spotipy.Spotify(auth_manager = auth_manager)

The next cell finds a random song that is available in the Finnish market on Spotify and returns its name, artist, popularity and audio features, together with the total number of search results.

In [4]:

#Build a wildcard search term from one random letter, and pick a random result offset
random_character = random.choice(string.ascii_letters)
random_search = f'{random_character}%'
offset = random.randint(1,1000)
year, genre = '2022', 'hip-hop'

#Test to fetch random songs with specific genre
query = f'track:{random_search} year:{year} genre: {genre}'
songs = sp.search(q = query, type = 'track', market = 'FI', offset = offset)

#Take the first hit of the result page and look up its audio features
tracks_page = songs['tracks']
first_song = tracks_page['items'][0]
first_song_name = first_song['name']
first_song_artist = first_song['artists'][0]['name']
first_song_popularity = first_song['popularity']
first_song_audio_features = sp.audio_features(first_song['id'])

#Cell output: the extracted fields plus the total number of matches reported by the search
(first_song_name,
 first_song_artist,
 first_song_popularity,
 first_song_audio_features,
 tracks_page['total'])

('Sobra Que Decir',
 'Neto Reyno',
 21,
 [{'danceability': 0.885,
   'energy': 0.538,
   'key': 1,
   'loudness': -10.121,
   'mode': 1,
   'speechiness': 0.317,
   'acousticness': 0.00259,
   'instrumentalness': 1.56e-06,
   'liveness': 0.045,
   'valence': 0.445,
   'tempo': 96.062,
   'type': 'audio_features',
   'id': '6BlKVfDucwuCQE8DpXB1Yl',
   'uri': 'spotify:track:6BlKVfDucwuCQE8DpXB1Yl',
   'track_href': 'https://api.spotify.com/v1/tracks/6BlKVfDucwuCQE8DpXB1Yl',
   'analysis_url': 'https://api.spotify.com/v1/audio-analysis/6BlKVfDucwuCQE8DpXB1Yl',
   'duration_ms': 175004,
   'time_signature': 4}],
 1675)

# Next, we make a function to fetch a sample of songs of a specific genre

In [6]:
#function to fetch songs from a specific genre
#returns: dataframe with song name, artist name, and audio features
def fetch_songs(sp, genre, year, number, columns, max_stale_fetches = 100):
    """Collect at least `number` distinct songs of a genre and year via random searches.

    Each iteration runs one random wildcard track search (up to 50 hits) and
    requests the audio features of all new hits in a single batched call.

    Parameters
    ----------
    sp : object
        Client exposing `search(...)` and `audio_features(...)`
        (the spotipy client in this notebook).
    genre, year : str
        Concatenated into the search query string.
    number : int
        Minimum number of distinct songs to collect.
    columns : list of str
        Column names of the result. The first three receive the song name,
        the first artist's name and the popularity; every remaining name is
        looked up as a key in the song's audio-features dict (missing keys
        become None).
    max_stale_fetches : int or None, default 100
        Give up after this many consecutive searches that added no new song,
        emit a warning and return what was collected so far. None disables
        the limit (the loop then only ends once `number` songs were found).

    Returns
    -------
    pandas.DataFrame
        One row per distinct song id, with the columns 'type', 'track_href',
        'analysis_url' and 'time_signature' dropped (they must be present in
        `columns`, otherwise the drop raises KeyError).
    """
    import warnings

    columns = list(columns)
    feature_columns = columns[3:]

    rows = []            #one list per accepted song, in `columns` order
    seen_ids = set()     #song ids already handled, used to skip duplicates
    stale_fetches = 0    #consecutive searches that added nothing

    #Fetch songs until there are at least `number` songs collected
    while len(rows) < number:

        #Make random search by some random letter
        offset = random.randint(1,1000)
        random_character = random.choice(string.ascii_letters)
        random_search = random.choice([random_character + '%'
                                       ,'%' + random_character
                                       ,'%' + random_character + '%'])
        #NOTE(review): there is a space after 'genre:' in the query - confirm the
        #search API still applies it as a genre filter
        songs = sp.search(q = 'track:' + random_search + ' year:' + year + ' genre: ' +  genre, type = 'track', market = 'FI', offset = offset, limit = 50)

        #Keep only hits that have an id and were not seen before
        fresh = {}
        for song in songs['tracks']['items']:
            if song is None or not song.get('id') or song['id'] in seen_ids:
                continue
            fresh.setdefault(song['id'], song)

        added = 0
        if fresh:
            #One request for the whole page instead of one request per song
            features_batch = sp.audio_features(list(fresh))
            for song, features in zip(fresh.values(), features_batch):
                seen_ids.add(song['id'])
                #Songs without audio features come back as None - skip them
                if features is None:
                    continue
                rows.append([song['name'], song['artists'][0]['name'], song['popularity']]
                            + [features.get(key) for key in feature_columns])
                added += 1

        stale_fetches = 0 if added else stale_fetches + 1
        if max_stale_fetches is not None and stale_fetches >= max_stale_fetches:
            warnings.warn('fetch_songs stopped after %d searches without new songs; '
                          'returning %d of %d requested songs'
                          % (stale_fetches, len(rows), number))
            break

    #Build the dataframe once from all collected rows
    df = pd.DataFrame(rows, columns = columns)
    df = df.drop(['type', 'track_href', 'analysis_url', 'time_signature'], axis = 1)

    return df

#Column layout: three descriptive columns followed by the audio-feature keys of the sample song fetched above
columns = ['song_name', 'artist_name', 'popularity'] + list(first_song_audio_features[0].keys())
#Test run, fetches at least 10 rap songs from 2019 (each search request returns up to 50 songs)
data = fetch_songs(sp, 'rap', '2019', 10, columns)
data.head()

Unnamed: 0,song_name,artist_name,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,uri,duration_ms
0,Leave Me (feat. Marc E. Bassy),Skizzy Mars,53,0.696,0.729,1,-3.779,0,0.0485,0.046,0.0,0.178,0.555,173.954,7gNRYLOwgmzMEpgXARSnX9,spotify:track:7gNRYLOwgmzMEpgXARSnX9,171034
1,Dark Matter,lil aaron,50,0.313,0.457,11,-8.187,0,0.0322,0.877,1.4e-05,0.134,0.378,75.592,59fqSD7qydA8PQBStiLZCE,spotify:track:59fqSD7qydA8PQBStiLZCE,137661
2,"Royalty (feat. Ky-Mani Marley, Stefflon Don & ...",XXXTENTACION,48,0.919,0.538,1,-6.15,1,0.241,0.0243,0.0415,0.0956,0.597,124.952,0OTBOHjEaoqroFlfut1ZsH,spotify:track:0OTBOHjEaoqroFlfut1ZsH,203570
3,WHERE'S MY MONEY,Night Lovell,43,0.787,0.536,10,-8.523,0,0.0438,0.275,2.1e-05,0.184,0.341,125.021,4ly6SAui02c2LUV0yftP3U,spotify:track:4ly6SAui02c2LUV0yftP3U,176628
4,Ingen Som Varna Mig,Einár,45,0.591,0.56,8,-8.568,1,0.421,0.692,0.0,0.11,0.588,138.05,454DGr2OoValu6yW3QtIqR,spotify:track:454DGr2OoValu6yW3QtIqR,188738


### Getting Spotify featured playlist songs from Finland

The next step is to fetch all the songs from the featured Spotify playlists in Finland and extract their audio features, which can be used to classify random songs.

In [7]:
#Plan for the following cells:
# - Get all playlist ids, put them into a data frame
# - Get all songs from each playlist
# - Get average song for each playlist
# - Input: find closest playlist that fits to the song

get_featured_playlists = sp.featured_playlists(country = 'FI')['playlists']['items']

#Map playlist name -> playlist id (a later playlist with the same name overwrites an earlier one)
playlist_ids = {}

for playlist in get_featured_playlists:
    #Defensive: skip null entries in the item list
    if playlist is None:
        continue
    playlist_ids[playlist['name']] = playlist['id']

#columns must be an ordered list-like; a set is unordered and rejected by recent pandas versions
playlists = pd.DataFrame.from_dict(playlist_ids, orient = 'index', columns = ['id'])
playlists.index.name = 'playlist_name'
playlists = playlists.reset_index()
playlists

Unnamed: 0,playlist_name,id
0,EQUAL Suomi,37i9dQZF1DX4B8pEz8Paq4
1,It's Hits Suomi,37i9dQZF1DX64Cx4vTeaRB
2,#vainsuomihitit,37i9dQZF1DWUvzPS8uIABd
3,Matkalaulut,37i9dQZF1DX3zHI0Yn6qtf
4,Poppia työpäivään,37i9dQZF1DX8JRb0iafpW2
5,Best New Pop,37i9dQZF1DX3bf2kWnX8S4
6,"Jytää, purkkaa ja Finnhitsejä",37i9dQZF1DXdvA6jt8SAU9
7,Rentoa konerytmiä,37i9dQZF1DX4JZt84Ykelm
8,Hit Replay Suomi,37i9dQZF1DX8sjRUtu4bjr
9,Suomirock tänään,37i9dQZF1DWSTzzP5k4u3n


In [8]:
#Maps playlist name -> list of track ids in that playlist
playlist_songs = {}

#For each playlist, fetch the ids of all of its tracks
for playlist_name, playlist_id in zip(playlists['playlist_name'], playlists['id']):
    ids = []

    #playlist_items returns one page at a time; follow the pages with sp.next
    #(which returns None after the last page) so long playlists are not truncated
    page = sp.playlist_items(playlist_id)
    while page:
        for song in page['items']:
            track = song['track']
            #Skip entries without a track or without a track id
            if track is None or track['id'] is None:
                continue
            ids.append(track['id'])
        page = sp.next(page)

    playlist_songs[playlist_name] = ids
    

In [9]:
#Display the full playlist name -> track id list mapping built in the previous cell
playlist_songs

{'EQUAL Suomi': ['3sbb2aWz92JdxfaqSXCUuR',
  '2f3M4kFTnPCncJSjN515fO',
  '0HPRKUbiCMi379ksDyFbW0',
  '2Yp3dcDxBQByH3yPIy7Zyf',
  '45AnFdNtMvqc6slOLCDMTy',
  '0D0YCEigYKmWYWSz2wOmYi',
  '4miD5oroBLYIpzJXvPgcvX',
  '1dfXFQPfDYDhrjjcIhNzE5',
  '4eZ0Vbz3g5pSl6SI6IajKZ',
  '5AbP52CPQ5X1x9yJT6uAHE',
  '5rRgxNY04Ji8ZlGstHaB80',
  '5cC1DQf1UwQHVRLpJZxpWf',
  '6mJnb88Q0RupbFaQPScT3R',
  '1KODYWcqyguVZOPaJ7sP7V',
  '3uCZZjx6LqJDkmo8KvAGCX',
  '6CITvD2LkozBWMrsHExUFC',
  '08EzYkjNJNJs36DI8Orlzr',
  '4nzQE9dz4JAEjkcEzGrJaj',
  '2LW42Oq7DIrqoTFTogiNsH',
  '5vx35J4nbvNc4bVqn3Ja7F',
  '10G4sUXO33FT5SyRTdE4GM',
  '24kuaBw8ZtA8bWYc8nKkzl',
  '5fMmWGMw9RrXOYSRih2IQc',
  '0Q68xNKpgGV0kwhcbuzrXL',
  '6mhxecCawEXGeIOKaPAFsA',
  '3kwx7XvObEbyHqSx1F0wFD',
  '3gCPzKR0Bnt6JYz386rR22',
  '5K47ppXL4DxTMh4ETLSBu7',
  '7Jflff0pSp2N3NAV9Zjmbs',
  '3z0qlpABMLpTUxlD9DGk0S',
  '0l7WmcO1o4wz4Aowqvjn0t',
  '4sWsyqci52cn5hwX44LeT9',
  '1sYm4xMCPUaGPAdbJFRpUc',
  '7ISbYeSvXIQ68pZyjONTBp',
  '68Oveqxuaou6GBokfmkiVp',
  '3s

In [10]:
#Fetch all the songs and make one dataframe row per (playlist, song)
BATCH_SIZE = 50   #ids per request; sp.tracks accepts at most 50 ids per call

song_data = []

#Go through each playlist
for playlist_name, song_ids in playlist_songs.items():
    #Ids that are None cannot be looked up
    valid_ids = [song_id for song_id in song_ids if song_id is not None]

    #Fetch track info and audio features batch-wise instead of one request per song
    for start in range(0, len(valid_ids), BATCH_SIZE):
        batch = valid_ids[start:start + BATCH_SIZE]
        tracks = sp.tracks(batch)['tracks']
        features_batch = sp.audio_features(batch)

        for song, audio_features in zip(tracks, features_batch):
            #Unknown tracks / tracks without audio features come back as None
            if song is None or audio_features is None:
                continue

            #Store by key so the columns never depend on the dict's value order
            song_data.append({'playlist_name': playlist_name,
                              'song_name': song['name'],
                              'artist': song['artists'][0]['name'],
                              'popularity': song['popularity'],
                              **audio_features})

#Put all of the data into a dataframe; the columns are taken from the record keys,
#so this also works when the last song had no features or no song was fetched at all
featured_playlist_song_features = pd.DataFrame(song_data)
columns = list(featured_playlist_song_features.columns)

In [12]:
import csv

#Full column layout of the freshly fetched data (not referenced further in this cell)
columns = ['playlist_name',
 'song_name',
 'artist',
 'popularity',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'type',
 'id',
 'uri',
 'track_href',
 'analysis_url',
 'duration_ms',
 'time_signature']

#Fetch the original csv and concatenate the new data to it
df = pd.concat([featured_playlist_song_features, pd.read_csv('./data/finnish_playlist_data.csv')], axis = 0, ignore_index = True)

#A song may appear in several playlists, so duplicates are judged per (song id, playlist) pair
df = df.drop_duplicates(subset = ['id', 'playlist_name']).reset_index(drop = True)

#Keep only the columns used by the rest of the notebook
df = df[['playlist_name', 'song_name', 'id', 'artist', 'popularity', 'danceability', 'energy', 'key', 'loudness', 'speechiness', 'mode', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']]

df = df.sort_values(by=['playlist_name'])

#index = False: after sorting, the index is only a shuffled row counter; writing it
#would add a meaningless unnamed column to the csv
#NOTE(review): the input is read from ./data/ but the merged result is written to the
#working directory - confirm whether the file under ./data/ is meant to be updated
df.to_csv('finnish_playlist_data.csv', index = False)

In [13]:
#Columns kept for the analysis, in display order
selected_columns = ['playlist_name', 'song_name', 'id', 'artist', 'popularity', 'danceability', 'energy', 'key', 'loudness', 'speechiness', 'mode', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']

#Load the stored playlist data, drop repeated (song id, playlist) pairs,
#keep the selected columns and order the rows by playlist name
df = (
    pd.read_csv('./data/finnish_playlist_data.csv')
    .drop_duplicates(subset = ['id', 'playlist_name'])
    .reset_index(drop = True)
    .loc[:, selected_columns]
    .sort_values(by = ['playlist_name'])
    .reset_index(drop = True)
)
df

Unnamed: 0,playlist_name,song_name,id,artist,popularity,danceability,energy,key,loudness,speechiness,mode,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,#vainsuomihitit,PLAYA,1vQoEeHAKr8OIGIMFY2yAj,Jami Faltin,51,0.859,0.565,8,-5.485,0.1560,1,0.351000,0.000005,0.1090,0.957,142.065,128258,4
1,#vainsuomihitit,Syödään sieniä,1C5x1IosxWpAV0DD8sTCuo,Samu Haber,47,0.654,0.586,10,-6.472,0.0407,1,0.109000,0.000000,0.1180,0.582,125.061,199893,4
2,#vainsuomihitit,Ihmisen muotoinen,0D0YCEigYKmWYWSz2wOmYi,Jannika B,49,0.531,0.704,7,-8.199,0.0559,0,0.089000,0.000000,0.1880,0.363,170.118,199510,4
3,#vainsuomihitit,Vainois,33DB1mfU2H8dJe7ZhpkTBC,Mikael Gabriel,56,0.652,0.740,7,-3.374,0.1110,1,0.054000,0.763000,0.1920,0.252,100.181,158194,4
4,#vainsuomihitit,Betty Draper,0hesHlOQaPgxN8WjfA9v5x,Maija Vilkkumaa,0,0.552,0.815,7,-5.024,0.0562,1,0.000206,0.000000,0.2130,0.512,115.966,216407,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3384,Viikonloppufiilis,Never Not,4fFfyouye6vrX9o9z4PVMu,Lauv,74,0.470,0.221,5,-9.243,0.0371,1,0.693000,0.000000,0.1040,0.264,133.714,208000,4
3385,Viikonloppufiilis,Back To You - From 13 Reasons Why – Season 2 S...,4hQ6UGyWQIGJmHSo0J88JW,Selena Gomez,75,0.601,0.724,6,-4.856,0.0486,1,0.094500,0.000002,0.1200,0.508,102.061,207905,4
3386,Viikonloppufiilis,What About Us,0Qh38w01QRXK6KHIv0e3hb,P!nk,75,0.491,0.588,1,-6.177,0.0506,1,0.028100,0.000002,0.0906,0.193,113.608,269600,4
3387,Viikonloppufiilis,Your Body Is a Wonderland,7vFv0yFGMJW3qVXbAd9BK9,John Mayer,75,0.647,0.729,5,-6.251,0.0236,1,0.020600,0.048900,0.1340,0.687,94.010,249627,4


## One hot encoding for categorical features

In [14]:
def onehot_prep(df, column, new_name, categories = None):
    """One-hot encode one column of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `column`.
    column : str
        Name of the column to encode.
    new_name : str
        Prefix of the generated column names, which are '<new_name>|<value>'.
    categories : iterable, optional
        Complete list of values to create columns for. When given, the result
        always has exactly one column per listed value, in that order, even
        if a value does not occur in `df` (e.g. when encoding a single song);
        values outside the list produce an all-zero row. When omitted, one
        column is created per value that occurs in `df[column]`.

    Returns
    -------
    pandas.DataFrame
        The indicator columns, with a fresh 0..n-1 index.
    """
    values = df[column]
    if categories is not None:
        #A categorical dtype makes get_dummies emit a column for every category
        values = values.astype(pd.CategoricalDtype(categories = list(categories)))

    tf_df = pd.get_dummies(values)
    tf_df.columns = [new_name + "|" + str(i) for i in tf_df.columns]
    return tf_df.reset_index(drop = True)

## Time to get the average song from each playlist

In [15]:
#Column order of the resulting table
average_columns = ['playlist_name', 'popularity', 'danceability', 'energy', 'key', 'loudness', 'speechiness', 'mode', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
playlist_names = df['playlist_name'].unique()

#Every feature is averaged, except 'mode', for which the most frequent value is taken
#(on a tie, the first value returned by Series.mode())
aggregations = {feature: 'mean' for feature in average_columns if feature not in ('playlist_name', 'mode')}
aggregations['mode'] = lambda values: values.mode().iloc[0]

#One row per playlist, computed in a single pass. DataFrame.append was removed in
#pandas 2.0, so the table is no longer grown row by row.
#sort = False keeps the playlists in their order of first appearance in df.
averages_playlist = (
    df.groupby('playlist_name', sort = False)
    .agg(aggregations)
    .reset_index()
    .loc[:, average_columns]
)

averages_playlist

Unnamed: 0,playlist_name,popularity,danceability,energy,key,loudness,speechiness,mode,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,#vainsuomihitit,41.122449,0.635867,0.707776,5.367347,-6.577602,0.060626,0,0.130015,0.02654,0.186852,0.52267,127.142857,193065.102041
1,100 Suomi,39.428571,0.737429,0.661543,5.514286,-6.457971,0.131009,1,0.199008,0.029648,0.168846,0.582,123.079586,174754.828571
2,Aitoa suomiräppiä,30.408163,0.74051,0.628816,5.5,-7.34251,0.167695,1,0.19231,0.030923,0.139526,0.550743,111.945939,180812.928571
3,Best New Pop,55.698795,0.614855,0.535805,5.987952,-7.701169,0.072594,1,0.379486,0.023302,0.157684,0.422153,112.834795,201871.819277
4,Big Country,70.04,0.57929,0.71113,5.44,-5.54876,0.046439,1,0.165173,0.000799,0.162385,0.56704,130.06458,195850.25
5,Bileräppiä,73.397727,0.767034,0.625057,5.193182,-6.679432,0.208866,1,0.152363,0.015006,0.173718,0.440665,125.418136,190304.329545
6,Calming Acoustic,42.65,0.5917,0.110733,4.87,-17.76123,0.070585,1,0.96188,0.89388,0.110494,0.277018,112.51834,168184.0
7,Chill Pop,61.3,0.55819,0.397857,5.55,-9.89492,0.052863,1,0.626345,0.009321,0.153472,0.368025,119.5165,195378.45
8,Chill Vibes,52.78,0.57622,0.433462,4.9,-10.80104,0.05898,1,0.561247,0.239208,0.164549,0.347982,114.91586,210886.24
9,EQUAL Suomi,40.24,0.64288,0.64488,4.34,-6.9337,0.068254,0,0.177359,0.023049,0.153352,0.46826,122.85284,192225.44


Time to process all the data. Here, we one-hot encode the categorical variables, key and mode. Popularity and the float-valued audio features are scaled using the MinMaxScaler, which normalizes values to between 0 and 1; the one-hot columns are then weighted by 0.5 and the scaled columns by 0.2. This is done for every single song in the data, and the result is made into a new DataFrame that is saved to the data/ folder.

In [16]:
#Build the model input: weighted one-hot columns plus weighted min-max scaled columns
from sklearn.preprocessing import MinMaxScaler

#Categorical features: one indicator column per value, weighted by 0.5
key_onehot = onehot_prep(df, 'key', 'key').mul(0.5)
mode_onehot = onehot_prep(df, 'mode', 'mode').mul(0.5)

#Popularity: scaled on its own scaler, weighted by 0.2
pop = df[['popularity']].reset_index(drop = True)
scaler = MinMaxScaler()
pop_scaled = pd.DataFrame(scaler.fit(pop).transform(pop), columns = pop.columns).mul(0.2)

#All float64 columns of df: scaled together on a second scaler, weighted by 0.2
float_columns = [name for name, dtype in df.dtypes.items() if dtype == 'float64']
floats = df[float_columns].reset_index(drop = True)
scaler2 = MinMaxScaler()
floats_scaled = pd.DataFrame(scaler2.fit(floats).transform(floats), columns = floats.columns).mul(0.2)

#Put the playlist label and all processed feature columns side by side
final = pd.concat(
    [df['playlist_name'], floats_scaled, pop_scaled, key_onehot, mode_onehot],
    axis = 1,
)
final


Unnamed: 0,playlist_name,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,key|4,key|5,key|6,key|7,key|8,key|9,key|10,key|11,mode|0,mode|1
0,#vainsuomihitit,0.174043,0.113267,0.176250,0.029384,0.070482,1.045226e-06,0.019962,0.192761,0.104197,...,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5
1,#vainsuomihitit,0.129327,0.117493,0.170905,0.003968,0.021887,0.000000e+00,0.021965,0.115202,0.084444,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5
2,#vainsuomihitit,0.102498,0.141239,0.161551,0.007318,0.017871,0.000000e+00,0.037543,0.069907,0.136785,...,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0
3,#vainsuomihitit,0.128891,0.148483,0.187684,0.019464,0.010843,1.533668e-01,0.038433,0.046949,0.055542,...,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5
4,#vainsuomihitit,0.107078,0.163576,0.178747,0.007385,0.000041,0.000000e+00,0.043107,0.100724,0.073879,...,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3384,Viikonloppufiilis,0.089192,0.044041,0.155897,0.003174,0.139156,0.000000e+00,0.018849,0.049431,0.094496,...,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
3385,Viikonloppufiilis,0.117766,0.145263,0.179657,0.005709,0.018975,3.376884e-07,0.022410,0.099897,0.057726,...,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5
3386,Viikonloppufiilis,0.093772,0.117895,0.172503,0.006150,0.005642,4.040201e-07,0.015867,0.034747,0.071140,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
3387,Viikonloppufiilis,0.127800,0.146270,0.172102,0.000198,0.004136,9.829146e-03,0.025526,0.136918,0.048374,...,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5


For the actual program that finds the playlists, the fitted scalers need to be saved, so that a song given by the user can be processed in the same way before it is fed into the neural network. (More on that in Playlist Predictor.)

In [21]:
#Persist both scalers under ./objects/ so they can be loaded again later
import joblib

#Scaler for the popularity column
scaler_pop = scaler.fit(pop)
joblib.dump(scaler_pop, './objects/scaler_pop')

#Scaler for the float-valued columns
scaler_floats = scaler2.fit(floats)
joblib.dump(scaler_floats, './objects/scaler_floats')

array(['danceability', 'energy', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo'],
      dtype=object)

Preprocessing is now done! Next part of the project will be in the file 'PlaylistPredictor'

In [65]:
#Write the processed feature table to data_processed.csv without the index column
#NOTE(review): the markdown above says the result is saved to the data/ folder, but this
#writes to the current working directory - confirm the intended location
final.to_csv('data_processed.csv', index = False)