In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
import random
import string
import pandas as pd
import numpy as np

#Spotipy documentation: https://spotipy.readthedocs.io/en/master/

In [2]:
# Spotify API credentials: fill in the client ID and client secret of your app
SPOTIPY_CLIENT_ID = ''
SPOTIPY_CLIENT_SECRET = ''
# NOTE(review): REDIRECT_URI and scope are defined here but not passed to the
# auth manager below -- confirm whether they are still needed.
REDIRECT_URI = 'http://localhost:7000/callback'
scope = "user-library-read"

# Build the API client with spotipy's MemoryCacheHandler as the token cache
cache_handler = spotipy.cache_handler.MemoryCacheHandler()
auth_manager = SpotifyClientCredentials(
    client_id=SPOTIPY_CLIENT_ID,
    client_secret=SPOTIPY_CLIENT_SECRET,
    cache_handler=cache_handler,
)
sp = spotipy.Spotify(auth_manager=auth_manager)

The next cell searches Spotify for a random song in the Finnish market and returns its name, artist, popularity and audio features, along with the total number of search hits.

In [3]:

# Search term: one random ASCII letter followed by '%'
random_character = random.choice(string.ascii_letters)
random_search = random_character + '%'
offset = random.randint(1,1000)
year = '2022'
genre = 'hip-hop'

# Test: fetch random songs of a specific genre from the Finnish market
songs = sp.search(
    q=f'track:{random_search} year:{year} genre: {genre}',
    type='track',
    market='FI',
    offset=offset,
)

# Keep the name, first-listed artist, popularity and audio features of the first hit
first_song = songs['tracks']['items'][0]
first_song_name = first_song['name']
first_song_artist = first_song['artists'][0]['name']
first_song_popularity = first_song['popularity']
first_song_audio_features = sp.audio_features(first_song['id'])

# Show the extracted fields together with the total hit count reported by the search
first_song_name, first_song_artist, first_song_popularity, first_song_audio_features, songs['tracks']['total']

('Come Mi Guardi (con Madame, Coez & Bresh)',
 'Night Skinny',
 39,
 [{'danceability': 0.759,
   'energy': 0.555,
   'key': 7,
   'loudness': -8.065,
   'mode': 1,
   'speechiness': 0.174,
   'acousticness': 0.572,
   'instrumentalness': 0,
   'liveness': 0.109,
   'valence': 0.295,
   'tempo': 122.023,
   'type': 'audio_features',
   'id': '1IRllh4V0h8MUGEHROZvhe',
   'uri': 'spotify:track:1IRllh4V0h8MUGEHROZvhe',
   'track_href': 'https://api.spotify.com/v1/tracks/1IRllh4V0h8MUGEHROZvhe',
   'analysis_url': 'https://api.spotify.com/v1/audio-analysis/1IRllh4V0h8MUGEHROZvhe',
   'duration_ms': 213496,
   'time_signature': 4}],
 10000)

# Next, we make a function to fetch a sample of songs of a specific genre

In [4]:
def fetch_songs(sp, genre, year, number, columns, max_attempts=None):
    """Fetch a random sample of songs of one genre and year from the Finnish market.

    Random wildcard searches are repeated until at least ``number`` distinct
    tracks (by track id) have been collected. Each search can add up to 50
    tracks, so the result may hold more than ``number`` rows.

    Parameters
    ----------
    sp : object
        Spotify client exposing ``search(q=, type=, market=, offset=, limit=)``
        and ``audio_features(list_of_ids)``.
    genre : str
        Genre used in the search query.
    year : str
        Release year used in the search query (concatenated as text).
    number : int
        Minimum number of distinct songs to collect.
    columns : list of str
        Column names of the working frame. The first three receive the song
        name, first-listed artist and popularity (by position); the rest must
        be audio-feature keys and are filled by key lookup.
    max_attempts : int or None, optional
        Upper bound on the number of searches. ``None`` (default) keeps
        searching until ``number`` songs are found, which never terminates if
        the genre/year has too few tracks.

    Returns
    -------
    pandas.DataFrame
        One row per distinct track, with the 'type', 'track_href',
        'analysis_url' and 'time_signature' columns removed.

    Raises
    ------
    KeyError
        If any of the four columns dropped at the end is missing from ``columns``.
    """
    feature_columns = list(columns[3:])
    # Track id -> row values. A dict removes duplicates while keeping the
    # first occurrence and the order in which songs were found.
    rows_by_id = {}
    attempts = 0

    while len(rows_by_id) < number and (max_attempts is None or attempts < max_attempts):
        attempts += 1

        #Make random search by some random letter
        offset = random.randint(1,1000)
        random_character = random.choice(string.ascii_letters)
        random_search = random.choice([random_character + '%'
                                       ,'%' + random_character
                                       ,'%' + random_character + '%'])
        # NOTE(review): the query has a space after 'genre:' -- confirm Spotify
        # still treats it as a genre filter rather than as free-text keywords.
        songs = sp.search(q = 'track:' + random_search + ' year:' + year + ' genre: ' +  genre, type = 'track', market = 'FI', offset = offset, limit = 50)

        items = [song for song in songs['tracks']['items'] if song and song.get('id')]
        if not items:
            continue

        # One batched request for the whole page (at most 50 ids) instead of
        # one request per song.
        features_per_song = sp.audio_features([song['id'] for song in items])

        for song, features in zip(items, features_per_song):
            # The features entry is None when a track has no audio analysis.
            if features is None or song['id'] in rows_by_id:
                continue
            rows_by_id[song['id']] = (
                [song['name'], song['artists'][0]['name'], song['popularity']]
                + [features.get(column) for column in feature_columns]
            )

    # Build the frame once instead of concatenating inside the loop
    df = pd.DataFrame(data = list(rows_by_id.values()), columns = columns)
    df = df.drop(['type', 'track_href', 'analysis_url', 'time_signature'], axis = 1)

    return df

#Column layout passed to fetch_songs: three descriptive columns followed by the audio-feature keys of the sample song
columns = ['song_name', 'artist_name', 'popularity'] + list(first_song_audio_features[0].keys())
#Test run: requests at least 10 rap songs from 2019 (fetch_songs may return more)
data = fetch_songs(sp, 'rap', '2019', 10, columns)
data.head()

Unnamed: 0,song_name,artist_name,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,uri,duration_ms
0,Alle Jubilare wieder,Dendemann,44,0.627,0.923,5,-3.363,0,0.223,0.0351,0.0,0.189,0.833,167.83,2Sluzpm6rMVnXtKLUaWfrD,spotify:track:2Sluzpm6rMVnXtKLUaWfrD,205173
1,Juice,FBG Duck,29,0.773,0.791,1,-6.166,1,0.439,0.0399,0.0,0.0658,0.84,155.923,7vpThbDnULgXv85PQinxxv,spotify:track:7vpThbDnULgXv85PQinxxv,168725
2,Pull up with a 100 (feat. Bloody Jay),YFN Lucci,18,0.794,0.717,5,-4.87,0,0.0672,0.152,0.0,0.058,0.358,115.876,5biidgnCUqa06vjJsyHjev,spotify:track:5biidgnCUqa06vjJsyHjev,162345
3,"Do It When I'm In It (feat. Jermaine Dupri, Oz...",Snoop Dogg,26,0.837,0.66,4,-5.961,0,0.0866,0.0159,2.2e-05,0.115,0.55,94.203,3iuDZuKpdLI8aQlqDslRDm,spotify:track:3iuDZuKpdLI8aQlqDslRDm,233849
4,The Jungle Book,Trippie Redd,17,0.691,0.586,1,-6.984,1,0.0805,0.0363,0.0,0.383,0.0592,85.011,6YUZneKOk7OSzo1Lpi8kZ3,spotify:track:6YUZneKOk7OSzo1Lpi8kZ3,159489


### Getting Spotify featured playlist songs from Finland

The next step is to fetch all the songs from Spotify's featured playlists in Finland and extract their audio features, which can then be used to classify random songs.

In [5]:
# Plan for this section:
#   1. Get all playlist ids and put them into a data frame
#   2. Get all songs from each playlist
#   3. Get the average song for each playlist
#   4. Input: find the closest playlist that fits the song

get_featured_playlists = sp.featured_playlists(country = 'FI')['playlists']['items']

# Map playlist name -> playlist id (a repeated name keeps the id seen last)
playlist_ids_by_name = {}

for playlist in get_featured_playlists:
    playlist_ids_by_name[playlist['name']] = playlist['id']

# Build the frame from (name, id) pairs with an explicit column list. The
# earlier version passed the set {'id'} as `columns`; sets are unordered and
# recent pandas versions refuse them.
playlists = pd.DataFrame(list(playlist_ids_by_name.items()), columns = ['playlist_name', 'id'])
playlists

Unnamed: 0,playlist_name,id
0,Ei tässä ole kiire mihinkään,37i9dQZF1DXbcCTrbdHmWr
1,Pehmeät klassikot,37i9dQZF1DWVIKZXJLByBS
2,Akustista tunnelmointia,37i9dQZF1DWYCTsyhNdNao
3,Herkimmät suomalaiset biisit,37i9dQZF1DWW1FLvEAcimu
4,#vainsuomihitit,37i9dQZF1DWUvzPS8uIABd
5,Poppia työpäivään,37i9dQZF1DX8JRb0iafpW2
6,Rentoa konerytmiä,37i9dQZF1DX4JZt84Ykelm
7,Hyvän olon treenilista,37i9dQZF1DWSEIMgzhSu5e
8,It's Hits Suomi,37i9dQZF1DX64Cx4vTeaRB
9,Hit Replay Suomi,37i9dQZF1DX8sjRUtu4bjr


In [6]:
playlist_songs = {}

#For each playlist, collect the ids of all of its tracks
for playlist_name, playlist_id in zip(playlists['playlist_name'], playlists['id']):
    ids = []

    # playlist_items returns a single page of results, so keep following the
    # paging links with sp.next until it reports that no page is left (None).
    songs = sp.playlist_items(playlist_id)
    while songs is not None:
        #For all the songs on this page, fetch the id and append it to the list
        for song in songs['items']:
            track = song['track']
            # Skip entries with no track data or no Spotify id
            if track is None or track['id'] is None:
                continue
            ids.append(track['id'])
        songs = sp.next(songs)

    playlist_songs[playlist_name] = ids
    

In [7]:
#Inspect the playlist name -> list of track ids mapping (prints the whole dictionary)
playlist_songs

{'Ei tässä ole kiire mihinkään': ['29d0nY7TzCoi22XBqDQkiP',
  '4Yy5d21CJvXQ8cOuaTiCRD',
  '0CHoWIR3Z1GbcW8kzNd17d',
  '3SdTKo2uVsxFblQjpScoHy',
  '5wxmQ80H5ZPvfM0SMoutry',
  '2IlAAZBNdZi9g4WKVEvT0O',
  '5Zvv7EbJTMqOp3O8GYE5h9',
  '3DgwtoZetCAdSoIpIMXTA7',
  '745H5CctFr12Mo7cqa1BMH',
  '2JdScG59fX4RjBUEbR5lpE',
  '3ovjw5HZZv43SxTwApooCM',
  '2N2yrmodOnVF10mKvItC9P',
  '1qRA5BS78u3gME0loMl9AA',
  '254bXAqt3zP6P50BdQvEsq',
  '4ByEFOBuLXpCqvO1kw8Wdm',
  '7Ewz6bJ97vUqk5HdkvguFQ',
  '2EqlS6tkEnglzr7tkKAAYD',
  '0odIT9B9BvOCnXfS0e4lB5',
  '7pKfPomDEeI4TPT6EOYjn9',
  '4Hhv2vrOTy89HFRcjU3QOx',
  '0iOZM63lendWRTTeKhZBSC',
  '54b8qPFqYqIndfdxiLApea',
  '1zng9uqqXoPkmU05nsAlsw',
  '1M2nd8jNUkkwrc1dgBPTJz',
  '12dU3vAh6AFoJkisorfoUl',
  '5jgFfDIR6FR0gvlA56Nakr',
  '4S1VYqwfkLit9mKVY3MXoo',
  '4feXcsElKIVsGwkbnTHAfV',
  '0P7DoyGrr4Wp9w5TotEtUC',
  '1KU5EHSz04JhGg3rReGJ0N',
  '4CoSCPlKNrWli7E5kFtbcl',
  '2CtemffYhT0DJWcT1XW047',
  '0cKk8BKEi7zXbdrYdyqBP5',
  '4QxDOjgpYtQDxxbWPuEJOy',
  '6TvxPS4fj4LUd

In [8]:
#Fetch all the songs and collect one row per (playlist, song)
song_data = []

# Audio-feature keys, captured from the first song that has features. The
# column list used to be derived from whatever `audio_features` held after
# the loop, which failed when the last song had no features (None) or when
# no song was fetched at all.
feature_keys = None

#Go through each playlist
for playlist_name, song_ids in playlist_songs.items():
    #Fetch the song id and extract correct data
    for song_id in song_ids:
        # Entries without an id cannot be looked up
        if song_id is None:
            continue

        song = sp.track(song_id)

        if song is None:
            continue

        artist = song['artists'][0]['name']
        song_name = song['name']
        popularity = song['popularity']
        audio_features = sp.audio_features(song_id)

        if not audio_features or audio_features[0] is None:
            continue

        features = audio_features[0]
        if feature_keys is None:
            feature_keys = list(features.keys())

        # Look values up by key so every row follows the same column order
        song_data.append([playlist_name, song_name, artist, popularity] + [features.get(key) for key in feature_keys])

columns = ['playlist_name', 'song_name', 'artist', 'popularity'] + (feature_keys or [])

#Put all of the data into a dataframe
featured_playlist_song_features = pd.DataFrame(data = song_data, columns = columns)

In [5]:
import csv

columns = ['playlist_name',
 'song_name',
 'artist',
 'popularity',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'type',
 'id',
 'uri',
 'track_href',
 'analysis_url',
 'duration_ms',
 'time_signature']

#Fetch the original csv (when there is one) and concatenate the new data to it.
#The freshly fetched rows come first, so they win when duplicates are dropped.
frames = [featured_playlist_song_features]
try:
    frames.append(pd.read_csv('finnish_playlist_data.csv'))
except FileNotFoundError:
    # First run: nothing has been saved yet, so start from the new data only
    pass

df = pd.concat(frames, axis = 0, ignore_index = True)

# A song may appear in several playlists, so duplicates are judged per (song, playlist)
df = df.drop_duplicates(subset = ['id', 'playlist_name']).reset_index(drop = True)

# Keep only the columns used downstream, in a fixed order
df = df[['playlist_name', 'song_name', 'id', 'artist', 'popularity', 'danceability', 'energy', 'key', 'loudness', 'speechiness', 'mode', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']]

df = df.sort_values(by=['playlist_name'])

df.to_csv('finnish_playlist_data.csv')

In [6]:
#Reload the saved playlist data, then de-duplicate per (song, playlist),
#keep the working columns and order the rows by playlist name
df = (
    pd.read_csv('finnish_playlist_data.csv')
    .drop_duplicates(subset = ['id', 'playlist_name'])
    .reset_index(drop = True)
    .loc[:, ['playlist_name', 'song_name', 'id', 'artist', 'popularity', 'danceability', 'energy', 'key', 'loudness', 'speechiness', 'mode', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']]
    .sort_values(by=['playlist_name'])
    .reset_index(drop = True)
)
df

Unnamed: 0,playlist_name,song_name,id,artist,popularity,danceability,energy,key,loudness,speechiness,mode,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,#vainsuomihitit,PLAYA,1vQoEeHAKr8OIGIMFY2yAj,Jami Faltin,51,0.859,0.565,8,-5.485,0.1560,1,0.35100,0.000005,0.1090,0.9570,142.065,128258,4
1,#vainsuomihitit,Ikävä meitä,10G4sUXO33FT5SyRTdE4GM,Miri,39,0.559,0.729,2,-6.657,0.1150,0,0.52800,0.000854,0.1090,0.4650,191.782,165342,4
2,#vainsuomihitit,Voimaa ja valoa,3F2Sm1D7m7uz1i0Ytrh19c,Yona,41,0.552,0.403,7,-9.076,0.0850,1,0.56600,0.000021,0.1020,0.4930,148.030,217854,4
3,#vainsuomihitit,Vapaa (Mestarit),6xUyD4l2ToLbUuazyY1aHV,Ellinoora,39,0.294,0.726,10,-6.615,0.0472,0,0.00537,0.000049,0.1260,0.0387,104.926,365038,3
4,#vainsuomihitit,Kohta sataa - Vain elämää kausi 13,2FlcD6o29XKTcHCJpvTTiO,Jyrki 69,37,0.662,0.794,4,-7.423,0.0326,0,0.00554,0.742000,0.1680,0.6310,108.049,191200,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3384,Viikonloppufiilis,Youth (feat. Khalid),1h0yImRPIVAjhhHeNVlTuC,Shawn Mendes,68,0.534,0.596,6,-6.653,0.1170,0,0.56000,0.000000,0.1330,0.3240,100.121,190867,4
3385,Viikonloppufiilis,Trigger,0Z26mbSHJ1dkUu7g9DDgif,Major Lazer,60,0.570,0.585,8,-7.041,0.2510,1,0.12800,0.000105,0.1110,0.3780,97.285,171987,4
3386,Viikonloppufiilis,What About Us - Acoustic,7e4Pftu2tzDeY0W0690OKQ,Thomas Daniel,61,0.427,0.229,4,-11.564,0.0364,1,0.93500,0.000007,0.0958,0.2220,125.195,255511,5
3387,Viikonloppufiilis,2 Hearts (feat. Gia Koka),0EdgK7ASb4kfRkW8pVMN02,Sam Feldt,62,0.503,0.754,7,-3.634,0.0424,1,0.26400,0.000058,0.2970,0.5970,116.524,186750,4


## One hot encoding for categorical features

In [7]:
def onehot_prep(df, column, new_name):
    """One-hot encode a single column of ``df``.

    Returns a new frame with one indicator column per distinct value of
    ``df[column]``, named ``"<new_name>|<value>"``, and a fresh 0..n-1 index.
    """
    indicators = pd.get_dummies(df[column]).reset_index(drop = True)
    return indicators.rename(columns = lambda level: new_name + "|" + str(level))

## Time to get the average song from each playlist

In [9]:
# Features summarised per playlist, in output column order
feature_columns = ['popularity', 'danceability', 'energy', 'key', 'loudness', 'speechiness', 'mode', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
playlist_names = df['playlist_name'].unique()

# Every feature is averaged, except 'mode', which is summarised by its most
# frequent value (the smallest one on ties). Reassigning an existing key
# keeps its position, so the column order is unchanged.
aggregations = {column: 'mean' for column in feature_columns}
aggregations['mode'] = lambda values: values.mode().iloc[0]

# One grouped aggregation replaces the per-playlist loop that grew the frame
# with DataFrame.append, which was removed in pandas 2.0.
# sort = False keeps playlists in order of first appearance in df.
averages_playlist = (
    df.groupby('playlist_name', sort = False)
      .agg(aggregations)
      .reset_index()
)

averages_playlist

Unnamed: 0,playlist_name,popularity,danceability,energy,key,loudness,speechiness,mode,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,#vainsuomihitit,41.122449,0.635867,0.707776,5.367347,-6.577602,0.060626,0,0.130015,0.02654,0.186852,0.52267,127.142857,193065.102041
1,100 Suomi,39.428571,0.737429,0.661543,5.514286,-6.457971,0.131009,1,0.199008,0.029648,0.168846,0.582,123.079586,174754.828571
2,Aitoa suomiräppiä,30.408163,0.74051,0.628816,5.5,-7.34251,0.167695,1,0.19231,0.030923,0.139526,0.550743,111.945939,180812.928571
3,Best New Pop,55.698795,0.614855,0.535805,5.987952,-7.701169,0.072594,1,0.379486,0.023302,0.157684,0.422153,112.834795,201871.819277
4,Big Country,70.04,0.57929,0.71113,5.44,-5.54876,0.046439,1,0.165173,0.000799,0.162385,0.56704,130.06458,195850.25
5,Bileräppiä,73.397727,0.767034,0.625057,5.193182,-6.679432,0.208866,1,0.152363,0.015006,0.173718,0.440665,125.418136,190304.329545
6,Calming Acoustic,42.65,0.5917,0.110732,4.87,-17.76123,0.070585,1,0.96188,0.89388,0.110494,0.277018,112.51834,168184.0
7,Chill Pop,61.3,0.55819,0.397857,5.55,-9.89492,0.052863,1,0.626345,0.009321,0.153472,0.368025,119.5165,195378.45
8,Chill Vibes,52.78,0.57622,0.433462,4.9,-10.80104,0.05898,1,0.561246,0.239208,0.164549,0.347982,114.91586,210886.24
9,EQUAL Suomi,40.24,0.64288,0.64488,4.34,-6.9337,0.068254,0,0.177359,0.023049,0.153352,0.46826,122.85284,192225.44


In [63]:
#Preparing the original df for model
from sklearn.preprocessing import MinMaxScaler

key_onehot = onehot_prep(df, 'key', 'key') * 0.5
mode_onehot = onehot_prep(df, 'mode', 'mode') * 0.5

pop = df[['popularity']].reset_index(drop = True)
scaler = MinMaxScaler()
pop_scaled = pd.DataFrame(scaler.fit_transform(pop), columns = pop.columns) * 0.2

floats = df[df.dtypes[df.dtypes == 'float64'].index.values].reset_index(drop = True)
scaler = MinMaxScaler()
floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns) * 0.2

final = pd.concat([df['playlist_name'], floats_scaled, pop_scaled, key_onehot, mode_onehot], axis = 1)
final


Preprocessing is now done! The next part of the project is in the file 'PlaylistPredictor'.

In [65]:
#Save the processed frame; index = False leaves the row index out of the file
final.to_csv('data_processed.csv', index = False)