In [1]:
import spotipy, json, os
from spotipy.oauth2 import SpotifyOAuth
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
from pathlib import Path
from operator import itemgetter
from sklearn.preprocessing import LabelEncoder

# --- notebook configuration ---
# Directory the raw JSON playlist slices are read from.
RAW_DATA_PATH = Path('raw_data/')
# Directory the derived HDF5 dataframes are written to and read back from.
DATAFRAME_PATH = Path('dataframes/')
# Names of the audio features picked out for later use.
SELECTED_TRACK_FEATURES = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]
# NOTE(review): not used in the cells visible here; presumably the number of
# tracks considered per playlist and how many of them are held out - confirm.
TOTAL_TRACKS = 50
NUM_WITHHELD = 25

In [2]:
def make_playlist_dfs(path):
    """Yield one flat record per track from the playlist slice files in ``path``.

    Every ``*.json`` file in the directory is loaded and, for each entry of its
    top-level ``'playlists'`` list, the playlist's tracks are emitted in
    ascending ``'pos'`` order.

    Args:
        path: Directory containing the slice files (``str`` or ``Path``).

    Yields:
        tuple: ``(track_uri, pid, pos, artist_name, track_name, artist_uri,
        playlist_name)``.
    """
    slice_dir = Path(path)
    # os.listdir returns entries in arbitrary order; sorting makes the row order
    # (and therefore any later "keep first occurrence" de-duplication) reproducible.
    slice_files = sorted(name for name in os.listdir(slice_dir) if name.endswith(".json"))
    for file_name in tqdm(slice_files):
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(slice_dir / file_name, encoding='utf-8') as f:
            js_slice = json.load(f)
        # The file is fully parsed at this point, so it is closed before yielding
        # instead of being held open for as long as the consumer iterates.
        for playlist in js_slice['playlists']:
            for track in sorted(playlist['tracks'], key=itemgetter('pos')):
                yield (track['track_uri'], playlist['pid'], track['pos'],
                       track['artist_name'], track['track_name'],
                       track['artist_uri'], playlist['name'])

def get_tracks_info(path):
    """Split the flat track records from ``make_playlist_dfs`` into three dataframes.

    Args:
        path: Directory handed through to ``make_playlist_dfs``.

    Returns:
        tuple: ``(p_tracks_df, t_info_df, p_info_df)`` where

        * ``p_tracks_df`` has one row per record with columns ``uri, pid, pos``;
        * ``t_info_df`` has columns ``uri, artist_name, track_name, artist_uri``,
          keeping the first row seen for each ``uri``;
        * ``p_info_df`` has columns ``pid, playlist_name``, keeping the first
          row seen for each ``pid``.
    """
    membership_rows, track_rows, playlist_rows = [], [], []
    for uri, pid, pos, artist_name, track_name, artist_uri, playlist_name in make_playlist_dfs(path):
        track_rows.append([uri, artist_name, track_name, artist_uri])
        membership_rows.append((uri, pid, pos))
        playlist_rows.append([pid, playlist_name])

    p_tracks_df = pd.DataFrame(membership_rows, columns=['uri', 'pid', 'pos'])

    t_info_df = pd.DataFrame(
        track_rows, columns=['uri', 'artist_name', 'track_name', 'artist_uri']
    )
    t_info_df = t_info_df.drop_duplicates(subset=['uri'], ignore_index=True)

    p_info_df = pd.DataFrame(playlist_rows, columns=['pid', 'playlist_name'])
    p_info_df = p_info_df.drop_duplicates(subset=['pid'], ignore_index=True)

    return p_tracks_df, t_info_df, p_info_df
        

# Build the three dataframes from the raw playlist slices.
playlist_tracks_df, tracks_info_df, playlists_info_df = get_tracks_info(RAW_DATA_PATH)

# to_hdf does not create missing parent directories, so make sure the output
# folder exists before the first write.
DATAFRAME_PATH.mkdir(parents=True, exist_ok=True)

# Feature encoding: map every track uri to an integer id (tid). The encoder is
# fitted on the de-duplicated track table and reused for the playlist rows so
# both frames share the same uri -> tid mapping.
label_encoder = LabelEncoder()
tracks_info_df['tid'] = label_encoder.fit_transform(tracks_info_df.uri)
tracks_info_df = tracks_info_df[['tid','uri','artist_name','track_name','artist_uri']].sort_values(by='tid', ignore_index=True)
# `key` is passed by keyword: positional use is deprecated since pandas 2.1.
tracks_info_df.to_hdf(DATAFRAME_PATH/'tracks_info_df.h5', key='tracks_info_df', mode='w')

playlist_tracks_df['tid'] = label_encoder.transform(playlist_tracks_df.uri)
playlist_tracks_df = playlist_tracks_df[['tid','uri','pid','pos']].sort_values(by=['pid','pos'], ignore_index=True)
playlist_tracks_df.to_hdf(DATAFRAME_PATH/'playlist_tracks_df.h5', key='playlist_tracks_df', mode='w')

playlists_info_df.to_hdf(DATAFRAME_PATH/'playlists_info_df.h5', key='playlists_info_df', mode='w')

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:06<00:00,  1.80it/s]


In [3]:
# Load the three frames back from their HDF5 files; each file stores a single
# frame under a key equal to the file's stem.
playlist_tracks_df, tracks_info_df, playlists_info_df = (
    pd.read_hdf(DATAFRAME_PATH / f'{frame_name}.h5', frame_name)
    for frame_name in ('playlist_tracks_df', 'tracks_info_df', 'playlists_info_df')
)
playlist_tracks_df

Unnamed: 0,tid,uri,pid,pos
0,10959,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,0,0
1,137484,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,0,1
2,11768,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,0,2
3,25799,spotify:track:1AWQoqb9bSvzTjaLralEkT,0,3
4,38891,spotify:track:1lzr43nnXAijIGYnCT8M8H,0,4
...,...,...,...,...
664707,143416,spotify:track:6ZOPiKQeibCn7fP8dncucL,9999,50
664708,166569,spotify:track:7pxhKtuTwofDIdgHx2DcVK,9999,51
664709,165451,spotify:track:7mldq42yDuxiUNn08nvzHO,9999,52
664710,44962,spotify:track:23TxRN09aR1RB0G0tFoT0b,9999,53


In [4]:
# Spotify API credentials.
# Secrets must not be committed to the notebook: SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET have to be set in the environment before this cell runs
# (e.g. exported in the shell that launches Jupyter).
# NOTE(review): the client id/secret that used to be hardcoded in this cell have
# been exposed and should be revoked in the Spotify developer dashboard.
missing_credentials = [name for name in ('SPOTIPY_CLIENT_ID', 'SPOTIPY_CLIENT_SECRET')
                       if not os.environ.get(name)]
if missing_credentials:
    raise RuntimeError(
        "Missing Spotify credentials; set these environment variables: "
        + ", ".join(missing_credentials)
    )
# The redirect URI is not a secret; keep the notebook's default unless the
# environment already provides one.
os.environ.setdefault('SPOTIPY_REDIRECT_URI', 'http://localhost:8008/callback')

scope = "user-library-read"

# API client shared by the fetching cells of this notebook.
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))

# Column selection (and order) for the track-features dataframe.
columns = ["tid","uri","artist_name","track_name","danceability", "energy", "key", "loudness", "mode", "speechiness",
        "acousticness", "instrumentalness", "liveness", "valence", "tempo",
        "duration_ms", 
        "time_signature"]


### Fetching audio features for 170,089 tracks using the Spotify API

In [5]:
# Wall-clock reference point for the timing report at the end of this cell.
start = time.time()

n_tracks = len(tracks_info_df)
print('total tracks', n_tracks)
def get_audio_features(df, batch_size):
    """Yield Spotify audio features for the tracks in ``df``, fetched in batches.

    Args:
        df: DataFrame with at least the columns ``uri``, ``artist_name``,
            ``track_name`` and ``tid``.
        batch_size: Maximum number of tracks sent to the API per request.

    Yields:
        dict: the API's audio-feature dict for one track, extended with the
        ``artist_name``, ``track_name`` and ``tid`` of the matching ``df`` row.
        Tracks for which the API returns no features are skipped.

    Raises:
        ValueError: if ``batch_size`` is not positive.

    Notes:
        Fetching is best-effort: a request that fails is reported through
        ``tqdm.write`` and its batch is skipped, so one bad batch does not
        abort the whole run.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Positional slicing gives batches of at most `batch_size` rows and produces
    # no batch (hence no API call) for an empty frame.
    for first_row in tqdm(range(0, len(df), batch_size)):
        batch = df.iloc[first_row:first_row + batch_size]
        # Only the network call is guarded. Keeping `yield` outside the `try`
        # means generator shutdown (GeneratorExit) and KeyboardInterrupt are
        # never swallowed.
        try:
            audio_features = sp.audio_features(batch['uri'].tolist())
        except Exception as exc:
            tqdm.write(f"audio features request failed for rows "
                       f"{first_row}-{first_row + len(batch) - 1}: {exc!r}")
            continue

        row_info = batch[['artist_name', 'track_name', 'tid']].itertuples(index=False, name=None)
        for (artist_name, track_name, tid), audio_feature in zip(row_info, audio_features or []):
            # The response can contain None for a track without audio features;
            # skip just that track instead of losing the rest of the batch.
            if audio_feature is None:
                continue
            yield {**audio_feature,
                   'artist_name': artist_name,
                   'track_name': track_name,
                   'tid': tid}

# Consume the generator into a dataframe, keeping only the entries of `columns`
# (in that order), then report how long the whole fetch took.
feature_rows = get_audio_features(tracks_info_df, batch_size=100)
tracks_features_df = pd.DataFrame(feature_rows, columns=list(columns))
elapsed = time.time() - start
print(f"time taken {elapsed:.2f} seconds")

  0%|                                                                                         | 0/1701 [00:00<?, ?it/s]

total tracks 170089


100%|██████████████████████████████████████████████████████████████████████████████| 1701/1701 [06:07<00:00,  4.63it/s]


time taken 368.60 seconds


In [6]:
# Column dtypes and non-null counts, followed by a few example rows.
# A fixed random_state keeps the displayed sample identical across re-runs.
tracks_features_df.info()
tracks_features_df.sample(5, random_state=0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170089 entries, 0 to 170088
Data columns (total 17 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   tid               170089 non-null  int64  
 1   uri               170089 non-null  object 
 2   artist_name       170089 non-null  object 
 3   track_name        170089 non-null  object 
 4   danceability      170089 non-null  float64
 5   energy            170089 non-null  float64
 6   key               170089 non-null  int64  
 7   loudness          170089 non-null  float64
 8   mode              170089 non-null  int64  
 9   speechiness       170089 non-null  float64
 10  acousticness      170089 non-null  float64
 11  instrumentalness  170089 non-null  float64
 12  liveness          170089 non-null  float64
 13  valence           170089 non-null  float64
 14  tempo             170089 non-null  float64
 15  duration_ms       170089 non-null  int64  
 16  time_signature    17

Unnamed: 0,tid,uri,artist_name,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
46235,46235,spotify:track:273274Dx2I7ppuGyair64k,Someone Still Loves You Boris Yeltsin,Some Constellation,0.698,0.298,6,-13.939,1,0.0306,0.449,0.0142,0.0802,0.513,113.528,191707,4
119917,119917,spotify:track:5UBeEw8u5VurDhPo1klAPY,Marina and the Diamonds,Froot,0.614,0.812,7,-5.849,0,0.0416,0.263,1.1e-05,0.102,0.726,120.019,331324,4
14159,14159,spotify:track:0deVzvNIA4gSXxh0EcnOQu,Rihanna,Desperado - Andrelli Remix,0.514,0.728,5,-3.531,0,0.0417,0.0666,1e-06,0.0487,0.49,125.971,183827,4
73116,73116,spotify:track:3LHOMrvVg7g9IvBiUpkhNz,The Common Linnets,Calm After The Storm - Radio Edit,0.706,0.367,6,-11.402,1,0.033,0.524,0.232,0.111,0.432,116.039,184379,4
27381,27381,spotify:track:1EwTOK7C9ZBM1D1KtGxDmu,MellowHype,Deaddeputy,0.588,0.553,1,-10.255,1,0.317,0.0434,0.0,0.814,0.447,126.837,119107,4


### Coerce the audio feature columns to numeric dtypes (unparseable values become NaN)

In [7]:
# Columns that must hold numbers. pd.to_numeric with errors='coerce' turns any
# value it cannot parse into NaN instead of raising.
float_features = [
    'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature',
]
for s in float_features:
    numeric_column = pd.to_numeric(tracks_features_df[s], errors='coerce')
    tracks_features_df[s] = numeric_column

In [8]:
# Save the track-features dataframe to HDF5.
# `key` is passed by keyword: positional use is deprecated since pandas 2.1.
tracks_features_df.to_hdf(DATAFRAME_PATH/'tracks_features_df.h5', key='tracks_features_df', nan_rep='null', mode='w')

### Fetch genre information for 35,892 artists from the Spotify API

In [9]:
# Distinct artist uris across all tracks; each one is looked up once.
unique_artists = tracks_info_df.artist_uri.unique()
n_artists = len(unique_artists)
print('total unique artists', n_artists)
# Wall-clock reference point for the timing report at the end of this cell.
start = time.time()
result = {}
genres = []
def get_artists_genres(series, batch_size):
    """Yield uri, name and genres for each artist in ``series``, fetched in batches.

    Args:
        series: Iterable of artist uris (list, numpy array or pandas Series).
        batch_size: Maximum number of artists sent to the API per request.

    Yields:
        dict: ``{'artist_uri': ..., 'artist_name': ..., 'genres': ...}`` where
        ``genres`` is the artist's genre list joined with commas (an empty
        string when the artist has no genres). Artists the API has no record
        for are skipped.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    artist_uris = list(series)
    # Fixed-size slices: no request at all for empty input and no undersized
    # extra batch when the length is an exact multiple of `batch_size`.
    for first in tqdm(range(0, len(artist_uris), batch_size)):
        results = sp.artists(artist_uris[first:first + batch_size])
        for artist in results['artists']:
            # The response can contain None for an id the API cannot resolve.
            if artist is None:
                continue
            yield {
                'artist_uri': artist['uri'],
                'artist_name': artist['name'],
                'genres': ",".join(artist['genres']),
            }
# Consume the generator into a dataframe and report how long the fetch took.
genre_rows = get_artists_genres(unique_artists, batch_size=50)
artists_genres_df = pd.DataFrame(genre_rows)
elapsed = time.time() - start
print(f"time taken {elapsed:.2f} seconds")

  0%|                                                                                          | 0/718 [00:00<?, ?it/s]

total unique artists 35892


100%|████████████████████████████████████████████████████████████████████████████████| 718/718 [01:25<00:00,  8.43it/s]

time taken 85.23 seconds





In [10]:
# Save the artist/genre table to HDF5, then display it.
# `key` is passed by keyword: positional use is deprecated since pandas 2.1.
artists_genres_df.to_hdf(DATAFRAME_PATH/'artists_genres_df.h5', key='artists_genres_df', nan_rep='null', mode='w')
artists_genres_df

Unnamed: 0,artist_uri,artist_name,genres
0,spotify:artist:3n3k65ADvmwixFITCnC78C,El Poder De Zacatecas,deep norteno
1,spotify:artist:6kQB2RN7WwryMdJ1MoQh1E,The Ghost Inside,"deathcore,melodic hardcore,melodic metalcore,m..."
2,spotify:artist:2tppd6KkhK4ULAd217Ecq1,The Coronas,"indie anthem-folk,irish rock"
3,spotify:artist:5ZsFI1h6hIdQRw2ti0hz81,ZAYN,"dance pop,pop,post-teen pop,uk pop"
4,spotify:artist:3pTE9iaJTkWns3mxpNQlJV,Bombay Bicycle Club,"alternative dance,british indie rock,indie pop..."
...,...,...,...
35887,spotify:artist:4smK3nFPhkimbY6EXK156w,Hackman,"bass music,future garage"
35888,spotify:artist:07aM9WDyMQT5VaCnmsxCI8,The Elves,
35889,spotify:artist:3VvmUsYPzFheK2wJGKcXxp,The Correspondents,"electro swing,nu jazz,steampunk"
35890,spotify:artist:1JXCz2jeNMcsxiKyu3pJHt,Guther,"german electronica,indie electronica"
