In [1]:
import spotipy, json, os
from spotipy.oauth2 import SpotifyOAuth
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
from pathlib import Path
from operator import itemgetter
RAW_DATA_PATH = Path('raw_data/')
DATAFRAME_PATH = Path('dataframes/')
SELECTED_TRACK_FEATURES = ['danceability', 'energy', 'key', 'loudness',
                     'speechiness', 'acousticness', 'instrumentalness',
                     'liveness', 'valence', 'tempo']
TOTAL_TRACKS = 50
NUM_WITHHELD = 25

In [2]:
def make_playlist_dfs(path):
    playlists = []
    playlist_tracks = []
    for file in tqdm(os.listdir(path)):
        if not file.endswith(".json"):
            continue
        with open(path/file) as f:
            js_slice = json.load(f)
            for playlist in js_slice['playlists']:
                if playlist['num_tracks'] > TOTAL_TRACKS:
                    sorted_tracks = sorted(playlist['tracks'], key=itemgetter('pos')) 
                    for track in sorted_tracks[:TOTAL_TRACKS]:
                        yield track['track_uri'], playlist['pid'], track['pos'], track['artist_name'], track['track_name'],track['artist_uri'],playlist['name']
def get_tracks_info(path):
    track_info = []
    playlist_info = []
    playlist_tracks = []
    for row in make_playlist_dfs(path):
        track_info.append([row[0],*row[3:6]])
        playlist_tracks.append(row[:3])
        playlist_info.append([row[1],row[-1]])
    p_tracks_df = pd.DataFrame(playlist_tracks,columns = ['tid','pid','pos'])
    t_info_df = pd.DataFrame(track_info,columns=['tid','artist_name','track_name','artist_uri']).drop_duplicates(subset=['tid'],ignore_index=True)
    p_info_df = pd.DataFrame(playlist_info,columns=['pid','playlist_name']).drop_duplicates(subset=['pid'],ignore_index=True)
    return p_tracks_df, t_info_df,p_info_df
        

playlist_tracks_df, tracks_info_df, playlists_info_df = get_tracks_info(RAW_DATA_PATH)
# save dataframes to disk
playlist_tracks_df.to_hdf(DATAFRAME_PATH/'playlist_tracks_df.h5','playlist_tracks_df',mode='w')
tracks_info_df.to_hdf(DATAFRAME_PATH/'tracks_info_df.h5','tracks_info_df',mode='w')
playlists_info_df.to_hdf(DATAFRAME_PATH/'playlists_info_df.h5','playlists_info_df',mode='w')

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00,  2.40it/s]


In [3]:
playlist_tracks_df = pd.read_hdf(DATAFRAME_PATH/'playlist_tracks_df.h5', 'playlist_tracks_df')
tracks_info_df = pd.read_hdf(DATAFRAME_PATH/'tracks_info_df.h5', 'tracks_info_df')
playlists_info_df = pd.read_hdf(DATAFRAME_PATH/'playlists_info_df.h5', 'playlists_info_df')
playlist_tracks_df

Unnamed: 0,tid,pid,pos
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,0,0
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,0,1
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,0,2
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,0,3
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,0,4
...,...,...,...
245345,spotify:track:2PEkXkHKvk4bnC1TgTDCDM,9999,45
245346,spotify:track:5qYiVdWxd84wRWGIQ9Hts6,9999,46
245347,spotify:track:498ZVInMGDkmmNVpSWqHiZ,9999,47
245348,spotify:track:2xYlyywNgefLCRDG8hlxZq,9999,48


In [4]:
os.environ['SPOTIPY_CLIENT_SECRET'] = '4ba6ced99c9d4f95be4567d28b4e5a65'
os.environ['SPOTIPY_REDIRECT_URI'] = 'http://localhost:8008/callback'
os.environ['SPOTIPY_CLIENT_ID'] = '7fd6a4b022fc451d9f49ccc2371a6573'

scope = "user-library-read"

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))

columns = ["uri","artist_name","track_name","danceability", "energy", "key", "loudness", "mode", "speechiness",
        "acousticness", "instrumentalness", "liveness", "valence", "tempo",
        "type", "duration_ms", 
        "time_signature"]


### First loop got 198800 records taking 1225 seconds 
### Second loop got 920500 records taking 6194 seconds

In [5]:
start = time.time()

print('total tracks',len(tracks_info_df))
def get_audio_features(df,batch_size):
    batches = len(df)//batch_size + 1
    for batch in tqdm(np.array_split(df,batches)):
        audio_features = sp.audio_features(batch.tid)
        for idx, audio_feature in enumerate(audio_features):
            audio_feature.update({'artist_name':batch.iloc[idx]['artist_name'],
                                  'track_name':batch.iloc[idx]['track_name'],
                                 })
            yield audio_feature

tracks_features_df = pd.DataFrame(get_audio_features(tracks_info_df,batch_size=100),columns=[*columns])
tracks_features_df = tracks_features_df.rename(columns={"uri": "tid"})
print(f"time taken {time.time()-start:.2f} seconds")

  0%|                                                                                          | 0/838 [00:00<?, ?it/s]

total tracks 83755


100%|████████████████████████████████████████████████████████████████████████████████| 838/838 [03:03<00:00,  4.58it/s]


time taken 183.58 seconds


In [6]:
tracks_features_df.info()
tracks_features_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83755 entries, 0 to 83754
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tid               83755 non-null  object 
 1   artist_name       83755 non-null  object 
 2   track_name        83755 non-null  object 
 3   danceability      83755 non-null  float64
 4   energy            83755 non-null  float64
 5   key               83755 non-null  int64  
 6   loudness          83755 non-null  float64
 7   mode              83755 non-null  int64  
 8   speechiness       83755 non-null  float64
 9   acousticness      83755 non-null  float64
 10  instrumentalness  83755 non-null  float64
 11  liveness          83755 non-null  float64
 12  valence           83755 non-null  float64
 13  tempo             83755 non-null  float64
 14  type              83755 non-null  object 
 15  duration_ms       83755 non-null  int64  
 16  time_signature    83755 non-null  int64 

Unnamed: 0,tid,artist_name,track_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,duration_ms,time_signature
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Missy Elliott,Lose Control (feat. Ciara & Fat Man Scoop),0.904,0.813,4,-7.105,0,0.1210,0.03110,0.006970,0.0471,0.810,125.461,audio_features,226864,4
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Britney Spears,Toxic,0.774,0.838,5,-3.914,0,0.1140,0.02490,0.025000,0.2420,0.924,143.040,audio_features,198800,4
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Beyoncé,Crazy In Love,0.664,0.758,2,-6.583,0,0.2100,0.00238,0.000000,0.0598,0.701,99.259,audio_features,235933,4
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Justin Timberlake,Rock Your Body,0.892,0.714,4,-6.055,0,0.1410,0.20100,0.000234,0.0521,0.817,100.972,audio_features,267267,4
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,Shaggy,It Wasn't Me,0.853,0.606,0,-4.596,1,0.0713,0.05610,0.000000,0.3130,0.654,94.759,audio_features,227600,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83750,spotify:track:4jtKJLAMBablFv0HEPgtGW,B.B. King,Three O'clock Blues,0.552,0.432,10,-7.513,1,0.0317,0.94600,0.001430,0.2080,0.631,70.754,audio_features,183293,4
83751,spotify:track:5FI4ES0VV2zFLCH4JSgY4G,B.B. King,Don't Answer The Door,0.347,0.442,10,-9.939,1,0.0383,0.61500,0.001920,0.1630,0.217,175.474,audio_features,309373,3
83752,spotify:track:6CJBd4HsltEeC9TiF3OqUO,Brett Eldredge,Time Well Spent,0.529,0.888,4,-5.438,1,0.0817,0.00810,0.000000,0.2970,0.585,91.983,audio_features,215227,4
83753,spotify:track:2QtZvpEiGKqcwMsF1tUW7B,Canaan Smith,Hole In A Bottle,0.686,0.796,8,-5.532,0,0.0300,0.04700,0.000000,0.1560,0.733,112.987,audio_features,156613,4


### Convert the the features to np.float64

In [7]:
float_features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
           'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']
for s in float_features:
    tracks_features_df[s] = pd.to_numeric(tracks_features_df[s], errors='coerce')

In [8]:
# save the dataframe into h5
tracks_features_df.to_hdf(base + 'tracks_features_df.h5', 'tracks_features_df', nan_rep='null',mode='w')

NameError: name 'base' is not defined

In [None]:
unique_artists = tracks_info_df['artist_uri'].unique()
print('total unique artists',len(unique_artists))
start = time.time()
result = dict()
genres = list()
def get_artists_genres(series,batch_size):
    batches = len(series)//batch_size + 1
    for batch in tqdm(np.array_split(series,batches)):
        results = sp.artists(batch)
        for artist in results['artists']:
            all_gen = ",".join(artist['genres'])
            yield {
                'artist_uri':artist['uri'],
                'artist_name':artist['name'],
                'genres':all_gen
            }    
artists_genres_df = pd.DataFrame(get_artists_genres(unique_artists,batch_size=50))
print(f"time taken {time.time()-start:.2f} seconds")

In [None]:
artists_genres_df.to_hdf(base + 'artists_genres_df.h5', 'artists_genres_df', nan_rep='null',mode='w')
artists_genres_df