In [1]:
import os
import re
import sys
import json
import time
import config
import spotipy
import pandas as pd

from tqdm import tqdm
from datetime import datetime
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth

In [3]:
# Change the path to where your Spotify Million Playlist is located. (Need to download the dataset first from 
# https://www.aicrowd.com/challenges/spotify-million-playlist-dataset-challenge) 
# The expected file structure is to have a data folder /data .csv files will be placed there.

path = '../data/smp_data/'

def loop_slices(path, num_slices=1000):
    """
    Each slice is a .json file containing 1000 playlists i.e.: 1 slice is 1000 playlists 20 slices is: 20,000 playlists.
    Parameters:
        num_slices (int): Number of slices to return, max 1000.
        path (str): Path to the Spotify Million Playlist.
        
    Output:
        mpd_playlists (list): a list of dictionaries of all the playlists.
    """
    cnt=0
    mpd_playlists = []
    filenames = os.listdir(path)
    for filename in sorted(filenames):
        print(filename)
        cnt+=1
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()
            current_slice = json.loads(js)
            
            # Create a list of all playlists
            for playlist in current_slice['playlists']:
                mpd_playlists.append(playlist)


            if cnt == num_slices:
                break
    return mpd_playlists

In [4]:
def create_csv(playlists, extended=False):
    """
    This function will construct a datafrme and write to .csv file for all the input playlists, this .csv will have a single cell with all the tracks
    in the playlist. 
    An extended parameter is available to extend the tracks in a list to be a single cell per song, this will return an 
    additional .csv file
    Parameters:
        playlists (list): a list of dictionaries such as that from the loop_slices() function.
        extended (boolean): boolean to enable the extended .csv file generation
        
    Output:
        MPD.csv: .csv file with the playlists
        MPD.csv: .csv file with extended song columns
    """
    df = pd.DataFrame(playlists)
    df.to_csv('../data/MPD_1M.csv', index=False)
    
    if extended:
        df_list = []
        for playlist in playlists:
            df_list.append(pd.DataFrame(playlist))
            
        df_extended = pd.concat(df_list, axis=0)
         
        cols_to_keep = ['name', 'collaborative', 'pid', 'modified_at', 'num_tracks', 'num_albums', 
                'num_followers','num_edits', 'duration_ms', 'num_artists']
        df_extended = df_extended.reset_index().pivot(values='tracks',index=cols_to_keep, columns='index')
        df_extended.reset_index(inplace=True)
        df_extended = df_extended.rename_axis(None, axis=1)    
        df_extended.sort_values('pid', inplace=True)
        df_extended.to_csv('../data/{}_MPD_Extended_1M.csv'.format(datetime.now().strftime("%Y_%d_%d_%H_%M_%S")), index=False) 

In [None]:
playlists = loop_slices(path, num_slices=1000)
create_csv(playlists, extended=True)

# Save json file - This file will be used as database
with open('../data/{}_playlists_1M_{}.json'.format(datetime.now().strftime("%Y_%d_%d_%H_%M_%S"), len(playlists)), 'w') as outfile:
    json.dump(playlists, outfile, indent=2)

: 

In [8]:
df = pd.read_csv('../data/MPD_1M.csv')
df

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,Throwbacks,False,0,1493424000,52,47,1,"[{'pos': 0, 'artist_name': 'Missy Elliott', 't...",6,11532414,37,
1,Awesome Playlist,False,1,1506556800,39,23,1,"[{'pos': 0, 'artist_name': 'Survivor', 'track_...",5,11656470,21,
2,korean,False,2,1505692800,64,51,1,"[{'pos': 0, 'artist_name': 'Hoody', 'track_uri...",18,14039958,31,
3,mat,False,3,1501027200,126,107,1,"[{'pos': 0, 'artist_name': 'Camille Saint-Saën...",4,28926058,86,
4,90s,False,4,1401667200,17,16,2,"[{'pos': 0, 'artist_name': 'The Smashing Pumpk...",7,4335282,16,
...,...,...,...,...,...,...,...,...,...,...,...,...
19995,gang gang,False,115995,1499212800,39,32,1,"[{'pos': 0, 'artist_name': 'Drake', 'track_uri...",11,8983567,25,
19996,Chaos,True,115996,1408406400,20,19,1,"[{'pos': 0, 'artist_name': 'Kormac', 'track_ur...",16,5660735,17,
19997,Spring 2014,False,115997,1417305600,14,12,1,"[{'pos': 0, 'artist_name': 'Disclosure', 'trac...",8,3765841,9,
19998,autumn,False,115998,1507161600,35,32,1,"[{'pos': 0, 'artist_name': 'Hayley Kiyoko', 't...",8,8322008,31,


In [10]:
# # Read MPD_Extended
df = pd.read_csv('../smp_data/2022_19_19_23_29_24_MPD_Extended.csv')
df.sample(10)

  df = pd.read_csv('../smp_data/2022_19_19_23_29_24_MPD_Extended.csv')


Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,num_edits,duration_ms,num_artists,...,240,241,242,243,244,245,246,247,248,249
13012,new mood,False,109012,1508371200,74,54,1,18,14998964,40,...,,,,,,,,,,
3568,for m,False,11568,1489536000,57,45,3,13,13450473,34,...,,,,,,,,,,
2968,Country,False,10968,1405036800,80,65,1,25,18235134,39,...,,,,,,,,,,
13747,Sunday Morning,False,109747,1506470400,139,130,5,80,36996705,117,...,,,,,,,,,,
458,Intro,False,458,1485388800,89,81,1,61,20724857,71,...,,,,,,,,,,
7747,Summer 16,False,103747,1472947200,47,42,1,14,10357435,38,...,,,,,,,,,,
11597,Fall,False,107597,1420675200,34,30,1,20,7723733,28,...,,,,,,,,,,
1130,SENIOR YEAR,False,1130,1507507200,74,68,1,15,16620651,60,...,,,,,,,,,,
2525,idk,False,10525,1450137600,51,49,1,20,11040612,48,...,,,,,,,,,,
7265,Yoga,False,103265,1397001600,27,19,1,5,10647735,12,...,,,,,,,,,,


In [11]:
pwd

'e:\\NEU\\Sem2\\DAMG7245-Big-data-sys-intel-Analytics\\RecSys\\code'

In [14]:
from decouple import config
# Spotify credentials
os.environ["SPOTIPY_CLIENT_ID"] = config('SPOTIPY_CLIENT_ID')
os.environ["SPOTIPY_CLIENT_SECRET"] = config('SPOTIPY_CLIENT_SECRET')
os.environ['SPOTIPY_REDIRECT_URI'] = config('SPOTIPY_REDIRECT_URI')   # Needed for user authorization
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

In [16]:
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)

cols_to_keep = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
                'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']
dfs = []

for playlist in tqdm(playlists):
    audio_feats = []
    all_uris_in_plylst = []
    for track in playlist['tracks']:
        uri = track['track_uri'].split("k:")[1]
        all_uris_in_plylst.append(uri)

    chunks_uris = [all_uris_in_plylst[i:i + 100] for i in range(0, len(all_uris_in_plylst), 100)]
    for chunk in  chunks_uris:
        for attempt in range(10):
            try:
                chunk_audio_feats = sp.audio_features(chunk)
                audio_feats.append(chunk_audio_feats)
            except Exception as e: 
                print(e)
                print('playlist: {}, chunk: {}'.format(playlist['name'], chunk))
            else:
                break
        else:
            print('Everything failed')


    
    playlist_audio_feats = [item for sublist in audio_feats for item in sublist]
    name = playlist['name']
    pid = playlist['pid']
    s1 = pd.Series([name, pid], index=['name', 'pid'])
    try: # Try/Except for when there is a None in playlist
        s2 = pd.DataFrame(playlist_audio_feats)[cols_to_keep].mean()
    except:
        print('Playlist "{}" has a None. PID: {}'.format(name, pid))
        s2 = pd.DataFrame([i for i in playlist_audio_feats if i])[cols_to_keep].mean() # List comprehension to remove None from list

    dfs.append(pd.DataFrame(s1.append(s2)).T)
# Time to beat: 5% in 5min

Start Time = 23:43:00


  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.app

Playlist "Current" has a None. PID: 109844


  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.app

('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
playlist: Old School Jams, chunk: ['4WHQzXnWyTlE1UOmmxRDOw', '2BDRHFw5170t2kRHD3uEMU', '6mdNRnbiVerwM8kGCS1KwD', '2GRMJEIAvKlqJd9UHZTjRD', '0s6e7ZafqOAUBDoQYGmxrc', '7BVMXNnQSswMX8IJeZUb5T', '5xE7sFTlv2yrG9mmhDSpdq', '0v9kGNjkKdQUdDoBIuiph4', '7F7L9O4m2FiIWV6m4Wyrqa', '3aHqs6SCMoNQ1GtPCG6mwP', '6uQKuonTU8VKBz5SHZuQXD', '2TyW8iS8LHTKurcqQy3J2L', '2YKnKXUeGERi9yCLTO2C3t', '1EmcH7MzImxZWKRbSHmf5y', '6dEs3CH2goHCTR6XCTctVd', '1AeMo9IYEz5o6JgQDMElVu', '4dY4JVC0Q9CwpeZ6JAclIc', '6gk2V87atn7XaaAltdYsKd', '7ao6UmSKAz3kponATY1qRW', '0TT7wJiEYD5GAeJfSR1ETX', '6CfrqjtxHobS43clpcSAL7', '0tkWZxllnFv4bicSnmVJWh', '4Fkr6zTAxsXbG9kG8ISqos', '0ndVtaoJarSoK9SBCRkaJt', '7dZzBG5EzwxGKrWsWLgqgw', '4bvqOj9QiH6qKecLiefKst', '6gp5reJaprmqjZnRFbpqK2', '3OcQkcvTcohs8vO8Rd3sKF', '5KG4OVGxSrFmNHGZBezJJn', '3CzOQuOrl39RitAtekiZV8', '1kPBHRXyXdrtYfUfeRwBko', '35Iy0Neh1ewQapV6n0td6

  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.append(s2)).T)
  dfs.append(pd.DataFrame(s1.app

In [17]:
df1 = pd.concat(dfs, axis=0)
df1.to_csv('../data/{}_Playlist_Feats_1M_{}.csv'.format(datetime.now().strftime("%Y_%d_%d_%H_%M_%S"), len(df1)), index=False)
df1.head()

Unnamed: 0,name,pid,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Throwbacks,0,0.664077,0.781077,5.038462,-4.891212,0.692308,0.103698,0.083674,0.000674,0.187087,0.64275,121.1575,221777.461538,4.0
0,Awesome Playlist,1,0.492382,0.695923,4.461538,-8.107974,0.538462,0.09101,0.162227,0.223708,0.179344,0.476667,124.987128,298837.641026,3.769231
0,korean,2,0.671062,0.692953,5.0,-4.875594,0.515625,0.096425,0.2691,0.000638,0.168894,0.565078,114.595984,219373.953125,4.0
0,mat,3,0.514429,0.620902,5.103175,-9.618754,0.714286,0.067,0.273514,0.203156,0.188278,0.451258,125.547627,229575.055556,3.952381
0,90s,4,0.576235,0.650418,3.352941,-7.634529,0.823529,0.041218,0.177189,0.081759,0.166524,0.490294,127.725412,255014.352941,3.941176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,gang gang,115995,0.80459,0.529051,4.205128,-8.164205,0.589744,0.219638,0.21795,0.008621,0.166574,0.363638,129.452333,230348.358974,4.0
0,Chaos,115996,0.6889,0.6955,5.7,-8.23265,0.4,0.104215,0.243258,0.225892,0.15177,0.7178,135.3143,283091.0,4.05
0,Spring 2014,115997,0.646571,0.603214,5.357143,-6.890929,0.714286,0.078743,0.227477,0.024347,0.146629,0.442571,125.139071,268988.928571,4.0
0,autumn,115998,0.549943,0.562457,5.114286,-8.666057,0.657143,0.06552,0.273361,0.018066,0.16158,0.390406,114.144886,237814.542857,3.971429


In [18]:
# Code to add data on top of an existing df
# df_list.append(df1)
# feats_df = pd.concat(df_list, axis=0)
# feats_df.to_csv('../data/Playlist_Feats_Copy.csv', index=False)
# idx = len(feats_df)
# feats_df

Unnamed: 0,name,pid,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Throwbacks,0,0.664077,0.781077,5.038462,-4.891212,0.692308,0.103698,0.083674,0.000674,0.187087,0.64275,121.1575,221777.461538,4.0
0,Awesome Playlist,1,0.492382,0.695923,4.461538,-8.107974,0.538462,0.09101,0.162227,0.223708,0.179344,0.476667,124.987128,298837.641026,3.769231
0,korean,2,0.671062,0.692953,5.0,-4.875594,0.515625,0.096425,0.2691,0.000638,0.168894,0.565078,114.595984,219373.953125,4.0
0,mat,3,0.514429,0.620902,5.103175,-9.618754,0.714286,0.067,0.273514,0.203156,0.188278,0.451258,125.547627,229575.055556,3.952381
0,90s,4,0.576235,0.650418,3.352941,-7.634529,0.823529,0.041218,0.177189,0.081759,0.166524,0.490294,127.725412,255014.352941,3.941176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,gang gang,115995,0.80459,0.529051,4.205128,-8.164205,0.589744,0.219638,0.21795,0.008621,0.166574,0.363638,129.452333,230348.358974,4.0
0,Chaos,115996,0.6889,0.6955,5.7,-8.23265,0.4,0.104215,0.243258,0.225892,0.15177,0.7178,135.3143,283091.0,4.05
0,Spring 2014,115997,0.646571,0.603214,5.357143,-6.890929,0.714286,0.078743,0.227477,0.024347,0.146629,0.442571,125.139071,268988.928571,4.0
0,autumn,115998,0.549943,0.562457,5.114286,-8.666057,0.657143,0.06552,0.273361,0.018066,0.16158,0.390406,114.144886,237814.542857,3.971429
