In [12]:
import os
import re
import sys
import json
import time
import config
import spotipy
import pandas as pd

from tqdm import tqdm
from datetime import datetime
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth


In [11]:
pip install spotipy

Collecting spotipyNote: you may need to restart the kernel to use updated packages.
  Downloading spotipy-2.19.0-py3-none-any.whl (27 kB)
Installing collected packages: spotipy
Successfully installed spotipy-2.19.0



In [13]:
 """
 Each slice is a .json file that contains 1000 playlists i.e.: 1 slice is 1000 playlists 20 slices is: 20,000 playlists.
    Parameters we used here are:
        num_slices (int): Number of slices to return, (in our case 20).
        path : Path to the dataset (Spotify Million Playlist).
        
 """

path = '/Users/dwith/DataScience/finalbigdata/data/data'

def loop_slices(path, num_slices=20):
   
   
   
    cnt=0
    mpd_playlists = []
    filenames = os.listdir(path)
    for filename in sorted(filenames):
        print(filename)
        cnt+=1
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()
            current_slice = json.loads(js)
            
            # Create a list of all playlists
            for playlist in current_slice['playlists']:
                mpd_playlists.append(playlist)


            if cnt == num_slices:
                break
    return mpd_playlists

This function will construct a datafrme and write to .csv file for all the input playlists,this newly generated csv will have all the tracks in the single playlist. An extended parameter is available to extend the tracks in a list to be a single cell per song, this will return an additional .csv file

Parameters:
    playlists (list): a list of dictionaries such as that from the loop_slices() function.
    extended (boolean): boolean to enable the extended .csv file generation
        

In [14]:
def create_csv(playlists, extended=False):
    
    df = pd.DataFrame(playlists)
    df.to_csv('/Users/dwith/DataScience/finalbigdata/data/MPD.csv', index=False)
    
    if extended:
        df_list = []
        for playlist in playlists:
            df_list.append(pd.DataFrame(playlist))
            
        df_extended = pd.concat(df_list, axis=0)
         
        cols_to_keep = ['name', 'collaborative', 'pid', 'modified_at', 'num_tracks', 'num_albums', 
                'num_followers','num_edits', 'duration_ms', 'num_artists']
        df_extended = df_extended.reset_index().pivot(values='tracks',index=cols_to_keep, columns='index')
        df_extended.reset_index(inplace=True)
        df_extended = df_extended.rename_axis(None, axis=1)    
        df_extended.sort_values('pid', inplace=True)
        df_extended.to_csv('/Users/dwith/DataScience/finalbigdata/data/{}_MPD_Extended.csv'.format(datetime.now().strftime("%Y_%d_%d_%H_%M_%S")), index=False) 

Run with num_slices=20 It will save a .json locally

In [7]:
playlists = loop_slices(path, num_slices=20)
create_csv(playlists, extended=True)
with open('/Users/dwith/DataScience/finalbigdata/data/{}_playlists_{}.json'.format(datetime.now().strftime("%Y_%d_%d_%H_%M_%S"), len(playlists)), 'w') as outfile:
    json.dump(playlists, outfile, indent=2)

2022_21_21_18_41_18_MPD_Extended.csv
MPD.csv
mpd.slice.114000-114999.json
mpd.slice.115000-115999.json
mpd.slice.116000-116999.json
mpd.slice.117000-117999.json
mpd.slice.124000-124999.json
mpd.slice.125000-125999.json
mpd.slice.126000-126999.json
mpd.slice.127000-127999.json
mpd.slice.13000-13999.json
mpd.slice.144000-144999.json
mpd.slice.145000-145999.json
mpd.slice.147000-147999.json
mpd.slice.16000-16999.json
mpd.slice.174000-174999.json
mpd.slice.175000-175999.json
mpd.slice.176000-176999.json
mpd.slice.177000-177999.json
mpd.slice.19000-19999.json


In [9]:
df = pd.read_csv('/Users/dwith/DataScience/finalbigdata/data/MPD.csv')
df

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,Throwbacks,False,0,1493424000,52,47,1,"[{'pos': 0, 'artist_name': 'Missy Elliott', 't...",6,11532414,37,
1,Awesome Playlist,False,1,1506556800,39,23,1,"[{'pos': 0, 'artist_name': 'Survivor', 'track_...",5,11656470,21,
2,korean,False,2,1505692800,64,51,1,"[{'pos': 0, 'artist_name': 'Hoody', 'track_uri...",18,14039958,31,
3,mat,False,3,1501027200,126,107,1,"[{'pos': 0, 'artist_name': 'Camille Saint-Saën...",4,28926058,86,
4,90s,False,4,1401667200,17,16,2,"[{'pos': 0, 'artist_name': 'The Smashing Pumpk...",7,4335282,16,
...,...,...,...,...,...,...,...,...,...,...,...,...
19995,gang gang,False,115995,1499212800,39,32,1,"[{'pos': 0, 'artist_name': 'Drake', 'track_uri...",11,8983567,25,
19996,Chaos,True,115996,1408406400,20,19,1,"[{'pos': 0, 'artist_name': 'Kormac', 'track_ur...",16,5660735,17,
19997,Spring 2014,False,115997,1417305600,14,12,1,"[{'pos': 0, 'artist_name': 'Disclosure', 'trac...",8,3765841,9,
19998,autumn,False,115998,1507161600,35,32,1,"[{'pos': 0, 'artist_name': 'Hayley Kiyoko', 't...",8,8322008,31,


In [None]:
# # Read MPD_Extended
df = pd.read_csv('data/MPD_Extended.csv')
df

# Get song features from playlists extracted

In [3]:
# Spotify credentials
os.environ["SPOTIPY_CLIENT_ID"] = config.SPOTIPY_CLIENT_ID
os.environ["SPOTIPY_CLIENT_SECRET"] = config.SPOTIPY_CLIENT_SECRET
os.environ['SPOTIPY_REDIRECT_URI'] = config.SPOTIPY_REDIRECT_URI   
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

In [14]:
# Code to retrieve and add data to an existing df
df_list = []
feats_df = pd.read_csv('Playlist_Feats.csv')
idx = len(feats_df)
df_list.append(feats_df)
feats_df

Code to retrieve song features from each playlist passed, average of song features in each playlist is computed in such a way that the final computation is each playlist is a row and it has the average of all songs in the playlist.  

In [None]:
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)

cols_to_keep = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
                'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']
dfs = []

for playlist in tqdm(playlists):
    audio_feats = []
    all_uris_in_plylst = []
    for track in playlist['tracks']:
        uri = track['track_uri'].split("k:")[1]
        all_uris_in_plylst.append(uri)

    chunks_uris = [all_uris_in_plylst[i:i + 100] for i in range(0, len(all_uris_in_plylst), 100)]
    for chunk in  chunks_uris:
        for attempt in range(10):
            try:
                chunk_audio_feats = sp.audio_features(chunk)
                audio_feats.append(chunk_audio_feats)
            except Exception as e: 
                print(e)
                print('playlist: {}, chunk: {}'.format(playlist['name'], chunk))
            else:
                break
        else:
            print('Everything failed')


    
    playlist_audio_feats = [item for sublist in audio_feats for item in sublist]
    name = playlist['name']
    pid = playlist['pid']
    s1 = pd.Series([name, pid], index=['name', 'pid'])
    try: # Try/Except for when there is a None in playlist
        s2 = pd.DataFrame(playlist_audio_feats)[cols_to_keep].mean()
    except:
        print('Playlist "{}" has a None. PID: {}'.format(name, pid))
        s2 = pd.DataFrame([i for i in playlist_audio_feats if i])[cols_to_keep].mean() 

    dfs.append(pd.DataFrame(s1.append(s2)).T)

# This cell will save a .csv file locally.

In [140]:
df1 = pd.concat(dfs, axis=0)
df1.to_csv('/Users/dwith/DataScience/finalbigdata/data/{}_Playlist_Feats_{}.csv'.format(datetime.now().strftime("%Y_%d_%d_%H_%M_%S"), len(df1)), index=False)
df1


df_list.append(df1)
feats_df = pd.concat(df_list, axis=0)
feats_df.to_csv('/Users/dwith/DataScience/finalbigdata/data/Playlist_Feats_Copy.csv', index=False)
idx = len(feats_df)
feats_df

Unnamed: 0,name,pid,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Throwbacks,0,0.664077,0.781077,5.03846,-4.89121,0.692308,0.103698,0.0836741,0.000674382,0.187087,0.64275,121.158,221777,4
0,Awesome Playlist,1,0.492382,0.695923,4.46154,-8.10797,0.538462,0.0910103,0.162227,0.223708,0.179344,0.476667,124.987,298838,3.76923
0,korean,2,0.671062,0.692953,5,-4.87559,0.515625,0.096425,0.2691,0.000637812,0.168894,0.565078,114.596,219374,4
0,mat,3,0.514349,0.620901,5.10317,-9.61875,0.714286,0.067004,0.273514,0.203148,0.188278,0.451258,125.523,229575,3.95238
0,90s,4,0.576235,0.650418,3.35294,-7.63453,0.823529,0.0412176,0.177189,0.0817588,0.166524,0.490294,127.725,255014,3.94118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,gang gang,115995,0.804615,0.529026,4.20513,-8.16418,0.589744,0.219638,0.217947,0.00862146,0.166574,0.363818,129.452,230348,4
0,Chaos,115996,0.6889,0.6955,5.7,-8.23265,0.4,0.104215,0.243258,0.225892,0.15177,0.7178,135.314,283091,4.05
0,Spring 2014,115997,0.646571,0.603214,5.35714,-6.89093,0.714286,0.0787429,0.227477,0.0243475,0.146629,0.442571,125.139,268989,4
0,autumn,115998,0.549943,0.562457,5.11429,-8.66606,0.657143,0.06552,0.273361,0.0180656,0.16158,0.390406,114.145,237815,3.97143
