In [1]:
pip install spotipy


The following command must be run outside of the IPython shell:

    $ pip install spotipy

The Python package manager (pip) can only be used from outside of IPython.
Please reissue the `pip` command in a separate terminal or command prompt.

See the Python documentation for more information on how to install packages:

    https://docs.python.org/3/installing/


In [12]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
from collections import Counter

# SpotifyClientCredentials() is called without arguments, so it relies on the
# SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment variables already
# being set. Export them in the shell that launches Jupyter (or load them from
# an untracked file) instead of writing them into the notebook: `%env` stores
# the secret in the cell source and also echoes it into the saved output.
# NOTE(review): the credentials that used to be embedded here should be treated
# as leaked -- rotate them and clear this cell's saved output.
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

env: SPOTIPY_CLIENT_ID=3543bc68b7a64fa4896cb0c1d2499e33
env: SPOTIPY_CLIENT_SECRET=b711f5fe2e9a417c89247f7036c9c069


In [13]:
#prints, for each search term, the number of playlists the search reports
#seeing the totals ahead of time shows how large the dataset will be

terms = ["2019"]
for term in terms:
    search_query = 'q="{}"'.format(term)
    playlists = sp.search(search_query, type='playlist')
    playlist_section = playlists['playlists']
    number_playlists = playlist_section['total']
    print(number_playlists)

6129


In [14]:
#function that appends each playlist's name, URI & total number of tracks to a list
def extract_playlist(x: dict, list_: list) -> None:
    """Append a summary of every playlist in a search response to ``list_``.

    Parameters
    ----------
    x : dict
        Search response; playlists are read from ``x['playlists']['items']``.
    list_ : list
        Mutated in place. One dict is appended per playlist, with the keys
        ``"name"``, ``"total"`` (read from ``tracks['total']``) and ``"uri"``.

    Returns
    -------
    None
        The result is delivered through ``list_``.

    Notes
    -----
    Entries that are ``None`` (and an ``items`` value of ``None``) are skipped.
    Subscripting ``None`` would raise ``TypeError`` part-way through the page,
    leaving ``list_`` with only the playlists that preceded the bad entry.
    """
    for playlist in x['playlists']['items'] or []:
        if playlist is None:
            continue
        list_.append({
            "name": playlist['name'],
            "total": playlist["tracks"]['total'],
            "uri": playlist["uri"],
        })

In [5]:
#create a list of playlist summaries (name, track total, uri) for every search term
list_of_playlists = []

for term in terms:

    #page through the search results 50 playlists at a time:
    #offsets 0, 50, ..., 1950 -> 40 requests -> at most 2000 playlists per term
    #these are the first 2000 results in the order the API returns them, not a random sample
    #NOTE(review): 2000 was chosen as the search API's result cap -- confirm it still applies
    for i in range(0, 2000, 50):

        try:
            #type = playlist, only return playlists
            #limit = 50, the page size (number of playlists per request)
            #offset = i, the index of the first item to return
            playlist_data = sp.search('q="{}"'.format(term), type='playlist', limit=50, offset=i)

            #appends one summary dict per playlist in this page to list_of_playlists
            extract_playlist(playlist_data, list_of_playlists)

        except Exception as err:
            #best effort: skip the failed page, but report which page failed and why
            #catching Exception (instead of a bare except) lets Ctrl-C interrupt the run
            print("Error", term, i, repr(err))

In [15]:
#number of playlists collected: up to 40 pages x 50 playlists per page = 2000 per search term
len(list_of_playlists)

2000

In [16]:
#builds a dataframe from the collected playlists, removes identical rows & renumbers the index
#duplicates are only expected if you use multiple terms, i.e. Coronavirus AND Quarantine
playlists2019 = (
    pd.DataFrame(list_of_playlists)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [17]:
#saves the playlist dataframe to playlists2019.csv
#the row index is written as an extra first column because `index` is left at its default
playlists2019.to_csv("playlists2019.csv")

In [19]:
#for every collected playlist, gather the uri of each of its tracks
#repeats are kept on purpose so that occurrences can be counted afterwards
tracks = []

#using each playlist uri stored in the playlists2019 dataframe
for uri in playlists2019['uri']:

    #page through the playlist 50 tracks at a time
    #the playlist length is read from each page's own response, so no separate
    #request is spent just to learn it
    length = None
    i = 0
    while length is None or i < length:

        #playlist_tracks takes the playlist id/uri and returns one page of its items
        playlist_data = sp.playlist_tracks(uri, limit=50, offset=i)
        length = playlist_data['total']

        for k in playlist_data.get('items') or []:
            #an entry may carry no usable track (e.g. `track` is None);
            #skip only that entry instead of abandoning the rest of the page
            track = k.get('track') if k else None
            if track and track.get('uri'):
                tracks.append(track['uri'])

        i += 50

In [20]:
#total number of track uris collected, counting repeats across playlists
len(tracks)

265723

In [21]:
#counts how often each track uri occurs, then tabulates it as one row per uri
uri_counts = Counter(tracks)
df = (
    pd.DataFrame.from_dict(uri_counts, orient='index')
    .reset_index()
    .rename(columns={"index":"uri", 0:"count"})
)

In [22]:
#preview of the per-track occurrence counts (columns: uri, count)
df

Unnamed: 0,uri,count
0,spotify:track:3WogGiNpaOmYcPwqslBgZG,5
1,spotify:track:3DlTgKZMiNkPa96dbLlfYO,4
2,spotify:track:6OkdbK4Y1zHsDI4w5y5MY0,2
3,spotify:track:23Na3MCF9QaYja7W7mVMxy,6
4,spotify:track:10OmfJCAYVzChZMH2CKBMT,1
5,spotify:track:4tSffeciMCcNIvITmh6QHP,7
6,spotify:track:2jdFLgkxWO9HWdufwbOoBU,3
7,spotify:track:35CgPaRWgzwlB6f8UUTgNm,3
8,spotify:track:3rxHqfRwEgzGogEXZi8vRU,3
9,spotify:track:6zS9UoeaK1Yj5Sf5kgrXbt,1


In [23]:
#summary statistics for tracks that occur more than 4 but fewer than 2000 times
#helps us judge where (and whether) to cut off the less frequent tracks
occurrences = df['count']
df[(occurrences > 4) & (occurrences < 2000)].describe()

Unnamed: 0,count
count,7298.0
mean,10.551384
std,9.816036
min,5.0
25%,6.0
50%,7.0
75%,12.0
max,183.0


In [34]:
#saves the unfiltered per-track counts (every uri, whatever its count) to a csv file
df.to_csv("tracks_list_2019_unfiltered.csv")

In [35]:
#keeps only the tracks that occur more than 4 times, then shows how many remain
frequent_mask = df['count'] > 4
sliced_df_2019 = df[frequent_mask]
len(sliced_df_2019)

7298

In [36]:
#looks up the important variables for one track, makes the dataframe usable
def filter_data(x, client=None):
    """Look up one track and merge its metadata and audio features into the row.

    Parameters
    ----------
    x : mapping
        A row with at least a ``'uri'`` entry (for example a DataFrame row
        passed by ``DataFrame.apply(..., axis=1)``). All of its entries are
        carried over into the result.
    client : optional
        Object providing ``track(uri)`` and ``audio_features(uri)``. When
        omitted, the notebook-level ``sp`` client is used.

    Returns
    -------
    dict
        The entries of ``x`` plus ``name``, ``release_date``, ``popularity``
        and ``explicit``, followed by every key of the track's audio-features
        record when one is available.
    """
    if client is None:
        client = sp

    uri = x['uri']

    #metadata for this track
    data = client.track(uri)

    #audio_features returns a list with one entry for the requested track
    features = client.audio_features(uri)

    #the entry can be missing or None; unpacking None with ** raises TypeError,
    #so fall back to "no audio-feature columns" for such a track
    audio = features[0] if features else None

    return {**x,
            "name" : data['name'],
            'release_date': data['album']['release_date'],
            "popularity" : data['popularity'],
            'explicit' : data['explicit'],
            **(audio or {})}

In [38]:
#enriches every frequent track with its metadata and audio features
#each dict returned by filter_data is expanded into the columns of the new dataframe
final_df_2019 = sliced_df_2019.apply(filter_data, axis=1, result_type='expand')

In [39]:
#preview of the enriched tracks: occurrence count plus track metadata and audio features
final_df_2019

Unnamed: 0,acousticness,analysis_url,count,danceability,duration_ms,energy,explicit,id,instrumentalness,key,...,name,popularity,release_date,speechiness,tempo,time_signature,track_href,type,uri,valence
0,0.153000,https://api.spotify.com/v1/audio-analysis/3Wog...,5,0.641,283585,0.6840,False,3WogGiNpaOmYcPwqslBgZG,0.000000,0,...,L'Amérique pleure,59,2019-10-04,0.0345,102.025,4,https://api.spotify.com/v1/tracks/3WogGiNpaOmY...,audio_features,spotify:track:3WogGiNpaOmYcPwqslBgZG,0.6390
3,0.354000,https://api.spotify.com/v1/audio-analysis/23Na...,6,0.653,265053,0.6220,False,23Na3MCF9QaYja7W7mVMxy,0.000000,9,...,Les étoiles filantes,53,2004-11-23,0.0399,129.892,4,https://api.spotify.com/v1/tracks/23Na3MCF9QaY...,audio_features,spotify:track:23Na3MCF9QaYja7W7mVMxy,0.5300
5,0.853000,https://api.spotify.com/v1/audio-analysis/4tSf...,7,0.662,178293,0.4360,False,4tSffeciMCcNIvITmh6QHP,0.000000,9,...,I Lost My Baby,48,1996-10-01,0.0651,92.857,4,https://api.spotify.com/v1/tracks/4tSffeciMCcN...,audio_features,spotify:track:4tSffeciMCcNIvITmh6QHP,0.6270
17,0.000095,https://api.spotify.com/v1/audio-analysis/2SqZ...,6,0.672,311200,0.8480,False,2SqZ6pmqXB5szlwEgaeDYW,0.000011,2,...,1990,44,1990-08-01,0.0635,122.051,4,https://api.spotify.com/v1/tracks/2SqZ6pmqXB5s...,audio_features,spotify:track:2SqZ6pmqXB5szlwEgaeDYW,0.6000
27,0.378000,https://api.spotify.com/v1/audio-analysis/3WgK...,5,0.549,324615,0.7210,False,3WgK1nC3pQmJ9UtBF4oNMR,0.000000,7,...,Dégénérations / Le reel du fossé,47,2003,0.1040,98.861,4,https://api.spotify.com/v1/tracks/3WgK1nC3pQmJ...,audio_features,spotify:track:3WgK1nC3pQmJ9UtBF4oNMR,0.8730
28,0.301000,https://api.spotify.com/v1/audio-analysis/6JVn...,7,0.549,320320,0.1930,False,6JVngSxUtrYkWfFmFWmtm5,0.000542,0,...,La Complainte Du Phoque En Alaska - Remastered,42,1994,0.0317,113.381,3,https://api.spotify.com/v1/tracks/6JVngSxUtrYk...,audio_features,spotify:track:6JVngSxUtrYkWfFmFWmtm5,0.2390
31,0.203000,https://api.spotify.com/v1/audio-analysis/1sQ0...,5,0.558,214973,0.3860,False,1sQ0oafDSZQUElkV5ERvLx,0.000000,4,...,Toune d'automne,50,2009-05-05,0.0253,78.219,4,https://api.spotify.com/v1/tracks/1sQ0oafDSZQU...,audio_features,spotify:track:1sQ0oafDSZQUElkV5ERvLx,0.4350
34,0.336000,https://api.spotify.com/v1/audio-analysis/3w5b...,7,0.733,335840,0.5140,False,3w5bSQlQzKFUE3osgrMRBT,0.000000,2,...,Tassez-Vous De D'là,48,1998-07-01,0.1150,78.695,4,https://api.spotify.com/v1/tracks/3w5bSQlQzKFU...,audio_features,spotify:track:3w5bSQlQzKFUE3osgrMRBT,0.6480
36,0.192000,https://api.spotify.com/v1/audio-analysis/21AT...,5,0.586,267280,0.8100,False,21ATIZAwB9QXBOpTzw2lDM,0.000002,11,...,Amalgame,37,1999-12-31,0.0369,96.933,4,https://api.spotify.com/v1/tracks/21ATIZAwB9QX...,audio_features,spotify:track:21ATIZAwB9QXBOpTzw2lDM,0.5090
38,0.423000,https://api.spotify.com/v1/audio-analysis/3iT4...,5,0.699,168200,0.8890,False,3iT4c9MPnaordWrfvTs8MU,0.000000,11,...,Paradis City,45,2015-02-02,0.0683,97.028,4,https://api.spotify.com/v1/tracks/3iT4c9MPnaor...,audio_features,spotify:track:3iT4c9MPnaordWrfvTs8MU,0.8700


In [40]:
#writes the final, enriched dataset to a csv file
final_df_2019.to_csv("tracks_list_2019.csv")