In [1]:
# We used Spotipy's documentation
# https://spotipy.readthedocs.io/en/2.12.0/

In [2]:
pip install spotipy

Note: you may need to restart the kernel to use updated packages.


In [3]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
from collections import Counter

%env SPOTIPY_CLIENT_ID = 3543bc68b7a64fa4896cb0c1d2499e33
%env SPOTIPY_CLIENT_SECRET = b711f5fe2e9a417c89247f7036c9c069

sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

env: SPOTIPY_CLIENT_ID=3543bc68b7a64fa4896cb0c1d2499e33
env: SPOTIPY_CLIENT_SECRET=b711f5fe2e9a417c89247f7036c9c069


In [4]:
#prints number of playlists with corona_terms in title/description
#this is helpful bc it helps us see ahead of time how huge the dataset is

terms = ["Quarantine"]
for term in terms:
    playlists = sp.search('q="{}"'.format(term), type='playlist')
    number_playlists = playlists['playlists']['total']
    print(number_playlists)

276763


In [5]:
#function that creates a list with the playlist name, URI & total number of tracks
def extract_playlist(x,list_):
    for i in x['playlists']['items']:
        list_.append({"name" : i['name'], "total":i["tracks"]['total'], "uri":i["uri"]})

In [6]:
#create a list of playlists
list_of_playlists = []

for term in terms:
    
    #use index (i) which are multiples of 50 -- random sample bc of search limitation
    #spotify can only give you 2000 results at a time
    #random sample of 40 playlists
    for i in range(0,2000,50):
        
        try:
            #create variable init_data that has random sample of playlists with terms
            playlist_data = sp.search('q="{}"'.format(term), type='playlist', limit=50, offset=i)
                        #limit = 50, can only return up to 50 items, random sample of 50 songs
                        #type = playlist, only return playlists
                        #offset = i, the index of the first item to return
            
            extract_playlist(playlist_data,list_of_playlists)
            #above function appends init_data to list_of_playlists
        
        except:
            print("Error")

In [7]:
#dataset is length 2000 --> sample of 40 playlists, from each playlist, sample of 50 songs
len(list_of_playlists)

2000

In [8]:
#creates dataframe with list of playlists, dropping duplicates & delete their index
#only need to drop duplicates if you use multiple terms, i.e. Coronavirus AND Quarantine
playlists = pd.DataFrame(list_of_playlists).drop_duplicates().reset_index(drop=True)

In [9]:
#creates csv file from df
playlists.to_csv("playlists.csv")

In [10]:
#for a random sample of playlists, takes random sample of tracks, creates a list of tracks and their uris
tracks = []

#using each uri code listed in the playlist.csv
for uri in playlists['uri']:
        
        length = sp.playlist_tracks(uri)['total']
        #playlist_tracks(parameter) is playlist id, aka uri
        #length is the playlist length of each individual playlist
    
        for i in range(0,length, 50):
            #from 0 to playlist length, take every other 50th index
            
            playlist_data = sp.playlist_tracks(uri, limit=50, offset=i)
            #playlist_tracks get URI of tracks in playlists, input URI
            #playlist_data now has playlist track data
            #replacing old data
            
            try:
                for k in playlist_data['items']:
                    tracks.append(k['track']["uri"])
                    
            except:
                pass

In [11]:
len(tracks)

272962

In [12]:
#dataframe with individual track uri and count
df = pd.DataFrame.from_dict(Counter(tracks), orient='index').reset_index().rename(columns={"index":"uri", 0:"count"})

In [13]:
df

Unnamed: 0,uri,count
0,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,104
1,spotify:track:0VgkVdmE4gld66l8iyGjgx,60
2,spotify:track:2NxIIb2OZ1DSbfWAH0W47B,3
3,spotify:track:2xLMifQCjDGFmkHkpNLD9h,123
4,spotify:track:66s45uMhk7Y4z0xUgESdm3,22
...,...,...
132866,spotify:track:3rBY6K8EM0MmKNniTZgRvx,1
132867,spotify:track:2rffKiXNuFZSFttrvWUWJC,1
132868,spotify:track:2z7MvlJJIlzcx5vS2bWMWn,1
132869,spotify:track:6kmhapLx51lmwhIIDFsUQU,1


In [14]:
#useful stats to help us understand frequency of the popular tracks
#to know when we should cut off our data/if necessary
df[(df['count']>4) & (df['count']<2000)].describe()

Unnamed: 0,count
count,9801.0
mean,11.261708
std,11.01333
min,5.0
25%,6.0
50%,8.0
75%,12.0
max,201.0


In [15]:
#creates a csv file of the unfiltered track data
df.to_csv("tracks_list_unfiltered.csv")

In [16]:
#cuts off any tracks that appear less than twice (CAN CHANGE)
#this is useful bc we don't want tracks that don't appear frequently
sliced_df = df[df['count']>4]
len(sliced_df)

9801

In [1]:
#filters important variables, makes the dataframe usable
def filter_data(x):
    data = sp.track(x['uri'])
    #sets data variable for each track
    
    name = data['name']
    pop = data['popularity']
    date = data['album']['release_date']
    explic = data['explicit']
    
    features = sp.audio_features(x['uri'])
    #uses spotify function to access audio features
    
    return {**x, 
            "name" : name, 
            'release_date': date, 
            "popularity" : pop,
            'explicit' : explic, 
            **features[0]
           }

In [2]:
final_df = sliced_df.apply(lambda x: filter_data(x), axis=1, result_type='expand')

In [3]:
#makes a csv with the final dataset
final_df.to_csv("tracks_list.csv",)