In [1]:
# imports
import json
import os

import numpy as np
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
# Spotify API credentials. They are read from the environment so that real
# secrets never have to be typed into (and saved with) the notebook.
# SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the variable names spotipy
# documents for this purpose. The original placeholder strings remain as the
# fallback when the variables are not set.
cid = os.environ.get('SPOTIPY_CLIENT_ID', 'Insert cid token')
secret = os.environ.get('SPOTIPY_CLIENT_SECRET', 'Insert secret token')

In [3]:
# Build the credentials manager from the id/secret pair, then hand it to the
# API client that every later cell uses as `sp`.
client_credentials_manager = SpotifyClientCredentials(
    client_id=cid,
    client_secret=secret,
)
sp = spotipy.Spotify(
    client_credentials_manager=client_credentials_manager,
)

In [4]:
# Smoke test: request audio features for two known track ids in one call
a = sp.audio_features(
    tracks=[
        '4SBVWkRIMJ6WBCYPvr5Bwr',
        '2M2g5CVCrs9yyJHJrtusdn',
    ]
)

In [5]:
# Spot-check one field of the response: danceability of the second requested track
a[1]['danceability']

0.688

In [6]:
# Fetch the genre seeds and keep only the list stored under the 'genres' key
genre_seed_response = sp.recommendation_genre_seeds()
genre_lst = genre_seed_response['genres']

In [7]:
# Display the genre names that will drive the per-genre track search
genre_lst

['acoustic',
 'afrobeat',
 'alt-rock',
 'alternative',
 'ambient',
 'anime',
 'black-metal',
 'bluegrass',
 'blues',
 'bossanova',
 'brazil',
 'breakbeat',
 'british',
 'cantopop',
 'chicago-house',
 'children',
 'chill',
 'classical',
 'club',
 'comedy',
 'country',
 'dance',
 'dancehall',
 'death-metal',
 'deep-house',
 'detroit-techno',
 'disco',
 'disney',
 'drum-and-bass',
 'dub',
 'dubstep',
 'edm',
 'electro',
 'electronic',
 'emo',
 'folk',
 'forro',
 'french',
 'funk',
 'garage',
 'german',
 'gospel',
 'goth',
 'grindcore',
 'groove',
 'grunge',
 'guitar',
 'happy',
 'hard-rock',
 'hardcore',
 'hardstyle',
 'heavy-metal',
 'hip-hop',
 'holidays',
 'honky-tonk',
 'house',
 'idm',
 'indian',
 'indie',
 'indie-pop',
 'industrial',
 'iranian',
 'j-dance',
 'j-idol',
 'j-pop',
 'j-rock',
 'jazz',
 'k-pop',
 'kids',
 'latin',
 'latino',
 'malay',
 'mandopop',
 'metal',
 'metal-misc',
 'metalcore',
 'minimal-techno',
 'movies',
 'mpb',
 'new-age',
 'new-release',
 'opera',
 'pagode',

In [8]:
# Number of genre seeds available to iterate over
len(genre_lst)

126

In [9]:
# Column-wise accumulators, one list per feature. Every track appends exactly
# one value to each list, so the lists stay index-aligned and can be combined
# into a DataFrame afterwards.
track_name = []
artist_name = []
album_name = []
genre = []
duration_ms = []
popularity = []
explicit = []
track_id = []
artist_id = []

# Iterate through each genre
for g in genre_lst:
    # The query depends only on the genre, so build it once per genre
    q = 'genre:' + str(g)
    # Requests are limited to 50 items, so page through the results in steps
    # of 50 (offsets 0, 50, ..., 950) to collect up to 1000 songs per genre.
    # The paging variable has its own name; the original reused `i` for both
    # the offset and an unused enumerate index in the inner loop.
    for offset in range(0, 1000, 50):
        # Store API request results in a variable for extraction
        genre_results = sp.search(q=q, type='track', limit=50, offset=offset)
        items = genre_results['tracks']['items']
        if not items:
            # An empty page means this genre's results are exhausted, so the
            # remaining offsets are skipped instead of spending more requests
            break
        # Iterate through tracks and store relevant information in lists
        for t in items:
            # Only the first listed artist is recorded for each track
            first_artist = t['artists'][0]
            track_name.append(t['name'])
            artist_name.append(first_artist['name'])
            album_name.append(t['album']['name'])
            genre.append(g)
            duration_ms.append(t['duration_ms'])
            popularity.append(t['popularity'])
            explicit.append(t['explicit'])
            track_id.append(t['id'])
            artist_id.append(first_artist['id'])

In [10]:
# Combine the per-feature lists into one table; dict order sets column order
columns_by_name = {
    'track_name': track_name,
    'artist_name': artist_name,
    'album_name': album_name,
    'genre': genre,
    'duration_ms': duration_ms,
    'popularity': popularity,
    'explicit': explicit,
    'track_id': track_id,
    'artist_id': artist_id,
}
df = pd.DataFrame(columns_by_name)

In [11]:
# Sanity-check row count, dtypes and null counts of the collected tracks
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113000 entries, 0 to 112999
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   track_name   113000 non-null  object
 1   artist_name  113000 non-null  object
 2   album_name   113000 non-null  object
 3   genre        113000 non-null  object
 4   duration_ms  113000 non-null  int64 
 5   popularity   113000 non-null  int64 
 6   explicit     113000 non-null  bool  
 7   track_id     113000 non-null  object
 8   artist_id    113000 non-null  object
dtypes: bool(1), int64(2), object(6)
memory usage: 7.0+ MB


In [13]:
# Count the distinct genres that actually produced rows
# (dropna=False keeps the count identical to len(unique()))
df['genre'].nunique(dropna=False)

113

In [14]:
# Count the distinct track ids; fewer than the row count means some tracks
# appear in more than one row
# (dropna=False keeps the count identical to len(unique()))
df['track_id'].nunique(dropna=False)

85201

In [15]:
# Persist the collected tracks. index=True is the pandas default, spelled out
# here because it writes the row index as an extra, unnamed first column.
df.to_csv(
    '../data/track_dataset_no_audio_or_lyrics.csv',
    index=True,
)

In [None]:
# OLD APPROACH: Extracting track data from playlists
#
# NOTE: `file` is the parsed Million Playlist Dataset slice loaded in the
# "Extracting track data from Million Playlists Dataset" cell, which has to
# be run before this one.

# Audio-feature fields copied from the API response, in output-column order
audio_feature_cols = ['danceability','energy','key','loudness','mode',
                      'speechiness','acousticness','instrumentalness',
                      'liveness','valence','tempo','time_signature']
playlist_track_cols = ['track uri','track','artist','album','duration_ms'] + audio_feature_cols

# Rows are collected in a plain list and turned into a DataFrame once at the
# end, which avoids growing the DataFrame one `.loc` assignment at a time.
playlist_track_rows = []
skipped_tracks = 0

# loop through each track from each playlist
for playlist in file['playlists']:
    for track in playlist['tracks']:
        try:
            # track_uri's in the JSON look like 'spotify:track:<id>'; the API
            # wants only the bare id, i.e. the part after the last ':'
            bare_id = track['track_uri'].split(':')[-1]

            # get audio features for track from Spotify API
            # NOTE(review): audio_features takes a list of ids, so these
            # per-track requests could presumably be batched - confirm limits
            features = sp.audio_features(tracks=[bare_id])[0]
            if features is None:
                # no feature record came back for this id; skip the track
                skipped_tracks += 1
                continue

            # populate the row with desired contents from track and Spotify API
            playlist_track_rows.append(
                [track['track_uri'], track['track_name'], track['artist_name'],
                 track['album_name'], track['duration_ms']]
                + [features[col] for col in audio_feature_cols]
            )
        except Exception:
            # Still best-effort (a failing track is skipped), but unlike a
            # bare `except:` this lets KeyboardInterrupt through, so the
            # long-running loop can actually be interrupted.
            skipped_tracks += 1

# generate the dataframe in one step; an empty result still has all columns
df = pd.DataFrame(playlist_track_rows, columns=playlist_track_cols)
if skipped_tracks:
    print(f'Skipped {skipped_tracks} tracks without usable audio features')

In [18]:
# OLD APPROACH: Extracting track data from Million Playlists Dataset

# read in JSON file; the context manager closes the handle as soon as parsing
# is done (a bare open() inside json.load() left it open), and the explicit
# encoding keeps non-ASCII artist/album names intact on every platform
with open('/Users/ken/Downloads/spotify_million_playlist_dataset/data/mpd.slice.0-999.json',
          encoding='utf-8') as mpd_handle:
    file = json.load(mpd_handle)
# store desired contents in a dataframe (one row per playlist)
temp = pd.DataFrame(file['playlists'])

In [29]:
# Inspect the raw track records of the first playlist to see the available keys
file['playlists'][0]['tracks']

[{'pos': 0,
  'artist_name': 'Missy Elliott',
  'track_uri': 'spotify:track:0UaMYEvWZi0ZqiDOoHU3YI',
  'artist_uri': 'spotify:artist:2wIVse2owClT7go1WT98tk',
  'track_name': 'Lose Control (feat. Ciara & Fat Man Scoop)',
  'album_uri': 'spotify:album:6vV5UrXcfyQD1wu4Qo2I9K',
  'duration_ms': 226863,
  'album_name': 'The Cookbook'},
 {'pos': 1,
  'artist_name': 'Britney Spears',
  'track_uri': 'spotify:track:6I9VzXrHxO9rA9A5euc8Ak',
  'artist_uri': 'spotify:artist:26dSoYclwsYLMAKD3tpOr4',
  'track_name': 'Toxic',
  'album_uri': 'spotify:album:0z7pVBGOD7HCIB7S8eLkLI',
  'duration_ms': 198800,
  'album_name': 'In The Zone'},
 {'pos': 2,
  'artist_name': 'Beyoncé',
  'track_uri': 'spotify:track:0WqIKmW4BTrj3eJFmnCKMv',
  'artist_uri': 'spotify:artist:6vWDO969PvNqNYHIOW5v0m',
  'track_name': 'Crazy In Love',
  'album_uri': 'spotify:album:25hVFAxTlDvXbx2X2QkUkE',
  'duration_ms': 235933,
  'album_name': 'Dangerously In Love (Alben für die Ewigkeit)'},
 {'pos': 3,
  'artist_name': 'Justin Timb

In [21]:
# Display the playlist-level table built from file['playlists']
temp

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,Throwbacks,false,0,1493424000,52,47,1,"[{'pos': 0, 'artist_name': 'Missy Elliott', 't...",6,11532414,37,
1,Awesome Playlist,false,1,1506556800,39,23,1,"[{'pos': 0, 'artist_name': 'Survivor', 'track_...",5,11656470,21,
2,korean,false,2,1505692800,64,51,1,"[{'pos': 0, 'artist_name': 'Hoody', 'track_uri...",18,14039958,31,
3,mat,false,3,1501027200,126,107,1,"[{'pos': 0, 'artist_name': 'Camille Saint-Saën...",4,28926058,86,
4,90s,false,4,1401667200,17,16,2,"[{'pos': 0, 'artist_name': 'The Smashing Pumpk...",7,4335282,16,
...,...,...,...,...,...,...,...,...,...,...,...,...
995,old,false,995,1507852800,41,40,1,"[{'pos': 0, 'artist_name': 'Katrina', 'track_u...",8,9917901,36,
996,Daze,false,996,1479254400,17,17,1,"[{'pos': 0, 'artist_name': 'PARTYNEXTDOOR', 't...",13,3699248,15,
997,rap,false,997,1410307200,119,98,1,"[{'pos': 0, 'artist_name': 'LoveRance', 'track...",63,27538723,82,
998,Country,false,998,1507939200,108,75,1,"[{'pos': 0, 'artist_name': 'Hunter Hayes', 'tr...",37,24950143,40,


In [26]:
# Keep only the 'tracks' column of the playlist table, as its own DataFrame
new = pd.DataFrame(data=temp['tracks'])

In [27]:
# Display the single-column frame of per-playlist track lists
new

Unnamed: 0,tracks
0,"[{'pos': 0, 'artist_name': 'Missy Elliott', 't..."
1,"[{'pos': 0, 'artist_name': 'Survivor', 'track_..."
2,"[{'pos': 0, 'artist_name': 'Hoody', 'track_uri..."
3,"[{'pos': 0, 'artist_name': 'Camille Saint-Saën..."
4,"[{'pos': 0, 'artist_name': 'The Smashing Pumpk..."
...,...
995,"[{'pos': 0, 'artist_name': 'Katrina', 'track_u..."
996,"[{'pos': 0, 'artist_name': 'PARTYNEXTDOOR', 't..."
997,"[{'pos': 0, 'artist_name': 'LoveRance', 'track..."
998,"[{'pos': 0, 'artist_name': 'Hunter Hayes', 'tr..."


In [47]:
# Persist the current `df`. index=True is the pandas default, spelled out
# here because it writes the row index as an extra, unnamed first column.
df.to_csv(
    '../data/tracks_dataset.csv',
    index=True,
)