In [None]:
# Import libraries

import os
import json
import pandas as pd
import time
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import cred
import glob

# Set up Spotify credentials (to access API)
# cred.py is a separate file with the registered app id, secret, and redirect url (private file)
client_credentials_manager = SpotifyClientCredentials(cred.client_id, cred.client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Locate directory with original files
json_dir_name = '/Volumes/MUSIC/spotify_million_playlist_dataset/data'

# Import all data files with the .json extension
json_pattern = os.path.join(json_dir_name, '*.json')
json_files = glob.glob(json_pattern)
json_files.sort()


In [None]:
track_uris = []

# For each JSON file in the data folder, loop through each playlist in the file
# For each playist in the file, loop through each track in that playlsit
# For each track in that playlist, extract the track URI and add it to the global list
for index, js in enumerate(json_files):
    with open(os.path.join(json_dir_name, js), encoding='utf-8') as json_file:
        json_text = json.load(json_file)
        
        for j in range(len(json_text["playlists"])):
          for k in range(len(json_text["playlists"][j]["tracks"])):
            
            track_uri = json_text["playlists"][j]["tracks"][k]['track_uri']
            track_uris.append(track_uri)
            
        if (index + 1) % 50 == 0:
          print(f'I am on file {index + 1}')

# Remove duplicates, keeping order (in case restarting is needed)
list_uris = list(dict.fromkeys(track_uris))

# 25 minutes

In [None]:
n = 500 
  
# Break list of URIs into chunks of n = 500 using list comprehension
chunks = [list_uris[i * n:(i + 1) * n] for i in range((len(list_uris) + n - 1) // n )] 

#20 seconds

In [None]:
def get_album_tracks(seed_tracks):
  '''
  Takes a list of track URIs and gives, for each track, all of the tracks on the album
  '''
  print('Starting to get album tracks.')
  song_uris = []

  for i in range(len(seed_tracks)):
    # Error catching; program will fail without an internet connection
    try:
      track_info = sp.track(seed_tracks[i])
    except:
      continue
    album_uri = track_info['album']['uri']
    offset = 0

    while True:
      try:
        album_info = sp.album_tracks(album_uri, offset=offset)
      except:
        break
      if len(album_info['items']) == 0:
          break
      for j in range(len(album_info['items'])):
          if album_info['items'][j] == None:
              continue
          else:
              song_uris.append(album_info['items'][j]['uri'])  
      offset = offset + len(album_info['items'])
      time.sleep(0.001)

    if i == 249:
      print(f'I am on song 250 out of {len(seed_tracks)}')
    time.sleep(0.001)
    
  # Removes duplicates (does not keep order, but all of the songs in the final list will be the same as long 
  # as the chunks remain in the same order)
  return list(set(song_uris))

def get_track_features(song_uris):
  '''
  Gets track info and track audio features when given a list of song URIs
  '''
  print('Starting to get track features.')
  print(f'There are {len(song_uris)} to analyze.')

  # Create empty data frame to append to
  dframe = pd.DataFrame(columns=['album_uri', 'artist_uri', 'artist_name', 'album_name', 'release_date', 'disc_number', 'popularity','track_number', 'duration', 'explicit', 'track_name', 'track_uri', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'intstrumentalness', 'liveness', 'valence', 'tempo', 'time_signature'])
  
  # For each song in the inputted list of URIs, grab the song's info and features
  # Create a dictionary of the information and append that info to the dataframe created above
  # Returns completed dataframe of info for all songs in the inputted list
  for i in range(len(song_uris)):
    
    # Will fail without an internet connection
    try:
      track_info = sp.track(song_uris[i])
    
      album_uri = track_info['album']['uri']
      artist_uri = track_info['artists'][0]['uri']
      artist_name = track_info['artists'][0]['name']
      album_name = track_info['album']['name']
      release_date = track_info['album']['release_date']
      disc_number = track_info['disc_number']
      popularity = track_info['popularity']
      track_number = track_info['track_number']
      duration = track_info['duration_ms']
      explicit = track_info['explicit']
      track_name = track_info['name']
      track_uri = track_info['uri']
      time.sleep(0.001)
    except:
      continue

    try:
      track_features = sp.audio_features(song_uris[i])
      
      danceability = track_features[0]['danceability']
      energy = track_features[0]['energy']
      key = track_features[0]['key']
      loudness = track_features[0]['loudness']
      mode = track_features[0]['mode']
      speechiness = track_features[0]['speechiness']
      acousticness = track_features[0]['acousticness']
      instrumentalness = track_features[0]['instrumentalness']
      liveness = track_features[0]['liveness']
      valence = track_features[0]['valence']
      tempo = track_features[0]['tempo']
      time_signature = track_features[0]['time_signature']
      time.sleep(0.001)
    except:
      continue
    
    observation = {
      'album_uri': album_uri,
      'artist_uri': artist_uri,
      'artist_name': artist_name,
      'album_name': album_name,
      'release_date': release_date,
      'disc_number': disc_number,
      'popularity': popularity,
      'track_number': track_number,
      'duration': duration,
      'explicit': explicit,
      'track_name': track_name,
      'track_uri': track_uri,
      'danceability': danceability,
      'energy': energy,
      'key': key,
      'loudness': loudness,
      'mode': mode,
      'speechiness': speechiness,
      'acousticness': acousticness,
      'instrumentalness': instrumentalness,
      'liveness': liveness,
      'valence': valence,
      'tempo': tempo,
      'time_signature': time_signature
    }
    
    dframe = dframe.append(pd.DataFrame([observation]))
    if (i + 1) % 2000 == 0:
      print(f'Currently on song {i + 1} out of {len(song_uris)}')
  return dframe


def make_df(index):
  '''
  Uses chunks to get all album tracks for each song in the chunk and all track features for each song on
  the albums. Keeps track of how long each chunk takes.
  '''
  print(f'Starting chunk {index + 1}')
  t_1 = time.time()
  chunk_uris = get_album_tracks(chunks[index])
  chunk_df = get_track_features(chunk_uris)
  t_2 = time.time()
  print(f'Completing chunk {index + 1} took {round((t_2 - t_1)/60, 2)} minutes.')
  return chunk_df

In [45]:
# Create empty dataframe for final output
df = pd.DataFrame(columns=['album_uri', 'artist_uri', 'artist_name', 'album_name', 'release_date', 'disc_number', 'popularity','track_number', 'duration', 'explicit', 'track_name', 'track_uri', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'intstrumentalness', 'liveness', 'valence', 'tempo', 'time_signature'])

In [46]:
# len(chunks) = 4525

# With inputted indices, loops through chunks and supplies each chunk to the functions above, printing 
# the dimensions
# Try to stay below 140,000 rows for file size concerns
indices = [i for i in range(1421, 1445)]

for index in indices:
  dfpiece = make_df(index)
  df = df.append(dfpiece)
  print(f'The dimensions of the data frame before droppping duplicates are {df.shape}')
  df = df.drop_duplicates(subset = "track_uri")
  print(f'The dimensions of the data frame after droppping duplicates are {df.shape}')


Starting chunk 1422
Starting to get album tracks.
I am on song 250 out of 500
Starting to get track features.
There are 5093 to analyze.
Currently on song 2000 out of 5093
Currently on song 4000 out of 5093
Completing chunk 1422 took 20.71 minutes.
The dimensions of the data frame before droppping duplicates are (5093, 25)
The dimensions of the data frame after droppping duplicates are (5093, 25)
Starting chunk 1423
Starting to get album tracks.
I am on song 250 out of 500
Starting to get track features.
There are 4514 to analyze.
Currently on song 2000 out of 4514
Currently on song 4000 out of 4514
Completing chunk 1423 took 17.63 minutes.
The dimensions of the data frame before droppping duplicates are (9607, 25)
The dimensions of the data frame after droppping duplicates are (9573, 25)
Starting chunk 1424
Starting to get album tracks.
I am on song 250 out of 500
Starting to get track features.
There are 4354 to analyze.
Currently on song 2000 out of 4354
Currently on song 4000 out o

HTTP Error for GET to https://api.spotify.com/v1/tracks/3JJL4OmS8VmYqTODuIlKbq with Params: {'market': None} returned 404 due to non existing id


Starting to get track features.
There are 3843 to analyze.
Currently on song 2000 out of 3843
Completing chunk 1428 took 15.22 minutes.
The dimensions of the data frame before droppping duplicates are (31856, 25)
The dimensions of the data frame after droppping duplicates are (31805, 25)
Starting chunk 1429
Starting to get album tracks.
I am on song 250 out of 500
Starting to get track features.
There are 5704 to analyze.
Currently on song 2000 out of 5704
Currently on song 4000 out of 5704
Completing chunk 1429 took 21.5 minutes.
The dimensions of the data frame before droppping duplicates are (37508, 25)
The dimensions of the data frame after droppping duplicates are (37453, 25)
Starting chunk 1430
Starting to get album tracks.


HTTP Error for GET to https://api.spotify.com/v1/tracks/6CNzVurjdLR4seIfAsRxai with Params: {'market': None} returned 404 due to non existing id


I am on song 250 out of 500
Starting to get track features.
There are 4077 to analyze.
Currently on song 2000 out of 4077
Currently on song 4000 out of 4077
Completing chunk 1430 took 16.31 minutes.
The dimensions of the data frame before droppping duplicates are (41530, 25)
The dimensions of the data frame after droppping duplicates are (41460, 25)
Starting chunk 1431
Starting to get album tracks.
I am on song 250 out of 500
Starting to get track features.
There are 4524 to analyze.
Currently on song 2000 out of 4524
Currently on song 4000 out of 4524
Completing chunk 1431 took 17.53 minutes.
The dimensions of the data frame before droppping duplicates are (45983, 25)
The dimensions of the data frame after droppping duplicates are (45968, 25)
Starting chunk 1432
Starting to get album tracks.
I am on song 250 out of 500
Starting to get track features.
There are 4533 to analyze.
Currently on song 2000 out of 4533
Currently on song 4000 out of 4533
Completing chunk 1432 took 17.59 minute

HTTP Error for GET to https://api.spotify.com/v1/tracks/37TWuMW8fj2p9mfjNAQhP5 with Params: {'market': None} returned 404 due to non existing id


Starting to get track features.
There are 4116 to analyze.
Currently on song 2000 out of 4116
Currently on song 4000 out of 4116
Completing chunk 1433 took 16.24 minutes.
The dimensions of the data frame before droppping duplicates are (54556, 25)
The dimensions of the data frame after droppping duplicates are (54434, 25)
Starting chunk 1434
Starting to get album tracks.


HTTP Error for GET to https://api.spotify.com/v1/tracks/0soGt31beG9D6zsDjA75Vy with Params: {'market': None} returned 404 due to non existing id


Starting to get track features.
There are 5363 to analyze.
Currently on song 2000 out of 5363
Currently on song 4000 out of 5363
Completing chunk 1434 took 20.35 minutes.
The dimensions of the data frame before droppping duplicates are (59797, 25)
The dimensions of the data frame after droppping duplicates are (59749, 25)
Starting chunk 1435
Starting to get album tracks.
I am on song 250 out of 500
Starting to get track features.
There are 4946 to analyze.
Currently on song 2000 out of 4946
Currently on song 4000 out of 4946
Completing chunk 1435 took 19.28 minutes.
The dimensions of the data frame before droppping duplicates are (64693, 25)
The dimensions of the data frame after droppping duplicates are (64514, 25)
Starting chunk 1436
Starting to get album tracks.
I am on song 250 out of 500
Starting to get track features.
There are 5300 to analyze.
Currently on song 2000 out of 5300
Currently on song 4000 out of 5300
Completing chunk 1436 took 20.53 minutes.
The dimensions of the dat

HTTP Error for GET to https://api.spotify.com/v1/tracks/6SctaFY2Rgb3R50I7tiWlK with Params: {'market': None} returned 404 due to non existing id


Starting to get track features.
There are 4099 to analyze.
Currently on song 2000 out of 4099
Currently on song 4000 out of 4099
Completing chunk 1439 took 16.3 minutes.
The dimensions of the data frame before droppping duplicates are (81481, 25)
The dimensions of the data frame after droppping duplicates are (81330, 25)
Starting chunk 1440
Starting to get album tracks.
I am on song 250 out of 500
Starting to get track features.
There are 4511 to analyze.
Currently on song 2000 out of 4511
Currently on song 4000 out of 4511
Completing chunk 1440 took 17.52 minutes.
The dimensions of the data frame before droppping duplicates are (85841, 25)
The dimensions of the data frame after droppping duplicates are (85729, 25)
Starting chunk 1441
Starting to get album tracks.
I am on song 250 out of 500
Starting to get track features.
There are 4516 to analyze.
Currently on song 2000 out of 4516
Currently on song 4000 out of 4516
Completing chunk 1441 took 17.48 minutes.
The dimensions of the data

In [None]:
# Write dataframe to disk at the file path
df.to_csv(path_or_buf=r'/Users/iancurtis/Documents/coding/musicanalysis/original_data/chunks1401-1421.csv', index = False, index_label = False)