In [None]:
import os
import json
import pandas as pd
import time
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import cred

client_credentials_manager = SpotifyClientCredentials(cred.client_id, cred.client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

path_to_json = '/Users/iancurtis/Documents/coding/sta518_spotify/musicanalysis/data/spotify_million_playlist_dataset/data'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]


In [None]:
track_uris = []


for index, js in enumerate(json_files):
    with open(os.path.join(path_to_json, js)) as json_file:
        json_text = json.load(json_file)
        
        for j in range(len(json_text["playlists"])):
          for k in range(len(json_text["playlists"][j]["tracks"])):
            
            track_uri = json_text["playlists"][j]["tracks"][k]['track_uri']
            track_uris.append(track_uri)
            
        if (index + 1) % 50 == 0:
          print(f'I am on file {index + 1}')

list_uris = list(dict.fromkeys(track_uris))

# 17 minutes

In [3]:
# How many elements each
# list should have
n = 500 
  
# using list comprehension
chunks = [list_uris[i * n:(i + 1) * n] for i in range((len(list_uris) + n - 1) // n )] 

df = pd.DataFrame(columns=['album_uri', 'artist_uri', 'artist_name', 'album_name', 'release_date', 'disc_number', 'popularity','track_number', 'duration', 'explicit', 'track_name', 'track_uri', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'intstrumentalness', 'liveness', 'valence', 'tempo', 'time_signature'])

#33 seconds

In [11]:
def get_album_tracks(seed_tracks):
  print('Starting to get album tracks.')
  song_uris = []
  for i in range(len(seed_tracks)):
    try:
      track_info = sp.track(seed_tracks[i])
    except:
      continue
    album_uri = track_info['album']['uri']
    offset = 0

    while True:
      try:
        album_info = sp.album_tracks(album_uri, offset=offset)
      except:
        break
      if len(album_info['items']) == 0:
          break
      for j in range(len(album_info['items'])):
          if album_info['items'][j] == None:
              continue
          else:
              song_uris.append(album_info['items'][j]['uri'])  
      offset = offset + len(album_info['items'])
      time.sleep(0.001)

    if i == 249:
      print(f'I am on song 250 out of {len(seed_tracks)}')
    time.sleep(0.001)
    
  return list(set(song_uris))

def get_track_features(song_uris):
  print('Starting to get track features.')
  print(f'There are {len(song_uris)} to analyze.')
  dframe = pd.DataFrame(columns=['album_uri', 'artist_uri', 'artist_name', 'album_name', 'release_date', 'disc_number', 'popularity','track_number', 'duration', 'explicit', 'track_name', 'track_uri', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'intstrumentalness', 'liveness', 'valence', 'tempo', 'time_signature'])
  for i in range(len(song_uris)):
    
    try:
      track_info = sp.track(song_uris[i])
    
      album_uri = track_info['album']['uri']
      artist_uri = track_info['artists'][0]['uri']
      artist_name = track_info['artists'][0]['name']
      album_name = track_info['album']['name']
      release_date = track_info['album']['release_date']
      disc_number = track_info['disc_number']
      popularity = track_info['popularity']
      track_number = track_info['track_number']
      duration = track_info['duration_ms']
      explicit = track_info['explicit']
      track_name = track_info['name']
      track_uri = track_info['uri']
      time.sleep(0.001)
    except:
      continue

    try:
      track_features = sp.audio_features(song_uris[i])
      
      danceability = track_features[0]['danceability']
      energy = track_features[0]['energy']
      key = track_features[0]['key']
      loudness = track_features[0]['loudness']
      mode = track_features[0]['mode']
      speechiness = track_features[0]['speechiness']
      acousticness = track_features[0]['acousticness']
      instrumentalness = track_features[0]['instrumentalness']
      liveness = track_features[0]['liveness']
      valence = track_features[0]['valence']
      tempo = track_features[0]['tempo']
      time_signature = track_features[0]['time_signature']
      time.sleep(0.001)
    except:
      continue
    
    observation = {
      'album_uri': album_uri,
      'artist_uri': artist_uri,
      'artist_name': artist_name,
      'album_name': album_name,
      'release_date': release_date,
      'disc_number': disc_number,
      'popularity': popularity,
      'track_number': track_number,
      'duration': duration,
      'explicit': explicit,
      'track_name': track_name,
      'track_uri': track_uri,
      'danceability': danceability,
      'energy': energy,
      'key': key,
      'loudness': loudness,
      'mode': mode,
      'speechiness': speechiness,
      'acousticness': acousticness,
      'instrumentalness': instrumentalness,
      'liveness': liveness,
      'valence': valence,
      'tempo': tempo,
      'time_signature': time_signature
    }
    
    dframe = dframe.append(pd.DataFrame([observation]))
    if (i + 1) % 2000 == 0:
      print(f'Currently on song {i + 1} out of {len(song_uris)}')
  return dframe

def make_df(index):
  print(f'Starting chunk {index + 1}')
  t_1 = time.time()
  chunk_uris = get_album_tracks(chunks[index])
  chunk_df = get_track_features(chunk_uris)
  t_2 = time.time()
  print(f'Completing chunk {index + 1} took {(t_2 - t_1)/60} minutes.')
  return chunk_df

In [12]:
# len(chunks) = 4525
# currently running indices 0 - 49 (which means the first 50 chunks)

indices = [i for i in range(0, 50)]

for index in indices:
  dfpiece = make_df(index)
  df = df.append(dfpiece)
  print(f'The dimensions of the data frame before droppping duplicates is {df.shape}')
  df = df.drop_duplicates(subset = "track_uri")
  print(f'The dimensions of the data frame after droppping duplicates is {df.shape}')


Starting chunk 1
Starting to get album tracks.
I am on song 250 out of 500
Starting to get track features.
There are 4925 to analyze.
Currently on song 2000 out of 4925
Currently on song 4000 out of 4925
Completing chunk 1 took 18.603561635812124 minutes.
The dimensions of the data frame before droppping duplicates is (4925, 25)
The dimensions of the data frame after droppping duplicates is (4925, 25)
Starting chunk 2
Starting to get album tracks.


HTTP Error for GET to https://api.spotify.com/v1/tracks/4WHjf37BBXUo3WYBmJPdoU with Params: {'market': None} returned 404 due to non existing id


I am on song 250 out of 500
Starting to get track features.
There are 5502 to analyze.
Currently on song 2000 out of 5502
Currently on song 4000 out of 5502
Completing chunk 2 took 20.48076589902242 minutes.
The dimensions of the data frame before droppping duplicates is (10427, 25)
The dimensions of the data frame after droppping duplicates is (10266, 25)
Starting chunk 3
Starting to get album tracks.
I am on song 250 out of 500
Starting to get track features.
There are 4032 to analyze.
Currently on song 2000 out of 4032
Currently on song 4000 out of 4032
Completing chunk 3 took 15.680212632815044 minutes.
The dimensions of the data frame before droppping duplicates is (14297, 25)
The dimensions of the data frame after droppping duplicates is (13968, 25)
Starting chunk 4
Starting to get album tracks.
I am on song 250 out of 500
Starting to get track features.
There are 5495 to analyze.
Currently on song 2000 out of 5495
Currently on song 4000 out of 5495
Completing chunk 4 took 20.363

HTTP Error for GET to https://api.spotify.com/v1/tracks/2aeURFmfpaB6NGp09TCdzi with Params: {'market': None} returned 404 due to non existing id


Starting to get track features.
There are 4218 to analyze.
Currently on song 2000 out of 4218
Currently on song 4000 out of 4218
Completing chunk 47 took 16.470179716746014 minutes.
The dimensions of the data frame before droppping duplicates is (158448, 25)
The dimensions of the data frame after droppping duplicates is (157212, 25)
Starting chunk 48
Starting to get album tracks.
I am on song 250 out of 500
Starting to get track features.
There are 4817 to analyze.
Currently on song 2000 out of 4817
Currently on song 4000 out of 4817
Completing chunk 48 took 18.478483800093333 minutes.
The dimensions of the data frame before droppping duplicates is (162029, 25)
The dimensions of the data frame after droppping duplicates is (160697, 25)
Starting chunk 49
Starting to get album tracks.
I am on song 250 out of 500
Starting to get track features.
There are 4622 to analyze.
Currently on song 2000 out of 4622
Currently on song 4000 out of 4622
Completing chunk 49 took 17.863225396474203 minut

In [None]:
df.to_csv(path_or_buf=r'/Users/iancurtis/Documents/coding/sta518_spotify/musicanalysis/data/chunk_1.csv', index = False, index_label = False)