In [None]:
import os
import json
import pandas as pd
import time
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import cred
import glob

# Authenticate with the client-credentials flow, using the id/secret stored
# in the local `cred` module, and build the shared Spotify API client `sp`.
client_credentials_manager = SpotifyClientCredentials(
    client_id=cred.client_id,
    client_secret=cred.client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Directory that is searched for the dataset's JSON files.
json_dir_name = '/Volumes/MUSIC/spotify_million_playlist_dataset/data'

# Every *.json path in that directory, in lexicographic order. The order is
# significant: chunk positions used later are derived from it.
json_pattern = os.path.join(json_dir_name, '*.json')
json_files = sorted(glob.glob(json_pattern))


In [None]:
# Every track URI of every playlist in every dataset file, duplicates included.
track_uris = []

for index, js in enumerate(json_files):
    with open(os.path.join(json_dir_name, js), encoding='utf-8') as json_file:
        json_text = json.load(json_file)

    for playlist in json_text["playlists"]:
        track_uris.extend(track['track_uri'] for track in playlist["tracks"])

    # Progress report after every 50th file.
    if (index + 1) % 50 == 0:
        print(f'I am on file {index + 1}')

# Drop duplicates while keeping the order of first appearance.
list_uris = list(dict.fromkeys(track_uris))

# Runtime: about 25 minutes.

In [None]:
# Maximum number of track URIs per chunk.
n = 500

# Cut list_uris into consecutive slices of at most n items; only the final
# slice can be shorter.
chunks = [list_uris[start:start + n] for start in range(0, len(list_uris), n)]

print(len(chunks))
print(chunks[0][0])
# Runtime: about 20 seconds.

In [None]:
def get_album_tracks(seed_tracks):
  """Expand seed tracks into the URIs of every track on their albums.

  Each seed track is looked up with ``sp.track`` to find its album, and the
  album's track listing is then paged through with ``sp.album_tracks``.

  Args:
    seed_tracks: Sequence of track URIs/IDs accepted by ``sp.track``.

  Returns:
    A list of unique track URIs in order of first appearance, so the result
    is reproducible from run to run.

  Notes:
    Lookups are best-effort: a seed track whose lookup fails is skipped, and
    an album whose paging fails keeps whatever pages were already read.
    Only ``Exception`` subclasses are swallowed, so a kernel interrupt
    (``KeyboardInterrupt``) still stops the run.
  """
  print('Starting to get album tracks.')
  song_uris = {}          # dict used as an insertion-ordered set
  fetched_albums = set()  # albums whose full track listing was already read

  for i, seed_uri in enumerate(seed_tracks):
    try:
      track_info = sp.track(seed_uri)
    except Exception:
      continue
    album_uri = track_info['album']['uri']

    # Many seed tracks share an album; downloading it again adds nothing.
    if album_uri not in fetched_albums:
      offset = 0
      complete = True
      while True:
        try:
          album_info = sp.album_tracks(album_uri, offset=offset)
        except Exception:
          # Keep the pages read so far, but allow a later seed track from
          # the same album to retry the download.
          complete = False
          break
        items = album_info['items']
        if not items:
          break
        for item in items:
          if item is not None:
            song_uris[item['uri']] = None
        offset += len(items)
        time.sleep(0.001)
      if complete:
        fetched_albums.add(album_uri)

    if i == 249:
      print(f'I am on song 250 out of {len(seed_tracks)}')
    time.sleep(0.001)

  return list(song_uris)

def get_track_features(song_uris):
  """Fetch metadata and audio features for each track into one DataFrame.

  For every URI, ``sp.track`` supplies the album/artist/track metadata and
  ``sp.audio_features`` supplies the audio descriptors. A track is skipped
  entirely if either call fails or returns incomplete data.

  Args:
    song_uris: Sequence of track URIs/IDs accepted by the Spotify client.

  Returns:
    A DataFrame with one row per successfully processed track and exactly
    24 columns (12 metadata columns followed by 12 audio-feature columns).
    The frame is empty, but still has those columns, when no track succeeds.

  Notes:
    Only ``Exception`` subclasses are swallowed, so a kernel interrupt
    (``KeyboardInterrupt``) still stops the run.
  """
  audio_keys = ('danceability', 'energy', 'key', 'loudness', 'mode',
                'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                'valence', 'tempo', 'time_signature')
  columns = ['album_uri', 'artist_uri', 'artist_name', 'album_name',
             'release_date', 'disc_number', 'popularity', 'track_number',
             'duration', 'explicit', 'track_name', 'track_uri', *audio_keys]

  print('Starting to get track features.')
  print(f'There are {len(song_uris)} to analyze.')

  # Rows are collected in a list and the frame is built once at the end:
  # DataFrame.append no longer exists in pandas 2.x and growing a frame
  # row by row is quadratic.
  records = []
  for i, song_uri in enumerate(song_uris):

    try:
      track_info = sp.track(song_uri)
      album = track_info['album']
      lead_artist = track_info['artists'][0]  # only the first listed artist is kept
      observation = {
        'album_uri': album['uri'],
        'artist_uri': lead_artist['uri'],
        'artist_name': lead_artist['name'],
        'album_name': album['name'],
        'release_date': album['release_date'],
        'disc_number': track_info['disc_number'],
        'popularity': track_info['popularity'],
        'track_number': track_info['track_number'],
        'duration': track_info['duration_ms'],
        'explicit': track_info['explicit'],
        'track_name': track_info['name'],
        'track_uri': track_info['uri'],
      }
      time.sleep(0.001)
    except Exception:
      continue

    try:
      # audio_features returns a list; its single entry may be None, in which
      # case the subscript below raises and the track is skipped.
      features = sp.audio_features(song_uri)[0]
      for feature_name in audio_keys:
        observation[feature_name] = features[feature_name]
      time.sleep(0.001)
    except Exception:
      continue

    records.append(observation)
    if (i + 1) % 2000 == 0:
      print(f'Currently on song {i + 1} out of {len(song_uris)}')

  return pd.DataFrame(records, columns=columns)

def make_df(index):
  """Build the track-feature table for one chunk and report the time taken.

  Args:
    index: 0-based position of the chunk inside the module-level ``chunks``.

  Returns:
    Whatever ``get_track_features`` returns for the album-expanded chunk.
  """
  print(f'Starting chunk {index + 1}')
  started = time.time()
  chunk_df = get_track_features(get_album_tracks(chunks[index]))
  elapsed_minutes = round((time.time() - started) / 60, 2)
  print(f'Completing chunk {index + 1} took {elapsed_minutes} minutes.')
  return chunk_df

In [36]:
# Empty accumulator that the per-chunk results are merged into. The column
# names must match the keys of the per-track records exactly; a misspelled
# name here adds an extra all-NaN column to the merged table.
df = pd.DataFrame(columns=['album_uri', 'artist_uri', 'artist_name', 'album_name', 'release_date', 'disc_number', 'popularity','track_number', 'duration', 'explicit', 'track_name', 'track_uri', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature'])

In [None]:
# len(chunks) = 4525

# 0-based chunk positions handled by this run (326 through 348 inclusive).
indices = list(range(326, 349))

for index in indices:
  dfpiece = make_df(index)
  # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
  # way to stack the new chunk under the rows collected so far.
  df = pd.concat([df, dfpiece])
  print(f'The dimensions of the data frame before droppping duplicates are {df.shape}')
  # Different chunks can expand to the same albums, so de-duplicate per track.
  df = df.drop_duplicates(subset = "track_uri")
  print(f'The dimensions of the data frame after droppping duplicates are {df.shape}')


In [35]:
# Persist the accumulated table without the index column.
# NOTE(review): the file name says chunks 275-326, while the processing loop
# visible in this notebook uses range(326, 349) - confirm the name matches
# the chunk range actually held in `df` before running this cell.
df.to_csv(
    r'/Users/iancurtis/Documents/coding/sta518_spotify/musicanalysis/data/chunks275-326.csv',
    index=False,
    index_label=False,
)