# 01c_Collection: Retrieving Addtl Audio

In [2]:
import csv
import json
import os
import pickle
import re
import time

import numpy as np
import pandas as pd

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv());

## 1. Loading in Prior Info

### 1a. Loading in Artist List
Not the final artist list - this is the initial artist list before removing non-searchable artists.

In [10]:
artists = pd.read_csv('../data/artist_list_v2.csv')

In [11]:
artists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2426 entries, 0 to 2425
Data columns (total 4 columns):
s_artist_id    2426 non-null object
name           2426 non-null object
popularity     2426 non-null int64
followers      2426 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 75.9+ KB


In [15]:
artists['s_artist_id'].head()

0    6Ff53KvcvAj5U7Z1vojB5o
1    1bDWGdIC2hardyt55nlQgG
2    0REMf7H0VP6DwfZ9MbuWph
3    0MBIKH9DjtBkv8O3nS6szj
4    7urq0VfqxEYEEiZUkebXT4
Name: s_artist_id, dtype: object

In [75]:
artists[artists['name'] == 'AC/DC']

Unnamed: 0,s_artist_id,name,popularity,followers
41,711MCceyCBcFnzjGY4Q7Un,AC/DC,83,8577740.0


##### Dropping duplicate records before I query Spotify API

In [71]:
artists.drop_duplicates(subset='s_artist_id', inplace=True)

## 2. Querying Spotify API through Spotipy

From the looks of things, I'll need to do the following to retrieve as many track id's as possible for the artists I've indexed:

1. [ARTIST ENPOINT - "Get an Artist's Albums"] Query as many albums as possible for each artist id
   - Store the corresponding album_id & name along with artist_id in a new table
2. [ALBUM ENDPOINT - "Get an Album's Tracks"] Query each album for its underlying tracks
   - Store the corresponding information that I stored previously for each track
3. [TRACK ENDPOINT - "Get Audio Features for Several Tracks"] Query each track for Audio Features
   - Store the audio features as I had before
4. [TRACK ENDPOINT - "Get Audio Analysis for a Track"] Query each track for Audio Analysis
   - Store the audio analysis as I had before

In [12]:
ccm = SpotifyClientCredentials(client_id= \
                               os.getenv('S_CLIENT_ID'),
                               client_secret=os.getenv('S_CLIENT_SECRET'))
sp = spotipy.Spotify(client_credentials_manager=ccm)

In [16]:
example = sp.artist_albums('0REMf7H0VP6DwfZ9MbuWph', 
                           album_type=['album', 'single'],limit=5)

In [99]:
example['items'][0]['artists'][0]

{'external_urls': {'spotify': 'https://open.spotify.com/artist/0REMf7H0VP6DwfZ9MbuWph'},
 'href': 'https://api.spotify.com/v1/artists/0REMf7H0VP6DwfZ9MbuWph',
 'id': '0REMf7H0VP6DwfZ9MbuWph',
 'name': '10 Years',
 'type': 'artist',
 'uri': 'spotify:artist:0REMf7H0VP6DwfZ9MbuWph'}

In [18]:
example['next']

'https://api.spotify.com/v1/artists/0REMf7H0VP6DwfZ9MbuWph/albums?offset=5&limit=5&include_groups=album'

In [97]:
example['items'][0]['artists']

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/0REMf7H0VP6DwfZ9MbuWph'},
  'href': 'https://api.spotify.com/v1/artists/0REMf7H0VP6DwfZ9MbuWph',
  'id': '0REMf7H0VP6DwfZ9MbuWph',
  'name': '10 Years',
  'type': 'artist',
  'uri': 'spotify:artist:0REMf7H0VP6DwfZ9MbuWph'}]

### 2a. Retreiving Albums from Artist Endpoint

I decided to store each album search as a separate json file, so that I didn't have one potentially huge json file that would kill my AWS instance.

In [77]:
def get_albums(spot, artists):
    '''
    Queries spotify api via Spotipy to retrieve as many album results as possible.
    Takes 2 arguments:
    spot = spotipy instance
    artists = dataframe with spotify artist names & corresponding ids
    '''
    artist_count = 0
    for artist in artists['s_artist_id']:
        counter = 0
        n = 50
        results = spot.artist_albums(artist, 
                                     album_type='album,single',
                                     limit=50)
        with open('../data/albums/{}_{}.json'.format\
                  (artists['name'][artists['s_artist_id'] == artist].item().replace("/","").strip(), counter),
                  'w') as f:
            json.dump(results, f)
        while results['next'] is not None:
            counter += 1
            results = \
            spot.artist_albums(artist, 
                               album_type='album,single',
                               limit=50, offset=n)
            n += 50
            with open('../data/albums/{}_{}.json'.format\
                      (artists['name'][artists['s_artist_id'] == artist].item().replace("/","").strip(), counter),
                      'w') as f:
                json.dump(results, f)
        artist_count += 1
        time.sleep(1)
        if artist_count % 100 == 0:
            print('{} artists completed'.format(artist_count))

In [78]:
get_albums(sp, artists.iloc[41:])

100 artists completed
200 artists completed
300 artists completed
400 artists completed
500 artists completed
600 artists completed
700 artists completed
800 artists completed
900 artists completed
1000 artists completed
1100 artists completed
1200 artists completed
1300 artists completed
1400 artists completed
1500 artists completed
1600 artists completed
1700 artists completed
1800 artists completed
1900 artists completed
2000 artists completed
2100 artists completed
2200 artists completed
2300 artists completed


#### Artists -w- 0 Results

After iterating through each artist, and spot checking a few json files, I noticed that the U.G.K. entry didn't have any albums. I'm assuming that it won't be the last artist with 0 albums as well. So, I'm iterating through each result file and determining which had 0 results

#### Putting All Albums on Single Document & Removing Duplicates



In [2]:
album_list = os.listdir('../data/albums/')

In [3]:
album_list.sort()

In [4]:
len(album_list)

3173

In [5]:
album_list[-1]

't.A.T.u._0.json'

In [19]:
def get_album_id(lst):
    '''
    Iterates through spotify artist-album json result files and
    grabs the artist name, artist id, album name, and album id
    '''
    ids = {}
    null_albums = []
    album_count = 0
    file_count = 0
    for album in album_list:
        if '.json' in album:
            with open('../data/albums/{}'.format(album), 'r') as f:
                album_info = json.load(f)
            if len(album_info['items']) >= 1:
                for album_entry in album_info['items']:
                    album_count += 1
                    ids[album_count] = dict({'artist_id':album_entry['artists'][0]['id'],
                                             'artist_name':album_entry['artists'][0]['name'],
                                             'album_name':album_entry['name'],
                                             'album_id': album_entry['id']})
            else:
                null_albums.append(album)
        file_count += 1
        if file_count % 100 == 0:
            print('{} album completed'.format(file_count))
    return ids, null_albums

In [20]:
albums, null_albums = get_album_id(album_list)

100 album completed
200 album completed
300 album completed
400 album completed
500 album completed
600 album completed
700 album completed
800 album completed
900 album completed
1000 album completed
1100 album completed
1200 album completed
1300 album completed
1400 album completed
1500 album completed
1600 album completed
1700 album completed
1800 album completed
1900 album completed
2000 album completed
2100 album completed
2200 album completed
2300 album completed
2400 album completed
2500 album completed
2600 album completed
2700 album completed
2800 album completed
2900 album completed
3000 album completed
3100 album completed


##### Checking Results

In [21]:
len(albums)

89134

In [16]:
albums[1]

{'artist_id': '1bDWGdIC2hardyt55nlQgG',
 'artist_name': '"Weird Al" Yankovic',
 'album_name': 'Mandatory Fun',
 'album_id': '36jlZKG1sNZQA2HbWdYveV'}

In [23]:
null_albums

["'In The Heights' Original Broadway Company_0.json",
 'Al DiMeola Project_0.json',
 'Amg Beats_0.json',
 'Blaque Keyz_0.json',
 'Bobby Gentry_0.json',
 'Bruce Hornsby and the Range_0.json',
 'Candle Weather_0.json',
 'Carlos Santana & Buddy Miles_0.json',
 'David Crosby, Neil Young & Graham Nash_0.json',
 'Duice_0.json',
 'England Dan & John Ford Coley_0.json',
 'Falco Van Den Aker_0.json',
 'Flesh And Bone_0.json',
 'Garth Brooks_0.json',
 'George Jones & Tammy Wynette_0.json',
 'Gloria Estefan And Miami Sound Machine_0.json',
 'Helena Bonham Carter_0.json',
 'Jimmy Page & The Black Crowes_0.json',
 'London Beat_0.json',
 'Made famous by Paul Mc Cartney_0.json',
 'Mahavishnu John McLaughlin_0.json',
 'Marketa Irglova and Glen Hansard_0.json',
 'Merle Haggard & Willie Nelson_0.json',
 'Method man e redman_0.json',
 'Mickey Mouse_0.json',
 'Miley Cyrus as Hannah Montana_0.json',
 'Nancy Sinatra And Lee Hazlewood_0.json',
 'Prodigy (Of Mobb Deep) Feat. Kurupt, Jayo Felony & 40 Glocc_0.j

##### Pickling Both Lists for Later

In [24]:
with open('../pickle/albums.pkl', 'wb') as f:
    pickle.dump(albums, f)
with open('../pickle/null_albums.pkl', 'wb') as f:
    pickle.dump(null_albums, f)

### 2b. Retieving Tracks from Album Endpoint

In [2]:
with open('../pickle/albums.pkl', 'rb') as f:
    albums = pickle.load(f)

In [5]:
example = sp.album_tracks('36jlZKG1sNZQA2HbWdYveV')
example['next']

In [6]:
albums[90]

{'artist_id': '7urq0VfqxEYEEiZUkebXT4',
 'artist_name': '112',
 'album_name': 'U Already Know',
 'album_id': '6Jwq4FZFwNj32A6xGQe8Cs'}

In [None]:
sp.album_tracks()

In [7]:
def get_tracks_fr_albums(album_list, spot):
    '''
    Retrieves all tracks from albums in album_list and adds them
    to a track dictionary
    '''
    count = 0
    tracks = {}
    for album in album_list:
        track_count = 0
        results = spot.album_tracks(album_list[album]['album_id'])
        with open('../data/album_tracks/{}_{}.json'.format\
                  (album_list[album]['album_name'].replace("/","").strip(), 
                   track_count), 'w') as f:
            json.dump(results, f)
        
        # if there are greater than 50 tracks in a given release -
        # continue querying tracks
        while results['next'] is not None:
            track_count += 1
            n = 50
            results = \
            results = spot.album_tracks(album_list[album]['album_id'], offset=n)
            with open('../data/album_tracks/{}_{}.json'.format\
                      (album_list[album]['album_name'].replace("/","").strip(), 
                       track_count), 'w') as f:
                json.dump(results, f)
                
            # running into albums with WAAY too many tracks.
            # adding this to stop retriving tracks from albums with > 150 tracks
            if track_count == 3:
                break
                
        count += 1
        time.sleep(1)
        if count % 5000 == 0:
            print('{} albums completed'.format(count))

In [8]:
get_tracks_fr_albums(albums, sp)

5000 albums completed
10000 albums completed
15000 albums completed
20000 albums completed
25000 albums completed
30000 albums completed
35000 albums completed
retrying ...1secs
retrying ...1secs
retrying ...1secs
40000 albums completed
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
retrying ...1secs
45000 albums completed
50000 albums completed
55000 albums completed
60000 albums completed
65000 albums completed
70000 albums completed
75000 albums completed
80000 albums completed
85000 albums completed


#### Checking Resultant `json`

In [10]:
with open('../data/album_tracks/100 Miles And Runnin\'_0.json', 'r') as f:
    example = json.load(f)

In [22]:
example['items']       # song list
example['items'][0]    # first song result
example['items'][0]['artists'][0] # first artist result
example['items'][0]['artists'][0]['name'] # artist name
example['items'][0]['id']  # song id
example['items'][0]['name']

"100 Miles And Runnin'"

### 2c. Retrieving Track Features from Track Endpoint

In [32]:
album_track_list = os.listdir('../data/album_tracks/')

In [33]:
len(album_track_list)

64118

In [34]:
album_track_list[:5]

['Strangers [Paranoid] (feat. Bun B)_0.json',
 'Alive In America_0.json',
 'BREAKDOWN (Chill Covers From The She Shed)_0.json',
 'Fire & Smoke_0.json',
 'Prokofiev: The Piano ConcertosViolin Concertos etc_0.json']

#### Grabbing Track Ids to Query With

In [1]:
def get_song_ids(file_list):
    '''
    Iterates through list of album data stored in json files and
    retrieves the ids associated with each track
    '''
    track_dict = {}
    file_errors = {}
    count = 0
    for file in file_list:
        try: 
            with open('../data/album_tracks/{}'.format(file), 'r') as f:
                album = json.load(f)
                for result in album['items']:
                    track_dict[result['id']] = {'Song Title': result['name'],
                                                'Artist': result['artists'][0]['name']}
        except Exception as e:
            file_errors[file] = e
        count += 1
        if count % 10000 == 0:
            print('{} albums completed'.format(count))
    return track_dict, file_errors 

In [33]:
tracks, file_load_errors = get_song_ids(album_track_list)

10000 albums completed
20000 albums completed
30000 albums completed
40000 albums completed
50000 albums completed
60000 albums completed


#### Saving Track List to Pickle

In [37]:
with open('../data/new_tracks_20190103.json', 'w') as f:
    json.dump(tracks, f)

In [21]:
len(tracks) / 50

13030.34

In [3]:
with open('../data/new_tracks_20190103.json', 'r') as f:
    tracks = json.load(f)

#### Querying Track Features

In [None]:
sp.audio_features()

In [37]:
def get_song_feat2(file_list, spot):
    '''
    Retrieve audio features for every song in `new_tracks_20190103.json`.
    Requires a Spotify API login
    '''
    song_feat = {}
    file_errors = {}
    error_log = {}
    count = 0
    for file in file_list:
        track_dict = {}
        try: 
            with open('../data/album_tracks/{}'.format(file), 'r') as f:
                album = json.load(f)
            for result in album['items']:
                track_dict[result['id']] = {'Song Title': result['name'],
                                            'Artist': result['artists'][0]['name']}
        except Exception as e:
            file_errors[file] = e
        try:
            song_feat[count] = spot.audio_features(track_dict.keys())
            time.sleep(1)
            count += 1
        except Exception as e:
            error_log[track] = e
            time.sleep(5)
            try:
                song_feat[count] = spot.audio_features(track_dict.keys())
            except Exception as e:
                print('error - {}'.format(e))
            count += 1
        if count % 10 == 0:
            with open('../data/song_features/song_feats_{}.json'.format(count), 'w') as f:
                json.dump(song_feat, f)
            song_feat.clear()
        if count % 1000 == 0:
            print('{} track collections completed'.format(count))
    return song_feat, error_log, file_errors

In [38]:
sfs, el, fe = get_song_feat2(album_track_list, sp)

1000 track collections completed
2000 track collections completed
3000 track collections completed
4000 track collections completed
5000 track collections completed
6000 track collections completed
7000 track collections completed
8000 track collections completed
9000 track collections completed
10000 track collections completed
11000 track collections completed
12000 track collections completed
13000 track collections completed
14000 track collections completed
15000 track collections completed
16000 track collections completed
17000 track collections completed
18000 track collections completed
19000 track collections completed
20000 track collections completed
21000 track collections completed
22000 track collections completed
23000 track collections completed
24000 track collections completed
25000 track collections completed
26000 track collections completed
27000 track collections completed
28000 track collections completed
29000 track collections completed
30000 track collections