In [None]:
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy
import spotipy.util as util
import spotipy.oauth2 as oauth2
import pandas as pd
import numpy as np
import random
import time
import os

In [None]:
def load_data(filename):
    """
    Reads a CSV file into a pandas dataframe.

    Argument:
        filename: string, path of the CSV file to read

    Return:
        pandas dataframe holding the parsed contents of the file
    """
    frame = pd.read_csv(filename)
    return frame

In [None]:
def setup_spotify(client_id='',
                  client_secret=''):
    """
    Sets up the spotify object to use its api

    A credential that is left empty is read from the SPOTIPY_CLIENT_ID /
    SPOTIPY_CLIENT_SECRET environment variable instead, so secrets do not
    have to be written into the notebook.

    Argument:
        client_id: string, client id provided by spotify
        client_secret: string, client secret provided by spotify

    Return:
        spotify: object
    """
    # Explicit arguments win; the environment is only a fallback for empty ones.
    client_id = client_id or os.environ.get("SPOTIPY_CLIENT_ID", "")
    client_secret = client_secret or os.environ.get("SPOTIPY_CLIENT_SECRET", "")

    credentials = oauth2.SpotifyClientCredentials(
            client_id=client_id,
            client_secret=client_secret)

    token = credentials.get_access_token()
    # NOTE(review): depending on the installed spotipy release this call may
    # return a token-info dict rather than the bare token string; the `auth`
    # argument below needs the string, so unwrap it when necessary.
    if isinstance(token, dict):
        token = token["access_token"]

    return spotipy.Spotify(auth=token)

In [None]:
def recursive_spotify_searcher(spotify, track_uri, max_retries=None, wait_seconds=10):
    """
    Queries spotify for track metadata and audio features, retrying whenever
    a request raises (for example when spotify rate-limits).

    The retries run in a loop rather than through recursion, so a long outage
    cannot exhaust the interpreter's recursion limit.

    Argument:
        spotify: object, spotify api
        track_uri: string, track uri to query
        max_retries: int or None, number of retries allowed after the first
            failed attempt; None (default) keeps retrying indefinitely
        wait_seconds: number, pause between two attempts (default 10)

    Return:
        array of size 2, track metadata and audio features data

    Raises:
        the exception of the last attempt once max_retries is exhausted
    """
    retries_done = 0

    while True:
        try:
            return [spotify.tracks(track_uri),
                    spotify.audio_features(track_uri)]
        except Exception as e:
            # Give up only when the caller asked for a bounded number of retries.
            if max_retries is not None and retries_done >= max_retries:
                raise
            retries_done += 1
            print(e)
            print("Waiting {} seconds...".format(wait_seconds))
            time.sleep(wait_seconds)

In [None]:
def get_audio_features(audio_features, data):
    """
    Copies the audio features of every entry into the first eleven lists of data

    The lists data[0] .. data[10] receive, in this order: uri, danceability,
    energy, key, loudness, mode, acousticness, instrumentalness, liveness,
    valence and tempo. A falsy entry adds -1 to each of those lists instead.

    Argument:
        audio_features: array, audio features data
        data: array, to store parsed audio data
    """
    # Position in this tuple == index of the list in `data` that gets the value.
    feature_keys = ("uri", "danceability", "energy", "key", "loudness",
                    "mode", "acousticness", "instrumentalness",
                    "liveness", "valence", "tempo")

    for entry in audio_features:
        if entry:
            for slot, key in enumerate(feature_keys):
                data[slot].append(entry[key])
        else:
            for slot in range(len(feature_keys)):
                data[slot].append(-1)

In [None]:
def get_track_features(track_features, data):
    """
    Copies popularity and release date of every track into data

    data[11] receives the popularity and data[12] the album release date of
    each entry under track_features["tracks"]; a falsy entry adds -1 to both.

    Argument:
        track_features: array, track features data
        data: array, to store parsed track data
    """
    for entry in track_features["tracks"]:
        if entry:
            data[11].append(entry["popularity"])
            data[12].append(entry["album"]["release_date"])
            continue

        # Placeholder pair keeps both lists aligned with the requested tracks.
        data[11].append(-1)
        data[12].append(-1)

In [None]:
def songs_cached(i=1, j=5):
    """
    Creates dataframe of songs with unique track uri and their scraped data

    songs0.csv is always loaded first; files songs<i>.csv up to (but not
    including) songs<j>.csv are appended to it. When a track uri occurs more
    than once, only its first occurrence is kept.

    Argument:
        i: int, beginning song files to look through
        j: int, ending song files to look through (exclusive)

    Return:
        songs: dataframe, unique songs
    """
    # Collect every frame first and concatenate once: concatenating inside the
    # loop would copy the accumulated rows again on every iteration.
    frames = [load_data("./Songs_features/songs0.csv")]
    frames.extend(load_data("./Songs_features/songs" + str(file_idx) + ".csv")
                  for file_idx in range(i, j))

    songs = pd.concat(frames, sort=False)

    return songs.drop_duplicates(subset=["track_uri"])

In [None]:
def cache_songs_features(i=1, j=100, m=100, n=1000):
    """
    Uses already scraped data as cache to avoid making too many requests to spotify

    The songs gathered by songs_cached(i, j) serve as the lookup table. Each
    file songs<m>.csv up to (but not including) songs<n>.csv is then rewritten
    in place with its missing feature values filled from that table.

    Argument:
        i: int, beginning song files to look through
        j: int, ending song files to look through (exclusive)
        m: int, beginning song files to fill out
        n: int, ending song files to fill out (exclusive)
    """

    songs = songs_cached(i, j)

    data = songs[["track_uri", "danceability", "energy", "key",
                  "loudness", "mode", "acousticness", "instrumentalness",
                  "liveness", "valence", "tempo", "popularity", "release_date"]]

    # The uri -> value lookups depend only on the cache, so build them once
    # instead of once per file and column.
    feature_columns = list(data.columns[1:])
    by_uri = data.set_index('track_uri')
    mappers = {column: by_uri[column].to_dict() for column in feature_columns}

    for file_idx in range(m, n):
        # Use the loop index here: building the name from `m` would process
        # the same file on every iteration.
        filename = "./Songs_features/songs" + str(file_idx) + ".csv"
        song = load_data(filename)

        print("Processing: ", file_idx)
        print("Number of songs to fill out: ", len(song[song.danceability.isna()]))

        for column in feature_columns:
            song[column] = song[column].fillna(song.track_uri.map(mappers[column]))

        print("Number of songs remaining to be filled out: ", len(song[song.danceability.isna()]))
        print("*" * 10)

        song.to_csv(filename, index=False)

In [None]:
# Feature columns of the song files; the scraping loop below stores the values
# for columns[k] in its data[k] list, so the order matters.
columns = [
    "track_uri",
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "popularity",
    "release_date",
]

In [None]:
# Fill in the missing Spotify features of songs0.csv .. songs999.csv, one file
# at a time. Track uris whose danceability is missing are requested in batches
# of 50 and the answers are written back into the same file.
for file_idx in range(0, 1000):

    filename = "./Songs_features/songs" + str(file_idx) + ".csv" #song_file
    songs = load_data(filename)
    missing = songs[songs["danceability"].isna()]

    # One row per distinct track uri that still lacks its features.
    unique_songs = missing.groupby("track_uri").size().reset_index()

    print(filename)
    print(len(unique_songs))

    if unique_songs.empty:
        # Nothing to request: skip the token request, the file rewrite and
        # the pause between files.
        continue

    spotify = setup_spotify()

    # data[k] collects the values of columns[k]; the two parser helpers append
    # to these lists by position.
    data = [[] for _ in range(len(columns))]

    for start in range(0, len(unique_songs), 50):
        print(start)

        track_uri = unique_songs["track_uri"][start:start+50].values
        tracks = recursive_spotify_searcher(spotify, track_uri)
        get_track_features(tracks[0], data)
        get_audio_features(tracks[1], data)
        # Random pause of 0-4 seconds between two batches.
        time.sleep(random.choice(np.arange(5)))

    fetched = pd.DataFrame(dict(zip(columns, data)), columns=columns)
    by_uri = fetched.set_index('track_uri')

    # Only fill cells that are still empty; values already in the file win.
    for column in columns[1:]:
        mapper = by_uri[column].to_dict()
        songs[column] = songs[column].fillna(songs.track_uri.map(mapper))

    songs.to_csv(filename, index=False)

    # Pause between files.
    time.sleep(60)