In [1]:
import re
import requests_oauthlib
import requests
import webbrowser
import json
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from spotify_secret_data import *
import time
# Constant-style aliases for the application credentials. client_id and client_secret are
# not defined in this notebook, so they presumably come from the star import of
# spotify_secret_data above -- TODO confirm.
CLIENT_ID = client_id
CLIENT_SECRET = client_secret


from datetime import datetime
# strptime/strftime pattern for the 'timestamp' entry stored with the cached token
DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f"

# URL handed to OAuth2Session.authorization_url() to build the sign-in link
AUTHORIZATION_URL = 'https://accounts.spotify.com/authorize'
# NOTE: you need to specify this same REDIRECT_URI in the Spotify API console of your application!
# This is a URL set up at UMSI specifically to handle student requests -- an "OAuth2 workaround".
# Any URL would work, but using someone else's is a bit rude because every login is still a hit
# on their server. In general, use a URL on your own server.
REDIRECT_URI = 'https://www.programsinformationpeople.org/runestone/oauth'
# URL handed to OAuth2Session.fetch_token() to obtain the token
TOKEN_URL = 'https://accounts.spotify.com/api/token'


In [2]:

def has_token_expired(timestamp_str, max_age_seconds=3600):
    """
    Checks whether a cached token is older than its allowed lifetime.

    Parameters
    -------
    timestamp_str : string
        timestamp of when the token was originally saved, written
        in DATETIME_FORMAT
    max_age_seconds : int or float, optional
        age (in seconds) beyond which the token counts as expired;
        defaults to 3600, i.e. one hour

    Returns
    -------
    True or False : bool
        True if the token is older than max_age_seconds else False

    Raises
    -------
    ValueError
        if timestamp_str does not match DATETIME_FORMAT
    """

    # datetime.strptime converts a formatted string into a datetime object
    cache_timestamp = datetime.strptime(timestamp_str, DATETIME_FORMAT)

    # subtracting two datetime objects gives a timedelta object
    age = datetime.now() - cache_timestamp

    # total_seconds() measures the whole span. The .seconds attribute only
    # holds the remainder after whole days are removed, so a token saved
    # 25 hours ago would look one hour old and be treated as still valid.
    return age.total_seconds() > max_age_seconds


def get_saved_token():
    """
    Reads the cached token back from 'token.json' in the working directory

    Returns
    -------
    token_dict : dict
        the JSON content of the file, decoded

    Raises
    -------
    FileNotFoundError
        if no token has been saved yet
    """
    with open('token.json', 'r') as token_file:
        return json.load(token_file)


def save_token(token_dict):
    """
    Stamps the token with the current time and writes it to 'token.json'

    Parameters
    -------
    token_dict : dict
        token data to cache; a 'timestamp' entry is added to it in place
    """
    # record when the token was cached, in the shared DATETIME_FORMAT
    saved_at = datetime.now().strftime(DATETIME_FORMAT)
    token_dict['timestamp'] = saved_at

    with open('token.json', 'w') as token_file:
        token_file.write(json.dumps(token_dict))



In [3]:
def start_session():
    """
    Starts an oauth2 session, reusing the cached token when it is usable

    A cached token is reused only if it carries a 'timestamp' entry and
    has_token_expired() reports it as still fresh. Otherwise the browser
    is opened on the authorization page, the user pastes the callback
    URL, and the token fetched with it is cached via save_token().

    Returns
    -------
    oauth2inst : object
        requests_oauthlib.OAuth2Session instance
    """

    # No cache file, or a cache file that is not valid JSON, both mean
    # there is no usable token. json.JSONDecodeError is a ValueError.
    try:
        token = get_saved_token()
    except (FileNotFoundError, ValueError):
        token = None

    # if token was cached and hasn't expired
    if isinstance(token, dict):
        # a token without a timestamp cannot be aged, so it is not reused
        timestamp = token.get('timestamp')
        if timestamp and not has_token_expired(timestamp):
            return requests_oauthlib.OAuth2Session(CLIENT_ID, token=token)

    # If token either doesn't exist, is unusable, or has expired
    print('getting token the long way')
    oauth2inst = requests_oauthlib.OAuth2Session(CLIENT_ID, redirect_uri=REDIRECT_URI)
    authorization_url, state = oauth2inst.authorization_url(AUTHORIZATION_URL) # all we need for spotify

    webbrowser.open(authorization_url) # Opening auth URL for you to sign in to the Spotify service
    # Need the full callback URL in order to parse the response
    authorization_response = input('Authenticate and then enter the full callback URL: ').strip()

    # The OAuth2Session instance extracts what it needs from the callback URL
    # and handles the remaining back and forth with spotify
    token = oauth2inst.fetch_token(TOKEN_URL, authorization_response=authorization_response, client_secret=CLIENT_SECRET)
    save_token(token)

    return oauth2inst

In [4]:
def get_track_info(track_dict):
    """
    Uses data from track_dict to search additional features about each track.
    Adds that data to track_dict for each track

    Parameters
    -------
    track_dict : dict
        dictionary keyed by track ID. Each value is a list of data about
        that track whose first element is used as the sort key
        (presumably the track number -- verify against the caller)

    Returns
    -------
    track_dict : dictionary or None
        The same dictionary, with danceability, energy, tempo,
        speechiness, and valence appended to every track's list.
        None if features are missing for any track; in that case
        track_dict is left unmodified.
    """

    # nothing to look up
    if not track_dict:
        return track_dict

    # Order the IDs by track number and request them in that same order.
    # The reply is paired with the IDs by position, so the request order and
    # the pairing order must be the same list.
    ordered_ids = [track_id for track_id, _ in
                   sorted(track_dict.items(), key=lambda item: item[1][0])]

    # doing the request
    track_feat_url = 'https://api.spotify.com/v1/audio-features?ids={}'.format(','.join(ordered_ids))
    oauth2inst = start_session()
    track_features = oauth2inst.get(track_feat_url)

    # list of data for each track. Length is number of tracks
    features = json.loads(track_features.text)['audio_features']

    # pair each ID with the entry at the same position in the reply
    paired = [(track_id, features[ind]) for ind, track_id in enumerate(ordered_ids)]

    # Check everything before touching track_dict so a missing entry
    # does not leave some tracks extended and others not.
    if any(feats is None for _, feats in paired):
        # need to get rid of this whole album
        return None

    for track_id, feats in paired:
        track_dict[track_id].extend([feats.get('danceability'),
                                     feats.get('energy'),
                                     feats.get('tempo'),
                                     feats.get('speechiness'),
                                     feats.get('valence')])
    return track_dict


In [12]:
def clean(artist_name):
    """
    Normalizes a name for comparison

    Drops every non-word character (anything other than letters, digits
    and underscore) and lower-cases what is left.
    """
    word_chars_only = re.sub(r'\W', '', artist_name)
    return word_chars_only.lower()
    

In [13]:
# Load the scraped songs; the file holds a single JSON document
with open('scraped_songs.txt') as songs_file:
    song_info = json.load(songs_file)

In [14]:
# Spotify search endpoint; the query parameters are added per song in the search loop
base = "https://api.spotify.com/v1/search"


In [15]:
# Authenticated session used for the search requests below; start_session() may
# prompt for the callback URL if there is no fresh cached token
oauth2inst = start_session()


In [16]:
# Search Spotify for every scraped song and append the ID of the first result
# whose artist is a close enough match, or None when nothing matches.
# Each row is expected to hold the five scraped fields when this cell runs.
num_matches = 0
for i, song in enumerate(song_info):
    q = {'q': f'track:{song[1]} artist:{song[0]}', 'type':'track'}
    # Let the authenticated session encode the query itself. Building the URL
    # with requests.get(...).url sent an extra unauthenticated request per song.
    track = oauth2inst.get(base, params=q)
    track_options = json.loads(track.text)['tracks']['items']

    for track_option in track_options:
        # very arbitrary matching: fuzzy score above 60 between the cleaned
        # artist names, first hit wins
        if fuzz.token_sort_ratio(clean(track_option['artists'][0]['name']), clean(song[0])) > 60:
            song_info[i].append(track_option['id'])
            num_matches += 1
            break
    else:
        # no candidate matched (or there were none): keep the column, mark it empty
        song_info[i].append(None)

    if i % 200 == 0:
        print(f'num completed: {i}')
        print(f'num matches: {num_matches}')
        # brief pause between batches of requests
        time.sleep(5)

num completed: 0
num matches: 1
num completed: 200
num matches: 111
num completed: 400
num matches: 203
num completed: 600
num matches: 291
num completed: 800
num matches: 370
num completed: 1000
num matches: 435
num completed: 1200
num matches: 502
num completed: 1400
num matches: 557
num completed: 1600
num matches: 629
num completed: 1800
num matches: 695
num completed: 2000
num matches: 777
num completed: 2200
num matches: 849
num completed: 2400
num matches: 914
num completed: 2600
num matches: 999
num completed: 2800
num matches: 1064


In [17]:
# Fraction of the scraped songs for which a Spotify track ID was found
num_matches / len(song_info)

0.3774441340782123

### Now to search through the IDs to get the track info

In [18]:
# Audio-features endpoint; a track ID is appended to it for each lookup
audio_base = "https://api.spotify.com/v1/audio-features/"

In [37]:
# Start a session again before the feature lookups; start_session() re-authorizes
# interactively when the cached token is more than an hour old
oauth2inst = start_session()


getting token the long way
Authenticate and then enter the full callback URL: https://www.programsinformationpeople.org/runestone/oauth/index?code=AQCkWdG_7obHdWLvYQmiDVkL9tuGSgd3_IM-CCAGzIThqYkNfq4ruadDcvk88n5eYgNfhTtE4qVpDp2CEroM10xOGuIJ4k24yOqBejc6n1AgnmRYFnBPDHQ6XIBGTUlLHSnnPdihc1rGdMoqLVJMZY8n2Az_3cV9Eiz9Wi90NNX2p5rUhooBwTFtXv9jA5en0c1XJwsPD5zoXorwRzpdUIRofO7VtE6wz8QGz3itbQ&state=qKni2dDMGifbH6V60tVHV2iPCIOalv


In [50]:
# Append danceability, energy, valence, tempo and duration_ms to every row,
# using None placeholders for songs that have no Spotify ID.
for i, song in enumerate(song_info):
    song_id = song[5]

    # Only rows still at six columns need features. Checking this first keeps
    # a re-run from appending a second set of values (or Nones) and avoids
    # requesting features that would be thrown away.
    if len(song_info[i]) != 6:
        continue

    if song_id is not None:
        track_features = oauth2inst.get(audio_base + song_id)
        # decoded JSON reply for this one track
        feat = json.loads(track_features.text)

        song_info[i].extend([feat.get('danceability'), feat.get('energy'), feat.get('valence'),
                             feat.get('tempo'), feat.get('duration_ms')])
    else:
        song_info[i].extend([None, None, None, None, None])

In [51]:
import pandas as pd

In [54]:
# Column names in row order: the five scraped fields, the Spotify ID,
# then the five audio features appended by the previous cell
cols = [
    'Artist', "Title", "Year", "Instrument", "Genre",
    "ID",
    "Danceability", "Energy", "Valence", "Tempo", "Duration_MS",
]
df = pd.DataFrame(data=song_info, columns=cols)

In [56]:
# Write the table to songs.csv without the DataFrame index column
df.to_csv('songs.csv', index=False)