# Data Collection: 

## 2. Query Spotify Audio Analysis API for music audio features
https://developer.spotify.com/community/showcase/spotify-audio-analysis/

1. Takes the song name and artist name from the top100_1958_to_2021.csv file and sends them to the API to query for audio features. 
2. As the final output, it creates a new CSV file with a column containing the audio features.

<span style="color:red"><i> Note: This process takes ~3 hours, so it will not be demonstrated in class or in the recording. For users who would like to run this file, progress messages have been added to the query loop.</i></span>

In [2]:
import os
import json
import spotipy as sp
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy.util as util
import pandas as pd
import jellyfish
import re
import csv
import time

In [3]:
# Spotify API credentials. Supply your own through the SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET environment variables so that secrets never have to be
# saved inside the notebook. The masked placeholders below are only fallbacks
# and must be replaced (or overridden via the environment) before running.
CLIENT_ID = os.environ.get('SPOTIPY_CLIENT_ID', 'a93fe0f1958b48a485d1ed9e9ccXXXXX')
CLIENT_SECRET = os.environ.get('SPOTIPY_CLIENT_SECRET', '2b86da6604934f6c926cf503c1eXXXXX')
username = '31gv6p26mfot5zvslqhcllyXXXXX'
scope = 'user-library-read'

In [4]:
# Shared Spotify client used by every helper below; it authenticates with the
# application's client ID and secret via SpotifyClientCredentials.
spotify = sp.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
    )
)

In [3]:
# Load the top-100 songs table (1958-2021); its 'song', 'artist(s)' and 'year'
# columns are the inputs for the Spotify queries below.
top100_data = pd.read_csv("../data/top100_1958_to_2021.csv")
print(top100_data.shape)
# Preview the 2020 rows as a quick sanity check of the loaded columns.
top100_data[top100_data['year']==2020].head()

(6351, 6)


Unnamed: 0,no.,year,song,artist(s),song_wiki_url,artist_wiki_url
6151,1,2020,"""Blinding Lights""",The Weeknd,['https://en.wikipedia.org/wiki/Blinding_Lights'],['https://en.wikipedia.org/wiki/The_Weeknd']
6152,2,2020,"""Circles""",Post Malone,['https://en.wikipedia.org/wiki/Circles_(Post_...,['https://en.wikipedia.org/wiki/Post_Malone']
6153,3,2020,"""The Box""",Roddy Ricch,['https://en.wikipedia.org/wiki/The_Box_(Roddy...,['https://en.wikipedia.org/wiki/Roddy_Ricch']
6154,4,2020,"""Don't Start Now""",Dua Lipa,['https://en.wikipedia.org/wiki/Don%27t_Start_...,['https://en.wikipedia.org/wiki/Dua_Lipa']
6155,5,2020,"""Rockstar""",DaBaby featuring Roddy Ricch,['https://en.wikipedia.org/wiki/Rockstar_(DaBa...,"['https://en.wikipedia.org/wiki/DaBaby', 'http..."


In [17]:
def get_sp_features(sp_id, flag):
    """
    Given a Spotify song ID, find the wanted audio features.

    Inputs: Spotify song ID, and previous state of the success flag.
    Returns: A tuple (wanted_features, flag).
        - When Spotify returns features for the track, wanted_features is a
          dictionary holding only the wanted keys (a key missing from the
          response maps to None) and flag is passed through unchanged.
        - When no features are available, wanted_features is an empty list
          and flag is set to 1.
    """
    wanted_keys = ['danceability','energy','key','loudness','mode','speechiness',
                   'acousticness','instrumentalness','liveness','valence','tempo']
    all_features = spotify.audio_features(sp_id)

    # Guard against an empty or None response as well as a [None] entry, so a
    # single unavailable track cannot abort the whole multi-hour run.
    track_features = all_features[0] if all_features else None
    if not track_features:
        return [], 1

    wanted_features = {key: track_features.get(key) for key in wanted_keys}
    return wanted_features, flag

In [18]:
def check_match(wiki_name, sp_name, wiki_artist, sp_artist):
    """
    Check whether the song name and artist name received from Wikipedia and
    Spotify match.

    Inputs: strings of Spotify and Wikipedia song and artist names.
    Returns a 0/1 flag: 0 when they are considered a match, 1 otherwise.

    An exact song-title match (after dropping double quotes from the Wikipedia
    title) is accepted without looking at the artists. Otherwise the lead
    Wikipedia artist is compared with the Spotify artist using Jaro-Winkler
    similarity, and a score below 0.8 is flagged as a mismatch.
    """
    if wiki_name.replace('"', '') == sp_name:
        return 0

    # Keep only the lead artist, i.e. the text before " featuring" or " and".
    # The \b word boundary prevents names such as "peter andre" from being
    # truncated at " and"; " featuring" takes precedence over " and".
    lead_artist = wiki_artist
    for separator in (r' featuring\b', r' and\b'):
        found = re.search(separator, wiki_artist)
        if found and found.start() > 0:
            lead_artist = wiki_artist[:found.start()]
            break

    jw_score = jellyfish.jaro_winkler_similarity(lead_artist, sp_artist)

    return 1 if jw_score < 0.8 else 0

In [25]:
def unpack_sp_data(result, ind):
    """
    Unpack Spotify query results to get details for each field.

    Inputs: Query result and the target index within the result.
    Returns: Spotify ID (string), song name and first artist name (strings,
        lower-cased with apostrophes removed), and the song duration taken
        from the track's 'duration_ms' field.
    """
    # The search result item already carries the name, artists and duration,
    # so no additional per-song track lookup request is needed.
    track = result['tracks']['items'][ind]
    sp_id = track['id']
    sp_name = track['name'].lower().replace("'", "")
    sp_artist = track['artists'][0]['name'].lower().replace("'", "")
    duration = track['duration_ms']

    return sp_id, sp_name, sp_artist, duration

In [20]:
def unpack_wiki_data(top100_data, i):
    """
    For any given Wikipedia row, unpack the row to get each field.

    Inputs: all Wikipedia songs (pd dataframe), and the target row position.
    Returns: Wikipedia song name and artist name (both lower-cased with
        apostrophes removed), and the year.
    """
    row = top100_data.iloc[i]
    wiki_name = row['song'].lower().replace("'", "")
    # Strip single apostrophes, exactly as done for the song name and for the
    # Spotify names, so that "destiny's child" compares equal on both sides.
    wiki_artist = row['artist(s)'].lower().replace("'", "")
    year = row['year']

    return wiki_name, wiki_artist, year

In [21]:
def send_query(wiki_name, wiki_artist):
    """
    Search Spotify for a track by song name and artist name.

    The first attempt filters on both track and artist; if that search comes
    back without any tracks, a second search is made on the track name alone.

    Inputs: wanted song name and artist name (strings).
    Returns: The received result from Spotify (the fallback result when the
        first search returned no tracks).
    """
    track_filter = 'track:' + wiki_name
    # NOTE(review): the '+' is passed as a literal character inside the query
    # string - confirm against the Spotify search docs that it is meant to act
    # as the separator between the two field filters.
    first_attempt = spotify.search(q=track_filter + '+artist:' + wiki_artist,
                                   type='track')
    if not result_not_received(first_attempt):
        return first_attempt

    # Nothing found with the artist filter: retry with the title only.
    return spotify.search(q=track_filter, type='track')


def result_not_received(result):
    """Return True when the search result contains no tracks, else False."""
    return not result['tracks']['items']

In [10]:
number_of_songs = len(top100_data)
# The context manager guarantees that rows already written are flushed and the
# file is closed even if an API call raises part-way through the long run.
# newline="" is what the csv module expects for files handed to csv.writer,
# and an explicit encoding keeps non-ASCII song/artist names portable.
with open("../data/spotify_results.csv", "w", newline="", encoding="utf-8") as output_file:
    writer = csv.writer(output_file)
    writer.writerow(['index', 'wiki_name', 'wiki_artist', 'year', 'sp_id', 'sp_name',
                     'sp_artist', 'duration', 'audio_features', 'flag'])
    # Iterate through all the rows of the wikipedia dataframe
    for i in range(number_of_songs):
        wiki_name, wiki_artist, year = unpack_wiki_data(top100_data, i)
        result = send_query(wiki_name, wiki_artist)

        # No track found at all: record the song with empty Spotify fields
        # and flag it (flag = 1) so it can be inspected later.
        if result_not_received(result):
            writer.writerow([i, wiki_name, wiki_artist, year, None, None,
                             None, None, None, 1])
            continue

        # Use the first search hit; the flag ends up as 1 when the hit fails
        # the name/artist check or has no audio features.
        sp_id, sp_name, sp_artist, duration = unpack_sp_data(result, 0)
        flag = check_match(wiki_name, sp_name, wiki_artist, sp_artist)
        audio_features, flag = get_sp_features(sp_id, flag)

        writer.writerow([i, wiki_name, wiki_artist, year, sp_id, sp_name,
                         sp_artist, duration, audio_features, flag])

        if i % 100 == 0:
            print("Status: at index {} and year {}".format(i, year))
            # Pause to avoid API problems
            time.sleep(10)

Status: at index 0 and year 1958
Status: at index 100 and year 1959
Status: at index 200 and year 1960
Status: at index 300 and year 1961
Status: at index 400 and year 1962
Status: at index 500 and year 1963
Status: at index 600 and year 1964
Status: at index 700 and year 1965
Status: at index 800 and year 1966
Status: at index 900 and year 1967
Status: at index 1000 and year 1968
Status: at index 1100 and year 1969
Status: at index 1200 and year 1970
Status: at index 1300 and year 1971
Status: at index 1400 and year 1972
Status: at index 1500 and year 1973
Status: at index 1600 and year 1974
Status: at index 1700 and year 1975
Status: at index 1800 and year 1976
Status: at index 1900 and year 1977
Status: at index 2000 and year 1978
Status: at index 2100 and year 1979
Status: at index 2200 and year 1980
Status: at index 2300 and year 1981
Status: at index 2400 and year 1982
Status: at index 2500 and year 1983
Status: at index 2600 and year 1984
Status: at index 2700 and year 1985
Stat

In [36]:
# Reload the query results from disk, so the analysis below works from the
# saved file rather than from variables left over from the query loop.
results_csv = pd.read_csv("../data/spotify_results.csv")
results_csv.tail()

Unnamed: 0,index,wiki_name,wiki_artist,year,sp_id,sp_name,sp_artist,duration,audio_features,flag
6346,6346,"""things a man oughta know""",lainey wilson,2021,5QS8PNEWbqTEZyQ6e9ZbJf,things a man oughta know,lainey wilson,203373.0,"{'danceability': 0.659, 'energy': 0.683, 'key'...",0
6347,6347,"""throat baby (go baby)""",brs kash,2021,15C4TnrrVdym7UykxQIOTZ,throat baby (go baby) (with dababy & city girl...,brs kash,211609.0,"{'danceability': 0.878, 'energy': 0.475, 'key'...",0
6348,6348,"""tombstone""",rod wave,2021,3zc8VZEpM1onYV4FWGdFvm,tombstone,rod wave,160078.0,"{'danceability': 0.55, 'energy': 0.637, 'key':...",0
6349,6349,"""drinkin beer. talkin god. amen.""",chase rice featuring florida georgia line,2021,1UYfAU2bwgjaM5rIIPQleC,drinkin beer. talkin god. amen. (feat. florida...,chase rice,160839.0,"{'danceability': 0.627, 'energy': 0.678, 'key'...",0
6350,6350,"""todo de ti""",rauw alejandro,2021,4fSIb4hdOQ151TILNsSEaF,todo de ti,rauw alejandro,199604.0,"{'danceability': 0.78, 'energy': 0.718, 'key':...",0


In [3]:
# One row per queried song, and the ten columns written by the query loop.
results_csv.shape

(6351, 10)

In [4]:
# Number of songs for which the search returned no track (sp_id left empty).
results_csv['sp_id'].isna().sum()

49

In [5]:
# Number of flagged rows: no search result, a failed name/artist check, or
# missing audio features (flag is 0/1, so the sum is a count).
results_csv['flag'].sum()

314

In [6]:
# All rows that were flagged during the query run.
flagged = results_csv.loc[results_csv['flag'] == 1]
# Of those, the rows that still carry a Spotify ID: a track was returned but
# it failed the match check or had no audio features.
mismatch = flagged.loc[flagged['sp_id'].notna()]

In [38]:
# Work on an independent copy so the loaded results stay untouched while the
# flagged rows are retried and removed below.
cleaned_results = results_csv.copy(deep=True)

In [43]:
# The copy should have the same shape as the loaded results.
cleaned_results.shape

(6351, 10)

In [44]:
# Flagged-row count before the retry pass, for comparison afterwards.
cleaned_results['flag'].sum()

314

In [62]:
# For the songs that were not found correctly, try looking at the second search result
number_of_songs = len(cleaned_results)
changed_ids = []
# Retry only rows that are flagged but already have a Spotify ID, i.e. a track
# was returned for them but it was not accepted. Rows with no result at all
# are skipped.
needs_retry = (cleaned_results['flag'] != 0) & cleaned_results['sp_id'].notna()
for i in cleaned_results.index[needs_retry]:
    name = cleaned_results.at[i, 'wiki_name'].lower().replace("'", "")
    artist = cleaned_results.at[i, 'wiki_artist'].lower().replace("'", "")
    result = send_query(name, artist)

    # A second candidate only exists when the search returned two or more tracks.
    if result_not_received(result) or len(result['tracks']['items']) < 2:
        continue

    sp_id, sp_name, sp_artist, duration = unpack_sp_data(result, 1)
    # Accept the second candidate only when both its title and its artist are
    # contained in the Wikipedia strings; otherwise keep the row as it was.
    if sp_name in name and sp_artist in artist:
        flag = check_match(name, sp_name, artist, sp_artist)
        audio_features, flag = get_sp_features(sp_id, flag)
        cleaned_results.at[i, 'sp_id'] = sp_id
        cleaned_results.at[i, 'sp_name'] = sp_name
        cleaned_results.at[i, 'sp_artist'] = sp_artist
        cleaned_results.at[i, 'duration'] = duration
        # The column was loaded from CSV and therefore holds strings; store the
        # same textual form the query loop wrote instead of a raw dict/list.
        cleaned_results.at[i, 'audio_features'] = str(audio_features)
        cleaned_results.at[i, 'flag'] = flag
        changed_ids.append(i)

In [78]:
# Number of rows replaced by the second search result.
len(changed_ids)

33

In [64]:
# The retry pass only edits rows in place, so the shape should be unchanged.
cleaned_results.shape

(6351, 10)

In [68]:
# Flagged-row count after the retry pass.
cleaned_results['flag'].sum()

281

In [72]:
# Rows that are still flagged after the retry pass; these are removed next.
flagged = cleaned_results[cleaned_results['flag']==1]
flagged.shape

(281, 10)

In [73]:
# Remove the rows that are still flagged. The rows are addressed by their own
# index labels (not by the values of the 'index' column, which only coincide
# with the labels while the default index is intact). A new frame is built
# instead of mutating in place, and errors="ignore" lets the cell be re-run
# without raising once the rows are already gone.
cleaned_results = cleaned_results.drop(index=flagged.index, errors="ignore")
cleaned_results.shape

(6070, 10)

In [75]:
# Save the cleaned results; index=False keeps the DataFrame index out of the
# file (the 'index' column already identifies each song).
cleaned_results.to_csv('../data/spotify_results_cleaned.csv', index=False)