In [62]:
import pandas as pd
import numpy as np
import os
import requests
import re

In [53]:
import warnings
warnings.filterwarnings('ignore')

# Concatenate yearly song releases

In [4]:
# Years covered by the per-year release lists in song_release/<year>.csv.
year = list(range(2012, 2022))

# Read every yearly file the same way, tag its rows with the year, and
# concatenate once (growing a frame with concat inside a loop re-copies all
# previously loaded rows on each iteration).
yearly_frames = [
    pd.read_csv('song_release/%s.csv' % y, index_col=False,
                names=['title', 'artist']).assign(year=y)
    for y in year
]
sr = pd.concat(yearly_frames, ignore_index=True)

# strip surrounding whitespace
sr['title'] = sr['title'].str.strip()
sr['artist'] = sr['artist'].str.strip()
sr

Unnamed: 0,title,artist,year
0,(I Called Her) Tennessee,Tim Dugger,2012
1,"10,000 Reasons (Bless the Lord)",Matt Redman,2012
2,100 Proof,Kellie Pickler,2012
3,101,Alicia Keys,2012
4,110%,Jessie Ware,2012
...,...,...,...
20032,You're To Blame,Mammoth WVH,2021
20033,"Young, Black And Beautiful",Chris Pierce,2021
20034,Younger Me,Brothers Osborne,2021
20035,Your Power,Billie Eilish,2021


In [22]:
# Columns filled in by the Spotify searches below. They start out entirely
# missing, but are created with object dtype because they will later hold
# strings (IDs, dates, names) and lists (artist names); an all-NaN float64
# column would have to be upcast on the first write.
for col in ['track_id', 'release_date', 'spotify.name', 'spotify.artists']:
    sr[col] = pd.Series(np.nan, index=sr.index, dtype=object)

## Get Spotify track ID

In [44]:
def getArtistNameList(result_artists):
    """Return the ``'name'`` value of each artist entry, in the original order.

    ``result_artists`` is an iterable of mappings that each have a ``'name'`` key.
    """
    return [entry['name'] for entry in result_artists]

In [92]:
# Spotify API credentials are read from the environment so that they never
# end up in the notebook or its version history. Set SPOTIFY_CLIENT_ID and
# SPOTIFY_CLIENT_SECRET before starting the kernel.
# NOTE: credentials that were previously committed in this cell should be
# revoked/rotated in the Spotify developer dashboard.
CLIENT_ID = os.environ['SPOTIFY_CLIENT_ID']
CLIENT_SECRET = os.environ['SPOTIFY_CLIENT_SECRET']

AUTH_URL = 'https://accounts.spotify.com/api/token'

# POST the client-credentials grant as form data
auth_response = requests.post(AUTH_URL, data={
    'grant_type': 'client_credentials',
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
}, timeout=30)
# Stop here with a clear HTTP error if the credentials are rejected, rather
# than failing later with a KeyError on 'access_token'.
auth_response.raise_for_status()

# convert the response to JSON
auth_response_data = auth_response.json()

# save the access token and build the header sent with every API request
access_token = auth_response_data['access_token']
headers = {
    'Authorization': 'Bearer {token}'.format(token=access_token)
}

# base URL of all Spotify API endpoints
BASE_URL = 'https://api.spotify.com/v1/'

### Search directly by title and artist

In [None]:
# Pass 1: search by exact title + full artist string + release year.

# The result columns receive strings and lists; make sure they are object-typed.
for col in ['track_id', 'release_date', 'spotify.name', 'spotify.artists']:
    sr[col] = sr[col].astype(object)

# Resume after the last row that already has a track ID (0 on a fresh run).
last_done = sr['track_id'].last_valid_index()
start_idx = 0 if last_done is None else last_done + 1

for i in range(start_idx, len(sr)):
    title = sr.at[i, 'title']
    artist = sr.at[i, 'artist']

    # Let requests URL-encode the query: a title or artist containing '&',
    # '#', '+' or '%' would corrupt a hand-concatenated query string.
    resp = requests.get(BASE_URL + 'search',
                        params={'q': 'track:"%s" AND artist:%s AND year:%s' % (title, artist, sr.at[i, 'year']),
                                'type': 'track',
                                'limit': 1},
                        headers=headers,
                        timeout=30)
    # Fail loudly on HTTP errors (expired token, rate limiting) instead of
    # silently leaving all remaining rows unmatched; re-running resumes.
    resp.raise_for_status()

    items = resp.json().get('tracks', {}).get('items', [])
    if not items or items[0] is None:
        continue  # no hit for this row

    results = items[0]
    artists_i = getArtistNameList(results['artists'])

    wanted = artist.lower()
    wanted_parts = wanted.split()
    for a in artists_i:
        found = a.lower()
        found_parts = found.split()
        # deal with middle name -> match only first and last name
        if (found in wanted) or (wanted in found) or (
                wanted_parts and found_parts
                and wanted_parts[0] in found_parts[0]
                and wanted_parts[-1] in found_parts[-1]):
            # .at writes into sr itself (chained sr[col][i] = ... may write to a copy)
            sr.at[i, 'track_id'] = results['id']
            sr.at[i, 'release_date'] = results['album']['release_date']
            sr.at[i, 'spotify.name'] = results['name']
            sr.at[i, 'spotify.artists'] = artists_i
            break

### Search by title (cleaned) and artist

In [None]:
# Pass 2: rows still without a track ID, searched with punctuation removed
# from the title.

# The result columns receive strings and lists; make sure they are object-typed.
for col in ['track_id', 'release_date', 'spotify.name', 'spotify.artists']:
    sr[col] = sr[col].astype(object)

for i in list(sr.index[sr['track_id'].isnull()]):
    # clean title: keep only alphanumeric characters and whitespace
    title = ''.join(c for c in sr.at[i, 'title'] if c.isalnum() or c.isspace())
    artist = sr.at[i, 'artist']

    # Let requests URL-encode the query: an artist containing '&', '#', '+'
    # or '%' would corrupt a hand-concatenated query string.
    resp = requests.get(BASE_URL + 'search',
                        params={'q': 'track:"%s" AND artist:%s AND year:%s' % (title, artist, sr.at[i, 'year']),
                                'type': 'track',
                                'limit': 1},
                        headers=headers,
                        timeout=30)
    # Fail loudly on HTTP errors (expired token, rate limiting) instead of
    # silently skipping rows; re-running only revisits rows still unmatched.
    resp.raise_for_status()

    items = resp.json().get('tracks', {}).get('items', [])
    if not items or items[0] is None:
        continue  # no hit for this row

    results = items[0]
    artists_i = getArtistNameList(results['artists'])

    wanted = artist.lower()
    wanted_parts = wanted.split()
    for a in artists_i:
        found = a.lower()
        found_parts = found.split()
        # deal with middle name -> match only first and last name
        if (found in wanted) or (wanted in found) or (
                wanted_parts and found_parts
                and wanted_parts[0] in found_parts[0]
                and wanted_parts[-1] in found_parts[-1]):
            # .at writes into sr itself (chained sr[col][i] = ... may write to a copy)
            sr.at[i, 'track_id'] = results['id']
            sr.at[i, 'release_date'] = results['album']['release_date']
            sr.at[i, 'spotify.name'] = results['name']
            sr.at[i, 'spotify.artists'] = artists_i
            break

### Traverse artists for multi-artist songs with title

In [None]:
# Pass 3: rows still without a track ID, searched once per credited artist
# (the artist string is split on ', ', ' & ', ' X ', ' x ' and ' and ').

# The result columns receive strings and lists; make sure they are object-typed.
for col in ['track_id', 'release_date', 'spotify.name', 'spotify.artists']:
    sr[col] = sr[col].astype(object)

for i in list(sr.index[sr['track_id'].isnull()]):
    title = sr.at[i, 'title']

    # traverse artists for multi-artist songs
    for artist in re.split(', | & | X | x | and ', sr.at[i, 'artist']):
        # Let requests URL-encode the query: a title or artist containing
        # '&', '#', '+' or '%' would corrupt a hand-concatenated query string.
        resp = requests.get(BASE_URL + 'search',
                            params={'q': 'track:"%s" AND artist:%s AND year:%s' % (title, artist, sr.at[i, 'year']),
                                    'type': 'track',
                                    'limit': 1},
                            headers=headers,
                            timeout=30)
        # Fail loudly on HTTP errors (expired token, rate limiting) instead
        # of silently skipping rows; re-running only revisits unmatched rows.
        resp.raise_for_status()

        items = resp.json().get('tracks', {}).get('items', [])
        if not items or items[0] is None:
            continue  # nothing found for this artist; try the next one

        results = items[0]
        artists_i = getArtistNameList(results['artists'])

        wanted = artist.lower()
        wanted_parts = wanted.split()
        for a in artists_i:
            found = a.lower()
            found_parts = found.split()
            # deal with middle name -> match only first and last name
            if (found in wanted) or (wanted in found) or (
                    wanted_parts and found_parts
                    and wanted_parts[0] in found_parts[0]
                    and wanted_parts[-1] in found_parts[-1]):
                # .at writes into sr itself (chained sr[col][i] = ... may write to a copy)
                sr.at[i, 'track_id'] = results['id']
                sr.at[i, 'release_date'] = results['album']['release_date']
                sr.at[i, 'spotify.name'] = results['name']
                sr.at[i, 'spotify.artists'] = artists_i
                break
        # Stop at the first artist whose search returned a hit, whether or
        # not a name matched. NOTE(review): confirm this is intended.
        break
    

### Search by title (cleaned) with artist traversal

In [43]:
# Pass 4: rows still without a track ID, searched once per credited artist
# using the cleaned title.

# The result columns receive strings and lists; make sure they are object-typed.
for col in ['track_id', 'release_date', 'spotify.name', 'spotify.artists']:
    sr[col] = sr[col].astype(object)

for i in list(sr.index[sr['track_id'].isnull()]):
    # clean title: keep only alphanumeric characters and whitespace
    title = ''.join(c for c in sr.at[i, 'title'] if c.isalnum() or c.isspace())

    # traverse artists for multi-artist songs
    for artist in re.split(', | & | X | x | and ', sr.at[i, 'artist']):
        # Query with the *cleaned* title. The raw title was used here before,
        # which made this pass an exact repeat of the previous one.
        # requests URL-encodes the parameters.
        resp = requests.get(BASE_URL + 'search',
                            params={'q': 'track:"%s" AND artist:%s AND year:%s' % (title, artist, sr.at[i, 'year']),
                                    'type': 'track',
                                    'limit': 1},
                            headers=headers,
                            timeout=30)
        # Fail loudly on HTTP errors (expired token, rate limiting) instead
        # of silently skipping rows; re-running only revisits unmatched rows.
        resp.raise_for_status()

        items = resp.json().get('tracks', {}).get('items', [])
        if not items or items[0] is None:
            continue  # nothing found for this artist; try the next one

        results = items[0]
        artists_i = getArtistNameList(results['artists'])

        wanted = artist.lower()
        wanted_parts = wanted.split()
        for a in artists_i:
            found = a.lower()
            found_parts = found.split()
            # deal with middle name -> match only first and last name
            if (found in wanted) or (wanted in found) or (
                    wanted_parts and found_parts
                    and wanted_parts[0] in found_parts[0]
                    and wanted_parts[-1] in found_parts[-1]):
                # .at writes into sr itself (chained sr[col][i] = ... may write to a copy)
                sr.at[i, 'track_id'] = results['id']
                sr.at[i, 'release_date'] = results['album']['release_date']
                sr.at[i, 'spotify.name'] = results['name']
                sr.at[i, 'spotify.artists'] = artists_i
                break
        # Stop at the first artist whose search returned a hit, whether or
        # not a name matched. NOTE(review): confirm this is intended.
        break

### Search without specifying year

In [None]:
# Pass 5: rows still without a track ID, searched once per credited artist
# using the cleaned title and no year filter.

# The result columns receive strings and lists; make sure they are object-typed.
for col in ['track_id', 'release_date', 'spotify.name', 'spotify.artists']:
    sr[col] = sr[col].astype(object)

for i in list(sr.index[sr['track_id'].isnull()]):
    # clean title: keep only alphanumeric characters and whitespace
    title = ''.join(c for c in sr.at[i, 'title'] if c.isalnum() or c.isspace())

    # traverse artists for multi-artist songs
    for artist in re.split(', | & | X | x | and ', sr.at[i, 'artist']):
        # not specifying the year; requests URL-encodes the parameters
        resp = requests.get(BASE_URL + 'search',
                            params={'q': 'track:"%s" AND artist:%s' % (title, artist),
                                    'type': 'track',
                                    'limit': 1},
                            headers=headers,
                            timeout=30)
        # Fail loudly on HTTP errors (expired token, rate limiting) instead
        # of silently skipping rows; re-running only revisits unmatched rows.
        resp.raise_for_status()

        items = resp.json().get('tracks', {}).get('items', [])
        if not items or items[0] is None:
            continue  # nothing found for this artist; try the next one

        results = items[0]
        artists_i = getArtistNameList(results['artists'])

        wanted = artist.lower()
        wanted_parts = wanted.split()
        for a in artists_i:
            found = a.lower()
            found_parts = found.split()
            # deal with middle name -> match only first and last name
            if (found in wanted) or (wanted in found) or (
                    wanted_parts and found_parts
                    and wanted_parts[0] in found_parts[0]
                    and wanted_parts[-1] in found_parts[-1]):
                # .at writes into sr itself (chained sr[col][i] = ... may write to a copy)
                sr.at[i, 'track_id'] = results['id']
                sr.at[i, 'release_date'] = results['album']['release_date']
                sr.at[i, 'spotify.name'] = results['name']
                sr.at[i, 'spotify.artists'] = artists_i
                break
        # Stop at the first artist whose search returned a hit, whether or
        # not a name matched. NOTE(review): confirm this is intended.
        break

### Search title with special format

In [None]:
# Pass 6: rows still without a track ID whose titles use a special format.
# The title is rebuilt as its alphanumeric characters separated by single
# spaces before searching once per credited artist.

# The result columns receive strings and lists; make sure they are object-typed.
for col in ['track_id', 'release_date', 'spotify.name', 'spotify.artists']:
    sr[col] = sr[col].astype(object)

for i in list(sr.index[sr['track_id'].isnull()]):
    # special format of titles: every alphanumeric character, space-separated
    title = ' '.join(c for c in sr.at[i, 'title'] if c.isalnum())

    # traverse artists for multi-artist songs
    for artist in re.split(', | & | X | x | and ', sr.at[i, 'artist']):
        # requests URL-encodes the parameters (an artist containing '&', '#',
        # '+' or '%' would corrupt a hand-concatenated query string).
        resp = requests.get(BASE_URL + 'search',
                            params={'q': 'track:"%s" AND artist:%s AND year:%s' % (title, artist, sr.at[i, 'year']),
                                    'type': 'track',
                                    'limit': 1},
                            headers=headers,
                            timeout=30)
        # Fail loudly on HTTP errors (expired token, rate limiting) instead
        # of silently skipping rows; re-running only revisits unmatched rows.
        resp.raise_for_status()

        items = resp.json().get('tracks', {}).get('items', [])
        if not items or items[0] is None:
            continue  # nothing found for this artist; try the next one

        # With limit=1 there is at most one item, so it is used directly
        # (no need to normalise and sort the result list).
        results = items[0]
        artists_i = getArtistNameList(results['artists'])

        wanted = artist.lower()
        wanted_parts = wanted.split()
        for a in artists_i:
            found = a.lower()
            found_parts = found.split()
            # deal with middle name -> match only first and last name
            if (found in wanted) or (wanted in found) or (
                    wanted_parts and found_parts
                    and wanted_parts[0] in found_parts[0]
                    and wanted_parts[-1] in found_parts[-1]):
                # .at writes into sr itself (chained sr[col][i] = ... may write to a copy)
                sr.at[i, 'track_id'] = results['id']
                sr.at[i, 'release_date'] = results['album']['release_date']
                sr.at[i, 'spotify.name'] = results['name']
                sr.at[i, 'spotify.artists'] = artists_i
                break
        # Stop at the first artist whose search returned a hit, whether or
        # not a name matched. NOTE(review): confirm this is intended.
        break

### Check title differences

In [None]:
# Disabled step: the whole cell is a string literal, so none of it executes.
# The code inside would (1) collect the matched rows whose `title` is not a
# case-insensitive substring of `spotify.name`, and (2) overwrite the columns
# at positions 3-6 of `sr`, for the row positions listed in the 'Unnamed: 0'
# column of title_dif.xlsx, with columns 4-7 of that spreadsheet
# (presumably hand-corrected values -- confirm).
'''
title_dif_idx = []
for i in sr.loc[~sr['track_id'].isnull()].index.values:
    if sr['title'][i].lower() not in sr['spotify.name'][i].lower():
        title_dif_idx.append(i)

title_dif = sr.iloc[title_dif_idx]
#title_dif.to_excel('new_spotify_correct_title_dif.xlsx')

new_title_dif = pd.read_excel('title_dif.xlsx')
sr.iloc[new_title_dif['Unnamed: 0'].values, [3,4,5,6]] = new_title_dif.iloc[:,[4,5,6,7]].values
'''

### Check duplicate track IDs

In [75]:
# Disabled step: the whole cell is a string literal, so none of it executes.
# The code inside would select the rows whose non-null `track_id` occurs more
# than once, then overwrite the columns at positions 3-6 of `sr`, for the row
# positions listed in the 'Unnamed: 0' column of
# new_spotify_correct_dup2.xlsx, with columns 4-7 of that spreadsheet
# (presumably hand-corrected values -- confirm).
'''
dup = sr[sr.duplicated(['track_id'],keep=False)]
dup = dup.loc[~dup['track_id'].isnull()]
dup
#dup.to_excel('new_spotify_correct_dup.xlsx')

new_dup = pd.read_excel('new_spotify_correct_dup2.xlsx')
sr.iloc[new_dup['Unnamed: 0'].values, [3,4,5,6]] = new_dup.iloc[:,[4,5,6,7]].values
'''

### Check year and release_date discrepancies

In [78]:
# Disabled step: the whole cell is a string literal, so none of it executes.
# The code inside would select the rows whose `release_date` is longer than
# four characters and whose first four characters (as an integer year) differ
# from `year` by at least 1, then overwrite the columns at positions 3-6 of
# `sr`, for the row positions listed in the 'Unnamed: 0' column of
# year_dif.xlsx, with columns 4-7 of that spreadsheet
# (presumably hand-corrected values -- confirm).
'''
year_dif = sr.loc[(~sr['release_date'].isnull())]
year_dif = year_dif.loc[year_dif['release_date'].str.len()>4]
year_dif = year_dif.loc[abs(year_dif['year']-year_dif['release_date'].str[:4].astype(int))>=1]

#year_dif.to_excel('new_spotify_year_dif.xlsx')

new_year_dif = pd.read_excel('year_dif.xlsx')
sr.iloc[new_year_dif['Unnamed: 0'].values, [3,4,5,6]] = new_year_dif.iloc[:,[4,5,6,7]].values
'''

### Null values

In [None]:
# Disabled step: the whole cell is a string literal, so none of it executes.
# The code inside would export the rows without a `track_id` to
# new_spotify_correct_null.xlsx, read that file back, reduce each `track_id`
# entry to a bare ID by removing the 'https://open.spotify.com/track/' prefix
# and anything from '?si=' onwards, and write columns 4-7 of the spreadsheet
# into the columns at positions 3-6 of `sr` for the row positions listed in
# its 'Unnamed: 0' column. Presumably the spreadsheet is edited by hand
# between the export and the re-import -- confirm.
'''sr.loc[sr['track_id'].isnull()].to_excel('new_spotify_correct_null.xlsx')
new_null = pd.read_excel('new_spotify_correct_null.xlsx')

new_null['track_id'] = new_null['track_id'].str.replace('https://open.spotify.com/track/','')

for i in range(len(new_null)):
    if new_null['track_id'][i] is not np.nan:
        new_null['track_id'][i] = new_null['track_id'][i].split('?si=')[0]
        
sr.iloc[new_null['Unnamed: 0'].values, [3,4,5,6]] = new_null.iloc[:,[4,5,6,7]].values'''

In [104]:
#sr.dropna(subset=['track_id']).to_csv('get_id/song_release_ID.csv', index=False)
#sr.dropna(subset=['track_id']).to_excel('get_id/song_release_ID.xlsx', index=False)

## Get track information

In [93]:
# Reload the releases with their (completed) track IDs; keep the first four columns.
sr = pd.read_csv('song_release_ID_filled.csv').iloc[:, 0:4]

# Expected columns of track_info. Seeding the concat with an empty frame that
# has these columns keeps them present, in this order, whatever the API returns.
track_info_columns = ['artists', 'available_markets', 'disc_number', 'duration_ms',
       'explicit', 'href', 'id', 'is_local', 'name', 'popularity',
       'preview_url', 'track_number', 'type', 'uri', 'album.album_type',
       'album.artists', 'album.available_markets',
       'album.external_urls.spotify', 'album.href', 'album.id', 'album.images',
       'album.name', 'album.release_date', 'album.release_date_precision',
       'album.total_tracks', 'album.type', 'album.uri', 'external_ids.isrc',
       'external_urls.spotify']

# The tracks endpoint is queried in batches of 50 IDs.
batch_size = 50
track_pages = [pd.DataFrame(columns=track_info_columns)]

for start in range(0, len(sr), batch_size):
    # Positional slicing clips at the end, so the last (short) batch needs no special case.
    ids = ','.join(sr['track_id'].iloc[start:start + batch_size])

    r = requests.get(BASE_URL + 'tracks', params={'ids': ids}, headers=headers, timeout=30)
    # Surface HTTP errors (expired token, rate limiting) here rather than as
    # a KeyError on 'tracks' below.
    r.raise_for_status()

    # Skip null entries (IDs the API could not resolve) so normalisation cannot fail on them.
    tracks = [t for t in r.json()['tracks'] if t is not None]
    if tracks:
        track_pages.append(pd.json_normalize(tracks))

# Concatenate once instead of re-copying the accumulated frame for every batch.
track_info = pd.concat(track_pages, ignore_index=True)

In [94]:
def getArtistName(artists_list):
    """Return the ``'name'`` values of all artist entries joined by ``', '``.

    ``artists_list`` is an iterable of mappings that each have a ``'name'`` key.
    """
    return ', '.join(entry['name'] for entry in artists_list)

# Keep only the fields needed downstream. Take an explicit copy so the column
# assignment below modifies this frame and not a temporary derived from track_info.
track_info_clean = track_info[['name', 'artists', 'id', 'album.release_date']].copy()

# Replace each list of artist objects with a single comma-separated string of names.
track_info_clean['artists'] = track_info_clean['artists'].map(getArtistName)

In [100]:
# Join each release with its Spotify metadata on the track ID and save the result.
pd.merge(
    sr,
    track_info_clean,
    left_on='track_id',
    right_on='id',
).to_csv('song_release_ID_spotify.csv', index=False)