In [79]:
import pandas as pd
import numpy as np
import os
import requests
import re

In [80]:
import warnings
# Silence every warning category for the rest of the session. NOTE(review): this
# also hides pandas warnings about chained assignment raised by later cells.
warnings.filterwarnings('ignore')

# Concatenate monthly-end Billboard Hot 100

In [81]:
path = '../billboard_data/'
file = os.listdir(path)

year = list(range(2012, 2022))

# Keep only the files whose name starts with one of the chart years. The prefix is
# compared against a set of year strings: a substring test against str(year) would
# also accept fragments of the list repr such as '12, ' or '[201'.
year_prefixes = {str(y) for y in year}
file_name = sorted(f for f in file if f[:4] in year_prefixes)

if not file_name:
    raise FileNotFoundError('no chart files for %d-%d found in %s'
                            % (year[0], year[-1], path))

# add chart date for further reference
## (i.e. if we want to denote songs ever featured top 20 as hit, chart date matters)
# Each chart is tagged with the date taken from its own file name (extension
# stripped) right after it is read, so the column stays correct even when a file
# does not contain exactly 100 rows.
charts = []
for f in file_name:
    chart = pd.read_csv(path + f)
    chart['date'] = f[:-4]
    charts.append(chart)

# concatenate once instead of re-copying the growing frame on every iteration
hit = pd.concat(charts, ignore_index=True)

hit

Unnamed: 0,artist,isNew,last,peak,rank,title,week,date
0,Adele,False,2,1,1,Set Fire To The Rain,21,2012-02-04
1,Rihanna Featuring Calvin Harris,False,1,1,2,We Found Love,18,2012-02-04
2,Flo Rida,False,3,3,3,Good Feeling,17,2012-02-04
3,David Guetta Featuring Nicki Minaj,False,10,4,4,Turn Me On,8,2012-02-04
4,Katy Perry,False,6,3,5,The One That Got Away,15,2012-02-04
...,...,...,...,...,...,...,...,...
11695,Kenny Chesney,False,100,87,96,Knowing You,6,2021-10-02
11696,Lil Nas X Featuring Miley Cyrus,True,0,97,97,Am I Dreaming,1,2021-10-02
11697,Zac Brown Band,True,0,98,98,Same Boat,1,2021-10-02
11698,H.E.R. Featuring Chris Brown,False,95,64,99,Come Through,15,2021-10-02


In [82]:
# One row per distinct (title, artist) pair -- the first occurrence in `hit` is the
# one kept -- reduced to title / artist / chart date and flagged with hit = 1.
hit_unique = (
    hit.drop_duplicates(subset=['title', 'artist'])
       .loc[:, ['title', 'artist', 'date']]
       .assign(hit=1)
       .reset_index(drop=True)
)
hit_unique

Unnamed: 0,title,artist,date,hit
0,Set Fire To The Rain,Adele,2012-02-04,1
1,We Found Love,Rihanna Featuring Calvin Harris,2012-02-04,1
2,Good Feeling,Flo Rida,2012-02-04,1
3,Turn Me On,David Guetta Featuring Nicki Minaj,2012-02-04,1
4,The One That Got Away,Katy Perry,2012-02-04,1
...,...,...,...,...
3267,7am On Bridle Path,Drake,2021-10-02,1
3268,Off The Grid,Kanye West,2021-10-02,1
3269,In Da Getto,J Balvin & Skrillex,2021-10-02,1
3270,Am I Dreaming,Lil Nas X Featuring Miley Cyrus,2021-10-02,1


In [83]:
# Empty placeholder columns for the Spotify lookup results. They are created with
# object dtype (still all-NaN, so .isnull() behaves as before) because the search
# cells below store strings and a list of artist names in them; float64 columns
# would have to be silently upcast on the first write.
for col in ['track_id', 'release_date', 'spotify.name', 'spotify.artists']:
    hit_unique[col] = pd.Series(np.nan, index=hit_unique.index, dtype=object)

## Get Spotify track ID

In [84]:
def getArtistNameList(result_artists):
    """Return the ``'name'`` value of every artist entry, in order, as a list.

    ``result_artists`` is an iterable of mappings that each have a ``'name'`` key.
    """
    return [entry['name'] for entry in result_artists]

### Direct search by title and artists

In [None]:
# Spotify API credentials. They are read from the environment so real values never
# have to be saved inside the notebook; when the variables are unset the original
# empty placeholders are used.
CLIENT_ID = os.environ.get('SPOTIFY_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('SPOTIFY_CLIENT_SECRET', '')

AUTH_URL = 'https://accounts.spotify.com/api/token'

# POST
auth_response = requests.post(AUTH_URL, {
    'grant_type': 'client_credentials',
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
}, timeout=30)
# fail here with the HTTP error rather than with a KeyError on 'access_token' below
auth_response.raise_for_status()

# convert the response to JSON
auth_response_data = auth_response.json()

# save the access token
access_token = auth_response_data['access_token']
headers = {
    'Authorization': 'Bearer {token}'.format(token=access_token)
}

# base URL of all Spotify API endpoints
BASE_URL = 'https://api.spotify.com/v1/'

# The result columns are all-NaN placeholders at this point; make sure they are
# object dtype so strings and the artist-name list can be stored in them.
hit_unique = hit_unique.astype({col: object for col in
                                ['track_id', 'release_date', 'spotify.name', 'spotify.artists']})

# Resume right after the last row that already has a track ID (start at 0 when no
# row has one yet), so the cell can be re-run after an interruption.
last_done = hit_unique['track_id'].last_valid_index()
start_idx = 0 if last_done is None else last_done + 1

for i in range(start_idx, len(hit_unique)):
    # direct search by title and artist
    title = hit_unique.at[i, 'title']
    artist = hit_unique.at[i, 'artist']

    # Pass the query through `params` so requests URL-encodes it; concatenating it
    # into the URL by hand breaks on titles/artists containing '&' or '#'.
    response = requests.get(BASE_URL + 'search',
                            params={'q': 'track:"%s" AND artist:%s' % (title, artist),
                                    'type': 'track',
                                    'limit': 1},
                            headers=headers, timeout=30)
    if not response.ok:
        # report the failure (expired token, rate limit, ...) instead of hiding it
        print(i, 'search failed with HTTP status', response.status_code)
        continue

    items = (response.json().get('tracks') or {}).get('items') or []
    if not items:
        # nothing found for this song: leave the row empty for the later passes
        continue

    results = items[0]
    artists_i = getArtistNameList(results['artists'])

    artist_lc = artist.lower()
    artist_words = artist_lc.split()
    for a in artists_i:
        a_lc = a.lower()
        a_words = a_lc.split()
        # deal with middle name -> match only first and last name
        same_first_last = (bool(artist_words) and bool(a_words)
                           and artist_words[0] in a_words[0]
                           and artist_words[-1] in a_words[-1])
        if (a_lc in artist_lc) or (artist_lc in a_lc) or same_first_last:
            print(i, results['id'])
            # .at writes into the frame itself; chained `df[col][i] = ...` may only
            # modify a temporary copy
            hit_unique.at[i, 'track_id'] = results['id']
            hit_unique.at[i, 'release_date'] = results['album']['release_date']
            hit_unique.at[i, 'spotify.name'] = results['name']
            hit_unique.at[i, 'spotify.artists'] = artists_i
            break

### Search by title with artist traversal

In [None]:
# Spotify API credentials. They are read from the environment so real values never
# have to be saved inside the notebook; when the variables are unset the original
# empty placeholders are used.
CLIENT_ID = os.environ.get('SPOTIFY_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('SPOTIFY_CLIENT_SECRET', '')

AUTH_URL = 'https://accounts.spotify.com/api/token'

# POST
auth_response = requests.post(AUTH_URL, {
    'grant_type': 'client_credentials',
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
}, timeout=30)
# fail here with the HTTP error rather than with a KeyError on 'access_token' below
auth_response.raise_for_status()

# convert the response to JSON
auth_response_data = auth_response.json()

# save the access token
access_token = auth_response_data['access_token']
headers = {
    'Authorization': 'Bearer {token}'.format(token=access_token)
}

# base URL of all Spotify API endpoints
BASE_URL = 'https://api.spotify.com/v1/'

# Make sure the result columns are object dtype so strings and the artist-name
# list can be stored in them.
hit_unique = hit_unique.astype({col: object for col in
                                ['track_id', 'release_date', 'spotify.name', 'spotify.artists']})

# only rows that still have no track ID are searched again
for i in hit_unique.index[hit_unique['track_id'].isnull()]:
    title = hit_unique.at[i, 'title']

    # traverse artists for multi-artist songs
    matched = False
    for artist in re.split(', | & | Featuring ', hit_unique.at[i, 'artist']):
        # Pass the query through `params` so requests URL-encodes it; concatenating
        # it into the URL by hand breaks on titles/artists containing '&' or '#'.
        response = requests.get(BASE_URL + 'search',
                                params={'q': 'track:"%s" AND artist:%s' % (title, artist),
                                        'type': 'track',
                                        'limit': 1},
                                headers=headers, timeout=30)
        if not response.ok:
            # report the failure (expired token, rate limit, ...) instead of hiding it
            print(i, 'search failed with HTTP status', response.status_code)
            continue

        items = (response.json().get('tracks') or {}).get('items') or []
        if not items:
            # nothing found for this artist: try the next one
            continue

        results = items[0]
        artists_i = getArtistNameList(results['artists'])

        artist_lc = artist.lower()
        artist_words = artist_lc.split()
        for a in artists_i:
            a_lc = a.lower()
            a_words = a_lc.split()
            # deal with middle name -> match only first and last name
            same_first_last = (bool(artist_words) and bool(a_words)
                               and artist_words[0] in a_words[0]
                               and artist_words[-1] in a_words[-1])
            if (a_lc in artist_lc) or (artist_lc in a_lc) or same_first_last:
                print(i, results['id'])
                # .at writes into the frame itself; chained `df[col][i] = ...` may
                # only modify a temporary copy
                hit_unique.at[i, 'track_id'] = results['id']
                hit_unique.at[i, 'release_date'] = results['album']['release_date']
                hit_unique.at[i, 'spotify.name'] = results['name']
                hit_unique.at[i, 'spotify.artists'] = artists_i
                matched = True
                break

        # Stop traversing only once a result was accepted. Breaking unconditionally
        # here would give up after the first artist that returned any result, even
        # a non-matching one, and never try the remaining artists.
        if matched:
            break

### Search by title (cleaned) and traverse artists

In [None]:
# Spotify API credentials. They are read from the environment so real values never
# have to be saved inside the notebook; when the variables are unset the original
# empty placeholders are used.
CLIENT_ID = os.environ.get('SPOTIFY_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('SPOTIFY_CLIENT_SECRET', '')

AUTH_URL = 'https://accounts.spotify.com/api/token'

# POST
auth_response = requests.post(AUTH_URL, {
    'grant_type': 'client_credentials',
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
}, timeout=30)
# fail here with the HTTP error rather than with a KeyError on 'access_token' below
auth_response.raise_for_status()

# convert the response to JSON
auth_response_data = auth_response.json()

# save the access token
access_token = auth_response_data['access_token']
headers = {
    'Authorization': 'Bearer {token}'.format(token=access_token)
}

# base URL of all Spotify API endpoints
BASE_URL = 'https://api.spotify.com/v1/'

# Make sure the result columns are object dtype so strings and the artist-name
# list can be stored in them.
hit_unique = hit_unique.astype({col: object for col in
                                ['track_id', 'release_date', 'spotify.name', 'spotify.artists']})

# only rows that still have no track ID are searched again
for i in hit_unique.index[hit_unique['track_id'].isnull()]:
    # clean title: keep only alphanumeric characters and whitespace
    title = ''.join(c for c in hit_unique.at[i, 'title'] if c.isalnum() or c.isspace())

    # traverse artists for multi-artist songs (this pass also splits on ' X ')
    matched = False
    for artist in re.split(', | & | Featuring | X ', hit_unique.at[i, 'artist']):
        # direct search by title and artist
        # Pass the query through `params` so requests URL-encodes it; concatenating
        # it into the URL by hand breaks on artists containing '&' or '#'.
        response = requests.get(BASE_URL + 'search',
                                params={'q': 'track:"%s" AND artist:%s' % (title, artist),
                                        'type': 'track',
                                        'limit': 1},
                                headers=headers, timeout=30)
        if not response.ok:
            # report the failure (expired token, rate limit, ...) instead of hiding it
            print(i, 'search failed with HTTP status', response.status_code)
            continue

        items = (response.json().get('tracks') or {}).get('items') or []
        if not items:
            # nothing found for this artist: try the next one
            continue

        results = items[0]
        artists_i = getArtistNameList(results['artists'])

        artist_lc = artist.lower()
        artist_words = artist_lc.split()
        for a in artists_i:
            a_lc = a.lower()
            a_words = a_lc.split()
            # deal with middle name -> match only first and last name
            same_first_last = (bool(artist_words) and bool(a_words)
                               and artist_words[0] in a_words[0]
                               and artist_words[-1] in a_words[-1])
            if (a_lc in artist_lc) or (artist_lc in a_lc) or same_first_last:
                print(i, results['id'])
                # .at writes into the frame itself; chained `df[col][i] = ...` may
                # only modify a temporary copy
                hit_unique.at[i, 'track_id'] = results['id']
                hit_unique.at[i, 'release_date'] = results['album']['release_date']
                hit_unique.at[i, 'spotify.name'] = results['name']
                hit_unique.at[i, 'spotify.artists'] = artists_i
                matched = True
                break

        # Stop traversing only once a result was accepted. Breaking unconditionally
        # here would give up after the first artist that returned any result, even
        # a non-matching one, and never try the remaining artists.
        if matched:
            break

### Null

In [90]:
#hit_unique[hit_unique['track_id'].isnull()].to_csv('../billboard_null.csv')

#hit_null = pd.read_csv('../billboard_null.csv')

#hit_unique.iloc[hit_null['Unnamed: 0'], 4] = hit_null.iloc[:,5]

### Title discrepancy

In [None]:
#title_dif_idx = []
#for i in hit_unique.loc[~hit_unique['spotify.name'].isnull()].index.values:
    #if str(hit_unique['title'][i]).lower() not in str(hit_unique['spotify.name'][i]).lower():
        #title_dif_idx.append(i)

#title_dif = hit_unique.iloc[title_dif_idx]

#new_title = pd.read_excel('../title.xlsx')
#hit_unique.iloc[new_title['Unnamed: 0'], 4] = new_title.iloc[:,5]

### Duplicated ID

In [None]:
#dup = hit_unique[hit_unique.duplicated(['track_id'],keep=False)]
#dup = dup.loc[~dup['track_id'].isnull()].sort_values('track_id')

#new_dup = pd.read_excel('../dup.xlsx')
#hit_unique.iloc[new_dup['Unnamed: 0'], 4] = new_dup.iloc[:,5]

In [102]:
#hit_unique.dropna(subset=['track_id']).to_csv('../billboard_ID.csv', index=False)

## Get track information

In [103]:
# Reload the saved Billboard/Spotify ID table and keep only its first five columns
# (selected by position).
hit_unique = pd.read_csv('../billboard_ID.csv')
hit_unique = hit_unique.iloc[:, :5]
hit_unique

Unnamed: 0,title,artist,date,hit,track_id
0,Set Fire To The Rain,Adele,2012-02-04,1,73CMRj62VK8nUS4ezD2wvi
1,We Found Love,Rihanna Featuring Calvin Harris,2012-02-04,1,6qn9YLKt13AGvpq9jfO8py
2,Good Feeling,Flo Rida,2012-02-04,1,2LEF1A8DOZ9wRYikWgVlZ8
3,Turn Me On,David Guetta Featuring Nicki Minaj,2012-02-04,1,6JOlNkT0QdHeZB0wPbI9IR
4,The One That Got Away,Katy Perry,2012-02-04,1,6hkOqJ5mE093AQf2lbZnsG
...,...,...,...,...,...
3266,7am On Bridle Path,Drake,2021-10-02,1,42m3eP1JJhtzffal9B136J
3267,Off The Grid,Kanye West,2021-10-02,1,6LNoArVBBVZzUTUiAX2aKO
3268,In Da Getto,J Balvin & Skrillex,2021-10-02,1,63aj87TQG6F3RVO5nbG2VQ
3269,Am I Dreaming,Lil Nas X Featuring Miley Cyrus,2021-10-02,1,6isTQfKXhNO3EyJd9mSxx8


In [110]:
# initialize track_info dataframe
# The empty frame fixes the expected columns and their order; it is placed first in
# the final concat so the result keeps that layout.
track_pages = [pd.DataFrame(columns=['artists', 'available_markets', 'disc_number', 'duration_ms',
       'explicit', 'href', 'id', 'is_local', 'name', 'popularity',
       'preview_url', 'track_number', 'type', 'uri', 'album.album_type',
       'album.artists', 'album.available_markets',
       'album.external_urls.spotify', 'album.href', 'album.id', 'album.images',
       'album.name', 'album.release_date', 'album.release_date_precision',
       'album.total_tracks', 'album.type', 'album.uri', 'external_ids.isrc',
       'external_urls.spotify'])]

# Request the tracks in batches of 50 IDs. BASE_URL and headers are the ones set by
# the search cells above. NOTE(review): the access token may have expired if those
# cells were run long ago -- re-run one of them if the request is rejected.
for i in range(0, len(hit_unique), 50):
    # positional slicing past the end is safe, so the last (shorter) batch needs no
    # special case
    ids = ','.join(hit_unique['track_id'].iloc[i:i+50])

    r = requests.get(BASE_URL + 'tracks?ids=' + ids, headers=headers, timeout=30)
    # surface HTTP errors directly instead of a KeyError on 'tracks' below
    r.raise_for_status()

    # entries for IDs the API could not resolve come back as null; drop them before
    # flattening
    tracks = [t for t in r.json()['tracks'] if t is not None]
    if tracks:
        track_pages.append(pd.json_normalize(tracks))

# concatenate once instead of re-copying the growing frame on every batch
track_info = pd.concat(track_pages, ignore_index=True)

In [None]:
def getArtistName(artists_list):
    """Join the ``'name'`` value of every artist entry into one ``', '``-separated string.

    ``artists_list`` is an iterable of mappings that each have a ``'name'`` key; an
    empty iterable yields an empty string.
    """
    return ', '.join(entry['name'] for entry in artists_list)

# Take an explicit copy of the four columns of interest so the assignment below
# changes track_info_clean itself. Writing element by element through chained
# indexing on a slice of track_info triggers SettingWithCopyWarning and is not
# guaranteed to modify the frame.
track_info_clean = track_info[['name', 'artists', 'id', 'album.release_date']].copy()

# collapse each track's list of artist entries into a single "A, B" string
track_info_clean['artists'] = track_info_clean['artists'].map(getArtistName)

In [None]:
# display the cleaned track table (columns: name, artists, id, album.release_date)
track_info_clean

In [None]:
# Join the Billboard rows with the Spotify track details on the track ID and write
# the combined table to CSV without the index column.
billboard_spotify = hit_unique.merge(track_info_clean, left_on='track_id', right_on='id')
billboard_spotify.to_csv('../billboard_ID_spotify.csv', index=False)