In [1]:
import os

import numpy as np
import pandas as pd
import requests

## Load data

In [121]:
# Read the Spotify dataset and keep only the rows that carry a track id
df = pd.read_csv('spotify_2.csv')
has_track_id = df['track_id'].notnull()
df = df[has_track_id].reset_index(drop=True)

# Attach the hit / non-hit label by matching on title and artist
hit_columns = ['title', 'artist_x', 'hit']
hitsongs = pd.read_csv('merge_w_lyrics_full.csv')[hit_columns]
df = pd.merge(df, hitsongs, on=['title', 'artist_x'], how='left')
df

Unnamed: 0,title,artist_x,track_id,release_data,hit
0,"10,000 Reasons (Bless the Lord)",Matt Redman,0fxpHpK3aw2nFWII6yveDD,2011-01-01,0.0
1,100 Proof,Kellie Pickler,3A3wFMufIxSUrvfGUVYvhE,2012-01-23,0.0
2,101,Alicia Keys,7wMPExYjxfDzu4SWDUt7dQ,2012-11-26,0.0
3,1313,The Big Pink,6OHDgi8V4Q6hFYvQavBVaF,2012-01-16,0.0
4,1961,The Fray,2MLQ6QSJLqaqnz00rCDsFl,2012-02-07,0.0
...,...,...,...,...,...
18079,Danny Phantom,Trippie Redd,7jjbfUEecddD2AW1UncQzb,2021-08-21,1.0
18080,What's Wrong,Rod Wave,1jNapyyVxPB4X6gewQjEM8,2021-08-20,1.0
18081,Demon Time,Trippie Redd,1hE8iI3YK9x1VUBDtSzg3x,2021-08-21,1.0
18082,Tick Tock,Young Thug,3JXebpWufTcs8iegiRCBW7,2021-08-20,1.0


## Access Spotify API

In [4]:
# Spotify application credentials are read from the environment so that no
# secret is stored in (or committed with) the notebook. Set SPOTIFY_CLIENT_ID
# and SPOTIFY_CLIENT_SECRET before starting the kernel.
# NOTE(review): any credentials that were previously hardcoded in this cell
# should be treated as leaked and rotated in the Spotify developer dashboard.
CLIENT_ID = os.environ['SPOTIFY_CLIENT_ID']
CLIENT_SECRET = os.environ['SPOTIFY_CLIENT_SECRET']

AUTH_URL = 'https://accounts.spotify.com/api/token'

# POST the client-credentials grant to obtain an access token
auth_response = requests.post(
    AUTH_URL,
    data={
        'grant_type': 'client_credentials',
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
    },
    timeout=30,  # do not hang the kernel forever if the endpoint is unreachable
)
# Surface rejected credentials / server errors here instead of as a
# KeyError on 'access_token' below.
auth_response.raise_for_status()

# convert the response to JSON
auth_response_data = auth_response.json()

# save the access token and build the header sent with every API request
access_token = auth_response_data['access_token']
headers = {
    'Authorization': 'Bearer {token}'.format(token=access_token)
}

BASE_URL = 'https://api.spotify.com/v1/'

## Get audio features

In [5]:
# Empty frame that fixes the column layout of the audio features
# collected for each track.
audio_feature_columns = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
    'time_signature',
]
audio_features = pd.DataFrame(columns=audio_feature_columns)

In [57]:
# Fetch audio features for every track id in df, in batches.
# The loop below sends at most 100 ids per request (max ids is 100).
audio_batch_size = 100
audio_feature_batches = []
for start in range(0, len(df), audio_batch_size):
    # Slicing past the end of the Series is safe, so the final (shorter)
    # batch needs no special case.
    ids = ','.join(df['track_id'].iloc[start:start + audio_batch_size])

    r = requests.get(BASE_URL + 'audio-features?ids=' + ids, headers=headers, timeout=30)
    # Stop on an HTTP error (expired token, rate limit, ...) instead of
    # failing later with a KeyError on the missing 'audio_features' key.
    r.raise_for_status()

    audio_feature_batches.append(pd.json_normalize(r.json()['audio_features']))

# Concatenate once instead of growing the frame inside the loop (which copies
# all accumulated rows on every iteration). Starting from the zero-row slice
# keeps the column layout of the initialised frame while making this cell
# safe to re-run without appending the same tracks a second time.
audio_features = pd.concat([audio_features.iloc[0:0], *audio_feature_batches],
                           ignore_index=True)

In [58]:
audio_features

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.431,0.485,7,-8.085,1,0.0274,0.452000,0.000000,0.5440,0.3670,145.220,audio_features,0fxpHpK3aw2nFWII6yveDD,spotify:track:0fxpHpK3aw2nFWII6yveDD,https://api.spotify.com/v1/tracks/0fxpHpK3aw2n...,https://api.spotify.com/v1/audio-analysis/0fxp...,342493,4
1,0.543,0.642,1,-5.231,1,0.0255,0.386000,0.000014,0.0983,0.1610,132.291,audio_features,3A3wFMufIxSUrvfGUVYvhE,spotify:track:3A3wFMufIxSUrvfGUVYvhE,https://api.spotify.com/v1/tracks/3A3wFMufIxSU...,https://api.spotify.com/v1/audio-analysis/3A3w...,226360,4
2,0.277,0.289,7,-9.389,0,0.0599,0.791000,0.000632,0.0838,0.0825,170.046,audio_features,7wMPExYjxfDzu4SWDUt7dQ,spotify:track:7wMPExYjxfDzu4SWDUt7dQ,https://api.spotify.com/v1/tracks/7wMPExYjxfDz...,https://api.spotify.com/v1/audio-analysis/7wMP...,387827,3
3,0.574,0.900,10,-7.976,0,0.0630,0.000318,0.055800,0.1600,0.4030,105.991,audio_features,6OHDgi8V4Q6hFYvQavBVaF,spotify:track:6OHDgi8V4Q6hFYvQavBVaF,https://api.spotify.com/v1/tracks/6OHDgi8V4Q6h...,https://api.spotify.com/v1/audio-analysis/6OHD...,352653,4
4,0.567,0.815,9,-4.787,1,0.0390,0.014200,0.000000,0.1480,0.6630,121.988,audio_features,2MLQ6QSJLqaqnz00rCDsFl,spotify:track:2MLQ6QSJLqaqnz00rCDsFl,https://api.spotify.com/v1/tracks/2MLQ6QSJLqaq...,https://api.spotify.com/v1/audio-analysis/2MLQ...,234080,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18079,0.634,0.804,11,-6.688,0,0.4770,0.162000,0.000000,0.3510,0.8090,147.259,audio_features,7jjbfUEecddD2AW1UncQzb,spotify:track:7jjbfUEecddD2AW1UncQzb,https://api.spotify.com/v1/tracks/7jjbfUEecddD...,https://api.spotify.com/v1/audio-analysis/7jjb...,136377,4
18080,0.712,0.636,1,-7.188,1,0.0566,0.303000,0.000000,0.1120,0.1260,134.927,audio_features,1jNapyyVxPB4X6gewQjEM8,spotify:track:1jNapyyVxPB4X6gewQjEM8,https://api.spotify.com/v1/tracks/1jNapyyVxPB4...,https://api.spotify.com/v1/audio-analysis/1jNa...,155259,5
18081,0.718,0.684,11,-5.160,0,0.2480,0.018200,0.000000,0.2330,0.8560,146.084,audio_features,1hE8iI3YK9x1VUBDtSzg3x,spotify:track:1hE8iI3YK9x1VUBDtSzg3x,https://api.spotify.com/v1/tracks/1hE8iI3YK9x1...,https://api.spotify.com/v1/audio-analysis/1hE8...,159452,4
18082,0.944,0.693,11,-8.605,0,0.2660,0.007890,0.000000,0.1640,0.6260,134.061,audio_features,3JXebpWufTcs8iegiRCBW7,spotify:track:3JXebpWufTcs8iegiRCBW7,https://api.spotify.com/v1/tracks/3JXebpWufTcs...,https://api.spotify.com/v1/audio-analysis/3JXe...,159240,4


In [59]:
#audio_features.to_csv('audio_features.csv', index=False)

## Get track info

In [78]:
# Empty frame that fixes the column layout of the track metadata
# (flattened track fields, with nested album / external id fields dotted).
track_info_columns = [
    'artists', 'available_markets', 'disc_number', 'duration_ms',
    'explicit', 'href', 'id', 'is_local', 'name', 'popularity',
    'preview_url', 'track_number', 'type', 'uri',
    'album.album_type', 'album.artists', 'album.available_markets',
    'album.external_urls.spotify', 'album.href', 'album.id', 'album.images',
    'album.name', 'album.release_date', 'album.release_date_precision',
    'album.total_tracks', 'album.type', 'album.uri',
    'external_ids.isrc', 'external_urls.spotify',
]
track_info = pd.DataFrame(columns=track_info_columns)

In [83]:
# Fetch track metadata for every track id in df, 50 ids per request.
# The loop starts at 0: a hard-coded resume offset would silently skip the
# earlier tracks whenever the notebook is run from a fresh kernel.
track_batch_size = 50
track_info_batches = []
for start in range(0, len(df), track_batch_size):
    # Slicing past the end of the Series is safe, so the final (shorter)
    # batch needs no special case.
    ids = ','.join(df['track_id'].iloc[start:start + track_batch_size])

    r = requests.get(BASE_URL + 'tracks?ids=' + ids, headers=headers, timeout=30)
    # Stop on an HTTP error (expired token, rate limit, ...) instead of
    # failing later with a KeyError on the missing 'tracks' key.
    r.raise_for_status()

    track_info_batches.append(pd.json_normalize(r.json()['tracks']))

# Concatenate once instead of growing the frame inside the loop. Starting
# from the zero-row slice keeps the initialised column layout and makes the
# cell safe to re-run without duplicating rows.
track_info = pd.concat([track_info.iloc[0:0], *track_info_batches],
                       ignore_index=True)

In [85]:
#track_info.to_csv('track_info.csv', index=False)

## Clean and merge all Spotify data

In [None]:
def getArtistName(artists_list):
    """Return the artists' names as a single comma-separated string.

    Parameters
    ----------
    artists_list : iterable of mappings
        Each element must provide a ``'name'`` key.

    Returns
    -------
    str
        The names joined with ``', '`` in their original order; an empty
        string when ``artists_list`` is empty.
    """
    return ', '.join(artist['name'] for artist in artists_list)

In [111]:
# Clean track info data
# Take an explicit copy of the columns we keep, so the assignment below
# modifies a frame of its own rather than a selection of track_info.
track_info_clean = track_info[['name','artists','id','album.release_date']].copy()

# Replace each list of artist records with a comma-separated string of names.
# Assigning the whole column avoids the chained `frame[col][i] = ...` pattern,
# which triggers SettingWithCopyWarning and has no effect under pandas
# copy-on-write, and it does not depend on the index being 0..n-1.
track_info_clean['artists'] = track_info_clean['artists'].map(getArtistName)

Unnamed: 0,name,artists,id,album.release_date
0,"10,000 Reasons (Bless The Lord) - Live",[{'external_urls': {'spotify': 'https://open.s...,0fxpHpK3aw2nFWII6yveDD,2011-01-01
1,100 Proof,[{'external_urls': {'spotify': 'https://open.s...,3A3wFMufIxSUrvfGUVYvhE,2012-01-23
2,101,[{'external_urls': {'spotify': 'https://open.s...,7wMPExYjxfDzu4SWDUt7dQ,2012-11-26
3,1313,[{'external_urls': {'spotify': 'https://open.s...,6OHDgi8V4Q6hFYvQavBVaF,2012-01-16
4,1961,[{'external_urls': {'spotify': 'https://open.s...,2MLQ6QSJLqaqnz00rCDsFl,2012-02-07
...,...,...,...,...
18079,Danny Phantom (feat. XXXTENTACION),[{'external_urls': {'spotify': 'https://open.s...,7jjbfUEecddD2AW1UncQzb,2021-08-21
18080,What's Wrong,[{'external_urls': {'spotify': 'https://open.s...,1jNapyyVxPB4X6gewQjEM8,2021-08-20
18081,Demon Time (feat. Ski Mask The Slump God),[{'external_urls': {'spotify': 'https://open.s...,1hE8iI3YK9x1VUBDtSzg3x,2021-08-21
18082,Tick Tock,[{'external_urls': {'spotify': 'https://open.s...,3JXebpWufTcs8iegiRCBW7,2021-08-20


In [114]:
track_info_clean

Unnamed: 0,name,artists,id,album.release_date
0,"10,000 Reasons (Bless The Lord) - Live",Matt Redman,0fxpHpK3aw2nFWII6yveDD,2011-01-01
1,100 Proof,Kellie Pickler,3A3wFMufIxSUrvfGUVYvhE,2012-01-23
2,101,Alicia Keys,7wMPExYjxfDzu4SWDUt7dQ,2012-11-26
3,1313,The Big Pink,6OHDgi8V4Q6hFYvQavBVaF,2012-01-16
4,1961,The Fray,2MLQ6QSJLqaqnz00rCDsFl,2012-02-07
...,...,...,...,...
18079,Danny Phantom (feat. XXXTENTACION),"Trippie Redd, XXXTENTACION",7jjbfUEecddD2AW1UncQzb,2021-08-21
18080,What's Wrong,Rod Wave,1jNapyyVxPB4X6gewQjEM8,2021-08-20
18081,Demon Time (feat. Ski Mask The Slump God),"Trippie Redd, Ski Mask The Slump God",1hE8iI3YK9x1VUBDtSzg3x,2021-08-21
18082,Tick Tock,Young Thug,3JXebpWufTcs8iegiRCBW7,2021-08-20


In [115]:
#track_info_clean.to_csv('track_info_clean.csv', index=False)

In [162]:
# Combine the cleaned track metadata with the audio features, matching on the
# Spotify track id and keeping every row of the track metadata.
audio_columns_to_merge = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'id', 'duration_ms', 'time_signature',
]
spotify = track_info_clean.merge(audio_features[audio_columns_to_merge],
                                 on='id', how='left')

# Keep the first row for each track id
spotify = spotify.drop_duplicates(subset='id')
spotify

Unnamed: 0,name,artists,id,album.release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,"10,000 Reasons (Bless The Lord) - Live",Matt Redman,0fxpHpK3aw2nFWII6yveDD,2011-01-01,0.431,0.485,7,-8.085,1,0.0274,0.452000,0.000000,0.5440,0.3670,145.220,342493,4
1,100 Proof,Kellie Pickler,3A3wFMufIxSUrvfGUVYvhE,2012-01-23,0.543,0.642,1,-5.231,1,0.0255,0.386000,0.000014,0.0983,0.1610,132.291,226360,4
2,101,Alicia Keys,7wMPExYjxfDzu4SWDUt7dQ,2012-11-26,0.277,0.289,7,-9.389,0,0.0599,0.791000,0.000632,0.0838,0.0825,170.046,387827,3
3,1313,The Big Pink,6OHDgi8V4Q6hFYvQavBVaF,2012-01-16,0.574,0.900,10,-7.976,0,0.0630,0.000318,0.055800,0.1600,0.4030,105.991,352653,4
4,1961,The Fray,2MLQ6QSJLqaqnz00rCDsFl,2012-02-07,0.567,0.815,9,-4.787,1,0.0390,0.014200,0.000000,0.1480,0.6630,121.988,234080,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19007,Danny Phantom (feat. XXXTENTACION),"Trippie Redd, XXXTENTACION",7jjbfUEecddD2AW1UncQzb,2021-08-21,0.634,0.804,11,-6.688,0,0.4770,0.162000,0.000000,0.3510,0.8090,147.259,136377,4
19008,What's Wrong,Rod Wave,1jNapyyVxPB4X6gewQjEM8,2021-08-20,0.712,0.636,1,-7.188,1,0.0566,0.303000,0.000000,0.1120,0.1260,134.927,155259,5
19009,Demon Time (feat. Ski Mask The Slump God),"Trippie Redd, Ski Mask The Slump God",1hE8iI3YK9x1VUBDtSzg3x,2021-08-21,0.718,0.684,11,-5.160,0,0.2480,0.018200,0.000000,0.2330,0.8560,146.084,159452,4
19010,Tick Tock,Young Thug,3JXebpWufTcs8iegiRCBW7,2021-08-20,0.944,0.693,11,-8.605,0,0.2660,0.007890,0.000000,0.1640,0.6260,134.061,159240,4


In [166]:
# Attach the hit label to each track: one label row per track id is taken
# from df and joined onto the merged Spotify data by id.
hit_labels = df[['track_id', 'hit']].drop_duplicates(['track_id'])
spotify_hit = pd.merge(spotify, hit_labels, how='left',
                       left_on='id', right_on='track_id')
# the join key from df duplicates 'id', so it is dropped again
spotify_hit = spotify_hit.drop(columns='track_id')
spotify_hit

Unnamed: 0,name,artists,id,album.release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,hit
0,"10,000 Reasons (Bless The Lord) - Live",Matt Redman,0fxpHpK3aw2nFWII6yveDD,2011-01-01,0.431,0.485,7,-8.085,1,0.0274,0.452000,0.000000,0.5440,0.3670,145.220,342493,4,0.0
1,100 Proof,Kellie Pickler,3A3wFMufIxSUrvfGUVYvhE,2012-01-23,0.543,0.642,1,-5.231,1,0.0255,0.386000,0.000014,0.0983,0.1610,132.291,226360,4,0.0
2,101,Alicia Keys,7wMPExYjxfDzu4SWDUt7dQ,2012-11-26,0.277,0.289,7,-9.389,0,0.0599,0.791000,0.000632,0.0838,0.0825,170.046,387827,3,0.0
3,1313,The Big Pink,6OHDgi8V4Q6hFYvQavBVaF,2012-01-16,0.574,0.900,10,-7.976,0,0.0630,0.000318,0.055800,0.1600,0.4030,105.991,352653,4,0.0
4,1961,The Fray,2MLQ6QSJLqaqnz00rCDsFl,2012-02-07,0.567,0.815,9,-4.787,1,0.0390,0.014200,0.000000,0.1480,0.6630,121.988,234080,4,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17628,Danny Phantom (feat. XXXTENTACION),"Trippie Redd, XXXTENTACION",7jjbfUEecddD2AW1UncQzb,2021-08-21,0.634,0.804,11,-6.688,0,0.4770,0.162000,0.000000,0.3510,0.8090,147.259,136377,4,1.0
17629,What's Wrong,Rod Wave,1jNapyyVxPB4X6gewQjEM8,2021-08-20,0.712,0.636,1,-7.188,1,0.0566,0.303000,0.000000,0.1120,0.1260,134.927,155259,5,1.0
17630,Demon Time (feat. Ski Mask The Slump God),"Trippie Redd, Ski Mask The Slump God",1hE8iI3YK9x1VUBDtSzg3x,2021-08-21,0.718,0.684,11,-5.160,0,0.2480,0.018200,0.000000,0.2330,0.8560,146.084,159452,4,1.0
17631,Tick Tock,Young Thug,3JXebpWufTcs8iegiRCBW7,2021-08-20,0.944,0.693,11,-8.605,0,0.2660,0.007890,0.000000,0.1640,0.6260,134.061,159240,4,1.0
