In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests 
import base64
from time import sleep, time
from dotenv import dotenv_values
from random import randint
config = dotenv_values(".env")
CLIENT_ID = config["CLIENT_ID"]
CLIENT_SECRET = config["CLIENT_SECRET"]

to_encode = CLIENT_ID + ':' + CLIENT_SECRET
base64_encoded_id_secret = base64.b64encode(to_encode.encode()).decode()

res = requests.post('https://accounts.spotify.com/api/token', 
           headers = {'Authorization': 'Basic {}'.format(base64_encoded_id_secret)}, 
           data = {'grant_type': 'client_credentials'})

token = 'Bearer {}'.format(res.json()['access_token'])

headers = {'Authorization': token, "Accept": 'application/json', 'Content-Type': "application/json"}

In [27]:
token

'Bearer BQCxAQJCEsJaNJrfgJvqUlTU5GHweOaWgmF0hadgNk05oVzzhOr3LAyUt4_FIpk8xszeT7jt1GOfpIHjdc0'

In [28]:
#test header
url="https://api.spotify.com/v1/tracks/5Z9KJZvQzH6PFmb8SNkxuk"
r = requests.get(url, headers=headers)

In [8]:
#check for 200 response
r

<Response [200]>

In [9]:
# import tracks csv
tracks = pd.read_csv('/Users/joycetagal/Github/metis/reg/spotifydata_2020-09-30_2021-09-30.csv')

In [10]:
tracks.shape

(10200, 5)

In [11]:
tracks.drop_duplicates(subset=["Song ID"], inplace=True)

In [12]:
tracks.shape

(1388, 5)

In [13]:
tracks.drop(columns=['Streams'], inplace=True)
tracks.head()

Unnamed: 0,Title,Artist,Song ID,Weekly Chart Date
0,INDUSTRY BABY (feat. Jack Harlow),Lil Nas X,5Z9KJZvQzH6PFmb8SNkxuk,2021-09-17 - 2021-09-24
1,Way 2 Sexy (with Future & Young Thug),Drake,0k1WUmIRnG3xU6fvvDVfRG,2021-09-17 - 2021-09-24
2,STAY (with Justin Bieber),The Kid LAROI,5PjdY0CKGZdEuoNab3yDmX,2021-09-17 - 2021-09-24
3,THATS WHAT I WANT,Lil Nas X,0e8nrvls4Qqv5Rfa2UhqmO,2021-09-17 - 2021-09-24
4,Knife Talk (with 21 Savage ft. Project Pat),Drake,2BcMwX1MPV6ZHP4tUT9uq6,2021-09-17 - 2021-09-24


In [14]:
tracks.groupby(['Song ID']).count().sort_values(by='Title', ascending=False)

Unnamed: 0_level_0,Title,Artist,Weekly Chart Date
Song ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
003vvx7Niy0yvhvHt4a68B,1,1,1
51A8eKvvZz9uydvIZ7xRSV,1,1,1
54eUv9Q3D9MZuOGS5PSflp,1,1,1
54bFM56PmE4YLRnqpW6Tha,1,1,1
54DmTIv86D3sYdiawjULQ0,1,1,1
...,...,...,...
2Y0ktCGrGoGcQFXsGztvhi,1,1,1
2Xr1dTzJee307rmrkt8c0g,1,1,1
2XIc1pqjXV3Cr2BQUGNBck,1,1,1
2X7c8wSJD0urdk9aWzpu5D,1,1,1


In [29]:
def process_df(tracks, out_list):
    res = requests.post('https://accounts.spotify.com/api/token', 
               headers = {'Authorization': 'Basic {}'.format(base64_encoded_id_secret)}, 
               data= {'grant_type': 'client_credentials'})
    token = 'Bearer {}'.format(res.json()['access_token'])
    headers = {'Authorization': token, "Accept": 'application/json', 'Content-Type': "application/json"}
    start_time = time()
    songs_analyzed = 0

    songs_info = pd.DataFrame(columns=["id", "popularity"])
    songs_analysis = pd.DataFrame(columns=["id", "duration", "loudness", "tempo", "tempo_confidence", "time_signature", 
                                       "time_signature_confidence", "key", "key_confidence", "mode", "mode_confidence"])

    songs_features = pd.DataFrame(columns=["id", "acousticness", "danceability", "energy", "instrumentalness", "liveness", 
                                       "speechiness", "valence"])

    for i in range(len(tracks["Song ID"])):   
        url="https://api.spotify.com/v1/tracks/{}".format(tracks["Song ID"][i])
    
        r = requests.get(url, headers=headers)

        try:
            r = r.json()
            songs_info = songs_info.append({"id": tracks["Song ID"][i], "popularity": r["popularity"]}, ignore_index = True)
        except:
            songs_info = songs_info.append({"id": tracks["Song ID"][i], "popularity": "NaN"}, ignore_index = True)
            print("Info value not found")
    
        #------------------------#
        url="https://api.spotify.com/v1/audio-analysis/{}".format(tracks["Song ID"][i])
        r = requests.get(url, headers=headers)

        try:
            r = r.json()['track']
            songs_analysis = songs_analysis.append({"id": tracks["Song ID"][i], "duration": r["duration"], "loudness": r["loudness"], "tempo": r["tempo"],
                                                "tempo_confidence": r["tempo_confidence"], "time_signature": r["time_signature"],
                                                "time_signature_confidence": r["time_signature_confidence"], "key": r["key"], 
                                                "key_confidence": r["key_confidence"], "mode": r["mode"], 
                                                "mode_confidence": r["mode_confidence"]}, ignore_index = True)
        except:
            songs_analysis = songs_analysis.append({"id": tracks["Song ID"][i], "duration": "NaN", "loudness": "NaN", "tempo": "NaN",
                                                "tempo_confidence": "NaN", "time_signature": "NaN", "time_signature_confidence": "NaN", 
                                                "key": "NaN", "key_confidence": "NaN", "mode": "NaN", "mode_confidence": "NaN"}, 
                                               ignore_index = True)
            print("Analysis value not found")
        
    
        #-------------------------#
        url="https://api.spotify.com/v1/audio-features/{}".format(tracks["Song ID"][i])
        r = requests.get(url, headers=headers)

        try:
            r=r.json()
            songs_features = songs_features.append({"id": tracks["Song ID"][i], "acousticness": r["acousticness"], "danceability": r["danceability"], 
                                                "energy": r["energy"], "instrumentalness": r["instrumentalness"],
                                                "liveness": r["liveness"], "speechiness": r["speechiness"], 
                                                "valence": r["valence"]}, ignore_index = True)
        
        except:
            songs_features = songs_features.append({"id": tracks["Song ID"][i], "acousticness": "NaN", "danceability": "NaN", 
                                                "energy": "NaN", "instrumentalness": "NaN",
                                                "liveness": "NaN", "speechiness": "NaN", 
                                                "valence": "NaN"}, ignore_index = True)
            print("Features value not found")
        
        sleep(randint(2,10))
        songs_analyzed = i + 1
        elapsed_time = time() - start_time
        print("Songs analyzed: {}; Elapsed Time: {}".format(songs_analyzed, elapsed_time), end = "\r", flush = True)

    out = pd.merge(songs_info, songs_analysis, on='id', how='inner', copy=False)
    out = pd.merge(out, songs_features, on='id', how='inner', copy=False)
    
    out_list.append(out)
    print("ANALYSIS COMPLETE. Songs analyzed: {}; Elapsed Time: {}.".format(songs_analyzed, time()-start_time))
    

In [None]:
url="https://api.spotify.com/v1/tracks/5Z9KJZvQzH6PFmb8SNkxuk"
r = requests.get(url, headers=headers)
r

In [None]:
df_list_test = [tracks_10, tracks_20]

In [36]:
tracks_1 = tracks.iloc[:100,:].reset_index(drop=True)
tracks_2 = tracks.iloc[201:300,:].reset_index(drop=True)
tracks_3 = tracks.iloc[301:400,:].reset_index(drop=True)
tracks_4 = tracks.iloc[401:500,:].reset_index(drop=True)
tracks_5 = tracks.iloc[501:600,:].reset_index(drop=True)
tracks_6 = tracks.iloc[601:700,:].reset_index(drop=True)
tracks_7 = tracks.iloc[701:800,:].reset_index(drop=True)
tracks_8 = tracks.iloc[801:900,:].reset_index(drop=True)
tracks_9 = tracks.iloc[901:1000,:].reset_index(drop=True)
tracks_10 = tracks.iloc[1001:1100,:].reset_index(drop=True)
tracks_11 = tracks.iloc[1101:1200,:].reset_index(drop=True)
tracks_12 = tracks.iloc[1201:,:].reset_index(drop=True)

In [84]:
df_list = [
    tracks_6,
    tracks_7,
    tracks_8,
    tracks_9,
    tracks_10,
    tracks_11,
    tracks_12
]

In [85]:
i = 1
out_list = []
for df in df_list:
    print('Processing dataframe {}'.format(i))
    process_df(df, out_list)
    i+=1
    sleep(30)
    

Processing dataframe 1
ANALYSIS COMPLETE. Songs analyzed: 99; Elapsed Time: 716.330274105072.
Processing dataframe 2
ANALYSIS COMPLETE. Songs analyzed: 99; Elapsed Time: 740.5091807842255.
Processing dataframe 3
ANALYSIS COMPLETE. Songs analyzed: 99; Elapsed Time: 718.9493112564087.
Processing dataframe 4
ANALYSIS COMPLETE. Songs analyzed: 99; Elapsed Time: 693.933296918869.
Processing dataframe 5
ANALYSIS COMPLETE. Songs analyzed: 99; Elapsed Time: 695.2265057563782.
Processing dataframe 6
ANALYSIS COMPLETE. Songs analyzed: 99; Elapsed Time: 699.704715013504.
Processing dataframe 7
ANALYSIS COMPLETE. Songs analyzed: 187; Elapsed Time: 1354.2830379009247.


In [96]:
new_df2 = pd.concat(
    [out_list[0],
     out_list[1],
     out_list[2],
     out_list[3],
     out_list[4],
     out_list[5],
     out_list[6]
    ] , ignore_index=True)
new_df2.shape

(781, 19)

In [98]:
final_df = pd.concat([new_df, new_df2], ignore_index=True)
final_df.shape

(1277, 19)

In [99]:
with open('spotify_full_data.csv', 'w') as f:
        final_df.to_csv(f, header= True, index=False)

In [100]:
final_df.info

<bound method DataFrame.info of                           id popularity   duration loudness    tempo  \
0     67BtfxlNbhBmCDR2L2l8qd         93  137.87573   -6.682  178.818   
1     1lNEXDlxVhsWaq2DLBUDgC         73     167.12   -6.484   80.043   
2     5ScbulRnixQ2XAdvrPMFjz         81  229.30354    -6.77  177.693   
3     15hJmqqEtASVXl6sM7i4UF         75  270.67078    -5.62  130.027   
4     3RkNXZvOSMMElmmXztDc94         80  245.13103    -9.84   87.073   
...                      ...        ...        ...      ...      ...   
1272  44gRhRi2OhEf7moAUj6MD1         69  252.72858   -5.756   96.459   
1273  5TXQCMKN6TgemTL3c4wRTn         60  193.70232   -6.847  109.093   
1274  2D0dj3hVkRQJCp63cxCPEx         68      144.0   -6.429   99.954   
1275  1zmv0tPVWdbCuvBw90MYwO         52  174.09341   -6.452  179.095   
1276  3Nc86B5XmDlG1KuEeSLD8x         55  207.71007   -9.297  114.078   

     tempo_confidence time_signature time_signature_confidence key  \
0               0.113            

In [104]:
# check for duplicates
final_df.groupby(['id']).count()

Unnamed: 0_level_0,popularity,duration,loudness,tempo,tempo_confidence,time_signature,time_signature_confidence,key,key_confidence,mode,mode_confidence,acousticness,danceability,energy,instrumentalness,liveness,speechiness,valence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
00R0fEFZGb5hyTgF1nrRCq,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
01FvQEvHETjWqcDpQDJdTb,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
01QdEx6kFr78ZejhQtWR5m,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
0247StOpd3AkeBQzANX4Zf,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
02MWAaffLxlfxAUY7c5dvx,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7yq4Qj7cqayVTp3FF9CWbm,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
7z4xW9WY86uH3gd1V9pfCM,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
7zLMYtNJcabv4h4wBnjNQI,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
7zQirOExB0VR8yWUOqYeio,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [106]:
final_final_df = final_df.merge(tracks, left_on='id', right_on='Song ID', how='left')

In [110]:
final_final_df = final_final_df.rename(str.lower, axis='columns')

In [113]:
with open('spotify_full_data.csv', 'w') as f:
        final_final_df.to_csv(f, header= True, index=False)