In [326]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import time
import sys
import re
import config
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [327]:
# Module-level Spotify client using the client-credentials auth manager.
# The client id and secret are read from the local `config` module so they are
# not hard-coded in the notebook. `get_audio_features` below relies on this `sp`.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id=config.client_id, client_secret=config.client_secret))


In [185]:
# Load the "not hot" songs dataset produced by an earlier lab (the path is
# relative to this notebook's working directory) and preview the first rows.
not_hot_100_songs = pd.read_csv("../../../Day_1/Afternoon/lab-not-hot-songs/not_hot_songs_db.csv")
not_hot_100_songs.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,track_href,analysis_url,duration_ms,time_signature,genre,song_name,title,song_key,song_ids,artist_name
0,0.749,0.802,10,-5.78,0,0.202,0.105,0.0,0.296,0.47,...,https://api.spotify.com/v1/tracks/5q62CK2DoFxd...,https://api.spotify.com/v1/audio-analysis/5q62...,174357,4,Underground Rap,big tyma,,big tyma_alkaline,5q62CK2DoFxdFBzFDCctDE,Alkaline
1,0.464,0.602,7,-8.185,1,0.125,0.86,0.0,0.162,0.664,...,https://api.spotify.com/v1/tracks/1npAOCsqbB61...,https://api.spotify.com/v1/audio-analysis/1npA...,162373,4,Underground Rap,mozambique drill,,mozambique drill_mach-hommy,1npAOCsqbB618Zz4jOQxZL,Mach-Hommy
2,0.577,0.692,11,-5.96,1,0.64,0.372,0.0596,0.0989,0.0658,...,https://api.spotify.com/v1/tracks/5zjdL4aLh3D3...,https://api.spotify.com/v1/audio-analysis/5zjd...,122483,4,Underground Rap,2000 rounds,,2000 rounds_ghostemane,5zjdL4aLh3D3wOKeRkZSL6,Ghostemane
3,0.642,0.694,11,-6.104,1,0.364,0.000335,0.0,0.312,0.568,...,https://api.spotify.com/v1/tracks/2TF0kVvksniU...,https://api.spotify.com/v1/audio-analysis/2TF0...,143331,4,Underground Rap,85 to africa,,85 to africa_jidenna,2TF0kVvksniUGEdwnBSw2v,Jidenna
4,0.789,0.488,4,-9.093,0,0.227,0.179,8e-06,0.104,0.0644,...,https://api.spotify.com/v1/tracks/23NWj2izXAJ4...,https://api.spotify.com/v1/audio-analysis/23NW...,209453,4,Underground Rap,pop style,,pop style_drake,23NWj2izXAJ4yL6Nah73wf,Drake


In [186]:
def scrape_hot100(file_path="./top_100_songs.csv"):
    """
    Scrape the current Billboard Hot 100 chart and save it to a CSV file.

    Inputs: file path to save csv of top 100 songs to
    Outputs: None. A CSV with the columns Title, Artist and Rank is written to
    `file_path`; the HTTP status code and "success" are printed along the way.
    Raises:
        requests.HTTPError: if Billboard answers with an error status code.
        ValueError: if the chart container cannot be found, or the number of
            scraped titles and artists differ (the page layout probably changed).
    Function: This function will scrape the current top 100 songs from the billboards
    with artist, rank and title and save them to a CSV at `file_path`.
    """
    # Define URL
    url = "https://www.billboard.com/charts/hot-100"

    # Sending request to url; the timeout keeps the cell from hanging forever
    # if the site never answers.
    response = requests.get(url, timeout=30)

    # Report the status code, then fail loudly on 4xx/5xx instead of trying to
    # parse an error page.
    print(response.status_code)
    response.raise_for_status()

    # Defining soup after parsing for html
    soup = BeautifulSoup(response.text, 'html.parser')

    # Saving soup of top 100 songs table
    table = soup.find("div", {"class": "chart-results-list"})
    if table is None:
        raise ValueError(
            "Could not find the 'chart-results-list' container on " + url
            + "; the page layout may have changed.")

    def _clean(text):
        # Same cleaning as before: drop the newlines/tabs Billboard pads text with.
        return text.replace('\n', '').replace('\t', '')

    # Retrieving name of songs
    titles = [_clean(elem.get_text())
              for elem in table.select("div ul li ul li h3")]

    # Retrieving artists.
    # NOTE(review): 'a-no-trucate' is left exactly as it was; presumably it
    # mirrors the class name used in Billboard's markup, so do not "fix" the
    # spelling without checking the live page.
    artists = [_clean(elem.get_text())
               for elem in table.select("div ul li ul li span.a-no-trucate")]

    if not titles or len(titles) != len(artists):
        raise ValueError(
            "Scraped " + str(len(titles)) + " titles and " + str(len(artists))
            + " artists; expected matching, non-empty lists.")

    # Ranks follow page order; derive them from what was actually scraped
    # rather than assuming exactly 100 entries.
    ranks = list(range(1, len(titles) + 1))

    df = pd.DataFrame({"Title": titles, "Artist": artists, "Rank": ranks})

    df.to_csv(file_path, index=False)

    print("success")

In [187]:
# Scrape the chart to the default ./top_100_songs.csv, then load that file back
# into a dataframe for the rest of the notebook.
scrape_hot100()
hot_100_songs_df = pd.read_csv("top_100_songs.csv")

200
success


In [188]:
# Preview the first rows of the scraped chart (Title, Artist, Rank).
hot_100_songs_df.head()

Unnamed: 0,Title,Artist,Rank
0,Lovin On Me,Jack Harlow,1
1,Cruel Summer,Taylor Swift,2
2,Greedy,Tate McRae,3
3,Paint The Town Red,Doja Cat,4
4,I Remember Everything,Zach Bryan Featuring Kacey Musgraves,5


### Creating a function for searching for individual songs

In [274]:
def search_song(title: str, artist: str=None, limit: int=1):
    """
    Search Spotify for a song and return the matching track names and ids.

    Inputs: User input for song title, with artist and limit as optional inputs.
        title: song title to search for.
        artist: optional artist name; when given it is appended to the title
            to form the search query.
        limit: maximum number of matches to return (default 1).
    Outputs: dataframe with the columns "song_name" and "id", one row per
        match. When Spotify returns no tracks at all, "Song not found" is
        printed and a single placeholder row (' ', ' ') is returned so callers
        still get exactly one row per query.
    Function: Take user input for song, artist and limit (last two are optional)
    and provide song IDs from spotify endpoint for future use to get further
    information on specific songs.
    """
    # Initiating authorization with spotify
    sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
        client_id=config.client_id, client_secret=config.client_secret))

    # Query by title alone, or by "title artist" when an artist is supplied.
    query = title if artist is None else title + ' ' + artist

    # Forward the limit to the API so that limits above the default page size
    # are honoured; the request size is clamped to the 1..50 range that the
    # search call accepts.
    resp = sp.search(q=query, limit=min(max(limit, 1), 50))
    items = resp['tracks']['items']

    if len(items) < 1:
        print("Song not found")
        return pd.DataFrame({"song_name": [' '], "id": [' ']})

    # Slicing already copes with fewer results than `limit`, so no separate
    # branch is needed for that case.
    matches = items[:limit]

    # Creating dataframe with song names and ids to return from the function
    id_df = pd.DataFrame({
        "song_name": [track['name'] for track in matches],
        "id": [track['id'] for track in matches],
    })
    return id_df

### Creating a function for bulk searching: it runs the previous function over a dataframe of song titles and artists to collect their Spotify song ids.

In [310]:
# Work on a copy so the scraped chart dataframe itself is left untouched.
df = hot_100_songs_df.copy()

def bulk_song_search(df:pd.DataFrame, song_col: str, artist_col: str):
    """
    Look up the Spotify song id for every row of a dataframe of songs.

    Inputs: dataframe with songs to search for, with a column for song name and artist
        df: dataframe holding the songs.
        song_col: name of the column with the song titles.
        artist_col: name of the column with the artist names.
    Outputs: dataframe with the columns "song_name" (title returned by
        Spotify), "id" (Spotify song id) and "original" (title that was
        queried), with a fresh 0..n-1 index.
    Function: use search_song function to query a list of songs in order to get
    their spotify song id. Progress is printed as "<row number>/<total rows>"
    and the loop pauses one second between queries.
    """
    total = len(df)
    results = []

    # Iterate by position (not by index label) so the function also works on
    # dataframes whose index is not 0..n-1, e.g. after filtering.
    for i, (song_name, artist) in enumerate(zip(df[song_col], df[artist_col])):
        # A missing artist would break the query string concatenation; fall
        # back to a title-only search instead.
        if pd.isna(artist):
            artist = None
        df_hot_song = search_song(title=song_name, artist=artist, limit=1)

        # Attach the queried title to its own result rows right away so the
        # pairing never depends on row positions lining up afterwards.
        results.append(df_hot_song.assign(original=song_name))

        print(str(i) + "/" + str(total))
        # Pause between requests to stay gentle on the API.
        time.sleep(1)

    if not results:
        return pd.DataFrame(columns=["song_name", "id", "original"])

    # Concatenate once at the end instead of growing a dataframe inside the loop.
    df_ids = pd.concat(results, ignore_index=True)
    return df_ids

# Look up a Spotify id for every chart entry. Each song costs one search plus
# a one-second pause, so this cell takes a while to run.
df_ids_hot = bulk_song_search(df, 'Title', 'Artist')

0/100
1/100
2/100
3/100
4/100
5/100
6/100
7/100
8/100
9/100
10/100
11/100
12/100
13/100
14/100
15/100
16/100
17/100
18/100
19/100
20/100
21/100
22/100
23/100
24/100
25/100
26/100
27/100
28/100
29/100
30/100
31/100
32/100
33/100
34/100
35/100
36/100
37/100
38/100
39/100
40/100
41/100
42/100
43/100
44/100
45/100
46/100
47/100
48/100
49/100
50/100
51/100
52/100
53/100
54/100
55/100
56/100
57/100
58/100
59/100
60/100
61/100
62/100
63/100
64/100
65/100
66/100
67/100
68/100
69/100
70/100
71/100
72/100
73/100
74/100
75/100
76/100
77/100
78/100
79/100
80/100
81/100
82/100
83/100
84/100
85/100
86/100
87/100
88/100
89/100
90/100
91/100
92/100
93/100
94/100
95/100
96/100
97/100
98/100
99/100


In [312]:
# Preview: Spotify's track name and id next to the title that was searched for.
df_ids_hot.head()

Unnamed: 0,song_name,id,original
0,Lovin On Me,4xhsWYTOGcal8zt0J161CU,Lovin On Me
1,Cruel Summer,1BxfuPKGuaTgP7aM0Bbdwr,Cruel Summer
2,greedy,3rUGC1vUpkDG9CZFHMur1t,Greedy
3,Paint The Town Red,2IGMVunIBsBLtEQyoI1Mu7,Paint The Town Red
4,I Remember Everything (feat. Kacey Musgraves),4KULAymBBJcPRpk1yO4dOG,I Remember Everything


In [315]:
def get_audio_features(df:pd.DataFrame, col:str, merge_col:str, chunk_size:int=50):
    """
    Fetch Spotify audio features for the song ids in a dataframe.

    Inputs: Dataframe with spotify song ids in one of the columns, and the name of that column
        df: dataframe containing the song ids.
        col: name of the column in `df` that holds the song ids.
        merge_col: column used to join the audio features back onto `df`
            (it must exist in both `df` and the audio-features result).
        chunk_size: number of ids sent per request (default 50).
    Outputs: dataframe with the audio-feature columns for each song id, inner
        joined with the columns of `df`.
    Raises: ValueError if `chunk_size` is smaller than 1.
    Function: Query the spotify audio_features endpoint with chunked lists of
    song IDs (via the module-level `sp` client) in order to get the audio
    features of every song.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    # Unique ids in first-seen order. Missing values and blank placeholders
    # (such as the ' ' that search_song returns for songs it could not find)
    # are skipped so they are never sent to the API, and de-duplicating keeps
    # the merge below from multiplying rows when an id appears twice.
    ids = [
        track_id
        for track_id in df[col].dropna().drop_duplicates()
        if str(track_id).strip()
    ]

    records = []
    # Plain list slicing yields as many chunks as needed (and none for an empty
    # input), unlike a single split point that always produces two pieces.
    for start in range(0, len(ids), chunk_size):
        chunk = ids[start:start + chunk_size]
        response = sp.audio_features(tracks=chunk)
        # Drop None entries (ids without audio features) before normalising;
        # pd.json_normalize cannot handle them.
        records.extend(entry for entry in (response or []) if entry is not None)

    if records:
        features = pd.json_normalize(records)
    else:
        # Nothing to look up: keep the merge valid and return an empty result.
        features = pd.DataFrame(columns=[merge_col])

    return features.merge(df, on=merge_col, how='inner')

In [316]:
# Fetch the audio features for the ids found above (joined on the 'id' column)
# and preview the result.
audio_features_df = get_audio_features(df_ids_hot, 'id', 'id')
audio_features_df.head()

  return bound(*args, **kwds)


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,song_name,original
0,0.943,0.558,2,-4.911,1,0.0568,0.0026,2e-06,0.0937,0.606,104.983,audio_features,4xhsWYTOGcal8zt0J161CU,spotify:track:4xhsWYTOGcal8zt0J161CU,https://api.spotify.com/v1/tracks/4xhsWYTOGcal...,https://api.spotify.com/v1/audio-analysis/4xhs...,138411,4,Lovin On Me,Lovin On Me
1,0.552,0.702,9,-5.707,1,0.157,0.117,2.1e-05,0.105,0.564,169.994,audio_features,1BxfuPKGuaTgP7aM0Bbdwr,spotify:track:1BxfuPKGuaTgP7aM0Bbdwr,https://api.spotify.com/v1/tracks/1BxfuPKGuaTg...,https://api.spotify.com/v1/audio-analysis/1Bxf...,178427,4,Cruel Summer,Cruel Summer
2,0.75,0.733,6,-3.18,0,0.0319,0.256,0.0,0.114,0.844,111.018,audio_features,3rUGC1vUpkDG9CZFHMur1t,spotify:track:3rUGC1vUpkDG9CZFHMur1t,https://api.spotify.com/v1/tracks/3rUGC1vUpkDG...,https://api.spotify.com/v1/audio-analysis/3rUG...,131872,1,greedy,Greedy
3,0.868,0.538,5,-8.603,1,0.174,0.269,3e-06,0.0901,0.732,99.968,audio_features,2IGMVunIBsBLtEQyoI1Mu7,spotify:track:2IGMVunIBsBLtEQyoI1Mu7,https://api.spotify.com/v1/tracks/2IGMVunIBsBL...,https://api.spotify.com/v1/audio-analysis/2IGM...,231750,4,Paint The Town Red,Paint The Town Red
4,0.429,0.453,0,-7.746,1,0.0459,0.554,2e-06,0.102,0.155,77.639,audio_features,4KULAymBBJcPRpk1yO4dOG,spotify:track:4KULAymBBJcPRpk1yO4dOG,https://api.spotify.com/v1/tracks/4KULAymBBJcP...,https://api.spotify.com/v1/audio-analysis/4KUL...,227196,4,I Remember Everything (feat. Kacey Musgraves),I Remember Everything


In [317]:
def add_audio_features(df:pd.DataFrame, 
                       audio_features_df:pd.DataFrame, left_on:str,
                      right_on:str):
    """
    Attach audio features to a dataframe of songs.

    Inputs:
        df: dataframe with the songs (names, titles and any other columns).
        audio_features_df: dataframe with the audio features of each song.
        left_on: key column in `df`.
        right_on: key column in `audio_features_df`.
    Outputs: a new dataframe holding the columns of `df` followed by the
        columns of `audio_features_df`, restricted to rows whose keys match in
        both dataframes (inner join).
    Function: Join the two dataframes on the given key columns so every song is
    paired with its audio features.
    """
    join_options = {"how": "inner", "left_on": left_on, "right_on": right_on}
    combined = pd.merge(df, audio_features_df, **join_options)
    return combined

In [322]:
# Join the chart with the audio features on the song title: 'original' holds
# the title that was searched for, so it lines up with the chart's 'Title'.
# NOTE(review): two chart entries sharing a title would be cross-matched by
# this join; confirm titles are unique or join on title and artist.
merged_hot_songs_df = add_audio_features(hot_100_songs_df, audio_features_df, 
                   'Title', 'original')

In [323]:
# Preview the chart rows together with their audio features.
merged_hot_songs_df.head()

Unnamed: 0,Title,Artist,Rank,danceability,energy,key,loudness,mode,speechiness,acousticness,...,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,song_name,original
0,Lovin On Me,Jack Harlow,1,0.943,0.558,2,-4.911,1,0.0568,0.0026,...,104.983,audio_features,4xhsWYTOGcal8zt0J161CU,spotify:track:4xhsWYTOGcal8zt0J161CU,https://api.spotify.com/v1/tracks/4xhsWYTOGcal...,https://api.spotify.com/v1/audio-analysis/4xhs...,138411,4,Lovin On Me,Lovin On Me
1,Cruel Summer,Taylor Swift,2,0.552,0.702,9,-5.707,1,0.157,0.117,...,169.994,audio_features,1BxfuPKGuaTgP7aM0Bbdwr,spotify:track:1BxfuPKGuaTgP7aM0Bbdwr,https://api.spotify.com/v1/tracks/1BxfuPKGuaTg...,https://api.spotify.com/v1/audio-analysis/1Bxf...,178427,4,Cruel Summer,Cruel Summer
2,Greedy,Tate McRae,3,0.75,0.733,6,-3.18,0,0.0319,0.256,...,111.018,audio_features,3rUGC1vUpkDG9CZFHMur1t,spotify:track:3rUGC1vUpkDG9CZFHMur1t,https://api.spotify.com/v1/tracks/3rUGC1vUpkDG...,https://api.spotify.com/v1/audio-analysis/3rUG...,131872,1,greedy,Greedy
3,Paint The Town Red,Doja Cat,4,0.868,0.538,5,-8.603,1,0.174,0.269,...,99.968,audio_features,2IGMVunIBsBLtEQyoI1Mu7,spotify:track:2IGMVunIBsBLtEQyoI1Mu7,https://api.spotify.com/v1/tracks/2IGMVunIBsBL...,https://api.spotify.com/v1/audio-analysis/2IGM...,231750,4,Paint The Town Red,Paint The Town Red
4,I Remember Everything,Zach Bryan Featuring Kacey Musgraves,5,0.429,0.453,0,-7.746,1,0.0459,0.554,...,77.639,audio_features,4KULAymBBJcPRpk1yO4dOG,spotify:track:4KULAymBBJcPRpk1yO4dOG,https://api.spotify.com/v1/tracks/4KULAymBBJcP...,https://api.spotify.com/v1/audio-analysis/4KUL...,227196,4,I Remember Everything (feat. Kacey Musgraves),I Remember Everything


In [325]:
# Save the merged table.
# NOTE(review): this is the same ./top_100_songs.csv that scrape_hot100()
# writes by default, so the raw scrape is replaced by the merged table.
# Confirm that is intended.
merged_hot_songs_df.to_csv("./top_100_songs.csv", index=False)