#### Environment

In [None]:
import urllib.request
from bs4 import BeautifulSoup
import requests
import json
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import pandas as pd
import numpy as np
import time
import math
import string
import datetime

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
# NOTE(review): the Spotify client id and client secret are hard-coded below, so
# anyone who can read this notebook can use them. Consider rotating them and
# loading them from the environment instead (presumably supported by
# SpotifyClientCredentials - confirm against the spotipy documentation).
auth_manager = SpotifyClientCredentials('41f67e00313649c0962534596da52a38','2c885133946d4253b92f50e19bcd9308')
# Client object used by the sp.search / sp.audio_features calls in the cells below.
sp = spotipy.Spotify(auth_manager=auth_manager)

# PART 1 - DATASET CREATION

### Web scraping Billboard charts

We will use the __BeautifulSoup__ and __urllib.request__ libraries to collect the songs that appeared on the Billboard Year-End Hot 100 charts from 2010 to 2020, scraping them from Wikipedia.

In [None]:
# Scrape the Billboard Year-End Hot 100 table for every year from Wikipedia and
# stack the yearly tables into one DataFrame (one row per charted single).
# NOTE(review): the range starts at 2009 although the text above says 2010-2020 -
# confirm which start year is intended.
yearly_charts = []
for year in range(2009,2021):
    url = "https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_"+str(year) # set page to scrape (iterate with year)
    # The context manager closes the HTTP response once the page has been parsed.
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page, "lxml")
    chart = soup.find('table', class_='wikitable sortable') # find table of interest - top 100 chart
    if chart is None:
        # Report and skip instead of failing on chart.find_all below.
        print("No chart table found at", url)
        continue

    # Pull out (rank, title, artist) from every three-cell row of the chart.
    rows = []
    for row in chart.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) != 3:
            continue  # header rows and rows with another layout carry no single
        # The title cell normally wraps the song name in a link; fall back to the
        # plain cell text (without the surrounding quotes) when there is no link.
        link = cells[1].find('a')
        if link is not None:
            title = link.get_text()
        else:
            title = cells[1].get_text().strip().strip('"')
        rows.append((cells[0].find(string=True), title, cells[2].find(string=True)))

    df = pd.DataFrame(rows, columns=['Number', 'title', 'artist'])
    df['year'] = year
    print("Retrieving", url)
    print("Singles retrieved:", df.shape[0])
    yearly_charts.append(df)

# DataFrame.append was removed in pandas 2.0, so the yearly tables are
# concatenated once. ignore_index gives every row a unique label, which
# label-based lookups on `billboard` rely on.
billboard = pd.concat(yearly_charts, ignore_index=True) if yearly_charts else pd.DataFrame()
billboard.info()

In [None]:
# Save the scraped chart table as billboard.csv in the working directory.
billboard.to_csv(path_or_buf="billboard.csv", index=None)

### Get billboard songs from Spotify

Using the search function from the __Spotipy__ library, we will search for the billboard songs available on Spotify. Because there are multiple tracks corresponding to the same song, we will retrieve the track with top *popularity* (most played). *Popularity* is a Spotify engineered feature that accounts for the number of times a track is played.

In [None]:
def getFromSpotify(title, artist, year, limit=1):
    """Search Spotify for a charted song and return the matching tracks.

    The query is restricted to the given artist and track title and to
    releases from ``year - 1`` through ``year``.

    Args:
        title: Song title to search for.
        artist: Artist name to search for.
        year: Chart year; must support ``year - 1``.
        limit: Maximum number of tracks to request from the search.

    Returns:
        A list of dicts (keys ``name``, ``artist_name``, ``track_id``,
        ``track_number``, ``popularity``, ``release_date``) in the order the
        search returned them. The list is empty when nothing matched or the
        lookup failed.
    """
    query = 'artist:' + str(artist) + ' track:' + str(title) + ' year:' + str(year-1) + "-" + str(year)
    sp_list = []
    try:
        # A single request is enough: repeating the identical query (no offset)
        # only returns the same tracks again and duplicates the result rows.
        sInfo = sp.search(q=query, type='track', limit=limit)
        for a in sInfo['tracks']['items']:
            sp_list.append({
                'name': a['name'],
                'artist_name': a['artists'][0]['name'],
                'track_id': a['id'],
                'track_number': a['track_number'],
                'popularity': a['popularity'],
                'release_date': a['album']['release_date'],
            })
    except Exception as e:
        # Best-effort lookup: report the failure instead of hiding it, and let
        # the caller continue with whatever was collected so far.
        print("Spotify lookup failed for", title, "/", artist, "-", e)

    return sp_list

In [None]:
# Look up every charted song on Spotify and record the id of its best match.
# Rows are walked positionally with itertuples so the lookup does not depend on
# the row labels of `billboard`.
sp_list = []
best_track_ids = []
for row in billboard.itertuples(index=False):
    sp_list = getFromSpotify(row.title, row.artist, row.year)
    if sp_list:
        # Several tracks can correspond to one song: keep the most popular one.
        best_match = max(sp_list, key=lambda track: track["popularity"])
        best_track_ids.append(best_match["track_id"])
    else:
        best_track_ids.append(None)  # song not found on Spotify
print(sp_list)  # matches for the last song, as a quick sanity check

# Chart rows plus the Spotify id found for each of them (missing when not found).
df_bb_ids = billboard.reset_index(drop=True)
df_bb_ids["track_id"] = best_track_ids
billboard.info()
print("Songs not found:", df_bb_ids["track_id"].isnull().sum())

### Creating a dataset of random hits that have not been on any billboard charts (2010-2020)

Now we need a random sample of hits that have never been on a Billboard chart. We build it by pulling as large a sample of songs as possible from Spotify (the Spotify API has a limit of 2,000 songs per search); the __string__ library is used here to vary the search term. Duplicates are eliminated afterwards, resulting in a dataset of around 30,000 songs per year.

In [None]:
# OBSERVATION - This whole code takes a few hours to complete, and may be interrupted due to Spotipy authorization limitations.
# The code below represents the complete work implemented for the entire period of analysis (2010-2020).
# The upper bound is 2021 so that 2020 is included: the cell that stacks the
# yearly files reads spotify_2020.csv as well.
for year in range(2010, 2021):
    rows_list = []
    for letterdigit in string.printable:
        time.sleep(4)
        # Search once per printable character (a, b, c, ... 1, 2, 3 etc).
        # This was necessary because the Spotify API only returns a maximum of
        # 2,000 songs of top popularity per search. Collecting up to 2,000 songs
        # per character, and eliminating duplicates afterwards, was an attempt to
        # randomize the songs while also bringing in more than 2,000 of them.
        for count in range(40):
            # One page of 50 results for this character and year.
            sInfo = sp.search(q=letterdigit+' '+'year:"'+str(year)+'"',type='track',limit=50,offset=count*50)
            items = sInfo['tracks']['items']
            if not items:
                break  # ran past the last page; larger offsets would be empty too
            for a in items:
                # retrieving songs' identification data from Spotify
                rows_list.append({
                    'name': a['name'],
                    'artist_name': a['artists'][0]['name'],
                    'track_id': a['id'],
                    'track_number': a['track_number'],
                    'popularity': a['popularity'],
                    'release_date': a['album']['release_date'],
                })
        print(year, letterdigit, len(rows_list))
    # One de-duplicated file per year.
    df1 = pd.DataFrame(rows_list)
    df1.drop_duplicates(inplace=True)
    df1.to_csv("spotify_"+str(year)+".csv")

In [None]:
# Appending all hits to a single dataset
hit_columns = ['name','artist_name','track_id','track_number','popularity','release_date']
yearly_hits = [
    pd.read_csv("spotify_"+str(year)+".csv", usecols=hit_columns)
    for year in range(2010, 2021)
]
# pd.concat replaces the chained DataFrame.append calls (append was removed in
# pandas 2.0); rows that are identical across yearly files are dropped.
no_billboard = pd.concat(yearly_hits).drop_duplicates()#.to_csv("no_billboard.csv", index=None)

In [None]:
# pd.read_csv("no_billboard.csv")

### Get Audio features from Spotify API

The next step is importing the songs' features from Spotify. We will combine the billboard data with the non-billboard sample and search for the features. As the list of songs to be searched is very large, we use the __math__ library to divide the dataset into small parts that the function can search effectively.

In [None]:
#Combining the two datasets, with the label "billboard" (1-song is on billboard, 0-song is not on billboard)
billboard["billboard"]=1
no_billboard["billboard"]=0
# pd.concat stacks the two tables; DataFrame.append was removed in pandas 2.0.
all_ids = pd.concat([billboard, no_billboard])

In [None]:
# Chunking the ids: Spotipy can handle request of upto 100 track ids in one request so,
# to reduce the actual number of requests, we will break down the list of ids into sublist of 100 or smaller.
# The ids are taken from the track_id column: slicing the DataFrame itself would
# produce row blocks, and iterating a DataFrame yields column names, not ids.
track_ids = all_ids["track_id"].dropna().drop_duplicates().tolist()
# Rounding up covers the final partial chunk without ever adding an empty one.
chunks = math.ceil(len(track_ids)/100)
ids_sublists = [track_ids[x*100:(x+1)*100] for x in range(chunks)]

In [None]:
# OBSERVATION - Fetching the features takes a few hours to complete, and may be interrupted due to Spotipy authorization limitations.
# One request is made per chunk of ids; the results cover the entire period of analysis (2010-2020).
features = [sp.audio_features(id_chunks) for id_chunks in ids_sublists]

In [None]:
# If some songs didn't have a valid spotify ID or can't generate a valid response for any reason we need to filter them out.
# Flatten the per-chunk results into one list, then keep only real feature records.
feature_dicts = [item for sublist in features for item in sublist]
features_found = [f for f in feature_dicts if isinstance(f, dict)]
len(features_found)

In [None]:
# Build the feature table: one row per Spotify id (first occurrence wins),
# with the id column renamed to track_id so it can be joined to the song data.
features_df = pd.DataFrame(features_found)
features_df = features_df.drop_duplicates(subset='id', keep='first')
features_df = features_df.rename(columns={'id':'track_id'})
features_df.shape

In [None]:
# Join the song rows with their audio features on track_id (inner join: songs
# without a feature record are dropped).
# NOTE(review): in the code shown, `df` is the per-year table left over from the
# chart-scraping loop and has no track_id column - confirm which frame is meant.
data = df.merge(features_df, how='inner', on='track_id')

### Extract Release Date for songs from Spotify

Here, we are retrieving information about the song release date. The only release date available on Spotify is the album release date - this will be used as the track release date.

Note: The token needs to be re-initialized when the search is being done using: https://developer.spotify.com/documentation/web-api/reference/search/search/

In [None]:
# Fill data['release_date'] with the album release date fetched from each
# track's Spotify URL (track_href); rows whose lookup fails get None.
data['release_date'] = ""

# NOTE(review): hard-coded access token; the note above says it must be
# re-initialized before running this cell.
token = "BQAyQF2ioqRpzOnMEFbJvor_Cc8E-zxiltL17ZhNLSPajm53hkIMY-yGVij0lxnEGIakQVa6_7fmzMAamJHCxlurfWsY5QQ7ETDGlY5oqCDj4B7TKdsxkv7dHoXgU-lf-NrSHbrdl4vH-ubJ0WSKELtlVRA_mB5XYS3qTdRivdj8"
# The token goes in an Authorization header. Passing the dict as the second
# positional argument of requests.get would send it as query-string parameters.
headers = {'Authorization': 'Bearer ' + token}

for index, site in enumerate(data['track_href']):
    if data.at[index, 'release_date']=="":
        time.sleep(0.2)
        result = requests.get(site, headers=headers)
        try:
            # Keep the response in its own variable: rebinding `data` here would
            # replace the song table with the single-row response frame.
            track_info = pd.json_normalize(json.loads(result.content))
            data.at[index,'release_date'] = track_info['album.release_date'][0]
        except (ValueError, KeyError, IndexError):
            # Non-JSON body, or a response without album.release_date
            # (e.g. an error payload).
            data.at[index,'release_date'] = None

In [None]:
def is_date(string, format="%Y-%m-%d"):
    """Return True if *string* parses as a date in *format*, otherwise False.

    Values that are not strings (for example ``None`` or NaN, which appear when
    a release date could not be retrieved) are reported as "not a date" instead
    of raising.

    Args:
        string: The value to check.
        format: A ``datetime.strptime`` format string.

    Returns:
        bool: whether the value matches the format.
    """
    try:
        datetime.datetime.strptime(string, format)
    except (ValueError, TypeError):
        # ValueError: text does not match the format; TypeError: not a string.
        return False
    return True

In [None]:
# Split each full release date into year / month / day / ISO week columns.
# Rows whose release_date is not a complete YYYY-MM-DD date keep 0 in the
# month, day and week columns.
DATE_FORMAT = "%Y-%m-%d"

data['release_year'] = data['release_date'].str[:4]
data['release_month'] = 0
data['release_day'] = 0
data['release_week'] = 0

# .items() yields the real row labels, which is what .at expects.
for index, rd in data['release_date'].items():
    # Missing dates are stored as None, so check the type before parsing.
    if not (isinstance(rd, str) and is_date(rd, DATE_FORMAT)):
        continue
    # Parse with the explicit format string (a bare `format` here would be the
    # builtin function) and store integers, matching the columns' initial 0s.
    r_date = datetime.datetime.strptime(rd, DATE_FORMAT)
    data.at[index, 'release_month'] = r_date.month
    data.at[index, 'release_day'] = r_date.day
    data.at[index, 'release_week'] = r_date.isocalendar()[1]

In [None]:
# Keep an independent copy of the id and release-date columns.
# NOTE(review): `df_bb` is not defined in the code shown - confirm which frame
# this should read from.
date_columns = ['track_id','release_date','release_year','release_month','release_day', 'release_week']
data_dates = df_bb[date_columns].copy()

### Finding songs' musical genres

Another piece of information we retrieved from Spotify is the music genres of each song. Each song can have multiple genres. Once again, the genres are not defined by song/track, but by artist or album. Here, we are going to import the artist genres. 

In [None]:
# Deduplicating the artists names, to make the code faster
# sorted() returns the ordered list; list.sort() sorts in place and returns
# None, so assigning its result would leave `artists` set to None.
artists = sorted(str(x) for x in data['artist_name'].unique())

In [None]:
# This function requests Spotify for each artist's list of genres
def requestGenre(name):
    """Look up an artist on Spotify and return their genres as one string.

    Only search hits whose name equals *name* exactly are considered; the
    genres of the first such hit are joined with underscores. Any failure
    (request error, no exact match) is reported as "Not Found" and yields an
    empty genre string.

    Returns:
        dict with keys ``artist_name`` and ``genres``.
    """
    genres = ''
    try:
        candidates = sp.search(name, type='artist')['artists']['items']
        exact_hits = [candidate for candidate in candidates if candidate['name'] == name]
        genres = '_'.join(exact_hits[0]['genres'])
        print("Processed For - "+name)
    except Exception:
        print("Not Found - "+name)
    return {'artist_name':name,'genres':genres}

In [None]:
# OBSERVATION - Requesting the genres takes a few hours to complete, and may be interrupted due to Spotipy authorization limitations.
# One lookup is made per artist; the results cover the entire period of analysis (2010-2020).
genre_list = [requestGenre(artist) for artist in artists]
df_genre = pd.DataFrame(genre_list)
# df_genre.to_csv(folder+"genre_by_artist.csv")

In [None]:
# With genre data (per artist) and song data available, join them on the artist
# name and drop the two leftover "Unnamed: 0" index columns.
# NOTE(review): `df_reread` is not defined in the code shown - presumably the
# genre table re-read from disk; confirm.
merged_with_genre = df.merge(df_reread, how="inner", on="artist_name")
df_merged = merged_with_genre.drop(["Unnamed: 0_x","Unnamed: 0_y"], axis=1)

In [None]:
# Write the final table to the mounted Drive folder.
# NOTE(review): `df_merged_final` is not defined in the code shown - confirm
# which frame is meant to be saved.
output_path = "/content/drive/MyDrive/v2_with_genre.csv/v2_with_genre.csv"
df_merged_final.to_csv(output_path)