In [18]:
import sys
import os
from bs4 import BeautifulSoup
import requests

import numpy as np
import pandas as pd
import datetime as dt

Create a function to scrape the Billboards 100 HOT songs and create a local dataframe of songs with them including:

1. Song’s name
2. Song’s artist


In [185]:
def create_artist_input_prompt(artist_names):
    user_prompt = ""
    valid_input = []
    
    for i in range(len(artist_names)):
        my_num = i
        user_prompt += f"Enter {my_num} for '{artist_names[i]}'\n"
        valid_input.append(str(my_num))
    ## space is always none of the above
    user_prompt += "Enter [SPACE] for 'None of the above'\n"
    valid_input.append(" ")
    
    return valid_input, user_prompt

## Define general control flow logic
```
Load dataframe of songs

User inputs song name (not artist / band name)
Is it currently hot?
    Yes: recommend another hot song
    No: print "no recommendation"

is it hot? (takes string as input)
    ret_song_records = matching songs in dataframe created from popular-songs/*.csv files
    
    return ret_song_records

    if multiple artists, get clarification on artist name
        include "none of the above" as option for user + list of artist names from our search

recommend another song:
    return random song from popular-songs dataframe
```

In [170]:
def get_song_records(song_title, df):
    return df[df['title'] == song_title]

In [183]:
def create_normalized_df_from_csvs(filelist, isHot=False):
    '''
    NOTE: Also converts col contents to lower case
    input: a list of csv files
    output: a single dataframe containing the data from all the input files
    '''
    dfs = []
    for fname in filelist:
        dfs.append(pd.read_csv(fname, index_col=None))
        dfs[-1].apply(lambda x: x.astype(str).str.lower())
        dfs[-1]['hot'] = isHot
    
    # now create a single data frame with duplicate rows dropped
    norm_df = pd.concat(dfs, axis=0)
    norm_df = norm_df.drop_duplicates().reset_index(drop=True)
    
    return norm_df
    

In [120]:
def recommend_hot_song(hot_songs_df, not_this_title):
    myrec = hot_songs_df.sample()
    
    if (myrec.title == not_this_title).all():
        print("DEBUG: try again")
        myrec = hot_songs_df.sample()
    return myrec

In [93]:
# infiles = ['Data/timeout_50-best-sad-songs.csv', 'Data/timeout_the-50-best-80s-songs.csv']

infiles = ['Data/Hot/uk_hot100_2021-02-14.csv']
## result is a single dataframe containing the concatenation of all three input files
df1 = create_normalized_df_from_csvs(infiles)

In [123]:
rr = df1.sample()
rr

Unnamed: 0,artist,title
23,tate mcrae,she's all i wanna be


In [114]:
rr.title.astype(str) == 'wyd now'

82    True
Name: title, dtype: bool

In [132]:
print(f"{rr.title.values[0]}")

she's all i wanna be


In [197]:
## main

## Load hot songs df
hot_songs_infiles = ['Data/Hot/uk_hot100_2021-02-14.csv', 'Data/Hot/billboard_hot100_2021-02-14.csv']
hot_songs_df = create_normalized_df_from_csvs(hot_songs_infiles, isHot=True)

## Load our general songs df
gen_songs_infiles = [ 'Data/timeout_best-party-songs.csv', 'Data/timeout_50-best-sad-songs.csv', 
                     'Data/timeout_the-50-best-80s-songs.csv']
gen_songs_df = create_normalized_df_from_csvs(gen_songs_infiles)

## Combine all songs
all_songs_df = pd.concat([hot_songs_df, gen_songs_df], axis=0)
## default is keep first, so if a song is in both, we keep the hot version
all_songs_df = all_songs_df.drop_duplicates().reset_index(drop=True)

## ---- ready for user query ----
userinput_songname = input("enter song name: ")
song_matches_df = get_song_records(userinput_songname, all_songs_df)

if song_matches_df.shape[0] == 0:
    # this song is not in our db
    print("No recommendation")
elif song_matches_df.shape[0] == 1:
    # we have exactly one record with the requested title
    if song_matches_df.hot.all():
        randhot = recommend_hot_song(hot_songs_df, userinput_songname)
        print(f"I recommend: {randhot.title.values[0]} by {randhot.artist.values[0]}")
    else:
        print("I will recommend something similar...")
else:
    print("Need to clarify which song, by artist")
    ## get artist clarification
    artist_names = song_matches_df.artist.values
    valid_input, user_prompt = create_artist_input_prompt(artist_names)
    
    # continue prompting until they've entered a valid choice
    while True:
        artist_num = input(user_prompt)
        if artist_num not in valid_input:
            print(f"{artist_num} is not a valid option, try again")
        else:
            break
    
    if artist_num == ' ':  ## none of the above
        print("No recommendation")
    else:
        selected_song = song_matches_df.iloc[pd.to_numeric(artist_num)]
        # do same as single option case above
        if selected_song.hot.all():
            randhot = recommend_hot_song(hot_songs_df, selected_song.title)
            print(f"I recommend HOT song: {randhot.title} by {randhot.artist}")
        else:
            print(f"I will recommend something SIMILAR to {selected_song.title.values[0]} by {selected_song.artist.values[0]}")


enter song name: we don't talk about bruno
Need to clarify which song, by artist
Enter 0 for 'gaitan/castillo/adassa/feliz'
Enter 1 for 'carolina gaitan, mauro castillo, adassa, rhenzy feliz, diane guerrero, stephanie beatriz & encanto cast'
Enter [SPACE] for 'None of the above'
1
I recommend HOT song: 165    shotta flow 6
Name: title, dtype: object by 165    nle choppa
Name: artist, dtype: object


In [198]:
hot_songs_df.head(20)
# gen_songs_df.head()
# big_df = pd.concat([hot_songs_df, gen_songs_df], axis=0)
# print(f"{hot_songs_df.shape[0]} {gen_songs_df.shape[0]} {all_songs_df.shape[0]}")

## default is keep first
# big_df.drop_duplicates().shape # (285, 3)

Unnamed: 0,artist,title,hot
0,gaitan/castillo/adassa/feliz,we don't talk about bruno,True
1,fireboy dml & ed sheeran,peru,True
2,jessica darrow,surface pressure,True
3,gayle,abcdefu,True
4,lost frequencies/calum scott,where are you now,True
5,arrdee & aitch,war,True
6,sam fender,seventeen going under,True
7,belters only ft jazzy,make me feel good,True
8,stephanie beatriz/olga merediz,the family madrigal,True
9,d-block europe ft central cee,overseas,True


In [194]:
hot_songs_df.iloc[1]

artist    fireboy dml & ed sheeran
title                         peru
hot                           True
Name: 1, dtype: object

In [176]:
one_rec = all_songs_df.sample()
one_rec

Unnamed: 0,artist,title,hot
5,OutKast,Hey Ya!,False


In [177]:
# if one_rec.hot.all():
    print('hot')
else:
    print('NOT')

NOT


In [17]:
def make_song_df(song_artist_list, song_title_list):
    return pd.DataFrame({'artist': song_artist_list, 'title': song_title_list})

In [19]:
def save_csv(df, fname, isHot=False):
    dname = 'Data'
    if isHot:
        # todo: change outfilename to include relevant date
        dname = os.path.join(dname,'Hot')
    outfilename = os.path.join(dname,f"{fname}.csv")
    
    df.to_csv(outfilename, index=False)
    print(f'Saved file: {outfilename}')
    return

In [52]:
def get_timeout_best_party_songs():
    url = 'https://www.timeout.com/music/best-party-songs'
    song_list_name = 'timeout-best_party_songs'
    outdir = 'Data'

    ## Download html with a get request 
    response = requests.get(url)

    ## TODO: improve error checking and throw error here instead of just returning the code
    if response.status_code != 200:
        print(f"Error! HTTP Request returned status code: {response.status_code}")

    soup = BeautifulSoup(response.content, "html.parser")

    ## Get the song artist
    song_titles = []
    song_artists = []
    for i in range(len(song_by_artists)):
        try:
            title_i, artist_i = song_by_artists[i].get_text().split("’ by ")
            # drop leading num. and leading quotemark
            title_i = ".".join(title_i.split(".")[1:])[2:]
        except ValueError:
            # skip this entry if we're having trouble parsing
            continue
        song_titles.append(title_i)
        song_artists.append(artist_i)
    
    song_df = make_song_df(song_artist_list=song_artists, song_title_list=song_titles)
    save_csv(song_df, song_list_name, isHot=False)
    
    return song_df

In [53]:
to_bps = get_timeout_best_party_songs()

Saved file: Data/timeout-best_party_songs.csv


In [55]:
to_bps.shape

(97, 2)

In [78]:
def get_timeout_songlist(song_list_name):
    url = f'https://www.timeout.com/music/{song_list_name}'
    outfile_name = f'timeout_{song_list_name}'

    ## Download html with a get request
    print(f'Getting from url: {url}')
    response = requests.get(url)

    ## TODO: improve error checking and throw error here instead of just returning the code
    if response.status_code != 200:
        print(f"Error! HTTP Request returned status code: {response.status_code}")

    soup = BeautifulSoup(response.content, "html.parser")
    
    # select song by artist string
    song_by_artists = soup.select("div._title_tpquo_9 > h3")

    ## Get the song artist
    song_titles = []
    song_artists = []
    for i in range(len(song_by_artists)):
        try:
            title_i, artist_i = song_by_artists[i].get_text().split("’ by ")
            # drop leading num. and leading quotemark
            title_i = ".".join(title_i.split(".")[1:])[2:]
        except ValueError:
            try:
                title_i, artist_i = song_by_artists[i].get_text().split('” by ') ## check for double quotes
                # drop leading num. and leading quotemark
                title_i = ".".join(title_i.split(".")[1:])[2:]
            except ValueError:
                # skip this entry if we're having trouble parsing
                print(f"skipping: {song_by_artists[i].get_text()}")
                continue
        song_titles.append(title_i)
        song_artists.append(artist_i)
    
    song_df = make_song_df(song_artist_list=song_artists, song_title_list=song_titles)
    save_csv(song_df, outfile_name)
    
    return song_df

In [69]:
df = get_timeout_songlist("best-pop-songs-of-all-time")
df.head(3)

Getting from url: {url}
Saved file: Data/timeout_best-pop-songs-of-all-time.csv


Unnamed: 0,artist,title
0,Beyoncé,Single Ladies (Put a Ring on It)
1,Rihanna featuring Jay-Z,Umbrella
2,Taylor Swift,Shake it Off


In [70]:
df = get_timeout_songlist("best-90s-songs")
df.head(3)

Getting from url: {url}
skipping: 10. ‘Waterfalls’  by TLC
	split: ['10.\xa0‘Waterfalls’  by TLC']
Saved file: Data/timeout_best-90s-songs.csv


Unnamed: 0,artist,title
0,Nirvana,Smells Like Teen Spirit
1,The Notorious BIG,Juicy
2,Daft Punk,Da Funk


In [71]:
df = get_timeout_songlist("the-50-best-80s-songs")
df.head(3)

Getting from url: {url}
Saved file: Data/timeout_the-50-best-80s-songs.csv


Unnamed: 0,artist,title
0,Prince,Purple Rain
1,Michael Jackson,Beat It
2,Whitney Houston,I Wanna Dance with Somebody


In [74]:
df = get_timeout_songlist("the-50-best-karaoke-songs-ever")
df.head(3)

Getting from url: https://www.timeout.com/music/the-50-best-karaoke-songs-ever
skipping: 3. ‘I Want It That Way’  by the Backstreet Boys
	split: ['3.\xa0‘I Want It That Way’  by the Backstreet Boys']
skipping: 7. ‘Since U Been Gone’  by Kelly Clarkson
	split: ['7.\xa0‘Since U Been Gone’  by Kelly Clarkson']
skipping: 34. ‘Say My Name’ Destiny’s Child
	split: ['34.\xa0‘Say My Name’ Destiny’s Child']
Saved file: Data/timeout_the-50-best-karaoke-songs-ever.csv


Unnamed: 0,artist,title
0,Prince,Purple Rain
1,the Ronettes,Be My Baby
2,Bruce Springsteen,Born to Run


In [79]:
##...added parsing of double quotes to handle this one
df = get_timeout_songlist("best-gay-songs")
df.head(3)

Getting from url: https://www.timeout.com/music/best-gay-songs
skipping: 3. ‘Montero (Call Me By Your Name)’ By Lil Nas X
skipping: 50. "You Need to Calm Down" by Taylor Swift
Saved file: Data/timeout_best-gay-songs.csv


Unnamed: 0,artist,title
0,Gloria Gaynor,I Will Survive
1,George Michael,Freedom! ’90
2,Madonna,Vogue


In [80]:
df = get_timeout_songlist("best-happy-songs")
df.head(3)

Getting from url: https://www.timeout.com/music/best-happy-songs
skipping: 3. ‘Don’t Stop Me Now‘ by Queen
Saved file: Data/timeout_best-happy-songs.csv


Unnamed: 0,artist,title
0,Prince,Let’s Go Crazy
1,James Brown & The Famous Flames,I Got You (I Feel Good)
2,Lizzo,Good as Hell


In [81]:
df = get_timeout_songlist("50-best-sad-songs")
df.head(3)

Getting from url: https://www.timeout.com/music/50-best-sad-songs
skipping: 11. ‘Strange Fruit‘ by Billie Holiday
skipping: 41. ‘La Ritournelle’ (Instrumental Mix) by Sebastien Tellier
Saved file: Data/timeout_50-best-sad-songs.csv


Unnamed: 0,artist,title
0,Sinéad O’Connor,Nothing Compares 2 U
1,Johnny Cash,Hurt
2,Neil Young,Only Love Can Break Your Heart


In [82]:
df = get_timeout_songlist("50-best-breakup-songs")
df.head(3)

Getting from url: https://www.timeout.com/music/50-best-breakup-songs
skipping: 7. ‘All Too Well (10-Minute Version) by Taylor Swift
Saved file: Data/timeout_50-best-breakup-songs.csv


Unnamed: 0,artist,title
0,Etta James,I’d Rather Go Blind
1,Elliott Smith,Somebody That I Used to Know
2,Whitney Houston,I Will Always Love You


In [118]:
def get_hot_100():
    bb_url = 'https://www.billboard.com/charts/hot-100/'

    # download html with a get request 
    response = requests.get(bb_url)
    
    ## TODO: improve error checking and throw error here instead of just returning the code
    if response.status_code != 200:
        print(f"Error! HTTP Request returned status code: {response.status_code}")
        return
    
    # if we get this far, all's well
    soup = BeautifulSoup(response.content, "html.parser")
    
    ## Get song title
    selected_titles = soup.select("li.o-chart-results-list__item > h3.c-title")

    song_titles = []
    for i in range(len(selected_titles)):
        song_titles.append(selected_titles[i].get_text().strip().lower())
        
    ## Get the song artist
    selected_artists = soup.select("li.lrv-u-width-100p > ul > li.o-chart-results-list__item.\/\/.lrv-u-flex-grow-1.lrv-u-flex.lrv-u-flex-direction-column.lrv-u-justify-content-center.lrv-u-border-b-1.u-border-b-0\@mobile-max.lrv-u-border-color-grey-light.lrv-u-padding-l-1\@mobile-max > span")

    song_artists = []
    for i in range(len(selected_artists)):
        song_artists.append(selected_artists[i].get_text().strip().lower())
         
    return pd.DataFrame({'artist': song_artists, 'title': song_titles})
    

In [125]:
hot100_songs = get_hot_100()
hot100_songs.head(20)

Unnamed: 0,artist,title
0,"carolina gaitan, mauro castillo, adassa, rhenz...",we don't talk about bruno
1,adele,easy on me
2,glass animals,heat waves
3,the kid laroi & justin bieber,stay
4,kodak black,super gremlin
5,gayle,abcdefu
6,ed sheeran,shivers
7,jessica darrow,surface pressure
8,gunna & future featuring young thug,pushin p
9,doja cat,need to know


In [122]:
hot100_songs.to_csv('Data/hot100.csv', index=False)

In [121]:
def get_uk_hot_100():
    uk_url = 'https://www.officialcharts.com/charts/singles-chart/'

    ## Download html with a get request 
    response = requests.get(uk_url)

    ## TODO: improve error checking and throw error here instead of just returning the code
    if response.status_code != 200:
        print(f"Error! HTTP Request returned status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.content, "html.parser")

    ## Get song title
    selected_titles = soup.select("div.title-artist > div.title")

    song_titles = []
    for i in range(len(selected_titles)):
        song_titles.append(selected_titles[i].get_text().strip().lower())

    ## Get the song artist
    selected_artists = soup.select("div.title-artist > div.artist")

    song_artists = []
    for i in range(len(selected_artists)):
        song_artists.append(selected_artists[i].get_text().strip().lower())
    
    return pd.DataFrame({'artist': song_artists, 'title': song_titles})  

In [16]:
uk_hot100_songs = get_uk_hot_100()
uk_hot100_songs.head(3)

NameError: name 'get_uk_hot_100' is not defined

**hmmmmmmmmmm...** need to think about how to handle multiple artists (/ vs , ... what about & ?)

## ------------- Dev space -----------------

In [3]:
url = 'https://www.timeout.com/music/best-party-songs'

## Download html with a get request 
response = requests.get(url)

## TODO: improve error checking and throw error here instead of just returning the code
if response.status_code != 200:
    print(f"Error! HTTP Request returned status code: {response.status_code}")

soup = BeautifulSoup(response.content, "html.parser")

In [6]:
#mw-content-text > div.mw-parser-output > table:nth-child(38) > tbody > tr:nth-child(1) > td:nth-child(3) > a
#mw-content-text > div.mw-parser-output > table:nth-child(38) > tbody > tr:nth-child(3) > td:nth-child(3) > a
#mw-content-text > div.mw-parser-output > table:nth-child(40) > tbody > tr:nth-child(1) > td:nth-child(3) > a

# select song by artist string
song_by_artists = soup.select("div._title_tpquo_9 > h3")

In [31]:
song_by_artists[56].get_text()

'57.\xa0‘O.P.P.’ by Naughty by Nature'

In [39]:
title_i, artist_i = song_by_artists[56].get_text().split("’ by ")
artist_i
".".join(title_i.split(".")[1:])[2:]

'O.P.P.'

In [111]:
# song_titles = []
# for i in range(len(selected_titles)):
#     song_titles.append(selected_titles[i].get_text().strip())

In [51]:
## Get the song artist
song_titles = []
song_artists = []
for i in range(len(song_by_artists)):
    print(f"{song_by_artists[i].get_text()}")
    print(f"""{song_by_artists[i].get_text().split("’ by ")}""")
#     title_i, artist_i = song_by_artists[i].get_text().split("’ by ")
#     title_i = ".".join(title_i.split(".")[1:])[2:]
#     song_titles.append(title_i)
#     song_artists.append(artist_i)
    
to_bps = pd.DataFrame({'artist': song_artists, 'title': song_titles}) 
to_bps   

1. ‘Like a Prayer’ by Madonna
['1.\xa0‘Like a Prayer', 'Madonna']
2. ‘1999’ by Prince
['2.\xa0‘1999', 'Prince']
3. ‘Single Ladies (Put a Ring on It)’ by Beyoncé
['3.\xa0‘Single Ladies (Put a Ring on It)', 'Beyoncé']
4. ‘Gonna Make You Sweat’ by C+C Music Factory
['4.\xa0‘Gonna Make You Sweat', 'C+C Music Factory']
5. ‘Call Me Maybe’ by Carly Rae Jepsen
['5.\xa0‘Call Me Maybe', 'Carly Rae Jepsen']
6. ‘Hey Ya!’ by OutKast
['6.\xa0‘Hey Ya!', 'OutKast']
7. ‘Poison’ by Bell Biv DeVoe
['7.\xa0‘Poison', 'Bell Biv DeVoe']
8. ‘Rock with You’ by Michael Jackson
['8.\xa0‘Rock with You', 'Michael Jackson']
9. ‘Push It’ by Salt-N-Pepa
['9.\xa0‘Push It', 'Salt-N-Pepa']
10. ‘Party Rock Anthem’ by LMFAO
['10.\xa0‘Party Rock Anthem', 'LMFAO']
11. ‘Uptown Funk’ by Mark Ronson ft. Bruno Mars
['11.\xa0‘Uptown Funk', 'Mark Ronson ft. Bruno Mars']
12. ‘Groove Is in the Heart‘ by Deee-Lite
['12.\xa0‘Groove Is in the Heart‘ by Deee-Lite']
13. ‘Got to Be Real’ by Cheryl Lynn
['13.\xa0‘Got to Be Real', 'Cheryl 

Unnamed: 0,artist,title


Unnamed: 0,artist,title
0,GAITAN/CASTILLO/ADASSA/FELIZ,WE DON'T TALK ABOUT BRUNO
1,FIREBOY DML & ED SHEERAN,PERU
2,JESSICA DARROW,SURFACE PRESSURE
3,GAYLE,ABCDEFU
4,LOST FREQUENCIES/CALUM SCOTT,WHERE ARE YOU NOW
...,...,...
95,LEWIS CAPALDI,SOMEONE YOU LOVED
96,EMMY MELI,I AM WOMAN
97,JONASU,BLACK MAGIC
98,NEMZZZ,ELEVATE


In [None]:
strftime('%Y-%m-%d') 