## There will be three modules here
-Module 1: scrape UK Year-End Singles Top 100 Chart information and turn into dataframe(Artist,Song)
-Module 2: scrape Billboard Year-End Singles Top 100 Chart information and turn into dataframe(Artist,Song)
-Module 3: use information to get lyrics to each song

In [80]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### Module 1: UK Top 100 song info into dataframe

In [81]:
def ukchart_to_df(url):
    '''
    Scrapes webpage that has year-end top 100 singles of the uk charts, turns into df

    Input: str, url of uk-charts.top-source.info page that has the year-end 100 singles chart
    Output: df, dataframe of 100 songs (artist, song)
    '''
    temp = requests.get(url)
    soup = BeautifulSoup(temp.text)
    
    artists = []
    songs = []
    table = soup.find("table").find("tbody").find_all("tr")
    for r in range(100):
        artist = table[r].find_all("td")[1].get_text()
        song = table[r].find_all("td")[2].get_text()
        artists.append(artist)
        songs.append(song)

    tuples_data = list(zip(artists, songs))
    return pd.DataFrame(tuples_data, columns=['Artist','Song'])

In [82]:
# testing above function on a single year
chart_to_df('http://www.uk-charts.top-source.info/top-100-2019.shtml')

Unnamed: 0,Artist,Song
0,Lewis Capaldi,Someone You Loved
1,Lil Nas X,Old Town Road
2,Billie Eilish,bad guy
3,Calvin Harris & Rag'n'Bone Man,Giant
4,AJ Tracey,Ladbroke Grove
...,...,...
95,Taylor Swift Ft Brendon Urie,ME!
96,Jax Jones & Bebe Rexha,Harder
97,Stormzy,Crown
98,Lauv Ft Anne-Marie,"fuck, i'm lonely"


In [83]:
# Use ukchart_to_df function to get songs from not just one year, but from 1990 - 2019
uk_pops = pd.DataFrame()
for yr in range(1990, 2020):
    url = 'http://www.uk-charts.top-source.info/top-100-' + str(yr) + '.shtml'
    df = ukchart_to_df(url)
    uk_pops = uk_pops.append(df, ignore_index=True)

In [84]:
uk_pops.shape

(3000, 2)

In [85]:
# We now have 100 * 30 yrs = 3000 songs. Some songs appear in multiple years, so drop those.
uk_pops = uk_pops.drop_duplicates(ignore_index=True)
uk_pops.shape

(2798, 2)

### Module 2: Billboard Top 100 song info into dataframe

In [112]:
# similar workflow to Module 1
def uschart_to_df(url):
    '''
    Scrapes webpage that has year-end top 100 singles of the uk charts, turns into df

    Input: str, url of uk-charts.top-source.info page that has the year-end 100 singles chart
    Output: df, dataframe of 100 songs (artist, song)
    '''
    temp = requests.get(url)
    soup = BeautifulSoup(temp.text)
    
    artists = []
    songs = []
    table = soup.find("table", {'class': "wikitable sortable"}).find("tbody").find_all("tr")
    for r in range(1,101):
        artist_raw = table[r].find_all("td")[1].get_text()
        song_raw = table[r].find_all("td")[2].get_text()
        #rid of quotations and linebreaks
        artist = artist_raw[1:-1]
        song = song_raw[:-1]
        artists.append(artist)
        songs.append(song)

    tuples_data = list(zip(artists, songs))
    return pd.DataFrame(tuples_data, columns=['Artist','Song'])

In [113]:
uschart_to_df('https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_1990')

Unnamed: 0,Artist,Song
0,Hold On,Wilson Phillips
1,It Must Have Been Love,Roxette
2,Nothing Compares 2 U,Sinéad O'Connor
3,Poison,Bell Biv DeVoe
4,Vogue,Madonna
...,...,...
95,Without You,Mötley Crüe
96,Swing the Mood,Jive Bunny and the Mastermixers
97,Thieves in the Temple,Prince
98,Mentirosa,Mellow Man Ace


In [114]:
# get all songs from 1990 - 2019 and concat
us_pops = pd.DataFrame()
for yr in range(1990, 2020):
    url = 'https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_' + str(yr)
    df = uschart_to_df(url)
    us_pops = us_pops.append(df, ignore_index=True)

1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019


In [None]:
us_pops