In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
# pandas display options: allow wider console output, show up to 100 columns,
# and enable the HTML representation of DataFrames in the notebook.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# seaborn styling: "whitegrid" style with the "poster" plotting context.
sns.set_style("whitegrid")
sns.set_context("poster")

In [4]:
import requests

# Download the Wikipedia page for the Billboard Year-End Hot 100 singles of 1970.
# The URL is split across two adjacent literals only to keep the line short.
req = requests.get(
    "https://en.wikipedia.org/wiki/"
    "Billboard_Year-End_Hot_100_singles_of_1970"
)

In [5]:
from bs4 import BeautifulSoup

# Parse the downloaded 1970 chart page with Python's built-in HTML parser.
page = req.text
soup = BeautifulSoup(page, 'html.parser')

# First table found for the class string "wikitable sortable"; the cell that
# builds songs_list reads its rows.
table_html = soup.find('table', attrs={'class': 'wikitable sortable'})

# Stop here with a readable message instead of hitting an AttributeError on
# None later, when the rows of the table are read.
assert table_html is not None, "no 'wikitable sortable' table found in the downloaded page"

In [6]:
# One dict per chart row of the 1970 table: band_singer, ranking, title, url.
songs_list = []

# The first <tr> is skipped (it is treated as the header row).
table_row_list = table_html.find_all('tr')
table_rows = table_row_list[1:]

for row in table_rows:
    table_row_values = row.find_all('td')

    # A row with fewer than three <td> cells cannot be split into
    # ranking / title / artist; skip it rather than raise IndexError.
    if len(table_row_values) < 3:
        continue

    ranking = table_row_values[0].get_text()
    title = table_row_values[1].get_text()
    band_singer = table_row_values[2].get_text().replace("\n", "")

    # The title cell may contain no <a> at all; store None for the url in that
    # case instead of failing on `None["href"]`.
    title_link = table_row_values[1].a
    url = title_link.get("href") if title_link is not None else None

    dic = {'band_singer': band_singer,
           'ranking': ranking,
           'title': title,
           'url': url}
    songs_list.append(dic)
print(songs_list[2:4])

[{'band_singer': 'The Guess Who', 'ranking': '3', 'title': '"American Woman"', 'url': '/wiki/American_Woman'}, {'band_singer': 'B.J. Thomas', 'ranking': '4', 'title': '"Raindrops Keep Fallin\' on My Head"', 'url': '/wiki/Raindrops_Keep_Fallin%27_on_My_Head'}]


In [7]:
# Raw HTML text of each year's chart page, keyed by the integer year.
years = dict()

# range() excludes its upper bound, so this downloads 1992 through 2014.
for y in range(1992, 2015):
    st = "https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{}".format(y)
    temp_req = requests.get(st)
    years[y] = temp_req.text

In [8]:
def _parse_title_cell(title_cell):
    """Return ``(song, songurl, titletext)`` for the song-title cell of a row."""
    links = title_cell.find_all('a')
    if not links:
        # No link in the cell: fall back to its visible text, dropping any
        # surrounding double quotes so the name matches the linked case.
        name = title_cell.get_text().strip().strip('"')
        return [name], [None], '"' + name + '"'
    song = [a['title'] for a in links]
    songurl = [a['href'] for a in links]
    # Several linked titles in one cell are joined as '"A" / "B"'.
    titletext = ' / '.join('"' + name + '"' for name in song)
    return song, songurl, titletext


def _parse_artist_cell(artist_cell):
    """Return ``(band_singer, url)`` lists for the artist cell of a row."""
    if artist_cell is None:
        # The row has no second <td>; report the artist as unknown rather than
        # reading a cell that belongs to a following row.
        return [None], [None]
    links = artist_cell.find_all('a')
    if not links:
        return [artist_cell.string], [None]
    return [a.string for a in links], [a['href'] for a in links]


def parse_year(the_year, yeartext_dict):
    """Parse one year's chart page into a list of per-row song records.

    Parameters
    ----------
    the_year : hashable
        Key of the page to parse in ``yeartext_dict``.
    yeartext_dict : dict
        Maps a year to the HTML text of that year's chart page.

    Returns
    -------
    list of dict
        One dict per table row that contains at least one ``<td>``, with keys:
        ``band_singer`` (list of artist names), ``ranking`` (text of the row's
        ``<th>`` with newlines removed), ``song`` (list of titles),
        ``songurl`` (list of title hrefs, ``[None]`` when the title is not
        linked), ``titletext`` (quoted titles joined by ``' / '``) and ``url``
        (list of artist hrefs, ``[None]`` when the artist is not linked).

    Raises
    ------
    KeyError
        If ``the_year`` is not in ``yeartext_dict``.
    ValueError
        If the page has no table matching class ``wikitable sortable``.
    AttributeError
        If a data row has no ``<th>`` cell to read the ranking from.
    """
    # Read the page from the dict that was passed in, not from a global.
    soup = BeautifulSoup(yeartext_dict[the_year], "lxml")
    table = soup.find('table', attrs={'class': 'wikitable sortable'})
    if table is None:
        raise ValueError("no 'wikitable sortable' table found for year %s" % the_year)

    yearinfo = []
    for tr in table.find_all('tr'):
        td_list = tr.find_all('td')
        if not td_list:
            # Rows without any <td> (e.g. the header row) carry no song data.
            continue

        ranking = tr.th.get_text()
        song, songurl, title_text = _parse_title_cell(td_list[0])
        # Only look at a second cell inside this same row.
        artist_cell = td_list[1] if len(td_list) > 1 else None
        band_singer, singer_url = _parse_artist_cell(artist_cell)

        yearinfo.append({'band_singer': band_singer,
                         'ranking': ranking.replace("\n", ""),
                         'song': song, 'songurl': songurl, 'titletext': title_text,
                         'url': singer_url})
    return yearinfo

In [9]:
# Parsed song records per year; range() excludes 2005, so this covers 1992-2004.
years_info = {}
for y in range(1992, 2005):
    years_info[y] = parse_year(y, years)

print(years_info[1997])

[{'band_singer': ['Elton John'], 'ranking': '1', 'song': ['Something About the Way You Look Tonight', 'Candle in the Wind 1997'], 'songurl': ['/wiki/Something_About_the_Way_You_Look_Tonight', '/wiki/Candle_in_the_Wind_1997'], 'titletext': '"Something About the Way You Look Tonight" / "Candle in the Wind 1997"', 'url': ['/wiki/Elton_John']}, {'band_singer': ['Jewel'], 'ranking': '2', 'song': ['Foolish Games', 'You Were Meant for Me (Jewel song)'], 'songurl': ['/wiki/Foolish_Games', '/wiki/You_Were_Meant_for_Me_(Jewel_song)'], 'titletext': '"Foolish Games" / "You Were Meant for Me (Jewel song)"', 'url': ['/wiki/Jewel_(singer)']}, {'band_singer': ['Puff Daddy', 'Faith Evans', '112'], 'ranking': '3', 'song': ["I'll Be Missing You"], 'songurl': ['/wiki/I%27ll_Be_Missing_You'], 'titletext': '"I\'ll Be Missing You"', 'url': ['/wiki/Sean_Combs', '/wiki/Faith_Evans', '/wiki/112_(band)']}, {'band_singer': ['Toni Braxton'], 'ranking': '4', 'song': ['Un-Break My Heart'], 'songurl': ['/wiki/Un-Brea

In [12]:
import json

In [None]:
# RERUN WHEN SUBMITTING
# Read the saved per-year records back from disk. The `with` block closes the
# file automatically; open() defaults to read mode.
with open("yearinfo.json") as fd:
    yearinfo = json.load(fd)

In [None]:
# Flatten the {year: [song, ...]} mapping into a single list of song records.
# Each record is tagged in place with the year key it was stored under.
rows = []
for year, songs in yearinfo.items():
    for song in songs:
        song['year'] = year
        rows.append(song)

In [None]:
# `rows2` was never defined in this notebook, so this cell failed on a fresh
# kernel. Build it here from `rows`: one record per (song, artist) pair, so
# that `band_singer` and `url` hold a single value each, which the later
# value_counts() and get_page() cells need.
# NOTE(review): assumes each record's 'band_singer' and 'url' are parallel
# lists, as produced by parse_year -- confirm against yearinfo.json.
rows2 = []
for song in rows:
    singers = song['band_singer']
    singer_urls = song['url']
    # Tolerate records that already hold a single value instead of a list.
    if not isinstance(singers, list):
        singers = [singers]
    if not isinstance(singer_urls, list):
        singer_urls = [singer_urls]
    for singer, singer_url in zip(singers, singer_urls):
        record = dict(song)
        record['band_singer'] = singer
        record['url'] = singer_url
        rows2.append(record)
flatframe = pd.DataFrame(rows2)


# check datatypes of dataframe columns
flatframe['year'].dtype
flatframe

In [None]:
# Count how many rows of flatframe each band_singer value appears in;
# value_counts() sorts the result by count, largest first.
artist_count = flatframe.loc[:, "band_singer"].value_counts(sort=True)

artist_count

In [13]:
# Cache used by get_page: url -> page text, or 1 / 2 when a request failed.
urlcache = dict()

In [14]:
def get_page(url):
    """Return the Wikipedia page text for ``url``, using ``urlcache``.

    ``url`` is a site-relative path (e.g. ``/wiki/Some_Page``) appended to
    ``http://en.wikipedia.org``. A page is fetched only when ``url`` is not in
    ``urlcache`` yet or a previous attempt stored a failure marker; otherwise
    the cached value is returned without any network access.

    Returns
    -------
    str or int
        The page text on success, ``1`` when the server answered with a status
        other than 200, or ``2`` when the request itself failed. The same
        value is stored in ``urlcache[url]``.
    """
    # Fetch on a cache miss, and retry entries that hold a failure marker.
    needs_fetch = (url not in urlcache) or (urlcache[url] in (1, 2))
    if needs_fetch:
        # Pause before every request so the server is not hammered.
        time.sleep(1)
        try:
            # The timeout keeps one stalled connection from hanging the whole run.
            r = requests.get("http://en.wikipedia.org%s" % url, timeout=30)
        except requests.exceptions.RequestException:
            # Only request failures are recorded as 2; a bare `except:` would
            # also swallow KeyboardInterrupt and make the loop unstoppable.
            urlcache[url] = 2
        else:
            urlcache[url] = r.text if r.status_code == 200 else 1
    return urlcache[url]

In [None]:
# Re-order the rows by their 'year' value.
flatframe = flatframe.sort_values(by='year')
flatframe.head()
# Only this last expression is rendered as the cell output.
urlcache

In [None]:
# DO NOT RERUN THIS CELL WHEN SUBMITTING
# Populate the url cache. get_page sleeps 1 second before every request, so the
# first run takes a long time (about 30 minutes was reported for this notebook).
# Running the cell again is almost instantaneous: only urls whose previous
# request failed are fetched again (the next cell counts those).
#
# Loop over the unique urls and discard the return values: ending the cell
# with `flatframe["url"].apply(get_page)` would render a Series holding the
# full HTML of every page as the cell output.
for artist_url in flatframe["url"].unique():
    get_page(artist_url)

In [None]:
# DO NOT RERUN THIS CELL WHEN SUBMITTING
# A cached value of 1 or 2 is a failure marker left by get_page, not page text.
failed_flags = [page == 1 or page == 2 for page in urlcache.values()]
print("Number of bad requests:", np.sum(failed_flags))
# True when the cache has exactly one entry per distinct url in flatframe.
print("Did we get all urls?", len(flatframe.url.unique()) == len(urlcache))

In [None]:
# DO NOT RERUN THIS CELL WHEN SUBMITTING
# Save the cache to disk, then drop it from memory.
# The path matches the one the reload cell opens ("artistinfo.json"); writing
# to "data/artistinfo.json" left that cell reading a different file.
with open("artistinfo.json", "w") as fd:
    json.dump(urlcache, fd)
del urlcache

In [None]:
# RERUN WHEN SUBMITTING
# Reload the url -> page cache that was saved to disk.
with open("artistinfo.json") as json_file:
    urlcache = json.load(json_file)

# Show only the number of entries: the values are whole web pages, so
# displaying the dict itself floods the notebook output.
len(urlcache)

In [18]:
def singer_band_info(url, page_text):
    """Extract the birth year or the "Years active" text from a cached page.

    Parameters
    ----------
    url : str
        Key of the page in ``page_text``; copied into the result.
    page_text : dict
        Maps a url to its page HTML, or to a non-string failure marker
        (``get_page`` stores 1 or 2 when a download failed).

    Returns
    -------
    dict
        Always has the keys ``url``, ``born`` and ``ya``:

        * a ``span`` with class ``bday`` was found in the infobox:
          ``born`` is the first four characters of its text, ``ya`` is ``''``;
        * otherwise, a row whose header span reads ``Years active`` was found:
          ``born`` is the string ``'False'``, ``ya`` is that row's cell text
          (the last such row wins);
        * otherwise (failed download, no infobox, or neither field present):
          ``born`` is ``'False'`` and ``ya`` is ``False``.

    Raises
    ------
    KeyError
        If ``url`` is not a key of ``page_text``.
    """
    not_found = {'url': url, 'born': 'False', 'ya': False}

    page = page_text[url]
    if not isinstance(page, str):
        # Failure marker instead of HTML: there is nothing to parse.
        return not_found

    # soupify our webpage
    soup = BeautifulSoup(page, "lxml")
    infobox = soup.find('table', attrs={'class': 'infobox vcard plainlist'})
    if infobox is None:
        infobox = soup.find('table', attrs={'class': 'infobox biography vcard'})
    if infobox is None:
        # Neither infobox class is present; report "not found" rather than
        # raising AttributeError on None.
        return not_found

    bday = infobox.find('span', {'class': 'bday'})
    if bday is not None:
        # Keep only the first four characters of the text (the year).
        return {'url': url, 'born': bday.get_text()[:4], 'ya': ''}

    result = not_found
    for tr in infobox.find_all('tr'):
        header = tr.th
        label = header.span if header is not None else None
        # Skip rows without a header span, and a matching header that has no
        # data cell next to it.
        if label is not None and label.string == 'Years active' and tr.td is not None:
            result = {'url': url, 'born': 'False', 'ya': tr.td.text}
    return result

In [None]:
# Spot-check the extractor on a single cached page.
url = '/wiki/Iggy_Azalea'
singer_band_info(url=url, page_text=urlcache)

In [None]:
# Run singer_band_info over every cached url and collect the resulting dicts.
list_of_addit_dicts = []
# (url, error) pairs for pages that could not be processed.
failed_urls = []
bday_dict = {}
for url in urlcache.keys():
    try:
        bday_dict = singer_band_info(url, urlcache)
    except Exception as exc:
        # One bad page must not abandon every url after it: `break` here
        # silently truncated the result at the first failure. Record the
        # problem and carry on. Catching Exception (not a bare except) still
        # lets KeyboardInterrupt stop the loop.
        failed_urls.append((url, repr(exc)))
        continue
    list_of_addit_dicts.append(bday_dict)

print("Pages that could not be processed:", len(failed_urls))

In [None]:
# Turn the per-url dicts into a frame of their own.
additional_df = pd.DataFrame(data=list_of_addit_dicts)

# Outer join on 'url': rows of either frame without a partner in the other
# are kept.
largedf = flatframe.merge(additional_df, how="outer", left_on='url', right_on='url')
largedf