In [338]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

url = 'https://en.wikipedia.org/wiki/List_of_best-selling_music_artists'

#download the page; the timeout stops the cell from hanging forever and
#raise_for_status fails loudly on an HTTP error instead of parsing an error page
response = requests.get(url, timeout=30)
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [339]:
#collect every <table> element on the page
#it finds 5 useful tables
alltables = soup.find_all(name='table')

In [340]:
#for each table:
#read the html into a pandas dataframe
#(pandas is imported as pd, so the bare name pandas is not defined here)

m250, m200, m120, m100, m80 = alltables[:5]

#read_html returns a list of dataframes; each snippet holds exactly one table
table1, table2, table3, table4, table5 = [
    pd.read_html(table.prettify())[0]
    for table in (m250, m200, m120, m100, m80)
]

In [341]:
#row 0 of every table holds the titles, not a datapoint: drop it
table1, table2, table3, table4, table5 = [
    table.drop(0) for table in (table1, table2, table3, table4, table5)
]

In [342]:
#stack the 5 tables on top of each other in one dataframe
frames = [table1, table2, table3, table4, table5]
df = pd.concat(frames)

In [343]:
#reconstitute the headers/column titles
#the columns are labelled 0-6, so each title is keyed by its position
headers = ["Artist",
           "Country/Market",
           "Period Active",
           "Year of First Record",
           "Genre",
           "Total Certified Units",
           "Claimed Sales"]
df = df.rename(index=str, columns=dict(enumerate(headers)))

In [344]:
#renumber indexes, so all tables are included
#drop=True discards the old row labels instead of keeping them as a column
df = df.reset_index(drop=True)

In [345]:
#regular expression for the [##] ghost-links: a space, then square
#brackets around one or more digits and/or the letter b
bracket_link = r' \[[0-9b]+\]'
killbrackets = re.compile(bracket_link)

In [346]:
#use the regex for each column just in case
#.str.replace leaves missing cells (NaN) untouched, whereas calling
#killbrackets.sub on them directly raises a TypeError
for column in ["Artist",
               "Country/Market",
               "Period Active",
               "Year of First Record",
               "Genre",
               "Total Certified Units",
               "Claimed Sales"]:
    df[column] = df[column].str.replace(killbrackets, "", regex=True)

In [347]:
#look at the raw text of the first row's certified-units cell
df.loc[0, "Total Certified Units"]

'7002271100000000000♠  Total available certified units: 271.1 million  US  : 212.250 million   JPN  : 4.950 million    UK  : 18.045 million    GER  : 8 million    FRA  : 3.890 million     CAN  : 14.455 million    AUS  : 3.060 million     ITA  : 355,000    BRA  : 550,000    SWE  485,000    SPA  : 1.250 million     SWI  : 350,000    BEL  : 265,000     ARG  : 1.606 million     DEN  : 270,000     AUT  : 500,000    POL  : 175,000    NZ  : 660,000   '

In [348]:
#preview the first rows rather than dumping the whole dataframe
df.head(10)

Unnamed: 0,Artist,Country/Market,Period Active,Year of First Record,Genre,Total Certified Units,Claimed Sales
0,The Beatles,United Kingdom,1960–1970,1962,Rock / Pop,7002271100000000000♠ Total available certifie...,600 million 500 million
1,Elvis Presley,United States,1954–1977,1954,Rock and roll / Pop / Country,7002212400000000000♠ Total available certifie...,600 million 500 million
2,Michael Jackson,United States,1964–2009,1971,Pop / Rock / Dance / Soul / R&B,7002184600000000000♠ Total available certifie...,350 million 300 million
3,Madonna,United States,1979–present,1982,Pop / Dance / Electronica,7002170600000000000♠ Total available certifie...,300 million 275 million
4,Elton John,United Kingdom,1964–present,1969,Pop / Rock,7002169000000000000♠ Total available certifie...,300 million 250 million
5,Led Zeppelin,United Kingdom,1968–1980,1969,Hard rock / Blues rock / Folk rock,7002139690000099999♠ Total available certifie...,300 million 200 million
6,Pink Floyd,United Kingdom,"1965–1996, 2014",1967,Progressive rock / Psychedelic rock,7002118900000000000♠ Total available certifie...,250 million 200 million
7,Rihanna,Barbados United States,2005–present,2005,R&B / Pop / Dance / Hip-hop,7002229500000000000♠ Total available certifie...,230 million
8,Mariah Carey,United States,1988–present,1990,R&B / Pop / Soul / Hip-hop,7002137100000000000♠ Total available certifie...,200 million 175 million
9,Celine Dion,Canada,1981–present,1981,Pop,7002125100000000000♠ Total available certifie...,200 million 175 million


In [349]:
#switch "present" to 2018
period_active = df["Period Active"]
df["Period Active"] = period_active.str.replace("present", "2018")

In [350]:
#turn each "start–end" range into the number of years between the two
#characters 0-3 are the start year, character 4 is the dash and
#characters 5-8 are the end year; anything after that is ignored
#the whole column is done at once: this covers every row instead of a
#hard-coded 89, and avoids chained assignment (df[col][i] = ...), which
#pandas does not guarantee will write back into df
periods = df["Period Active"]
end_year = periods.str[5:9].astype(int)
start_year = periods.str[0:4].astype(int)
df["Period Active"] = end_year - start_year

In [351]:
#the column now holds a number of years, so give it a matching title
#note: index=str also converts the row labels to strings
new_titles = {"Period Active": "Years Active"}
df = df.rename(columns=new_titles, index=str)

In [353]:
#show the last 50 rows of the dataframe
df.tail(50)

Unnamed: 0,Artist,Country/Market,Years Active,Year of First Record,Genre,Total Certified Units,Claimed Sales
39,Lil Wayne,United States,22,1999,Hip-hop,7001799000000000000♠ Total available certifie...,100 million
40,Britney Spears,United States,20,1998,Pop / Dance / Dance-pop,7001787000000000000♠ Total available certifie...,100 million
41,Rod Stewart,United Kingdom,54,1969,Rock / Pop,7001781009999900000♠ Total available certifie...,100 million
42,Fleetwood Mac,United Kingdom United States,51,1968,Rock / Pop,7001763000000000000♠ Total available certifie...,100 million
43,Guns N' Roses,United States,33,1987,Hard rock / Heavy metal,7001743000000000000♠ Total available certifie...,100 million
44,George Strait,United States,37,1984,Country,7001728000000000000♠ Total available certifie...,100 million
45,Backstreet Boys,United States,25,1995,Pop,7001720000000000000♠ Total available certifie...,100 million
46,Neil Diamond,United States,52,1966,Pop / Rock,7001683000000000000♠ Total available certifie...,100 million
47,Prince,United States,40,1978,Funk / R&B / Pop / Soul / Rock,7001642000000000000♠ Total available certifie...,100 million
48,Kenny Rogers,United States,60,1975,Country / Pop,7001601000000000000♠ Total available certifie...,100 million
