In [77]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
%matplotlib inline

In [78]:
url = 'https://en.wikipedia.org/wiki/List_of_best-selling_music_artists'

response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [79]:
alltables = soup.find_all('table')
#it finds 5 useful tables

In [80]:
#for each table:
#read the html into a pandas dataframe

m250 = alltables[0]
table1 = pd.read_html(m250.prettify())[0]

m200 = alltables[1]
table2 = pd.read_html(m200.prettify())[0]

m120 = alltables[2]
table3 = pd.read_html(m120.prettify())[0]

m100 = alltables[3]
table4 = pd.read_html(m100.prettify())[0]

m80 = alltables[4]
table5 = pd.read_html(m80.prettify())[0]

In [81]:
#get rid of titles as datapoints
table1 = table1.drop(0)
table2 = table2.drop(0)
table3 = table3.drop(0)
table4 = table4.drop(0)
table5 = table5.drop(0)

In [82]:
#merge all 5 lists into one dataframe
df = pd.concat([table1, table2, table3, table4, table5])

In [83]:
#reconstitute the headers/column titles
df = df.rename(index=str, columns={0: "Artist", 
                                  1: "Country/Market", 
                                  2: "Period Active", 
                                  3: "Year of First Record", 
                                  4: "Genre", 
                                  5: "Total Certified Units", 
                                  6: "Claimed Sales"})

In [84]:
#renumber indexes, so all tables are included
df = df.reset_index(drop=True)

In [85]:
#set up regular expression to weed out [##] ghost-links
killbrackets = re.compile(r' \[[0-9b]+\]')

In [86]:
#use the regex for each column just in case
df["Artist"] = [killbrackets.sub("", i) for i in df["Artist"]]
df["Country/Market"] = [killbrackets.sub("", i) for i in df["Country/Market"]]
df["Period Active"] = [killbrackets.sub("", i) for i in df["Period Active"]]
df["Year of First Record"] = [killbrackets.sub("", i) for i in df["Year of First Record"]]
df["Genre"] = [killbrackets.sub("", i) for i in df["Genre"]]
df["Total Certified Units"] = [killbrackets.sub("", i) for i in df["Total Certified Units"]]
df["Claimed Sales"] = [killbrackets.sub("", i) for i in df["Claimed Sales"]]

In [87]:
df["Total Certified Units"][0]

'7002271100000000000♠  Total available certified units: 271.1 million  US  : 212.250 million   JPN  : 4.950 million    UK  : 18.045 million    GER  : 8 million    FRA  : 3.890 million     CAN  : 14.455 million    AUS  : 3.060 million     ITA  : 355,000    BRA  : 550,000    SWE  485,000    SPA  : 1.250 million     SWI  : 350,000    BEL  : 265,000     ARG  : 1.606 million     DEN  : 270,000     AUT  : 500,000    POL  : 175,000    NZ  : 660,000   '

In [88]:
df

Unnamed: 0,Artist,Country/Market,Period Active,Year of First Record,Genre,Total Certified Units,Claimed Sales
0,The Beatles,United Kingdom,1960–1970,1962,Rock / Pop,7002271100000000000♠ Total available certifie...,600 million 500 million
1,Elvis Presley,United States,1954–1977,1954,Rock and roll / Pop / Country,7002212400000000000♠ Total available certifie...,600 million 500 million
2,Michael Jackson,United States,1964–2009,1971,Pop / Rock / Dance / Soul / R&B,7002184600000000000♠ Total available certifie...,350 million 300 million
3,Madonna,United States,1979–present,1982,Pop / Dance / Electronica,7002170600000000000♠ Total available certifie...,300 million 275 million
4,Elton John,United Kingdom,1964–present,1969,Pop / Rock,7002169000000000000♠ Total available certifie...,300 million 250 million
5,Led Zeppelin,United Kingdom,1968–1980,1969,Hard rock / Blues rock / Folk rock,7002139690000099999♠ Total available certifie...,300 million 200 million
6,Pink Floyd,United Kingdom,"1965–1996, 2014",1967,Progressive rock / Psychedelic rock,7002118900000000000♠ Total available certifie...,250 million 200 million
7,Rihanna,Barbados United States,2005–present,2005,R&B / Pop / Dance / Hip-hop,7002229500000000000♠ Total available certifie...,230 million
8,Mariah Carey,United States,1988–present,1990,R&B / Pop / Soul / Hip-hop,7002137100000000000♠ Total available certifie...,200 million 175 million
9,Celine Dion,Canada,1981–present,1981,Pop,7002125100000000000♠ Total available certifie...,200 million 175 million


In [33]:
#switch "present" to 2018
df["Period Active"] = pd.Series(df["Period Active"]).str.replace("present", "2018")

In [34]:
for i in range(89):
    int1 = int(df["Period Active"][i][5:9])
    int2 = int(df["Period Active"][i][0:4])
    df['Period Active'][i] = int1-int2

In [35]:
df = df.rename(index=str, columns={"Period Active": "Years Active"})

In [36]:
df.tail(89)

Unnamed: 0,Artist,Country/Market,Years Active,Year of First Record,Genre,Total Certified Units,Claimed Sales
0,The Beatles,United Kingdom,10,1962,Rock / Pop,7002271100000000000♠ Total available certifie...,600 million 500 million
1,Elvis Presley,United States,23,1954,Rock and roll / Pop / Country,7002212400000000000♠ Total available certifie...,600 million 500 million
2,Michael Jackson,United States,45,1971,Pop / Rock / Dance / Soul / R&B,7002184600000000000♠ Total available certifie...,350 million 300 million
3,Madonna,United States,39,1982,Pop / Dance / Electronica,7002170600000000000♠ Total available certifie...,300 million 275 million
4,Elton John,United Kingdom,54,1969,Pop / Rock,7002169000000000000♠ Total available certifie...,300 million 250 million
5,Led Zeppelin,United Kingdom,12,1969,Hard rock / Blues rock / Folk rock,7002139690000099999♠ Total available certifie...,300 million 200 million
6,Pink Floyd,United Kingdom,31,1967,Progressive rock / Psychedelic rock,7002118900000000000♠ Total available certifie...,250 million 200 million
7,Rihanna,Barbados United States,13,2005,R&B / Pop / Dance / Hip-hop,7002229500000000000♠ Total available certifie...,230 million
8,Mariah Carey,United States,30,1990,R&B / Pop / Soul / Hip-hop,7002137100000000000♠ Total available certifie...,200 million 175 million
9,Celine Dion,Canada,37,1981,Pop,7002125100000000000♠ Total available certifie...,200 million 175 million


In [89]:
df.to_pickle("raw_band_data.pkl")

In [42]:
#doesn't work because there are too many columns that are not numerical
#df.corr()
#sns.pairplot(df)

In [46]:
byyears = df.groupby(['Years Active', 'Year of First Record', 'Total Certified Units', 'Claimed Sales']).sum()

In [47]:
byyears

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Artist,Country/Market,Genre
Years Active,Year of First Record,Total Certified Units,Claimed Sales,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8,1967,"7001469000000000000♠ Total available certified units: 46.9 million US: 36.6 million UK: 2.040 million GER: 1.750 million FRA: 1.9 million CAN: 2.780 million AUS: 1.022 million ITA: 130,000 BRA: 100,000 SPA: 300,000 SWI: 125,000 ARG: 150,000 AUT: 100,000",80 million,The Doors,United States,Psychedelic rock
9,1972,"7002129400000000000♠ Total available certified units: 129.4 million US: 108.9 million JPN: 600,000 UK: 7.835 million GER: 925,000 FRA: 2.040 million CAN: 4.2 million AUS: 3.395 million SWE: 140,000 SPA: 500,000 SWI: 290,000 DEN: 185,000 > FIN: 124,749 NZ: 352,500",150 million,Eagles,United States,Rock
10,1962,"7002271100000000000♠ Total available certified units: 271.1 million US : 212.250 million JPN : 4.950 million UK : 18.045 million GER : 8 million FRA : 3.890 million CAN : 14.455 million AUS : 3.060 million ITA : 355,000 BRA : 550,000 SWE 485,000 SPA : 1.250 million SWI : 350,000 BEL : 265,000 ARG : 1.606 million DEN : 270,000 AUT : 500,000 POL : 175,000 NZ : 660,000",600 million 500 million,The Beatles,United Kingdom,Rock / Pop
10,1972,"7001607000000000000♠ Total available certified units: 60.7 million US: 12.7 million JPN: 1.5 million UK: 18.970 million GER: 10.450 million FRA: 2.750 million CAN: 2.785 million AUS: 6.127 million BRA: 275,000 SWE: 800,000 SPA: 1.005 million MEX: 260,000 SWI: 600,000 BEL: 380,000 ARG: 238,000 DEN: 600,000 AUT: 175,000 POL: 150,000 FIN: 656,319 NZ: 297,500",200 million 100 million,ABBA,Sweden,Pop / Disco
10,2009,"7002132600000000000♠ Total available certified units: 132.6 million US: 87.5 million JPN: 750,000 UK: 17.285 million GER: 3.3 million FRA: 1.2 CAN: 5.620 million AUS: 4.725 million ITA: 1.795 million BRA: 810,000 SWE: 2.020 million SPA: 880,000 MEX: 2.910 million BEL: 450,000 DEN: 2.230 million POL: 370,000 AUT: 140,000 NZ: 615,000",100 million,Justin Bieber,Canada United States,"Pop / Teen pop, Dance pop"
10,2010,"7001716009999900000♠ Total available certified units: 71.6 million US: 58 million JPN: 350,000 UK: 7.1 million GER: 800,000 FRA: 225,000 CAN: 480,000 AUS: 2.835 million ITA: 375,000 BRA: 250,000 SWE: 400,000 SPA: 100,000 MEX: 150,000 SWI: 180,000 DEN: 190,000 NZ: 187,500",85 million,Nicki Minaj,United States,"Hip hop, Pop"
11,2007,"7001698000000000000♠ Total available certified units: 69.8 million US: 49 million JPN: 650,000 UK: 7.960 million GER: 3 million FRA: 225,000 CAN: 2.810 million AUS: 3.535 million ITA: 330,000 SWE: 780,000 SPA: 140,000 MEX: 330,000 SWI: 420,000 DEN: 205,000 AUT: 180,000 NZ: 247,500",80 million,Flo Rida,United States,"Hip-hop, Hip house , EDM"
12,1969,"7002139690000099999♠ Total available certified units: 139.7 million US: 114.1 million JPN: 400,000 UK: 9.330 million GER: 3.775 million FRA: 2.310 million CAN: 4.710 million AUS: 2.8 million ITA: 345,000 BRA: 820,000 SPA: 450,000 SWI: 211,000 ARG: 360,000 POL: 120,000",300 million 200 million,Led Zeppelin,United Kingdom,Hard rock / Blues rock / Folk rock
12,2006,"7002181600000000000♠ Total available certified units: 181.6 million US : 157 million JPN : 2.850 million UK : 8.9 million GER : 950,000 FRA : 250,000 CAN : 4.580 million AUS : 5.495 million ITA : 285,000 BRA : 170,000 SWE : 180,000 MEX : 450,000 DEN : 140,000> NZ : 375,000",175 million,Taylor Swift,United States,Country / Country pop / Pop / Pop-rock
12,2008,"7002104900000000000♠ Total available certified units: 104.9 million US: 61 million JPN: 100,000 UK: 20.550 million GER: 4.7 million CAN: 6.120 million AUS: 3.955 million ITA: 1.570 million BRA: 1.310 million SWE: 240,000 SPA: 800,000 MEX: 1.490 million SWI: 510,000 BEL: 795,000 DEN: 747,500 POL: 300,000 FIN: 151,639 NZ: 577,500",100 million,Adele,United Kingdom,"Pop, soul"


In [48]:
globally = df[['Artist', 'Country/Market', 'Total Certified Units']].copy()

In [68]:
globally["Total Certified Units"].str.replace(' million', '000000')
globally["Total Certified Units"][0]

'7002271100000000000♠  Total available certified units: 271.1 million  US  : 212.250 million   JPN  : 4.950 million    UK  : 18.045 million    GER  : 8 million    FRA  : 3.890 million     CAN  : 14.455 million    AUS  : 3.060 million     ITA  : 355,000    BRA  : 550,000    SWE  485,000    SPA  : 1.250 million     SWI  : 350,000    BEL  : 265,000     ARG  : 1.606 million     DEN  : 270,000     AUT  : 500,000    POL  : 175,000    NZ  : 660,000   '

In [70]:
globally['Total Certified Units'][0].rsplit('   ')

['7002271100000000000♠  Total available certified units: 271.1 million  US  : 212.250 million',
 'JPN  : 4.950 million ',
 'UK  : 18.045 million ',
 'GER  : 8 million ',
 'FRA  : 3.890 million  ',
 'CAN  : 14.455 million ',
 'AUS  : 3.060 million  ',
 'ITA  : 355,000 ',
 'BRA  : 550,000 ',
 'SWE  485,000 ',
 'SPA  : 1.250 million  ',
 'SWI  : 350,000 ',
 'BEL  : 265,000  ',
 'ARG  : 1.606 million  ',
 'DEN  : 270,000  ',
 'AUT  : 500,000 ',
 'POL  : 175,000 ',
 'NZ  : 660,000',
 '']

In [57]:
pd.Series(df["Total Certified Units"][0]).str.replace(" million ", "000000")

0    7002271100000000000♠  Total available certifie...
dtype: object

0    7002271100000000000♠  Total available certifie...
dtype: object

In [72]:
for i in range(2):
    print(df["Total Certified Units"][i])

0    7002271100000000000♠  Total available certifie...
dtype: object
7002212400000000000♠  Total available certified units: 212.4 million  US: 188.650 million   JPN: 300,000    UK: 13.145 million    GER: 1.2 million    FRA: 2.590 million     CAN: 2.925 million    AUS: 1.587 million    ITA: 105,000    BRA: 125,000    SWE 380,000    SPA: 300,000      MEX  : 105,000     SWI: 185,000    BEL: 115,000     ARG: 110,000    DEN: 120,000      AUT: 205,000    FIN  : 213,945    NZ: 117,500   


In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89 entries, 0 to 88
Data columns (total 7 columns):
Artist                   89 non-null object
Country/Market           89 non-null object
Years Active             89 non-null object
Year of First Record     89 non-null object
Genre                    89 non-null object
Total Certified Units    89 non-null object
Claimed Sales            89 non-null object
dtypes: object(7)
memory usage: 8.1+ KB
