In [333]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
%matplotlib inline

In [334]:
#upload the information using BeautifulSoup
url = 'https://en.wikipedia.org/wiki/List_of_best-selling_music_artists'

response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [335]:
#find the tables located on this page
alltables = soup.find_all('table')
#it finds 5 useful tables

In [336]:
#for each table:
#read the html into a pandas dataframe

m250 = alltables[0]
table1 = pd.read_html(m250.prettify())[0]

m200 = alltables[1]
table2 = pd.read_html(m200.prettify())[0]

m120 = alltables[2]
table3 = pd.read_html(m120.prettify())[0]

m100 = alltables[3]
table4 = pd.read_html(m100.prettify())[0]

m80 = alltables[4]
table5 = pd.read_html(m80.prettify())[0]

In [337]:
#get rid of titles as datapoints
table1 = table1.drop(0)
table2 = table2.drop(0)
table3 = table3.drop(0)
table4 = table4.drop(0)
table5 = table5.drop(0)

In [338]:
#merge all 5 lists into one dataframe
df = pd.concat([table1, table2, table3, table4, table5])

In [339]:
#rename the headers/column titles
df = df.rename(index=str, columns={0: "Artist", 
                                  1: "Country/Market", 
                                  2: "Period Active", 
                                  3: "Year of First Record", 
                                  4: "Genre", 
                                  5: "Total Certified Units", 
                                  6: "Claimed Sales"})

In [340]:
#renumber indexes, so all tables are included
df = df.reset_index(drop=True)

In [341]:
#set up regular expression to weed out [##] ghost-links
killbrackets = re.compile(r' \[[0-9b]+\]')

#use the regex for each column just in case
df["Artist"] = [killbrackets.sub("", i) for i in df["Artist"]]
df["Country/Market"] = [killbrackets.sub("", i) for i in df["Country/Market"]]
df["Period Active"] = [killbrackets.sub("", i) for i in df["Period Active"]]
df["Year of First Record"] = [killbrackets.sub("", i) for i in df["Year of First Record"]]
df["Genre"] = [killbrackets.sub("", i) for i in df["Genre"]]
df["Total Certified Units"] = [killbrackets.sub("", i) for i in df["Total Certified Units"]]
df["Claimed Sales"] = [killbrackets.sub("", i) for i in df["Claimed Sales"]]

In [342]:
#replace "present" to 2018 in Period Active
df["Period Active"] = pd.Series(df["Period Active"]).str.replace("present", "2018")

#calculate the length of years playing music for each band
for i in range(89):
    int1 = int(df["Period Active"][i][5:9])
    int2 = int(df["Period Active"][i][0:4])
    df['Period Active'][i] = int1-int2

#rename column    
df = df.rename(index=str, columns={"Period Active": "Years Active"})

In [343]:
#df.head(5)

In [344]:
df.to_pickle("raw_band_data.pkl")

In [373]:
#deal with that pesky "Total Certified Units" column.
#this whole ordeal will return a dictionary of small dictionaries

bands = {}

def formats(s):
#takes in the raw TCU string
 #  uses the RegularExpressionsStuff to format it
  # returns a list of tuples [(country: number), (country2: number2), ...]'''
    rgxnospaces = re.compile(r"([A-Z]+)\s*(?:: )?([0-9]+\.[0-9]+|[0-9]+,[0-9]{3})") 
    return(rgxnospaces.findall(s))


def makedict(s):
#'''takes in the post-regex list of tuples
 #  creates a dictionary with 3digit country code is the key, values are numbers
  # returns the dictionary'''
    monies = {i:aux(j) for i,j in s}
    return monies


def aux(j):
#'''works within makedict(s)
 #  converts millions to actual numbers, and all numbers to ints'''
    if "," in j:
        return int(j.replace(",",""))
    else:
        return int(float(j)*1000000)
    
    
def banddict(df):
#'''takes in the full raw df
 #  returns a dictionary of things we want'''
    for i in range(89):
        ccoded = pd.Series(makedict(formats(df["Total Certified Units"][i])))
        bands[df["Artist"][i]] = ccoded
    return bands 


def totalcertcounts(df):
    for i in range(89):
        tcurgx = re.compile(r"^[0-9]+[^0-9]+([0-9\.]+)")
        value = (tcurgx.findall(df["Total Certified Units"][i]))
        v1 = value[0]
        v2 = float(v1)
        v3 = v2*1000000
        v4 = int(v3)
        df['Certified Sales'][i] = v4 
    return df
        
def claimedsales(df4):
    for i in range(89):
        claimedrgx = re.compile(r"([0-9\.]+)")
        value = (claimedrgx.findall(df4["Claimed Sales"][i]))
        v1 = value[0]
        v2 = float(v1)
        #print(v1)
        v3 = v2*1000000
        v4 = int(v3)
        df4['Claimed Sales'][i] = v4 
    return df4

In [346]:
#create an empty column to recieve our certified sales data
df = df.reindex(columns = ['Artist', 
                           'Country/Market', 
                           'Years Active', 
                           'Year of First Record',
                           'Genre',
                           'Total Certified Units',
                           'Claimed Sales',
                           'Certified Sales'])
#df.head(2)   

In [347]:
#populate the total certifiable sales data into the new column
df = totalcertcounts(df)
#df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [348]:
banddict(df)

{'ABBA': ARG      238000
 AUS     6127000
 AUT      175000
 BEL      380000
 BRA      275000
 CAN     2785000
 DEN      600000
 FIN      656319
 FRA     2750000
 GER    10450000
 JPN     1500000
 MEX      260000
 NZ       297500
 POL      150000
 SPA     1004999
 SWE      800000
 SWI      600000
 UK     18970000
 US     12700000
 dtype: int64, 'AC/DC': ARG      594000
 AUS     7540000
 AUT      380000
 BEL      100000
 CAN     2490000
 DEN      130000
 FIN      321169
 FRA     4179999
 GER    10300000
 ITA      775000
 NZ       140000
 SPA     1355000
 SWE      350000
 SWI      894000
 UK      4400000
 US     79700000
 dtype: int64, 'Adele': AUS     3955000
 BEL      795000
 BRA     1310000
 CAN     6120000
 DEN      747500
 FIN      151639
 GER     4700000
 ITA     1570000
 JPN      100000
 MEX     1490000
 NZ       577500
 POL      300000
 SPA      800000
 SWE      240000
 SWI      510000
 UK     20550000
 dtype: int64, 'Aerosmith': ARG      338000
 AUS      225000
 AUT      125000
 

In [349]:
#banddict is a dictionary.
#need to convert it to a pandas dataframe, then combine it with the other columns of data

df2=pd.DataFrame.from_dict(bands, orient='index')
#df2.head(2)

In [350]:
df = df.set_index('Artist')   #sets the "Artist" to be the index, instead of arbitrary numbers

In [351]:
df3 = pd.concat([df, df2], axis=1)   #creates df3 as a pandastable of alphabetized artists and their attributes
#df3.head(2)

In [352]:
df3 = df3.drop(columns=['Total Certified Units'])

In [353]:
df3

Unnamed: 0,Country/Market,Years Active,Year of First Record,Genre,Claimed Sales,Certified Sales,ARG,AUS,AUT,BEL,...,SWE,SWI,UK,US,FIN,GER,MEX,IRE,PHL,NLD
ABBA,Sweden,10,1972,Pop / Disco,200 million 100 million,60700000.0,238000.0,6127000.0,175000.0,380000.0,...,800000.0,600000.0,18970000.0,12700000.0,656319.0,10450000.0,260000.0,,,
AC/DC,Australia,45,1975,Hard rock / Blues rock / Rock and roll,200 million 150 million,113600000.0,594000.0,7540000.0,380000.0,100000.0,...,350000.0,894000.0,4400000.0,79700000.0,321169.0,10300000.0,,,,
Adele,United Kingdom,12,2008,"Pop, soul",100 million,104900000.0,,3955000.0,,795000.0,...,240000.0,510000.0,20550000.0,,151639.0,4700000.0,1490000.0,,,
Aerosmith,United States,48,1973,Hard rock,150 million,84700000.0,338000.0,225000.0,125000.0,,...,260000.0,145000.0,3230000.0,70250000.0,101722.0,,230000.0,,,
Ayumi Hamasaki,Japan,20,1998,J-pop / Pop / dance / electronic,80 million,64300000.0,,,,,...,,,,,,,,,,
B'z,Japan,30,1988,Rock / Pop rock / Hard rock,100 million,85700000.0,,,,,...,,,,,,,,,,
Backstreet Boys,United States,25,1995,Pop,100 million,72000000.0,740000.0,1400000.0,325000.0,425000.0,...,655000.0,415000.0,4875000.0,41500000.0,171390.0,,1475000.0,,,
Barbra Streisand,United States,58,1963,Pop / Adult contemporary,145 million,97400000.0,,2047000.0,,,...,120000.0,,5765000.0,82450000.0,186501.0,750000.0,,,,
Barry White,United States,31,1973,R&B / Soul,100 million,21700000.0,,,,150000.0,...,,,3665000.0,16500000.0,,,,,,
Bee Gees,United Kingdom Australia,40,1963,Pop / Disco,120 million,68300000.0,188000.0,1187000.0,125000.0,,...,,415000.0,9065000.0,42500000.0,,6675000.0,,,,


In [376]:
df4 = df3.reset_index(drop=False)
df4 = df4.rename(index=str, columns={"index": "Artist"})

In [377]:
df4

Unnamed: 0,Artist,Country/Market,Years Active,Year of First Record,Genre,Claimed Sales,Certified Sales,ARG,AUS,AUT,...,SWE,SWI,UK,US,FIN,GER,MEX,IRE,PHL,NLD
0,ABBA,Sweden,10,1972,Pop / Disco,200 million 100 million,60700000.0,238000.0,6127000.0,175000.0,...,800000.0,600000.0,18970000.0,12700000.0,656319.0,10450000.0,260000.0,,,
1,AC/DC,Australia,45,1975,Hard rock / Blues rock / Rock and roll,200 million 150 million,113600000.0,594000.0,7540000.0,380000.0,...,350000.0,894000.0,4400000.0,79700000.0,321169.0,10300000.0,,,,
2,Adele,United Kingdom,12,2008,"Pop, soul",100 million,104900000.0,,3955000.0,,...,240000.0,510000.0,20550000.0,,151639.0,4700000.0,1490000.0,,,
3,Aerosmith,United States,48,1973,Hard rock,150 million,84700000.0,338000.0,225000.0,125000.0,...,260000.0,145000.0,3230000.0,70250000.0,101722.0,,230000.0,,,
4,Ayumi Hamasaki,Japan,20,1998,J-pop / Pop / dance / electronic,80 million,64300000.0,,,,...,,,,,,,,,,
5,B'z,Japan,30,1988,Rock / Pop rock / Hard rock,100 million,85700000.0,,,,...,,,,,,,,,,
6,Backstreet Boys,United States,25,1995,Pop,100 million,72000000.0,740000.0,1400000.0,325000.0,...,655000.0,415000.0,4875000.0,41500000.0,171390.0,,1475000.0,,,
7,Barbra Streisand,United States,58,1963,Pop / Adult contemporary,145 million,97400000.0,,2047000.0,,...,120000.0,,5765000.0,82450000.0,186501.0,750000.0,,,,
8,Barry White,United States,31,1973,R&B / Soul,100 million,21700000.0,,,,...,,,3665000.0,16500000.0,,,,,,
9,Bee Gees,United Kingdom Australia,40,1963,Pop / Disco,120 million,68300000.0,188000.0,1187000.0,125000.0,...,,415000.0,9065000.0,42500000.0,,6675000.0,,,,


In [380]:
df4["Claimed Sales"][3]
claimedsales(df4)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Artist,Country/Market,Years Active,Year of First Record,Genre,Claimed Sales,Certified Sales,ARG,AUS,AUT,...,SWE,SWI,UK,US,FIN,GER,MEX,IRE,PHL,NLD
0,ABBA,Sweden,10,1972,Pop / Disco,200000000,60700000.0,238000.0,6127000.0,175000.0,...,800000.0,600000.0,18970000.0,12700000.0,656319.0,10450000.0,260000.0,,,
1,AC/DC,Australia,45,1975,Hard rock / Blues rock / Rock and roll,200000000,113600000.0,594000.0,7540000.0,380000.0,...,350000.0,894000.0,4400000.0,79700000.0,321169.0,10300000.0,,,,
2,Adele,United Kingdom,12,2008,"Pop, soul",100000000,104900000.0,,3955000.0,,...,240000.0,510000.0,20550000.0,,151639.0,4700000.0,1490000.0,,,
3,Aerosmith,United States,48,1973,Hard rock,150000000,84700000.0,338000.0,225000.0,125000.0,...,260000.0,145000.0,3230000.0,70250000.0,101722.0,,230000.0,,,
4,Ayumi Hamasaki,Japan,20,1998,J-pop / Pop / dance / electronic,80000000,64300000.0,,,,...,,,,,,,,,,
5,B'z,Japan,30,1988,Rock / Pop rock / Hard rock,100000000,85700000.0,,,,...,,,,,,,,,,
6,Backstreet Boys,United States,25,1995,Pop,100000000,72000000.0,740000.0,1400000.0,325000.0,...,655000.0,415000.0,4875000.0,41500000.0,171390.0,,1475000.0,,,
7,Barbra Streisand,United States,58,1963,Pop / Adult contemporary,145000000,97400000.0,,2047000.0,,...,120000.0,,5765000.0,82450000.0,186501.0,750000.0,,,,
8,Barry White,United States,31,1973,R&B / Soul,100000000,21700000.0,,,,...,,,3665000.0,16500000.0,,,,,,
9,Bee Gees,United Kingdom Australia,40,1963,Pop / Disco,120000000,68300000.0,188000.0,1187000.0,125000.0,...,,415000.0,9065000.0,42500000.0,,6675000.0,,,,
