In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
%matplotlib inline

  from pandas.core import datetools


In [3]:
#download the Wikipedia page and parse it using BeautifulSoup
url = 'https://en.wikipedia.org/wiki/List_of_best-selling_music_artists'

# The timeout stops the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx reply instead of parsing an error page, which would
# only surface later as a confusing IndexError when the tables are indexed.
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [4]:
#collect every <table> element found on the page
alltables = soup.find_all('table')
#only the first 5 tables (positions 0-4) are used in the next cell

In [5]:
#take the first five <table> tags and turn each one into a pandas dataframe
#(the m* names presumably refer to the sales tiers on the page - TODO confirm)
m250, m200, m120, m100, m80 = (alltables[position] for position in range(5))

#read_html returns a list of frames; each tag holds one table, so keep entry 0
table1, table2, table3, table4, table5 = (
    pd.read_html(tag.prettify())[0]
    for tag in (m250, m200, m120, m100, m80)
)

In [6]:
#the column titles were parsed as data row 0 of every table; remove that row
table1, table2, table3, table4, table5 = (
    frame.drop(0)
    for frame in (table1, table2, table3, table4, table5)
)

In [7]:
#stack all 5 tables vertically into one dataframe
#(each table keeps its own row labels here; they are renumbered in a later cell)
df = pd.concat([table1, table2, table3, table4, table5])

In [8]:
#give the positional columns (0..6) readable titles, in page order
column_titles = [
    "Artist",
    "Country/Market",
    "Period Active",
    "Year of First Record",
    "Genre",
    "Total Certified Units",
    "Claimed Sales",
]
#index=str also converts the row labels to strings
df = df.rename(index=str, columns=dict(enumerate(column_titles)))

In [9]:
#renumber the rows 0..n-1 so every row from all tables has a unique label
#(drop=True discards the old labels instead of keeping them as a column)
df = df.reset_index(drop=True)

In [10]:
#regular expression matching " [##]" citation markers (a space, then digits
#and/or the letter b inside square brackets)
killbrackets = re.compile(r' \[[0-9b]+\]')

#strip the markers from every column, just in case
for column in ("Artist",
               "Country/Market",
               "Period Active",
               "Year of First Record",
               "Genre",
               "Total Certified Units",
               "Claimed Sales"):
    df[column] = [killbrackets.sub("", cell) for cell in df[column]]

In [11]:
#replace "present" with 2018 in Period Active so an end year can be parsed
period = df["Period Active"].str.replace("present", "2018")

#calculate the length of years playing music for each band
#entries are sliced positionally: characters 0-3 are the start year and
#characters 5-8 the end year (character 4 is the separator between them).
#This is done for every row at once rather than for a hard-coded 89 rows, and
#the whole column is assigned in one step; the previous element-wise
#df['Period Active'][i] = ... was chained assignment, which pandas warns
#about and may apply to a temporary copy.
start_year = period.str[0:4].astype(int)
end_year = period.str[5:9].astype(int)
df["Period Active"] = end_year - start_year

#rename column (index=str also converts the row labels to strings)
df = df.rename(index=str, columns={"Period Active": "Years Active"})

In [12]:
#preview the first 6 rows to check the cleaned columns
df.head(6)

Unnamed: 0,Artist,Country/Market,Years Active,Year of First Record,Genre,Total Certified Units,Claimed Sales
0,The Beatles,United Kingdom,10,1962,Rock / Pop,7002271100000000000♠ Total available certifie...,600 million 500 million
1,Elvis Presley,United States,23,1954,Rock and roll / Pop / Country,7002212400000000000♠ Total available certifie...,600 million 500 million
2,Michael Jackson,United States,45,1971,Pop / Rock / Dance / Soul / R&B,7002184600000000000♠ Total available certifie...,350 million 300 million
3,Madonna,United States,39,1982,Pop / Dance / Electronica,7002170600000000000♠ Total available certifie...,300 million 275 million
4,Elton John,United Kingdom,54,1969,Pop / Rock,7002169000000000000♠ Total available certifie...,300 million 250 million
5,Led Zeppelin,United Kingdom,12,1969,Hard rock / Blues rock / Folk rock,7002139690000099999♠ Total available certifie...,300 million 200 million


In [13]:
#persist the current dataframe to raw_band_data.pkl in the working directory
df.to_pickle("raw_band_data.pkl")

In [63]:
#deal with that pesky "Total Certified Units" column.
#the helpers below fill `bands`, a dictionary keyed by artist name whose
#values are pandas Series of certified units indexed by market code

# module-level accumulator: banddict() writes into it and returns it
bands = {}

# Market code (capital letters), optional ": ", then an amount written as
#   - a decimal number            e.g. "12.7"      (millions)
#   - a comma-grouped integer     e.g. "656,319"   (plain units)
#   - a whole number of millions  e.g. "212" when followed by "million"
_CERTIFIED_AMOUNT_RE = re.compile(
    r"([A-Z]+)\s*(?:: )?"
    r"([0-9]+\.[0-9]+"
    r"|[0-9]+(?:,[0-9]{3})+"
    r"|[0-9]+(?=\s*million))"
)


def formats(s):
    """Extract (market code, amount) pairs from a raw certified-units string.

    Parameters
    ----------
    s : str
        Raw text of one "Total Certified Units" cell.

    Returns
    -------
    list of tuple of (str, str)
        One ``(code, amount)`` tuple per match, in order of appearance. The
        amount is returned as text exactly as written ("12.7", "656,319",
        "212"); converting it to a number is left to the caller.

    Notes
    -----
    The earlier pattern only accepted amounts containing a decimal point or a
    single ",ddd" group. Whole-number million figures such as "US: 212 million"
    were therefore skipped, and "1,234,567" was cut short to "1,234". Both
    forms are now captured. A bare integer is only accepted when the word
    "million" follows it, to avoid picking up unrelated numbers.
    """
    return _CERTIFIED_AMOUNT_RE.findall(s)


def makedict(s):
    """Turn the list of (code, amount) tuples into a dictionary.

    Each tuple's first item becomes a key and its second item is converted
    with aux(); if a code appears more than once, the last occurrence wins.
    Returns the resulting dictionary.
    """
    monies = {}
    for code, amount in s:
        monies[code] = aux(amount)
    return monies


def aux(j):
    """Convert one amount string to an integer number of units.

    Used by makedict(). Two spellings are handled:

    * text containing a comma ("656,319") is a plain unit count - the commas
      are removed and the digits parsed as an int;
    * anything else ("12.7", "212") is a figure in millions and is multiplied
      by 1,000,000.

    Returns an int.
    """
    if "," in j:
        return int(j.replace(",", ""))
    # Round instead of truncating: binary floating point can land just below
    # the exact product (float("1.005") * 1000000 == 1004999.9999999999), and
    # a bare int() would then lose a unit.
    return int(round(float(j) * 1000000))
    
    
def banddict(df):
    """Fill the module-level ``bands`` dict from the raw dataframe and return it.

    For every row of ``df``, the "Total Certified Units" text is parsed with
    formats() and makedict(), wrapped in a pandas Series, and stored in
    ``bands`` under that row's "Artist" value.

    ``df`` must still have "Artist" as a regular column (i.e. call this before
    the index is replaced by the artist names).

    All rows are processed, not a fixed count of 89. Values are paired
    positionally with zip(), so the result does not depend on what the row
    labels happen to be.
    """
    for artist, raw_units in zip(df["Artist"], df["Total Certified Units"]):
        bands[artist] = pd.Series(makedict(formats(raw_units)))
    return bands


def totalcertcounts(df3):
    """Replace the raw "Total Certified Units" text with integer unit counts.

    Each cell is expected to start with a run of digits, followed by
    non-digit text and then the total expressed in millions (for example
    "7001607... Total available certified units: 60.7 million ..."). That
    first figure is multiplied by 1,000,000 and stored as an int.

    ``df3`` is modified in place; nothing is returned.

    Differences from the earlier element-wise version:
    * every row is processed, not a fixed count of 89;
    * the column is assigned in one step, avoiding the chained assignment
      ``df3[col][i] = ...`` that raised SettingWithCopyWarning;
    * a cell that does not match the pattern (or is not a string, e.g. because
      it was already converted) is left unchanged instead of silently receiving
      the previous row's value;
    * the product is rounded rather than truncated, so float error cannot
      drop a unit.
    """
    tcurgx = re.compile(r"^[0-9]+[^0-9]+([0-9\.]+)")
    column = "Total Certified Units"

    converted = []
    for raw in df3[column]:
        match = tcurgx.match(raw) if isinstance(raw, str) else None
        if match is None:
            converted.append(raw)
        else:
            converted.append(int(round(float(match.group(1)) * 1000000)))

    # Assigning a plain list writes by position, so row labels are irrelevant.
    df3[column] = converted

In [35]:
#NOTE: not idempotent - a second run fails because "Artist" is no longer a column
df = df.set_index('Artist')   #sets the "Artist" to be the index, instead of arbitrary numbers

In [36]:
#`bands` (filled in by banddict) is a dictionary of artist -> Series of units per market.
#need to convert it to a pandas dataframe, then combine it with the other columns of data
#NOTE(review): no visible cell calls banddict(df); `bands` is empty unless that
#call was run earlier (and before "Artist" became the index) - confirm on a fresh kernel

df2=pd.DataFrame.from_dict(bands, orient='index')

In [50]:
#place the artist attributes (df) and the per-market units (df2) side by side,
#matching rows on the artist index
df3 = pd.concat([df, df2], axis=1)   #creates df3 as a pandastable of alphabetized artists and their attributes
df3.head(5)

Unnamed: 0,Country/Market,Years Active,Year of First Record,Genre,Total Certified Units,Claimed Sales,ARG,AUS,AUT,BEL,...,SWE,SWI,UK,US,FIN,GER,MEX,IRE,PHL,NLD
ABBA,Sweden,10,1972,Pop / Disco,7001607000000000000♠ Total available certifie...,200 million 100 million,238000.0,6127000.0,175000.0,380000.0,...,800000.0,600000.0,18970000.0,12700000.0,656319.0,10450000.0,260000.0,,,
AC/DC,Australia,45,1975,Hard rock / Blues rock / Rock and roll,7002113600000000000♠ Total available certifie...,200 million 150 million,594000.0,7540000.0,380000.0,100000.0,...,350000.0,894000.0,4400000.0,79700000.0,321169.0,10300000.0,,,,
Adele,United Kingdom,12,2008,"Pop, soul",7002104900000000000♠ Total available certifie...,100 million,,3955000.0,,795000.0,...,240000.0,510000.0,20550000.0,,151639.0,4700000.0,1490000.0,,,
Aerosmith,United States,48,1973,Hard rock,7001847000000000000♠ Total available certifie...,150 million,338000.0,225000.0,125000.0,,...,260000.0,145000.0,3230000.0,70250000.0,101722.0,,230000.0,,,
Ayumi Hamasaki,Japan,20,1998,J-pop / Pop / dance / electronic,7001643000000000000♠ Total available certifie...,80 million,,,,,...,,,,,,,,,,


In [64]:
#convert the "Total Certified Units" text in df3 to integer unit counts (modifies df3 in place)
totalcertcounts(df3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [65]:
#display df3 to check that "Total Certified Units" now holds numbers
df3

Unnamed: 0,Country/Market,Years Active,Year of First Record,Genre,Total Certified Units,Claimed Sales,ARG,AUS,AUT,BEL,...,SWE,SWI,UK,US,FIN,GER,MEX,IRE,PHL,NLD
ABBA,Sweden,10,1972,Pop / Disco,60700000,200 million 100 million,238000.0,6127000.0,175000.0,380000.0,...,800000.0,600000.0,18970000.0,12700000.0,656319.0,10450000.0,260000.0,,,
AC/DC,Australia,45,1975,Hard rock / Blues rock / Rock and roll,113600000,200 million 150 million,594000.0,7540000.0,380000.0,100000.0,...,350000.0,894000.0,4400000.0,79700000.0,321169.0,10300000.0,,,,
Adele,United Kingdom,12,2008,"Pop, soul",104900000,100 million,,3955000.0,,795000.0,...,240000.0,510000.0,20550000.0,,151639.0,4700000.0,1490000.0,,,
Aerosmith,United States,48,1973,Hard rock,84700000,150 million,338000.0,225000.0,125000.0,,...,260000.0,145000.0,3230000.0,70250000.0,101722.0,,230000.0,,,
Ayumi Hamasaki,Japan,20,1998,J-pop / Pop / dance / electronic,64300000,80 million,,,,,...,,,,,,,,,,
B'z,Japan,30,1988,Rock / Pop rock / Hard rock,85700000,100 million,,,,,...,,,,,,,,,,
Backstreet Boys,United States,25,1995,Pop,72000000,100 million,740000.0,1400000.0,325000.0,425000.0,...,655000.0,415000.0,4875000.0,41500000.0,171390.0,,1475000.0,,,
Barbra Streisand,United States,58,1963,Pop / Adult contemporary,97400000,145 million,,2047000.0,,,...,120000.0,,5765000.0,82450000.0,186501.0,750000.0,,,,
Barry White,United States,31,1973,R&B / Soul,21700000,100 million,,,,150000.0,...,,,3665000.0,16500000.0,,,,,,
Bee Gees,United Kingdom Australia,40,1963,Pop / Disco,68300000,120 million,188000.0,1187000.0,125000.0,,...,,415000.0,9065000.0,42500000.0,,6675000.0,,,,
