In [154]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
%matplotlib inline

In [177]:
#upload the information using BeautifulSoup
url = 'https://en.wikipedia.org/wiki/List_of_best-selling_music_artists'

response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [178]:
#find the tables located on this page
alltables = soup.find_all('table')
#it finds 5 useful tables

In [179]:
#for each table:
#read the html into a pandas dataframe

m250 = alltables[0]
table1 = pd.read_html(m250.prettify())[0]

m200 = alltables[1]
table2 = pd.read_html(m200.prettify())[0]

m120 = alltables[2]
table3 = pd.read_html(m120.prettify())[0]

m100 = alltables[3]
table4 = pd.read_html(m100.prettify())[0]

m80 = alltables[4]
table5 = pd.read_html(m80.prettify())[0]

In [180]:
#get rid of titles as datapoints
table1 = table1.drop(0)
table2 = table2.drop(0)
table3 = table3.drop(0)
table4 = table4.drop(0)
table5 = table5.drop(0)

In [181]:
#merge all 5 lists into one dataframe
df = pd.concat([table1, table2, table3, table4, table5])

In [182]:
#rename the headers/column titles
df = df.rename(index=str, columns={0: "Artist", 
                                  1: "Country/Market", 
                                  2: "Period Active", 
                                  3: "Year of First Record", 
                                  4: "Genre", 
                                  5: "Total Certified Units", 
                                  6: "Claimed Sales"})

In [183]:
#renumber indexes, so all tables are included
df = df.reset_index(drop=True)

In [184]:
#set up regular expression to weed out [##] ghost-links
killbrackets = re.compile(r' \[[0-9b]+\]')

#use the regex for each column just in case
df["Artist"] = [killbrackets.sub("", i) for i in df["Artist"]]
df["Country/Market"] = [killbrackets.sub("", i) for i in df["Country/Market"]]
df["Period Active"] = [killbrackets.sub("", i) for i in df["Period Active"]]
df["Year of First Record"] = [killbrackets.sub("", i) for i in df["Year of First Record"]]
df["Genre"] = [killbrackets.sub("", i) for i in df["Genre"]]
df["Total Certified Units"] = [killbrackets.sub("", i) for i in df["Total Certified Units"]]
df["Claimed Sales"] = [killbrackets.sub("", i) for i in df["Claimed Sales"]]

In [185]:
#replace "present" to 2018 in Period Active
df["Period Active"] = pd.Series(df["Period Active"]).str.replace("present", "2018")

#calculate the length of years playing music for each band
for i in range(89):
    int1 = int(df["Period Active"][i][5:9])
    int2 = int(df["Period Active"][i][0:4])
    df['Period Active'][i] = int1-int2

#rename column    
df = df.rename(index=str, columns={"Period Active": "Years Active"})

In [186]:
df.head(5)

Unnamed: 0,Artist,Country/Market,Years Active,Year of First Record,Genre,Total Certified Units,Claimed Sales
0,The Beatles,United Kingdom,10,1962,Rock / Pop,7002271100000000000♠ Total available certifie...,600 million 500 million
1,Elvis Presley,United States,23,1954,Rock and roll / Pop / Country,7002212400000000000♠ Total available certifie...,600 million 500 million
2,Michael Jackson,United States,45,1971,Pop / Rock / Dance / Soul / R&B,7002184600000000000♠ Total available certifie...,350 million 300 million
3,Madonna,United States,39,1982,Pop / Dance / Electronica,7002170600000000000♠ Total available certifie...,300 million 275 million
4,Elton John,United Kingdom,54,1969,Pop / Rock,7002169000000000000♠ Total available certifie...,300 million 250 million


In [187]:
df.to_pickle("raw_band_data.pkl")

In [240]:
#deal with that pesky "Total Certified Units" column.
#this whole ordeal will return a dictionary of small dictionaries

bands = {}

def formats(s):
#takes in the raw TCU string
 #  uses the RegularExpressionsStuff to format it
  # returns a list of tuples [(country: number), (country2: number2), ...]'''
    rgxnospaces = re.compile(r"([A-Z]+)\s*(?:: )?([0-9]+\.[0-9]+|[0-9]+,[0-9]{3})") 
    return(rgxnospaces.findall(s))


def makedict(s):
#'''takes in the post-regex list of tuples
 #  creates a dictionary with 3digit country code is the key, values are numbers
  # returns the dictionary'''
    monies = {i:aux(j) for i,j in s}
    return monies


def aux(j):
#'''works within makedict(s)
 #  converts millions to actual numbers, and all numbers to ints'''
    if "," in j:
        return int(j.replace(",",""))
    else:
        return int(float(j)*1000000)
    
    
def banddict(df):
#'''takes in the full raw df
 #  returns a dictionary of things we want'''
    for i in range(89):
        ccoded = pd.Series(makedict(formats(df["Total Certified Units"][i])))
        bands[df["Artist"][i]] = ccoded
    return bands 


def totalcertcounts(df):
    for i in range(89):
        tcurgx = re.compile(r"^[0-9]+[^0-9]+([0-9\.]+)")
        value = (tcurgx.findall(df["Total Certified Units"][i]))
        v1 = value[0]
        v2 = float(v1)
        v3 = v2*1000000
        v4 = int(v3)
        df['Certified Sales'][i] = v4 
    return df
        

In [244]:
#create an empty column to recieve our certified sales data
df = df.reindex(columns = ['Artist', 
                           'Country/Market', 
                           'Years Active', 
                           'Year of First Record',
                           'Genre',
                           'Total Certified Units',
                           'Claimed Sales',
                           'Certified Sales'])
df.head(2)   

Unnamed: 0,Artist,Country/Market,Years Active,Year of First Record,Genre,Total Certified Units,Claimed Sales,Certified Sales
0,The Beatles,United Kingdom,10,1962,Rock / Pop,7002271100000000000♠ Total available certifie...,600 million 500 million,271100000.0
1,Elvis Presley,United States,23,1954,Rock and roll / Pop / Country,7002212400000000000♠ Total available certifie...,600 million 500 million,212400000.0


In [245]:
#populate the total certifiable sales data into the new column
df = totalcertcounts(df)
df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Artist,Country/Market,Years Active,Year of First Record,Genre,Total Certified Units,Claimed Sales,Certified Sales
0,The Beatles,United Kingdom,10,1962,Rock / Pop,7002271100000000000♠ Total available certifie...,600 million 500 million,271100000.0
1,Elvis Presley,United States,23,1954,Rock and roll / Pop / Country,7002212400000000000♠ Total available certifie...,600 million 500 million,212400000.0


In [148]:
#df = df.set_index('Artist')   #sets the "Artist" to be the index, instead of arbitrary numbers

In [None]:
#banddict is a dictionary.
#need to convert it to a pandas dataframe, then combine it with the other columns of data

df2=pd.DataFrame.from_dict(bands, orient='index')

In [None]:
df3 = pd.concat([df, df2], axis=1)   #creates df3 as a pandastable of alphabetized artists and their attributes
df3.head(2)