In [2]:
import pandas as pd
import requests 
from bs4 import BeautifulSoup
from IPython.display import clear_output


We can get a lot of data straight from the Forbes API <br>
(see forbes_api.ipynb notebook)

In [4]:
# Load the 2020 Forbes billionaires list.
# NOTE(review): the "Unnamed: 0" / "Unnamed: 0.1" columns look like row indexes
# written by earlier to_csv calls -- confirm before relying on them.
df = pd.read_csv("data/2020_Billionaires.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,name,lastName,uri,imageUri,worthChange,age,source,industry,gender,...,realTimeRank,realTimePosition,squareImage,position,rank,worth,title,government,pay,managementAssets
0,0,A. Jayson Adair,Adair,a-jayson-adair,no-pic,-12.191,52.0,damaged cars,Automotive,M,...,2210.0,2210.0,https://specials-images.forbesimg.com/imageser...,,,,,,,
1,1,Abhay Soi,Soi,abhay-soi,no-pic,31.453,48.0,healthcare,Healthcare,M,...,2576.0,2576.0,https://specials-images.forbesimg.com/imageser...,,,,,,,
2,2,Adam Foroughi,Foroughi,adam-foroughi,no-pic,-147.507,41.0,mobile games,Media & Entertainment,M,...,1337.0,1337.0,,,,,,,,
3,3,Adam Neumann,Neumann,adam-neumann,adam-neumann,-19.834,42.0,WeWork,Real Estate,M,...,2081.0,2081.0,//specials-images.forbesimg.com/imageserve/5e7...,,,,,,,
4,4,Alan Miller & family,Miller,alan-miller-1,no-pic,-5.252,84.0,healthcare services,Healthcare,M,...,2338.0,2338.0,,,,,,,,


but there is more data to be found in the individual profile Forbes writes for each billionaire, including whether they are self-made, their current residence, etc.  <br>
These profiles can be accessed using each billionaire's Forbes ID (uri)   <br>

# 1. Data Extraction

Wrapper functions for BeautifulSoup

In [57]:

#This functions retrieve all the stats displayed on a particular Forbes profile 
def get_stats(soup):
    stats=soup.find_all(class_="listuser-block__item")
    title=[]
    value=[]

    if stats:
        for n_stat in stats:
            title.append(n_stat.find(class_="profile-stats__title").text)
            value.append(n_stat.find(class_="profile-stats__text").text)

    return dict(zip(title, value))

#This function retrieves the text description of each billionaires
def get_text(soup):
    txt=soup.find(class_='profile-text')
    s=""
    if txt and txt.text: #check if txt is empty 
        if txt.ul and txt.ul.children:
            for line in txt.ul.children:
                s+=(line.text)
        else:  #sometimes txt is just one block of text without <ul> elements
            s+=txt.text
    return s


Extracting the profile information for each billionaire

In [58]:
#there is aprox. 2800 pages to load from Forbes website
# so this block takes some time to execute

#output lists 
l_index=[]
l_text=[]
l_stats=[]

#iterate on each billionaires in Forbes' 2020 list 
for index, row in df.iterrows():
    l_index.append(index)

    re=requests.get(f'https://www.forbes.com/profile/{row.uri}')
    soup=BeautifulSoup(re.content, "html.parser")

    l_text.append(get_text(soup))
    l_stats.append(get_stats(soup))

    #print progress
    clear_output(wait=True)
    print(f'{index}_{row.uri}')

2818_zhou-yongli


Putting the data in one data table

In [None]:
data=[l_index, l_text, l_stats]
df2=pd.DataFrame(data=data)
df2=df2.transpose()

df2=df2.set_axis(["index", "text", 'stats'], axis=1, inplace=False)
df2=df2.drop(columns="index")

df2.head()
df2.to_csv("data/2020_forbes_profile.csv")

Unnamed: 0,0,1,2
0,0,"A. Jayson ""Jay"" Adair is the CEO of Dallas-bas...","{'Age': '52', 'Source of Wealth': 'damaged car..."
1,1,Former finance professional Abhay So is chairm...,"{'Age': '48', 'Source of Wealth': 'healthcare,..."
2,2,Adam Foroughi is the cofounder and CEO of AppL...,"{'Age': '41', 'Source of Wealth': 'mobile game..."
3,3,Adam Neumann is a cofounder of coworking firm ...,"{'Age': '42', 'Source of Wealth': 'WeWork, Sel..."
4,4,"Miller founded Universal Health Services, a ch...","{'Age': '84', 'Source of Wealth': 'healthcare ..."
...,...,...,...
2814,2814,"Yu Qibing chairs the Kibing Group, an industri...","{'Age': '56', 'Source of Wealth': 'glass, Self..."
2815,2815,Zhang Yuxiang is the chairman of Nanjids (Shan...,"{'Age': '57', 'Source of Wealth': 'apparel, Se..."
2816,2816,Zhao Hongfei chairs Shenzhen-listed software f...,"{'Age': '47', 'Source of Wealth': 'software, S..."
2817,2817,Konstyantin Zhevago owns a majority stake in U...,"{'Age': '47', 'Source of Wealth': 'mining, Sel..."


# 2. Data formatting

Checking if the billionaire is considered "self-made" or not

In [93]:
def self_made(dic):
    if dic and "Source of Wealth" in dic.keys() :
        if "self made" in dic["Source of Wealth"].lower():
            return True    
    return False


df2["self_made"]=df2.apply(lambda row: self_made(row.stats), axis=1)

Extracting all the data contained in df2['stats'] in a way that handles the many missing values

In [None]:
stats=df2['stats'].apply(pd.Series)

We can see some columns have mostly complete data e.g. "Source of Wealth", "Residence", etc. <br>
and some columns have very little data e.g. "Agent", "Agency", "Notable Deal", etc.

In [None]:
# Per-column summary of the extracted stats: number of non-missing values,
# number of distinct values, most frequent value and its frequency
stats.describe()

Unnamed: 0,Age,Source of Wealth,Residence,Citizenship,Marital Status,Children,Education,Self-Made Score,Philanthropy Score,Agent,Agency,Notable Deal,Salary/Winnings,Lifetime Giving,Giving as a percentage of net worth,Clients
count,2652,2788,2729,2777,2120,1616,1473,585,413,2,2,3,1,5,5,1
unique,76,1056,772,73,8,14,1176,10,5,1,1,3,1,5,4,1
top,58,"real estate, Self Made","New York, New York",United States,Married,2,"Diploma, High School",8,1,David Falk,FAME,Snowflake,$1 M,$1.8B,7%,"Robinson Cano, Yoenis Cespedes, Kyrie Irving"
freq,95,136,104,757,1759,553,24,224,183,2,2,1,1,1,2,1


Cleaning up the data

In [1]:
#Some variable name formating
stats["source"]=stats["Source of Wealth"]

#Selecting the relevant columns
stats=stats[ ['source', 'Source of Wealth', 'Residence', 'Citizenship', 'Marital Status',
       'Children', 'Education', 'Self-Made Score', 'Philanthropy Score',
       ]]

#Appending the self-made information
stats["self_made"]=df2["self_made"]
stats.head()

NameError: name 'stats' is not defined

# 3. Merging datasets 

In [179]:
#merging the 'stats' data extracted from the individual Forbes profile with the Forbes list 
df_clean=df[[ 'name', 'lastName', 'uri', 'worthChange',
       'age', 'source', 'industry', 'gender', 'country',
       'realTimeWorth'
       ]]
#merging both datasets
df_clean= df_clean.merge(right=stats, how="left", left_index=True, right_index=True)
df_clean.head()

df_clean.to_csv("data/2020_data_clean.csv")

Unnamed: 0,name,lastName,uri,worthChange,age,source_x,industry,gender,country,realTimeWorth,source_y,Source of Wealth,Residence,Citizenship,Marital Status,Children,Education,Self-Made Score,Philanthropy Score,self_made
0,A. Jayson Adair,Adair,a-jayson-adair,-12.191,52.0,damaged cars,Automotive,M,United States,1376.511,"[damaged cars, Self Made]","damaged cars, Self Made","Dallas, Texas",United States,Married,2.0,,,,True
1,Abhay Soi,Soi,abhay-soi,31.453,48.0,healthcare,Healthcare,M,India,1078.053,"[healthcare, Self Made]","healthcare, Self Made","Mumbai, India",India,Married,2.0,"Master of Business Administration, European Un...",,,True
2,Adam Foroughi,Foroughi,adam-foroughi,-147.507,41.0,mobile games,Media & Entertainment,M,United States,2536.112,"[mobile games, Self Made]","mobile games, Self Made","Truckee, California",United States,Married,,,,,True
3,Adam Neumann,Neumann,adam-neumann,-19.834,42.0,WeWork,Real Estate,M,Israel,1498.747,"[WeWork, Self Made]","WeWork, Self Made","New York, New York",Israel,Married,5.0,City University of New York Baruch,,,True
4,Alan Miller & family,Miller,alan-miller-1,-5.252,84.0,healthcare services,Healthcare,M,United States,1260.555,"[healthcare services, Self Made]","healthcare services, Self Made","Lower Merion, Pennsylvania",United States,Married,3.0,"Bachelor of Arts/Economics, College of William...",,,True
