In [8]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [12]:
# Get request from the relavent url
url = "https://en.wikipedia.org/wiki/List_of_wealthiest_families"
response = requests.get(url)
response.status_code

200

In [13]:
# Analyse html informations
doc = response.text
len(doc)

497605

In [15]:
soup = BeautifulSoup(doc, 'html.parser')
#print(soup)

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of wealthiest families - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientp

In [19]:
# Search the table called "wikitable sortable"
table = soup.find_all('table', class_ = "wikitable sortable")[0]
#print(table)
len(table)

<table class="wikitable sortable">
<tbody><tr>
<th>Family name
</th>
<th>Notable members
</th>
<th data-sort-type="number">Combined wealth<br/>in US$ <a href="/wiki/1,000,000,000" title="1,000,000,000">billions</a><br/>(estimated)
</th>
<th>Source of<br/>wealth
</th>
<th>Country or region
</th></tr>
<tr>
<td><a href="/wiki/Walton_family" title="Walton family">Walton</a><sup class="reference" id="cite_ref-Business_Insider_Waltons_5-0"><a href="#cite_note-Business_Insider_Waltons-5"><span class="cite-bracket">[</span>5<span class="cite-bracket">]</span></a></sup><sup class="reference" id="cite_ref-Bloomberg_2022_richest_6-0"><a href="#cite_note-Bloomberg_2022_richest-6"><span class="cite-bracket">[</span>6<span class="cite-bracket">]</span></a></sup>
</td>
<td><a href="/wiki/Jim_Walton" title="Jim Walton">Jim Walton</a>, <a href="/wiki/S._Robson_Walton" title="S. Robson Walton">S. Robson Walton</a>, <a href="/wiki/Alice_Walton" title="Alice Walton">Alice Walton</a>, <a href="/wiki/Lukas_

2

In [20]:
# Extract the column headers
titles = table.find_all('th')
print(titles)

[<th>Family name
</th>, <th>Notable members
</th>, <th data-sort-type="number">Combined wealth<br/>in US$ <a href="/wiki/1,000,000,000" title="1,000,000,000">billions</a><br/>(estimated)
</th>, <th>Source of<br/>wealth
</th>, <th>Country or region
</th>]


In [22]:
# Create our dataset col names
column_names = [title.text.strip() for title in titles]
print(column_names)

['Family name', 'Notable members', 'Combined wealthin US$ billions(estimated)', 'Source ofwealth', 'Country or region']


In [23]:
# Rename the columns
column_names = [name.replace('wealthin', 'wealth in').replace('ofwealth', 'of wealth') for name in column_names]
print(column_names)

['Family name', 'Notable members', 'Combined wealth in US$ billions(estimated)', 'Source of wealth', 'Country or region']


In [25]:
# Create dataframe
df = pd.DataFrame(columns = column_names)
df

Unnamed: 0,Family name,Notable members,Combined wealth in US$ billions(estimated),Source of wealth,Country or region


In [27]:
# Find content
column_data = table.find_all('tr')
#print(column_data)

[<tr>
<th>Family name
</th>
<th>Notable members
</th>
<th data-sort-type="number">Combined wealth<br/>in US$ <a href="/wiki/1,000,000,000" title="1,000,000,000">billions</a><br/>(estimated)
</th>
<th>Source of<br/>wealth
</th>
<th>Country or region
</th></tr>, <tr>
<td><a href="/wiki/Walton_family" title="Walton family">Walton</a><sup class="reference" id="cite_ref-Business_Insider_Waltons_5-0"><a href="#cite_note-Business_Insider_Waltons-5"><span class="cite-bracket">[</span>5<span class="cite-bracket">]</span></a></sup><sup class="reference" id="cite_ref-Bloomberg_2022_richest_6-0"><a href="#cite_note-Bloomberg_2022_richest-6"><span class="cite-bracket">[</span>6<span class="cite-bracket">]</span></a></sup>
</td>
<td><a href="/wiki/Jim_Walton" title="Jim Walton">Jim Walton</a>, <a href="/wiki/S._Robson_Walton" title="S. Robson Walton">S. Robson Walton</a>, <a href="/wiki/Alice_Walton" title="Alice Walton">Alice Walton</a>, <a href="/wiki/Lukas_Walton" title="Lukas Walton">Lukas Walto

In [31]:
# Add content to the dataframe
for row in column_data[1:]:
    rows = row.find_all('td')
    
    individual_raws_data = [data.text.strip() for data in rows]
    length = len(df)
    df.loc[length] = individual_raws_data
    
df.head()

Unnamed: 0,Family name,Notable members,Combined wealth in US$ billions(estimated),Source of wealth,Country or region
0,Walton[5][6],"Jim Walton, S. Robson Walton, Alice Walton, Lu...",289.8 (2024)[7],"Walmart, Sam's Club",United States
1,Arnault,"Bernard Arnault, Delphine Arnault, Antoine Arn...",233 (2024)[7],LVMH,France
2,Tata,"Jamshedji Tata, J. R. D. Tata, Ratan Tata, Nav...",142 (2024) (including Tata Charitable Trusts),Tata Group,India
3,Koch[6],"Julia Koch, Charles Koch, Bill Koch (Fred C. K...",124.8 (2024)[7],Koch Industries,United States
4,Ortega[8],"Amancio Ortega, Sandra Ortega Mera (Rosalía Mera)",120.1 (2024)[7],Inditex (including Zara),Spain


## Data Preprocessing

In [32]:
# Remove citation numbers from square brackets
import re

def  clean_citation(text):
    return re.sub(r'\[\d+\]', '', text)

df = df.applymap(lambda x: clean_citation(x) if isinstance(x, str) else x)
df.head()


  df = df.applymap(lambda x: clean_citation(x) if isinstance(x, str) else x)


Unnamed: 0,Family name,Notable members,Combined wealth in US$ billions(estimated),Source of wealth,Country or region
0,Walton,"Jim Walton, S. Robson Walton, Alice Walton, Lu...",289.8 (2024),"Walmart, Sam's Club",United States
1,Arnault,"Bernard Arnault, Delphine Arnault, Antoine Arn...",233 (2024),LVMH,France
2,Tata,"Jamshedji Tata, J. R. D. Tata, Ratan Tata, Nav...",142 (2024) (including Tata Charitable Trusts),Tata Group,India
3,Koch,"Julia Koch, Charles Koch, Bill Koch (Fred C. K...",124.8 (2024),Koch Industries,United States
4,Ortega,"Amancio Ortega, Sandra Ortega Mera (Rosalía Mera)",120.1 (2024),Inditex (including Zara),Spain


In [33]:
# Extract the numeric values for Combined wealth column
def extract_wealth(text):
    match = re.search(r'\d+(\.\d+)?', text)
    return match.group(0) if match else text

df['Combined wealth in US$ billions(estimated)'] = df['Combined wealth in US$ billions(estimated)'].apply(extract_wealth)

df.head()

Unnamed: 0,Family name,Notable members,Combined wealth in US$ billions(estimated),Source of wealth,Country or region
0,Walton,"Jim Walton, S. Robson Walton, Alice Walton, Lu...",289.8,"Walmart, Sam's Club",United States
1,Arnault,"Bernard Arnault, Delphine Arnault, Antoine Arn...",233.0,LVMH,France
2,Tata,"Jamshedji Tata, J. R. D. Tata, Ratan Tata, Nav...",142.0,Tata Group,India
3,Koch,"Julia Koch, Charles Koch, Bill Koch (Fred C. K...",124.8,Koch Industries,United States
4,Ortega,"Amancio Ortega, Sandra Ortega Mera (Rosalía Mera)",120.1,Inditex (including Zara),Spain


In [None]:
# Replace "Notable members" with "Main member" and extract the first member's first and last name
df['Main member'] = df['Notable members'].apply(lambda x: x.split(',')[0].strip())             
df = df.drop(columns=['Notable members'])

df.head()

In [37]:
df.head()

Unnamed: 0,Family name,Combined wealth in US$ billions(estimated),Source of wealth,Country or region,Main member
0,Walton,289.8,"Walmart, Sam's Club",United States,Jim Walton
1,Arnault,233.0,LVMH,France,Bernard Arnault
2,Tata,142.0,Tata Group,India,Jamshedji Tata
3,Koch,124.8,Koch Industries,United States,Julia Koch
4,Ortega,120.1,Inditex (including Zara),Spain,Amancio Ortega


## Store the data

In [38]:
df.to_csv('wealthiest_families.csv', index=False)