In [None]:
pip install requests beautifulsoup4

In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import re

In [2]:
def format_name(name):
    """Turn a displayed billionaire name into a hyphenated URL slug.

    Steps, in order:
      1. lowercase the name;
      2. drop the "& family" suffix (with surrounding whitespace);
      3. strip combining accents (e.g. "ç" -> "c");
      4. drop periods, commas and apostrophes (e.g. "R." -> "r");
      5. join the remaining whitespace-separated words with single hyphens.

    Splitting on whitespace means leading/trailing or repeated spaces never
    produce empty segments such as "john--doe" or a trailing "-".

    NOTE(review): steps 3-4 assume the target profile slugs are plain ASCII
    without punctuation -- confirm against a sample of real profile URLs.

    Args:
        name: Name as shown in the table, e.g. "Bernard Arnault & family".

    Returns:
        The slug, e.g. "bernard-arnault".
    """
    import unicodedata

    # Convert to lowercase
    lowercased = name.lower()
    # Remove unwanted parts (like "& family")
    cleaned = re.sub(r'\s*&\s*family', '', lowercased)
    # Decompose accented letters and discard only the combining marks, so
    # characters without a decomposition are left exactly as they were
    decomposed = unicodedata.normalize('NFKD', cleaned)
    unaccented = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Drop punctuation that would otherwise end up inside the slug
    unpunctuated = re.sub(r"[.,']", '', unaccented)
    # Collapse any run of whitespace into one hyphen
    return '-'.join(unpunctuated.split())

#  Billionaires Dataframe - Part 1
# One list per column. Every table row appends exactly one entry (possibly
# None) to rank, name, net_worth, industry and url, so those five lists always
# stay the same length and line up row by row.
# NOTE(review): age, country_territory and source are not filled by this loop.
url = []
rank = []
name = []
net_worth = []
age = []
country_territory = []
source = []
industry = []

# Absolute XPath of the "Next Page" button in the table's pagination bar.
NEXT_PAGE_XPATH = "//*[@id='__next']/div[2]/div/div/div[3]/div[2]/div[2]/div/div[2]/div[27]/div[7]/div[1]/nav/div/button[7]"

# Opening Google Chrome (WebDriver)
driver = webdriver.Chrome()

# try/finally guarantees the browser is closed even if scraping raises.
try:
    driver.get("https://www.forbes.com/billionaires/")
    # Fixed pause before the first read -- presumably to let the table render.
    time.sleep(10)

    while True:  # Use a while loop to iterate through pages

        # Scraping billionaires data
        billionaires = driver.find_elements(by=By.CLASS_NAME, value='Table_tableRow__lF_cY')

        for index, billionaire in enumerate(billionaires):
            formatted_name = None

            try:
                # Rank (the trailing "." shown in the table is stripped)
                rank_element = billionaire.find_element(by=By.CLASS_NAME, value='Table_rank__X4MKf').text.strip('.')
                rank.append(rank_element)
            except NoSuchElementException:
                print(f"No 'rank' found for Billionaire {index}")
                rank.append(None)

            try:
                # Name (kept as displayed; the slug is only used for the URL)
                name_element = billionaire.find_element(by=By.CLASS_NAME, value='Table_personName__Bus2E').text
                formatted_name = format_name(name_element)
                name.append(name_element)
            except NoSuchElementException:
                print(f"No 'name' found for Billionaire {index}")
                name.append(None)

            try:
                # Net Worth
                net_worth_element = billionaire.find_element(by=By.CLASS_NAME, value='Table_finalWorth__UZA6k').text
                net_worth.append(net_worth_element)
            except NoSuchElementException:
                print(f"No 'net worth' found for Billionaire {index}")
                net_worth.append(None)

            try:
                # Industry: taken positionally from the row's <div> elements,
                # preferring the 7th and falling back to the 6th.
                div_elements = billionaire.find_elements(by=By.TAG_NAME, value="div")
                if len(div_elements) > 6:
                    industry_element = div_elements[6]
                elif len(div_elements) > 5:
                    industry_element = div_elements[5]
                else:
                    print("Not enough div elements found for industry.")
                    industry_element = None

                if industry_element is not None:  # Check if element was found
                    industry.append(industry_element.text)
                else:
                    industry.append(None)

            except NoSuchElementException:
                print(f"No 'industry' found for Billionaire {index}")
                industry.append(None)

            # URL: append a placeholder when no name/slug is available so this
            # list never falls out of step with the others. Consumers of `url`
            # must skip None entries.
            if formatted_name:
                url.append(f'https://www.forbes.com/profile/{formatted_name}/')
            else:
                url.append(None)

            print(f"Billionaire {rank[-1]} scraped.")

        try:
            # Wait for the "Next Page" button to be clickable
            load_more = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, NEXT_PAGE_XPATH)))
            # Click through JavaScript rather than WebElement.click()
            driver.execute_script("arguments[0].click();", load_more)
        except TimeoutException:
            # No clickable button within 10 s: treat it as the end of the list
            print("Next page button not found within the time limit.")
            break
finally:
    # Close the browser
    driver.quit()


# Dump each Part 1 column, framed above and below by a 12-character rule
_rule = "=" * 12
print(_rule)
for _label, _column in (
    ("URL: ", url),
    ("Rank: ", rank),
    ("Name: ", name),
    ("Net Worth: ", net_worth),
    ("Industry: ", industry),
):
    print(_label, _column)
    print(_rule)

Billionaire 1 scraped.
Billionaire 2 scraped.
Billionaire 3 scraped.
Billionaire 4 scraped.
Billionaire 5 scraped.
Billionaire 6 scraped.
Billionaire 7 scraped.
Billionaire 8 scraped.
Billionaire 9 scraped.
Billionaire 10 scraped.
Billionaire 11 scraped.
Billionaire 12 scraped.
Billionaire 13 scraped.
Billionaire 14 scraped.
Billionaire 15 scraped.
Billionaire 16 scraped.
Billionaire 17 scraped.
Billionaire 18 scraped.
Billionaire 19 scraped.
Billionaire 20 scraped.
Billionaire 21 scraped.
Billionaire 22 scraped.
Billionaire 23 scraped.
Billionaire 24 scraped.
Billionaire 25 scraped.
Billionaire 26 scraped.
Billionaire 27 scraped.
Billionaire 27 scraped.
Billionaire 29 scraped.
Billionaire 30 scraped.
Billionaire 31 scraped.
Billionaire 32 scraped.
Billionaire 33 scraped.
Billionaire 34 scraped.
Billionaire 35 scraped.
Billionaire 35 scraped.
Billionaire 37 scraped.
Billionaire 38 scraped.
Billionaire 39 scraped.
Billionaire 40 scraped.
Billionaire 40 scraped.
Billionaire 42 scraped.
B

In [3]:
# Number of entries collected in the `url` list
len(url)

2781

In [5]:
import pandas as pd

# Map each output column name to the list scraped for it
data_part1 = dict(
    rank=rank,
    name=name,
    net_worth=net_worth,
    industry=industry,
    url=url,
)

# Build the Part 1 DataFrame from that column mapping
df_billionaires_p1 = pd.DataFrame(data=data_part1)

# Preview the first rows
df_billionaires_p1.head()

Unnamed: 0,rank,name,net_worth,industry,url
0,1,Bernard Arnault & family,$233 B,Fashion & Retail,https://www.forbes.com/profile/bernard-arnault/
1,2,Elon Musk,$195 B,Automotive,https://www.forbes.com/profile/elon-musk/
2,3,Jeff Bezos,$194 B,Technology,https://www.forbes.com/profile/jeff-bezos/
3,4,Mark Zuckerberg,$177 B,Technology,https://www.forbes.com/profile/mark-zuckerberg/
4,5,Larry Ellison,$141 B,Technology,https://www.forbes.com/profile/larry-ellison/


In [6]:
# Save the Part 1 table to CSV; index=False leaves the row index out of the file
df_billionaires_p1.to_csv('billionaires_part1.csv', index=False)

In [7]:
import requests
from bs4 import BeautifulSoup

In [None]:
def _profile_stat(anchor):
    """Return the stripped text of the <span> in the <dd> following ``anchor``.

    Returns None when ``anchor`` is None, when it has no following <dd>
    sibling, or when that <dd> contains no <span>.
    """
    if anchor is None:
        return None
    value_cell = anchor.find_next_sibling('dd')
    if value_cell is None:
        return None
    span = value_cell.find('span')
    if span is None:
        return None
    return span.text.strip()


def extract_billionaire_data(url, timeout=30):
    """Fetch one profile page and pull out its profile-stat fields.

    Each field is looked up independently; a field whose markup is missing
    (label, following <dd>, or inner <span>) is returned as None instead of
    raising, and this now includes 'education'.

    Args:
        url: Address of the profile page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        dict with the keys 'age', 'source_of_wealth', 'self_made_score',
        'philanthropy_score', 'residence', 'citizenship', 'marital_status',
        'children' and 'education'; each value is a stripped string or None.

    Raises:
        requests.exceptions.RequestException: if the HTTP request fails or
            times out. The HTTP status code is not checked.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    def by_title(title):
        # <dt class="profile-stats__title"> whose string equals the label.
        # `string=` is the current name of the legacy `text=` argument.
        return soup.find('dt', class_='profile-stats__title', string=title)

    def by_link(href):
        # The two score fields are located through their methodology link.
        return soup.find('a', href=href)

    # Dictionary to store extracted data (key order matches the original)
    data_part2 = {
        'age': _profile_stat(by_title('Age')),
        'source_of_wealth': _profile_stat(by_title('Source of Wealth')),
        'self_made_score': _profile_stat(by_link('https://www.forbes.com/self-made-methodology/')),
        'philanthropy_score': _profile_stat(by_link('https://www.forbes.com/philanthropy-methodology/')),
        'residence': _profile_stat(by_title('Residence')),
        'citizenship': _profile_stat(by_title('Citizenship')),
        'marital_status': _profile_stat(by_title('Marital Status')),
        'children': _profile_stat(by_title('Children')),
        'education': _profile_stat(by_title('Education')),
    }

    return data_part2


#  Billionaires data - Part 2
age = []
source_of_wealth = []
self_made_score = []
philanthropy_score = []
residence = []
citizenship = []
marital_status = []
children = []
education = []

# Result key -> list that collects it. Every URL appends exactly one entry
# (possibly None) to every list, so the lists stay aligned with `url`.
_PART2_COLUMNS = {
    'age': age,
    'source_of_wealth': source_of_wealth,
    'self_made_score': self_made_score,
    'philanthropy_score': philanthropy_score,
    'residence': residence,
    'citizenship': citizenship,
    'marital_status': marital_status,
    'children': children,
    'education': education,
}

# Loop through URLs and extract data using BS4
for index, url_element in enumerate(url):
    billionaire_data = {}
    if not url_element:
        # No profile URL for this row: keep the row, fill it with None
        print(f"Billionaire {index+1} skipped (no URL).")
    else:
        try:
            billionaire_data = extract_billionaire_data(url_element)
            print(f"Billionaire {index+1} scraped.")
        except requests.RequestException as exc:
            # One failed download must not abort the remaining requests or
            # throw away what has already been collected.
            print(f"Billionaire {index+1} failed ({url_element}): {exc}")

    for key, column in _PART2_COLUMNS.items():
        column.append(billionaire_data.get(key))

# Print the extracted data
print('=' * 24)
print("Age: ", age)
print('=' * 24)
print("Source of wealth: ", source_of_wealth)
print('=' * 24)
print("Residence: ", residence)
print('=' * 24)
print("Citizenship: ", citizenship)
print('=' * 24)
print("Marital status: ", marital_status)
print('=' * 24)
print("Children: ", children)

Billionaire 1 scraped.
Billionaire 2 scraped.
Billionaire 3 scraped.
Billionaire 4 scraped.
Billionaire 5 scraped.
Billionaire 6 scraped.
Billionaire 7 scraped.
Billionaire 8 scraped.
Billionaire 9 scraped.
Billionaire 10 scraped.
Billionaire 11 scraped.
Billionaire 12 scraped.
Billionaire 13 scraped.
Billionaire 14 scraped.
Billionaire 15 scraped.
Billionaire 16 scraped.
Billionaire 17 scraped.
Billionaire 18 scraped.
Billionaire 19 scraped.
Billionaire 20 scraped.
Billionaire 21 scraped.
Billionaire 22 scraped.
Billionaire 23 scraped.
Billionaire 24 scraped.
Billionaire 25 scraped.
Billionaire 26 scraped.
Billionaire 27 scraped.
Billionaire 28 scraped.
Billionaire 29 scraped.
Billionaire 30 scraped.
Billionaire 31 scraped.
Billionaire 32 scraped.
Billionaire 33 scraped.
Billionaire 34 scraped.
Billionaire 35 scraped.
Billionaire 36 scraped.
Billionaire 37 scraped.
Billionaire 38 scraped.
Billionaire 39 scraped.
Billionaire 40 scraped.
Billionaire 41 scraped.
Billionaire 42 scraped.
B