In [1]:
pip install beautifulsoup4 requests pandas tqdm


Note: you may need to restart the kernel to use updated packages.


In [2]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm  # Progress bar for tracking

# Worldometer "population by country" index page. get_country_links() downloads
# this page and reads the per-country links out of its table.
base_url = "https://www.worldometers.info/world-population/population-by-country/"

# Function to get all country URLs
def get_country_links(timeout=30):
    """Collect the per-country page URLs from the Worldometer country list.

    Downloads ``base_url``, looks up the table with id ``example2`` and maps
    the stripped text of every anchor inside it to an absolute URL.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        dict: ``{country_name: country_url}``. An empty dict is returned when
        the page cannot be fetched or the table is missing, so callers always
        get the same type back and can use a plain truthiness check.
    """
    try:
        response = requests.get(base_url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like a bad status code.
        print(f"Failed to fetch the country list page: {exc}")
        return {}
    if response.status_code != 200:
        print("Failed to fetch the country list page")
        return {}

    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table", {"id": "example2"})  # Locate the country list table
    if table is None:
        # find() returns None when the page layout does not contain the table.
        print("Country list table not found on the page")
        return {}

    country_urls = {}
    for link in table.find_all("a"):
        href = link.get("href")
        if not href:
            # Anchors without a target cannot be followed; skip them.
            continue
        # urljoin resolves root-relative hrefs against the site and leaves
        # already-absolute hrefs untouched.
        country_urls[link.text.strip()] = urljoin(base_url, href)

    return country_urls

# Function to scrape population data from a country page
def scrape_population_data(country_name, country_url, timeout=30):
    """Scrape the population table from a single country page.

    Args:
        country_name: Label stored as the first element of every returned row.
        country_url: Address of the country page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        list[list[str]] | None: One ``[country_name, year, population,
        yearly_change, net_change, density, urban_pop, urban_pop_pct]`` list
        per table row (the first ``<tr>`` is skipped as the header), or
        ``None`` when the page cannot be fetched or the table is missing.
    """
    try:
        response = requests.get(country_url, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable page must not abort a run over every country.
        print(f"Failed to fetch data for {country_name}: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to fetch data for {country_name}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table", {"class": "table table-striped table-bordered table-hover table-condensed table-list"})
    if table is None:
        # find() returns None when no table carries the expected classes.
        print(f"No population table found for {country_name}")
        return None

    # NOTE(review): values are taken by position, so the field names above are
    # only right if the page's first seven columns are in this order — confirm
    # against the table header on the live page.
    data = []
    for row in table.find_all("tr")[1:]:  # Skip header row
        cells = [cell.text.strip() for cell in row.find_all("td")]
        if len(cells) < 7:
            # Rows without the seven expected cells cannot be mapped; skip them
            # instead of failing with an IndexError.
            continue
        data.append([country_name, *cells[:7]])

    return data

# Main script to scrape data for all countries
all_countries = get_country_links()
if all_countries:
    # Collects the rows returned for every country into one flat list.
    all_data = []

    print(f"Scraping population data for {len(all_countries)} countries...")

    for country, url in tqdm(all_countries.items()):
        country_data = scrape_population_data(country, url)
        if not country_data:
            # Nothing usable came back for this country; move on to the next.
            continue
        all_data.extend(country_data)

    # Build the DataFrame once from the accumulated rows.
    df = pd.DataFrame(
        all_data,
        columns=[
            "Country",
            "Year",
            "Population",
            "Yearly Change",
            "Net Change",
            "Density (P/Km²)",
            "Urban Population",
            "Urban Population %",
        ],
    )

    # Write the combined table to disk without the index column.
    df.to_csv("world_population_1950_present.csv", index=False)

    print("All country population data saved to world_population_1950_present.csv")
else:
    print("No countries found. Exiting.")


Scraping population data for 234 countries...


100%|██████████| 234/234 [05:06<00:00,  1.31s/it]

All country population data saved to world_population_1950_present.csv



