In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# import warnings
import warnings
warnings.filterwarnings("ignore")

url = "https://en.wikipedia.org/wiki/World_population"

response = requests.get(url)

soup = BeautifulSoup(response.content, "html.parser")

print(soup.title)
print(soup.title.text)
print(soup.title.string)

## print the whole html code
# print(soup.prettify())

# find all the tables
tables = soup.find_all("table")

dataframe = [] # empty list to store dataframes
for i, table in enumerate(tables):
    rows = table.find_all("tr")[1:]  # skip the first row
    data = [] # empty list to store data
    for row in rows:
        cols = row.find_all("td")
        cols = [col.text.strip() for col in cols]
        data.append(cols)
    df = pd.DataFrame(data)
    dataframe.append(df)
    
dataframe[1].head()

<title>World population - Wikipedia</title>
World population - Wikipedia
World population - Wikipedia


Unnamed: 0,0,1,2,3
0,Sub-Saharan Africa,"1,152 (14.51%)","1,401 (16.46%)","2,094 (21.62%)"
1,Northern Africa and Western Asia,549 (6.91%),617 (7.25%),771 (7.96%)
2,Central Asia and Southern Asia,"2,075 (26.13%)","2,248 (26.41%)","2,575 (26.58%)"
3,Eastern Asia and Southeastern Asia,"2,342 (29.49%)","2,372 (27.87%)","2,317 (23.92%)"
4,Europe and Northern America,"1,120 (14.10%)","1,129 (13.26%)","1,125 (11.61%)"


## ✅How to Clean and Save to CSV Using BeautifulSoup + Python

In [2]:
import re

# Get the raw table
df_raw = dataframe[1].copy()

# Define cleaned columns
clean_data = []

for row in df_raw.itertuples(index=False):
    region = row[0]
    year_data = row[1:]
    cleaned_row = [region]
    
    for cell in year_data:
        if not cell:
            # Handle missing cell
            cleaned_row.extend(["", ""])
            continue
        
        # Clean trailing commas
        cell = cell.rstrip(',')

        # Extract number and percent
        match = re.match(r'([\d,]+)\s*\(([\d.]+)%\)', cell)
        if match:
            population = match.group(1)
            percentage = match.group(2)
            cleaned_row.extend([population, percentage])
        else:
            cleaned_row.extend([cell, ""])  # fallback if no match
    
    clean_data.append(cleaned_row)

# Define new headers
headers = [
    "Region",
    "2000 Population", "2000 %",
    "2010 Population", "2010 %",
    "2020 Population", "2020 %"
]

df_clean = pd.DataFrame(clean_data, columns=headers)
df_clean.head()


df_clean.to_csv("cleaned_population_data.csv", index=False)

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.worldometers.info/world-population/population-by-country/"

response = requests.get(url)

soup = BeautifulSoup(response.content, "html.parser")

tables = soup.find_all("table")

print(soup.title.string)

# print(soup.prettify())
dataframe = [] # empty list to store dataframes
for i, table in enumerate(tables):
    rows = table.find_all("tr")[1:]  # skip the first row
    data = [] # empty list to store data
    for row in rows:
        cols = row.find_all("td")
        cols = [col.text.strip() for col in cols]
        data.append(cols)
    df = pd.DataFrame(data)
    dataframe.append(df)
dataframe[0].head()
    

Population by Country (2025) - Worldometer


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1,India,1463865525,0.89%,12929734,492,2973190,"−495,753",1.94,28.8,37.1%,17.78%
1,2,China,1416096094,−0.23%,"−3,225,184",151,9388211,"−268,126",1.02,40.1,67.5%,17.20%
2,3,United States,347275807,0.54%,1849236,38,9147420,1230663,1.62,38.5,82.8%,4.22%
3,4,Indonesia,285721236,0.79%,2233305,158,1811570,"−39,509",2.1,30.4,59.6%,3.47%
4,5,Pakistan,255219554,1.57%,3950390,331,770880,"−1,235,336",3.5,20.6,34.4%,3.10%


In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: Scrape the table
url = "https://www.worldometers.info/world-population/population-by-country/"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
table = soup.find("table")  # this page has one main table

# Step 2: Extract headers
headers = [th.text.strip() for th in table.find_all("tr")[0].find_all("th")]

# Step 3: Extract rows
data = []
rows = table.find_all("tr")[1:]  # Skip header row
for row in rows:
    cols = row.find_all("td")
    cols = [col.text.strip() for col in cols]
    data.append(cols)

# Step 4: Create DataFrame
df = pd.DataFrame(data, columns=headers)

# Step 5: Clean the data
def clean_population_data(df):
    # Replace empty strings with None
    df.replace('', None, inplace=True)

    # Drop fully empty rows (just in case)
    df.dropna(how='all', inplace=True)

    # Remove duplicates
    df.drop_duplicates(inplace=True)

    # Remove commas and % signs, convert numeric
    for col in df.columns:
        df[col] = df[col].astype(str).str.replace(',', '').str.replace('%', '').str.strip()
        df[col] = pd.to_numeric(df[col], errors='ignore')  # convert where possible
        try:
            df[col] = pd.to_datetime(df[col], errors='ignore')  # if date-like
        except:
            pass

    return df

# Step 6: Apply cleaning
df_cleaned = clean_population_data(df)
df_cleaned.head()

Unnamed: 0,#,Country (ordependency),Population(2025),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,MedianAge,UrbanPop %,WorldShare
0,1970-01-01 00:00:00.000000001,India,1970-01-01 00:00:01.463865525,0.89,12929734,1970-01-01 00:00:00.000000492,1970-01-01 00:00:00.002973190,−495753,1970-01-01 00:00:00.000000001,1970-01-01 00:00:00.000000028,37.1,1970-01-01 00:00:00.000000017
1,1970-01-01 00:00:00.000000002,China,1970-01-01 00:00:01.416096094,−0.23,−3225184,1970-01-01 00:00:00.000000151,1970-01-01 00:00:00.009388211,−268126,1970-01-01 00:00:00.000000001,1970-01-01 00:00:00.000000040,67.5,1970-01-01 00:00:00.000000017
2,1970-01-01 00:00:00.000000003,United States,1970-01-01 00:00:00.347275807,0.54,1849236,1970-01-01 00:00:00.000000038,1970-01-01 00:00:00.009147420,1230663,1970-01-01 00:00:00.000000001,1970-01-01 00:00:00.000000038,82.8,1970-01-01 00:00:00.000000004
3,1970-01-01 00:00:00.000000004,Indonesia,1970-01-01 00:00:00.285721236,0.79,2233305,1970-01-01 00:00:00.000000158,1970-01-01 00:00:00.001811570,−39509,1970-01-01 00:00:00.000000002,1970-01-01 00:00:00.000000030,59.6,1970-01-01 00:00:00.000000003
4,1970-01-01 00:00:00.000000005,Pakistan,1970-01-01 00:00:00.255219554,1.57,3950390,1970-01-01 00:00:00.000000331,1970-01-01 00:00:00.000770880,−1235336,1970-01-01 00:00:00.000000003,1970-01-01 00:00:00.000000020,34.4,1970-01-01 00:00:00.000000003


## Fix

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def clean_population_data(df):
    """Cleans scraped population data: fixes minus signs, removes commas/percent, converts to numeric."""
    df.replace('', None, inplace=True)
    df.dropna(how='all', inplace=True)
    df.drop_duplicates(inplace=True)

    for col in df.columns:
        df[col] = df[col].astype(str)
        df[col] = df[col].str.replace('−', '-', regex=False)  # fix unicode minus
        df[col] = df[col].str.replace(',', '', regex=False)
        df[col] = df[col].str.replace('%', '', regex=False)
        df[col] = df[col].str.strip()

        # Try numeric conversion only (no datetime, to avoid misinterpretation)
        df[col] = pd.to_numeric(df[col], errors='ignore')

    return df

# =======================
# Step 1: Scrape webpage
# =======================
url = "https://www.worldometers.info/world-population/population-by-country/"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# =======================
# Step 2: Extract table
# =======================
table = soup.find("table")  # main table on the page

# Extract headers
headers = [th.text.strip() for th in table.find_all("tr")[0].find_all("th")]

# Extract rows
data = []
rows = table.find_all("tr")[1:]  # skip header row
for row in rows:
    cols = row.find_all("td")
    cols = [col.text.strip() for col in cols]
    data.append(cols)

# =======================
# Step 3: Create DataFrame
# =======================
df = pd.DataFrame(data, columns=headers)

# =======================
# Step 4: Clean Data
# =======================
df_cleaned = clean_population_data(df)

# =======================
# Step 5: Results
# =======================
df_cleaned.head()

# save to CSV
df_cleaned.to_csv("world_population_by_country.csv", index=False)