# Web Scraping

## (1) Population of Countries

In [67]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = 'https://worldpopulationreview.com/countries'
page=requests.get(url)
soup= BeautifulSoup(page.text, 'html')
table=soup.find_all('table')[0]
columns=table.find_all('th')
print(columns)

[<th class="datatable-th sticky top-0 text-nowrap border-none border-wpr-table_border bg-wpr-table_header_bg px-3 py-2 text-left align-bottom leading-4 text-wpr-table_header md:px-4 sticky left-0 z-50 font-bold pointer-events-none" data-field="flagCode" style=""> <div class="relative flex flex-col items-stretch justify-start bg-wpr-table_header_bg"> <div class="flex cursor-pointer select-none flex-row items-center"> <span class="sort-indicator ml-1"> </span> </div> </div> </th>, <th class="datatable-th sticky top-0 text-nowrap border-none border-wpr-table_border bg-wpr-table_header_bg px-3 py-2 text-left align-bottom leading-4 text-wpr-table_header md:px-4 z-40 false" data-field="country" style=""> <div class="relative flex flex-col items-stretch justify-start bg-wpr-table_header_bg"> <div class="flex cursor-pointer select-none flex-row items-center"> Country <span class="sort-indicator ml-1"> </span> </div> </div> </th>, <th class="datatable-th sticky top-0 text-nowrap border-none bor

In [68]:
title_data = [data.text.strip() for data in columns]
column_data = pd.DataFrame(columns=title_data)
column_data = pd.DataFrame(columns=title_data[1:8])
print("Columns:")
display(column_data)

Columns:


Unnamed: 0,Country,2025 Pop.,Area (kmÂ²),Density,Change,% Global Pop,Rank


In [69]:
rows = soup.find_all('tr')[1:]
data = []

for row in rows:
    columns = row.find_all('td')
    country = columns[1].text.strip()
    pop_2025 = columns[2].text.strip()
    area = columns[3].text.strip()
    density = columns[4].text.strip()
    change = columns[5].text.strip()
    global_pop = columns[6].text.strip()
    rank = columns[7].text.strip()
    data.append([country, pop_2025, area, density, change, global_pop, rank])

print("Sample Record:")
display(data[1])

Sample Record:


['China', '1,416,100,000', '9.7M', '150', '-0.23%', '17.69%', '2']

In [70]:
import pandas as pd

column_data = ['Country', 'Population 2025', 'Area (Km^2)', 'Density', 'Change (%)', 'Global Population (%)', 'Rank']

def clean_value(value, is_precent=False):
    value = value.replace('<', '').strip()
    if is_precent:
        value = value.replace('%', '').strip()
    if 'K' in value:
        return float(value.replace('K', '').replace(',', '')) * 1000
    elif 'M' in value:
        return float(value.replace('M', '').replace(',', '')) * 1000000
    else:
        return float(value.replace(',', ''))

cleaned_data = []
for row in data:
    country = row[0]
    pop_2025 = clean_value(row[1])
    area = clean_value(row[2])
    density = row[3]
    change = clean_value(row[4], is_precent=True)
    global_pop = clean_value(row[5], is_precent=True)
    rank = row[6]
    cleaned_data.append([country, pop_2025, area, density, change, global_pop, rank])

print("Sample Record after pre-processing:")
display(cleaned_data[1])


Sample Record after pre-processing:


['China', 1416100000.0, 9700000.0, '150', -0.23, 17.69, '2']

In [71]:
df = pd.DataFrame(cleaned_data, columns=column_data)
display(df.head())
df.to_csv('./dataset/population_data.csv', index=False)
print("Data has been saved as 'population_data.csv'!")

Unnamed: 0,Country,Population 2025,Area (Km^2),Density,Change (%),Global Population (%),Rank
0,India,1463870000.0,3300000.0,492,0.89,18.29,1
1,China,1416100000.0,9700000.0,150,-0.23,17.69,2
2,United States,347276000.0,9400000.0,38,0.54,4.34,3
3,Indonesia,285721000.0,1900000.0,152,0.79,3.57,4
4,Pakistan,255220000.0,881900.0,331,1.57,3.19,5


Data has been saved as 'population_data.csv'!


## (2) Population of Countries

### Country ISO codes

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL to scrape data from
url = 'https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3'

# Send a GET request to fetch the raw HTML content
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Find the table containing country data and ISO codes
table = soup.find('table', {'class': 'wikitable'})

# Extract rows from the table (excluding the header)
rows = table.find_all('tr')[1:]

# Prepare lists to store country names and ISO codes
countries = []
iso_codes = []

# Loop through rows to extract data
for row in rows:
    columns = row.find_all('td')
    if len(columns) >= 2:
        country = columns[0].get_text(strip=True)
        iso_code = columns[1].get_text(strip=True)
        countries.append(country)
        iso_codes.append(iso_code)

# Create a DataFrame to store the data
df = pd.DataFrame({
    'Country': countries,
    'ISO Alpha-3 Code': iso_codes
})

# Save the data to a CSV file
df.to_csv('countries_iso_codes.csv', index=False)

# Show the first few rows of the DataFrame
print(df.head())
