# Web Scraping Phase 1 - Haseeb

## (1) Population of Countries
https://worldpopulationreview.com/countries

In [84]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = 'https://worldpopulationreview.com/countries'
page=requests.get(url)
soup= BeautifulSoup(page.text, 'html')
table=soup.find_all('table')[0]
columns=table.find_all('th')
print(columns)

[<th class="datatable-th sticky top-0 text-nowrap border-none border-wpr-table_border bg-wpr-table_header_bg px-3 py-2 text-left align-bottom leading-4 text-wpr-table_header md:px-4 sticky left-0 z-50 font-bold pointer-events-none" data-field="flagCode" style=""> <div class="relative flex flex-col items-stretch justify-start bg-wpr-table_header_bg"> <div class="flex cursor-pointer select-none flex-row items-center"> <span class="sort-indicator ml-1"> </span> </div> </div> </th>, <th class="datatable-th sticky top-0 text-nowrap border-none border-wpr-table_border bg-wpr-table_header_bg px-3 py-2 text-left align-bottom leading-4 text-wpr-table_header md:px-4 z-40 false" data-field="country" style=""> <div class="relative flex flex-col items-stretch justify-start bg-wpr-table_header_bg"> <div class="flex cursor-pointer select-none flex-row items-center"> Country <span class="sort-indicator ml-1"> </span> </div> </div> </th>, <th class="datatable-th sticky top-0 text-nowrap border-none bor

In [85]:
title_data = [data.text.strip() for data in columns]
column_data = pd.DataFrame(columns=title_data)
column_data = pd.DataFrame(columns=title_data[1:8])
print("Columns:")
display(column_data)

Columns:


Unnamed: 0,Country,2025 Pop.,Area (kmÂ²),Density,Change,% Global Pop,Rank


In [86]:
rows = soup.find_all('tr')[1:]
data = []

for row in rows:
    columns = row.find_all('td')
    country = columns[1].text.strip()
    pop_2025 = columns[2].text.strip()
    area = columns[3].text.strip()
    density = columns[4].text.strip()
    change = columns[5].text.strip()
    global_pop = columns[6].text.strip()
    rank = columns[7].text.strip()
    data.append([country, pop_2025, area, density, change, global_pop, rank])

print("Sample Record:")
display(data[1])

Sample Record:


['China', '1,416,100,000', '9.7M', '150', '-0.23%', '17.69%', '2']

In [87]:
import pandas as pd

column_data = ['Country', 'Population 2025', 'Area (Km^2)', 'Density', 'Change (%)', 'Global Population (%)', 'Rank']

def clean_value(value, is_precent=False):
    value = value.replace('<', '').strip()
    if is_precent:
        value = value.replace('%', '').strip()
    if 'K' in value:
        return float(value.replace('K', '').replace(',', '')) * 1000
    elif 'M' in value:
        return float(value.replace('M', '').replace(',', '')) * 1000000
    else:
        return float(value.replace(',', ''))

cleaned_data = []
for row in data:
    country = row[0]
    pop_2025 = clean_value(row[1])
    area = clean_value(row[2])
    density = row[3]
    change = clean_value(row[4], is_precent=True)
    global_pop = clean_value(row[5], is_precent=True)
    rank = row[6]
    cleaned_data.append([country, pop_2025, area, density, change, global_pop, rank])

print("Sample Record after pre-processing:")
display(cleaned_data[1])


Sample Record after pre-processing:


['China', 1416100000.0, 9700000.0, '150', -0.23, 17.69, '2']

In [88]:
df = pd.DataFrame(cleaned_data, columns=column_data)
display(df.head())
df.to_csv('./datasets/1-population-data.csv', index=False)
print("Population data has been saved as '1-population-data.csv'!")

Unnamed: 0,Country,Population 2025,Area (Km^2),Density,Change (%),Global Population (%),Rank
0,India,1463870000.0,3300000.0,492,0.89,18.29,1
1,China,1416100000.0,9700000.0,150,-0.23,17.69,2
2,United States,347276000.0,9400000.0,38,0.54,4.34,3
3,Indonesia,285721000.0,1900000.0,152,0.79,3.57,4
4,Pakistan,255220000.0,881900.0,331,1.57,3.19,5


Population data has been saved as '1-population-data.csv'!


## (2) ISO codes of countries
https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3

In [89]:
import requests
from lxml import html
import pandas as pd

url = 'https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3'
response = requests.get(url)

if response.status_code == 200:
    tree = html.fromstring(response.content)
    country_list = tree.xpath('//div[@class="plainlist"]/ul/li')
    countries = []
    iso_codes = []
    for item in country_list:
        iso_code = item.xpath('.//span[@class="monospaced"]/text()')
        country_name = item.xpath('.//a/text()')
        if iso_code and country_name:
            cleaned_country_name = country_name[0].strip("'\"")
            iso_codes.append(iso_code[0].strip())
            countries.append(cleaned_country_name)
    df = pd.DataFrame({
        'Country': countries,
        'ISO Code (Alpha-3)': iso_codes
    })
    df.to_csv('./datasets/2-countries-iso-codes.csv', index=False)
    print(df.head())
    print("\nISO Codes data has been saved as '2-countries-iso-codes.csv'!")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


         Country ISO Code (Alpha-3)
0          Aruba                ABW
1    Afghanistan                AFG
2         Angola                AGO
3       Anguilla                AIA
4  Åland Islands                ALA

ISO Codes data has been saved as '2-countries-iso-codes.csv'!


# (3) Demographics of Countries
https://worldpopulationreview.com/countries/

In [93]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

country_df = pd.read_csv('./datasets/2-countries-iso-codes.csv')
country_names = country_df['Country'].tolist()

data = []

for country in country_names:
    country_slug = country.lower().replace(" ", "-")
    url = f"https://worldpopulationreview.com/countries/{country_slug}"
    
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            continue
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find the container that holds all the required demographic info
        container = soup.find('div', class_="mb-4 flex flex-col gap-1 lg:w-3/4 rounded-lg bg-wpr-ui_bg p-3")
        
        values = {
            "Country": country,
            "Population": None,
            "Births per Day": None,
            "Deaths per Day": None,
            "Immigrations per Day": None,
            "Net Change per Day": None,
            "2025 Population Change": None
        }
        
        if container:
            blocks = container.find_all('div', class_='flex flex-col items-center justify-between p-2.5')
            if not blocks:
                blocks = container.find_all('div', recursive=False)
            
            for block in blocks:
                label_div = block.find('div', class_='text-sm') or block.find('div', class_='text-sm font-semibold') or block.find('div', class_='text-sm font-semibold leading-5 text-wpr-subtitle') or block.find('div', class_='text-sm font-semibold leading-5 text-wpr-title')
                value_div = block.find('div', class_='font-inter')
                
                if label_div and value_div:
                    label = label_div.text.strip()
                    value = value_div.text.strip()
                    
                    if "Population" in label and "Change" not in label:
                        values["Population"] = value
                    elif "Births per Day" in label:
                        values["Births per Day"] = value
                    elif "Deaths per Day" in label:
                        values["Deaths per Day"] = value
                    elif "Immigrations per Day" in label:
                        values["Immigrations per Day"] = value
                    elif "Net Change per Day" in label:
                        values["Net Change per Day"] = value
                    elif "2025 Population Change" in label:
                        values["2025 Population Change"] = value

        data.append(values)
        time.sleep(1)

    except Exception as e:
        print(f"Error scraping {country}: {e}")
        continue

In [95]:
df = pd.DataFrame(data)
display(df.head())
df.to_csv('./datasets/3-countries-demographic-stats.csv', index=False)
print("\nDemographics data has been saved as '3-countries-demographic-stats.csv'!")

Unnamed: 0,Country,Population,Births per Day,Deaths per Day,Immigrations per Day,Net Change per Day,2025 Population Change
0,Aruba,108131.0,3.0,3.0,,0.0,0.0
1,Afghanistan,43492950.0,4128.0,673.0,,3251.0,237323.0
2,Angola,38696293.0,3915.0,722.0,,3182.0,232286.0
3,Anguilla,,,,,,
4,Albania,2777639.0,75.0,66.0,,57.0,4161.0



Demographics data has been saved as '3-countries-demographic-stats.csv'!


## (4) Climate Stats of Countries
https://www.climatewatchdata.org/

In [98]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

country_df = pd.read_csv('./datasets/2-countries-iso-codes.csv')
country_list = country_df.to_dict("records")

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
final_data = []

for row in country_list:
    country_name = row["Country"]
    iso_code = row["ISO Code (Alpha-3)"]
    url = f"https://www.climatewatchdata.org/countries/{iso_code}?end_year=2023&start_year=1990"
    driver.get(url)
    time.sleep(5)
    
    metrics = {
        "Country": country_name,
        "Country Code": iso_code,
        "Total Emissions (MtCO2e)": None,
        "Emissions per Capita (tCO2e/person)": None,
        "Emissions per GDP (tCO2e/million $GDP)": None,
        "ND-GAIN Vulnerability Score": None,
        "Population (millions)": None,
        "GDP per Capita (USD)": None
    }

    try:
        cards = driver.find_elements(By.CLASS_NAME, "country-header-styles__cardContent__Y6vBv")
        for card in cards:
            try:
                label = card.find_element(By.CLASS_NAME, "country-header-styles__title__trYG1").text.strip()
                value = card.find_element(By.CLASS_NAME, "country-header-styles__value__3f0d2").text.strip()
                if "Total Emissions" in label:
                    metrics["Total Emissions (MtCO2e)"] = value
                elif "Emissions per Capita" in label:
                    metrics["Emissions per Capita (tCO2e/person)"] = value
                elif "Emissions per GDP" in label:
                    metrics["Emissions per GDP (tCO2e/million $GDP)"] = value
                elif "ND-GAIN Vulnerability Score" in label:
                    metrics["ND-GAIN Vulnerability Score"] = value
                elif "Population (millions)" in label:
                    metrics["Population (millions)"] = value
                elif "GDP per Capita" in label:
                    metrics["GDP per Capita (USD)"] = value
            except:
                continue
        final_data.append(metrics)
    except Exception as e:
        print(f"Error scraping {country_name}: {e}")
        continue
    time.sleep(2)

driver.quit()

In [99]:
df = pd.DataFrame(final_data)
display(df.head())
df.to_csv("./datasets/4-countries-climate-watchdata.csv", index=False)
print("\nClimate watchdata has been saved as '4-countries-climate-watchdata.csv'!")

Unnamed: 0,Country,Country Code,Total Emissions (MtCO2e),Emissions per Capita (tCO2e/person),Emissions per GDP (tCO2e/million $GDP),ND-GAIN Vulnerability Score,Population (millions),GDP per Capita (USD)
0,Aruba,ABW,,,,,,
1,Afghanistan,AFG,31.27,0.8,1554.66,0.59,39.84,516.75
2,Angola,AGO,119.41,3.63,2226.99,0.51,33.93,1631.43
3,Anguilla,AIA,,,,,,
4,Åland Islands,ALA,,,,,,



Climate watchdata has been saved as '4-countries-climate-watchdata.csv'!
