In [55]:
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
country_data = {}

for li in soup.find_all("li"):
    code_span = li.find("span", class_="monospaced")
    country_link = li.find("a")
    if code_span and country_link:
        country_code = code_span.text.strip()
        country_name = country_link.text.strip()
        country_data[country_name] = country_code

code_list = list(country_data.values())
print(country_data)
print(code_list)

{'Aruba': 'ABW', 'Afghanistan': 'AFG', 'Angola': 'AGO', 'Anguilla': 'AIA', 'Åland Islands': 'ALA', 'Albania': 'ALB', 'Andorra': 'AND', 'United Arab Emirates': 'ARE', 'Argentina': 'ARG', 'Armenia': 'ARM', 'American Samoa': 'ASM', 'Antarctica': 'ATA', 'French Southern Territories': 'ATF', 'Antigua and Barbuda': 'ATG', 'Australia': 'AUS', 'Austria': 'AUT', 'Azerbaijan': 'AZE', 'Burundi': 'BDI', 'Belgium': 'BEL', 'Benin': 'BEN', 'Bonaire, Sint Eustatius and Saba': 'BES', 'Burkina Faso': 'BFA', 'Bangladesh': 'BGD', 'Bulgaria': 'BGR', 'Bahrain': 'BHR', 'Bahamas': 'BHS', 'Bosnia and Herzegovina': 'BIH', 'Saint Barthélemy': 'BLM', 'Belarus': 'BLR', 'Belize': 'BLZ', 'Bermuda': 'BMU', 'Bolivia, Plurinational State of': 'BOL', 'Brazil': 'BRA', 'Barbados': 'BDS', 'Brunei Darussalam': 'BRN', 'Bhutan': 'BTN', 'Bouvet Island': 'BVT', 'Botswana': 'BWA', 'Central African Republic': 'RCA', 'Canada': 'CDN', 'Cocos (Keeling) Islands': 'CCK', 'Switzerland': 'CHE', 'Chile': 'RCH', 'China': 'CHN', "Côte d'Iv

In [54]:
import json
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
start_year = 1990
end_year = 2023
all_data = []

def get_text(element):
    return element.text.strip() if element else "N/A"

for country_code in code_list:
    url = f"https://www.climatewatchdata.org/countries/{country_code}?end_year={end_year}&start_year={start_year}"
    print(f"Scraping: {url}")
    driver.get(url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.some-content-selector"))
        )
    except Exception as e:
        print("Timed out waiting for page to load:", e)
    
    soup = BeautifulSoup(driver.page_source, "html.parser")
    country_name = get_text(soup.find("h1"))  
    values = soup.find_all("div", class_="country-header-styles__value__3f0d2")
    for v in values:
        print(v)
    extracted_values = [value.get_text(strip=True) for value in values]
    if len(extracted_values) < 6:
        print(f"Skipping {country_code} due to missing data")
        continue
    data = {
        "Country": country_name,
        "Country Code": country_code,
        "Total Emissions (MtCO2e)": extracted_values[0],
        "Emissions per Capita (tCO2e/person)": extracted_values[1],
        "Emissions per GDP (tCO2e/million $GDP)": extracted_values[2],
        "ND-GAIN Vulnerability Score": extracted_values[3],
        "Population (millions)": extracted_values[4],
        "GDP per Capita (USD)": extracted_values[5]
    }
    all_data.append(data)

df = pd.DataFrame(all_data)
df.to_csv("co2_emissions_data.csv", index=False)
print(df)
driver.quit()

Scraping: https://www.climatewatchdata.org/countries/ABW?end_year=2023&start_year=1990
Timed out waiting for page to load: Message: 
Stacktrace:
0   chromedriver                        0x0000000100dc9804 cxxbridge1$str$ptr + 2785964
1   chromedriver                        0x0000000100dc1ddc cxxbridge1$str$ptr + 2754692
2   chromedriver                        0x0000000100915ea8 cxxbridge1$string$len + 92928
3   chromedriver                        0x000000010095d1d0 cxxbridge1$string$len + 384552
4   chromedriver                        0x000000010099e678 cxxbridge1$string$len + 651984
5   chromedriver                        0x000000010095135c cxxbridge1$string$len + 335796
6   chromedriver                        0x0000000100d8ecd4 cxxbridge1$str$ptr + 2545532
7   chromedriver                        0x0000000100d91fa0 cxxbridge1$str$ptr + 2558536
8   chromedriver                        0x0000000100d6ed04 cxxbridge1$str$ptr + 2414508
9   chromedriver                        0x0000000100d928

In [None]:
import pandas as pd

input_file = "co2_emissions_data.csv"
output_file = "/climate.csv"

df_filtered = df.dropna(how='all', subset=df.columns[1:])
df_filtered = df_filtered[~df_filtered.iloc[:, 0].str.startswith(',')]
df_filtered.to_csv(output_file, index=False)

print(f"Cleaned data saved to {output_file}")
display(df_filtered.head())

Cleaned data saved to ./datasets/climate.csv


Unnamed: 0,Country,Country Code,Total Emissions (MtCO2e),Emissions per Capita (tCO2e/person),Emissions per GDP (tCO2e/million $GDP),ND-GAIN Vulnerability Score,Population (millions),GDP per Capita (USD)
1,Afghanistan,AFG,31.27,0.8,1554.66,0.59,39.84,516.75
2,Angola,AGO,119.41,3.63,2226.99,0.51,33.93,1631.43
4,Albania,ALB,8.06,2.84,532.51,0.4,2.81,5332.16
5,Andorra,AND,0.58,7.48,199.78,,0.08,37416.7
6,United Arab Emirates,ARE,249.93,25.27,696.43,0.37,9.99,36284.56
