In [3]:
import pandas as pd
import requests

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://data.un.org/" 

response = requests.get(url)
response.raise_for_status() 

soup = BeautifulSoup(response.text, "html.parser")

country_list_div = soup.find("div", class_="CountryList")
if not country_list_div:
    print("Could not find the 'CountryList' div. Check class names or structure.")
    exit()

li_tags = country_list_div.find_all("li")

country_data = []

for li in li_tags:
    a_tag = li.find("a")
    if a_tag:
        country_name = a_tag.get_text(strip=True)
        country_url = a_tag.get("href")

        country_data.append({"country_name": country_name, "country_url": country_url})

df_countries = pd.DataFrame(country_data)

In [5]:
def parse_country_page(url, country_name):
    """
    Scrapes the UN Data country detail page and returns four DataFrames:
    (df_general, df_economic, df_social, df_environment).
    """

    # print(url)
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    general_data = []
    economic_data = []
    social_data = []
    environment_data = []

    details_sections = soup.find_all("details")

    for section in details_sections:
        # The <summary> text gives us the category
        summary_tag = section.find("summary")
        if not summary_tag:
            continue
        summary_text = summary_tag.get_text(strip=True)

        if "General" in summary_text:
            current_list = general_data
        elif "Economic" in summary_text:
            current_list = economic_data
        elif "Social" in summary_text:
            current_list = social_data
        elif "Environment" in summary_text:
            current_list = environment_data
        else:
            # If it's some unexpected category, skip or handle differently
            continue

        table = section.find("table", class_="pure-table")
        if not table:
            continue

        thead = table.find("thead")

        if thead:
            header_cells = thead.find("tr").find_all("td")
            year_headers = [cell.get_text(strip=True) for cell in header_cells[1:]]

            tbody = table.find("tbody")
            rows = tbody.find_all("tr", class_="pure-table") + tbody.find_all(
                "tr", class_="pure-table-odd"
            )

            for row in rows:
                cells = row.find_all("td")
                if len(cells) == 0:
                    continue

                indicator_name = cells[0].get_text(strip=True)
                value_cells = cells[1:]

                for year_header, value_cell in zip(year_headers, value_cells):
                    # Extract text and footnote
                    value_text = value_cell.get_text(strip=True)
                    sup_tag = value_cell.find("sup")

                    current_list.append(
                        {
                            "Country": country_name,
                            "Indicator": indicator_name,
                            "Year": year_header,
                            "Value": value_text,
                        }
                    )

        else:
            # Single-column table (General Information)
            tbody = table.find("tbody")
            rows = tbody.find_all("tr", class_="pure-table") + tbody.find_all(
                "tr", class_="pure-table-odd"
            )

            for row in rows:
                cells = row.find_all("td")
                if len(cells) < 3:
                    continue

                indicator_name = cells[0].get_text(strip=True)
                value_cell = cells[2]
                value_text = value_cell.get_text(strip=True)
                sup_tag = value_cell.find("sup")

                current_list.append(
                    {
                        "Country": country_name,
                        "Indicator": indicator_name,
                        "Year": "",  # No year columns in General Info
                        "Value": value_text,
                    }
                )

    df_general = pd.DataFrame(general_data)
    df_economic = pd.DataFrame(economic_data)
    df_social = pd.DataFrame(social_data)
    df_environment = pd.DataFrame(environment_data)

    return df_general, df_economic, df_social, df_environment

In [6]:
base_url = "https://data.un.org"

from tqdm.auto import tqdm

df_general_list = []
df_economic_list = []
df_social_list = []
df_environment_list = []

for idx, row in tqdm(df_countries.iterrows(), total=df_countries.shape[0]):
    country_name = row["country_name"]
    country_url = row["country_url"]

    full_url = base_url + "/" + country_url

    # print(full_url)

    df_general, df_economic, df_social, df_environment = parse_country_page(
        full_url, country_name
    )

    df_general_list.append(df_general)
    df_economic_list.append(df_economic)
    df_social_list.append(df_social)
    df_environment_list.append(df_environment)


  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 232/232 [03:07<00:00,  1.24it/s]


In [7]:
df_general_list = pd.concat(df_general_list, ignore_index=True)
df_economic_list = pd.concat(df_economic_list, ignore_index=True) 
df_social_list = pd.concat(df_social_list, ignore_index=True)
df_environment_list = pd.concat(df_environment_list, ignore_index=True)

In [8]:
df_general_list

Unnamed: 0,Country,Indicator,Year,Value
0,Afghanistan,Region,,Southern Asia
1,Afghanistan,"Population (000, 2021)",,39 835a
2,Afghanistan,"Pop. density (per km2, 2021)",,61a
3,Afghanistan,Capital city,,Kabul
4,Afghanistan,"Capital city pop. (000, 2021)",,4 114.0b
...,...,...,...,...
2221,Zimbabwe,UN membership date,,25-Aug-80
2222,Zimbabwe,Surface area (km2),,390 757b
2223,Zimbabwe,Sex ratio (m per 100 f),,91.4a
2224,Zimbabwe,National currency,,Zimbabwe Dollar (ZWL)


In [9]:
df_economic_list

Unnamed: 0,Country,Indicator,Year,Value
0,Afghanistan,"GDP growth rate(annual %, const. 2015 prices)",2010,5.2
1,Afghanistan,"GDP growth rate(annual %, const. 2015 prices)",2015,-1.4
2,Afghanistan,"GDP growth rate(annual %, const. 2015 prices)",2021,4b
3,Afghanistan,Economy: Agriculture(% of Gross Value Added),2010,33.2
4,Afghanistan,Economy: Agriculture(% of Gross Value Added),2015,27.3
...,...,...,...,...
10609,Zimbabwe,International trade: imports(million current US$),2015,6 053
10610,Zimbabwe,International trade: imports(million current US$),2021,5 048c
10611,Zimbabwe,"Balance of payments, current account(million US$)",2010,- 1 444
10612,Zimbabwe,"Balance of payments, current account(million US$)",2015,- 1 678


In [10]:
df_general_list.to_csv("general_data.csv", index=False)
df_economic_list.to_csv("economic_data.csv", index=False)
df_social_list.to_csv("social_data.csv", index=False)
df_environment_list.to_csv("environment_data.csv", index=False)