In [1]:
import os
import time

import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

#### API calls (World Bank)

In [2]:
def getData(url,name):
    """Download a World Bank indicator series, print it and save it as ``{name}.csv``.

    Parameters
    ----------
    url : str
        Indicator request URL that answers with JSON (for example one built by
        ``getUrl``).
    name : str
        Name of the value column; also the stem of the CSV file that is written.

    Returns
    -------
    None
        The table is printed and written to ``{name}.csv`` in the working directory.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    ValueError
        If the body is not the ``[metadata, rows]`` pair of a successful query
        (API-level errors arrive as a one-element list holding a message).
    """
    data_list = []
    page = 1
    total_pages = 1
    while page <= total_pages:
        # The first request uses `url` untouched. Later pages add "page=N", which
        # requests appends to the query string already present in `url`.
        params = {"page": page} if page > 1 else None
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, params=params, timeout=60)
        response.raise_for_status()
        data = response.json()
        if not isinstance(data, list) or len(data) < 2:
            raise ValueError(f"Unexpected World Bank API response for {url!r}: {data!r}")
        metadata, entries = data[0], data[1]
        # The metadata reports how many pages the result spans; fetch all of them so
        # nothing beyond the requested page size is silently dropped.
        total_pages = int((metadata or {}).get("pages") or 1)
        for entry in entries or []:
            # Observations without a value are left out of the table.
            if entry["value"] is not None:
                data_list.append({
                    "Country": entry["country"]["value"],
                    "Year": entry["date"],
                    name: entry["value"]
                })
        page += 1
    # Naming the columns keeps the CSV header even when no observation was returned.
    data_df = pd.DataFrame(data_list, columns=["Country", "Year", name])
    print(data_df)
    data_df.to_csv(f"{name}.csv", index=False)

In [3]:
def getUrl(code, num_page=20000):
    """Build the World Bank API URL for one indicator across all countries.

    Parameters
    ----------
    code : str
        Indicator code placed in the URL path, e.g. ``"SP.POP.TOTL"``.
    num_page : int, optional
        Value sent as ``per_page`` (rows requested per response page).

    Returns
    -------
    str
        Request URL asking for JSON output.
    """
    # HTTPS rather than cleartext HTTP, so the response cannot be altered in transit.
    return f"https://api.worldbank.org/v2/country/all/indicator/{code}?format=json&per_page={num_page}"

In [4]:
# One request URL per World Bank indicator code, each built with getUrl's default
# page size. The targets unpack in the same order as the codes are listed.
(
    population_url,
    male_ratio_url,
    urban_population,
    gdp_url,
    area_url,
    old_url,
    young_url,
    life_url,
) = map(getUrl, (
    "SP.POP.TOTL",
    "SP.POP.TOTL.MA.ZS",
    "SP.URB.TOTL.IN.ZS",
    "NY.GDP.PCAP.CD",
    "AG.SRF.TOTL.K2",
    "SP.POP.DPND.OL",
    "SP.POP.DPND.YG",
    "SP.DYN.LE00.IN",
))

In [5]:
# Fetch the series behind population_url; getData prints it and writes Population.csv.
getData(url=population_url, name="Population")

                           Country  Year  Population
0      Africa Eastern and Southern  2023   739108306
1      Africa Eastern and Southern  2022   720859132
2      Africa Eastern and Southern  2021   702977106
3      Africa Eastern and Southern  2020   685112979
4      Africa Eastern and Southern  2019   667242986
...                            ...   ...         ...
16925                     Zimbabwe  1964     4310332
16926                     Zimbabwe  1963     4177931
16927                     Zimbabwe  1962     4049778
16928                     Zimbabwe  1961     3925952
16929                     Zimbabwe  1960     3806310

[16930 rows x 3 columns]


In [6]:
# Fetch the series behind male_ratio_url; getData prints it and writes
# "Male Population Ratio.csv".
getData(url=male_ratio_url, name="Male Population Ratio")

                           Country  Year  Male Population Ratio
0      Africa Eastern and Southern  2023              49.535786
1      Africa Eastern and Southern  2022              49.527652
2      Africa Eastern and Southern  2021              49.521084
3      Africa Eastern and Southern  2020              49.513455
4      Africa Eastern and Southern  2019              49.504007
...                            ...   ...                    ...
16955                     Zimbabwe  1964              49.407888
16956                     Zimbabwe  1963              49.428126
16957                     Zimbabwe  1962              49.449155
16958                     Zimbabwe  1961              49.471720
16959                     Zimbabwe  1960              49.496126

[16960 rows x 3 columns]


In [7]:
# Fetch the series behind urban_population (a URL, despite the missing _url suffix);
# getData prints it and writes "Urban Population Ratio.csv".
getData(url=urban_population, name="Urban Population Ratio")

                           Country  Year  Urban Population Ratio
0      Africa Eastern and Southern  2023               38.335337
1      Africa Eastern and Southern  2022               37.825158
2      Africa Eastern and Southern  2021               37.323699
3      Africa Eastern and Southern  2020               36.828302
4      Africa Eastern and Southern  2019               36.336259
...                            ...   ...                     ...
16827                     Zimbabwe  1964               14.092000
16828                     Zimbabwe  1963               13.578000
16829                     Zimbabwe  1962               13.082000
16830                     Zimbabwe  1961               12.821000
16831                     Zimbabwe  1960               12.608000

[16832 rows x 3 columns]


In [8]:
# Fetch the series behind gdp_url; getData prints it and writes GDP_per_Capita.csv.
getData(url=gdp_url, name="GDP_per_Capita")

                           Country  Year  GDP_per_Capita
0      Africa Eastern and Southern  2023     1672.505957
1      Africa Eastern and Southern  2022     1642.432039
2      Africa Eastern and Southern  2021     1545.956697
3      Africa Eastern and Southern  2020     1356.088871
4      Africa Eastern and Southern  2019     1508.486886
...                            ...   ...             ...
13974                     Zimbabwe  1964      282.376856
13975                     Zimbabwe  1963      277.532515
13976                     Zimbabwe  1962      275.966139
13977                     Zimbabwe  1961      279.332656
13978                     Zimbabwe  1960      276.643363

[13979 rows x 3 columns]


In [9]:
# Fetch the series behind area_url; getData prints it and writes
# "Total Area (sq km).csv".
getData(url=area_url, name="Total Area (sq km)")

                           Country  Year  Total Area (sq km)
0      Africa Eastern and Southern  2021        1.516201e+07
1      Africa Eastern and Southern  2020        1.516201e+07
2      Africa Eastern and Southern  2019        1.516205e+07
3      Africa Eastern and Southern  2018        1.516204e+07
4      Africa Eastern and Southern  2017        1.516204e+07
...                            ...   ...                 ...
15927                     Zimbabwe  1965        3.907600e+05
15928                     Zimbabwe  1964        3.907600e+05
15929                     Zimbabwe  1963        3.907600e+05
15930                     Zimbabwe  1962        3.907600e+05
15931                     Zimbabwe  1961        3.907600e+05

[15932 rows x 3 columns]


In [10]:
# Fetch the series behind old_url; getData prints it and writes
# "Retirement Age Dependency Ratio.csv".
getData(url=old_url, name="Retirement Age Dependency Ratio")

                           Country  Year  Retirement Age Dependency Ratio
0      Africa Eastern and Southern  2023                         5.641678
1      Africa Eastern and Southern  2022                         5.662353
2      Africa Eastern and Southern  2021                         5.706199
3      Africa Eastern and Southern  2020                         5.733111
4      Africa Eastern and Southern  2019                         5.718424
...                            ...   ...                              ...
16925                     Zimbabwe  1964                         6.855007
16926                     Zimbabwe  1963                         6.858850
16927                     Zimbabwe  1962                         6.824991
16928                     Zimbabwe  1961                         6.793782
16929                     Zimbabwe  1960                         6.759052

[16930 rows x 3 columns]


In [11]:
# Fetch the series behind young_url; getData prints it and writes
# "Young Age Dependency Ratio.csv".
getData(url=young_url, name="Young Age Dependency Ratio")

                           Country  Year  Young Age Dependency Ratio
0      Africa Eastern and Southern  2023                   72.978971
1      Africa Eastern and Southern  2022                   73.799990
2      Africa Eastern and Southern  2021                   74.565149
3      Africa Eastern and Southern  2020                   75.310862
4      Africa Eastern and Southern  2019                   76.028564
...                            ...   ...                         ...
16925                     Zimbabwe  1964                  101.575828
16926                     Zimbabwe  1963                  100.673025
16927                     Zimbabwe  1962                   98.816809
16928                     Zimbabwe  1961                   97.259899
16929                     Zimbabwe  1960                   95.879938

[16930 rows x 3 columns]


In [12]:
# Fetch the series behind life_url; getData prints it and writes "Life Expectancy.csv".
getData(url=life_url, name="Life Expectancy")

                           Country  Year  Life Expectancy
0      Africa Eastern and Southern  2022        62.899031
1      Africa Eastern and Southern  2021        62.454590
2      Africa Eastern and Southern  2020        63.313860
3      Africa Eastern and Southern  2019        63.755678
4      Africa Eastern and Southern  2018        63.365863
...                            ...   ...              ...
16119                     Zimbabwe  1964        54.994000
16120                     Zimbabwe  1963        54.549000
16121                     Zimbabwe  1962        54.071000
16122                     Zimbabwe  1961        53.619000
16123                     Zimbabwe  1960        53.235000

[16124 rows x 3 columns]


#### API calls (WHO)

In [13]:
# Download the WHO indicator catalogue; the records sit under the "value" key of the
# JSON body.
who_url = "https://ghoapi.azureedge.net/api/Indicator"
# A timeout keeps a stalled connection from blocking the notebook indefinitely.
response = requests.get(who_url, timeout=60)
# Stop here on an HTTP error rather than failing later on a missing "value" key.
response.raise_for_status()

indicators_df = pd.DataFrame(response.json()['value'])


In [14]:
# Keep the indicators whose name mentions "bmi" (case-insensitive).
# na=False counts a missing IndicatorName as "no match"; without it the mask would
# contain NaN and the boolean indexing below would raise.
bmi_indicators_df = indicators_df[
    indicators_df['IndicatorName'].str.contains('bmi', case=False, na=False)
]
print(bmi_indicators_df)

            IndicatorCode                                      IndicatorName   
210           NCD_BMI_18C  Prevalence of underweight among adults, BMI < ...  \
211           NCD_BMI_25C  Prevalence of overweight among adults, BMI &Gr...   
212           NCD_BMI_30A  Prevalence of obesity among adults, BMI &Great...   
213        NCD_BMI_PLUS1C  Prevalence of overweight among children and ad...   
263           NCD_BMI_25A  Prevalence of overweight among adults, BMI &Gr...   
264        NCD_BMI_PLUS2A  Prevalence of obesity among children and adole...   
286          NCD_BMI_MEAN       Mean BMI (kg/m²) (age-standardized estimate)   
294         NCD_BMI_MEANC             Mean BMI (kg/m&#xb2;) (crude estimate)   
295        NCD_BMI_PLUS1A  Prevalence of overweight among children and ad...   
296        NCD_BMI_PLUS2C  Prevalence of obesity among children and adole...   
415           NCD_BMI_18A  Prevalence of underweight among adults, BMI < ...   
416           NCD_BMI_30C  Prevalence of

In [15]:
# Download every record of one WHO indicator and keep a CSV copy.
bmi_indicator_code = 'NCD_BMI_MEAN'
bmi_url = f"https://ghoapi.azureedge.net/api/{bmi_indicator_code}"
# A timeout keeps a stalled connection from blocking the notebook indefinitely.
response = requests.get(bmi_url, timeout=60)
# Stop here on an HTTP error rather than failing later on a missing "value" key.
response.raise_for_status()
data = response.json()
bmi_data_df = pd.DataFrame(data['value'])
# The label follows the code variable, so it stays correct if the indicator changes.
print(f"Data for {bmi_indicator_code}:")
print(bmi_data_df)
# NOTE(review): unlike the other exports in this notebook, this one keeps the
# DataFrame index as the first CSV column. Left as-is in case downstream readers
# expect that column - confirm before adding index=False.
bmi_data_df.to_csv("bmi_data.csv")

Data for NCD_BMI_MEAN:
            Id IndicatorCode SpatialDimType SpatialDim TimeDimType   
0          664  NCD_BMI_MEAN        COUNTRY        IRQ        YEAR  \
1          729  NCD_BMI_MEAN        COUNTRY        LAO        YEAR   
2         1303  NCD_BMI_MEAN         REGION        AMR        YEAR   
3         1899  NCD_BMI_MEAN        COUNTRY        ARG        YEAR   
4         2929  NCD_BMI_MEAN        COUNTRY        SOM        YEAR   
...        ...           ...            ...        ...         ...   
25818  9435695  NCD_BMI_MEAN        COUNTRY        ARG        YEAR   
25819  9435727  NCD_BMI_MEAN        COUNTRY        GRC        YEAR   
25820  9436293  NCD_BMI_MEAN        COUNTRY        COG        YEAR   
25821  9436890  NCD_BMI_MEAN        COUNTRY        ARM        YEAR   
25822  9437164  NCD_BMI_MEAN        COUNTRY        PAK        YEAR   

      ParentLocationCode         ParentLocation Dim1Type  TimeDim      Dim1   
0                    EMR  Eastern Mediterranean      SEX 

#### Scrape using Selenium

In [16]:
# Location of msedgedriver. Set the MSEDGEDRIVER_PATH environment variable to point
# at your own copy; the original author's Windows path is only the fallback, so the
# notebook no longer has to be edited on every machine.
webdriver_path = os.environ.get(
    "MSEDGEDRIVER_PATH",
    r"C:\Users\Admin\Downloads\edgedriver_win64\msedgedriver.exe",
)
service = Service(webdriver_path)
options = webdriver.EdgeOptions()
# One browser session shared by all scraping cells below.
driver = webdriver.Edge(service=service, options=options)

In [17]:
# Scrape the WITS country-code table into country_codes.csv.
country_code_url = "https://wits.worldbank.org/wits/wits/witshelp/content/codes/country_codes.htm"
driver.get(country_code_url)
# The explicit wait blocks until the table is in the DOM and returns it, so neither
# a fixed sleep beforehand nor a second lookup afterwards is needed.
table = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'table.wt1'))
)

rows = table.find_elements(By.TAG_NAME, 'tr')
data = []
# The first two rows are skipped - presumably title/heading rows; confirm against
# the page if its layout changes.
for row in rows[2:]:
    cols = [col.text for col in row.find_elements(By.TAG_NAME, 'td')]
    if cols:  # rows without any <td> cell carry no data
        data.append(cols)
df = pd.DataFrame(data, columns=["Country","ISO3","Code"])
print(df)
df.to_csv("country_codes.csv", index=False)

                            Country ISO3 Code
0                       Afghanistan  AFG  004
1                           Albania  ALB  008
2                           Algeria  DZA  012
3                    American Samoa  ASM  016
4                           Andorra  AND  020
..                              ...  ...  ...
259                     Yemen, Rep.  YEM  887
260                      Yugoslavia  SER  891
261  Yugoslavia, FR (Serbia/Montene  YUG  890
262                          Zambia  ZMB  894
263                        Zimbabwe  ZWE  716

[264 rows x 3 columns]


In [18]:
# Scrape the Our World in Data region table into world_regions_sdg.csv.
sdg_url = "https://ourworldindata.org/grapher/world-regions-sdg-united-nations?tab=table"
driver.get(sdg_url)
# Wait until the table holds at least one data cell instead of sleeping a fixed 5 s:
# a fixed pause is too short on a slow connection (the lookup below would then fail)
# and wasted time otherwise. Presumably the table is filled in by JavaScript after
# the page loads, which is why some form of waiting is needed at all.
WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '.table-wrapper td'))
)
table = driver.find_element(By.CLASS_NAME, 'table-wrapper')
headers = [header.text for header in table.find_elements(By.TAG_NAME, 'th')]

rows = table.find_elements(By.TAG_NAME, 'tr')
data = []
for row in rows[1:]:  # the first row is skipped; the column names come from the <th> cells above
    cols = [col.text for col in row.find_elements(By.TAG_NAME, 'td')]
    data.append(cols)

df = pd.DataFrame(data, columns=headers)
print(df)
df.to_csv("world_regions_sdg.csv", index=False)


                             Country/area                              2024
0                             Afghanistan         Central and Southern Asia
1                           Aland Islands       Europe and Northern America
2                                 Albania       Europe and Northern America
3                                 Algeria  Northern Africa and Western Asia
4                          American Samoa                           Oceania
..                                    ...                               ...
243                                Zambia                Sub-Saharan Africa
244                              Zimbabwe                Sub-Saharan Africa
245                                 Other                                  
246                Svalbard and Jan Mayen       Europe and Northern America
247  United States Minor Outlying Islands                           Oceania

[248 rows x 2 columns]


In [19]:
# Open the UN Data infant-mortality query and select the year filters to export.
infant_url = r"https://data.un.org/Data.aspx?q=infant&d=PopDiv&f=variableID%3a77"
driver.get(infant_url)

wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.DataContainer')))
checkboxes = driver.find_elements(By.CSS_SELECTOR, "input[type='checkbox'][value]")

# Tick every checkbox whose name is a year up to and including 2024.
for checkbox in checkboxes:
    try:
        year = int(checkbox.get_attribute('name'))
    except (TypeError, ValueError):
        # Not a year checkbox: the name is non-numeric (ValueError), or the attribute
        # is missing so get_attribute returned None (TypeError, previously uncaught).
        continue
    if year <= 2024 and not checkbox.is_selected():
        driver.execute_script("arguments[0].scrollIntoView(true);", checkbox)
        time.sleep(1)  # short pause after scrolling, before the click
        checkbox.click()


apply_filters_link = driver.find_element(By.ID, "ctl00_main_filters_anchorApplyBottom")
apply_filters_link.click()

time.sleep(5)  # fixed pause after applying the filters, before scraping starts
# Rows collected from every result page by the pagination loop below.
all_data = []
def scrape_table():
    """Read the table inside ``div.DataContainer`` on the page the browser shows.

    Uses the module-level ``driver``.

    Returns
    -------
    tuple
        ``(headers, page_data)``: the text of every ``<th>`` in the container, and
        one list of ``<td>`` texts for each ``<tr>`` after the first.
    """
    container = driver.find_element(By.CSS_SELECTOR, 'div.DataContainer')
    column_names = [th.text for th in container.find_elements(By.TAG_NAME, 'th')]
    # The first <tr> is left out of the data rows.
    body_rows = container.find_elements(By.TAG_NAME, 'tr')[1:]
    cells_by_row = [
        [td.text for td in tr.find_elements(By.TAG_NAME, 'td')]
        for tr in body_rows
    ]
    return column_names, cells_by_row

# Walk through the result pages, collect every row, then export them. try/finally
# guarantees the browser is closed even when scraping fails or the cell is
# interrupted; previously driver.quit() was skipped in that case and the Edge
# process was left running.
try:
    while True:
        headers, page_data = scrape_table()
        all_data.extend(page_data)
        try:
            next_button = driver.find_element(By.ID, "linkNextB")
            # NOTE(review): if the link has no class attribute this test may raise
            # TypeError, which the handler below reports as the last page - confirm
            # against the live page.
            if 'disabled' in next_button.get_attribute('class'):
                break
            next_button.click()
            time.sleep(5)  # fixed pause for the next page to load
        except Exception as e:
            # Deliberate best-effort: any failure to advance ends the pagination and
            # the rows gathered so far are still exported.
            print(f"Reached the last page or encountered an error: {e}")
            break

    df = pd.DataFrame(all_data, columns=headers)
    print(df)
    df.to_csv("Infant Mortality.csv", index=False)
finally:
    driver.quit()


KeyboardInterrupt: 