## Fetching Internet Users

Steps: 
1. Fetch World Bank data for the indicator 'IT.NET.USER.ZS' (individuals using the Internet, % of population) for the years 2000–2023.
2. Normalize JSON response into a clean pandas DataFrame.
3. Filter to only valid countries of interest.
4. Keep the most recent year with a non-NA value for each country.
5. Save the processed data to a CSV file.

In [None]:
import requests
import pandas as pd
import os

### Configuration

In [11]:
# World Bank indicator code for the Internet-users series this notebook fetches.
INDICATOR = "IT.NET.USER.ZS"
# Indicator endpoint; the indicator code is interpolated into the URL path.
# NOTE(review): the `country/all` segment presumably requests every economy
# and aggregate the API knows about — confirm against the API documentation.
BASE_URL = f"https://api.worldbank.org/v2/country/all/indicator/{INDICATOR}"
# HTTP headers sent with every request made by fetch_worldbank_data; they
# identify the client and give a contact address.
HEADERS = {
    "User-Agent": "Python-requests/2.0 (isabelabarton@gmail.com)",
    "From": "isabelabarton@gmail.com"
}

In [None]:
# Set of countries to keep in the final dataset.
# Membership is tested by exact string equality against the normalized
# `country` column (see filter_valid_countries), so every entry must be spelled
# exactly as the API reports the country name (e.g. 'Bahamas, The', 'Viet Nam').
# A name that does not match is silently dropped from the output.
valid_countries = {
    'Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda',
    'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas, The',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
    'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei Darussalam',
    'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada',
    'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo, Dem. Rep.',
    'Congo, Rep.', 'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czechia',
    'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt, Arab Rep.',
    'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia',
    'Fiji', 'Finland', 'France', 'Gabon', 'Gambia, The', 'Georgia', 'Germany', 'Ghana',
    'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti',
    'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran, Islamic Rep.',
    'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan',
    'Kenya', 'Kiribati', "Korea, Dem. People's Rep.", 'Korea, Rep.', 'Kuwait',
    'Kyrgyz Republic', 'Lao PDR', 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya',
    'Lithuania', 'Luxembourg', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 'Mali',
    'Malta', 'Marshall Islands', 'Mauritania', 'Mauritius', 'Mexico',
    'Micronesia, Fed. Sts.', 'Moldova', 'Monaco', 'Mongolia', 'Montenegro', 'Morocco',
    'Mozambique', 'Myanmar', 'Namibia', 'Nauru', 'Nepal', 'Netherlands', 'New Zealand',
    'Nicaragua', 'Niger', 'Nigeria', 'North Macedonia', 'Norway', 'Oman', 'Pakistan',
    'Palau', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Poland',
    'Portugal', 'Qatar', 'Romania', 'Russian Federation', 'Rwanda', 'Samoa',
    'San Marino', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia',
    'Seychelles', 'Sierra Leone', 'Singapore', 'Slovak Republic', 'Slovenia',
    'Solomon Islands', 'Somalia, Fed. Rep.', 'South Africa', 'South Sudan', 'Spain',
    'Sri Lanka', 'St. Kitts and Nevis', 'St. Lucia', 'St. Vincent and the Grenadines',
    'Sudan', 'Suriname', 'Sweden', 'Switzerland', 'Syrian Arab Republic', 'Tajikistan',
    'Tanzania', 'Thailand', 'Timor-Leste', 'Togo', 'Tonga', 'Trinidad and Tobago',
    'Tunisia', 'Turkiye', 'Turkmenistan', 'Tuvalu', 'Uganda', 'Ukraine',
    'United Arab Emirates', 'United Kingdom', 'United States', 'Uruguay',
    'Uzbekistan', 'Vanuatu', 'Venezuela, RB', 'Viet Nam', 'Yemen, Rep.', 'Zambia',
    'Zimbabwe'
}

### Fetch World Bank Data

In [None]:
"""

    Fetch data from the World Bank API, page by page, for the years 2000–2023.

    Returns a pandas DataFrame of raw records.

"""
def fetch_worldbank_data(base_url, headers, per_page=1000):
    """Download every page of a World Bank indicator series for 2000-2023.

    The endpoint is requested repeatedly with ``format=json``, the fixed date
    range ``2000:2023`` and an increasing ``page`` number, until the paging
    metadata reports that the last page has been read.

    Args:
        base_url: Indicator endpoint URL.
        headers: HTTP headers sent with every request.
        per_page: Number of records requested per page.

    Returns:
        A pandas DataFrame with one row per raw record, in the order the pages
        were received. The frame is empty when no records were returned.

    Raises:
        requests.HTTPError: If any page answers with an HTTP error status.
    """
    all_records = []
    page = 1

    while True:
        # Let requests build and encode the query string instead of
        # concatenating it by hand.
        params = {
            "format": "json",
            "per_page": per_page,
            "date": "2000:2023",
            "page": page,
        }
        r = requests.get(base_url, params=params, headers=headers, timeout=60)
        r.raise_for_status()
        data = r.json()

        # A usable reply is a list of [paging metadata, records]; anything
        # else (empty body, single-element message, non-list) ends the loop.
        if not isinstance(data, list) or len(data) < 2:
            break
        meta, records = data[0], data[1]

        # NOTE(review): the records slot may be null rather than a list when
        # nothing matches — skip it instead of crashing on extend(None).
        if records:
            all_records.extend(records)

        # Stop once the last page has been consumed.
        if meta["page"] >= meta["pages"]:
            break
        page += 1

    return pd.DataFrame(all_records)

### Normalize The Data

In [None]:
"""

    Normalize World Bank raw DataFrame:
    - Extract country names and indicator names
    - Rename columns for clarity
    - Convert year and value to numeric types
    
 """
def normalize_worldbank_df(df):
    """Flatten a raw World Bank records frame into five tidy columns.

    The nested ``country`` and ``indicator`` cells are reduced to their
    ``"value"`` entry, ``countryiso3code``/``date`` are renamed to
    ``iso3``/``year``, and ``year``/``value`` are converted to numeric types
    (unparseable entries become missing). The input frame is not modified.

    Args:
        df: Raw records frame as produced by ``fetch_worldbank_data``. May be
            empty (for example when the fetch returned no records).

    Returns:
        A new DataFrame with the columns ``country``, ``iso3``, ``year``
        (nullable ``Int64``), ``value`` (float) and ``indicator_name``, in
        that order. An empty input yields an empty frame with these columns.
    """
    columns = ["country", "iso3", "year", "value", "indicator_name"]

    # A frame built from zero records has no columns at all; return a typed
    # empty frame so the later pipeline steps keep working instead of raising
    # KeyError on the first column access.
    if df.empty:
        return pd.DataFrame({
            "country": pd.Series(dtype="object"),
            "iso3": pd.Series(dtype="object"),
            "year": pd.Series(dtype="Int64"),
            "value": pd.Series(dtype="float64"),
            "indicator_name": pd.Series(dtype="object"),
        })

    df_clean = df.copy()
    # Cells that are not dicts have no name to extract: the country becomes
    # None, the indicator falls back to the module-level INDICATOR code.
    df_clean["country"] = df_clean["country"].apply(
        lambda cell: cell["value"] if isinstance(cell, dict) else None
    )
    df_clean["indicator_name"] = df_clean["indicator"].apply(
        lambda cell: cell["value"] if isinstance(cell, dict) else INDICATOR
    )
    df_clean = df_clean.rename(columns={"countryiso3code": "iso3", "date": "year"})

    # Take an explicit copy of the column subset so the assignments below
    # write to an independent frame (avoids SettingWithCopyWarning).
    df_clean = df_clean[columns].copy()
    df_clean["year"] = pd.to_numeric(df_clean["year"], errors="coerce").astype("Int64")
    df_clean["value"] = pd.to_numeric(df_clean["value"], errors="coerce")
    return df_clean

### Filter to Valid Countries

In [None]:
"""

    Keep only rows where country is in the valid_countries set

"""
def filter_valid_countries(df, valid_countries):
    """Return the rows of ``df`` whose ``country`` is in ``valid_countries``.

    Matching is exact membership on the ``country`` column. The surviving rows
    keep their relative order and are given a fresh 0..n-1 index.
    """
    is_wanted = df["country"].isin(valid_countries)
    kept_rows = df.loc[is_wanted]
    return kept_rows.reset_index(drop=True)

In [None]:
"""

    For each country, keep only the row with the most recent year where value is not NA.

"""
def get_most_recent_non_na(df):
    """Return one row per country: its latest year that has a value.

    Rows whose ``value`` is missing are discarded, the remainder is ordered by
    ``year`` from newest to oldest, and only the first row seen for each
    ``country`` is kept. The result is given a fresh 0..n-1 index.
    """
    has_value = df["value"].notna()
    newest_first = df[has_value].sort_values(by="year", ascending=False)
    one_per_country = newest_first.drop_duplicates(subset="country", keep="first")
    return one_per_country.reset_index(drop=True)

### Run ETL

In [None]:
# Fetch -> normalize -> filter -> latest
# Each stage's result is kept under its own name so it can be inspected
# separately after the cell has run.

# Download all pages of raw records for the configured indicator (network call).
df_raw = fetch_worldbank_data(BASE_URL, HEADERS)
# Flatten to the columns country / iso3 / year / value / indicator_name.
df_norm = normalize_worldbank_df(df_raw)
# Keep only rows whose country name is listed in valid_countries.
df_countries = filter_valid_countries(df_norm, valid_countries)
# Reduce to one row per country: the most recent year with a non-missing value.
df_latest_non_na = get_most_recent_non_na(df_countries)

In [39]:
# Write the per-country latest observations to disk, creating the output
# folder first if it does not exist yet. The row index is not written.
output_dir = './filtered_data'
output_csv = './filtered_data/worldbank_filtered.csv'
os.makedirs(output_dir, exist_ok=True)
df_latest_non_na.to_csv(output_csv, index=False)