In [3]:
import csv
import os
import requests
from bs4 import BeautifulSoup
import urllib.parse
import pandas as pd

In [None]:
BASE_URL = "https://data.humdata.org"
CSV_FILE_PATH = "./dataset/wfp_countries_global.csv"
OUTPUT_DIR = "./dataset"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
with open(CSV_FILE_PATH, newline="", encoding="utf-8") as f:
    reader = csv.reader(f)

    header = next(reader)

    for i, row in enumerate(reader):
        country_name = row[0].strip()
        dataset_page_url = row[1].strip()

        if not dataset_page_url:
            continue

        print(f"[INFO] Now processing: {country_name}, URL={dataset_page_url}")

        try:
            page_response = requests.get(dataset_page_url)
            page_response.raise_for_status()
        except Exception as e:
            print(f" - [ERROR] Failed to get dataset page: {e}")

        soup = BeautifulSoup(page_response.text, "html.parser")

        download_link_tag = soup.find("a", class_="resource-download-button")

        if not download_link_tag:
            print(" - [ERROR] No download link found in the page.")
            continue

        download_href = download_link_tag.get("href")
        if not download_href:
            print(" - [ERROR] 'href' not found in the download link tag.")
            continue

        full_download_url = urllib.parse.urljoin(dataset_page_url, download_href)

        try:
            file_response = requests.get(full_download_url, stream=True)
            file_response.raise_for_status()
        except Exception as e:
            print(f" - [ERROR] Failed to download file: {e}")
            continue

        filename = os.path.basename(download_href)
        if not filename:
            filename = country_name.replace(" ", "_") + ".csv"

        output_path = os.path.join(OUTPUT_DIR, filename)

        with open(output_path, "wb") as out_file:
            for chunk in file_response.iter_content(chunk_size=8192):
                out_file.write(chunk)

        print(f" - Downloaded: {output_path}")

[INFO] Now processing: IRN, URL=https://data.humdata.org/dataset/wfp-food-prices-for-iran-islamic-republic-of
 - Downloaded: /Users/jaeeunc/Desktop/MDS/Block5/DS551/project/dataset/wfp_food_prices_irn.csv
[INFO] Now processing: IRQ, URL=https://data.humdata.org/dataset/wfp-food-prices-for-iraq
 - Downloaded: /Users/jaeeunc/Desktop/MDS/Block5/DS551/project/dataset/wfp_food_prices_irq.csv
[INFO] Now processing: JOR, URL=https://data.humdata.org/dataset/wfp-food-prices-for-jordan
 - Downloaded: /Users/jaeeunc/Desktop/MDS/Block5/DS551/project/dataset/wfp_food_prices_jor.csv
[INFO] Now processing: JPN, URL=https://data.humdata.org/dataset/wfp-food-prices-for-japan
 - Downloaded: /Users/jaeeunc/Desktop/MDS/Block5/DS551/project/dataset/wfp_food_prices_jpn.csv
[INFO] Now processing: KAZ, URL=https://data.humdata.org/dataset/wfp-food-prices-for-kazakhstan
 - Downloaded: /Users/jaeeunc/Desktop/MDS/Block5/DS551/project/dataset/wfp_food_prices_kaz.csv
[INFO] Now processing: KEN, URL=https://data.h

In [5]:
file_list = os.listdir(OUTPUT_DIR)
file_count = len(file_list)
print(f"There are {file_count} files.")

There are 101 files.


In [6]:
file_list.remove("wfp_countries_global.csv")
file_list.remove(".DS_Store")

In [7]:
print(f"There are {len(file_list)} files.")

There are 99 files.


In [20]:
df_list = []
error_files = {}
for file in file_list:
    try:
        df = pd.read_csv(OUTPUT_DIR + "/" + file, header=0, skiprows=[1])
        df["country"] = file.split(".")[0].split("_")[-1].upper()
        df_list.append(df)
    except Exception as e:
        print(file, e)
        error_files[file] = str(e)
        continue

df = pd.concat(df_list, ignore_index=True)
df.shape

(2903527, 15)

In [22]:
df["country"].nunique()

99

In [27]:
df.head()

Unnamed: 0,date,admin1,admin2,market,latitude,longitude,category,commodity,unit,priceflag,pricetype,currency,price,usdprice,country
0,2004-04-15,,,National Average,,,cereals and tubers,Maize (white),50 KG,actual,Wholesale,SZL,57.5,8.7386,SWZ
1,2004-05-15,,,National Average,,,cereals and tubers,Maize (white),50 KG,actual,Wholesale,SZL,57.5,8.5912,SWZ
2,2004-07-15,,,National Average,,,cereals and tubers,Maize (white),50 KG,actual,Wholesale,SZL,57.5,9.3801,SWZ
3,2004-08-15,,,National Average,,,cereals and tubers,Maize (white),50 KG,actual,Wholesale,SZL,57.5,8.9072,SWZ
4,2004-09-15,,,National Average,,,cereals and tubers,Maize (white),50 KG,actual,Wholesale,SZL,57.5,8.8872,SWZ


In [None]:
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d", errors="coerce")

In [30]:
df.dtypes

date         datetime64[ns]
admin1               object
admin2               object
market               object
latitude            float64
longitude           float64
category             object
commodity            object
unit                 object
priceflag            object
pricetype            object
currency             object
price               float64
usdprice            float64
country              object
dtype: object

In [32]:
df.to_parquet("./dataset/wfp_dataset.parquet")