In [11]:
import pandas as pd
import re

In [20]:
# Load the raw WFP price records. The source column named `country` is renamed
# to `countryiso3` (the preview below shows it holding ISO3 codes such as SWZ).
wfp = (
    pd.read_parquet('data/raw/wfp_dataset.parquet')
    .rename(columns={'country': 'countryiso3'})
)
wfp.head()

Unnamed: 0,date,admin1,admin2,market,latitude,longitude,category,commodity,unit,priceflag,pricetype,currency,price,usdprice,countryiso3
0,2004-04-15,,,National Average,,,cereals and tubers,Maize (white),50 KG,actual,Wholesale,SZL,57.5,8.7386,SWZ
1,2004-05-15,,,National Average,,,cereals and tubers,Maize (white),50 KG,actual,Wholesale,SZL,57.5,8.5912,SWZ
2,2004-07-15,,,National Average,,,cereals and tubers,Maize (white),50 KG,actual,Wholesale,SZL,57.5,9.3801,SWZ
3,2004-08-15,,,National Average,,,cereals and tubers,Maize (white),50 KG,actual,Wholesale,SZL,57.5,8.9072,SWZ
4,2004-09-15,,,National Average,,,cereals and tubers,Maize (white),50 KG,actual,Wholesale,SZL,57.5,8.8872,SWZ


In [28]:
# Print the column/dtype summary and memory footprint of the price frame.
# NOTE(review): the saved output lists a `country` column, which only exists
# after the merge cell further down -- this cell appears to have been executed
# out of order; confirm with Restart Kernel -> Run All.
wfp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2903527 entries, 0 to 2903526
Data columns (total 16 columns):
 #   Column       Dtype         
---  ------       -----         
 0   date         datetime64[ns]
 1   admin1       object        
 2   admin2       object        
 3   market       object        
 4   latitude     float64       
 5   longitude    float64       
 6   category     object        
 7   commodity    object        
 8   unit         object        
 9   priceflag    object        
 10  pricetype    object        
 11  currency     object        
 12  price        float64       
 13  usdprice     float64       
 14  countryiso3  object        
 15  country      object        
dtypes: datetime64[ns](1), float64(4), object(11)
memory usage: 354.4+ MB


In [10]:
# Read the per-country dataset index. The first file line supplies the column
# names and the line directly after it is skipped
# (presumably a tag/metadata row -- TODO confirm against the raw file).
countries = pd.read_csv(
    'data/raw/wfp_countries_global.csv',
    header=0,
    skiprows=[1],
)
countries.head()

Unnamed: 0,countryiso3,url,start_date,end_date
0,AFG,https://data.humdata.org/dataset/wfp-food-pric...,2000-01-15 00:00:00+00:00,2025-02-15 23:59:59+00:00
1,AGO,https://data.humdata.org/dataset/wfp-food-pric...,2008-01-15 00:00:00+00:00,2020-12-15 23:59:59+00:00
2,ARG,https://data.humdata.org/dataset/wfp-food-pric...,2005-01-15 00:00:00+00:00,2022-06-15 23:59:59+00:00
3,ARM,https://data.humdata.org/dataset/wfp-food-pric...,1996-01-15 00:00:00+00:00,2024-12-15 23:59:59+00:00
4,AZE,https://data.humdata.org/dataset/wfp-food-pric...,2007-01-15 00:00:00+00:00,2010-02-15 23:59:59+00:00


In [12]:
def extract_country(url):
    """Derive a title-cased country name from a dataset URL slug.

    The name is taken from the run of lowercase letters and hyphens that
    follows the first ``for-`` in ``url``; hyphens become spaces and each word
    is capitalised (``...-for-cote-d-ivoire`` -> ``"Cote D Ivoire"``).

    Parameters
    ----------
    url : str or missing value
        Dataset URL. Non-string values (``None``, ``NaN`` from an empty CSV
        cell, ...) are treated as "no URL".

    Returns
    -------
    str or None
        The extracted name, or ``None`` when ``url`` is not a string or does
        not contain a ``for-<slug>`` segment.
    """
    # re.search raises TypeError on non-strings; a single missing URL would
    # otherwise abort the whole Series.apply call.
    if not isinstance(url, str):
        return None

    match = re.search(r'for-([a-z-]+)', url)
    if match is None:
        return None
    return match.group(1).replace('-', ' ').title()

# Derive a readable name for each dataset URL and store it next to the ISO3 code.
country_names = countries['url'].apply(extract_country)
countries['country'] = country_names
countries.head()

Unnamed: 0,countryiso3,url,start_date,end_date,country
0,AFG,https://data.humdata.org/dataset/wfp-food-pric...,2000-01-15 00:00:00+00:00,2025-02-15 23:59:59+00:00,Afghanistan
1,AGO,https://data.humdata.org/dataset/wfp-food-pric...,2008-01-15 00:00:00+00:00,2020-12-15 23:59:59+00:00,Angola
2,ARG,https://data.humdata.org/dataset/wfp-food-pric...,2005-01-15 00:00:00+00:00,2022-06-15 23:59:59+00:00,Argentina
3,ARM,https://data.humdata.org/dataset/wfp-food-pric...,1996-01-15 00:00:00+00:00,2024-12-15 23:59:59+00:00,Armenia
4,AZE,https://data.humdata.org/dataset/wfp-food-pric...,2007-01-15 00:00:00+00:00,2010-02-15 23:59:59+00:00,Azerbaijan


In [None]:
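# Debugging aid (disabled): print every derived country name -- presumably
# used to spot names that need an entry in `country_corrections`.
# for country in countries['country']:
#     print(country)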

In [18]:
# Country name corrections: map the names derived from the URL slugs (which
# lose punctuation and use long official forms) to common short names.
country_corrections = {
    'Bolivia Plurinational State Of': 'Bolivia',
    'Cote D Ivoire': "Cote D'Ivoire",
    'Guinea Bissau': 'Guinea-Bissau',
    'Iran Islamic Republic Of': 'Iran',
    'Lao People S Democratic Republic': 'Laos',
    'Republic Of Moldova': 'Moldova',
    'State Of Palestine': 'Palestine',
    'Russian Federation': 'Russia',
    'Syrian Arab Republic': 'Syria',
    'United Republic Of Tanzania': 'Tanzania',
    'Venezuela Bolivarian Republic Of': 'Venezuela',
    'Viet Nam': 'Vietnam'
}

# Replace the country names using the mapping; names without an entry are kept.
countries['country'] = countries['country'].replace(country_corrections)

# Merge the dataframes, adding only the 'country' column.
# - Any `country` column left by a previous run of this cell is dropped first,
#   so re-running the cell does not produce `country_x` / `country_y`.
# - `validate` raises if an ISO3 code occurs more than once in `countries`,
#   which would otherwise silently duplicate price rows.
wfp = wfp.drop(columns='country', errors='ignore').merge(
    countries[['countryiso3', 'country']],
    on='countryiso3',
    how='left',
    validate='many_to_one',
)

# Keep records dated within calendar years START_YEAR..END_YEAR, both inclusive
# (2014-2024 spans 11 calendar years, not exactly a decade).
START_YEAR = 2014
END_YEAR = 2024
record_year = wfp['date'].dt.year
wfp_filtered = wfp[
    (record_year >= START_YEAR) & (record_year <= END_YEAR)
].reset_index(drop=True)  # chained instead of inplace on a filtered slice
wfp_filtered.head()

In [27]:
# Print the column/dtype summary and memory footprint of the filtered frame.
wfp_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500791 entries, 0 to 2500790
Data columns (total 16 columns):
 #   Column       Dtype         
---  ------       -----         
 0   date         datetime64[ns]
 1   admin1       object        
 2   admin2       object        
 3   market       object        
 4   latitude     float64       
 5   longitude    float64       
 6   category     object        
 7   commodity    object        
 8   unit         object        
 9   priceflag    object        
 10  pricetype    object        
 11  currency     object        
 12  price        float64       
 13  usdprice     float64       
 14  countryiso3  object        
 15  country      object        
dtypes: datetime64[ns](1), float64(4), object(11)
memory usage: 305.3+ MB


In [29]:
# Write the filtered frame to Parquet, leaving the index out of the file.
preprocessed_path = "data/wfp_preprocessed.parquet"
wfp_filtered.to_parquet(preprocessed_path, index=False)