# Data Processing Pipeline
## Load and clean data

In [13]:
import pandas as pd
import numpy as np
 
#   Load Data
 
kamis = pd.read_csv("../data/raw/kamis_data.csv")
wb = pd.read_csv("../data/raw/worldbank_data.csv")

print("KAMIS shape:", kamis.shape)
print("World Bank shape:", wb.shape)
 
#  Clean KAMIS Data
 
kamis = kamis.dropna(subset=["Date", "Commodity", "Retail"])

# Convert to correct dtypes
kamis["Date"] = pd.to_datetime(kamis["Date"], errors="coerce")
kamis["Retail"] = pd.to_numeric(kamis["Retail"], errors="coerce")
kamis["Wholesale"] = pd.to_numeric(kamis["Wholesale"], errors="coerce")

 
# Clean World Bank Data
wb = wb[wb["ISO3"] == "KEN"].copy()
wb["price_date"] = pd.to_datetime(wb["price_date"], errors="coerce")

#   relevant numeric columns
numeric_cols = ["beans", "maize", "oil", "potatoes", "sorghum"]
for col in numeric_cols:
    wb[col] = pd.to_numeric(wb[col], errors="coerce")

KAMIS shape: (310304, 12)
World Bank shape: (14175, 60)


## Commodity Mapping

In [14]:
commodity_mapping = {
    # Beans
    "Beans Rosecoco": "beans",
    "Beans Canadian": "beans",
    "Beans Mwitemania": "beans",
    "Beans Mixed": "beans",

    # Maize
    "Dry Maize": "maize",
    "Green Maize": "maize",

    # Oil
    "Cooking Oil (Salad)": "oil",
    "Cooking Oil (Other)": "oil",

    # Potatoes
    "Irish Potatoes": "potatoes",

    # Sorghum
    "Red Sorghum": "sorghum",
    "White Sorghum": "sorghum"
}

In [15]:
kamis["commodity_group"] = kamis["Commodity"].map(commodity_mapping)

## Aggregating to Monthly level

In [16]:
# KAMIS monthly
kamis_monthly = kamis.groupby([
    pd.Grouper(key="Date", freq="M"),
    "County",
    "Commodity",
    "commodity_group"
]).agg({
    "Retail": ["mean", "std", "min", "max"],
    "Wholesale": "mean"
}).reset_index()

# Flattening column names
kamis_monthly.columns = [
    "date", "county", "commodity", "commodity_group",
    "retail_mean", "retail_std", "retail_min", "retail_max",
    "wholesale_mean"
]

# WB monthly 
wb_monthly = wb.groupby([
    pd.Grouper(key="price_date", freq="M"),
    "adm1_name"
]).agg({
    "beans": "mean",
    "maize": "mean",
    "oil": "mean",
    "potatoes": "mean",
    "sorghum": "mean"
}).reset_index()

## Merging with world bank data

In [17]:
wb_long = wb_monthly.melt(
    id_vars=["price_date", "adm1_name"],
    value_vars=numeric_cols,
    var_name="commodity_group",
    value_name="wb_price"
)

 
unified = pd.merge(
    kamis_monthly,
    wb_long,
    left_on=["date", "commodity_group", "county"],
    right_on=["price_date", "commodity_group", "adm1_name"],
    how="left"
)

## Final Cleaning

In [18]:
unified = unified.drop(columns=["price_date", "adm1_name"])
unified["wb_price"] = unified["wb_price"].fillna(np.nan)

print("Unified dataset shape:", unified.shape)
print(unified.head())
 

Unified dataset shape: (1922, 10)
        date     county      commodity commodity_group  retail_mean  \
0 2022-02-28    Bungoma  White Sorghum         sorghum          NaN   
1 2022-02-28       Embu  White Sorghum         sorghum          NaN   
2 2022-02-28    Garissa  White Sorghum         sorghum          NaN   
3 2022-02-28   Kakamega  White Sorghum         sorghum          NaN   
4 2022-02-28  Kirinyaga  White Sorghum         sorghum          NaN   

   retail_std  retail_min  retail_max  wholesale_mean  wb_price  
0         NaN         NaN         NaN             NaN       NaN  
1         NaN         NaN         NaN             NaN       NaN  
2         NaN         NaN         NaN             NaN       NaN  
3         NaN         NaN         NaN             NaN       NaN  
4         NaN         NaN         NaN             NaN       NaN  


## Final Dataset

In [19]:
unified.to_csv("../data/processed/unified_dataset.csv", index=False)
print("Unified dataset saved to unified_dataset.csv")

Unified dataset saved to unified_dataset.csv
