## AGRICULTURAL PRICE PREDICTION - PRODUCTION MODEL
============================================================

This notebook trains a machine learning model to predict retail prices
of agricultural commodities in Kenya using the Kamis dataset.
The model predicts prices *7 days ahead*, using time-series features, categorical encodings, lag windows, and rolling statistics.

#### Author: Kyalo Josephine Kathini

##### Date: 2025

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import json
from datetime import datetime
import os
import warnings
warnings.filterwarnings('ignore')

 ### KAMIS AGRICULTURAL PRICE PREDICTION MODE

 #### 1. Data loading 

In [2]:
print("\n--- Loading and Cleaning Data ---")
data_path = "../data/raw/kamis_data.csv"
df = pd.read_csv(data_path)

print(f"Loaded {len(df):,} rows")


--- Loading and Cleaning Data ---
Loaded 337,060 rows


#### 2. Parse Dates and sort

In [3]:
df["Date"] = pd.to_datetime(df["Date"], errors='coerce')
df = df.dropna(subset=["Date"])
df = df.sort_values("Date").reset_index(drop=True)

#### 3. Cleaning price columns

In [4]:
def clean_price(col):
    """Remove /Kg suffix and convert to numeric"""
    if col.dtype == 'object':
        col = col.astype(str)
        col = col.str.replace('/Kg', '', regex=False)
        col = col.str.replace('/kg', '', regex=False)
        col = col.str.replace('-', '')
        col = pd.to_numeric(col, errors='coerce')
    return col 

In [5]:
df['Retail'] = clean_price(df['Retail'])
df['Wholesale'] = clean_price(df['Wholesale'])
df['Supply Volume'] = pd.to_numeric(df['Supply Volume'], errors='coerce').fillna(0)

#### 4. Removing Invalid rows

In [6]:
initial_rows = len(df)
df = df.dropna(subset=['Retail'])
df = df[(df['Retail'] > 5) & (df['Retail'] < 1000)] 
print(f"After cleaning: {len(df):,} rows (removed {initial_rows - len(df):,})")
print(f"Price mean: {df['Retail'].mean():.2f} KES")

After cleaning: 248,080 rows (removed 54,674)
Price mean: 155.80 KES


 #### 5. Sorting for Time-Series Feature Engineering

In [7]:
df = df.sort_values(['Commodity', 'Market', 'County', 'Date']).reset_index(drop=True)

#### 6. Temporal Features

In [8]:
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['quarter'] = df['Date'].dt.quarter
df['week'] = df['Date'].dt.isocalendar().week
df['dayofweek'] = df['Date'].dt.dayofweek 

#### 7. Cyclical Seasonality Features

In [9]:
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
df['is_harvest'] = df['month'].isin([1, 2, 7, 8]).astype(int)
df['is_rainy'] = df['month'].isin([3, 4, 5, 10, 11, 12]).astype(int)

#### 8. Lag Features

In [10]:
lags = [1, 3, 7, 14, 21, 28, 30]
for lag in lags:
    df[f'lag_{lag}'] = df.groupby(['Commodity', 'Market', 'County'])['Retail'].shift(lag)
print(f"Lag features created: {lags}")

Lag features created: [1, 3, 7, 14, 21, 28, 30]


#### 9. Rolling Window features

In [11]:
windows = [7, 14, 30]
for window in windows:  
    ma_series = df.groupby(['Commodity', 'Market', 'County'])['Retail'].apply(
        lambda x: x.rolling(window=window, min_periods=1).mean()
    ) 
    df[f'ma_{window}'] = ma_series.reset_index(level=[0, 1, 2], drop=True)
    std_series = df.groupby(['Commodity', 'Market', 'County'])['Retail'].apply(
        lambda x: x.rolling(window=window, min_periods=1).std()
    ) 
    df[f'std_{window}'] = std_series.reset_index(level=[0, 1, 2], drop=True)

print(f"Rolling features created: {windows}")

Rolling features created: [7, 14, 30]


#### 10. Target (7-Day Ahead Forecast)

In [12]:
DAYS_AHEAD = 7
df['target_retail'] = df.groupby(['Commodity', 'Market', 'County'])['Retail'].shift(-DAYS_AHEAD)
print(f"Target variable ({DAYS_AHEAD}-day forecast) created.")

Target variable (7-day forecast) created.


#### 11. Encoding Categorical Variables

In [13]:
categorical_features = ['Commodity', 'Market', 'County']
encoded_categorical_features = [f"{col}_enc" for col in categorical_features]

encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
df[encoded_categorical_features] = encoder.fit_transform(df[categorical_features])
print("Categorical features encoded.")

Categorical features encoded.


#### 12. Selecting Final Features

In [14]:
base_features = [
    'year', 'month', 'quarter', 'week', 'dayofweek',
    'month_sin', 'month_cos', 'is_harvest', 'is_rainy'
]
lag_features = [f'lag_{i}' for i in lags]
rolling_features = [f'ma_{w}' for w in windows] + [f'std_{w}' for w in windows]

feature_list = base_features + lag_features + rolling_features + encoded_categorical_features
print(f"Total features: {len(feature_list)}")

Total features: 25


#### 13. Preparing Training Matrices

In [15]:
X = df[feature_list].copy()
y = df['target_retail'].copy()
 
non_null_mask = y.notna()
X = X.loc[non_null_mask]
y = y.loc[non_null_mask]
 
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)
 
X = X.fillna(method='bfill').fillna(0)

print(f"Training shape: {X.shape}") 
print(f"NaN values in X: {X.isna().sum().sum()}")
print(f"NaN values in y: {y.isna().sum()}")

Training shape: (204067, 25)
NaN values in X: 0
NaN values in y: 0


#### 14. Initializing Model

In [16]:
print("\n--- Training HistGradientBoostingRegressor ---") 

scaler = StandardScaler()
tscv = TimeSeriesSplit(n_splits=5)
 
model = HistGradientBoostingRegressor(
    learning_rate=0.1,        
    max_iter=200,             
    max_depth=15,             
    l2_regularization=0.1,    
    random_state=42,
    verbose=0
)


--- Training HistGradientBoostingRegressor ---


#### MODEL TRAINING

 #### a) Cross-Validation

In [17]:
fold_results = []
for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):
    print(f"Fold {fold+1}/5", end=" ... ")
    
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    model.fit(X_train_scaled, y_train)
    
    y_pred = model.predict(X_test_scaled)
     
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    fold_results.append({'r2': r2, 'mae': mae})
    print(f"MAE={mae:.2f} KES, R²={r2:.4f}")

Fold 1/5 ... MAE=23.34 KES, R²=0.7937
Fold 2/5 ... MAE=33.05 KES, R²=0.8065
Fold 3/5 ... MAE=38.61 KES, R²=0.9063
Fold 4/5 ... MAE=28.98 KES, R²=0.8693
Fold 5/5 ... MAE=34.09 KES, R²=0.8057


 #### a)i) Cross-Validation Summary

In [18]:
avg_r2 = np.mean([f['r2'] for f in fold_results])
avg_mae = np.mean([f['mae'] for f in fold_results])

print(f"\n{'='*60}")
print(f"MODEL PERFORMANCE")
print(f"{'='*60}")
print(f"Average R²:   {avg_r2:.4f} (Target: > 0.70)")
print(f"Average MAE:  {avg_mae:.2f} KES")


MODEL PERFORMANCE
Average R²:   0.8363 (Target: > 0.70)
Average MAE:  31.61 KES


### FINAL MODEL

 #### i) Training Final Model

In [19]:
print("\n--- Saving Final Model ---")
X_scaled = scaler.fit_transform(X)
model.fit(X_scaled, y)


--- Saving Final Model ---


0,1,2
,loss,'squared_error'
,quantile,
,learning_rate,0.1
,max_iter,200
,max_leaf_nodes,31
,max_depth,15
,min_samples_leaf,20
,l2_regularization,0.1
,max_features,1.0
,max_bins,255


 #### ii) Saving Model Artifacts

In [20]:
os.makedirs("../models", exist_ok=True)
os.makedirs("../data/processed", exist_ok=True)

In [21]:
model_path = "../models/kamis_model.pkl"
joblib.dump(model, model_path)
print(f"Model saved: {model_path}")

Model saved: ../models/kamis_model.pkl


In [22]:
scaler_path = "../models/kamis_scaler.pkl"
joblib.dump(scaler, scaler_path)
print(f"Scaler saved: {scaler_path}") 

Scaler saved: ../models/kamis_scaler.pkl


In [23]:
encoder_path = "../models/kamis_encoder.pkl"
joblib.dump(encoder, encoder_path)
print(f"Encoder saved: {encoder_path}")

Encoder saved: ../models/kamis_encoder.pkl


In [24]:
metadata = {
    'features': feature_list, 
    'model_type': 'HistGradientBoostingRegressor', 
    'n_features': len(feature_list),
    'training_samples': len(X),
    'avg_mae': float(avg_mae),
    'avg_r2': float(avg_r2),
    'trained_date': datetime.now().isoformat(),
    'sklearn_version': '1.3.2+'
}
with open("../models/kamis_metadata.json", 'w') as f:
    json.dump(metadata, f, indent=2)

In [25]:
print("Saving recent prices...")
recent_data = df.groupby(['Commodity', 'Market', 'County']).tail(40).copy() 
api_columns = ['Date', 'Commodity', 'Market', 'County', 'Retail', 'Wholesale', 'Supply Volume']
recent_data = recent_data[api_columns]
recent_data.to_csv("../data/processed/recent_prices.csv", index=False)

Saving recent prices...


In [26]:
print(f"TRAINING COMPLETE! R²: {avg_r2:.4f}")

TRAINING COMPLETE! R²: 0.8363
