## AGRICULTURAL PRICE PREDICTION - PRODUCTION MODEL
============================================================

This notebook trains a machine learning model to predict retail prices
of agricultural commodities in Kenya using the Kamis dataset.

#### Author: Kyalo Josephine Kathini

##### Date: 2025

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import json
from datetime import datetime
import os
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully")

Libraries imported successfully


 ### KAMIS AGRICULTURAL PRICE PREDICTION MODE

 #### 1. Data loading and cleaning

In [22]:
data_path = "../data/raw/kamis_data.csv"
df = pd.read_csv(data_path)

print(f"Loaded {len(df):,} rows")
print(f"Columns: {df.columns.tolist()}")
print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")

Loaded 310,304 rows
Columns: ['Commodity', 'Classification', 'Grade', 'Sex', 'Market', 'Wholesale', 'Retail', 'Supply Volume', 'County', 'Date', 'ProductID', 'ProductName']
Date range: 2008-04-01 to 2025-09-17


In [23]:
df["Date"] = pd.to_datetime(df["Date"], errors='coerce')
df = df.dropna(subset=["Date"])
df = df.sort_values("Date").reset_index(drop=True)

print(f"After date conversion: {len(df):,} rows")

After date conversion: 310,304 rows


In [24]:
def clean_price(col):
    """Remove /Kg suffix and convert to numeric"""
    if col.dtype == 'object':
        col = col.astype(str)
        col = col.str.replace('/Kg', '', regex=False)
        col = col.str.replace('/kg', '', regex=False)
        col = col.str.replace('-', '')
        col = pd.to_numeric(col, errors='coerce')
    return col
 
df['Retail'] = clean_price(df['Retail'])
df['Wholesale'] = clean_price(df['Wholesale'])
df['Supply Volume'] = pd.to_numeric(df['Supply Volume'], errors='coerce').fillna(0)

print("Prices cleaned")

Prices cleaned


In [25]:
# Remove missing target values and outliers
initial_rows = len(df)
df = df.dropna(subset=['Retail'])
df = df[(df['Retail'] > 1) & (df['Retail'] < 5000)]

print(f"After removing missing/outliers: {len(df):,} rows (removed {initial_rows - len(df):,})")
print(f"Price range: {df['Retail'].min():.2f} to {df['Retail'].max():.2f} KES")
print(f"Price mean: {df['Retail'].mean():.2f} KES")

After removing missing/outliers: 256,087 rows (removed 54,217)
Price range: 1.01 to 4929.58 KES
Price mean: 164.07 KES


 #### 2. Feature Engineering

In [27]:
# Sort by commodity, market, county, date for proper grouping
df = df.sort_values(['Commodity', 'Market', 'County', 'Date']).reset_index(drop=True)

print("Data sorted for feature engineering") 

Data sorted for feature engineering


#### 2.1 TEMPORAL FEATURES

In [28]:
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['quarter'] = df['Date'].dt.quarter
df['week'] = df['Date'].dt.isocalendar().week
df['dayofweek'] = df['Date'].dt.dayofweek 

In [29]:
# Cyclical encoding for seasonality
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

In [30]:
# Seasonal flags (Kenya-specific)
df['is_harvest'] = df['month'].isin([7, 8, 1, 2]).astype(int)
df['is_rainy'] = df['month'].isin([3, 4, 5, 10, 11]).astype(int)

print("Temporal features created: 10 features")

Temporal features created: 10 features


#### 2.2 LAG FEATURES

In [31]:
for lag in [1, 3, 7, 14]:
    df[f'lag_{lag}'] = df.groupby(['Commodity', 'Market', 'County'])['Retail'].shift(lag)

print("Lag features created: 4 features")

Lag features created: 4 features


In [32]:
for window in [7, 14]: 
    df[f'ma_{window}'] = df.groupby(['Commodity', 'Market', 'County'])['Retail'].apply(
        lambda x: x.rolling(window=window, min_periods=1).mean()
    ).reset_index(level=0, drop=True)
    
    df[f'std_{window}'] = df.groupby(['Commodity', 'Market', 'County'])['Retail'].apply(
        lambda x: x.rolling(window=window, min_periods=1).std()
    ).reset_index(level=0, drop=True)

print("Rolling features created: 4 features")

Rolling features created: 4 features


In [33]:
df = df.fillna(method='bfill').fillna(0)

print(f"Total features after engineering: {df.shape[1]}")


Total features after engineering: 29


#### 2.3  FEATURE MATRIX

In [34]:
base_features = [
    'year', 'month', 'quarter', 'week', 'dayofweek',
    'month_sin', 'month_cos', 'is_harvest', 'is_rainy'
]

lag_features = [f'lag_{i}' for i in [1, 3, 7, 14]]

rolling_features = [f'ma_{w}' for w in [7, 14]] + [f'std_{w}' for w in [7, 14]]

feature_list = base_features + lag_features + rolling_features

print(f"Base features: {len(base_features)}")
print(f"Lag features: {len(lag_features)}")
print(f"Rolling features: {len(rolling_features)}")

Base features: 9
Lag features: 4
Rolling features: 4


In [35]:
for col in ['Commodity', 'County', 'Market']:
    top_values = df[col].value_counts().head(5).index
    print(f"\nTop {col} values: {top_values.tolist()}")
    
    for val in top_values:
        feature_name = f"{col}_{str(val).replace(' ', '_')}"
        df[feature_name] = (df[col] == val).astype(int)
        feature_list.append(feature_name)

print(f"\nTotal features: {len(feature_list)}")


Top Commodity values: ['Wheat Flour', 'Maize Flour', 'Nile Perch', 'Indigenous Crotolaria (Mito/Miro)', 'Jute Plant (Murenda)']

Top County values: ['Nairobi', 'Kirinyaga', 'Trans-Nzoia', 'Nakuru', 'Siaya']

Top Market values: ['Kitale Municipality Market', 'Kawangware', 'Molo', 'Gikomba', 'Ngurubani Market']

Total features: 32


In [36]:
X = df[feature_list].copy()
y = df['Retail'].copy()

print(f"Training data shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"NaN values in X: {X.isna().sum().sum()}")
print(f"NaN values in y: {y.isna().sum()}")

Training data shape: (256087, 32)
Target shape: (256087,)
NaN values in X: 0
NaN values in y: 0


#### MODEL TRAINING

In [37]:
print("Starting model training...\n")

scaler = StandardScaler()
tscv = TimeSeriesSplit(n_splits=5)

model = RandomForestRegressor(
    n_estimators=50,
    max_depth=12,
    min_samples_split=50,
    random_state=42,
    n_jobs=-1,
    verbose=0
)

Starting model training...



In [38]:
fold_results = []

for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):
    print(f"Fold {fold+1}/5", end=" ... ")
    
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # Scale
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Train
    model.fit(X_train_scaled, y_train)
    
    # Predict
    y_pred = model.predict(X_test_scaled)
    
    # Evaluate
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    fold_results.append({'fold': fold+1, 'mae': mae, 'rmse': rmse, 'r2': r2})
    
    print(f"MAE={mae:.2f} KES, R²={r2:.4f}")

Fold 1/5 ... MAE=21.78 KES, R²=0.6590
Fold 2/5 ... MAE=39.21 KES, R²=0.5710
Fold 3/5 ... MAE=34.92 KES, R²=0.7799
Fold 4/5 ... MAE=25.99 KES, R²=0.6734
Fold 5/5 ... MAE=26.14 KES, R²=0.6401


In [39]:
avg_mae = np.mean([f['mae'] for f in fold_results])
avg_rmse = np.mean([f['rmse'] for f in fold_results])
avg_r2 = np.mean([f['r2'] for f in fold_results])
std_r2 = np.std([f['r2'] for f in fold_results])

In [40]:
print(f"\n{'='*60}")
print(f"CROSS-VALIDATION RESULTS")
print(f"{'='*60}")
print(f"Average MAE:  {avg_mae:.2f} KES")
print(f"Average RMSE: {avg_rmse:.2f} KES")
print(f"Average R²:   {avg_r2:.4f} (+/- {std_r2:.4f})")


CROSS-VALIDATION RESULTS
Average MAE:  29.61 KES
Average RMSE: 111.59 KES
Average R²:   0.6647 (+/- 0.0675)


### FINAL MODEL

In [41]:
print("\nTraining final model on all data...")

X_scaled = scaler.fit_transform(X)
model.fit(X_scaled, y)

print("Final model trained successfully")


Training final model on all data...
Final model trained successfully


In [42]:
os.makedirs("../models", exist_ok=True)

In [43]:
model_path = "../models/kamis_model.pkl"
joblib.dump(model, model_path)
print(f"Model saved: {model_path}")

Model saved: ../models/kamis_model.pkl


In [44]:
scaler_path = "../models/kamis_scaler.pkl"
joblib.dump(scaler, scaler_path)
print(f"Scaler saved: {scaler_path}") 

Scaler saved: ../models/kamis_scaler.pkl


In [45]:
metadata = {
    'features': feature_list,
    'model_type': 'RandomForest',
    'n_features': len(feature_list),
    'training_samples': len(X),
    'avg_mae': float(avg_mae),
    'avg_rmse': float(avg_rmse),
    'avg_r2': float(avg_r2),
    'trained_date': datetime.now().isoformat(),
    'sklearn_version': '1.3.2 or higher'
}

metadata_path = "../models/kamis_metadata.json"
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f" Metadata saved: {metadata_path}")

 Metadata saved: ../models/kamis_metadata.json


In [46]:
os.makedirs("../data/processed", exist_ok=True)

recent_data = df[[
    'Date', 'Commodity', 'Market', 'County', 'Retail', 'Wholesale', 'Supply Volume'
]].tail(1000).copy()

recent_path = "../data/processed/recent_prices.csv"
recent_data.to_csv(recent_path, index=False)
print(f"Recent prices saved: {recent_path}")
print(f"  Records: {len(recent_data)}")

Recent prices saved: ../data/processed/recent_prices.csv
  Records: 1000


In [47]:
print(f"TRAINING COMPLETE!")
print(f"{'='*60}")
print(f"\nModel Performance:")
print(f"  MAE:  {avg_mae:.2f} KES")
print(f"  RMSE: {avg_rmse:.2f} KES")
print(f"  R²:   {avg_r2:.4f}")

TRAINING COMPLETE!

Model Performance:
  MAE:  29.61 KES
  RMSE: 111.59 KES
  R²:   0.6647
