## AGRICULTURAL PRICE PREDICTION - PRODUCTION MODEL
============================================================

This notebook trains a machine learning model to predict retail prices
of agricultural commodities in Kenya using the Kamis dataset.
The model predicts prices *7 days ahead*, using time-series features, categorical encodings, lag windows, and rolling statistics.

#### Author: Kyalo Josephine Kathini

##### Date: 2025

In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OrdinalEncoder # ğŸ’¡ Added OrdinalEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import json
from datetime import datetime
import os
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully")

Libraries imported successfully


 ### KAMIS AGRICULTURAL PRICE PREDICTION MODE

 #### 1. Data loading 

In [33]:
print("\n--- Loading and Cleaning Data ---")
data_path = "../data/raw/kamis_data.csv"
df = pd.read_csv(data_path)

print(f"Loaded {len(df):,} rows")
print(f"Columns: {df.columns.tolist()}")
print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")


--- Loading and Cleaning Data ---
Loaded 310,304 rows
Columns: ['Commodity', 'Classification', 'Grade', 'Sex', 'Market', 'Wholesale', 'Retail', 'Supply Volume', 'County', 'Date', 'ProductID', 'ProductName']
Date range: 2008-04-01 to 2025-09-17


#### 2. Parse Dates and sort

In [34]:
df["Date"] = pd.to_datetime(df["Date"], errors='coerce')
df = df.dropna(subset=["Date"])
df = df.sort_values("Date").reset_index(drop=True)
print(f"After date conversion: {len(df):,} rows") 

After date conversion: 310,304 rows


#### 3. Cleaning price columns

In [35]:
def clean_price(col):
    """Remove /Kg suffix and convert to numeric"""
    if col.dtype == 'object':
        col = col.astype(str)
        col = col.str.replace('/Kg', '', regex=False)
        col = col.str.replace('/kg', '', regex=False)
        col = col.str.replace('-', '')
        col = pd.to_numeric(col, errors='coerce')
    return col
  
df['Retail'] = clean_price(df['Retail'])
df['Wholesale'] = clean_price(df['Wholesale'])
df['Supply Volume'] = pd.to_numeric(df['Supply Volume'], errors='coerce').fillna(0)
print("Prices cleaned")

Prices cleaned


#### 4. Removing Invalid rows

In [36]:
initial_rows = len(df)
df = df.dropna(subset=['Retail'])
df = df[(df['Retail'] > 1) & (df['Retail'] < 5000)]
print(f"After removing missing/outliers: {len(df):,} rows (removed {initial_rows - len(df):,})")
print(f"Price range: {df['Retail'].min():.2f} to {df['Retail'].max():.2f} KES")
print(f"Price mean: {df['Retail'].mean():.2f} KES")

After removing missing/outliers: 256,087 rows (removed 54,217)
Price range: 1.01 to 4929.58 KES
Price mean: 164.07 KES


 #### 5. Sorting for Time-Series Feature Engineering

In [37]:
df = df.sort_values(['Commodity', 'Market', 'County', 'Date']).reset_index(drop=True)
print("Data sorted by Commodity, Market, County, Date")

Data sorted by Commodity, Market, County, Date


#### 6. Temporal Features

In [38]:
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['quarter'] = df['Date'].dt.quarter
df['week'] = df['Date'].dt.isocalendar().week
df['dayofweek'] = df['Date'].dt.dayofweek 

#### 7. Cyclical Seasonality Features

In [39]:
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
df['is_harvest'] = df['month'].isin([7, 8, 1, 2]).astype(int)
df['is_rainy'] = df['month'].isin([3, 4, 5, 10, 11]).astype(int)
print("Temporal features created: 10 features")

Temporal features created: 10 features


#### 8. Lag Features

In [40]:
for lag in [1, 3, 7, 14]:
    df[f'lag_{lag}'] = df.groupby(['Commodity', 'Market', 'County'])['Retail'].shift(lag)
print("Lag features created: 4 features")

Lag features created: 4 features


#### 9. Rolling Window features

In [41]:
for window in [7, 14]: 
    df[f'ma_{window}'] = df.groupby(['Commodity', 'Market', 'County'])['Retail'].apply(
        lambda x: x.rolling(window=window, min_periods=1).mean()
    ).reset_index(level=0, drop=True)
    
    df[f'std_{window}'] = df.groupby(['Commodity', 'Market', 'County'])['Retail'].apply(
        lambda x: x.rolling(window=window, min_periods=1).std()
    ).reset_index(level=0, drop=True)
print("Rolling features created: 4 features")

Rolling features created: 4 features


#### 10. Target (7-Day Ahead Forecast)

In [42]:
DAYS_AHEAD = 7
df['target_retail'] = df.groupby(['Commodity', 'Market', 'County'])['Retail'].shift(-DAYS_AHEAD)
print(f"Target variable ({DAYS_AHEAD}-day forecast) created.")

Target variable (7-day forecast) created.


#### 11. Encoding Categorical Variables

In [43]:
categorical_features = ['Commodity', 'Market', 'County']
encoded_categorical_features = [f"{col}_enc" for col in categorical_features]

encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
 
df[encoded_categorical_features] = encoder.fit_transform(df[categorical_features])
print("All categorical features encoded into new '_enc' columns.")

All categorical features encoded into new '_enc' columns.


#### 12. Selecting Final Features

In [44]:
base_features = [
    'year', 'month', 'quarter', 'week', 'dayofweek',
    'month_sin', 'month_cos', 'is_harvest', 'is_rainy'
]
lag_features = [f'lag_{i}' for i in [1, 3, 7, 14]]
rolling_features = [f'ma_{w}' for w in [7, 14]] + [f'std_{w}' for w in [7, 14]]
 
feature_list = base_features + lag_features + rolling_features + encoded_categorical_features
print(f"Total features: {len(feature_list)}")

Total features: 20


#### 13. Preparing Training Matrices

In [45]:
X = df[feature_list].copy()
y = df['target_retail'].copy() 

non_null_mask = y.notna()
 
X = X.loc[non_null_mask]
y = y.loc[non_null_mask]

X = X.reset_index(drop=True)
y = y.reset_index(drop=True)
 
X = X.fillna(method='bfill').fillna(0)

print(f"Training data shape after NaN removal: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"NaN values in X: {X.isna().sum().sum()}")
print(f"NaN values in y: {y.isna().sum()}")

Training data shape after NaN removal: (211584, 20)
Target shape: (211584,)
NaN values in X: 0
NaN values in y: 0


#### 14. Initializing Model

In [46]:
print("\n--- Starting Model Training ---")

scaler = StandardScaler()
tscv = TimeSeriesSplit(n_splits=5)

model = RandomForestRegressor(
    n_estimators=50,
    max_depth=12,
    min_samples_split=50,
    random_state=42,
    n_jobs=-1,
    verbose=0
)


--- Starting Model Training ---


#### MODEL TRAINING

 #### a) Cross-Validation

In [48]:
fold_results = []
for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):
    print(f"Fold {fold+1}/5", end=" ... ")
    
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
   
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
 
    model.fit(X_train_scaled, y_train)
 
    y_pred = model.predict(X_test_scaled)
    
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    fold_results.append({'fold': fold+1, 'mae': mae, 'rmse': rmse, 'r2': r2})
    
    print(f"MAE={mae:.2f} KES, RÂ²={r2:.4f}")

Fold 1/5 ... MAE=28.89 KES, RÂ²=0.5415
Fold 2/5 ... MAE=53.73 KES, RÂ²=0.4461
Fold 3/5 ... MAE=56.54 KES, RÂ²=0.6796
Fold 4/5 ... MAE=42.05 KES, RÂ²=0.6038
Fold 5/5 ... MAE=33.92 KES, RÂ²=0.5280


 #### a)i) Cross-Validation Summary

In [49]:
avg_mae = np.mean([f['mae'] for f in fold_results])
avg_rmse = np.mean([f['rmse'] for f in fold_results])
avg_r2 = np.mean([f['r2'] for f in fold_results])
std_r2 = np.std([f['r2'] for f in fold_results])

print(f"\n{'='*60}")
print(f"CROSS-VALIDATION RESULTS (7-DAY FORECAST)")
print(f"{'='*60}")
print(f"Average MAE:  {avg_mae:.2f} KES")
print(f"Average RMSE: {avg_rmse:.2f} KES")
print(f"Average RÂ²:   {avg_r2:.4f} (+/- {std_r2:.4f})")


CROSS-VALIDATION RESULTS (7-DAY FORECAST)
Average MAE:  43.03 KES
Average RMSE: 128.41 KES
Average RÂ²:   0.5598 (+/- 0.0782)


### FINAL MODEL

 #### i) Training Final Model

In [50]:
print("\n--- Training Final Model on All Data ---")
X_scaled = scaler.fit_transform(X)
model.fit(X_scaled, y)
print("Final model trained successfully")


--- Training Final Model on All Data ---
Final model trained successfully


 #### ii) Saving Model Artifacts

In [51]:
os.makedirs("../models", exist_ok=True)
os.makedirs("../data/processed", exist_ok=True)

In [52]:
model_path = "../models/kamis_model.pkl"
joblib.dump(model, model_path)
print(f"Model saved: {model_path}")

Model saved: ../models/kamis_model.pkl


In [53]:
scaler_path = "../models/kamis_scaler.pkl"
joblib.dump(scaler, scaler_path)
print(f"Scaler saved: {scaler_path}") 

Scaler saved: ../models/kamis_scaler.pkl


In [54]:
encoder_path = "../models/kamis_encoder.pkl"
joblib.dump(encoder, encoder_path)
print(f"Encoder saved: {encoder_path}")

Encoder saved: ../models/kamis_encoder.pkl


In [55]:
metadata = {
    'features': feature_list, 
    'model_type': 'RandomForest',
    'n_features': len(feature_list),
    'training_samples': len(X),
    'avg_mae': float(avg_mae),
    'avg_rmse': float(avg_rmse),
    'avg_r2': float(avg_r2),
    'trained_date': datetime.now().isoformat(),
    'sklearn_version': '1.3.2 or higher'
}
metadata_path = "../models/kamis_metadata.json"
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)
print(f"Metadata saved: {metadata_path}")

Metadata saved: ../models/kamis_metadata.json


In [56]:
print("\nSaving recent prices for API...")
 
recent_data = df.groupby(['Commodity', 'Market', 'County']).tail(30).copy()
 
api_columns = ['Date', 'Commodity', 'Market', 'County', 'Retail', 'Wholesale', 'Supply Volume']
recent_data = recent_data[api_columns]

recent_path = "../data/processed/recent_prices.csv"
recent_data.to_csv(recent_path, index=False)

print(f"Recent prices saved: {recent_path} (Records: {len(recent_data)})")


Saving recent prices for API...
Recent prices saved: ../data/processed/recent_prices.csv (Records: 120473)


In [57]:
print(f"\n{'='*60}")
print(f"TRAINING COMPLETE!")
print(f"\nModel Performance (7-Day Forecast):")
print(f"  MAE:  {avg_mae:.2f} KES")
print(f"  RÂ²:   {avg_r2:.4f}")
print(f"{'='*60}")


TRAINING COMPLETE!

Model Performance (7-Day Forecast):
  MAE:  43.03 KES
  RÂ²:   0.5598
