In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [7]:
 def clean_price_column_robust(col):
     """Convert a text price column such as '55.00/kg' or 'KSh 1,200' to numbers.

     Parameters
     ----------
     col : pandas.Series
         Raw price column. Only object / string dtypes are cleaned; any other
         dtype is returned unchanged.

     Returns
     -------
     pandas.Series
         Numeric series. Entries that cannot be parsed as a number after
         cleaning ('-', '', 'NA', 'N/A', ...) become NaN.

     Notes
     -----
     Every entry is converted to text before the string clean-up. Applying the
     ``.str`` methods directly to a mixed object column yields NaN for every
     non-string entry, which would silently discard prices that pandas had
     already parsed as numbers.
     """
     is_text = (
         pd.api.types.is_object_dtype(col.dtype)
         or pd.api.types.is_string_dtype(col.dtype)
     )
     if not is_text:
         return col

     # Lower-casing once makes the unit / currency removal case-insensitive.
     text = col.astype(str).str.lower()

     # Unit suffix, currency markers, embedded spaces, thousands separators.
     for token in ('/kg', 'ksh', 'kes', ' ', ','):
         text = text.str.replace(token, '', regex=False)

     # Placeholders such as '-' or 'n/a' are not numeric and coerce to NaN here.
     return pd.to_numeric(text, errors='coerce')

In [8]:
def prepare_data_with_proper_cleaning(path="../data/raw/kamis_data.csv"):
    """Load the price CSV, clean its price columns and drop rows without a retail price.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the path the notebook has always
        used, so existing zero-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by 'Date' with 'Retail' (and 'Wholesale' / 'Supply Volume'
        when those columns exist) converted to numeric, and every row whose
        'Retail' value is missing after cleaning removed.

    Notes
    -----
    Prints before/after samples, a row-count summary and basic price
    statistics as a side effect.
    """
    df = pd.read_csv(path)
    df["Date"] = pd.to_datetime(df["Date"])
    df = df.sort_values("Date").reset_index(drop=True)

    has_wholesale = 'Wholesale' in df.columns

    def show_samples(title):
        # Print the first few raw or cleaned prices under the given heading.
        print(title)
        print(f"Retail price sample: {df['Retail'].head().tolist()}")
        if has_wholesale:
            print(f"Wholesale price sample: {df['Wholesale'].head().tolist()}")

    show_samples("BEFORE CLEANING:")

    df['Retail'] = clean_price_column_robust(df['Retail'])
    if has_wholesale:
        df['Wholesale'] = clean_price_column_robust(df['Wholesale'])

    if 'Supply Volume' in df.columns:
        df['Supply Volume'] = pd.to_numeric(df['Supply Volume'], errors='coerce')

    show_samples("\nAFTER CLEANING:")

    # Retail is the prediction target, so rows without it are unusable.
    initial_count = len(df)
    df = df.dropna(subset=['Retail'])
    final_count = len(df)

    print("\nDATA CLEANING SUMMARY:")
    print(f"Initial rows: {initial_count:,}")
    print(f"After removing missing retail prices: {final_count:,}")
    print(f"Rows removed: {initial_count - final_count:,}")
    print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")

    retail = df['Retail']
    print("\nPRICE STATISTICS:")
    print(f"Retail - Mean: {retail.mean():.2f}, Std: {retail.std():.2f}")
    print(f"Retail - Min: {retail.min():.2f}, Max: {retail.max():.2f}")

    if has_wholesale:
        print(f"Wholesale - Mean: {df['Wholesale'].mean():.2f}, Std: {df['Wholesale'].std():.2f}")

    return df

In [9]:
def safe_groupby_transform(df, group_cols, column, operation, window=None, date_col='Date'):
    """Apply ``operation`` per group in date order and return a Series aligned to ``df.index``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain ``group_cols``, ``column`` and ``date_col``.
    group_cols : str or list of str
        Columns that define the groups.
    column : str
        Column the operation is applied to.
    operation : callable
        With ``window`` set, receives the rolling object built from the
        group's values shifted by one row (so the current row is excluded).
        Without ``window``, receives the group's values as a Series.
    window : int, optional
        Rolling window length. A falsy value (None or 0) disables rolling.
    date_col : str, optional
        Column used to order rows inside each group. Defaults to 'Date'.

    Returns
    -------
    pandas.Series
        One value per row of ``df``, indexed like ``df``. Rows that belong to
        no group (groupby skips NaN keys by default) are NaN.
    """
    original_index = df.index
    pieces = []

    for _, group in df.groupby(group_cols):
        # A stable sort keeps rows sharing a date in their incoming order, so
        # the "previous row" here matches the one a plain groupby().shift() sees.
        ordered = group.sort_values(date_col, kind='mergesort')
        values = ordered[column]

        if window:
            result = operation(values.shift(1).rolling(window=window, min_periods=1))
        else:
            result = operation(values)

        # Series results are re-labelled positionally; scalars are broadcast.
        payload = result.values if hasattr(result, 'values') else result
        pieces.append(pd.Series(payload, index=ordered.index))

    if not pieces:
        # Empty frame, or every group key was NaN: pd.concat([]) would raise.
        return pd.Series(index=original_index, dtype='float64')

    return pd.concat(pieces).reindex(original_index)

In [10]:
def create_future_safe_features(df, target_col='Retail'):
    """Build model features that use only information available before each row's date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Date' (datetime), 'Commodity', 'Market', 'County' and
        ``target_col``. 'Wholesale' and 'Supply Volume' are used when present.
    target_col : str, optional
        Column holding the price to be predicted. Defaults to 'Retail'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` sorted by Commodity / Market / County / Date with a
        fresh integer index and the feature columns appended.

    Notes
    -----
    ``market_retail_avg`` is the mean of ``target_col`` over all rows of the
    same market on strictly earlier dates. It is NaN on a market's first date.
    An expanding mean over the rows as sorted here would include the current
    row's own price and, because rows are ordered by commodity before date,
    prices from later dates as well.
    """
    df = df.copy()

    group_cols = ['Commodity', 'Market', 'County']

    # Sort so that, within each series, row order is chronological.
    df = df.sort_values(group_cols + ['Date']).reset_index(drop=True)

    # BASIC TEMPORAL FEATURES
    dates = df['Date'].dt
    df['year'] = dates.year
    df['month'] = dates.month
    # isocalendar() yields a nullable UInt32 column; cast to a plain NumPy
    # dtype (float only when missing dates force it) so NumPy-based dtype
    # checks and estimators handle it like the other calendar fields.
    iso_week = dates.isocalendar().week
    df['week_of_year'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    df['day_of_year'] = dates.dayofyear
    df['quarter'] = dates.quarter
    df['day_of_week'] = dates.dayofweek

    # KENYAN SEASONAL FEATURES
    month = df['month']
    df['is_long_rains'] = (month >= 3) & (month <= 5)
    df['is_short_rains'] = (month >= 10) & (month <= 12)
    df['is_planting_season'] = month.isin([3, 4, 10])
    df['is_harvest_season'] = month.isin([7, 8, 1, 2])

    # LAG FEATURES - ONLY PAST PRICES (row lags within each series)
    print("Creating lag features...")
    for lag in [1, 2, 3, 4]:
        df[f'retail_lag_{lag}'] = df.groupby(group_cols)[target_col].shift(lag)

    if 'Wholesale' in df.columns:
        for lag in [1, 2, 4]:
            df[f'wholesale_lag_{lag}'] = df.groupby(group_cols)['Wholesale'].shift(lag)

    # ROLLING STATISTICS over previous rows (the helper shifts by one first)
    print("Creating rolling features...")
    for window in [4, 8]:
        df[f'retail_roll_mean_{window}'] = safe_groupby_transform(
            df, group_cols, target_col, lambda x: x.mean(), window
        )
        df[f'retail_roll_std_{window}'] = safe_groupby_transform(
            df, group_cols, target_col, lambda x: x.std(), window
        )

    # PRICE MOMENTUM: change between the most recent and fourth most recent row
    print("Creating momentum features...")
    df['retail_trend_4w'] = df['retail_lag_1'] - df['retail_lag_4']

    # Supply volume features
    if 'Supply Volume' in df.columns:
        print("Creating supply features...")
        df['supply_lag_1'] = df.groupby(group_cols)['Supply Volume'].shift(1)
        df['supply_roll_mean_4'] = safe_groupby_transform(
            df, group_cols, 'Supply Volume', lambda x: x.mean(), 4
        )

    # MARKET-LEVEL FEATURES (strictly historical)
    print("Creating market features...")
    # Per market and date: total and number of observed prices.
    daily = (
        df.groupby(['Market', 'Date'])[target_col]
        .agg(['sum', 'count'])
        .sort_index()
    )
    # Running totals per market, shifted one date so the current date is excluded.
    running = daily.groupby(level='Market').cumsum()
    prior = running.groupby(level='Market').shift(1)
    prior_mean = (prior['sum'] / prior['count']).rename('market_retail_avg')
    df = (
        df.drop(columns=['market_retail_avg'], errors='ignore')
        .join(prior_mean, on=['Market', 'Date'])
    )

    return df

In [11]:
def main():
    """Run the full pipeline: load, filter, build features, train and report.

    Steps:
      1. Load and clean the data via ``prepare_data_with_proper_cleaning``.
      2. Keep retail prices between 1 and 5000 (inclusive).
      3. Build lag / rolling / calendar features and top-5 one-hot indicators
         for Commodity, Market and County.
      4. Drop rows missing the critical lag features, order the remainder by
         date and hold out the most recent 20% as the test set.
      5. Fill remaining gaps using statistics of the training rows only.
      6. Fit a RandomForestRegressor and print MAE, RMSE, feature importances
         and a few sample predictions.

    Returns None; all results are printed.
    """
    print("FUTURE RETAIL PRICE PREDICTION WITH PROPER PRICE CLEANING")
    print("=" * 60)

    # Load and prepare data with robust price cleaning
    df = prepare_data_with_proper_cleaning()

    if df.empty:
        print("No data available after cleaning!")
        return

    initial_count = len(df)
    df = df[(df['Retail'] >= 1) & (df['Retail'] <= 5000)]
    filtered_count = len(df)

    print(f"\nOUTLIER REMOVAL:")
    print(f"Before filtering: {initial_count:,} rows")
    print(f"After filtering (1-5000 KES): {filtered_count:,} rows")
    print(f"Rows removed: {initial_count - filtered_count:,}")

    if df.empty:
        print("No data available after cleaning!")
        return

    # Create future-safe features
    df = create_future_safe_features(df)

    feature_columns = [
        # Temporal features
        'year', 'month', 'week_of_year', 'day_of_year', 'quarter', 'day_of_week',
        'is_long_rains', 'is_short_rains', 'is_planting_season', 'is_harvest_season',

        # Lagged retail prices
        'retail_lag_1', 'retail_lag_2', 'retail_lag_3', 'retail_lag_4',

        # Rolling statistics
        'retail_roll_mean_4', 'retail_roll_mean_8',
        'retail_roll_std_4', 'retail_roll_std_8',
        'retail_trend_4w',

        # Market features
        'market_retail_avg'
    ]

    if 'Wholesale' in df.columns:
        feature_columns.extend([
            'wholesale_lag_1', 'wholesale_lag_2', 'wholesale_lag_4'
        ])

    if 'Supply Volume' in df.columns:
        feature_columns.extend(['supply_lag_1', 'supply_roll_mean_4'])

    print("Adding categorical features...")
    for col in ['Commodity', 'Market', 'County']:
        if col in df.columns:
            # One indicator column per category, for the 5 most frequent only.
            top_categories = df[col].value_counts().head(5).index
            for category in top_categories:
                # str() so non-string category labels do not break .replace().
                label = str(category).replace(' ', '_').replace('/', '_')
                feature_name = f"{col}_{label}"
                df[feature_name] = (df[col] == category).astype(int)
                feature_columns.append(feature_name)

    # Remove rows with missing critical features, then order by date: the
    # feature frame is sorted by commodity, so a positional split without
    # this re-sort would hold out the last commodities, not the latest dates.
    critical_features = ['retail_lag_1', 'retail_lag_2', 'month', 'year']
    df_clean = (
        df.dropna(subset=critical_features + ['Retail'])
        .sort_values('Date', kind='mergesort')
        .reset_index(drop=True)
    )

    print(f"\nFINAL DATASET:")
    print(f"Rows: {df_clean.shape[0]:,}")
    print(f"Features: {len(feature_columns)}")
    print(f"Date range: {df_clean['Date'].min()} to {df_clean['Date'].max()}")

    # Checking any remaining NaN values
    print(f"Remaining NaN values in features: {df_clean[feature_columns].isna().sum().sum()}")

    if len(df_clean) < 2:
        print("Not enough rows to train and evaluate a model!")
        return

    # Prepare data for modeling
    X = df_clean[feature_columns]
    y = df_clean['Retail']

    print(f"\nTRAINING MODEL...")
    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")

    # Chronological hold-out: first 80% of rows train, most recent 20% test.
    split_idx = int(0.8 * len(X))
    X_train, X_test = X.iloc[:split_idx].copy(), X.iloc[split_idx:].copy()
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    # Fill remaining gaps with training-set medians so that no statistic of
    # the held-out rows reaches the model.
    for col in feature_columns:
        if not (X_train[col].isna().any() or X_test[col].isna().any()):
            continue
        fill_value = 0
        if pd.api.types.is_numeric_dtype(X_train[col].dtype):
            median = X_train[col].median()
            if not pd.isna(median):
                fill_value = median
        X_train[col] = X_train[col].fillna(fill_value)
        X_test[col] = X_test[col].fillna(fill_value)

    model = RandomForestRegressor(
        n_estimators=50,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    )

    print("Fitting model...")
    model.fit(X_train, y_train)

    print("Making predictions...")
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print(f"\nMODEL PERFORMANCE:")
    print(f"MAE: {mae:.2f} KES")
    print(f"RMSE: {rmse:.2f} KES")
    print(f"Mean Retail Price: {y_train.mean():.2f} KES")
    print(f"MAE as % of mean: {(mae / y_train.mean() * 100):.1f}%")

    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': feature_columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print(f"\nTOP 10 FEATURE IMPORTANCE:")
    print(feature_importance.head(10).to_string(index=False))

    # Show some actual vs predicted examples
    print(f"\nSAMPLE PREDICTIONS (first 10):")
    results_sample = pd.DataFrame({
        'Actual': y_test.values[:10],
        'Predicted': y_pred[:10],
        'Error': y_test.values[:10] - y_pred[:10]
    })
    print(results_sample.round(2))

# Entry point: run the pipeline only when this code executes as the main program.
if __name__ == "__main__":
    main()

FUTURE RETAIL PRICE PREDICTION WITH PROPER PRICE CLEANING
BEFORE CLEANING:
Retail price sample: ['-', '-', '-', '-', '-']
Wholesale price sample: ['10.53/Kg', '17.78/Kg', '53.33/Kg', '50.00/Kg', '55.56/Kg']

AFTER CLEANING:
Retail price sample: [nan, nan, nan, nan, nan]
Wholesale price sample: [10.53, 17.78, 53.33, 50.0, 55.56]

DATA CLEANING SUMMARY:
Initial rows: 310,304
After removing missing retail prices: 256,614
Rows removed: 53,690
Date range: 2021-05-24 00:00:00 to 2025-09-17 00:00:00

💰 PRICE STATISTICS:
Retail - Mean: 170.71, Std: 392.57
Retail - Min: 0.01, Max: 100000.00
Wholesale - Mean: 121.80, Std: 236.72

OUTLIER REMOVAL:
Before filtering: 256,614 rows
After filtering (1-5000 KES): 256,167 rows
Rows removed: 447
Creating lag features...
Creating rolling features...
Creating momentum features...
Creating supply features...
Creating market features...
Adding categorical features...

📊 FINAL DATASET:
Rows: 240,061
Features: 40
Date range: 2021-05-24 00:00:00 to 2025-09-17 0