In [1]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [2]:
# Load the raw KAMIS CSV export; the path is relative to the notebook location.
RAW_DATA_PATH = "../data/raw/kamis_data.csv"
df = pd.read_csv(RAW_DATA_PATH)

In [3]:
# Parse the Date column into datetimes and order the rows by it (ascending),
# so that the later time-series split sees observations in chronological order.
df = (
    df
    .assign(Date=pd.to_datetime(df["Date"]))
    .sort_values(by="Date")
)

In [4]:
def clean_price_column(col):
    """Clean a price column by removing '/Kg' and converting to numeric.

    Parameters
    ----------
    col : pandas.Series
        Column to clean. Text columns (object dtype or pandas string dtype)
        are converted; any other dtype is returned unchanged.

    Returns
    -------
    pandas.Series
        For text columns, a numeric Series in which the '/Kg' suffix has been
        removed, '-' placeholders, blanks and unparseable values are NaN, and
        values that were already numbers are kept as they are. For non-text
        columns, the input Series itself.
    """
    is_textual = (
        pd.api.types.is_object_dtype(col.dtype)
        or isinstance(col.dtype, pd.StringDtype)
    )
    if not is_textual:
        return col

    def _strip_unit(value):
        # Only real strings are edited. Using the `.str` accessor here would
        # turn non-string elements of a mixed object column (e.g. a float
        # price that was parsed as a number) into NaN and lose valid data.
        if isinstance(value, str):
            text = value.replace('/Kg', '').strip()
            return np.nan if text in ('', '-') else text
        if value is None or value is pd.NA:
            return np.nan
        return value

    cleaned = col.astype(object).map(_strip_unit)
    return pd.to_numeric(cleaned, errors='coerce')

In [5]:
# Run the price cleaner over each price column that is actually present in
# the loaded frame; absent columns are skipped.
price_columns = ['Wholesale', 'Retail']
for col in filter(lambda name: name in df.columns, price_columns):
    df[col] = clean_price_column(df[col])

In [6]:
# Convert the supply volume to numbers when the column exists; entries that
# cannot be parsed become NaN.
volume_col = 'Supply Volume'
if volume_col in df.columns:
    df[volume_col] = pd.to_numeric(df[volume_col], errors='coerce')

In [7]:
# Keep only the rows that have a retail price, since Retail is the target.
df = df[df["Retail"].notna()]

In [8]:
# One-hot encode the categorical columns that exist in the frame. The first
# level of each column is dropped (drop_first=True).
categorical_cols = ["Commodity", "Market", "County"]
columns_to_encode = [name for name in categorical_cols if name in df.columns]
df = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)

In [9]:
# Target is the retail price; features are every remaining column except the
# target itself and the raw Date column.
target_name = "Retail"
y = df[target_name]
X = df.drop([target_name, "Date"], axis="columns")

In [10]:
# Any feature still stored as object dtype is coerced to numbers; values that
# cannot be parsed become NaN (all-NaN columns are handled further down).
object_cols = [name for name in X.columns if X[name].dtype == 'object']
for col in object_cols:
    print(f"Warning: Column '{col}' is still object type. Converting to numeric.")
    X[col] = pd.to_numeric(X[col], errors='coerce')



In [11]:
# List the feature columns in which every single value is missing.
print("Checking for problematic columns...")
all_nan_mask = X.isna().all()
nan_columns = X.columns[all_nan_mask].tolist()
print(f"Columns with all NaN values: {nan_columns}")

Checking for problematic columns...
Columns with all NaN values: ['Classification', 'Grade', 'Sex', 'ProductName']


In [12]:
# Remove the entirely-empty columns found above; nothing happens when the
# list is empty.
if len(nan_columns) > 0:
    X = X.drop(nan_columns, axis=1)
    print(f"Dropped columns: {nan_columns}")

Dropped columns: ['Classification', 'Grade', 'Sex', 'ProductName']


In [13]:
# Share of missing values per column, in percent, and the columns where more
# than half of the values are missing (reported only, not dropped here).
null_counts = X.isna().sum()
missing_percentage = null_counts.div(len(X)).mul(100)
high_missing_cols = missing_percentage.index[missing_percentage > 50].tolist()
print(f"Columns with >50% missing values: {high_missing_cols}")

Columns with >50% missing values: []


In [14]:
# Fill the remaining gaps column by column: numeric columns get their median,
# other columns get their most frequent value (or 0 when no mode exists).
# NOTE(review): the fill statistics are computed over the whole of X, before
# the time-series split below, so they include rows that later act as test
# folds - confirm this is acceptable for the evaluation.
for col in X.columns[X.isna().any()]:
    series = X[col]
    if np.issubdtype(series.dtype, np.number):
        fill_value = series.median()
    else:
        modes = series.mode()
        fill_value = modes[0] if len(modes) > 0 else 0
    X[col] = series.fillna(fill_value)

In [15]:
# Sanity check after imputation: total missing cells left and the frame size.
remaining_nans = X.isna().sum().sum()
cleaned_shape = X.shape
print(f"Remaining NaN values in X: {remaining_nans}")
print(f"X shape after cleaning: {cleaned_shape}")

Remaining NaN values in X: 0
X shape after cleaning: (253742, 457)


In [16]:
# Count the missing values in the target series.
target_nan_count = y.isna().sum()
print(f"NaN values in y: {target_nan_count}")

NaN values in y: 0


In [17]:
# If the target still contains missing values, drop those rows from both the
# features and the target so the two stay aligned.
if y.isna().sum() > 0:
    nan_mask = y.isna()
    rows_to_keep = ~nan_mask
    X = X.loc[rows_to_keep]
    y = y.loc[rows_to_keep]
    print(f"Removed {nan_mask.sum()} rows with NaN target values")

In [18]:
# Summary of the prepared data: shapes, per-column dtypes and the first rows.
print(
    "Data preparation complete!",
    f"X shape: {X.shape}",
    f"y shape: {y.shape}",
    f"X data types:\n{X.dtypes}",
    f"Sample of cleaned data:",
    X.head(),
    sep="\n",
)

Data preparation complete!
X shape: (253742, 457)
y shape: (253742,)
X data types:
Wholesale                        float64
Supply Volume                    float64
ProductID                          int64
Commodity_Alestes                  uint8
Commodity_Amaranthus (Terere)      uint8
                                  ...   
County_Turkana                     uint8
County_Uasin-Gishu                 uint8
County_Vihiga                      uint8
County_Wajir                       uint8
County_West-Pokot                  uint8
Length: 457, dtype: object
Sample of cleaned data:
        Wholesale  Supply Volume  ProductID  Commodity_Alestes  \
249862       25.0         1500.0        220                  0   
249861       80.0          500.0        220                  0   
250190       19.5         3000.0        221                  0   
249864       80.0          500.0        220                  0   
250164       17.0          450.0        221                  0   

        Commodity_

In [None]:
# Time Series Split
# Evaluate a linear regression with 5 ordered folds. The split is positional,
# so it relies on the rows of X / y already being in chronological order
# (the frame is sorted by Date earlier in the notebook).
tscv = TimeSeriesSplit(n_splits=5)
mae_scores, rmse_scores = [], []

for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):
    print(f"\n--- Fold {fold + 1} ---")

    # Split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Final check for NaN values. The row masks cover both features and
    # target, and the guard is driven by those same masks, so a fold whose
    # only missing values are in y is filtered too instead of failing later
    # in fit() or in the metric functions.
    train_nan_mask = X_train.isna().any(axis=1) | y_train.isna()
    test_nan_mask = X_test.isna().any(axis=1) | y_test.isna()
    if train_nan_mask.any() or test_nan_mask.any():
        print("Warning: NaN values detected in split data")

        X_train = X_train[~train_nan_mask]
        y_train = y_train[~train_nan_mask]
        X_test = X_test[~test_nan_mask]
        y_test = y_test[~test_nan_mask]

    print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

    # Train model (a fresh estimator per fold, default settings)
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict and evaluate on the held-out fold
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    mae_scores.append(mae)
    rmse_scores.append(rmse)

    print(f"Fold {fold + 1} - MAE: {mae:.2f}, RMSE: {rmse:.2f}")



--- Fold 1 ---
Training set size: 42292, Test set size: 42290
Fold 1 - MAE: 119.48, RMSE: 341.75

--- Fold 2 ---
Training set size: 84582, Test set size: 42290
Fold 2 - MAE: 88.18, RMSE: 228.79

--- Fold 3 ---
Training set size: 126872, Test set size: 42290
Fold 3 - MAE: 77.50, RMSE: 229.22

--- Fold 4 ---
Training set size: 169162, Test set size: 42290
Fold 4 - MAE: 49.20, RMSE: 280.40

--- Fold 5 ---
Training set size: 211452, Test set size: 42290


In [None]:
# Aggregate the per-fold scores collected by the cross-validation loop:
# mean and standard deviation of MAE and RMSE.
print("\n=== Final Results ===")
mean_mae, mean_rmse = np.mean(mae_scores), np.mean(rmse_scores)
std_mae, std_rmse = np.std(mae_scores), np.std(rmse_scores)
print(f"Average MAE: {mean_mae:.2f}")
print(f"Average RMSE: {mean_rmse:.2f}")
print(f"MAE Std: {std_mae:.2f}")
print(f"RMSE Std: {std_rmse:.2f}")