In [1]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [2]:
df = pd.read_csv("../data/raw/kamis_data.csv")

In [3]:
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date")

In [4]:
def clean_price_column(col):
    """Clean price columns by removing '/Kg' and converting to numeric"""
    if col.dtype == 'object':  
        col = col.replace('-', np.nan)
        col = col.str.replace('/Kg', '', regex=False)
        col = pd.to_numeric(col, errors='coerce')
    return col

In [5]:
price_columns = ['Wholesale', 'Retail']  
for col in price_columns:
    if col in df.columns:
        df[col] = clean_price_column(df[col])

In [6]:
if 'Supply Volume' in df.columns:
    df['Supply Volume'] = pd.to_numeric(df['Supply Volume'], errors='coerce')

In [7]:
df = df.dropna(subset=["Retail"])

In [8]:
categorical_cols = ["Commodity", "Market", "County"]  
df = pd.get_dummies(df, columns=[col for col in categorical_cols if col in df.columns], drop_first=True)

In [9]:
X = df.drop(columns=["Retail", "Date"])   
y = df["Retail"] 

In [10]:
for col in X.columns:
    if X[col].dtype == 'object':
        print(f"Warning: Column '{col}' is still object type. Converting to numeric.")
        X[col] = pd.to_numeric(X[col], errors='coerce')



In [11]:
X = X.fillna(X.mean())

In [12]:
print("Data preparation complete!")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"X data types:\n{X.dtypes}")
print(f"Sample of cleaned data:")
print(X.head())

Data preparation complete!
X shape: (253742, 461)
y shape: (253742,)
X data types:
Classification        float64
Grade                 float64
Sex                   float64
Wholesale             float64
Supply Volume         float64
                       ...   
County_Turkana          uint8
County_Uasin-Gishu      uint8
County_Vihiga           uint8
County_Wajir            uint8
County_West-Pokot       uint8
Length: 461, dtype: object
Sample of cleaned data:
        Classification  Grade  Sex  Wholesale  Supply Volume  ProductID  \
249862             NaN    NaN  NaN   25.00000    1500.000000        220   
249861             NaN    NaN  NaN  122.14909    8319.305407        220   
250190             NaN    NaN  NaN   19.50000    3000.000000        221   
249864             NaN    NaN  NaN  122.14909    8319.305407        220   
250164             NaN    NaN  NaN   17.00000     450.000000        221   

        ProductName  Commodity_Alestes  Commodity_Amaranthus (Terere)  \
249862      

In [13]:
# Time Series Split
tscv = TimeSeriesSplit(n_splits=5)
mae_scores, rmse_scores = [], []

for train_idx, test_idx in tscv.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    mae_scores.append(mae)
    rmse_scores.append(rmse)

print("Average MAE:", np.mean(mae_scores))
print("Average RMSE:", np.mean(rmse_scores))

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values