In [8]:
import pandas as pd
import numpy as np
import joblib
import json
import os
from sqlalchemy import create_engine
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from dotenv import load_dotenv

In [9]:
BASE_DIR = os.getcwd()    
ENV_PATH = os.path.join(BASE_DIR, '../../../.env')
load_dotenv(ENV_PATH)

True

In [10]:
DB_CONNECTION = os.getenv('DATABASE_URL')

if not DB_CONNECTION: 
    db_user = os.getenv('DB_USER', 'postgres')
    db_pass = os.getenv('DB_PASSWORD', 'password')
    db_host = os.getenv('DB_HOST', 'localhost')
    db_port = os.getenv('DB_PORT', '5432')
    db_name = os.getenv('DB_NAME', 'agri_price_db')
    DB_CONNECTION = f"postgresql://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}"

In [11]:
DATA_PATH = os.path.join(BASE_DIR, '../../data/processed/recent_prices.csv')
MODELS_DIR = os.path.join(BASE_DIR, '../models')
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(os.path.dirname(DATA_PATH), exist_ok=True)

In [12]:
def fetch_data_from_db():
    print("Connecting to Database...")
    try:
        engine = create_engine(DB_CONNECTION)
    except Exception as e:
        print(f"Database Connection Failed. Check your .env file or credentials.")
        raise e
    
    query = """
    SELECT 
        c.name as "Commodity",
        m.name as "Market",
        r.name as "County", -- Mapping Region to County for ML compatibility
        pe.price as "Retail",
        pe.entry_date as "Date"
    FROM price_entries pe
    JOIN crops c ON pe.crop_id = c.id
    JOIN markets m ON pe.market_id = m.id
    JOIN regions r ON pe.region_id = r.id
    WHERE pe.is_verified = true
    ORDER BY pe.entry_date DESC
    """
    
    print("Fetching latest verified prices...")
    df = pd.read_sql(query, engine)
    print(f"Loaded {len(df)} rows from Database.")
 
    df.to_csv(DATA_PATH, index=False)
    print(f"Saved updated dataset to: {DATA_PATH}")
    return df

In [13]:
def train_model(df):
    print("\nStarting Model Training...")
 
    df['Date'] = pd.to_datetime(df['Date'])
    df['month'] = df['Date'].dt.month
    df['year'] = df['Date'].dt.year
    df['week'] = df['Date'].dt.isocalendar().week
     
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    cat_cols = ['Commodity', 'Market', 'County']
    df[cat_cols] = encoder.fit_transform(df[cat_cols])
 
    X = df[['Commodity', 'Market', 'County', 'month', 'year', 'week']]
    y = df['Retail']
 
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
     
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
     
    model = HistGradientBoostingRegressor(random_state=42)
    model.fit(X_train, y_train)
     
    score = model.score(X_test, y_test)
    print(f"Model Accuracy (R2 Score): {score:.4f}")
     
    print("Saving model artifacts...")
    joblib.dump(model, os.path.join(MODELS_DIR, 'kamis_model.pkl'))
    joblib.dump(scaler, os.path.join(MODELS_DIR, 'kamis_scaler.pkl'))
    joblib.dump(encoder, os.path.join(MODELS_DIR, 'kamis_encoder.pkl'))
     
    metadata = {
        'model_type': 'HistGradientBoostingRegressor',
        'n_features': X.shape[1],
        'avg_r2': score,
        'last_trained': str(pd.Timestamp.now()),
        'features': ['Commodity', 'Market', 'County', 'month', 'year', 'week']
    }
    
    with open(os.path.join(MODELS_DIR, 'kamis_metadata.json'), 'w') as f:
        json.dump(metadata, f)
        
    print("Model Updated Successfully!")

In [14]:
if __name__ == "__main__":
    try:
        data = fetch_data_from_db()
        if not data.empty:
            train_model(data)
        else:
            print("No data found in database to train on.")
    except Exception as e:
        print(f"Training Failed: {e}")

Connecting to Database...
Fetching latest verified prices...
Loaded 405385 rows from Database.
Saved updated dataset to: C:\Users\user\Desktop\4th year projects\agri-price tracker\backend\src\ml-model-service\notebooks\../../data/processed/recent_prices.csv

Starting Model Training...
Model Accuracy (R2 Score): 0.2419
Saving model artifacts...
Model Updated Successfully!
