# Import Modules

In [None]:
# Standard library
import os

# Third-party
import joblib
import numpy as np
import pandas as pd

# Sklearn
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
    root_mean_squared_error,
)
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

# XGBoost
from xgboost import XGBRegressor


# Helper Methods

In [None]:
# Save model using joblib
def save_model(model, path):
    """Serialize ``model`` to ``path`` with joblib.

    The parent directory of ``path`` is created when it does not exist yet.

    Args:
        model: Object to persist (here: a fitted estimator).
        path: Destination file path.
    """
    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    joblib.dump(model, path)

In [None]:
# Save GridSearchCV full results as CSV
def save_grid_search_results(grid_search, path):
    """Write the complete ``cv_results_`` table of a search to ``path`` as CSV.

    The parent directory of ``path`` is created when it does not exist yet.

    Args:
        grid_search: Object exposing a ``cv_results_`` attribute that
            ``pandas.DataFrame`` can consume (e.g. a fitted ``GridSearchCV``).
        path: Destination CSV path.
    """
    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    pd.DataFrame(grid_search.cv_results_).to_csv(path, index=False)

In [None]:
# Save best parameters and best CV score to a text file
def save_training_report(grid_search, path):
    """Create (or overwrite) a text report with the search's best result.

    Two lines are written: ``best_params_`` and ``best_score_``. The score is
    labelled "neg MSE", which matches searches configured with
    ``scoring='neg_mean_squared_error'``.

    Args:
        grid_search: Object exposing ``best_params_`` and ``best_score_``.
        path: Destination text file; an existing file is truncated.
    """
    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(f"Best parameters: {grid_search.best_params_}\n")
        f.write(f"Best CV score (neg MSE): {grid_search.best_score_}\n")

In [None]:
# Append test metrics to the training report
def save_test_metrics(path, rmse, mae, mse, r2, mape):
    """Append the five test-set metrics to the report at ``path``.

    The file is opened in append mode, so existing content is kept and the
    file is created when missing. The parent directory is created as well, so
    the function also works when no training report was written beforehand.

    Args:
        path: Report file to append to.
        rmse: Root mean squared error.
        mae: Mean absolute error.
        mse: Mean squared error.
        r2: Coefficient of determination.
        mape: Mean absolute percentage error.
    """
    # Append mode creates a missing file but not a missing directory.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "a", encoding="utf-8") as f:
        f.writelines([
            f"RMSE: {rmse}\n",
            f"MAE: {mae}\n",
            f"MSE: {mse}\n",
            f"R2: {r2}\n",
            f"MAPE: {mape}\n",
        ])

In [None]:
# Safe MAPE (ignores zero targets)
def _safe_mape(y_true, y_hat):
    """Return the mean absolute percentage error over non-zero targets.

    Samples whose true value is 0 are excluded to avoid dividing by zero.
    Returns ``np.nan`` when no non-zero target remains.
    """
    y_true = np.asarray(y_true)
    y_hat = np.asarray(y_hat)
    nonzero = y_true != 0
    if not nonzero.any():
        return np.nan
    return float(np.mean(np.abs((y_true[nonzero] - y_hat[nonzero]) / y_true[nonzero])))


# Compute per-crop metrics and save as CSV
def save_per_crop_metrics_csv(
    report_path,
    X_test_features,
    y_test,
    y_pred,
    flag_cols,
    output_csv_path=None
):
    """Compute test metrics separately for each crop and write them to a CSV.

    A sample belongs to a crop when its flag column equals 1. Crops whose flag
    column is missing, or that have no test samples, are skipped with a
    printed message.

    Args:
        report_path: Path of the model's text report, or ``None``. Only used
            to derive the CSV path when ``output_csv_path`` is not given.
        X_test_features: DataFrame containing the crop flag columns.
        y_test: True targets, aligned positionally with ``X_test_features``.
        y_pred: Predicted targets, aligned positionally with ``y_test``.
        flag_cols: Mapping of crop display name -> flag column name.
        output_csv_path: Destination CSV. When ``None`` it is derived from
            ``report_path`` by swapping "_report.txt" for
            "_per_crop_metrics.csv", falling back to "per_crop_metrics.csv"
            when ``report_path`` is ``None`` too.

    Returns:
        DataFrame with columns Crop, N_test, RMSE, MAE, MSE, R2, MAPE sorted
        by RMSE ascending. It is empty (but keeps those columns) when no crop
        produced a row.
    """
    columns = ["Crop", "N_test", "RMSE", "MAE", "MSE", "R2", "MAPE"]

    # Derive default path from report_path if not provided
    if output_csv_path is None:
        if report_path is None:
            output_csv_path = "per_crop_metrics.csv"
        else:
            output_csv_path = report_path.replace("_report.txt", "_per_crop_metrics.csv")
            if output_csv_path == report_path:
                # Nothing was replaced: writing here would overwrite the
                # report itself, so build a sibling file name instead.
                output_csv_path = os.path.splitext(report_path)[0] + "_per_crop_metrics.csv"

    # Plain arrays so the boolean masks below select by position regardless
    # of the input container type or index.
    y_true_all = np.asarray(y_test)
    y_pred_all = np.asarray(y_pred)

    rows = []
    for crop, col in flag_cols.items():
        if col not in X_test_features.columns:
            print(f"[warn] Column '{col}' not found; skipping {crop}.")
            continue

        mask = (X_test_features[col] == 1).to_numpy()
        n = int(mask.sum())
        if n == 0:
            print(f"[info] No test samples for {crop}; skipping.")
            continue

        y_true_c = y_true_all[mask]
        y_pred_c = y_pred_all[mask]

        rows.append({
            "Crop": crop,
            "N_test": n,
            "RMSE": root_mean_squared_error(y_true_c, y_pred_c),
            "MAE": mean_absolute_error(y_true_c, y_pred_c),
            "MSE": mean_squared_error(y_true_c, y_pred_c),
            # R2 is not reported for a single sample
            "R2": r2_score(y_true_c, y_pred_c) if n > 1 else np.nan,
            "MAPE": _safe_mape(y_true_c, y_pred_c),
        })

    # Explicit columns keep sort_values("RMSE") valid when `rows` is empty.
    per_crop_df = pd.DataFrame(rows, columns=columns).sort_values("RMSE")

    parent_dir = os.path.dirname(output_csv_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    per_crop_df.to_csv(output_csv_path, index=False)
    return per_crop_df

# Load Features and Target Variable

In [None]:
# Folder holding the final train/test CSV splits
input_dir = "../data/final/"

# Read both splits, training first (each frame still carries the YIELD column)
X_train, X_test = (
    pd.read_csv(os.path.join(input_dir, csv_name))
    for csv_name in ("training_set.csv", "testing_set.csv")
)

In [None]:
# Separate the YIELD target from the predictors in each split
y = X_train['YIELD']
X = X_train.drop('YIELD', axis=1)
y_test = X_test['YIELD']
X_test_features = X_test.drop('YIELD', axis=1)

# Random Forest

In [None]:
# Search space explored for the random forest
param_grid = dict(
    n_estimators=[100, 200, 300, 400],              # how many trees to grow
    max_depth=[10, 20, 30, None],                   # depth cap per tree (None = fully expanded)
    min_samples_split=[2, 5, 10],                   # samples needed before an internal node may split
    min_samples_leaf=[1, 2, 4],                     # samples every leaf must keep
    max_features=['sqrt', 'log2'],                  # features considered at each split
    criterion=['squared_error', 'absolute_error'],  # split-quality measure (MSE vs MAE)
)

# Where this model's artifacts are written
model_path = "../models/random_forrest.joblib"
report_path = "../results/random_forrest/random_forrest_report.txt"
results_path = "../results/random_forrest/random_forrest_full_grid_results.csv"

# Exhaustive search scored by negative MSE on 3 shuffled folds
cv = KFold(n_splits=3, shuffle=True, random_state=42)
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X, y)

# Persist the best estimator, the short report and the full CV table
save_model(grid_search.best_estimator_, model_path)
save_training_report(grid_search, report_path)
save_grid_search_results(grid_search, results_path)

# Predict on the test set with the model as reloaded from disk
best_rf = joblib.load(model_path)
y_pred = best_rf.predict(X_test_features)

# Test-set metrics
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Append overall metrics to the report, then export the per-crop breakdown
save_test_metrics(report_path, rmse, mae, mse, r2, mape)
flag_cols = dict(Barley="ITEM_Barley", Wheat="ITEM_Wheat", Maize="ITEM_Maize")
save_per_crop_metrics_csv(
    report_path,
    X_test_features,
    y_test,
    y_pred,
    flag_cols,
    report_path.replace("_report.txt", "_per_crop_metrics.csv"),
)

# Console summary (one line per entry)
print(
    f"Best Parameters: {grid_search.best_params_!s}",
    f"RMSE: {rmse!s}",
    f"MAE: {mae!s}",
    f"MSE: {mse!s}",
    f"R2: {r2!s}",
    f"MAPE: {mape!s}",
    sep="\n",
)

# Gradient Boosting

In [None]:
# Search space explored for gradient boosting
param_grid = dict(
    n_estimators=[100, 200, 300, 400],      # boosting stages
    learning_rate=[0.05, 0.1, 0.2, 0.3],    # shrinkage applied to each tree
    max_depth=[3, 5, 7, 9],                 # depth of the individual trees
    subsample=[0.6, 0.7, 0.8, 1.0],         # row fraction sampled per tree
    max_features=[None, 'sqrt'],            # features considered per split
)

# Where this model's artifacts are written
model_path = "../models/gradient_boosting.joblib"
report_path = "../results/gradient_boosting/gradient_boosting_report.txt"
results_path = "../results/gradient_boosting/gradient_boosting_full_grid_results.csv"

# Exhaustive search scored by negative MSE on 3 shuffled folds
cv = KFold(n_splits=3, shuffle=True, random_state=42)
gbr = GradientBoostingRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X, y)

# Persist the best estimator, the short report and the full CV table
save_model(grid_search.best_estimator_, model_path)
save_training_report(grid_search, report_path)
save_grid_search_results(grid_search, results_path)

# Predict on the test set with the model as reloaded from disk
best_gbr = joblib.load(model_path)
y_pred = best_gbr.predict(X_test_features)

# Test-set metrics
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Append overall metrics to the report, then export the per-crop breakdown
save_test_metrics(report_path, rmse, mae, mse, r2, mape)
flag_cols = dict(Barley="ITEM_Barley", Wheat="ITEM_Wheat", Maize="ITEM_Maize")
save_per_crop_metrics_csv(
    report_path,
    X_test_features,
    y_test,
    y_pred,
    flag_cols,
    report_path.replace("_report.txt", "_per_crop_metrics.csv"),
)

# Console summary (one line per entry)
print(
    f"Best Parameters: {grid_search.best_params_!s}",
    f"RMSE: {rmse!s}",
    f"MAE: {mae!s}",
    f"MSE: {mse!s}",
    f"R2: {r2!s}",
    f"MAPE: {mape!s}",
    sep="\n",
)

# XGBoost

In [None]:
# Expanded grid for deeper tuning
param_grid = {
    'n_estimators': [100, 200, 300, 400],        # Number of trees
    'max_depth': [3, 5, 6, 7, 9],                # Tree depth
    'learning_rate': [0.05, 0.1, 0.2, 0.3],      # Step size
    'subsample': [0.6, 0.8, 1.0],                # Row sampling per tree
    'colsample_bytree': [0.8, 1.0]               # Feature sampling per tree
}

# Grid Search CV setup
cv = KFold(n_splits=3, shuffle=True, random_state=42)
# Parallelism is left to GridSearchCV (n_jobs=-1 below). The estimator itself
# stays single-threaded: also giving every fit all cores makes the search
# workers compete for the same threads and slows the whole search down.
xgb = XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=1)
grid_search = GridSearchCV(
    xgb,
    param_grid,
    cv=cv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

# Train model
grid_search.fit(X, y)

# Paths for saving
model_path = "../models/xgboost.joblib"
report_path = "../results/xgboost/xgboost_report.txt"
results_path = "../results/xgboost/xgboost_full_grid_results.csv"

# Save outputs (best estimator, short report, full CV table)
save_model(grid_search.best_estimator_, model_path)
save_training_report(grid_search, report_path)
save_grid_search_results(grid_search, results_path)

# Test set prediction, using the model as reloaded from disk
best_xgb = joblib.load(model_path)
y_pred = best_xgb.predict(X_test_features)

# Evaluate
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

# Save test metrics (appended to the training report)
save_test_metrics(report_path, rmse, mae, mse, r2, mape)

# Per-crop CSV export
flag_cols = {"Barley": "ITEM_Barley", "Wheat": "ITEM_Wheat", "Maize": "ITEM_Maize"}
save_per_crop_metrics_csv(
    report_path=report_path,
    X_test_features=X_test_features,
    y_test=y_test,
    y_pred=y_pred,
    flag_cols=flag_cols,
    output_csv_path=report_path.replace("_report.txt", "_per_crop_metrics.csv")
)

# Final printed summary
print("Best Parameters:", grid_search.best_params_)
print("RMSE:", rmse)
print("MAE:", mae)
print("MSE:", mse)
print("R2:", r2)
print("MAPE:", mape)

# Support Vector Regression

In [None]:
# Expanded grid for deeper tuning.
# The grid is split per kernel: `gamma` only affects the RBF kernel, so
# crossing it with kernel='linear' would refit every linear configuration
# three times with identical results.
param_grid = [
    {
        'kernel': ['rbf'],                             # Nonlinear kernel
        'C': [0.1, 1, 10, 100, 1000, 10000],           # Regularization strength
        'epsilon': [0.01, 0.05, 0.1, 0.2, 0.5, 1.0],   # Epsilon-insensitive margin width
        'gamma': ['scale', 'auto', 0.1],               # Kernel coefficient
        'shrinking': [True, False],                    # Use of shrinking heuristic
    },
    {
        'kernel': ['linear'],                          # Linear baseline (gamma unused)
        'C': [0.1, 1, 10, 100, 1000, 10000],           # Regularization strength
        'epsilon': [0.01, 0.05, 0.1, 0.2, 0.5, 1.0],   # Epsilon-insensitive margin width
        'shrinking': [True, False],                    # Use of shrinking heuristic
    },
]

# Grid Search CV setup
cv = KFold(n_splits=3, shuffle=True, random_state=42)
# NOTE(review): SVR is sensitive to feature scale and no scaler is applied in
# this cell; presumably the CSVs are already scaled upstream. Confirm.
svr = SVR()
grid_search = GridSearchCV(
    svr,
    param_grid,
    cv=cv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

# Train model
grid_search.fit(X, y)

# Paths for saving
model_path = "../models/svr.joblib"
report_path = "../results/svr/svr_report.txt"
results_path = "../results/svr/svr_full_grid_results.csv"

# Save outputs (best estimator, short report, full CV table)
save_model(grid_search.best_estimator_, model_path)
save_training_report(grid_search, report_path)
save_grid_search_results(grid_search, results_path)

# Test set prediction, using the model as reloaded from disk
best_svr = joblib.load(model_path)
y_pred = best_svr.predict(X_test_features)

# Evaluate
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

# Save test metrics (appended to the training report)
save_test_metrics(report_path, rmse, mae, mse, r2, mape)

# Per-crop CSV export
flag_cols = {"Barley": "ITEM_Barley", "Wheat": "ITEM_Wheat", "Maize": "ITEM_Maize"}
save_per_crop_metrics_csv(
    report_path=report_path,
    X_test_features=X_test_features,
    y_test=y_test,
    y_pred=y_pred,
    flag_cols=flag_cols,
    output_csv_path=report_path.replace("_report.txt", "_per_crop_metrics.csv")
)

# Final printed summary
print("Best Parameters:", grid_search.best_params_)
print("RMSE:", rmse)
print("MAE:", mae)
print("MSE:", mse)
print("R2:", r2)
print("MAPE:", mape)

# Linear Regression

In [None]:
# Train model (plain least-squares baseline, no hyperparameter search)
linreg = LinearRegression()
linreg.fit(X, y)

# Paths for saving
model_path = "../models/linear_regression_baseline.joblib"
report_path = "../results/linear_regression/linear_regression_baseline_report.txt"

# Save model
save_model(linreg, model_path)

# This section writes no training report, so nothing else creates the results
# folder or resets the report file. Create the folder and start from an empty
# report; otherwise a fresh checkout fails on the first write and every rerun
# appends another copy of the metrics.
os.makedirs(os.path.dirname(report_path), exist_ok=True)
with open(report_path, "w"):
    pass

# Test set prediction
y_pred = linreg.predict(X_test_features)

# Evaluate
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

# Save test metrics
save_test_metrics(report_path, rmse, mae, mse, r2, mape)

# Per-crop CSV export
flag_cols = {"Barley": "ITEM_Barley", "Wheat": "ITEM_Wheat", "Maize": "ITEM_Maize"}
save_per_crop_metrics_csv(
    report_path=report_path,
    X_test_features=X_test_features,
    y_test=y_test,
    y_pred=y_pred,
    flag_cols=flag_cols,
    output_csv_path=report_path.replace("_report.txt", "_per_crop_metrics.csv")
)

# Final printed summary
print("RMSE:", rmse)
print("MAE:", mae)
print("MSE:", mse)
print("R2:", r2)
print("MAPE:", mape)

# MLP

In [None]:
# Search space for the MLP: one sub-grid per solver, because
# learning_rate_init / n_iter_no_change are only listed for adam
param_grid = [
    dict(
        solver=['adam'],
        hidden_layer_sizes=[(64,), (128,), (256,), (64, 64), (128, 64)],
        activation=['relu', 'tanh'],
        alpha=[1e-5, 1e-4, 1e-3],
        learning_rate_init=[1e-3, 5e-4],
        max_iter=[300, 500],
        tol=[1e-3, 1e-4],
        n_iter_no_change=[10],
    ),
    dict(
        solver=['lbfgs'],
        hidden_layer_sizes=[
            (64,), (100,), (128,), (256,), (64, 64), (128, 64), (128, 128)
        ],
        activation=['relu', 'tanh'],
        alpha=[1e-5, 1e-4, 1e-3],
        max_iter=[300, 500, 800],
        tol=[1e-3, 1e-4],
    ),
]

# Where this model's artifacts are written
model_path = "../models/mlp.joblib"
report_path = "../results/mlp/mlp_report.txt"
results_path = "../results/mlp/mlp_full_grid_results.csv"

# Exhaustive search scored by negative MSE on 3 shuffled folds
cv = KFold(n_splits=3, shuffle=True, random_state=42)
mlp = MLPRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X, y)

# Persist the best estimator, the short report and the full CV table
save_model(grid_search.best_estimator_, model_path)
save_training_report(grid_search, report_path)
save_grid_search_results(grid_search, results_path)

# Predict on the test set with the model as reloaded from disk
best_mlp = joblib.load(model_path)
y_pred = best_mlp.predict(X_test_features)

# Test-set metrics
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Append overall metrics to the report, then export the per-crop breakdown
save_test_metrics(report_path, rmse, mae, mse, r2, mape)
flag_cols = dict(Barley="ITEM_Barley", Wheat="ITEM_Wheat", Maize="ITEM_Maize")
save_per_crop_metrics_csv(
    report_path,
    X_test_features,
    y_test,
    y_pred,
    flag_cols,
    report_path.replace("_report.txt", "_per_crop_metrics.csv"),
)

# Console summary (one line per entry)
print(
    f"Best Parameters: {grid_search.best_params_!s}",
    f"RMSE: {rmse!s}",
    f"MAE: {mae!s}",
    f"MSE: {mse!s}",
    f"R2: {r2!s}",
    f"MAPE: {mape!s}",
    sep="\n",
)