# Import Modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
from xgboost import XGBRegressor
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error
)
import joblib

# Helper Methods

In [2]:
# Save model using joblib
def save_model(model, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    joblib.dump(model, path)

In [3]:
# Save GridSearchCV full results as CSV
def save_grid_search_results(grid_search, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    results_df = pd.DataFrame(grid_search.cv_results_)
    results_df.to_csv(path, index=False)

In [4]:
# Save best parameters and best CV score to a text file
def save_training_report(grid_search, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as f:
        f.write(f"Best parameters: {grid_search.best_params_}\n")
        f.write(f"Best CV score (neg MSE): {grid_search.best_score_}\n")

In [5]:
# Append test metrics to the training report
def save_test_metrics(path, rmse, mae, mse, r2, mape):
    with open(path, "a") as f:
        f.write(f"RMSE: {rmse}\n")
        f.write(f"MAE: {mae}\n")
        f.write(f"MSE: {mse}\n")
        f.write(f"R2: {r2}\n")
        f.write(f"MAPE: {mape}\n")

# Load Features and Target Variable

In [6]:
# Define the input path
input_dir = "../data/final/"

# Load the datasets
X_train = pd.read_csv(os.path.join(input_dir, "training_set.csv"))
X_test = pd.read_csv(os.path.join(input_dir, "testing_set.csv"))

In [7]:
# Prepare features and target
X = X_train.drop(columns=['YIELD'])
y = X_train['YIELD']
X_test_features = X_test.drop(columns=['YIELD'])
y_test = X_test['YIELD']

# Random Forrest

In [None]:
# Expanded grid for deeper tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Grid Search CV setup
cv = KFold(n_splits=5, shuffle=True, random_state=42)
xgb = XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1)
grid_search = GridSearchCV(
    xgb,
    param_grid,
    cv=cv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

# Train model
grid_search.fit(X, y)

# Paths for saving
model_path = "../models/xgboost.joblib"
report_path = "../results/hyperparameter_tuning/xgboost_report.txt"
results_path = "../results/hyperparameter_tuning/xgboost_full_grid_results.csv"

# Save outputs
save_model(grid_search.best_estimator_, model_path)
save_training_report(grid_search, report_path)
save_grid_search_results(grid_search, results_path)

# Test set prediction
best_xgb = joblib.load(model_path)
y_pred = best_xgb.predict(X_test_features)

# Evaluate
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

# Save test metrics
save_test_metrics(report_path, rmse, mae, mse, r2, mape)

# Final printed summary
print("Best Parameters:", grid_search.best_params_)
print("RMSE:", rmse)
print("MAE:", mae)
print("MSE:", mse)
print("R2:", r2)
print("MAPE:", mape)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
RMSE: 1374.5123001688376
MAE: 1092.8440521740522
MSE: 1889284.0633154283
R2: 0.6511022976075148
MAPE: 0.2866016228814861


# XGBoost

In [None]:
# Expanded grid for deeper tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Grid Search CV setup
cv = KFold(n_splits=5, shuffle=True, random_state=42)
xgb = XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1)
grid_search = GridSearchCV(
    xgb,
    param_grid,
    cv=cv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

# Train model
grid_search.fit(X, y)

# Paths for saving
model_path = "../models/xgboost.joblib"
report_path = "../results/hyperparameter_tuning/xgboost_report.txt"
results_path = "../results/hyperparameter_tuning/xgboost_full_grid_results.csv"

# Save outputs
save_model(grid_search.best_estimator_, model_path)
save_training_report(grid_search, report_path)
save_grid_search_results(grid_search, results_path)

# Test set prediction
best_xgb = joblib.load(model_path)
y_pred = best_xgb.predict(X_test_features)

# Evaluate
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

# Save test metrics
save_test_metrics(report_path, rmse, mae, mse, r2, mape)

# Final printed summary
print("Best Parameters:", grid_search.best_params_)
print("RMSE:", rmse)
print("MAE:", mae)
print("MSE:", mse)
print("R2:", r2)
print("MAPE:", mape)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.3, 'max_depth': 9, 'n_estimators': 300, 'subsample': 1.0}
RMSE: 1023.0139476357417
MAE: 706.4961182841317
MSE: 1046557.5370572641
R2: 0.806730217445414
MAPE: 0.17099475613796045
