# Import Modules

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from IPython.display import display
from sklearn.model_selection import train_test_split
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import (
    root_mean_squared_error,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    mean_absolute_percentage_error
)
import joblib

# Helper Methods

In [6]:
# Save model using joblib
def save_model(model, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    joblib.dump(model, path)

In [7]:
# Save GridSearchCV full results as CSV
def save_grid_search_results(grid_search, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    results_df = pd.DataFrame(grid_search.cv_results_)
    results_df.to_csv(path, index=False)

In [8]:
# Save best parameters and best CV score to a text file
def save_training_report(grid_search, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as f:
        f.write(f"Best parameters: {grid_search.best_params_}\n")
        f.write(f"Best CV score (neg MSE): {grid_search.best_score_}\n")

In [9]:
# Append test metrics to the training report
def save_test_metrics(path, rmse, mae, mse, r2, mape):
    with open(path, "a") as f:
        f.write(f"Test RMSE: {rmse}\n")
        f.write(f"Test MAE: {mae}\n")
        f.write(f"Test MSE: {mse}\n")
        f.write(f"Test R2: {r2}\n")
        f.write(f"Test MAPE: {mape}\n")

# Load Dataset

In [10]:
# Define the path to the processed dataset
file_path = '../data/processed/merged_dataset.csv'

In [11]:
# Load the dataset
df = pd.read_csv(file_path)

In [12]:
# Drop unnecessary columns
# These columns are not needed for modeling and can be removed to simplify the dataset
df = df.drop(columns=[
    'WB_CCKP_HD40', 'WB_CCKP_HD42', 'WB_CCKP_HI37', 'WB_CCKP_HI39', 'WB_CCKP_HI41',
    'AREA', 'YEAR', 'AREA_HARVESTED', 'PRODUCTION_QUANTITY'
])

In [13]:
# Rename 'Maize (corn)' to 'Maize' in the ITEM column
df['ITEM'] = df['ITEM'].replace('Maize (corn)', 'Maize')

# Type Casting

In [14]:
# Display the data types of each column in the dataframe
print(df.dtypes)

ITEM                object
YIELD              float64
WB_CCKP_CDD        float64
WB_CCKP_CDD65      float64
WB_CCKP_CSDI       float64
WB_CCKP_CWD        float64
WB_CCKP_FD         float64
WB_CCKP_HD30       float64
WB_CCKP_HD35       float64
WB_CCKP_HD45       float64
WB_CCKP_HD50       float64
WB_CCKP_HDD65      float64
WB_CCKP_HI35       float64
WB_CCKP_HURS       float64
WB_CCKP_ID         float64
WB_CCKP_PR         float64
WB_CCKP_R20MM      float64
WB_CCKP_R50MM      float64
WB_CCKP_R95PTOT    float64
WB_CCKP_RX1DAY     float64
WB_CCKP_RX5DAY     float64
WB_CCKP_SD         float64
WB_CCKP_TAS        float64
WB_CCKP_TASMAX     float64
WB_CCKP_TASMIN     float64
WB_CCKP_TNN        float64
WB_CCKP_TR         float64
WB_CCKP_TR23       float64
WB_CCKP_TR26       float64
WB_CCKP_TR29       float64
WB_CCKP_TR32       float64
WB_CCKP_TX84RR     float64
WB_CCKP_TXX        float64
WB_CCKP_WSDI       float64
dtype: object


In [15]:
# Check for non numeric columns (only non numeric should be 'ITEM')
non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
if len(non_numeric_cols) == 0:
    print("All columns are numerical.")
else:
    print("The following columns are not numerical:", list(non_numeric_cols))

The following columns are not numerical: ['ITEM']


# Missing Value Handling

In [16]:
# Check for missing values in the dataframe
missing_values = df.isnull().sum()
if missing_values.sum() == 0:
    print("No missing values found in the dataframe.")
else:
    print("Columns with missing values:")
    print(missing_values[missing_values > 0])

No missing values found in the dataframe.


# Outlier Detection

In [17]:
# Detect outliers in each numeric column using the IQR method
outlier_info = {}
total_outliers = 0
n_rows = len(df)

for col in df.select_dtypes(include=[np.number]).columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    outliers = ((df[col] < lower) | (df[col] > upper))
    n_outliers = outliers.sum()
    if n_outliers > 0:
        outlier_info[col] = n_outliers
        total_outliers += n_outliers

print(f"Total outliers: {total_outliers}")
print("Outliers per column:")
for col, count in outlier_info.items():
    print(f"  {col}: {count} ({count/n_rows*100:.2f}%)")
print(f"Percentage of outliers (all columns, not unique rows): {total_outliers/(n_rows*len(df.select_dtypes(include=[np.number]).columns))*100:.2f}%")

Total outliers: 4772
Outliers per column:
  YIELD: 32 (1.05%)
  WB_CCKP_CDD: 286 (9.39%)
  WB_CCKP_CDD65: 53 (1.74%)
  WB_CCKP_CSDI: 167 (5.48%)
  WB_CCKP_CWD: 100 (3.28%)
  WB_CCKP_FD: 93 (3.05%)
  WB_CCKP_HD30: 555 (18.22%)
  WB_CCKP_HD35: 60 (1.97%)
  WB_CCKP_HD45: 18 (0.59%)
  WB_CCKP_HDD65: 229 (7.52%)
  WB_CCKP_HI35: 110 (3.61%)
  WB_CCKP_HURS: 15 (0.49%)
  WB_CCKP_ID: 93 (3.05%)
  WB_CCKP_PR: 113 (3.71%)
  WB_CCKP_R20MM: 96 (3.15%)
  WB_CCKP_R50MM: 346 (11.36%)
  WB_CCKP_R95PTOT: 87 (2.86%)
  WB_CCKP_RX1DAY: 48 (1.58%)
  WB_CCKP_RX5DAY: 66 (2.17%)
  WB_CCKP_SD: 48 (1.58%)
  WB_CCKP_TAS: 173 (5.68%)
  WB_CCKP_TASMAX: 173 (5.68%)
  WB_CCKP_TASMIN: 173 (5.68%)
  WB_CCKP_TNN: 133 (4.37%)
  WB_CCKP_TR: 48 (1.58%)
  WB_CCKP_TR23: 59 (1.94%)
  WB_CCKP_TR26: 235 (7.72%)
  WB_CCKP_TR29: 498 (16.35%)
  WB_CCKP_TR32: 537 (17.63%)
  WB_CCKP_TX84RR: 35 (1.15%)
  WB_CCKP_TXX: 30 (0.98%)
  WB_CCKP_WSDI: 63 (2.07%)
Percentage of outliers (all columns, not unique rows): 4.75%


# Duplicate Removal

In [18]:
# Count duplicate rows in the dataframe
n_duplicates = df.duplicated().sum()
pct_duplicates = (n_duplicates / n_rows) * 100

print(f"Number of duplicate rows: {n_duplicates}")
print(f"Percentage of duplicate rows: {pct_duplicates:.2f}%")

# Remove duplicate rows
df = df.drop_duplicates()

Number of duplicate rows: 0
Percentage of duplicate rows: 0.00%


# One-Hot Encoding

In [19]:
# One-hot encode the 'ITEM' column for modeling
df_encoded = pd.get_dummies(df, columns=['ITEM'], prefix='ITEM')
display(df_encoded.head())

Unnamed: 0,YIELD,WB_CCKP_CDD,WB_CCKP_CDD65,WB_CCKP_CSDI,WB_CCKP_CWD,WB_CCKP_FD,WB_CCKP_HD30,WB_CCKP_HD35,WB_CCKP_HD45,WB_CCKP_HD50,...,WB_CCKP_TR23,WB_CCKP_TR26,WB_CCKP_TR29,WB_CCKP_TR32,WB_CCKP_TX84RR,WB_CCKP_TXX,WB_CCKP_WSDI,ITEM_Barley,ITEM_Maize,ITEM_Wheat
0,1000.0,27.05,548.49,5.6,14.53,20.12,0.0,0.0,0.0,0.0,...,25.22,2.65,0.02,0.0,0.05,24.78,21.31,True,False,False
1,923.1,23.78,492.26,33.37,9.44,33.1,0.0,0.0,0.0,0.0,...,21.68,2.48,0.0,0.0,0.03,24.22,0.38,True,False,False
2,1380.2,29.91,579.35,7.62,10.14,30.34,0.0,0.0,0.0,0.0,...,27.46,5.56,0.11,0.0,0.08,25.25,0.0,True,False,False
3,1332.4,24.83,629.48,7.81,9.02,32.41,0.02,0.0,0.0,0.0,...,30.26,6.19,0.26,0.0,0.1,26.22,6.07,True,False,False
4,2352.3,24.24,810.85,0.0,11.43,14.55,0.32,0.0,0.0,0.0,...,42.15,11.38,0.88,0.08,0.15,27.95,9.24,True,False,False


# Scale Numeric Features

In [20]:
# Select only numeric columns for scaling
numeric_cols = df_encoded.select_dtypes(include=[np.number]).columns

scaler = StandardScaler()
df_encoded[numeric_cols] = scaler.fit_transform(df_encoded[numeric_cols])

# Remove 'YIELD' from the columns to be scaled
features_to_scale = numeric_cols.drop('YIELD')

# Scale only the feature columns, not the target
df_encoded[features_to_scale] = scaler.fit_transform(df_encoded[features_to_scale])

display(df_encoded.head())

Unnamed: 0,YIELD,WB_CCKP_CDD,WB_CCKP_CDD65,WB_CCKP_CSDI,WB_CCKP_CWD,WB_CCKP_FD,WB_CCKP_HD30,WB_CCKP_HD35,WB_CCKP_HD45,WB_CCKP_HD50,...,WB_CCKP_TR23,WB_CCKP_TR26,WB_CCKP_TR29,WB_CCKP_TR32,WB_CCKP_TX84RR,WB_CCKP_TXX,WB_CCKP_WSDI,ITEM_Barley,ITEM_Maize,ITEM_Wheat
0,-1.648367,0.254177,0.349305,-0.142392,1.628061,-0.707982,-0.367223,-0.080685,-0.056228,0.0,...,0.203528,-0.379913,-0.438591,-0.221775,-1.382162,-0.160958,1.507791,True,False,False
1,-1.680949,0.014224,0.194211,3.532471,-0.322191,-0.407172,-0.367223,-0.080685,-0.056228,0.0,...,0.043177,-0.400767,-0.451262,-0.221775,-1.590963,-0.354197,-1.05065,True,False,False
2,-1.487278,0.464043,0.434423,0.124919,-0.053984,-0.471135,-0.367223,-0.080685,-0.056228,0.0,...,0.304993,-0.022941,-0.381574,-0.221775,-1.068959,0.001226,-1.097101,True,False,False
3,-1.507531,0.091273,0.572692,0.150062,-0.483116,-0.423163,-0.342747,-0.080685,-0.056228,0.0,...,0.431824,0.054342,-0.286545,-0.221775,-0.860158,0.335945,-0.355116,True,False,False
4,-1.075405,0.047979,1.072949,-0.883452,0.440285,-0.837066,0.024392,-0.080685,-0.056228,0.0,...,0.970404,0.691003,0.106242,0.242744,-0.338154,0.932917,0.032378,True,False,False


# Split Data (Holdout Method)

In [21]:
# Stratify by the item columns to maintain balance
item_col = ['ITEM_Barley', 'ITEM_Maize', 'ITEM_Wheat']
# Find the item label for each row
item_labels = df_encoded[item_col].idxmax(axis=1)

X_train, X_test = train_test_split(
    df_encoded,
    test_size=0.2,
    random_state=42,
    stratify=item_labels
)

# Count items in each set
train_counts = X_train[item_col].sum()
test_counts = X_test[item_col].sum()

print("Training set item counts:")
print(train_counts)
print("\nTesting set item counts:")
print(test_counts)

Training set item counts:
ITEM_Barley    884
ITEM_Maize     675
ITEM_Wheat     877
dtype: int64

Testing set item counts:
ITEM_Barley    221
ITEM_Maize     169
ITEM_Wheat     220
dtype: int64


# Random Forrest

In [23]:
# Prepare features and target
X = X_train.drop(columns=['YIELD'])
y = X_train['YIELD']
X_test_features = X_test.drop(columns=['YIELD'])
y_test = X_test['YIELD']

# Expanded grid for deeper tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': ['sqrt']
}

# Grid Search CV setup
cv = KFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=cv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

# Train model
grid_search.fit(X, y)

# Paths for saving
model_path = "../models/best_random_forest.joblib"
report_path = "../results/hyperparameter_tuning/random_forest_report.txt"
results_path = "../results/hyperparameter_tuning/random_forest_full_grid_results.csv"

# Save outputs
save_model(grid_search.best_estimator_, model_path)
save_training_report(grid_search, report_path)
save_grid_search_results(grid_search, results_path)

# Test set prediction
best_rf = joblib.load(model_path)
y_pred = best_rf.predict(X_test_features)

#  Evaluate
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

# Save test metrics
save_test_metrics(report_path, rmse, mae, mse, r2, mape)

# Final printed summary
print("Best Parameters:", grid_search.best_params_)
print("Test RMSE:", rmse)
print("Test MAE:", mae)
print("Test MSE:", mse)
print("Test R2:", r2)
print("Test MAPE:", mape)


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Test RMSE: 0.3983799445538995
Test MAE: 0.29769366028137817
Test MSE: 0.15870658022276807
Test R2: 0.8367359529080133
Test MAPE: 3.7051560349635273


In [24]:
import sklearn
print(sklearn.__version__)


1.7.1
