In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from dotenv import load_dotenv
import os


# Notebook display settings: raise pandas' console width and row/column
# limits so wide or long frames are rendered with less truncation.
pd.options.display.width = 1000
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [None]:
def read_data(path):
    """Load a CSV file into a DataFrame.

    Args:
        path: Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The parsed contents of the file.
    """
    return pd.read_csv(path)

# Populate os.environ from a local .env file (if one exists) before the
# lookup below. load_dotenv is imported at the top of the notebook but was
# never invoked, so a value defined only in .env was not being picked up.
load_dotenv()

data_path = os.getenv("TRAINING_DATA")
if not data_path:
    # Fail with an actionable message rather than letting pd.read_csv(None)
    # raise its generic "invalid file path or buffer" error.
    raise ValueError(
        "TRAINING_DATA is not set; define it in the environment or in a .env file."
    )
df = read_data(data_path)

In [12]:
"""
Split the dataset into features and target, then divide it into training and testing sets.

- X: feature matrix (all columns except the target and the two 'Unnamed' columns)
- y: target variable ('HATSURESI')
- 80% of the data is used for training, 20% for testing (test_size=0.2)
- The random_state ensures reproducible results
"""

# Named here so the split configuration is visible in one place.
TARGET_COLUMN = "HATSURESI"
# NOTE(review): presumably leftover index columns from earlier CSV round-trips — confirm.
NON_FEATURE_COLUMNS = ["Unnamed: 0", "Unnamed: 0.1"]
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Separate features (X) and target variable (y)
X = df.drop([TARGET_COLUMN, *NON_FEATURE_COLUMNS], axis=1)  # Drop target and non-feature columns
y = df[TARGET_COLUMN]                                       # Target variable to predict

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Display the shape of the resulting splits
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((154851, 31), (38713, 31), (154851,), (38713,))

In [13]:
"""
Train a Random Forest Regressor and evaluate it on the held-out test set.

- random_state: ensures reproducibility of results
- Reported metrics: R², RMSE, MAE and MAPE
"""
# Create the model (all other hyperparameters left at their defaults).
rf_model1 = RandomForestRegressor(random_state=42)

# Train the Random Forest model on the training data.
rf_model1.fit(X_train, y_train)

# Make predictions on the test set using the trained model.
y_pred = rf_model1.predict(X_test)

# Evaluate the performance of the model using common regression metrics.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# MAPE is undefined where the true value is 0. Substituting a tiny
# denominator (e.g. 1e-10) for those rows would make each of them contribute
# an enormous term and swamp the mean, so rows with a zero target are
# excluded instead. With no zero targets this equals the plain MAPE; if every
# target is zero the result is NaN.
abs_pct_error = np.abs((y_test - y_pred) / y_test)
mape = np.mean(abs_pct_error[y_test != 0]) * 100

# Print the evaluation results of the Random Forest model.
print("\n🌲 Random Forest Results:")
print(f"R² Score: {r2:.4f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"MAPE: {mape:.2f}%")



🌲 Random Forest Results:
R² Score: 0.6895
RMSE: 6.56
MAE: 4.25
MAPE: 7.06%
