In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the pre-split task-2 inputs in train / validate / test order.
input_train, input_validate, input_test = (
    pd.read_csv(f"Training_Validation_Test_Datasets/task2_input_{split}.csv")
    for split in ("train", "validate", "test")
)

# Load the matching targets, keeping only the "phq_sum" column of each file.
output_train, output_validate, output_test = (
    pd.read_csv(f"Training_Validation_Test_Datasets/task2_output_{split}.csv")["phq_sum"]
    for split in ("train", "validate", "test")
)

In [3]:
# Display the training input table read from task2_input_train.csv.
input_train

Unnamed: 0,com01,com02,com03,com07,com08,com09,com10,com13,com15,com16,...,lab10,lab11,lab12,lab14,physical01,physical02,physical03,physical04,physical05,physical06
0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,4.0,...,19.64,0.735,1.40,1.57,63.7,162.5,24.1,96.0,58.0,57.0
1,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,5.0,...,2.90,1.209,0.26,1.57,90.5,150.5,40.0,153.0,91.0,52.0
2,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,5.0,...,0.55,2.301,2.01,1.57,87.7,179.8,27.1,114.0,61.0,60.0
3,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,4.0,...,0.64,0.277,0.21,1.57,72.1,167.9,25.6,138.0,86.0,71.0
4,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5.0,2.0,...,3.91,1.313,0.46,1.57,67.2,157.2,27.2,109.0,75.0,75.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3306,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,...,3.57,1.202,1.33,1.57,65.6,156.2,26.9,147.0,95.0,79.0
3307,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,5.0,...,1.49,1.160,1.72,1.57,101.2,173.3,33.7,150.0,88.0,53.0
3308,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,5.0,...,1.53,1.037,0.94,1.57,113.0,186.1,32.6,121.0,85.0,76.0
3309,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,5.0,...,0.53,0.482,1.04,1.57,57.7,162.5,21.9,109.0,70.0,58.0


In [4]:
# (rows, columns) of the training inputs.
input_train.shape

(3311, 52)

In [5]:
# Length of the training target column ("phq_sum"); compare with the input row count.
output_train.shape

(3311,)

In [6]:
# Candidate values for the manual random-forest search below.
# A max_depth of None is passed through to the estimator unchanged.
param_grid = dict(
    n_estimators=list(range(50, 251, 50)),
    max_depth=[5, 10, 15, 20, None],
)

In [7]:
# Manual hyperparameter search: fit one forest per (n_estimators, max_depth)
# combination on the training split and score it on the validation split.
# Results are collected in a plain list and turned into a DataFrame once,
# instead of growing a DataFrame with pd.concat on every iteration (which
# re-copies the table each time and started from an empty frame that did not
# even declare the "RMSE" column).
search_records = []

for n_estimators in param_grid["n_estimators"]:
    for max_depth in param_grid["max_depth"]:
        rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth,
                                         random_state=42)
        rf_model.fit(input_train, output_train)

        y_validate_pred = rf_model.predict(input_validate)
        # Score once; both reported metrics derive from the same unrounded MSE.
        validate_mse = mean_squared_error(output_validate, y_validate_pred)
        mse = round(validate_mse, 4)
        rmse = round(validate_mse ** (1/2), 4)

        search_records.append({
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "MSE": mse,
            "RMSE": rmse,
        })

param_results = pd.DataFrame(search_records,
                             columns=["n_estimators", "max_depth", "MSE", "RMSE"])
# Keep max_depth as an object column so None stays None (and depths stay ints)
# rather than being coerced to NaN/float by dtype inference.
param_results["max_depth"] = pd.Series(
    [record["max_depth"] for record in search_records], dtype=object
)

# idxmin returns the first row with the lowest (rounded) validation RMSE. The
# frame has a default RangeIndex, so that label is also the position of the
# winning entry in search_records.
best_idx = param_results["RMSE"].idxmin()
best_params_row = param_results.loc[best_idx]
# Take the winning values from the original Python objects so that
# n_estimators and max_depth reach the estimator with their exact types
# (int / None), independent of how pandas stored them.
best_params = {
    "n_estimators": search_records[best_idx]["n_estimators"],
    "max_depth": search_records[best_idx]["max_depth"],
}

print("Parameter Tuning Results (Validation Set):")
print(param_results)
print("\nBest Hyperparameters:")
print(best_params)

  param_results = pd.concat([


Parameter Tuning Results (Validation Set):
   n_estimators max_depth      MSE    RMSE
0            50         5  15.8686  3.9835
1            50        10  15.4996  3.9369
2            50        15  15.7208  3.9649
3            50        20  15.6131  3.9513
4            50      None  15.5308  3.9409
5           100         5  15.8667  3.9833
6           100        10  15.6467  3.9556
7           100        15  15.7566  3.9695
8           100        20  15.7361  3.9669
9           100      None  15.7382  3.9671
10          150         5  15.8197  3.9774
11          150        10  15.5760  3.9467
12          150        15  15.6429  3.9551
13          150        20  15.6403  3.9548
14          150      None  15.6614  3.9574
15          200         5  15.8194  3.9774
16          200        10  15.5328  3.9412
17          200        15  15.6287  3.9533
18          200        20  15.6355  3.9542
19          200      None  15.6619  3.9575
20          250         5  15.8274  3.9784
21         

In [8]:
# Refit a forest with the selected hyperparameters on the training split
# (fit returns the estimator itself), then predict the test split.
rf_best_model = RandomForestRegressor(random_state=42, **best_params).fit(
    input_train, output_train
)
y_test_pred = rf_best_model.predict(input_test)

# Error metrics on the test split; RMSE is the square root of the MSE.
mse_test = mean_squared_error(output_test, y_test_pred)
mae_test = mean_absolute_error(output_test, y_test_pred)
rmse_test = mse_test ** 0.5

print("\nTest Set Evaluation:")
print(f"MSE: {mse_test:.4f}, RMSE: {rmse_test:.4f}, MAE: {mae_test:.4f}")


Test Set Evaluation:
MSE: 15.6198, RMSE: 3.9522, MAE: 2.8354
