In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [2]:
# Feature tables for the three splits; the first CSV column is used as the row index.
input_train, input_validate, input_test = (
    pd.read_csv(features_csv, index_col=0)
    for features_csv in (
        "Training_Validation_Test_Datasets/task2_input_train.csv",
        "Training_Validation_Test_Datasets/task2_input_validate.csv",
        "Training_Validation_Test_Datasets/task2_input_test.csv",
    )
)

# Regression targets for the same splits: only the "phq_sum" column is kept.
output_train, output_validate, output_test = (
    pd.read_csv(targets_csv)["phq_sum"]
    for targets_csv in (
        "Training_Validation_Test_Datasets/task2_output_train.csv",
        "Training_Validation_Test_Datasets/task2_output_validate.csv",
        "Training_Validation_Test_Datasets/task2_output_test.csv",
    )
)

In [3]:
# Sanity check: show the (rows, columns) shape of the training feature frame.
input_train.shape

(3311, 50)

In [4]:
# Sanity check: show the shape of the training target series; its length
# should equal the number of rows in input_train.
output_train.shape

(3311,)

In [5]:
# Candidate values swept in the hyperparameter search below; each entry is
# passed to RandomForestRegressor under the keyword of the same name.
param_grid = dict(
    n_estimators=list(range(50, 251, 50)),  # 50, 100, 150, 200, 250
    max_depth=[5, 10, 15, 20, None],
)

In [6]:
# Manual grid search: fit one random forest per (n_estimators, max_depth)
# combination on the training split and score it by MSE on the validation split.
#
# Results are accumulated in a plain list and converted to a DataFrame once.
# Growing a DataFrame with pd.concat inside the loop is quadratic, and
# concatenating onto an initially empty frame is deprecated in recent pandas.
search_records = []

for n_estimators in param_grid["n_estimators"]:
    for max_depth in param_grid["max_depth"]:
        rf_model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        rf_model.fit(input_train, output_train)

        y_validate_pred = rf_model.predict(input_validate)
        mse = mean_squared_error(output_validate, y_validate_pred)

        search_records.append({"n_estimators": n_estimators, "max_depth": max_depth, "MSE": mse})

param_results = pd.DataFrame({
    "n_estimators": [record["n_estimators"] for record in search_records],
    # Object dtype keeps a None depth as None; letting pandas infer the dtype
    # would coerce the column to float and turn None into NaN.
    "max_depth": pd.Series([record["max_depth"] for record in search_records], dtype=object),
    "MSE": [record["MSE"] for record in search_records],
})

# idxmin returns the label of the first minimum; with the default RangeIndex
# that label is also the position of the matching entry in search_records.
best_idx = param_results["MSE"].idxmin()
best_params_row = param_results.loc[best_idx]

# Read the winning hyperparameters from the original Python records rather than
# from the DataFrame, so they cannot be affected by pandas dtype coercion
# (a float max_depth would not be a valid integer depth for the final model).
best_record = search_records[best_idx]
best_params = {
    "n_estimators": int(best_record["n_estimators"]),
    "max_depth": None if best_record["max_depth"] is None else int(best_record["max_depth"])
}

print("Parameter Tuning Results (Validation Set):")
print(param_results)
print("\nBest Hyperparameters:")
print(best_params)

  param_results = pd.concat([


Parameter Tuning Results (Validation Set):
   n_estimators max_depth        MSE
0            50         5  16.336887
1            50        10  15.863479
2            50        15  15.856625
3            50        20  16.115440
4            50      None  16.094251
5           100         5  16.313132
6           100        10  15.983115
7           100        15  16.003921
8           100        20  16.261933
9           100      None  16.227712
10          150         5  16.284889
11          150        10  15.935469
12          150        15  15.982553
13          150        20  16.119607
14          150      None  16.166715
15          200         5  16.268608
16          200        10  15.923110
17          200        15  15.932556
18          200        20  16.093292
19          200      None  16.129710
20          250         5  16.297467
21          250        10  15.953453
22          250        15  15.981728
23          250        20  16.090233
24          250      None  16.11

In [7]:
# Refit a forest with the selected hyperparameters on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
final_forest_kwargs = {
    "n_estimators": best_params["n_estimators"],
    "max_depth": best_params["max_depth"],
    "random_state": 42,
}
rf_best_model = RandomForestRegressor(**final_forest_kwargs).fit(input_train, output_train)

# Score the refit model once on the held-out test split.
y_test_pred = rf_best_model.predict(input_test)
mse_test = mean_squared_error(output_test, y_test_pred)
rmse_test = mse_test ** 0.5  # RMSE is the square root of the MSE

print("\nTest Set Evaluation:")
print(f"MSE: {mse_test:.4f}, RMSE: {rmse_test:.4f}")


Test Set Evaluation:
MSE: 16.1051, RMSE: 4.0131
