In [1]:
import itertools

import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn.metrics import mean_squared_error

In [2]:
# Feature tables for the three splits; the first CSV column becomes the row index.
input_train = pd.read_csv(
    "Training_Validation_Test_Datasets/task2_input_train.csv",
    index_col=0,
)
input_validate = pd.read_csv(
    "Training_Validation_Test_Datasets/task2_input_validate.csv",
    index_col=0,
)
input_test = pd.read_csv(
    "Training_Validation_Test_Datasets/task2_input_test.csv",
    index_col=0,
)

# Regression targets: only the "phq_sum" column of each output file is kept,
# as a Series with the default integer index.
output_train = pd.read_csv(
    "Training_Validation_Test_Datasets/task2_output_train.csv"
)["phq_sum"]
output_validate = pd.read_csv(
    "Training_Validation_Test_Datasets/task2_output_validate.csv"
)["phq_sum"]
output_test = pd.read_csv(
    "Training_Validation_Test_Datasets/task2_output_test.csv"
)["phq_sum"]

In [3]:
# Candidate values for each hyperparameter explored by the grid search.

# learning_rate
lrs = [0.01, 0.05, 0.1, 0.2]

# n_estimators (number of boosting stages)
ns = [50, 100, 250, 500]

# max_depth of each individual tree
depths = [4, 6, 8, 12]

# min_samples_split (samples required to split an internal node)
min_samples = [
    100,
    250,
    500,
    1000,
    1500,
    2000,
]

In [5]:
# Exhaustive grid search over GradientBoostingRegressor hyperparameters.
# Each combination is fit on the training split and scored on the validation
# split; one record per combination is appended to `results`.
results = []

# itertools.product varies the right-most grid fastest, which is the same
# visiting order as nesting the loops lrs -> ns -> depths -> min_samples.
for lr, n, depth, min_sample in itertools.product(lrs, ns, depths, min_samples):
    params = {
        "n_estimators": n,
        "max_depth": depth,
        "min_samples_split": min_sample,
        "learning_rate": lr,
        "loss": "squared_error",
        "random_state": 42,  # fixed seed passed to the estimator
    }

    reg = ensemble.GradientBoostingRegressor(**params)
    reg.fit(input_train, output_train)

    y_pred = reg.predict(input_validate)

    # Derive RMSE from the unrounded MSE: taking the root of the already
    # rounded value would stack a second rounding error onto the reported RMSE.
    raw_mse = mean_squared_error(output_validate, y_pred)
    mse = round(raw_mse, 4)
    rmse = round(raw_mse ** 0.5, 4)

    results.append({
        "learning_rate": lr,
        "n_estimators": n,
        "max_depth": depth,
        "min_samples_split": min_sample,
        "MSE": mse,
        "RMSE": rmse,
    })

In [6]:
# Tabulate every grid-search record and pick the best configuration.
results_df = pd.DataFrame(results)

print("Parameter Tuning Results (Validation Set):")
print(results_df)

# Row with the lowest validation RMSE (idxmin returns the first one on ties).
best_row = results_df.loc[results_df["RMSE"].idxmin()]

# Selecting a single row yields a float64 Series because the columns mix ints
# and floats, so integer hyperparameters would come back as e.g. 500.0.
# Cast each value back to its intended type so the dict can be passed straight
# to GradientBoostingRegressor(**best_params), whose parameter validation
# expects integer values for these settings.
param_types = {
    "learning_rate": float,
    "n_estimators": int,
    "max_depth": int,
    "min_samples_split": int,
}
best_params = {name: cast(best_row[name]) for name, cast in param_types.items()}
best_rmse = best_row["RMSE"]

print("\nBest Hyperparameters:")
print(best_params)
print("Best Validation RMSE:", best_rmse)

Parameter Tuning Results (Validation Set):
     learning_rate  n_estimators  max_depth  min_samples_split      MSE  \
0             0.01            50          4                100  18.7339   
1             0.01            50          4                250  18.7859   
2             0.01            50          4                500  19.0297   
3             0.01            50          4               1000  19.1341   
4             0.01            50          4               1500  19.5830   
..             ...           ...        ...                ...      ...   
379           0.20           500         12                250  17.4499   
380           0.20           500         12                500  18.0744   
381           0.20           500         12               1000  18.7608   
382           0.20           500         12               1500  18.2527   
383           0.20           500         12               2000  16.8326   

       RMSE  
0    4.3283  
1    4.3343  
2    4.3623  
