In [None]:
#Adapted from example.py from the OSRT Package for Python
#Import the package
import osrt as OSRT

#These are all things also imported in the example
import pandas as pd
import numpy as np
import sklearn as sk
import time
import pathlib
import math
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

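#Import the model class and the threshold helpers directly from their submodules.
#Note: this looks slightly different from the example code (here the helpers are integrated into the model package),
#and it rebinds the name OSRT (the package alias from `import osrt as OSRT` above) to the model class.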
from osrt.model.osrt import OSRT
from osrt.model.threshold_guess import compute_thresholds
from osrt.model.threshold_guess import cut

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import KFold

In [None]:
# Read the boats dataset from CSV; the first column of the file becomes the row index.
# NOTE(review): this is an absolute, machine-specific path -- consider a configurable data directory.
boats_csv_path = "C:/Users/livin/OneDrive/University/Y3; Block 4; Seminar ML/Actual Research/Modelling/Data/2_df_man.csv"
boatsDF = pd.read_csv(
    boats_csv_path,
    delimiter=",",
    index_col=0,
)

In [None]:
# Random seed; passed as random_state to the shuffled KFold splitter further down.
randomSeed: int = 0

In [None]:
# Target vector: the 'Price' column. Feature matrix: every other column of boatsDF.
y = boatsDF['Price']
X = boatsDF.drop(columns=['Price'])

In [None]:
# OSRT model configuration -- see the OSRT specification for the exact meaning of each option.
config = dict(
    # Prune pairs using subset comparison.
    feature_exchange=False,

    # Counterpart of the option above for continuous features.
    continuous_feature_exchange=False,

    # Penalty added for additional tree complexity.
    regularization=0.05,

    # Maximum depth of the tree (presumably 0 = no limit, like the limits below -- TODO confirm).
    depth_budget=0,

    # Maximum number of models extracted; 0 = no limit.
    model_limit=1,

    # Time the model is allowed to run; 0 = no limit.
    time_limit=0,

    # Significant figures considered in a split.
    precision_limit=0,

    # Number of threads used (affects processing speed).
    worker_limit=6,

    # For similar features, bounds found for the first can be leveraged for the second.
    # Can slow the algorithm down (see Lin 2022), but it is faster here.
    similar_support=True,

    # Loss metric: "L2" for MSE, "L1" for MAE.
    metric="L2",

    # Per-sample weights for the training data.
    weights=[],

    # Print config, progress and results to standard output.
    verbose=False,

    # Print a diagnostic trace.
    diagnostics=True,
)

In [None]:
# 5-fold cross-validation of the OSRT model.
# Per fold: compute thresholds on the training split, fit OSRT on the resulting
# training frame, print the training results, then apply the same thresholds to
# the held-out split and record its RMSE and MAE.
rmse_list = []
mae_list = []

# Arguments passed to compute_thresholds; identical for every fold, so set once.
# Presumably the number of estimators and the depth used for threshold guessing -- TODO confirm.
n_est = 40
max_depth = 1

kFold = KFold(n_splits=5, shuffle=True, random_state=randomSeed)

for train_index, test_index in kFold.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Thresholds come from the training split only; X_train is replaced by the
    # frame returned by compute_thresholds.
    X_train, thresholds, header, threshold_guess_time = compute_thresholds(X_train, y_train, n_est, max_depth)

    model = OSRT(config)
    model.fit(X_train, y_train)

    print("evaluate the model, extracting tree and scores", flush=True)

    # get the results
    train_acc = model.score(X_train, y_train)
    n_leaves = model.leaves()
    n_nodes = model.nodes()  # not printed; kept available for inspection after the run
    # Stored as train_time rather than `time`, so the imported `time` module is not overwritten.
    train_time = model.time

    print(f"Model training time: {train_time}")
    print(f"Training score: {train_acc}")
    print(f"# of leaves: {n_leaves}")
    print(model.tree)

    # Apply the training-split thresholds to the held-out split.
    # NOTE(review): confirm cut() yields the same columns, in the same order, as the
    # training frame returned by compute_thresholds (see `header`).
    X_test = cut(X_test, thresholds)

    predictions = model.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    rmse_list.append(np.sqrt(mse))
    mae_list.append(mean_absolute_error(y_test, predictions))

print(f"List RMSE: {rmse_list}")
print(f"List MAE: {mae_list}")

# Average the per-fold errors across folds.
mean_RMSE = np.mean(rmse_list)
mean_MAE = np.mean(mae_list)

print(f"RMSE mean: {mean_RMSE}")
print(f"MAE mean: {mean_MAE}")
