In [21]:
# STEP 01: Download the dataset

In [22]:
# First we need to ensure that we have the necessary libraries:


In [23]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import exp, log



In [24]:
# Download and load the dataset

In [25]:
data_url = "http://129.10.224.71/~apaul/data/tests/dataset.csv.gz"
df = pd.read_csv(data_url, compression='gzip')


In [26]:
# Step 2: Data Preprocessing
# Apply the given transformations to x1 and either y1 or y2:

In [27]:
def x_scale(x, p=7.5):
    return 1/p * np.log(1 + x * (np.exp(p) - 1))

def y_scale(y):
    return np.log(1 + y) if y >= 0 else -np.log(1 - y)

# Apply transformations
df['x1'] = df['x1'].apply(x_scale)
df['y1'] = df['y1'].apply(y_scale)  # Assuming you choose y1 for regression


In [28]:
# Step 3: Splitting the Data
# We need to split the data into training, validning and testing sets.
# We split the data in the way that it balance between having enough data for traing, for validation and for testing
# 60% of data for training, 20% for validation, and the remaining 20% for testing. 



In [29]:
X = df[['x1', 'x2', 'x3', 'x4']]
y = df['y1']  # Assuming y1 is the target

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [30]:
# Step 4: Model Training and Hyperparameter Tuning

# When we are constructing a boosted decision tree with xgboost, there are several key hyperparameters need consideration.

# learning_rate (eta): Controls how quickly the model fits the residual error using additional trees.
# A lower value requires more trees but can lead to a more accurate model.
# max_depth: Determines how deep each tree can grow during any boosting round. 
# Deeper trees can model more complex patterns but might overfit.
# n_estimators: Number of trees. Too many trees can overfit, so tuning is necessary.
# subsample: The fraction of samples to be used for fitting each tree, which can help prevent overfitting.
# colsample_bytree: The fraction of features to use when constructing each tree. 
# Using a subset of all features can help prevent overfitting.


In [20]:
model = xgb.XGBRegressor(objective='reg:squarederror', learning_rate=0.1, max_depth=5, n_estimators=100, subsample=0.8, colsample_bytree=0.8)
model.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_val, y_val)], verbose=True)


[0]	validation_0-rmse:3.62110
[1]	validation_0-rmse:3.28155
[2]	validation_0-rmse:3.27446
[3]	validation_0-rmse:2.97034




[4]	validation_0-rmse:2.69871
[5]	validation_0-rmse:2.69169
[6]	validation_0-rmse:2.68576
[7]	validation_0-rmse:2.44229
[8]	validation_0-rmse:2.43700
[9]	validation_0-rmse:2.21962
[10]	validation_0-rmse:2.02581
[11]	validation_0-rmse:2.02061
[12]	validation_0-rmse:1.84788
[13]	validation_0-rmse:1.69466
[14]	validation_0-rmse:1.55855
[15]	validation_0-rmse:1.55317
[16]	validation_0-rmse:1.54875
[17]	validation_0-rmse:1.42957
[18]	validation_0-rmse:1.32172
[19]	validation_0-rmse:1.22719
[20]	validation_0-rmse:1.14535
[21]	validation_0-rmse:1.07403
[22]	validation_0-rmse:1.06910
[23]	validation_0-rmse:1.00791
[24]	validation_0-rmse:0.95539
[25]	validation_0-rmse:0.90788
[26]	validation_0-rmse:0.86907
[27]	validation_0-rmse:0.83398
[28]	validation_0-rmse:0.80627
[29]	validation_0-rmse:0.80123
[30]	validation_0-rmse:0.77446
[31]	validation_0-rmse:0.75458
[32]	validation_0-rmse:0.73288
[33]	validation_0-rmse:0.72851
[34]	validation_0-rmse:0.70826
[35]	validation_0-rmse:0.69357
[36]	validatio

In [6]:
# Explanation of Hyperparameter Choices:
# learning_rate=0.1: A starting point that allows the model to learn sufficiently from each tree without fitting the training data too quickly.
# max_depth=5: Prevents the trees from becoming too deep, which could lead to overfitting on the training data.
# n_estimators=100: Provides a baseline number of trees to start with. The early_stopping_rounds parameter will prevent overfitting by stopping the addition of trees when the validation score stops improving.
# subsample=0.8 and colsample_bytree=0.8: Using a fraction of the data and features for each tree helps in making the model more robust and preventing overfitting.

[0]	validation_0-rmse:3.29012
[1]	validation_0-rmse:2.98733
[2]	validation_0-rmse:2.71728
[3]	validation_0-rmse:2.47655
[4]	validation_0-rmse:2.25962
[5]	validation_0-rmse:2.06613




[6]	validation_0-rmse:1.89293
[7]	validation_0-rmse:1.74041
[8]	validation_0-rmse:1.60560
[9]	validation_0-rmse:1.48645
[10]	validation_0-rmse:1.38095
[11]	validation_0-rmse:1.28905
[12]	validation_0-rmse:1.20259
[13]	validation_0-rmse:1.13113
[14]	validation_0-rmse:1.06505
[15]	validation_0-rmse:1.00593
[16]	validation_0-rmse:0.95645
[17]	validation_0-rmse:0.91365
[18]	validation_0-rmse:0.86908
[19]	validation_0-rmse:0.82926
[20]	validation_0-rmse:0.79369
[21]	validation_0-rmse:0.76093
[22]	validation_0-rmse:0.72995
[23]	validation_0-rmse:0.70680
[24]	validation_0-rmse:0.67815
[25]	validation_0-rmse:0.65122
[26]	validation_0-rmse:0.63286
[27]	validation_0-rmse:0.61369
[28]	validation_0-rmse:0.60020
[29]	validation_0-rmse:0.58653
[30]	validation_0-rmse:0.56915
[31]	validation_0-rmse:0.56075
[32]	validation_0-rmse:0.54703
[33]	validation_0-rmse:0.54025
[34]	validation_0-rmse:0.53196
[35]	validation_0-rmse:0.52521
[36]	validation_0-rmse:0.51918
[37]	validation_0-rmse:0.51461
[38]	validat

In [31]:
# Step 5: Model Evaluation

In [32]:
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print(f"Test RMSE: {rmse}")


Test RMSE: 0.454855966204599
