### Imports

In [13]:
# Source: https://www.datacamp.com/tutorial/xgboost-in-python

import numpy as np

import xgboost as xgb
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import LeaveOneOut

from sklearn.metrics import mean_squared_error, max_error, mean_absolute_error

### Read Dataset

In [14]:
X_train = pd.read_csv('../pc_X_train.csv')
y_train = pd.read_csv('../pc_y_train.csv')
y_train = y_train.iloc[:, -1] # With iloc we extract the labels

X_test = pd.read_csv('../pc_X_test.csv')
ids = X_test.iloc[:, 0]

### Scale Data

In [15]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Dimensionality Reduction

In [16]:
dim_reduction = PCA(n_components=30, random_state=42)
X_train = dim_reduction.fit_transform(X_train)
X_test = dim_reduction.transform(X_test)

In [17]:
models = []
params = {"objective": "reg:squarederror"}
n = 500

loocv = LeaveOneOut()

mse_list = []
me_list = []
mae_list = []

for i, (train_index, test_index) in enumerate(loocv.split(X_train)):
    print(f"Fold {i}:")

    X_train_subset = X_train[train_index]
    y_train_subset = y_train[train_index]

    X_test_subset = X_train[test_index]
    y_test_subset = y_train[test_index]

    dtrain_reg = xgb.DMatrix(X_train_subset, y_train_subset)

    model = xgb.train(
        params=params,
        dtrain=dtrain_reg,
        num_boost_round=n,
    )

    models.append(model)

    X_test_subset = xgb.DMatrix(X_test_subset)
    predictions_subset = model.predict(X_test_subset)

    mse = mean_squared_error(y_test_subset, predictions_subset)
    me = max_error(y_test_subset, predictions_subset)
    mae = mean_absolute_error(y_test_subset, predictions_subset)

    mse_list.append(mse)
    me_list.append(me)
    mae_list.append(mae)

    print(f"mean_squared_error: {mse}")
    print(f"max_error: {me}")
    print(f"mean_absolute_error: {mae}")

print("Result after training:")
print(f"Mean MSE: {np.mean(mse_list)}")
print(f"Mean ME: {np.mean(me_list)}")
print(f"Mean MAE: {np.mean(mae_list)}")
    

Fold 0:
mean_squared_error: 0.18138713954249397
max_error: 0.42589569091796875
mean_absolute_error: 0.42589569091796875
Fold 1:
mean_squared_error: 0.3428378924181743
max_error: 0.5855236053466797
mean_absolute_error: 0.5855236053466797
Fold 2:
mean_squared_error: 0.22090518153368066
max_error: 0.47000551223754883
mean_absolute_error: 0.47000551223754883
Fold 3:
mean_squared_error: 1.97255330022017
max_error: 1.4044761657714844
mean_absolute_error: 1.4044761657714844
Fold 4:
mean_squared_error: 0.569486002223357
max_error: 0.7546429634094238
mean_absolute_error: 0.7546429634094238
Fold 5:
mean_squared_error: 0.11334374665261748
max_error: 0.33666563034057617
mean_absolute_error: 0.33666563034057617
Fold 6:
mean_squared_error: 2.6820373465375837
max_error: 1.6376926898956299
mean_absolute_error: 1.6376926898956299
Fold 7:
mean_squared_error: 0.34332498908406706
max_error: 0.5859394073486328
mean_absolute_error: 0.5859394073486328
Fold 8:
mean_squared_error: 0.2805909736346166
max_error:

In [18]:
predictions = []

for i, model in enumerate(models):
    predictions.append(model.predict(X_test))

TypeError: ('Expecting data to be a DMatrix object, got: ', <class 'numpy.ndarray'>)

### Save Results

In [None]:
# Save 
# output_filename = f'bork_GYCAOB_XGBoost_R.csv'
# results_df = pd.DataFrame({'id': ids,'score': predictions})
# results_df.to_csv(output_filename, index=False)