### Imports

In [9]:
# Source: https://www.datacamp.com/tutorial/xgboost-in-python
import xgboost as xgb
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

### Read Dataset

In [10]:
# Training features, plus the labels taken from the last column of the labels file.
X_train = pd.read_csv('../pc_X_train.csv')
y_train = pd.read_csv('../pc_y_train.csv').iloc[:, -1]

# Test features; the first column is set aside and later written out as the 'id' column.
# NOTE(review): that column is not dropped from X_test, so it is still fed to the
# model as a feature — confirm X_train carries the same column and that this is intended.
X_test = pd.read_csv('../pc_X_test.csv')
ids = X_test.iloc[:, 0]

In [11]:

# Flag outliers in the training features with an Isolation Forest and keep only
# the rows it does not mark as -1. `contamination` is the expected outlier
# fraction; adjust it if needed.
# NOTE: this cell overwrites X_train / y_train, so re-running it without
# reloading the data filters the already-filtered rows again.
model = IsolationForest(contamination=0.1, random_state=42)
model.fit(X_train)
outliers = model.predict(X_train)

inlier_mask = outliers != -1
X_train = X_train.loc[inlier_mask]
y_train = y_train.loc[inlier_mask]
len(X_train)

1778

### Scale Data

In [12]:
# Standardise the features. The scaler is fitted on the training data only and
# the same learned mean/variance is then applied to the test data. Re-fitting on
# the test set (fit_transform) would standardise it with different statistics,
# so the model would see test features on a different scale than it was trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Transform Dataset to DMatrix Format

In [13]:
# Bundle the training features and labels into XGBoost's DMatrix container.
dtrain_reg = xgb.DMatrix(data=X_train, label=y_train)

### Train Model

In [14]:
# Only the objective (squared-error regression) is set explicitly; every other
# booster parameter is left at its XGBoost default.
params = {"objective": "reg:squarederror"}
n = 500  # number of boosting rounds

model = xgb.train(params, dtrain_reg, num_boost_round=n)

### Predict on Test Set

In [15]:
# Wrap the test features in a DMatrix under its own name instead of rebinding
# X_test. This keeps X_test as the feature array and makes the cell safe to
# re-run (previously a second run passed a DMatrix back into xgb.DMatrix).
dtest_reg = xgb.DMatrix(X_test)
predictions = model.predict(dtest_reg)

### Save Results

In [16]:
# Write one row per test sample (its id and predicted score) to a CSV file,
# omitting the DataFrame index.
output_filename = 'bork_GYCAOB_XGBoost_R.csv'
results_df = pd.DataFrame({'id': ids, 'score': predictions})
results_df.to_csv(output_filename, index=False)