### Imports

In [8]:
# Source: https://www.datacamp.com/tutorial/xgboost-in-python
import xgboost as xgb
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.ensemble import IsolationForest

### Read Dataset

In [9]:
# Training features, plus the target taken from the last column of the label file.
X_train = pd.read_csv('../pc_X_train.csv')
y_train = pd.read_csv('../pc_y_train.csv').iloc[:, -1]

# Test features; the first column is set aside as the row ids for the results file.
# NOTE(review): X_test keeps that first column when it is later passed to predict();
# if X_train carries the same id column it is being used as a model feature - confirm.
X_test = pd.read_csv('../pc_X_test.csv')
ids = X_test.iloc[:, 0]

In [10]:
# Fit an Isolation Forest on the training features so anomalous rows can be
# flagged and removed before the regressor is trained.
# random_state is fixed (same seed as the PCA object in this notebook) because
# the forest is built from random sub-samples and random splits: without a
# seed, each run flags a different set of rows and every downstream result
# (training set size, chosen hyper-parameters, predictions) changes.
model = IsolationForest(contamination=0.2, random_state=42)  # Adjust contamination parameter if needed
model.fit(X_train)

In [11]:
# Label every training row with the fitted forest; the cells below treat -1 as
# "outlier" and keep every other row.
outliers = model.predict(X_train)

In [12]:
# Display the label vector returned by the forest (1 / -1 per training row).
outliers

array([ 1, -1, -1, ...,  1, -1, -1])

In [13]:

# Integer positions of the training rows that were labelled -1 (outliers).
outlier_indices = np.flatnonzero(outliers == -1)

In [14]:
# Display the positions of the rows flagged as outliers.
outlier_indices

array([   1,    2,    3,    5,   16,   17,   25,   36,   56,   58,   62,
         70,   71,   73,   81,   82,   86,   88,   92,   94,  105,  119,
        120,  135,  146,  149,  153,  155,  163,  166,  177,  180,  183,
        187,  193,  197,  200,  206,  209,  212,  214,  216,  217,  218,
        229,  232,  233,  235,  237,  251,  252,  256,  257,  263,  264,
        272,  274,  278,  286,  287,  289,  294,  307,  311,  312,  313,
        314,  316,  328,  333,  337,  345,  353,  375,  381,  404,  406,
        409,  417,  422,  423,  427,  433,  434,  442,  443,  444,  459,
        461,  463,  465,  469,  479,  482,  486,  487,  489,  490,  496,
        498,  502,  504,  505,  530,  533,  535,  542,  544,  549,  551,
        555,  559,  563,  574,  576,  577,  578,  587,  588,  591,  595,
        597,  605,  607,  619,  626,  629,  642,  647,  650,  656,  659,
        664,  676,  690,  700,  701,  709,  711,  732,  741,  760,  762,
        765,  766,  774,  781,  785,  790,  793,  7

In [15]:
# Drop the rows flagged as outliers from both the features and the target.
# NOTE: this overwrites X_train / y_train, while `outliers` was computed on the
# unfiltered data, so the cell is not idempotent - re-run the loading and
# outlier-detection cells before running it again.
inlier_mask = outliers != -1
X_train, y_train = X_train[inlier_mask], y_train[inlier_mask]

In [27]:
# Number of rows in X_train at this point (i.e. after the outlier filter above
# when the notebook is run top to bottom).
len(X_train)

1581

### Scale Data

In [16]:
# Scaler instance that is later plugged into the Pipeline as its 'scaler' step.
scaler = StandardScaler()
# Manual scaling is intentionally disabled: the Pipeline fits the scaler itself,
# so during cross-validation it is fitted on each training fold only.
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

### Dimensionality Reduction

In [17]:
# PCA object with a fixed seed. NOTE: it is currently unused in the visible
# notebook - the manual transform below and the 'pca' Pipeline step are both
# commented out.
dim_reduction = PCA(random_state=42)
#X_train = dim_reduction.fit_transform(X_train)
#X_test = dim_reduction.transform(X_test)

In [18]:
# Base regressor; its hyper-parameters are supplied by the grid search.
model = xgb.XGBRegressor()

# Search space, keyed as '<pipeline step>__<parameter>'.
# Only max_depth and colsample_bytree vary (5 x 5 = 25 candidates);
# learning_rate and n_estimators are pinned to a single value each.
# Disabled PCA dimension: 'pca__n_components': [270,280,290,300,310,320,330]
param_grid = dict(
    model__max_depth=list(range(2, 7)),
    model__learning_rate=[0.0001],
    model__n_estimators=[4000],
    model__colsample_bytree=[0.3, 0.4, 0.5, 0.6, 0.7],
)

In [19]:
# Pipeline stages in execution order: standardise the features, then regress.
steps = [
    ('scaler', scaler),
    #('pca', dim_reduction),
    ('model', model),
]

pipe = Pipeline(steps=steps)

In [20]:
# Exhaustive search over param_grid with 10-fold cross-validation on all cores;
# candidates are ranked with the 'max_error' scorer.
# NOTE(review): newer scikit-learn releases appear to rename this scorer to
# 'neg_max_error' - confirm against the installed version.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='max_error',
    cv=10,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train, y_train)
best_params = grid.best_params_

Fitting 10 folds for each of 25 candidates, totalling 250 fits


[CV] END model__colsample_bytree=0.3, model__learning_rate=0.0001, model__max_depth=2, model__n_estimators=4000; total time=  11.1s
[CV] END model__colsample_bytree=0.3, model__learning_rate=0.0001, model__max_depth=2, model__n_estimators=4000; total time=  11.2s
[CV] END model__colsample_bytree=0.3, model__learning_rate=0.0001, model__max_depth=2, model__n_estimators=4000; total time=  11.4s
[CV] END model__colsample_bytree=0.3, model__learning_rate=0.0001, model__max_depth=2, model__n_estimators=4000; total time=  11.3s
[CV] END model__colsample_bytree=0.3, model__learning_rate=0.0001, model__max_depth=2, model__n_estimators=4000; total time=  11.5s
[CV] END model__colsample_bytree=0.3, model__learning_rate=0.0001, model__max_depth=2, model__n_estimators=4000; total time=  11.5s
[CV] END model__colsample_bytree=0.3, model__learning_rate=0.0001, model__max_depth=2, model__n_estimators=4000; total time=  11.7s
[CV] END model__colsample_bytree=0.3, model__learning_rate=0.0001, model__ma

In [21]:
# Report the hyper-parameter combination selected by the grid search.
print(best_params)

{'model__colsample_bytree': 0.3, 'model__learning_rate': 0.0001, 'model__max_depth': 4, 'model__n_estimators': 4000}


In [22]:
# The grid search is created with its default refit=True, so once the search
# finishes it has already retrained the complete pipeline (scaler + regressor)
# with the best parameters on all of X_train / y_train.
# Reuse that fitted pipeline instead of rebuilding it by hand: the manual
# rebuild repeated the 4000-tree fit and relied on parsing parameter names
# (substring test `"model" in k`, `k.split("__")[1]`), which silently breaks
# for other step names (e.g. a re-enabled 'pca' step) or nested parameters.
best_model = grid.best_estimator_
best_model

### Transform Dataset to DMatrix Format

In [23]:
#dtrain_reg = xgb.DMatrix(X_train, y_train)

### Train Model

In [24]:
# params = {"objective": "reg:squarederror"}

# n = 500
# model = xgb.train(
#    params=params,
#    dtrain=dtrain_reg,
#    num_boost_round=n,
# )

### Predict on Test Set

In [25]:
# DMatrix conversion is not needed: prediction goes through the fitted
# scikit-learn pipeline rather than the native xgb.train API.
#X_test = xgb.DMatrix(X_test)
# X_test is passed exactly as loaded, i.e. including the first column that was
# copied into `ids`.
predictions = best_model.predict(X_test)

### Save Results

In [26]:
# Write the results file: one row per test sample with its id and predicted score.
# The file name is a plain literal - the original f-string had no placeholders.
output_filename = 'bork_GYCAOB_XGBoost_R.csv'
results_df = pd.DataFrame({'id': ids, 'score': predictions})
results_df.to_csv(output_filename, index=False)