### Imports

In [28]:
# Source: https://www.datacamp.com/tutorial/xgboost-in-python
import xgboost as xgb
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

### Read Dataset

In [29]:
# Training features, loaded as-is from disk.
X_train = pd.read_csv('../pc_X_train.csv')
# Only the last column of the label file is kept as the regression target.
y_train = pd.read_csv('../pc_y_train.csv').iloc[:, -1]

# Test features; the first column is copied out to label the rows of the output file.
X_test = pd.read_csv('../pc_X_test.csv')
ids = X_test.iloc[:, 0]
# NOTE(review): the column copied into `ids` is not removed from X_test, so it is
# also passed to the model as a feature. Presumably X_train carries the same
# column -- confirm this is intended.

### Scale Data

In [30]:
# Unfitted scaler. It is deliberately not applied to X_train / X_test here:
# it becomes the 'scaler' step of the Pipeline built below, so it is fitted
# as part of the pipeline rather than on the whole dataset up front.
scaler = StandardScaler()

### Dimensionality Reduction

In [31]:
# Unfitted PCA with a fixed seed. n_components is not set here; the grid search
# supplies it through the 'pca__n_components' entry of param_grid. Like the
# scaler, it is applied only as a Pipeline step, not directly on the data.
dim_reduction = PCA(random_state=42)

In [32]:
# Base regressor; every tuned hyperparameter is injected by the grid search.
model = xgb.XGBRegressor()

# Search space. Keys follow the Pipeline convention '<step name>__<parameter>',
# so 'pca__*' targets the PCA step and 'model__*' targets the XGBoost step.
param_grid = dict(
    pca__n_components=list(range(270, 331, 10)),  # 270, 280, ..., 330
    model__max_depth=[2, 3, 4],
    model__learning_rate=[0.001, 0.01],
    model__n_estimators=[2000],
    model__colsample_bytree=[0.3, 0.5, 0.7],
    model__gamma=[0, 0.1, 0.3],
)

In [33]:
# Pipeline stages in execution order: standardise -> PCA -> XGBoost.
# The step names are the prefixes used by the keys of param_grid.
steps = [
    ('scaler', scaler),
    ('pca', dim_reduction),
    ('model', model),
]

pipe = Pipeline(steps=steps)

In [34]:
# Exhaustive search over param_grid with 10-fold cross-validation, using all
# available cores (n_jobs=-1) and per-fit progress logging (verbose=2).
# NOTE(review): newer scikit-learn releases expose this scorer as
# 'neg_max_error' -- confirm the name against the installed version.
# NOTE(review): param_grid currently enumerates 378 combinations, while the
# output saved below reports 189 candidates; the saved output appears to come
# from an earlier grid and should be refreshed by re-running this cell.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='max_error',
    cv=10,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train, y_train)

# Best-scoring parameter combination, keyed as '<step>__<parameter>'.
best_params = grid.best_params_

Fitting 10 folds for each of 189 candidates, totalling 1890 fits
[CV] END model__colsample_bytree=0.3, model__gamma=0, model__learning_rate=0.001, model__max_depth=2, model__n_estimators=2000, pca__n_components=270; total time=   4.2s
[CV] END model__colsample_bytree=0.3, model__gamma=0, model__learning_rate=0.001, model__max_depth=2, model__n_estimators=2000, pca__n_components=270; total time=   4.5s
[CV] END model__colsample_bytree=0.3, model__gamma=0, model__learning_rate=0.001, model__max_depth=2, model__n_estimators=2000, pca__n_components=270; total time=   4.5s
[CV] END model__colsample_bytree=0.3, model__gamma=0, model__learning_rate=0.001, model__max_depth=2, model__n_estimators=2000, pca__n_components=270; total time=   4.6s
[CV] END model__colsample_bytree=0.3, model__gamma=0, model__learning_rate=0.001, model__max_depth=2, model__n_estimators=2000, pca__n_components=270; total time=   4.6s
[CV] END model__colsample_bytree=0.3, model__gamma=0, model__learning_rate=0.001, mod

In [35]:
# Show the winning hyperparameters (keys are '<step>__<parameter>').
print(best_params)

{'model__colsample_bytree': 0.5, 'model__gamma': 0, 'model__learning_rate': 0.001, 'model__max_depth': 4, 'model__n_estimators': 2000, 'pca__n_components': 310}


In [38]:
# Rebuild the winning pipeline from best_params and refit it on the full
# training set.

# Keep only the entries addressed to the 'model' step and strip that prefix.
# Matching on the exact 'model__' prefix (rather than the substring "model")
# avoids picking up unrelated keys, and slicing the prefix off keeps any
# nested parameter name intact.
model_prefix = "model__"
xgb_params = {
    name[len(model_prefix):]: value
    for name, value in best_params.items()
    if name.startswith(model_prefix)
}

best_model = Pipeline([
    ('scaler', StandardScaler()),
    # Same seed as the PCA step used during the grid search, so the refitted
    # pipeline matches the configuration that was cross-validated.
    ('pca', PCA(n_components=best_params['pca__n_components'], random_state=42)),
    ('xgb', xgb.XGBRegressor(**xgb_params)),
])

best_model.fit(X_train, y_train)

### Transform Dataset to DMatrix Format

In [None]:
#dtrain_reg = xgb.DMatrix(X_train, y_train)

### Train Model

In [None]:
# params = {"objective": "reg:squarederror"}

# n = 500
# model = xgb.train(
#    params=params,
#    dtrain=dtrain_reg,
#    num_boost_round=n,
# )

### Predict on Test Set

In [39]:
# best_model is a scikit-learn Pipeline, so the test DataFrame is passed to it
# directly; no conversion to xgb.DMatrix is done.
predictions = best_model.predict(X_test)

### Save Results

In [40]:
# Write the predictions to a CSV file: one row per test id with its score.
# The file name is a plain string literal because nothing is interpolated into it.
output_filename = 'bork_GYCAOB_XGBoost_R.csv'
results_df = pd.DataFrame({'id': ids, 'score': predictions})
results_df.to_csv(output_filename, index=False)