## Grid search
I want to perform a grid search across different column subsets.


In [1]:
import sys
sys.path.append('..')

from metrics import default_competition_metric
from metrics import make_competition_scorer, competition_scoring, ColumnSelector


import numpy as np
import pandas as pd

import xgboost as xgb
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random number generator so that steps drawing from it
# give the same results on every top-to-bottom run.
np.random.seed(44)

In [2]:
# device = 'cuda' # modify if needed

In [3]:
# Load the pre-split training and validation arrays (features and labels)
# from the shared data directory.
X_train, y_train = (
    np.load('../../data/x_train.npy'),
    np.load('../../data/y_train.npy'),
)
X_val, y_val = (
    np.load('../../data/x_val.npy'),
    np.load('../../data/y_val.npy'),
)


In [4]:
# Baseline: an XGBoost classifier fitted on every column of the training
# matrix, used as a reference point for the feature-subset pipelines below.
model = xgb.XGBClassifier(
    n_estimators=1000,
    max_depth=5,
    verbosity=2,
)
model.fit(X_train, y_train)

In [5]:
# Hard class predictions of the baseline model on the validation split.
y_pred = model.predict(X_val)

# The baseline was fitted on all columns, so k is the full column count.
# NOTE(review): k presumably is the number of features the metric charges
# for — confirm against metrics.default_competition_metric.
default_competition_metric(y_val, y_pred=y_pred, k=X_train.shape[1])

-97850.0

In [6]:
# Share of validation labels that the baseline predicted exactly.
print(f"Accuracy: {np.mean(y_val == y_pred)}")

Accuracy: 0.649


### Pipeline

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the requested columns of a 2-D array.

    Parameters
    ----------
    columns : sequence of int, optional
        Positional column indices to keep. ``None`` (the default) keeps every
        column, so a selector created without arguments is a pass-through.

    Notes
    -----
    This definition shadows the ``ColumnSelector`` imported from ``metrics``
    in the first cell.
    """

    def __init__(self, columns=None):
        # Stored as given (no validation or copying) so that scikit-learn's
        # get_params / set_params / clone machinery can round-trip the value.
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to ``self.columns`` (all columns if None)."""
        if self.columns is None:
            # Indexing with None means np.newaxis: X[:, None] would insert an
            # extra axis instead of selecting columns, so pass X through.
            return X
        return X[:, self.columns]

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.compose import ColumnTransformer

# Two-feature pipeline: keep columns 100 and 101, standardise them, then fit
# an XGBoost classifier with a fixed random_state.
pipe = Pipeline(
    steps=[
        ("feature_selection", ColumnSelector(columns=[100, 101])),
        ("scaler", StandardScaler()),
        (
            "model",
            xgb.XGBClassifier(n_estimators=1000, max_depth=5, verbosity=2, random_state=44),
        ),
    ]
)
pipe.fit(X_train, y_train)

# Second predict_proba column (positive class) on the validation split;
# k=2 matches the two selected columns.
y_proba = pipe.predict_proba(X_val)[:, 1]
default_competition_metric(y_val, k=2, y_pred_proba=y_proba)

5800.0

In [11]:
# Score the fitted pipeline through the scorer-style helper with
# scale_metric=True.
competition_scoring(pipe, X_val, y_val, scale_metric=True)

5800.0

In [13]:
# Same metric computed directly from the positive-class probabilities, with
# k=2 for the two columns the pipeline selects.
default_competition_metric(y_val, y_pred_proba=pipe.predict_proba(X_val)[:, 1], k=2)

5800.0

In [15]:
# Scorer-style helper again, this time leaving scale_metric at its default.
competition_scoring(pipe, X_val, y_val)

5800.0

In [16]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score

# Sample data
# NOTE(review): `data` is not referenced anywhere else in the visible
# notebook; the frame below is built from X_train instead. It looks like a
# template leftover — confirm nothing depends on it before removing.
data = {
    'feature_1': [1, 2, 3, 4],
    'feature_2': [5, 6, 7, 8],
    'feature_3': [9, 10, 11, 12],
    'feature_4': [13, 14, 15, 16]
}
# Wrap the training matrix in a DataFrame; the search below reads it back
# through df.values.
df = pd.DataFrame(X_train)
y = y_train

# Custom transformer for column selection
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the requested columns of a 2-D array.

    Parameters
    ----------
    columns : sequence of int, optional
        Positional column indices to keep. ``None`` (the default) keeps every
        column, so a selector created without arguments is a pass-through.

    Notes
    -----
    Re-declares the class already defined in the "Pipeline" section; this
    later definition is the one in effect for the cells that follow.
    """

    def __init__(self, columns=None):
        # Stored as given (no validation or copying) so that scikit-learn's
        # get_params / set_params / clone machinery can round-trip the value.
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to ``self.columns`` (all columns if None)."""
        if self.columns is None:
            # Indexing with None means np.newaxis: X[:, None] would insert an
            # extra axis instead of selecting columns, so pass X through.
            return X
        return X[:, self.columns]



# Candidate column subsets; the key addresses the `columns` parameter of the
# pipeline step named "feature_selection".
param_grid = {
    'feature_selection__columns': [
        [0, 1],
        [1, 2],
        [0, 2, 3],
    ],
}

# Selector -> scaler -> random forest. The selector's initial columns are
# only a starting value; the search overrides them for each candidate.
pipeline = Pipeline(
    steps=[
        ('feature_selection', ColumnSelector(columns=[0, 1])),
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier()),
    ]
)

# Exhaustive 3-fold search over the subsets, ranked by the competition scorer.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=competition_scoring,
    cv=3,
)
grid_search.fit(df.values, y)

# Report the winning subset and its mean cross-validated score.
print("Best parameters found: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)


Best parameters found:  {'feature_selection__columns': [0, 1]}
Best score:  5250.025699828256


## Grid search on features from Boruta

In [38]:
# Candidate column indices for this section (per the heading, the features
# retained by Boruta); every non-empty subset of them is searched below.
features_to_train = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 100, 101, 102, 103, 104, 105])

In [39]:
# Search template: the selector is created without columns because the search
# grid in this cell supplies `feature_selection__columns` for each candidate.
pipe = Pipeline(
    steps=[
        ("feature_selection", ColumnSelector()),
        ("scaler", StandardScaler()),
        (
            "model",
            xgb.XGBClassifier(n_estimators=1000, max_depth=5, verbosity=2, random_state=44),
        ),
    ]
)

def get_all_subsets(input_list):
    """Enumerate every non-empty subset of ``input_list``.

    Subsets are produced in bitmask order: for ``mask`` running from 1 to
    ``2**n - 1``, the subset holds ``input_list[pos]`` for each bit ``pos``
    set in ``mask``. Elements keep their original relative order inside each
    subset. An empty input yields an empty list.

    Returns
    -------
    list of list
        ``2**n - 1`` subsets for an input of length ``n``.
    """
    size = len(input_list)
    return [
        [input_list[pos] for pos in range(size) if (mask >> pos) & 1]
        for mask in range(1, 1 << size)
    ]

# Search space for the XGBoost pipeline: every non-empty subset of the
# candidate columns crossed with a small set of booster settings. Only
# parameters that XGBClassifier actually exposes are listed; tree-ensemble
# options from other estimators (bootstrap, max_features, min_samples_leaf,
# min_samples_split) do not apply to the 'model' step.
grid = {
    'feature_selection__columns': get_all_subsets(features_to_train),
    'model__n_estimators': [100, 500, 1000],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'model__colsample_bytree': [0.5, 0.75, 1],
}

# Randomly sample 500 candidates from the space and rank them by the
# competition scorer under 5-fold cross-validation. No random_state is passed,
# so the sampling draws from NumPy's global RNG (seeded in the first cell).
grid_search = RandomizedSearchCV(pipe, grid, cv=5, scoring=competition_scoring, verbose=2, n_iter=500)

grid_search.fit(X_train, y_train)
print("Best score: ", grid_search.best_score_)
print("Best parameters: ", grid_search.best_params_)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[CV] END feature_selection__columns=[0, 1, 2, 4, 6, 7, 8, 100, 103], model__colsample_bytree=1, model__learning_rate=0.05, model__max_depth=3, model__n_estimators=1000; total time=   0.5s
[CV] END feature_selection__columns=[0, 1, 2, 4, 6, 7, 8, 100, 103], model__colsample_bytree=1, model__learning_rate=0.05, model__max_depth=3, model__n_estimators=1000; total time=   0.6s
[CV] END feature_selection__columns=[0, 1, 2, 4, 6, 7, 8, 100, 103], model__colsample_bytree=1, model__learning_rate=0.05, model__max_depth=3, model__n_estimators=1000; total time=   0.6s
[CV] END feature_selection__columns=[0, 1, 2, 4, 6, 7, 8, 100, 103], model__colsample_bytree=1, model__learning_rate=0.05, model__max_depth=3, model__n_estimators=1000; total time=   0.5s
[CV] END feature_selection__columns=[0, 1, 2, 4, 6, 7, 8, 100, 103], model__colsample_bytree=1, model__learning_rate=0.05, model__max_depth=3, model__n_estimators=1000; total time=   0

In [49]:
# Hand-pinned configuration for the Boruta candidates.
# NOTE(review): presumably copied from the search output above — confirm.
best_params_boruta = {
    'model__n_estimators': 1000,
    'model__max_depth': 5,
    'model__learning_rate': 0.01,
    'model__colsample_bytree': 0.5,
    'feature_selection__columns': [8, 101, 102, 105],
}

# Apply the configuration, refit on the full training split and score on
# the validation split.
pipe.set_params(**best_params_boruta)
pipe.fit(X_train, y_train)
competition_scoring(pipe, X_val, y_val)

6500.0

## MRMR features

In [40]:
# larger training set
# Candidate column indices for the MRMR section; this rebinds the
# features_to_train name used by the Boruta section.
features_to_train = np.array([100, 102, 105,403, 466])

In [41]:
# Search template: the selector is created without columns because the search
# grid in this cell supplies `feature_selection__columns` for each candidate.
pipe = Pipeline(
    steps=[
        ("feature_selection", ColumnSelector()),
        ("scaler", StandardScaler()),
        (
            "model",
            xgb.XGBClassifier(n_estimators=1000, max_depth=5, verbosity=2, random_state=44),
        ),
    ]
)

def get_all_subsets(input_list):
    """Enumerate every non-empty subset of ``input_list``.

    Subsets are produced in bitmask order: for ``mask`` running from 1 to
    ``2**n - 1``, the subset holds ``input_list[pos]`` for each bit ``pos``
    set in ``mask``. Elements keep their original relative order inside each
    subset. An empty input yields an empty list.

    This repeats the helper defined in the Boruta section so that the cell
    can run on its own.

    Returns
    -------
    list of list
        ``2**n - 1`` subsets for an input of length ``n``.
    """
    size = len(input_list)
    return [
        [input_list[pos] for pos in range(size) if (mask >> pos) & 1]
        for mask in range(1, 1 << size)
    ]

# Search space: every non-empty subset of the MRMR candidate columns crossed
# with a small set of XGBoost settings.
grid = {
    'feature_selection__columns': get_all_subsets(features_to_train),
    'model__n_estimators': [100, 500, 1000],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'model__colsample_bytree': [0.5, 0.75, 1],
}

# Sample 500 candidates and rank them by the competition scorer under
# 5-fold cross-validation.
grid_search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=grid,
    n_iter=500,
    scoring=competition_scoring,
    cv=5,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Leave the winning configuration as the cell's displayed value.
grid_search.best_params_

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
[CV] END feature_selection__columns=[102, 105, 403], model__colsample_bytree=1, model__learning_rate=0.01, model__max_depth=5, model__n_estimators=500; total time=   0.2s
[CV] END feature_selection__columns=[102, 105, 403], model__colsample_bytree=1, model__learning_rate=0.01, model__max_depth=5, model__n_estimators=500; total time=   0.2s
[CV] END feature_selection__columns=[102, 105, 403], model__colsample_bytree=1, model__learning_rate=0.01, model__max_depth=5, model__n_estimators=500; total time=   0.3s
[CV] END feature_selection__columns=[102, 105, 403], model__colsample_bytree=1, model__learning_rate=0.01, model__max_depth=5, model__n_estimators=500; total time=   0.3s
[CV] END feature_selection__columns=[102, 105, 403], model__colsample_bytree=1, model__learning_rate=0.01, model__max_depth=5, model__n_estimators=500; total time=   0.2s
[CV] END feature_selection__columns=[403, 466], model__colsample_bytree=1, model_

{'model__n_estimators': 1000,
 'model__max_depth': 3,
 'model__learning_rate': 0.01,
 'model__colsample_bytree': 1,
 'feature_selection__columns': [100, 102, 105]}

In [42]:
# Report the best candidate's cross-validated score and its parameter values.
print("Best score: ", grid_search.best_score_)
print("Best parameters: ", grid_search.best_params_)

Best score:  6787.5
Best parameters:  {'model__n_estimators': 1000, 'model__max_depth': 3, 'model__learning_rate': 0.01, 'model__colsample_bytree': 1, 'feature_selection__columns': [100, 102, 105]}


In [46]:
# Winning MRMR configuration, pinned by hand; the values match the
# best_params_ printed by the search above.
best_params_mrmr = {
    'model__n_estimators': 1000,
    'model__max_depth': 3,
    'model__learning_rate': 0.01,
    'model__colsample_bytree': 1,
    'feature_selection__columns': [100, 102, 105],
}


In [47]:
# Push the pinned MRMR configuration into the pipeline's selector and model
# steps (keys use the <step>__<parameter> convention).
pipe.set_params(**best_params_mrmr)

In [48]:
# Refit on the full training split; assumes the previous cell has already
# applied best_params_mrmr to the pipeline.
pipe.fit(X_train, y_train)

# NOTE(review): y_proba is computed here but not passed to the scoring call
# below, which receives the pipeline itself — likely a leftover; confirm
# before removing.
y_proba = pipe.predict_proba(X_val)[:, 1]
competition_scoring(pipe, X_val, y_val, scale_metric=True)

6900.0