## Grid search
I want to perform a grid search accross different column subsets


In [3]:
import sys
sys.path.append('..')

from metrics import default_competition_metric
from metrics import make_competition_scorer, competition_scoring, ColumnSelector


import numpy as np
import pandas as pd

import xgboost as xgb
from sklearn.preprocessing import StandardScaler

np.random.seed(44)

In [4]:
# device = 'cuda' # modify if needed

In [5]:
X_train = np.load('../../data/x_train.npy')
y_train = np.load('../../data/y_train.npy')
X_val = np.load('../../data/x_val.npy')
y_val = np.load('../../data/y_val.npy')


In [6]:
# basic xgboost model
model = xgb.XGBClassifier(n_estimators=1000, max_depth=5, verbosity=2)
model.fit(X_train, y_train)

In [7]:
y_pred = model.predict(X_val)

default_competition_metric(y_val, y_pred=y_pred, k=X_train.shape[1])

-97850.0

In [8]:
print(f"Accuracy: {np.mean(y_val == y_pred)}")

Accuracy: 0.649


### Pipeline

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[:, self.columns]

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.compose import ColumnTransformer

# pipeline
pipe = Pipeline([
    ("feature_selection", ColumnSelector(columns = [100, 101])),
    ('scaler', StandardScaler()),
    ('model', xgb.XGBClassifier(n_estimators=1000, max_depth=5, verbosity=2, random_state=44))
])


pipe.fit(X_train, y_train)

y_proba = pipe.predict_proba(X_val)[:, 1]
default_competition_metric(y_val, k=2, y_pred_proba=y_proba)

5800.0

In [11]:
competition_scoring(pipe, X_val, y_val, scale_metric=True)

5800.0

In [12]:
default_competition_metric(y_val, y_pred_proba=pipe.predict_proba(X_val)[:, 1], k=2)

5800.0

In [13]:
competition_scoring(pipe, X_val, y_val)

5800.0

## Grid search on features from Nearest Shrunken Centroids

In [14]:
features_to_train = np.array([403, 458, 466, 489, 498])

In [15]:
from sklearn.ensemble import RandomForestClassifier

pipe = Pipeline([
    ("feature_selection", ColumnSelector()),
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(n_estimators=1000, random_state=44))
])

def get_all_subsets(input_list):
    subsets = []
    n = len(input_list)
    for i in range(1, 2**n):
        subset = [input_list[j] for j in range(n) if (i & (1 << j))]
        subsets.append(subset)
    return subsets

grid = {
    'feature_selection__columns': get_all_subsets(features_to_train),
    'model__bootstrap': [True, False],
    'model__max_depth': [10, 20, 40, 60, 80, 100, None],
    'model__max_features': ['log2', 'sqrt'],
    'model__min_samples_leaf': [1, 2, 4],
    'model__min_samples_split': [2, 5, 10],
    'model__n_estimators': [200, 400, 600, 800, 1000, 1200, 1600, 2000]    
}

grid_search = RandomizedSearchCV(pipe, grid, cv=5, scoring=competition_scoring, verbose=2, n_iter = 100)

grid_search.fit(X_train, y_train)
print("Best score: ", grid_search.best_score_)
print("Best parameters: ", grid_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END feature_selection__columns=[403, 458, 466], model__bootstrap=False, model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=1000; total time=   5.5s
[CV] END feature_selection__columns=[403, 458, 466], model__bootstrap=False, model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=1000; total time=   4.7s
[CV] END feature_selection__columns=[403, 458, 466], model__bootstrap=False, model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=1000; total time=   4.9s
[CV] END feature_selection__columns=[403, 458, 466], model__bootstrap=False, model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=1000; total time=   5.3s
[CV] END feature_selection__colum

275 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
275 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\tymot\Documents\studia\aml\pro\adv_ml_project_2\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\tymot\Documents\studia\aml\pro\adv_ml_project_2\.venv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\tymot\Documents\studia\aml\pro\adv_ml_project_2\.venv\Lib\site-packages\sklearn\pipeline.py", line 476, in fit
    self._final_es

Best score:  4975.0
Best parameters:  {'model__n_estimators': 1200, 'model__min_samples_split': 5, 'model__min_samples_leaf': 1, 'model__max_features': 'sqrt', 'model__max_depth': 20, 'model__bootstrap': True, 'feature_selection__columns': [403, 466]}


In [16]:
best_params_rf_nsc = grid_search.best_params_
pipe.set_params(**best_params_rf_nsc)
pipe.fit(X_train, y_train)
competition_scoring(pipe, X_val, y_val)

4500.0

## XGBoost features

In [17]:
# larger training set
features_to_train

array([403, 458, 466, 489, 498])

In [18]:
pipe = Pipeline([
    ("feature_selection", ColumnSelector()),
    ('scaler', StandardScaler()),
    ('model', xgb.XGBClassifier(n_estimators=1000, max_depth=5, verbosity=2, random_state=44))
])

def get_all_subsets(input_list):
    subsets = []
    n = len(input_list)
    for i in range(1, 2**n):
        subset = [input_list[j] for j in range(n) if (i & (1 << j))]
        subsets.append(subset)
    return subsets

grid = {
    'feature_selection__columns': get_all_subsets(features_to_train),
    'model__n_estimators': [100, 500, 1000],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],   
    'model__colsample_bytree': [0.5, 0.75, 1]     
    
}

grid_search = RandomizedSearchCV(pipe, grid, cv=5, scoring=competition_scoring, verbose=2, n_iter = 100)

grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END feature_selection__columns=[403, 458], model__colsample_bytree=1, model__learning_rate=0.05, model__max_depth=7, model__n_estimators=100; total time=   0.1s
[CV] END feature_selection__columns=[403, 458], model__colsample_bytree=1, model__learning_rate=0.05, model__max_depth=7, model__n_estimators=100; total time=   0.0s
[CV] END feature_selection__columns=[403, 458], model__colsample_bytree=1, model__learning_rate=0.05, model__max_depth=7, model__n_estimators=100; total time=   0.0s
[CV] END feature_selection__columns=[403, 458], model__colsample_bytree=1, model__learning_rate=0.05, model__max_depth=7, model__n_estimators=100; total time=   0.0s
[CV] END feature_selection__columns=[403, 458], model__colsample_bytree=1, model__learning_rate=0.05, model__max_depth=7, model__n_estimators=100; total time=   0.0s
[CV] END feature_selection__columns=[466, 489], model__colsample_bytree=0.75, model__learning_rate=0.1, mod

{'model__n_estimators': 500,
 'model__max_depth': 5,
 'model__learning_rate': 0.01,
 'model__colsample_bytree': 0.5,
 'feature_selection__columns': [403]}

In [19]:
print("Best score: ", grid_search.best_score_)
print("Best parameters: ", grid_search.best_params_)

Best score:  4975.0
Best parameters:  {'model__n_estimators': 500, 'model__max_depth': 5, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.5, 'feature_selection__columns': [403]}


In [20]:
best_params_nsc = {'model__n_estimators': 500, 'model__max_depth': 5, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.5, 'feature_selection__columns': [403]}

In [21]:
pipe.set_params(**best_params_nsc)

In [22]:
pipe.fit(X_train, y_train)

y_proba = pipe.predict_proba(X_val)[:, 1]
competition_scoring(pipe, X_val, y_val, scale_metric=True)

5400.000000000001

In [33]:
best_params_by_hand = best_params_nsc
best_params_by_hand["feature_selection__columns"] = [100, 105, 102, 8, 403]

In [34]:
pipe.set_params(**best_params_by_hand)
pipe.fit(X_train, y_train)

y_proba = pipe.predict_proba(X_val)[:, 1]
competition_scoring(pipe, X_val, y_val, scale_metric=True)

6400.0

## SVM

In [35]:
features_to_train

array([403, 458, 466, 489, 498])

In [36]:
# svm
from sklearn.svm import SVC

pipe = Pipeline([
    ("feature_selection", ColumnSelector()),
    ('scaler', StandardScaler()),
    ('model', SVC(probability=True, random_state=44))
])

def get_all_subsets(input_list):
    subsets = []
    n = len(input_list)
    for i in range(1, 2**n):
        subset = [input_list[j] for j in range(n) if (i & (1 << j))]
        subsets.append(subset)
    return subsets

grid = {
    'feature_selection__columns': get_all_subsets(features_to_train),
    'model__C': [0.1, 1, 10, 100],
    'model__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'model__kernel': ['rbf', 'poly', 'sigmoid', 'linear']
}

grid_search = RandomizedSearchCV(pipe, grid, cv=5, scoring=competition_scoring, verbose=2, n_iter = 100)

grid_search.fit(X_train, y_train)
print("Best score: ", grid_search.best_score_)
print("Best parameters: ", grid_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END feature_selection__columns=[403, 458, 489], model__C=100, model__gamma=1, model__kernel=rbf; total time=   2.8s
[CV] END feature_selection__columns=[403, 458, 489], model__C=100, model__gamma=1, model__kernel=rbf; total time=   2.7s
[CV] END feature_selection__columns=[403, 458, 489], model__C=100, model__gamma=1, model__kernel=rbf; total time=   2.8s
[CV] END feature_selection__columns=[403, 458, 489], model__C=100, model__gamma=1, model__kernel=rbf; total time=   2.9s
[CV] END feature_selection__columns=[403, 458, 489], model__C=100, model__gamma=1, model__kernel=rbf; total time=   3.1s
[CV] END feature_selection__columns=[489, 498], model__C=1, model__gamma=1, model__kernel=poly; total time=   3.4s
[CV] END feature_selection__columns=[489, 498], model__C=1, model__gamma=1, model__kernel=poly; total time=   3.7s
[CV] END feature_selection__columns=[489, 498], model__C=1, model__gamma=1, model__kernel=poly; total 

In [42]:

print("Best score: ", pd.DataFrame(grid_search.cv_results_).iloc[grid_search.best_index_])

Best score:  mean_fit_time                                                                1.569434
std_fit_time                                                                 0.275001
mean_score_time                                                               0.06232
std_score_time                                                                0.01231
param_model__kernel                                                           sigmoid
param_model__gamma                                                               0.01
param_model__C                                                                  100.0
param_feature_selection__columns                                           [466, 498]
params                              {'model__kernel': 'sigmoid', 'model__gamma': 0...
split0_test_score                                                              4725.0
split1_test_score                                                              5162.5
split2_test_score                        

In [43]:
pd.DataFrame(grid_search.cv_results_).to_csv("svm_nsc_results.csv")

In [37]:
pipe.set_params(**grid_search.best_params_)

pipe.fit(X_train, y_train)

competition_scoring(pipe, X_val, y_val)

4650.0

In [27]:
nsc_by_hand = [466, 498, 412, 409, 471, 481]

pipe.set_params(**{'feature_selection__columns': nsc_by_hand})

In [28]:
pipe.fit(X_train, y_train)

competition_scoring(pipe, X_val, y_val)

3450.0