## Grid search
I want to perform a grid search accross different column subsets


In [2]:
import sys
sys.path.append('..')

from metrics import default_competition_metric
from metrics import make_competition_scorer, competition_scoring, ColumnSelector


import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

np.random.seed(44)

In [3]:
# device = 'cuda' # modify if needed

In [4]:
X_train = np.load('../../data/x_train.npy')
y_train = np.load('../../data/y_train.npy')
X_val = np.load('../../data/x_val.npy')
y_val = np.load('../../data/y_val.npy')


### Pipeline

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC

# pipeline
pipe = Pipeline([
    ("feature_selection", ColumnSelector(columns = [100, 101])),
    ('scaler', StandardScaler()),
    ('model', SVC(random_state=44, probability=True))
])


pipe.fit(X_train, y_train)

y_proba = pipe.predict_proba(X_val)[:, 1]
default_competition_metric(y_val, k=2, y_pred_proba=y_proba)

6800.0

In [6]:
competition_scoring(pipe, X_val, y_val, scale_metric=True)

6800.0

In [7]:
default_competition_metric(y_val, y_pred_proba=pipe.predict_proba(X_val)[:, 1], k=2)

6800.0

In [8]:
competition_scoring(pipe, X_val, y_val)

6800.0

## Grid search on features from Boruta

In [9]:
features_to_train = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 100, 101, 102, 103, 104, 105])

In [10]:
pipe = Pipeline([
    ("feature_selection", ColumnSelector()),
    ('scaler', StandardScaler()),
    ('model', SVC(random_state=44, probability=True))
])

def get_all_subsets(input_list):
    subsets = []
    n = len(input_list)
    for i in range(1, 2**n):
        subset = [input_list[j] for j in range(n) if (i & (1 << j))]
        subsets.append(subset)
    return subsets

grid = {
    'feature_selection__columns': get_all_subsets(features_to_train),
    'model__C': [0.1, 1, 10, 100],
    'model__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'model__kernel': ['rbf', 'poly', 'sigmoid', 'linear']
}

grid_search = RandomizedSearchCV(pipe, grid, cv=5, scoring=competition_scoring, verbose=2, n_iter = 10)

grid_search.fit(X_train, y_train)
print("Best score: ", grid_search.best_score_)
print("Best parameters: ", grid_search.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END feature_selection__columns=[2, 5, 6, 7, 8, 9, 102, 103, 105], model__C=1, model__gamma=0.0001, model__kernel=rbf; total time=   1.3s
[CV] END feature_selection__columns=[2, 5, 6, 7, 8, 9, 102, 103, 105], model__C=1, model__gamma=0.0001, model__kernel=rbf; total time=   1.2s
[CV] END feature_selection__columns=[2, 5, 6, 7, 8, 9, 102, 103, 105], model__C=1, model__gamma=0.0001, model__kernel=rbf; total time=   1.2s
[CV] END feature_selection__columns=[2, 5, 6, 7, 8, 9, 102, 103, 105], model__C=1, model__gamma=0.0001, model__kernel=rbf; total time=   1.3s
[CV] END feature_selection__columns=[2, 5, 6, 7, 8, 9, 102, 103, 105], model__C=1, model__gamma=0.0001, model__kernel=rbf; total time=   1.3s
[CV] END feature_selection__columns=[0, 3, 4, 5, 6, 7, 100, 101, 102, 103, 105], model__C=1, model__gamma=0.001, model__kernel=linear; total time=   1.3s
[CV] END feature_selection__columns=[0, 3, 4, 5, 6, 7, 100, 101, 102, 103, 

In [11]:
import pandas as pd

results = pd.DataFrame(grid_search.cv_results_)
results.to_csv("svm_results_boruta.csv")

In [13]:
best_params_boruta = {'model__kernel': 'rbf', 'model__gamma': 0.1, 'model__C': 0.1, 'feature_selection__columns': [1, 8, 100, 101, 103, 105]}

pipe.set_params(**best_params_boruta)
pipe.fit(X_train, y_train)
competition_scoring(pipe, X_val, y_val)

6300.0

## MRMR features

In [16]:
# larger training set
features_to_train = np.array([100, 102, 105,403, 466])

In [19]:
pipe = Pipeline([
    ("feature_selection", ColumnSelector()),
    ('scaler', StandardScaler()),
    ('model', SVC(random_state=44, probability=True))
])

def get_all_subsets(input_list):
    subsets = []
    n = len(input_list)
    for i in range(1, 2**n):
        subset = [input_list[j] for j in range(n) if (i & (1 << j))]
        subsets.append(subset)
    return subsets

grid = {
    'feature_selection__columns': get_all_subsets(features_to_train),
    'model__C': [0.1, 1, 10, 100],
    'model__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'model__kernel': ['rbf', 'poly', 'sigmoid', 'linear']
}


grid_search = RandomizedSearchCV(pipe, grid, cv=5, scoring=competition_scoring, verbose=2, n_iter = 50)

grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END feature_selection__columns=[100, 105, 403], model__C=1, model__gamma=0.01, model__kernel=sigmoid; total time=   1.2s
[CV] END feature_selection__columns=[100, 105, 403], model__C=1, model__gamma=0.01, model__kernel=sigmoid; total time=   1.1s
[CV] END feature_selection__columns=[100, 105, 403], model__C=1, model__gamma=0.01, model__kernel=sigmoid; total time=   1.0s
[CV] END feature_selection__columns=[100, 105, 403], model__C=1, model__gamma=0.01, model__kernel=sigmoid; total time=   1.1s
[CV] END feature_selection__columns=[100, 105, 403], model__C=1, model__gamma=0.01, model__kernel=sigmoid; total time=   1.1s
[CV] END feature_selection__columns=[100, 105], model__C=100, model__gamma=0.001, model__kernel=linear; total time=   3.6s
[CV] END feature_selection__columns=[100, 105], model__C=100, model__gamma=0.001, model__kernel=linear; total time=   8.9s
[CV] END feature_selection__columns=[100, 105], model__C=100, 

{'model__kernel': 'rbf',
 'model__gamma': 1,
 'model__C': 10,
 'feature_selection__columns': [100, 105]}

In [34]:
svm_res_mrmr = pd.DataFrame(grid_search.cv_results_)
svm_res_mrmr.to_csv("svm_results_mrmr.csv")

In [20]:
print("Best score: ", grid_search.best_score_)
print("Best parameters: ", grid_search.best_params_)

Best score:  6600.0
Best parameters:  {'model__kernel': 'rbf', 'model__gamma': 1, 'model__C': 10, 'feature_selection__columns': [100, 105]}


In [31]:
best_params_mrmr = {'model__kernel': 'rbf', 'model__gamma': 1, 'model__C': 10, 'feature_selection__columns': [100, 102, 105]}

In [32]:
pipe.set_params(**best_params_mrmr)

In [33]:
pipe.fit(X_train, y_train)

y_proba = pipe.predict_proba(X_val)[:, 1]
competition_scoring(pipe, X_val, y_val, scale_metric=True)

6650.0