# Summary of different grid searches

In [5]:
import sys
sys.path.append('..')

from metrics import default_competition_metric
from metrics import make_competition_scorer, competition_scoring, ColumnSelector


import numpy as np
import pandas as pd

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(44)

In [6]:
# Candidate column indices, one array per feature-selection method
# (names suggest Boruta, mRMR and NSC -- TODO confirm provenance).

# Columns 0-9 followed by columns 100-105.
boruta_features = np.array([*range(10), *range(100, 106)])

small_mrmr_features = np.array([100, 102, 105, 403, 466])

nsc_features = np.array([75, 403, 409, 412, 458, 466, 471, 481, 489, 498])

## XGBoost

In [7]:
# Load the saved training and validation arrays from ../data
# (features first, then labels, for each split).
X_train, y_train = np.load('../data/x_train.npy'), np.load('../data/y_train.npy')
X_val, y_val = np.load('../data/x_val.npy'), np.load('../data/y_val.npy')


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.compose import ColumnTransformer

# Baseline pipeline: keep columns 100 and 101, standardize them, then fit XGBoost.
pipe = Pipeline(
    steps=[
        ('feature_selection', ColumnSelector(columns=[100, 101])),
        ('scaler', StandardScaler()),
        (
            'model',
            xgb.XGBClassifier(
                n_estimators=1000,
                max_depth=5,
                verbosity=2,
                random_state=44,
            ),
        ),
    ]
)

# fit() returns the fitted pipeline, so the probabilities can be read in the
# same expression; column 1 of predict_proba is the second class.
y_proba = pipe.fit(X_train, y_train).predict_proba(X_val)[:, 1]

# Score the validation predictions with the project's competition metric.
default_competition_metric(y_val, k=2, y_pred_proba=y_proba)

5800.0

In [21]:
# XGBoost pipeline template. ColumnSelector is created without arguments here;
# the cells that follow supply the columns through `feature_selection__columns`.
pipe = Pipeline(
    steps=[
        ('feature_selection', ColumnSelector()),
        ('scaler', StandardScaler()),
        (
            'model',
            xgb.XGBClassifier(
                n_estimators=1000,
                max_depth=5,
                verbosity=2,
                random_state=44,
            ),
        ),
    ]
)

In [22]:
# XGBoost settings for the Boruta feature pool (presumably the winning
# configuration of the corresponding grid search -- TODO confirm).
best_params_boruta_xgboost = {
    'model__n_estimators': 1000,
    'model__max_depth': 3,
    'model__learning_rate': 0.01,
    'model__colsample_bytree': 0.75,
    'feature_selection__columns': [5, 100, 101, 105],
}

# set_params() returns the pipeline itself, so configuring and fitting chain.
pipe.set_params(**best_params_boruta_xgboost).fit(X_train, y_train)

# Validation score of the refitted pipeline.
competition_scoring(pipe, X_val, y_val)

6900.0

In [23]:
# XGBoost settings for the mRMR feature pool (presumably the winning
# configuration of the corresponding grid search -- TODO confirm).
best_params_mrmr_xgboost = {
    'model__n_estimators': 1000,
    'model__max_depth': 3,
    'model__learning_rate': 0.01,
    'model__colsample_bytree': 1,
    'feature_selection__columns': [100, 102, 105],
}

# set_params() returns the pipeline itself, so configuring and fitting chain.
pipe.set_params(**best_params_mrmr_xgboost).fit(X_train, y_train)

# Validation score of the refitted pipeline.
competition_scoring(pipe, X_val, y_val)

6900.0

In [10]:
# XGBoost settings for the NSC feature pool (presumably the winning
# configuration of the corresponding grid search -- TODO confirm).
# Only a single column (412) is selected here.
best_params_nsc_xgboost = {
    'model__n_estimators': 100,
    'model__max_depth': 3,
    'model__learning_rate': 0.01,
    'model__colsample_bytree': 0.75,
    'feature_selection__columns': [412],
}

# set_params() returns the pipeline itself, so configuring and fitting chain.
pipe.set_params(**best_params_nsc_xgboost).fit(X_train, y_train)

# Validation score of the refitted pipeline.
competition_scoring(pipe, X_val, y_val)

5449.999999999999

## SVM

In [11]:


# SVM pipeline template. ColumnSelector is created without arguments here;
# the cells that follow supply the columns through `feature_selection__columns`.
# probability=True is required for the SVC to expose predict_proba.
pipe = Pipeline(
    steps=[
        ('feature_selection', ColumnSelector()),
        ('scaler', StandardScaler()),
        ('model', SVC(probability=True, random_state=44)),
    ]
)

In [12]:
# SVM settings for the NSC feature pool (presumably the winning configuration
# of the corresponding grid search -- TODO confirm).
# The dict holds pipeline hyper-parameters, not a feature list, so it follows
# the `best_params_<selector>_<model>` naming used by the other cells.
best_params_nsc_svm = {
    'model__kernel': 'rbf',
    'model__gamma': 0.001,
    'model__C': 10,
    'feature_selection__columns': [466, 498],
}

# Backward-compatible alias for the previous (misleading) variable name.
nsc_features_svm = best_params_nsc_svm

pipe.set_params(**best_params_nsc_svm)

pipe.fit(X_train, y_train)

# Validation score of the refitted pipeline.
competition_scoring(pipe, X_val, y_val)

4500.0

In [13]:
# SVM settings for the Boruta feature pool (presumably the winning
# configuration of the corresponding grid search -- TODO confirm).
best_params_boruta_svm = {
    'model__kernel': 'rbf',
    'model__gamma': 0.1,
    'model__C': 0.1,
    'feature_selection__columns': [1, 8, 100, 101, 103, 105],
}

# set_params() returns the pipeline itself, so configuring and fitting chain.
pipe.set_params(**best_params_boruta_svm).fit(X_train, y_train)

# Validation score of the refitted pipeline.
competition_scoring(pipe, X_val, y_val)

6300.0

In [14]:
# SVM settings for the mRMR feature pool (presumably the winning configuration
# of the corresponding grid search -- TODO confirm).
# These are SVC parameters, so they get their own name instead of reusing
# `best_params_mrmr_xgboost`, which would overwrite the XGBoost settings
# defined earlier in the notebook.
best_params_mrmr_svm = {
    'model__kernel': 'rbf',
    'model__gamma': 1,
    'model__C': 10,
    'feature_selection__columns': [100, 102, 105],
}

pipe.set_params(**best_params_mrmr_svm)

pipe.fit(X_train, y_train)

# Second-class probabilities on the validation set; kept as a notebook-level
# variable, but not passed to the scoring call below.
y_proba = pipe.predict_proba(X_val)[:, 1]

# NOTE(review): this is the only cell that passes scale_metric=True -- confirm
# the resulting score is comparable with the other cells.
competition_scoring(pipe, X_val, y_val, scale_metric=True)

6650.0

## Random Forest

In [9]:

# Random-forest pipeline template. ColumnSelector is created without arguments
# here; the cells that follow supply the columns through
# `feature_selection__columns`.
pipe = Pipeline(
    steps=[
        ('feature_selection', ColumnSelector()),
        ('scaler', StandardScaler()),
        ('model', RandomForestClassifier(n_estimators=1000, random_state=44)),
    ]
)

In [12]:
# Random-forest settings for the Boruta feature pool (presumably the winning
# configuration of the corresponding grid search -- TODO confirm).
best_params_boruta_rf = {
    'model__n_estimators': 1500,
    'model__min_samples_split': 5,
    'model__min_samples_leaf': 1,
    'model__max_features': 'sqrt',
    'model__max_depth': 10,
    'model__bootstrap': True,
    'feature_selection__columns': [5, 100, 103],
}

# set_params() returns the pipeline itself, so configuring and fitting chain.
pipe.set_params(**best_params_boruta_rf).fit(X_train, y_train)

# Validation score of the refitted pipeline.
competition_scoring(pipe, X_val, y_val)

6400.0

In [13]:
# Random-forest settings for the mRMR feature pool (presumably the winning
# configuration of the corresponding grid search -- TODO confirm).
best_params_mrmr_rf = {
    'model__n_estimators': 1000,
    'model__min_samples_split': 10,
    'model__min_samples_leaf': 4,
    'model__max_features': 'sqrt',
    'model__max_depth': 20,
    'model__bootstrap': False,
    'feature_selection__columns': [100, 102, 105],
}

# set_params() returns the pipeline itself, so configuring and fitting chain.
pipe.set_params(**best_params_mrmr_rf).fit(X_train, y_train)

# Validation score of the refitted pipeline.
competition_scoring(pipe, X_val, y_val)

6650.0

In [18]:
# Random-forest settings for the NSC feature pool (presumably the winning
# configuration of the corresponding grid search -- TODO confirm).
best_params_rf_nsc = {
    'model__n_estimators': 400,
    'model__min_samples_split': 2,
    'model__min_samples_leaf': 2,
    'model__max_features': 'sqrt',
    'model__max_depth': 10,
    'model__bootstrap': True,
    'feature_selection__columns': [409, 471, 481],
}

# set_params() returns the pipeline itself, so configuring and fitting chain.
pipe.set_params(**best_params_rf_nsc).fit(X_train, y_train)

# Validation score of the refitted pipeline.
competition_scoring(pipe, X_val, y_val)

4000.0