# Run grid search

In [2]:
# Imports
import pandas
import numpy as np
import matplotlib.pyplot as plt




# Pipelines imports
import xgboost
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import GenericUnivariateSelect,f_regression
from sklearn.ensemble import IsolationForest

# Scoring function
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import make_scorer

from scipy import signal


# Grid Search
from sklearn.model_selection import GridSearchCV

# Function to get best estimator
def get_best_estimator(pipeline, X, y, parameters, scoring, cv=5, verbose=0):
    print('Finding best parameters through grid search...')
    grid = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring=scoring, verbose=verbose, refit=True, return_train_score=False)
    grid.fit(X, y)
    print('Done!')
    return grid.best_estimator_, grid

def find_best(pipeline, X, y, parameters, scoring, cv=5, verbose=0):
    best_pipeline, grid = get_best_estimator(pipeline, X, y, parameters, scoring, cv=cv, verbose=verbose)
    return grid

# Get data
Xdf = pandas.read_csv("X_train.csv")
ydf = pandas.read_csv("y_train.csv")
X = Xdf[Xdf.columns[1:]].values
y = ydf[ydf.columns[1]].values

print(X.shape)
print(y.shape)

def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

# Random under sampling
unique,counts = np.unique(y,return_counts=True)
print(counts)


# Divide by class
X0 = X[y != 1]
X1 = X[y == 1]
y0 = y[y != 1]
y1 = y[y == 1]
indices_subsampled = np.random.choice(range(X1.shape[0]),counts[0],replace=False)
X1_subsampled = X1[indices_subsampled]
y1_subsampled = y1[indices_subsampled]
print(X1_subsampled.shape)
print(y1_subsampled.shape)

X = np.concatenate((X0,X1_subsampled))
y = np.concatenate((y0,y1_subsampled))

X,y = unison_shuffled_copies(X,y)
print(X.shape)
print(y.shape)




(4800, 1000)
(4800,)
[ 600 3600  600]
(600, 1000)
(600,)
(1800, 1000)
(1800,)


In [3]:
score = make_scorer(balanced_accuracy_score)

steps = [
    
    ('scaler', StandardScaler()), 
    ('ufs',GenericUnivariateSelect(score_func=f_regression, mode='k_best', param=200)),
    
    ('XGB', xgboost.XGBClassifier(colsample_bytree=0.6, 
                                 min_child_weight=6, 
                                 max_depth=10))
     ]

pipeline = Pipeline(steps)

parameters = {'ufs__param':[200, 500, 1000], 'XGB_max_depth':[5, 8, 10, 12]}


grid = find_best(pipeline, X, y, parameters, score, cv=5, verbose=1)

pandas.DataFrame(grid.cv_results_)[['mean_fit_time','mean_score_time','params','mean_test_score','std_test_score']]

Finding best parameters through grid search...
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


ValueError: Invalid parameter XGB_max_depth for estimator Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ufs', GenericUnivariateSelect(mode='k_best', param=200,
            score_func=<function f_regression at 0x7fa0f7812ae8>)), ('XGB', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_byn...lpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1))]). Check the list of available parameters with `estimator.get_params().keys()`.