# Run grid search

In [6]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Imbalance Sampling
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

# Pipelines imports
import xgboost
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import GenericUnivariateSelect,f_regression
from sklearn.ensemble import IsolationForest
from sklearn.multiclass import OneVsRestClassifier


# Scoring function
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import make_scorer
from sklearn.base import TransformerMixin,BaseEstimator
#from scipy import signal

# Keras imports
import tensorflow as tf
from tensorflow.keras import utils
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing import sequence 
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Input, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv1D, BatchNormalization, GlobalAveragePooling1D, Bidirectional, MaxPooling1D

# Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [10]:
# Get data
Xdf = pd.read_csv("X_train.csv")
XTestdf = pd.read_csv("X_test.csv")
ydf = pd.read_csv("y_train.csv")
X = Xdf[Xdf.columns[1:]].values
Xtest = XTestdf[XTestdf.columns[1:]].values
y = ydf[ydf.columns[1]].values

# Random Oversampler
Oversample = True
#ros = RandomOverSampler(random_state=42)
#X_resampled, y_resampled = ros.fit_resample(X, y)
X_resampled, y_resampled = SMOTE().fit_resample(X, y)
#X_resampled, y_resampled = ADASYN().fit_resample(X, y)

# encode class values as integers (Already in interger form)
#encoder = LabelEncoder()
#encoder.fit(y)
#encoded_y = encoder.transform(y)
# convert integers to dummy variables (i.e. one hot encoded)
one_hot = False
if one_hot:
    y = np_utils.to_categorical(y)

print(X.shape)
print(y.shape)

def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.RandomState(seed=42).permutation(len(a))
    return a[p], b[p]

# Count number of occurances per class
if one_hot:
    counts = np.sum(y,axis=0, dtype=int)
else:
    _, counts = np.unique(y,return_counts=True)
print(counts)

# Divide by class
if one_hot:
    X0 = X[y[:, 1] != 1]
    X1 = X[y[:, 1] == 1]
    y0 = y[y[:, 1] != 1]
    y1 = y[y[:, 1] == 1]
else:     
    X0 = X[y != 1]
    X1 = X[y == 1]
    y0 = y[y != 1]
    y1 = y[y == 1]
print(X0.shape)
print(X1.shape)
print(y0.shape)
print(y1.shape)

# Downsampling
indices_subsampled = np.random.RandomState(seed=42).choice(range(X1.shape[0]),int(counts[0]),replace=False)
X1_subsampled = X1[indices_subsampled]
y1_subsampled = y1[indices_subsampled]
print(X1_subsampled.shape)
print(y1_subsampled.shape)
    
X = np.concatenate((X0,X1_subsampled))
y = np.concatenate((y0,y1_subsampled))

if Oversample:
    X = X_resampled
    y = y_resampled
    
X,y = unison_shuffled_copies(X,y)
print(X.shape)
print(y.shape)

(4800, 1000)
(4800,)
[ 600 3600  600]
(1200, 1000)
(3600, 1000)
(1200,)
(3600,)
(600, 1000)
(600,)
(10800, 1000)
(10800,)


In [8]:
# Function to get best estimator
def get_best_estimator(pipeline, X, y, parameters, scoring, cv=5, verbose=0, n_jobs=None):
    print('Finding best parameters through grid search...')
    grid = GridSearchCV(pipeline, param_grid=parameters, scoring=scoring, cv=cv, verbose=verbose, refit=True, return_train_score=False, n_jobs=n_jobs)
    grid.fit(X, y)
    print('Done!')
    return grid.best_estimator_, grid

def find_best(pipeline, X, y, parameters, scoring, cv=5, verbose=0, n_jobs=None):
    best_pipeline, grid = get_best_estimator(pipeline, X, y, parameters, scoring, cv=cv, verbose=verbose, n_jobs=n_jobs)
    return grid

class Revert1Hot(BaseEstimator,TransformerMixin):

    # here you define the operation it should perform
    def transform(self, X, y=None, **fit_params):
        return np.argmax(X)

    # just return self
    def fit(self, X, y=None, **fit_params):
        return self

## OneVsRestClassifier

In [12]:
score = make_scorer(balanced_accuracy_score)

#classifiers = [
#    KNeighborsClassifier(3),
#    SVC(kernel="rbf", C=0.025, probability=True),
#    NuSVC(probability=True),
#    DecisionTreeClassifier(),
#    RandomForestClassifier(),
#    AdaBoostClassifier(),
#    GradientBoostingClassifier()
#    ]

steps = [
    
    ('scaler', StandardScaler()), 
    ('ufs',GenericUnivariateSelect(score_func=f_regression, mode='k_best', param=200)),
    ('XGB', OneVsRestClassifier(xgboost.XGBClassifier(objective=score,
                                                      colsample_bytree=0.6,
                                                      min_child_weight=6,
                                                      max_depth=8)))
     ]

pipeline = Pipeline(steps)

parameters = {'ufs__param':[1000]}


grid = find_best(pipeline, X, y, parameters, score, cv=3, verbose=1, n_jobs=-1)

Finding best parameters through grid search...
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


TypeError: __call__() missing 1 required positional argument: 'y_true'

In [31]:
pd.DataFrame(grid.cv_results_)[['mean_fit_time','mean_score_time','params','mean_test_score','std_test_score']]

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_score,std_test_score
0,238.078083,0.38562,{'ufs__param': 1000},0.93037,0.003088


In [11]:
pipeline.fit(X,y,scoring=score)
result = pipeline.predict(Xtest)

ValueError: Pipeline.fit does not accept the scoring parameter. You can pass parameters to specific steps of your pipeline using the stepname__parameter format, e.g. `Pipeline.fit(X, y, logisticregression__sample_weight=sample_weight)`.

## Deep Learning Model

In [12]:
def create_model():
    num_features=1000
    mid_size=100
    dropout=0.1
    
    # Model Definition
    model = Sequential()
    model.add(Dense(num_features, activation="relu", input_shape=(num_features,)))
    # model.add(Conv1D(filters=64, kernel_size=5, activation='relu', input_shape=(1000,)))
    model.add(Dropout(dropout))
    model.add(Dense(200, activation="relu"))
    model.add(Dropout(dropout))
    model.add(Dense(50, activation="relu"))
    model.add(Dropout(dropout))
    model.add(Dense(3, activation="softmax"))

    #compile model using accuracy to measure model performance
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'], 
                  weighted_metrics=['accuracy'])

    model.summary()
    return model

#Define Early stopping mechanism
es = EarlyStopping(monitor='val_loss', mode='max', patience=5, verbose=1)

# wrap the model using the function you created   , callbacks=[es]
clf = KerasClassifier(build_fn=create_model, epochs=20, batch_size=5, verbose=1)

In [13]:
kfold = KFold(n_splits=3, shuffle=True)
results = cross_val_score(clf, X, y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_48 (Dense)             (None, 1000)              1001000   
_________________________________________________________________
dropout_36 (Dropout)         (None, 1000)              0         
_________________________________________________________________
dense_49 (Dense)             (None, 200)               200200    
_________________________________________________________________
dropout_37 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_50 (Dense)             (None, 50)                10050     
_________________________________________________________________
dropout_38 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_51 (Dense)             (None, 50)                2550      
__________

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Baseline: 66.83% (2.66%)


In [75]:
score = make_scorer(balanced_accuracy_score)

steps = [
    
    ('scaler', StandardScaler()), 
    ('ufs',GenericUnivariateSelect(score_func=f_regression, mode='k_best', param=200)),
    ('clf', clf),
    ('revert1hot', Revert1Hot())
     ]

pipeline = Pipeline(steps)

#parameters = [{'ufs__param':[200], 'clf__features':[200], 'clf__mid_size':[50, 10]},
#             {'ufs__param':[500], 'clf__features':[500], 'clf__mid_size':[100, 20]},
#             {'ufs__param':[1000], 'clf__features':[1000], 'clf__mid_size':[200, 100]},]

parameters = {'ufs__param':[200], 'clf__features':[200], 'clf__mid_size':[50]}
              
grid = find_best(pipeline, X, y, parameters, score, cv=5, verbose=1, n_jobs=-1)

pandas.DataFrame(grid.cv_results_)[['mean_fit_time','mean_score_time','params','mean_test_score','std_test_score']]

TypeError: All intermediate steps should be transformers and implement fit and transform. '<keras.wrappers.scikit_learn.KerasClassifier object at 0x7fa114fe2320>' (type <class 'keras.wrappers.scikit_learn.KerasClassifier'>) doesn't