In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import joblib
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.base import clone
import pickle
import tensorflow as tf

In [3]:
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
class MyDynamicKerasClassifier(BaseEstimator, ClassifierMixin):
    """
    Scikit-learn compatible classifier that builds its Keras model at fit time.

    The wrapper is needed because the input shape is not always known up front
    (e.g. when a PCA step precedes it in a pipeline). Instead of a ready-made
    network it receives a factory callable and calls it with the shape of the
    data that is actually passed to ``fit``.

    Parameters
    ----------
    model : callable
        Factory invoked as ``model(input_shape=...)``; returns the Keras model
        that is then compiled and trained by ``fit``.
    epochs, batch_size, verbose
        Forwarded unchanged to ``keras_model.fit``; ``verbose`` is also
        forwarded to ``keras_model.predict``.
    optimizer, loss, metrics
        Forwarded unchanged to ``keras_model.compile``.
    cnn : bool
        When true, inputs are reshaped to ``cnn_shape_list`` before they are
        handed to Keras.
    cnn_shape_list
        Target shape given to ``reshape``; only read when ``cnn`` is true.

    Attributes
    ----------
    classes_
        Unique values of ``y`` seen by the most recent ``fit``.
    keras_model
        The trained Keras model, or ``None`` before the first ``fit``.
    history
        Return value of the most recent ``keras_model.fit`` call, or ``None``.

    Notes
    -----
    ``get_params`` / ``set_params`` are inherited from ``BaseEstimator``, which
    derives the parameter list from the ``__init__`` signature. Unlike the
    former hand-written ``set_params``, the inherited one raises ``ValueError``
    for unknown parameter names instead of silently ignoring them.
    """

    def __init__(self, model, epochs=None, batch_size=None, verbose=0, optimizer="adam", loss=None, metrics=None, cnn=False, cnn_shape_list=None):
        # Hyper-parameters are stored verbatim so that clone() can rebuild the estimator.
        self.model = model
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        self.loss = loss
        self.optimizer = optimizer
        self.metrics = metrics
        self.cnn = cnn
        self.cnn_shape_list = cnn_shape_list
        # Fitted state; both stay None until fit() has been called.
        self.keras_model = None
        self.history = None

    def _prepare_input(self, x):
        """Return ``x`` in the layout expected by the Keras model."""
        if not self.cnn:
            return x
        if self.cnn_shape_list is None:
            raise ValueError("cnn=True requires cnn_shape_list to be set")
        # np.asarray leaves ndarrays untouched and lets array-likes be reshaped too.
        return np.asarray(x).reshape(self.cnn_shape_list)

    def fit(self, x, y):
        """
        Build, compile and train a fresh Keras model on ``(x, y)``.

        A new network is created on every call, so calling ``fit`` again
        discards the previously trained model instead of silently keeping it.

        Returns
        -------
        self
        """
        # convert y to numpy array and set classes_
        y = np.array(y)
        self.classes_ = np.unique(y)

        x_input = self._prepare_input(x)
        # CNN inputs keep every non-batch dimension; otherwise only the feature count is used.
        input_shape = x_input.shape[1:] if self.cnn else (x_input.shape[1],)

        self.keras_model = self.model(input_shape=input_shape)
        self.keras_model.compile(optimizer=self.optimizer, loss=self.loss, metrics=self.metrics)
        self.history = self.keras_model.fit(
            x_input,
            y,
            epochs=self.epochs,
            batch_size=self.batch_size,
            verbose=self.verbose,
        )
        return self

    def predict(self, x):
        """
        Predict labels with the trained Keras model.

        Returns
        -------
        numpy.ndarray
            * Single output column: an ``int32`` array with the same
              ``(n_samples, 1)`` shape as the network output, holding 1 where
              the output exceeds 0.5 and 0 otherwise.
            * Several output columns: a 1-D array with the index of the largest
              output per sample.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        if self.keras_model is None:
            # NotFittedError derives from ValueError/AttributeError, so callers
            # that catch Exception keep working.
            from sklearn.exceptions import NotFittedError

            raise NotFittedError("The model has not been fitted yet!")

        probabilities = self.keras_model.predict(self._prepare_input(x), verbose=self.verbose)

        # NOTE(review): the returned values are 0/1 flags or column indices, not
        # entries of classes_ - this presumably relies on labels being encoded
        # as 0..k-1; confirm against the callers.
        if probabilities.shape[1] == 1:
            # Use 0.5 as the threshold to convert probabilities to binary labels
            return (probabilities > 0.5).astype('int32')
        # Use argmax for multi-class problems
        return probabilities.argmax(axis=1)

In [4]:
def DNN_gridsearch(x, y):
    """
    Grid-search, evaluate and persist model 1 (PCA followed by a small dense binary classifier).

    Workflow:
      1. Fit a 10-fold ``GridSearchCV`` (scoring ``f1_weighted``) over the pipeline
         imputer -> scaler -> PCA -> Keras classifier.
      2. Predict on ``x`` with the best estimator (in-sample predictions).
      3. Produce out-of-fold predictions with a 3-fold ``cross_val_predict`` on an
         unfitted clone of the best estimator.
      4. Write the artefacts to the current working directory:
         ``model_1_keras``, ``model_1_without_keras.pkl``, ``model_1_best_params.pkl``,
         ``model_1_predictions.pkl``, ``model_1_predictions_cv.pkl``, ``model_1_time.pkl``.

    Parameters
    ----------
    x
        Feature matrix handed to the pipeline.
    y
        Target labels. The network ends in a single sigmoid unit trained with
        ``binary_crossentropy`` - presumably 0/1 labels; confirm against the caller.

    Returns
    -------
    None
        All results are written to disk.
    """
    # perf_counter is monotonic, so the measured duration is not affected by clock adjustments.
    start = time.perf_counter()

    def create_model(input_shape=None):
        """Build the (uncompiled) network for model 1 for the given input shape."""
        # NOTE(review): the two hidden layers specify no activation; with Keras'
        # default that makes them linear - confirm this is intended.
        keras_model = tf.keras.models.Sequential()
        keras_model.add(tf.keras.layers.Dense(units=10, input_shape=input_shape))
        keras_model.add(tf.keras.layers.Dense(units=8))
        keras_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
        return keras_model

    base_model = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),  # missing values are replaced by the column median of the training data
        ("scaler", MinMaxScaler()),  # scale to interval [0, 1]
        ("pca", PCA()),
        # binary_crossentropy is the same loss as used by logistic regression
        ("keras", MyDynamicKerasClassifier(model=create_model, epochs=150, batch_size=32, loss="binary_crossentropy", metrics=["accuracy"])),
    ])

    model_param = [
        {
            "pca__n_components": [0.9],  # [0.85, 0.9, 0.95, 0.99, 0.999]
        }
    ]

    model_gs = GridSearchCV(base_model, model_param, cv=10, n_jobs=-1, scoring="f1_weighted")
    model_gs.fit(x, y)

    model = model_gs.best_estimator_
    model_best_params = model_gs.best_params_

    model_predictions = model.predict(x)

    # A fitted model must not be passed to cross_val_predict because information
    # would leak through it; an unfitted clone is used for cross validation instead.
    model_new = clone(model)
    model_predictions_cv = cross_val_predict(model_new, x, y, cv=3, n_jobs=-1)

    model_time = time.perf_counter() - start

    # A pipeline containing a Keras model cannot be pickled as a whole, so the
    # network and the preprocessing steps are stored separately.
    # NOTE(review): newer Keras releases may require a ".keras"/".h5" suffix in
    # save() - confirm against the installed version.
    model.named_steps["keras"].keras_model.save("model_1_keras")  # save only the Keras model
    # model[:-1] is a new Pipeline without the last (Keras) step; the fitted
    # best estimator itself is left untouched.
    joblib.dump(model[:-1], filename="model_1_without_keras.pkl")  # save pipeline without Keras model

    # store the best params of the grid search
    with open("model_1_best_params.pkl", "wb") as ff:
        pickle.dump(model_best_params, ff)

    joblib.dump(model_predictions, filename="model_1_predictions.pkl")
    joblib.dump(model_predictions_cv, filename="model_1_predictions_cv.pkl")
    joblib.dump(model_time, filename="model_1_time.pkl")

In [None]:

def run_model_2(x, y):
    """
    Grid-search, evaluate and persist model 2 (PCA followed by a small dense multi-class classifier).

    Workflow:
      1. Fit a 10-fold ``GridSearchCV`` (scoring ``f1_weighted``) over the pipeline
         imputer -> scaler -> PCA -> Keras classifier.
      2. Predict on ``x`` with the best estimator (in-sample predictions).
      3. Produce out-of-fold predictions with a 3-fold ``cross_val_predict`` on an
         unfitted clone of the best estimator.
      4. Write the artefacts to the current working directory:
         ``model_2_keras``, ``model_2_without_keras.pkl``, ``model_2_best_params.pkl``,
         ``model_2_predictions.pkl``, ``model_2_predictions_cv.pkl``, ``model_2_time.pkl``.

    Parameters
    ----------
    x
        Feature matrix handed to the pipeline.
    y
        Target labels. The network ends in a 10-unit softmax layer trained with
        ``sparse_categorical_crossentropy`` - presumably integer labels 0..9;
        confirm against the caller.

    Returns
    -------
    None
        All results are written to disk.
    """
    # perf_counter is monotonic, so the measured duration is not affected by clock adjustments.
    start = time.perf_counter()

    def create_model(input_shape=None):
        """Build the (uncompiled) network for model 2 for the given input shape."""
        # NOTE(review): the two hidden layers specify no activation; with Keras'
        # default that makes them linear - confirm this is intended.
        keras_model = tf.keras.models.Sequential()
        keras_model.add(tf.keras.layers.Dense(units=20, input_shape=input_shape))
        keras_model.add(tf.keras.layers.Dense(units=15))
        # softmax output, as in multinomial logistic regression
        keras_model.add(tf.keras.layers.Dense(units=10, activation="softmax"))
        return keras_model

    base_model = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),  # missing values are replaced by the column median of the training data
        ("scaler", MinMaxScaler()),  # scale to interval [0, 1]
        ("pca", PCA()),
        # sparse_categorical_crossentropy is used when labels are integers (e.g. label1=5, label2=3, ...)
        # categorical_crossentropy is used when labels are one-hot-encoded (e.g. label1=[0,0,0,0,0,1,0,0,0,0], label2=[0,0,0,1,0,0,0,0,0,0], ...)
        ("keras", MyDynamicKerasClassifier(model=create_model, epochs=150, batch_size=32, loss="sparse_categorical_crossentropy", metrics=["accuracy"])),
    ])

    model_param = [
        {
            "pca__n_components": [0.9],  # [0.85, 0.9, 0.95, 0.99, 0.999]
        }
    ]

    model_gs = GridSearchCV(base_model, model_param, cv=10, n_jobs=-1, scoring="f1_weighted")
    model_gs.fit(x, y)

    model = model_gs.best_estimator_
    model_best_params = model_gs.best_params_

    model_predictions = model.predict(x)

    # A fitted model must not be passed to cross_val_predict because information
    # would leak through it; an unfitted clone is used for cross validation instead.
    model_new = clone(model)
    model_predictions_cv = cross_val_predict(model_new, x, y, cv=3, n_jobs=-1)

    model_time = time.perf_counter() - start

    # A pipeline containing a Keras model cannot be pickled as a whole, so the
    # network and the preprocessing steps are stored separately.
    # NOTE(review): newer Keras releases may require a ".keras"/".h5" suffix in
    # save() - confirm against the installed version.
    model.named_steps["keras"].keras_model.save("model_2_keras")  # save only the Keras model
    # model[:-1] is a new Pipeline without the last (Keras) step; the fitted
    # best estimator itself is left untouched.
    joblib.dump(model[:-1], filename="model_2_without_keras.pkl")  # save pipeline without Keras model

    # store the best params of the grid search
    with open("model_2_best_params.pkl", "wb") as ff:
        pickle.dump(model_best_params, ff)

    joblib.dump(model_predictions, filename="model_2_predictions.pkl")
    joblib.dump(model_predictions_cv, filename="model_2_predictions_cv.pkl")
    joblib.dump(model_time, filename="model_2_time.pkl")