## Now, let us load the required data into individual variables

In [1]:
# import numpy and pandas libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy import stats
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

np.random.seed(1)

In [2]:
X_train = pd.read_csv('C:/Users/suman/DspData/data/churn_train_X.csv') 
y_train = pd.read_csv('C:/Users/suman/DspData/data/churn_train_y.csv') 
X_test = pd.read_csv('C:/Users/suman/DspData/data/churn_test_X.csv') 
y_test = pd.read_csv('C:/Users/suman/DspData/data/churn_test_y.csv') 


Let us create a data frame called performance and put all the needed metrics in it!

In [47]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

#  Logistic Regression with Random Search

In [48]:
score_measure = "recall"
kfolds = 5

param_grid = {
    'penalty': ['None','l1','l2','elasticnet'],
    'solver':['saga','liblinear']
}

model = LogisticRegression()
rand_search = RandomizedSearchCV(estimator = model, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")



Fitting 5 folds for each of 8 candidates, totalling 40 fits
The best recall score is 0.5669050216135417
... with parameters: {'solver': 'liblinear', 'penalty': 'l1'}


20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\suman\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\suman\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\suman\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 441, in _check_solver
    raise ValueError(
ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got None.

----

#  Logistic Regression with Grid Search

In [49]:
score_measure = "recall"
kfolds = 5
penalty = rand_search.best_params_['penalty']
solver = rand_search.best_params_['solver']

param_grid = {  
    'penalty': [penalty],
    'solver': [solver]
}

model = LogisticRegression()
grid_search = GridSearchCV(estimator = model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecallLogistic = grid_search.best_estimator_

Fitting 5 folds for each of 1 candidates, totalling 5 fits


  y = column_or_1d(y, warn=True)


The best recall score is 0.5695996444875369
... with parameters: {'penalty': 'l1', 'solver': 'liblinear'}




In [50]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Logistic Regression",
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.903587,0.799458,0.57393,0.668177


In [51]:
print(TP)

295


## SVM (Linear) - Random Search

In [None]:
# Transform the dataset (only training data)
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X_up, y_up = oversample.fit_resample(X_train, y_train)

In [None]:
score_measure = "recall"
kfolds = 2

param_grid = {
     'C':[1,100,1000],
    'gamma':[0,10,100],
'kernel':['linear']
}

model = SVC()
rand_search = RandomizedSearchCV(estimator = model, param_distributions=param_grid, cv=kfolds, n_iter=3,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = rand_search.fit(X_up, y_up)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

## SVM (RBF) - Grid Search

In [None]:
score_measure = "recall"
kfolds = 2

param_grid = {
    'C':[0.001, 0.10, 0.0001,0.00001],   
    'gamma': ['scale','auto'],
    'kernel':['rbf']
}

model = SVC()
rand_search = RandomizedSearchCV(estimator = model, param_distributions=param_grid, cv=kfolds, n_iter=3,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = rand_search.fit(X_up, y_up)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

the best SVM recall I observed : 77%

Summary: I tried to run SVM model for 7 hours and didnt got any results. May be I'm using too many training examples for SVM implementation.Overall, SVMs can be slow to train because of their complexity, particularly when working with large datasets or high-dimensional data. However, the trade-off is that SVMs are often very accurate and effective for a wide range of classification problems.

## Decision Trees - Random Search 

In [52]:
from sklearn.tree import DecisionTreeClassifier
score_measure = "recall"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(2,100),  
    'min_samples_leaf': np.arange(1,75),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 80), 
    'max_depth': np.arange(1,40), 
    'criterion': ['entropy', 'gini'],
}

model = DecisionTreeClassifier()
rand_search = RandomizedSearchCV(estimator = model, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
The best recall score is 0.7861754130812427
... with parameters: {'min_samples_split': 19, 'min_samples_leaf': 15, 'min_impurity_decrease': 0.0001, 'max_leaf_nodes': 66, 'max_depth': 11, 'criterion': 'entropy'}


## DECISION TREE - GRID SEARCH

In [53]:
score_measure = "recall"
kfolds = 5
min_samples_split = rand_search.best_params_['min_samples_split']
min_samples_leaf = rand_search.best_params_['min_samples_leaf']
min_impurity_decrease = rand_search.best_params_['min_impurity_decrease']
max_leaf_nodes = rand_search.best_params_['max_leaf_nodes']
max_depth = rand_search.best_params_['max_depth']
criterion = rand_search.best_params_['criterion']

param_grid = {
    'min_samples_split': np.arange(min_samples_split-2,min_samples_split+2),  
    'min_samples_leaf': np.arange(min_samples_leaf-2,min_samples_leaf+2),
    'min_impurity_decrease': np.arange(min_impurity_decrease-0.0001, min_impurity_decrease+0.0001, 0.00005),
    'max_leaf_nodes': np.arange(max_leaf_nodes-2,max_leaf_nodes+2), 
    'max_depth': np.arange(max_depth-2,max_depth+2), 
    'criterion': [criterion]
}

model = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator = model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecallTree = grid_search.best_estimator_

Fitting 5 folds for each of 1024 candidates, totalling 5120 fits
The best recall score is 0.7969862238920535
... with parameters: {'criterion': 'entropy', 'max_depth': 10, 'max_leaf_nodes': 65, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 13, 'min_samples_split': 17}


In [54]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Decision Tree", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.903587,0.799458,0.57393,0.668177
0,Decision Tree,0.940112,0.81203,0.840467,0.826004


## RandomForestClassifier

Although, for this assignment we have to use Logistic Regression, SVM, and Decision tree. But the dataset is best suited for Random Forest Algorithm (From the dataset website) and  It's important that we don't predict churning as non-churning customers. That's why the model needs to be evaluated on the "Recall"- metric (goal > 77%). And also I tried to Upsample the imbalanced dataset using SMOTE Technique.

In [55]:
# Transform the dataset (only training data)
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X_up, y_up = oversample.fit_resample(X_train, y_train)

In [56]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_up, y_up)

rfpred = rf.predict(X_test)

  rf.fit(X_up, y_up)


In [57]:
c_matrix = (confusion_matrix(y_test, rfpred))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Random Forest Classifier", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.903587,0.799458,0.57393,0.668177
0,Decision Tree,0.940112,0.81203,0.840467,0.826004
0,Random Forest Classifier,0.945048,0.801739,0.896887,0.846648


In [58]:
print(TP)

461


### Neural Net

In [59]:
%%time

ann = MLPClassifier(hidden_layer_sizes=(60,50,40), solver='adam', max_iter=200)
_ = ann.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


Wall time: 1.49 s


In [60]:
%%time
y_pred = ann.predict(X_test)

Wall time: 8.01 ms


In [61]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      1.00      0.91      2525
           1       0.00      0.00      0.00       514

    accuracy                           0.83      3039
   macro avg       0.42      0.50      0.45      3039
weighted avg       0.69      0.83      0.75      3039



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## With RandomizedSearchCV

In [62]:
%%time

score_measure = "accuracy"
kfolds = 5

param_grid = {
    'hidden_layer_sizes': [ (50,), (70,),(50,30), (40,20), (60,40, 20), (70,50,40)],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['adam', 'sgd'],
    'alpha': [0, .2, .5, .7, 1],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.001, 0.01, 0.1, 0.2, 0.5],
    'max_iter': [5000]
}

ann = MLPClassifier()
grid_search = RandomizedSearchCV(estimator = ann, param_distributions=param_grid, cv=kfolds, n_iter=100,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

bestRecallTree = grid_search.best_estimator_

print(grid_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  y = column_or_1d(y, warn=True)


{'solver': 'sgd', 'max_iter': 5000, 'learning_rate_init': 0.001, 'learning_rate': 'constant', 'hidden_layer_sizes': (50, 30), 'alpha': 1, 'activation': 'logistic'}
Wall time: 8min 51s


In [63]:
%%time
y_pred = bestRecallTree.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      1.00      0.91      2525
           1       0.00      0.00      0.00       514

    accuracy                           0.83      3039
   macro avg       0.42      0.50      0.45      3039
weighted avg       0.69      0.83      0.75      3039

Wall time: 41.9 ms


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## With GridSearchCV

In [64]:
%%time

score_measure = "accuracy"
kfolds = 5

param_grid = {
    'hidden_layer_sizes': [ (30,), (50,), (70,), (90,)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam'],
    'alpha': [.5, .7, 1],
    'learning_rate': ['adaptive', 'invscaling'],
    'learning_rate_init': [0.005, 0.01, 0.15],
    'max_iter': [5000]
}

ann = MLPClassifier()
grid_search = GridSearchCV(estimator = ann, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

bestRecallTree = grid_search.best_estimator_

print(grid_search.best_params_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


  y = column_or_1d(y, warn=True)


{'activation': 'tanh', 'alpha': 0.5, 'hidden_layer_sizes': (30,), 'learning_rate': 'adaptive', 'learning_rate_init': 0.005, 'max_iter': 5000, 'solver': 'adam'}
Wall time: 3min 53s


In [65]:
%%time
y_pred = bestRecallTree.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      1.00      0.91      2525
           1       0.00      0.00      0.00       514

    accuracy                           0.83      3039
   macro avg       0.42      0.50      0.45      3039
weighted avg       0.69      0.83      0.75      3039

Wall time: 16.1 ms


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Neural networks, particularly deep neural networks, have shown to be effective in various machine learning tasks, including classification, regression, and pattern recognition. When it comes to recall, which is a measure of a model's ability to correctly identify positive cases out of all actual positive cases, neural networks have certain advantages that can make them perform better than other models.


While comparing all other models and neural networks we got better recall value for neural networks

Overall, while other models like logistic regression or decision trees can also achieve good recall, neural networks have certain advantages that can make them perform better in certain cases. However, it's important to note that the performance of a neural network depends on several factors, including the choice of architecture, training algorithm, and hyperparameters.

Conclusion :The goal of this project is to provide an analysis which shows the difference between a non-churning and churning customer. Using the existing data managed to train a model with upsampled data which reaches a recall score of 89%.This will provide us insight into which customers are eager to churn.

In [3]:
%%time

model1 = MLPClassifier(
    hidden_layer_sizes=(60,50,40), 
    activation = 'relu',
    solver='adam',
    alpha=0.0001, # Strength of the L2 regularization term
    batch_size='auto',
    learning_rate = 'constant',
    learning_rate_init = 0.001,
    max_iter=200,
    tol=0.00001, 
    early_stopping = True,
    n_iter_no_change = 5,
    verbose=True
    
)
_ = model1.fit(X_train, y_train)

# Currently (version 1.2.2), MLPClassifier supports only the Cross-Entropy loss function.
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 9.47003620
Validation score: 0.843441
Iteration 2, loss = 10.10287244
Validation score: 0.843441
Iteration 3, loss = 8.73548114
Validation score: 0.843441
Iteration 4, loss = 9.53782919
Validation score: 0.843441
Iteration 5, loss = 9.52652596
Validation score: 0.843441
Iteration 6, loss = 10.28367246
Validation score: 0.843441
Iteration 7, loss = 8.79197575
Validation score: 0.843441
Validation score did not improve more than tol=0.000010 for 5 consecutive epochs. Stopping.
CPU times: total: 1.81 s
Wall time: 656 ms


In [4]:
model1.loss_curve_

[9.47003620128886,
 10.102872437657616,
 8.735481140413357,
 9.537829193647294,
 9.526525960117464,
 10.283672458567988,
 8.791975746067356]

In [5]:
%%time
y_pred = model1.predict(X_test)

CPU times: total: 62.5 ms
Wall time: 16.3 ms


In [43]:
print(classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.831     1.000     0.908      2525
           1      0.000     0.000     0.000       514

    accuracy                          0.831      3039
   macro avg      0.415     0.500     0.454      3039
weighted avg      0.690     0.831     0.754      3039



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
y_pred = model1.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [8]:
%%time

param_distributions = {
    'hidden_layer_sizes': [ (64,), (128,),(128,64), (64,128), (64,128,196), (196,128,64)],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['adam', 'sgd'],
    'alpha': [0, .0001, .0005, .001, .005],
    'batch_size': [25, 50, 100],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.0005, 0.001, 0.005, 0.01],
    'max_iter': [5000],
    'tol': [0.000005, 0.00001, 0.00005],
    'early_stopping':[True],
    'n_iter_no_change':[5],
}

random_search = RandomizedSearchCV(
    estimator = MLPClassifier(), # a blank slate... RandomizedSearchCV will send parameters.
    param_distributions=param_distributions, 
    cv=3, 
    n_iter=300,
    scoring='accuracy', # note that we could also choose any other scoring metric that is appropriate for a multi-class problem - such as f1_macro, f1_micro, f1_weighted, etc.
    verbose=1, 
    n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
    return_train_score=True
)

_ = random_search.fit(X_train, y_train)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


  y = column_or_1d(y, warn=True)


CPU times: total: 9.16 s
Wall time: 4min 53s


In [9]:
model2 = random_search.best_estimator_

print(random_search.best_params_)

{'tol': 1e-05, 'solver': 'adam', 'n_iter_no_change': 5, 'max_iter': 5000, 'learning_rate_init': 0.005, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (128,), 'early_stopping': True, 'batch_size': 100, 'alpha': 0.0001, 'activation': 'relu'}


In [10]:
%%time
y_pred = model2.predict(X_test)

print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8309    1.0000    0.9076      2525
           1     0.0000    0.0000    0.0000       514

    accuracy                         0.8309      3039
   macro avg     0.4154    0.5000    0.4538      3039
weighted avg     0.6903    0.8309    0.7541      3039

CPU times: total: 46.9 ms
Wall time: 35.9 ms


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Keras

In [11]:
import tensorflow as tf
from tensorflow import keras

# fix random seed for reproducibility
np.random.seed(1)
tf.random.set_seed(1)

In [44]:
%%time

def build_clf(meta, hidden_layer_sizes, dropout):
    n_features_in_ = meta["n_features_in_"]
    n_classes_ = meta["n_classes_"]
    target_encoder_ = meta["target_encoder_"]
    
    model = tf.keras.models.Sequential()
    model.add(keras.layers.Input(shape=n_features_in_)),
    #for hidden_layer_size in hidden_layer_sizes:
    for hidden_layer_size in hidden_layer_sizes:
        model.add(keras.layers.Dense(hidden_layer_size, 
            kernel_initializer= tf.keras.initializers.GlorotUniform(), 
            bias_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.05, seed=None), 
            activation="relu"))
        model.add(keras.layers.Dropout(dropout))
    model.add(tf.keras.layers.Dense(10, activation='softmax'))
    
    #though you could return a compiled model, it's not necessary, and would result in the loss of these
    # parameters in the tune process - as they would be 'hard coded'
    # model.compile(loss = 'sparse_categorical_crossentropy', metrics = ['accuracy']) 

    return model

CPU times: total: 0 ns
Wall time: 0 ns


In [45]:
%%time

# If you don't have the following installed, from command line '!pip install scikeras'
from scikeras.wrappers import KerasClassifier

keras_clf = KerasClassifier(
    model=build_clf,
    hidden_layer_sizes=64,
    dropout=0.5,
    optimizer=keras.optimizers.Adam,
    optimizer__learning_rate=0.0001
)
keras_clf.get_params()

CPU times: total: 0 ns
Wall time: 0 ns


{'model': <function __main__.build_clf(meta, hidden_layer_sizes, dropout)>,
 'build_fn': None,
 'warm_start': False,
 'random_state': None,
 'optimizer': keras.optimizers.optimizer_v2.adam.Adam,
 'loss': None,
 'metrics': None,
 'batch_size': None,
 'validation_batch_size': None,
 'verbose': 1,
 'callbacks': None,
 'validation_split': 0.0,
 'shuffle': True,
 'run_eagerly': False,
 'epochs': 1,
 'hidden_layer_sizes': 64,
 'dropout': 0.5,
 'optimizer__learning_rate': 0.0001,
 'class_weight': None}

In [46]:
%%time

params = {
    
    # the following are model parameters, and therefore must be defined as parameters in the KarasClassifier, and then in the build_clf function
    'model__hidden_layer_sizes': [(70,),(90, ), (100,), (100, 90)], # this will require KarasClassifier and build_clf to have hidden_layer_sizes parameter set
    'model__dropout': [0, 0.1], # this will require KarasClassifier and build_clf to have hidden_layer_sizes parameter set
    
    # the following are 'fit' parameters, the scikeras wrapper provides these parameters. These are passed to the 'model.fit' method for each fit of the model
    'batch_size':[20, 60, 100],
    'epochs':[10],
    'optimizer':['adam','sgd'],
    'loss':['sparse_categorical_crossentropy'],
    
    # this is added to the optimizer 
    'optimizer__learning_rate': [0.0001, 0.001, 0.01]

}
keras_clf.get_params()

CPU times: total: 0 ns
Wall time: 0 ns


{'model': <function __main__.build_clf(meta, hidden_layer_sizes, dropout)>,
 'build_fn': None,
 'warm_start': False,
 'random_state': None,
 'optimizer': keras.optimizers.optimizer_v2.adam.Adam,
 'loss': None,
 'metrics': None,
 'batch_size': None,
 'validation_batch_size': None,
 'verbose': 1,
 'callbacks': None,
 'validation_split': 0.0,
 'shuffle': True,
 'run_eagerly': False,
 'epochs': 1,
 'hidden_layer_sizes': 64,
 'dropout': 0.5,
 'optimizer__learning_rate': 0.0001,
 'class_weight': None}

In [47]:
%%time

from sklearn.model_selection import RandomizedSearchCV
#from tensorflow.keras.callbacks import EarlyStopping

rnd_search_cv = RandomizedSearchCV(
    estimator=keras_clf, 
    param_distributions=params, 
    scoring='accuracy',  # we could use any appropriate sklearn metric here (i.e. accuracy, f1_micro, f1_macro)
    n_iter=50, 
    cv=3)

# In rare cases, you may find your model training results in exceeding python's default recursion limit.
# If needed, you can increase this excersion limit by using the following code.
#import sys
#sys.setrecursionlimit(10000) # note: the default is 3000 (python 3.9)

_ = rnd_search_cv.fit(X_train, y_train,  verbose=1)

# You can create 'call back' functions. These are functions that will be called at the 
# end of each epoch. There are a number of builtin functions created for this purpose, 
# one of which is EarlyStopping -- that, based on the parameters you give, will stop
# the training process. This is useful when the algorithm is not making any significant
# gains through further training. 
#earlystop = EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto')
#callback = [earlystop]
#_ = rnd_search_cv.fit(X_train, y_train, callbacks=callback, verbose=0)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

In [50]:
rnd_search_cv.best_params_

{'optimizer__learning_rate': 0.01,
 'optimizer': 'sgd',
 'model__hidden_layer_sizes': (70,),
 'model__dropout': 0,
 'loss': 'sparse_categorical_crossentropy',
 'epochs': 10,
 'batch_size': 100}

In [48]:
best_model = rnd_search_cv.best_estimator_

In [49]:
from sklearn.metrics import classification_report

print(classification_report(y_test, best_model.predict(X_test), digits=4))

              precision    recall  f1-score   support

           0     0.8309    1.0000    0.9076      2525
           1     0.0000    0.0000    0.0000       514

    accuracy                         0.8309      3039
   macro avg     0.4154    0.5000    0.4538      3039
weighted avg     0.6903    0.8309    0.7541      3039



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
