# Introduction
<hr style="border:2px solid black"> </hr>


**What?** Reduce overfitting with dropout regularisation



In [None]:
"""
Some tips:
    [1] Generally use a small dropout value of 20%-50% of neurons with 20% providing a good starting point. 
    A probability too low has minimal effect and a value too high results in under-learning by the network.

    [2] Use a larger network. You are likely to get better performance when dropout is used on a larger network,
    giving the model more of an opportunity to learn independent representations.

    [3] Use dropout on input (visible) as well as hidden layers. Application of dropout at each layer of the 
    network has shown good results.

    [4] Use a large learning rate with decay and a large momentum. Increase your learning rate by a factor of
    10 to 100 and use a high momentum value of 0.9 or 0.99.

    [5] Constrain the size of network weights. A large learning rate can result in very large network weights. 
    Imposing a constraint on the size of network weights such as max-norm regularization with a size of 4 or 5 
    has been shown to improve results.
"""

In [None]:
# Import python modules
import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm
from keras.optimizers import SGD
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Getting rid of the warning messages
import warnings
warnings.filterwarnings("ignore")

### Model without dropout

In [None]:
# Seed NumPy's global RNG so repeated runs start from the same state.
seed = 7
numpy.random.seed(seed)

# Read the sonar CSV (it has no header row) and work on the raw array.
dataframe = pandas.read_csv("../DATASETS/sonar.csv", header=None)
dataset = dataframe.values

# The first 60 columns are the inputs; column 60 is the output.
X = dataset[:, :60].astype(float)
Y = dataset[:, 60]

# Fit the encoder and convert the class values to integers in one step.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

# baseline
def create_baseline():
    """Build and compile the baseline network, which uses no dropout.

    Layers: 60 inputs -> Dense(60, relu) -> Dense(30, relu) -> Dense(1, sigmoid).
    The model is compiled with binary cross-entropy loss and reports accuracy.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(60, input_dim=60, kernel_initializer="normal", activation="relu"))
    model.add(Dense(30, kernel_initializer="normal", activation="relu"))
    model.add(Dense(1, kernel_initializer="normal", activation="sigmoid"))
    # Hand the configured optimizer object to compile(). The string "sgd"
    # would make Keras build a default SGD and ignore the settings below.
    sgd = SGD(lr=0.01, momentum=0.8, decay=0.0, nesterov=False)
    model.compile(loss="binary_crossentropy", optimizer=sgd, metrics=["accuracy"])
    return model

# Re-seed so the evaluation starts from a known RNG state.
numpy.random.seed(seed)

# Standardize the inputs, then fit the Keras baseline, as one pipeline.
estimators = [
    ("standardize", StandardScaler()),
    ("mlp", KerasClassifier(build_fn=create_baseline, epochs=300, batch_size=16, verbose=0)),
]
pipeline = Pipeline(estimators)

# Score the pipeline with shuffled, stratified 10-fold cross-validation.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)

# Report mean and standard deviation of the fold scores as percentages.
mean_pct = results.mean() * 100
std_pct = results.std() * 100
print("Baseline: %.2f%% (%.2f%%)" % (mean_pct, std_pct))

### Using Dropout on the Visible Layer

In [None]:
"""
Dropout can be applied to input neurons called the visible layer. In the example below we add a new Dropout 
layer between the input (or visible layer) and the first hidden layer. The dropout rate is set to 20%, meaning 
one in five inputs will be randomly excluded from each update cycle. Additionally, as recommended in the original
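paper on dropout, a constraint is imposed on the weights of each hidden layer, ensuring that the maximum norm of
the weights does not exceed a value of 3.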
"""

In [None]:
# Seed NumPy's global RNG so repeated runs start from the same state.
seed = 7
numpy.random.seed(seed)

# Read the sonar CSV (it has no header row) and work on the raw array.
dataframe = pandas.read_csv("../DATASETS/sonar.csv", header=None)
dataset = dataframe.values

# The first 60 columns are the inputs; column 60 is the output.
X = dataset[:, :60].astype(float)
Y = dataset[:, 60]

# Fit the encoder and convert the class values to integers in one step.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

# dropout in the input layer with weight constraint
def create_model():
    """Build and compile the network with dropout on the visible (input) layer.

    A 20% Dropout layer sits in front of the first hidden layer. Both hidden
    layers carry a max-norm weight constraint of 3.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dropout(0.2, input_shape=(60,)))
    # ``kernel_constraint`` is the Keras 2 keyword, matching the
    # ``kernel_initializer`` used on the same layers.
    model.add(Dense(60, kernel_initializer="normal", activation="relu", kernel_constraint=maxnorm(3)))
    model.add(Dense(30, kernel_initializer="normal", activation="relu", kernel_constraint=maxnorm(3)))
    model.add(Dense(1, kernel_initializer="normal", activation="sigmoid"))
    # Hand the configured optimizer object to compile(). The string "sgd"
    # would make Keras build a default SGD and ignore the settings below.
    sgd = SGD(lr=0.1, momentum=0.9, decay=0.0, nesterov=False)
    model.compile(loss="binary_crossentropy", optimizer=sgd, metrics=["accuracy"])
    return model

numpy.random.seed(seed)
estimators = []
estimators.append(("standardize", StandardScaler()))
# Evaluate the dropout model defined above, not the baseline builder.
estimators.append(("mlp", KerasClassifier(build_fn=create_model, epochs=300, batch_size=16, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Visible: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
"""
We got an improvement in performance!
"""

### Using Dropout on Hidden Layers

In [None]:
"""
Dropout can be applied to hidden neurons in the body of your network model. In the example below dropout is 
applied between the two hidden layers and between the last hidden layer and the output layer. Again a dropout
rate of 20% is used as is a weight constraint on those layers.
"""

In [None]:
# Seed NumPy's global RNG so repeated runs start from the same state.
seed = 7
numpy.random.seed(seed)

# Read the sonar CSV (it has no header row) and work on the raw array.
dataframe = pandas.read_csv("../DATASETS/sonar.csv", header=None)
dataset = dataframe.values

# The first 60 columns are the inputs; column 60 is the output.
X = dataset[:, :60].astype(float)
Y = dataset[:, 60]

# Fit the encoder and convert the class values to integers in one step.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

# dropout in the hidden layers with weight constraint
def create_model():
    """Build and compile the network with dropout on the hidden layers.

    A 20% Dropout layer follows each of the two hidden layers. Both hidden
    layers carry a max-norm weight constraint of 3.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    # Activations must be string names; bare ``relu`` / ``sigmoid`` are
    # undefined identifiers here and raise NameError.
    # ``kernel_constraint`` is the Keras 2 keyword, matching the
    # ``kernel_initializer`` used on the same layers.
    model.add(Dense(60, input_dim=60, kernel_initializer="normal", activation="relu", kernel_constraint=maxnorm(3)))
    model.add(Dropout(0.2))
    model.add(Dense(30, kernel_initializer="normal", activation="relu", kernel_constraint=maxnorm(3)))
    model.add(Dropout(0.2))
    model.add(Dense(1, kernel_initializer="normal", activation="sigmoid"))
    # Hand the configured optimizer object to compile(). The string "sgd"
    # would make Keras build a default SGD and ignore the settings below.
    sgd = SGD(lr=0.1, momentum=0.9, decay=0.0, nesterov=False)
    model.compile(loss="binary_crossentropy", optimizer=sgd, metrics=["accuracy"])
    return model

numpy.random.seed(seed)
estimators = []
estimators.append(("standardize", StandardScaler()))
# Evaluate the dropout model defined above, not the baseline builder.
estimators.append(("mlp", KerasClassifier(build_fn=create_model, epochs=300, batch_size=16, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Hidden: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
"""
We can see that, for this problem and the chosen network configuration, using dropout in the hidden layers
did not lift performance. In fact, performance was worse than the baseline. It is possible that additional training
epochs are required or that further tuning is required to the learning rate.
"""

# References
<hr style="border:2px solid black"> </hr>


- https://machinelearningmastery.com/how-to-reduce-overfitting-with-dropout-regularization-in-keras/
- https://machinelearningmastery.com/dropout-regularization-deep-learning-models-keras/
    
