# Initializers

In [1]:
import tensorflow as tf

# No kernel_initializer is given here, so the layer keeps the default (Glorot).
dense_glorot_initializer = tf.keras.layers.Dense(units=50, activation="relu")

# He initialization, selected by its string name.
dense_he_initializer = tf.keras.layers.Dense(
    units=50,
    activation="relu",
    kernel_initializer="he_normal",
)

# A hand-configured initializer: VarianceScaling lets the scale, the fan mode
# and the sampling distribution be chosen explicitly.
he_avg_init = tf.keras.initializers.VarianceScaling(
    scale=2.0,
    mode="fan_avg",
    distribution="uniform",
)
dense_custom_he_avg = tf.keras.layers.Dense(
    units=50,
    activation="sigmoid",
    kernel_initializer=he_avg_init,
)

2025-10-12 20:16:08.718460: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Better activation functions

In [2]:
# Leaky ReLU
# Keras 3 calls the slope argument `negative_slope`; the former `alpha` keyword
# is deprecated there and only accepted with a warning.
leaky_relu = tf.keras.layers.LeakyReLU(negative_slope=0.2)
dense = tf.keras.layers.Dense(50, activation=leaky_relu, kernel_initializer="he_normal")

# The activation can also be added as a separate layer. In that case the Dense
# layer is created without an activation function and the LeakyReLU layer
# placed right after it applies the activation.
model = tf.keras.models.Sequential([
    # more layers
    tf.keras.layers.Dense(50, kernel_initializer="he_normal"),  # no activation
    tf.keras.layers.LeakyReLU(negative_slope=0.2),  # activation as a separate layer
    # more layers
])



# Batch normalization

In [3]:
# Batch normalization after every hidden layer's activation (and right after
# the flattened input).
# The input shape is declared with an explicit Input object: passing
# `input_shape` to the first layer is discouraged by Keras 3 and emits a
# UserWarning. A leading Input object is not listed in `model.layers`, so
# model.layers[0] is still the Flatten layer and model.layers[1] the first
# BatchNormalization layer.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(28, 28)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(300, activation="relu", kernel_initializer="he_normal"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(100, activation="relu", kernel_initializer="he_normal"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(10, activation="softmax")
])

  super().__init__(**kwargs)
I0000 00:00:1760310973.915409    3565 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3482 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1050, pci bus id: 0000:01:00.0, compute capability: 6.1


In [4]:
# Show the layer-by-layer summary of the batch-normalized model built in the previous cell.
model.summary()

In [5]:
# List (name, trainable) for every variable of the layer at index 1 of the model.
# The recorded output shows gamma and beta as trainable, while moving_mean and
# moving_variance are not trainable.
[(var.name, var.trainable) for var in model.layers[1].variables]

[('gamma', True),
 ('beta', True),
 ('moving_mean', False),
 ('moving_variance', False)]

In [6]:
# Batch normalization before the activation function.
# Each hidden Dense layer is created without an activation and with
# use_bias=False; it is followed by a BatchNormalization layer (which has its
# own offset, beta) and then by the activation as a separate layer.
# The input shape is declared with an explicit Input object because passing
# `input_shape` to the first layer is discouraged by Keras 3 and emits a
# UserWarning.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(28, 28)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(300, kernel_initializer="he_normal", use_bias=False),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation("relu"),
    tf.keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation("relu"),
    tf.keras.layers.Dense(10, activation="softmax"),
])

In [7]:
# Show the layer-by-layer summary of the model that applies batch normalization before the activations.
model.summary()

# Faster optimizers

In [8]:
# Momentum optimization: plain SGD with a momentum term of 0.9.
optimizer = tf.keras.optimizers.SGD(
    learning_rate=0.001,
    momentum=0.9,
)

In [9]:
# Nesterov Accelerated Gradient (NAG): SGD with momentum and nesterov=True.
optimizer = tf.keras.optimizers.SGD(
    learning_rate=0.001,
    momentum=0.9,
    nesterov=True,
)

In [10]:
# RMSProp with a decay rate (rho) of 0.9.
optimizer = tf.keras.optimizers.RMSprop(
    learning_rate=0.001,
    rho=0.9,
)

In [11]:
# Adam with beta_1=0.9 and beta_2=0.999.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.999,
)

# Learning rate scheduling

In [12]:
# Power scheduling: lr(t) = lr0 / (1 + decay_rate * t / decay_steps), where t
# is the optimizer step.
# Keras 3 optimizers no longer support the legacy `decay` argument (it is
# ignored with a warning), so the schedule is passed explicitly instead.
# decay_steps=1 with decay_rate=1e-4 reproduces the former `decay=1e-4` rule.
power_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.01,
    decay_steps=1,
    decay_rate=1e-4,
)
optimizer = tf.keras.optimizers.SGD(learning_rate=power_schedule)



In [13]:
# Exponential scheduling with hard-coded hyperparameters.
def exponential_decay_fn(epoch):
    """Return the learning rate to use at the given epoch.

    The rate starts at 0.01 and is multiplied by 0.1 once for every completed
    block of 20 epochs. Because the exponent uses floor division, the rate is
    constant within each 20-epoch block.

    NOTE(review): a smooth exponential decay would divide with `/` instead of
    `//` -- confirm that the stepwise behaviour is intended.
    """
    initial_lr, epochs_per_drop = 0.01, 20
    completed_drops = epoch // epochs_per_drop
    decay_factor = 0.1 ** completed_drops
    return initial_lr * decay_factor

# Same schedule, with the initial rate and the step count supplied by the caller.
def exponential_decay(lr0, s):
    """Build a schedule function for the given hyperparameters.

    Args:
        lr0: Learning rate returned while fewer than `s` epochs have elapsed.
        s: Number of epochs after which the rate is multiplied by 0.1 again.

    Returns:
        A function mapping an epoch to `lr0 * 0.1 ** (epoch // s)`.
    """
    def scheduled_lr(epoch):
        completed_drops = epoch // s
        return lr0 * (0.1 ** completed_drops)

    return scheduled_lr


In [14]:
# Print the scheduled learning rate for epochs 0 through 40.
for epoch in range(41):
    scheduled = exponential_decay_fn(epoch)
    print("exponential_decay_fn({}) -> {}".format(epoch, scheduled))

exponential_decay_fn(0) -> 0.01
exponential_decay_fn(1) -> 0.01
exponential_decay_fn(2) -> 0.01
exponential_decay_fn(3) -> 0.01
exponential_decay_fn(4) -> 0.01
exponential_decay_fn(5) -> 0.01
exponential_decay_fn(6) -> 0.01
exponential_decay_fn(7) -> 0.01
exponential_decay_fn(8) -> 0.01
exponential_decay_fn(9) -> 0.01
exponential_decay_fn(10) -> 0.01
exponential_decay_fn(11) -> 0.01
exponential_decay_fn(12) -> 0.01
exponential_decay_fn(13) -> 0.01
exponential_decay_fn(14) -> 0.01
exponential_decay_fn(15) -> 0.01
exponential_decay_fn(16) -> 0.01
exponential_decay_fn(17) -> 0.01
exponential_decay_fn(18) -> 0.01
exponential_decay_fn(19) -> 0.01
exponential_decay_fn(20) -> 0.001
exponential_decay_fn(21) -> 0.001
exponential_decay_fn(22) -> 0.001
exponential_decay_fn(23) -> 0.001
exponential_decay_fn(24) -> 0.001
exponential_decay_fn(25) -> 0.001
exponential_decay_fn(26) -> 0.001
exponential_decay_fn(27) -> 0.001
exponential_decay_fn(28) -> 0.001
exponential_decay_fn(29) -> 0.001
exponentia

## The schedule function should be used in a callback

In [15]:
# Wrap the schedule function in a LearningRateScheduler callback.
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(
    schedule=exponential_decay_fn,
)
# The callback is then handed to fit() through its `callbacks` argument, e.g.:
# history = model.fit(X_train, y_train, [...], callbacks=[lr_scheduler])

# Regularizing the model

## Dropout

In [16]:
# Dropout (rate 0.2) applied before every Dense layer.
# The input shape is declared with an explicit Input object because passing
# `input_shape` to the first layer is discouraged by Keras 3 and emits a
# UserWarning.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(28, 28)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(100, activation="relu", kernel_initializer="he_normal"),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(100, activation="relu", kernel_initializer="he_normal"),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(10, activation="softmax"),
])

In [17]:
# Show the layer-by-layer summary of the dropout model built in the previous cell.
model.summary()

## Monte Carlo (MC) Dropout

In [21]:
import numpy as np

# y_probas = np.stack([model(X_test, training=True)
#                      for sample in range(100)])
# y_proba = y_probas.mean(axis=0)

# Exercises

## 8

### a

In [None]:
import tensorflow as tf

# Start the exercise from an empty Sequential model.
model = tf.keras.Sequential()
