# Training from Scratch vs Transfer Learning
The objective is to  train a neural network (say model A) on data related to 6 of the classes, and we will train another neural network (say model B) on the remaining 2 classes. Then, we would use the pre-trained weights of model A and tune the last layer so as to classify these 2 classes(this technique is called Transfer Learning), and compare the results of classification obtained using normal training and transfer learning. 

##### About the Dataset
Fashion-MNIST is a dataset of Zalando's article images —consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label.

# Importing the Modules

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import os
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"

tf.random.set_seed(42)
np.random.seed(42)

# Preparing the Dataset

In [2]:
(X_train_full, y_train_full),(X_test, y_test) = keras.datasets.fashion_mnist.load_data()

In [3]:
X_train_full = X_train_full[:30000]
y_train_full = y_train_full[:30000]
X_test = X_test[:5000]
y_test = y_test[:5000]
X_train_full = X_train_full / 255.0
X_test = X_test / 255.0

X_valid, X_train = X_train_full[:5000], X_train_full[5000:]
y_valid, y_train = y_train_full[:5000], y_train_full[5000:]

# Dividing the Datasets

In [4]:
def split_dataset(X, y):
    y_5_or_6 = (y == 5) | (y == 6) # sandals or shirts
    y_A = y[~y_5_or_6]
    y_A[y_A > 6] -= 2 # class indices 7, 8, 9 should be moved to 5, 6, 7
    y_B = (y[y_5_or_6] == 6).astype(np.float32) # binary classification task: is it a shirt (class 6)?
    return ((X[~y_5_or_6], y_A), (X[y_5_or_6], y_B))

In [5]:
(X_train_A, y_train_A), (X_train_B, y_train_B) = split_dataset(X_train, y_train)
(X_valid_A, y_valid_A), (X_valid_B, y_valid_B) = split_dataset(X_valid, y_valid)
(X_test_A, y_test_A), (X_test_B, y_test_B) = split_dataset(X_test, y_test)

# Build and Fit the Model A

In [6]:
model_A1 = keras.models.Sequential()
model_A1.add(keras.layers.Flatten(input_shape=[28, 28]))
for n_hidden in (300, 100, 50, 50, 50):
    model_A1.add(keras.layers.Dense(n_hidden, activation="selu"))
model_A1.add(keras.layers.Dense(8, activation="softmax"))

In [7]:
model_A1.compile(loss= "sparse_categorical_crossentropy",
    optimizer= keras.optimizers.SGD(learning_rate= 0.001),
    metrics=["accuracy"])

In [8]:
history = model_A1.fit(X_train_A, y_train_A, epochs=11,
            validation_data=(X_valid_A, y_valid_A))

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


In [9]:
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"

In [10]:
model_A1.save("my_model_A1.h5")

# Build and Fit the Model B

In [11]:
model_B = keras.models.Sequential()
model_B.add(keras.layers.Flatten(input_shape=[28, 28]))
for n_hidden in (300, 100, 50, 50, 50):
    model_B.add(keras.layers.Dense(n_hidden, activation="selu"))
model_B.add(keras.layers.Dense(1, activation="softmax"))

In [12]:
model_B.compile(loss="binary_crossentropy",
    optimizer= keras.optimizers.SGD(learning_rate= 0.001) ,
    metrics=["accuracy"])

In [13]:
history = model_B.fit(X_train_B, y_train_B, epochs=10,
            validation_data=(X_valid_B, y_valid_B))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Creating new model based on existing model A1

In [14]:
model_B.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 784)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 300)               235500    
_________________________________________________________________
dense_7 (Dense)              (None, 100)               30100     
_________________________________________________________________
dense_8 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_9 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_10 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                

###### Before creating model_B_on_A(a model based on pre-trained layers of model_A), we shall clone the model_A and set its trained weights so that when we train model_B_on_A, it will not affect model_A.

In [15]:
model_A1_clone = keras.models.clone_model(model_A1)
model_A1_clone.set_weights(model_A1.get_weights())

In [16]:
model_B_on_A = keras.models.Sequential(model_A1.layers[:-1])
model_B_on_A.add(keras.layers.Dense(1, activation="sigmoid"))

In [17]:
for layer in model_B_on_A.layers[:-1]:
    layer.trainable = False

In [18]:
model_B_on_A.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 784)               0         
_________________________________________________________________
dense (Dense)                (None, 300)               235500    
_________________________________________________________________
dense_1 (Dense)              (None, 100)               30100     
_________________________________________________________________
dense_2 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_3 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_4 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_12 (Dense)             (None, 1)                

In [19]:
model_B_on_A.compile(loss="binary_crossentropy",
         optimizer=keras.optimizers.SGD(learning_rate = 0.001),
         metrics=["accuracy"])

In [20]:
history = model_B_on_A.fit(X_train_B, y_train_B, epochs=10,
                   validation_data=(X_valid_B, y_valid_B))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Evaluating the Models

In [21]:
model_B.evaluate(X_test_B, y_test_B)



[0.016896238550543785, 0.49844881892204285]

In [22]:
model_B_on_A.evaluate(X_test_B, y_test_B)



[0.053066566586494446, 0.9948293566703796]