# 숙명여자대학교 Deep Learning 2024 수업 Lab 9: Self-Supervised

# Self-supervised Learning with TensorFlow


## Pretext Task - Rotation

### RotNet
Hypothesis: a model can recognize the correct rotation of an object only if it has the “visual common sense” of what the object should look like. RotNet performs self-supervised learning by rotating the entire input image: the model learns to predict which rotation was applied (4-way classification)

### Supervised vs Self-supervised
The accuracy gap between the RotNet-based model and the fully supervised Network-In-Network (NIN) model is very small, only 1.64 percentage points
We do not need data labels to train the RotNet-based model, yet it achieves accuracy similar to that of the model trained with data labels

## Import Library and Load MNIST

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
import matplotlib.pyplot as plt

We will use 10,000 images for the self-supervised pretext task, 1,000 images for the downstream task, and 300 images for testing

In [None]:
# Load MNIST once, then carve out four disjoint subsets:
#   X_train  / Y_train   - training images 0..9,999 (rotated for the pretext task)
#   XX_train / YY_train  - training images 10,000..10,999 (downstream task)
#   X_test   / Y_test    - test images 0..299
#   XX_test  / YY_test   - test images 300..599
(mnist_x, mnist_y), (mnist_x_eval, mnist_y_eval) = keras.datasets.mnist.load_data()

X_train, XX_train = mnist_x[:10000], mnist_x[10000:11000]
Y_train, YY_train = mnist_y[:10000], mnist_y[10000:11000]
X_test, XX_test = mnist_x_eval[:300], mnist_x_eval[300:600]
Y_test, YY_test = mnist_y_eval[:300], mnist_y_eval[300:600]

In [None]:
# Notebook cell output: show the digit labels of the first 10,000 training images.
Y_train

In [None]:
# Report the shape of every subset, in the same order and wording as before.
for tag, array in (
    ('x_train', X_train),
    ('y_train', Y_train),
    ('xx_train', XX_train),
    ('yy_train', YY_train),
    ('x_test', X_test),
    ('y_test', Y_test),
    ('xx_test', XX_test),
    ('yy_test', YY_test),
):
    print(f'shape of {tag}:', array.shape)

## Build RotNet for Pretext Task

### Dataset for Pretext Task (Rotation)
Need to generate rotated images and their labels to train the model for pretext task

[1, 0, 0, 0]: 0 degree rotation

[0, 1, 0, 0]: 90 degree rotation

[0, 0, 1, 0]: 180 degree rotation

[0, 0, 0, 1]: 270 degree rotation

In [None]:
# Build the rotation (pretext) dataset. Every image in X_train contributes four
# consecutive rows of X_rotate -- the image turned by 0, 1, 2 and 3 quarter-turns
# via np.rot90 -- and the matching row of Y_rotate is the one-hot encoding of the
# number of quarter-turns ([1,0,0,0] = 0 deg, ..., [0,0,0,1] = 270 deg).
n_samples = X_train.shape[0]
n_rotations = 4

X_rotate = np.zeros(shape=(n_samples * n_rotations,
                           X_train.shape[1],
                           X_train.shape[2]))
Y_rotate = np.zeros(shape=(n_samples * n_rotations, n_rotations))

for i in range(n_samples):
    img = X_train[i]
    for k in range(n_rotations):
        # Sample i owns rows 4*i .. 4*i+3. Offsets of the form 4*i-4 .. 4*i-1
        # become negative for i == 0 and silently wrap around to the end of the
        # array, so the first image would be stored after the last one.
        row = n_rotations * i + k
        X_rotate[row] = np.rot90(img, k=k)  # k = 0 leaves the image unrotated
        Y_rotate[row, k] = 1.0              # one-hot rotation label

## Plot Dataset for Pretext Task (Rotation)

In [None]:
# Show four consecutive rows of X_rotate (rows 12-15, the four rotations of one
# training image) side by side.
# A single 1x4 grid is created up front; calling plt.subplots() with no grid and
# then plt.subplot(14x) would leave an extra full-figure Axes underneath the four
# panels.
fig, axes = plt.subplots(1, 4, figsize=(10, 10))
for ax, row in zip(axes, range(12, 16)):
    ax.imshow(X_rotate[row], cmap='gray')
    ax.axis('off')

# Append the channel axis expected by the Conv2D input: (N, 28, 28) -> (N, 28, 28, 1).
X_rotate = X_rotate.reshape(-1, 28, 28, 1)


## Build Model for Pretext Task (Rotation)



In [None]:
# CNN for the rotation pretext task: three conv layers (64 -> 32 -> 16 filters)
# with max-pooling after the first two, followed by a 4-way softmax that predicts
# which of the four rotations was applied.
#
# Only the first layer declares `input_shape`: a Sequential model infers the
# input of every later layer from the previous layer's output, so the
# `input_shape` arguments formerly passed to the inner Conv2D layers had no
# effect. The first layer keeps it because the model is built (and its first
# four layers are reused) on the strength of that declaration.
model_pretext = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(filters=64,
                           kernel_size=(3, 3),
                           strides=(2, 2),
                           activation='relu',
                           padding='SAME',
                           input_shape=(28, 28, 1)),

    tf.keras.layers.MaxPool2D(pool_size=(2, 2),
                              strides=(2, 2)),

    tf.keras.layers.Conv2D(filters=32,
                           kernel_size=(3, 3),
                           strides=(1, 1),
                           activation='relu',
                           padding='SAME'),

    tf.keras.layers.MaxPool2D(pool_size=(2, 2),
                              strides=(2, 2)),

    tf.keras.layers.Conv2D(filters=16,
                           kernel_size=(3, 3),
                           strides=(2, 2),
                           activation='relu',
                           padding='SAME'),

    tf.keras.layers.Flatten(),

    # One output unit per rotation class (0, 90, 180, 270 degrees).
    tf.keras.layers.Dense(units=4, activation='softmax'),
])
model_pretext.summary()

## Training the model for the pretext task


In [None]:
# Train the pretext model to classify the applied rotation.
# `metrics` is passed as a list: Keras documents it as a list of metrics, and
# newer Keras releases reject a bare string.
model_pretext.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])

# shuffle=False: batches are drawn in the stored order of X_rotate / Y_rotate.
model_pretext.fit(X_rotate,
                  Y_rotate,
                  batch_size=192,
                  epochs=50,
                  verbose=2,
                  shuffle=False)

#  Build Downstream Task (MNIST Image Classification)
Freezing trained parameters to transfer them for the downstream task


In [None]:
# Freeze the pretext model so that layers taken from it keep their trained
# weights when they are reused in another model.
model_pretext.trainable = False


## Reshape Dataset



In [None]:
# Downstream data preparation: append a channel axis to the images and turn the
# integer digit labels into 10-way one-hot vectors.
image_shape = (-1, 28, 28, 1)
XX_train, XX_test = XX_train.reshape(image_shape), XX_test.reshape(image_shape)
YY_train, YY_test = (
    tf.one_hot(labels, 10, on_value=1.0, off_value=0.0)
    for labels in (YY_train, YY_test)
)

## Build Model

### Model: two convolution layers and one fully connected layer

Two convolution layers are transferred from the model for the pretext task

Only the single fully connected layer is trained

In [None]:
# Notebook cell output: the first four layers of the pretext model, as a tuple.
tuple(model_pretext.get_layer(index=position) for position in range(4))

In [None]:
# Downstream classifier: the first four layers of the pretext model are reused
# as-is (same layer objects), followed by a new Flatten + 10-way softmax head.
transferred_layers = [model_pretext.get_layer(index=position) for position in range(4)]
classifier_head = [
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=10, activation='softmax'),
]
model_downstream = tf.keras.models.Sequential(transferred_layers + classifier_head)

model_downstream.summary()

Only use (1, 5, 6) digits to visualize latent space in 2-D

In [None]:
# Train the downstream classifier on the labelled downstream subset.
# `metrics` and `callbacks` are passed as lists, as the Keras API documents them;
# newer Keras releases reject a bare string for `metrics`.
model_downstream.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9),
                         loss='categorical_crossentropy',
                         metrics=['accuracy'])

# NOTE(review): early stopping monitors the *training* accuracy even though 20%
# of the data is held out for validation -- confirm whether 'val_accuracy' was
# intended (the same setting is used for the comparison models).
model_downstream.fit(XX_train,
                     YY_train,
                     batch_size=64,
                     validation_split=0.2,
                     epochs=50,
                     verbose=2,
                     callbacks=[tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=7)])

## Downstream Task Trained Result (Image Classification Result)




In [None]:
# Notebook cell output: show the latest prediction vector.
# NOTE(review): within this notebook `predict` is first assigned in the cell that
# follows, so this cell raises NameError when cells are run top to bottom.
predict

In [None]:
# Classify one sample of the downstream training subset and show the image next
# to a stem plot of the ten predicted class probabilities.
name = [str(digit) for digit in range(10)]   # class index -> printable digit
idx = 9
img = XX_train[idx].reshape(-1, 28, 28, 1)   # batch containing the single sample
label = YY_train[idx]                        # one-hot label (not used below)
predict = model_downstream.predict(img)
mypred = np.argmax(predict, axis=1)

plt.figure(figsize=(12, 5))

# Left panel: the input image.
plt.subplot(1, 2, 1)
plt.imshow(img.reshape(28, 28), 'gray')
plt.axis('off')

# Right panel: softmax output for the sample.
plt.subplot(1, 2, 2)
plt.stem(predict[0])

plt.show()

print(f'Prediction : {name[mypred[0]]}')

# Supervised Model for Comparison
Convolution Neural Networks for MNIST image classification
Model: Same model architecture with the model for the downstream task
The total number of parameters is the same as in the model for the downstream task, but it has zero non-trainable parameters

In [None]:
# Fully supervised baseline: two conv + max-pool stages followed by a 10-way
# softmax classifier, all layers created fresh here.
#
# Only the first layer declares `input_shape`; a Sequential model infers the
# input of later layers from the previous layer's output, so the `input_shape`
# formerly passed to the second Conv2D had no effect.
model_sup = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(filters=64,
                           kernel_size=(3, 3),
                           strides=(2, 2),
                           activation='relu',
                           padding='SAME',
                           input_shape=(28, 28, 1)),

    tf.keras.layers.MaxPool2D(pool_size=(2, 2),
                              strides=(2, 2)),

    tf.keras.layers.Conv2D(filters=32,
                           kernel_size=(3, 3),
                           strides=(1, 1),
                           activation='relu',
                           padding='SAME'),

    tf.keras.layers.MaxPool2D(pool_size=(2, 2),
                              strides=(2, 2)),

    tf.keras.layers.Flatten(),

    # One output unit per digit class.
    tf.keras.layers.Dense(units=10, activation='softmax'),
])
model_sup.summary()

In [None]:
# Baseline with frozen, randomly initialised feature layers and a trainable
# classifier head (same architecture as model_sup: conv, pool, conv, pool,
# Flatten, Dense(10, softmax)).
# NOTE(review): presumably intended as a "random features" reference point for
# the transferred features of the downstream model -- confirm.
#
# The baseline is built from a clone of model_sup rather than from model_sup's
# own layer objects. Setting `model_sup.trainable = False` and sharing its layers
# froze the supervised model itself before it was trained, leaving it with no
# trainable weights; clone_model creates new layers with new weights, so
# model_sup is left untouched and fully trainable.
model_rand = tf.keras.models.clone_model(model_sup)
for position, layer in enumerate(model_rand.layers):
    # Freeze the four conv / pooling layers; train only Flatten + Dense.
    layer.trainable = position >= 4

model_rand.summary()

In [None]:
# Train the supervised comparison model on the labelled downstream subset.
# `metrics` and `callbacks` are passed as lists, as the Keras API documents them;
# newer Keras releases reject a bare string for `metrics`.
model_sup.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

# NOTE(review): early stopping monitors the *training* accuracy although a
# validation split is configured -- confirm whether 'val_accuracy' was intended.
model_sup.fit(XX_train,
              YY_train,
              batch_size=32,
              validation_split=0.2,
              epochs=50,
              verbose=2,
              callbacks=[tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=7)])

In [None]:
# Train model_rand on the labelled downstream subset with the same optimizer and
# schedule as the supervised comparison model.
# `metrics` and `callbacks` are passed as lists, as the Keras API documents them;
# newer Keras releases reject a bare string for `metrics`.
model_rand.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9),
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])

# NOTE(review): early stopping monitors the *training* accuracy although a
# validation split is configured -- confirm whether 'val_accuracy' was intended.
model_rand.fit(XX_train,
               YY_train,
               batch_size=32,
               validation_split=0.2,
               epochs=50,
               verbose=2,
               callbacks=[tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=7)])

## Compare Self-supervised Learning and Supervised Learning

### Pretext Task
Input data: 10,000 MNIST images without labels

### Downstream Task and Supervised Learning (for performance comparison)

Training data: 1,000 MNIST images with labels
Test data: 300 MNIST images with labels

### Key concepts
For transfer learning, we used to train networks like VGG 16 with large image dataset with labels such as ImageNet

With self-supervised learning, we instead pretrain such networks on unlabeled image datasets, which can be much larger than labeled image datasets, and then perform transfer learning

Comparing downstream-task performance with supervised-learning performance therefore amounts to comparing (self-supervised) transfer learning against supervised learning

In [None]:
# Evaluate model_rand on the held-out test subset.
# The result is stored and printed under its own name: it was previously
# reported with the "Self-supervised Learning" label, which belongs to
# model_downstream.
test_rand = model_rand.evaluate(XX_test, YY_test, batch_size=64, verbose=2)

print("")
# evaluate() returns [loss, accuracy]; index 1 is the accuracy.
print('Random-Feature Baseline Accuracy on Test Data:  {:.2f}%'.format(test_rand[1]*100))

In [None]:
# Evaluate the downstream (self-supervised transfer) model on the test subset.
test_self = model_downstream.evaluate(
    XX_test,
    YY_test,
    batch_size=64,
    verbose=2,
)

print()
# Index 1 of the evaluate() result is the accuracy metric.
print(f'Self-supervised Learning Accuracy on Test Data:  {test_self[1] * 100:.2f}%')

In [None]:
# Evaluate the supervised comparison model on the same test subset.
test_sup = model_sup.evaluate(
    XX_test,
    YY_test,
    batch_size=64,
    verbose=2,
)

print()
# Index 1 of the evaluate() result is the accuracy metric.
print(f'Supervised Learning Accuracy on Test Data:  {test_sup[1] * 100:.2f}%')