In [45]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from PIL import Image
import time

tf.random.set_seed(11)

%matplotlib inline

## Quantized Distiller

In [63]:
class Quantized_Distiller(tf.keras.Model):
  """Distillation trainer that also tracks a bucket-quantized view of the student.

  The student is trained on a weighted sum of its own loss against the labels
  and a distillation loss against the temperature-softened teacher outputs.
  After every optimizer step the student weights are vectorized, min-max
  scaled bucket by bucket and uniformly quantized. The result is stored in
  `quantized_w` (values normalized to [0, 1]) together with the per-bucket
  ranges `q_alpha` and offsets `q_beta`; `invert_scale_function_b` maps
  normalized vectors back to the weight domain.

  NOTE(review): the forward pass in `train_step` uses the full-precision
  student weights. The quantized weights are only tracked, they are never
  fed back into the network -- confirm this is the intended training scheme.
  """

  def __init__(self, teacher, student, nb_bits = 8):
    """Store the two models and the quantization bit-width.

    Args:
      teacher: model called with `training=False` to produce the soft targets.
      student: model being trained; its weights are expected to alternate
        kernel, bias, kernel, bias, ... (see `matrix2vect`).
      nb_bits: the quantizer uses `2 ** nb_bits` as its scaling factor.
    """
    super(Quantized_Distiller, self).__init__()

    # Distiller attributes
    self.teacher = teacher
    self.student = student
    self.nb_bits = nb_bits

    # Snapshot of the student weights and number of (kernel, bias) pairs
    self.original_w = self.student.get_weights()
    self.nb_layers = len(self.original_w) // 2

    # Filled by `matrix2vect`; read back by `vect2matrix`
    self.layer_shapes = []
    self.quantized_w = []

  def matrix2vect(self, w):
    """Flatten every (kernel, bias) pair of `w` into one column vector.

    The bias is appended to the kernel as an extra last row, which requires a
    2-D kernel whose second dimension matches the bias length. The shape of
    each stacked matrix is recorded in `self.layer_shapes` so that
    `vect2matrix` can undo the operation.

    Args:
      w: sequence [kernel_0, bias_0, kernel_1, bias_1, ...].

    Returns:
      A list with one tensor of shape (n, 1) per (kernel, bias) pair.
    """
    self.layer_shapes = []
    vect = []

    # Derive the number of pairs from `w` instead of assuming four layers.
    for i in range(len(w) // 2):
      # Stack kernel and bias into a single matrix
      stacked = tf.concat([w[2*i], tf.reshape(w[2*i+1], (1, -1))], axis=0)

      # Remember the matrix shape for the inverse operation
      self.layer_shapes.append(stacked.shape)

      # Vectorize the kernel-bias matrix
      vect.append(tf.reshape(stacked, (-1, 1)))

    return vect

  def vect2matrix(self, v):
    """Inverse of `matrix2vect`: rebuild [kernel, bias, ...] from vectors.

    Args:
      v: list of tensors, one per layer, in the order produced by
        `matrix2vect` (which must have been called first so that
        `self.layer_shapes` is populated).

    Returns:
      A flat list [kernel_0, bias_0, kernel_1, bias_1, ...].
    """
    w = []
    for layer_vect, shape in zip(v, self.layer_shapes):
      mat = tf.reshape(layer_vect, shape)
      w.append(mat[0:-1])  # every row but the last: the kernel
      w.append(mat[-1])    # last row: the bias
    return w

  def scale_function(self, tab, bucket_size):
    """Min-max scale every layer vector to [0, 1], bucket by bucket.

    Each vector is cut into consecutive buckets of `bucket_size` elements (the
    last bucket is shorter when the length is not a multiple). Within a bucket
    the values are mapped to `(x - min) / (max - min)`.

    Args:
      tab: list of tensors, one per layer; each is flattened to 1-D, so both
        (n, 1) column vectors and (n,) vectors are accepted.
      bucket_size: number of elements per bucket.

    Returns:
      A tuple `(Vecteur, A, B)` where `Vecteur[l]` is the scaled 1-D tensor of
      layer `l`, and `A[l]` / `B[l]` are lists holding, for every bucket, the
      range `max - min` and the offset `min` as scalar tensors.

    Raises:
      ValueError: if `bucket_size` exceeds the length of a layer vector.
    """
    Vecteur = []
    A = []
    B = []

    for layer_vect in tab:
      # Work on the full 1-D vector. The previous `tab[:][0]` picked the
      # first row (a single element) instead of the whole column.
      flat = tf.reshape(layer_vect, [-1])
      length = flat.shape[0]

      if bucket_size > length:
        raise ValueError(f'Bucket_size ({bucket_size}) must be smaller than or equal to the vector length ({length})')

      alpha = []
      beta = []
      scaled_buckets = []

      # Slicing clamps at the end of the vector, so the trailing (possibly
      # shorter) bucket needs no special case.
      for start in range(0, length, bucket_size):
        bucket = flat[start:start + bucket_size]
        b = tf.math.reduce_min(bucket)
        a = tf.math.reduce_max(bucket) - b
        alpha.append(a)
        beta.append(b)

        # A constant bucket (e.g. a zero-initialised bias row) has a == 0;
        # divide_no_nan maps it to 0 instead of producing NaN.
        scaled_buckets.append(tf.math.divide_no_nan(bucket - b, a))

      A.append(alpha)
      B.append(beta)
      Vecteur.append(tf.concat(scaled_buckets, 0))

    return Vecteur, A, B

  def uniform_quantification(self, Vect):
    """Snap values onto the uniform grid {0, 1/s, 2/s, ...} with s = 2**nb_bits.

    For every value `v` the result is `(floor(v * s) + eps) / s`, where `eps`
    is 1 when the fractional part of `v * s` is strictly greater than 0.5 and
    0 otherwise.

    Args:
      Vect: list of tensors (expected to be scaled to [0, 1]).

    Returns:
      A list of tensors with the same shapes as the inputs.
    """
    s = tf.constant(2.0 ** self.nb_bits, dtype=tf.float32)
    v_q = []

    for v in Vect:
      v_s = tf.math.multiply(v, s)
      lower = tf.math.floor(v_s)

      # Round up when the remainder is above one half
      eps = tf.cast(tf.greater(v_s - lower, 0.5), v_s.dtype)

      # Floor the *scaled* value, then go back to [0, 1]. Flooring after the
      # division would collapse every value to floor(v).
      v_q.append(tf.math.divide(lower + eps, s))

    return v_q

  def invert_scale_function_b(self, v, alpha, beta, bucket_size):
    """Undo `scale_function`: map normalized vectors back to the weight range.

    Every bucket is transformed as `alpha * x + beta` using that bucket's
    range and offset.

    Args:
      v: list of 1-D vectors, one per layer.
      alpha: `alpha[l][i]` is the range of bucket `i` of layer `l`.
      beta: `beta[l][i]` is the offset of bucket `i` of layer `l`.
      bucket_size: bucket length that was used for scaling.

    Returns:
      A list of NumPy arrays (float64), one per layer. Results are written
      into NumPy buffers, so the inputs must hold concrete values.

    Raises:
      ValueError: if `bucket_size` exceeds the length of a layer vector.
    """
    unscalled_v = []

    for layer, layer_vect in enumerate(v):
      length = len(layer_vect)
      if bucket_size > length:
        raise ValueError(f'Bucket_size ({bucket_size}) must be smaller than or equal to the vector length ({length})')

      Q = np.zeros(length)

      for i in range(len(alpha[layer])):
        start = i * bucket_size
        # The last bucket may be shorter; include its final element.
        stop = min(start + bucket_size, length)
        Q[start:stop] = alpha[layer][i] * layer_vect[start:stop] + beta[layer][i]

      unscalled_v.append(Q)

    # Return only after every layer has been processed.
    return unscalled_v

  # Model compilation
  def compile( self, optimizer, metrics, distillation_loss_fn, student_loss_fn, alpha = 0.1, temperature= 20, bucket_size = 32):
    """Configure the distiller.

    Args:
      optimizer: optimizer applied to the student's trainable variables.
      metrics: metrics updated with the labels and the student outputs.
      distillation_loss_fn: loss between softened teacher and student outputs.
      student_loss_fn: loss between the labels and the student outputs.
      alpha: weight of the student loss; `1 - alpha` weights the distillation loss.
      temperature: divisor applied to both outputs before the softmax.
      bucket_size: bucket length used when scaling the weights for quantization.
    """
    super(Quantized_Distiller,self).compile(optimizer = optimizer, metrics= metrics )

    # Losses
    self.distillation_loss_fn = distillation_loss_fn
    self.student_loss_fn = student_loss_fn

    # Hyperparameters
    self.temperature = temperature
    self.alpha = alpha
    self.bucket_size = bucket_size

  def _refresh_quantized_weights(self):
    """Recompute `quantized_w`, `q_alpha` and `q_beta` from the current student weights."""
    self.original_w = self.student.weights

    # Vectorize the weights
    v = self.matrix2vect(self.original_w)

    # Per-bucket scaling to [0, 1]
    scalled_v, self.q_alpha, self.q_beta = self.scale_function(v, self.bucket_size)

    # Uniform quantization
    v_q = self.uniform_quantification(scalled_v)

    # Back to per-layer [kernel, bias, ...] layout
    self.quantized_w = self.vect2matrix(v_q)

  # Training step
  def train_step(self, data):
    """Run one distillation step and refresh the quantized weights.

    Args:
      data: an `(x, y)` pair.

    Returns:
      A dict with the compiled metrics plus "loss", "student_loss" and
      "Dist_loss".
    """
    # Unpack data
    x, y = data

    # Forward pass of teacher
    teacher_predictions = self.teacher(x, training=False)

    with tf.GradientTape() as tape:
      # Student forward pass (full-precision weights, see class NOTE)
      student_predictions = self.student(x, training= True)

      # Compute losses. The student loss receives softmax probabilities here.
      student_loss = self.student_loss_fn(y, tf.nn.softmax(student_predictions))
      distillation_loss = self.distillation_loss_fn(
          tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
          tf.nn.softmax(student_predictions / self.temperature, axis=1),
        )

      loss = self.alpha * student_loss + (1- self.alpha)* distillation_loss

    # Compute gradients
    trainable_vars = self.student.trainable_variables
    gradients = tape.gradient(loss, trainable_vars)

    # Update weights
    self.optimizer.apply_gradients(zip(gradients, trainable_vars))

    # Update the metrics configured in `compile()`.
    self.compiled_metrics.update_state(y, student_predictions)

    # Quantize once, after the update. The former pre-update quantization
    # inside the tape was overwritten here and never used.
    self._refresh_quantized_weights()

    # Return a dict of performance
    results = {m.name: m.result() for m in self.metrics}
    results.update(
        {"loss": loss, "student_loss": student_loss,"Dist_loss": distillation_loss }
    )
    return results

  # Test step
  def test_step(self, data):
    """Evaluate the student on one batch.

    Args:
      data: an `(x, y)` pair.

    Returns:
      A dict with the compiled metrics plus "student_loss".
    """
    # Unpack the data
    x, y = data

    # Compute predictions
    y_prediction = self.student(x, training=False)

    # Calculate the loss
    # NOTE(review): `train_step` passes softmax(student outputs) to
    # `student_loss_fn` while this passes the raw outputs -- confirm which
    # one matches the configured loss and align the two.
    student_loss = self.student_loss_fn(y, y_prediction)

    # Update the metrics.
    self.compiled_metrics.update_state(y, y_prediction)

    # Return a dict of performance
    results = {m.name: m.result() for m in self.metrics}
    results.update({"student_loss": student_loss})
    return results

## Plotting history function

In [47]:
def plot_hist(los1,los2, accur1, accur2):
  """Plot accuracy (left) and loss (right) curves of two training runs.

  Args:
    los1: per-epoch loss of the run labelled "KD".
    los2: per-epoch loss of the run labelled "Scratch".
    accur1: per-epoch accuracy of the run labelled "KD".
    accur2: per-epoch accuracy of the run labelled "Scratch".
  """
  # (subplot position, KD curve, scratch curve, quantity shown on the y axis)
  panels = (
      (121, accur1, accur2, 'Accuracy'),
      (122, los1, los2, 'Loss'),
  )

  plt.figure(figsize= (20,7))
  for position, kd_curve, scratch_curve, quantity in panels:
    plt.subplot(position)
    plt.plot(kd_curve, label=f'KD {quantity}')
    plt.plot(scratch_curve, label=f'Scratch {quantity}')
    plt.xlabel('Epochs')
    plt.ylabel(quantity)
    plt.grid()
    plt.legend()
  plt.show()


## Loading and processing Data

In [48]:
print("================ Data Loading ================")
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

# Normalize data: cast the images to float32 and divide by 255.
x_train, x_test = (images.astype('float32') / 255 for images in (x_train, x_test))

# Data shapes, one split per line, followed by an empty line.
print(
    *(
        f"{split_name} shape: {split_array.shape}"
        for split_name, split_array in (
            ("x_train", x_train),
            ("x_test", x_test),
            ("y_train", y_train),
            ("y_test", y_test),
        )
    ),
    sep="\n",
)
print("")

x_train shape: (50000, 32, 32, 3)
x_test shape: (10000, 32, 32, 3)
y_train shape: (50000, 1)
y_test shape: (10000, 1)



## Resnet


In [49]:
def resnet_layer(inputs,
                 num_filters=16,
                 kernel_size=3,
                 strides=1,
                 activation='relu',
                 batch_normalization=True,
                 conv_first=True):
    """Build one Conv2D / BatchNormalization / Activation stack.

    # Arguments
        inputs (tensor): output of the input layer or of the previous layer
        num_filters (int): number of Conv2D filters
        kernel_size (int): side of the square Conv2D kernel
        strides (int): stride of the Conv2D, same in both dimensions
        activation (string): name of the activation, or None to skip it
        batch_normalization (bool): whether a BatchNormalization is inserted
        conv_first (bool): True for conv-bn-activation,
            False for bn-activation-conv

    # Returns
        x (tensor): tensor to feed to the next layer
    """
    # The convolution is instantiated first in both orderings.
    conv = layers.Conv2D(num_filters,
                         kernel_size=kernel_size,
                         strides=strides,
                         padding='same',
                         kernel_initializer='he_normal',
                         kernel_regularizer=tf.keras.regularizers.l2(1e-4))

    def _normalize_and_activate(tensor):
        # Optional batch normalization followed by the optional activation.
        if batch_normalization:
            tensor = layers.BatchNormalization()(tensor)
        if activation is not None:
            tensor = layers.Activation(activation)(tensor)
        return tensor

    if conv_first:
        return _normalize_and_activate(conv(inputs))
    return conv(_normalize_and_activate(inputs))

In [50]:
def resnet_v1(input_shape, depth, num_classes=10):
    """ResNet Version 1 model builder [a]

    The network is made of three stages of residual blocks; each block is a
    stack of 2 x (3 x 3) Conv2D-BN-ReLU whose last ReLU comes after the
    shortcut addition. The first block of stages 1 and 2 downsamples with a
    convolution of strides=2 and the number of filters doubles from one
    stage to the next. Within a stage, all layers share the same number of
    filters and the same feature map size.
    Feature map sizes:
    stage 0: 32x32, 16
    stage 1: 16x16, 32
    stage 2:  8x8,  64
    The number of parameters is approx the same as Table 6 of [a]:
    ResNet20 0.27M
    ResNet32 0.46M
    ResNet44 0.66M
    ResNet56 0.85M
    ResNet110 1.7M

    # Arguments
        input_shape (tensor): shape of the input image tensor
        depth (int): number of core convolutional layers, must be 6n+2
        num_classes (int): number of output units (CIFAR10 has 10 classes)

    # Returns
        model (Model): Keras model instance

    # Raises
        ValueError: if depth is not of the form 6n+2
    """
    if (depth - 2) % 6:
        raise ValueError('depth should be 6n+2 (eg 20, 32, 44 in [a])')
    num_res_blocks = int((depth - 2) / 6)

    # Stem: a single conv-bn-relu layer with the default 16 filters.
    inputs = layers.Input(shape=input_shape)
    x = resnet_layer(inputs=inputs)

    # Three stages with 16, 32 and 64 filters respectively.
    for stack in range(3):
        num_filters = 16 * 2 ** stack
        for res_block in range(num_res_blocks):
            # Only the first block of stages 1 and 2 downsamples.
            opens_new_stage = stack > 0 and res_block == 0
            strides = 2 if opens_new_stage else 1

            residual = resnet_layer(inputs=x,
                                    num_filters=num_filters,
                                    strides=strides)
            residual = resnet_layer(inputs=residual,
                                    num_filters=num_filters,
                                    activation=None)
            if opens_new_stage:
                # 1x1 linear projection so the shortcut matches the
                # changed dimensions of the residual branch.
                x = resnet_layer(inputs=x,
                                 num_filters=num_filters,
                                 kernel_size=1,
                                 strides=strides,
                                 activation=None,
                                 batch_normalization=False)
            x = tf.keras.layers.add([x, residual])
            x = layers.Activation('relu')(x)

    # Classifier head; v1 uses no BN after the last shortcut-ReLU.
    pooled = layers.AveragePooling2D(pool_size=8)(x)
    flattened = layers.Flatten()(pooled)
    outputs = layers.Dense(num_classes,
                           kernel_initializer='he_normal')(flattened)

    return tf.keras.models.Model(inputs=inputs, outputs=outputs)

In [51]:
# Blank line, then the section banner.
print("", "================ Loading teacher model ================", sep="\n")

# Restore the saved teacher from the "Resnet26_from_logits" path.
teacher = tf.keras.models.load_model("Resnet26_from_logits")

# Evaluate the teacher on the test split.
print("Evaluation of Teacher model!")
teacher.evaluate(x_test, y_test)
print("")


Evaluation of Teacher model!



## Knowledge Distillation

In [52]:
def stud_model(): 
  """Build the fully-connected student network.

  The model flattens a (32, 32, 3) input and applies three Dense + ReLU
  blocks with 32, 16 and 8 units, followed by a Dense layer with 10 units
  and no activation.

  Returns:
    An uncompiled `tf.keras.Model`.
  """
  # `inputs` rather than `input`, to avoid shadowing the builtin.
  inputs = tf.keras.Input((32, 32, 3))

  x = tf.keras.layers.Flatten()(inputs)
  for units in (32, 16, 8):
    x = tf.keras.layers.Dense(units)(x)
    x = tf.keras.layers.ReLU()(x)

  outputs = tf.keras.layers.Dense(10)(x)

  return tf.keras.Model(inputs, outputs)

# Instantiate the student network.
student = stud_model()

# Second model with the same architecture, obtained by cloning `student`.
student_scratch = tf.keras.models.clone_model(student)
# Memory estimate in megabytes: parameter count times 4
# (presumably 4 bytes per float32 parameter -- confirm the weight dtype).
print("Mémoire occupée par le student model: ",(student_scratch.count_params()*4) /1e6,' Mo')
student.summary()

Mémoire occupée par le student model:  0.39636  Mo
Model: "model_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_14 (InputLayer)       [(None, 32, 32, 3)]       0         
                                                                 
 flatten_13 (Flatten)        (None, 3072)              0         
                                                                 
 dense_52 (Dense)            (None, 32)                98336     
                                                                 
 re_lu_39 (ReLU)             (None, 32)                0         
                                                                 
 dense_53 (Dense)            (None, 16)                528       
                                                                 
 re_lu_40 (ReLU)             (None, 16)                0         
                                                                 
 dense_

In [53]:
# Round-trip the weights through get_weights / set_weights, then list the
# shape of every weight array.
w = student.get_weights()
student.set_weights(w)
# Iterate over the list already fetched instead of calling get_weights()
# (a full copy of all weights) twice per iteration.
for weight_array in w:
  print(weight_array.shape)

(3072, 32)
(32,)
(32, 16)
(16,)
(16, 8)
(8,)
(8, 10)
(10,)


# Knowledge Distillation


## Entrainement par KD

In [None]:
# Paramètre d'entrainement
Epochs = 100
Batch = 32

# Construction du distilleur
student = stud_model()

student.compile(
    optimizer = tf.keras.optimizers.SGD(learning_rate= 0.1, momentum=0.9),
    metrics = [tf.keras.metrics.SparseCategoricalAccuracy()],
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    )


# Distill teacher to student
t1 = time.time()
hist= student.fit(x_train, y_train, epochs=Epochs, batch_size = Batch)
t2 = time.time()
time_dist = t2 - t1

# Evaluate student on test dataset
student.evaluate(x_test, y_test)

Epoch 1/100
