# Load Model

In [113]:
import h5py
from keras.models import load_model

print('loading the model...')
sunglasses_bd_net = load_model('/content/sunglasses_bd_net.h5')
sunglasses_bd_net.load_weights('/content/sunglasses_bd_weights.h5')
multi_trigger_multi_target_bd_net = load_model('/content/multi_trigger_multi_target_bd_net.h5')
multi_trigger_multi_target_bd_net.load_weights('/content/multi_trigger_multi_target_bd_weights.h5')
anonymous_1_bd_net = load_model('/content/anonymous_1_bd_net.h5')
anonymous_1_bd_net.load_weights('/content/anonymous_1_bd_weights.h5')
anonymous_2_bd_net = load_model('/content/anonymous_2_bd_net.h5')
anonymous_2_bd_net.load_weights('/content/anonymous_2_bd_weights.h5')

sunglasses_bd_net.summary()

loading the model...
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, 55, 47, 3)]  0                                            
__________________________________________________________________________________________________
conv_1 (Conv2D)                 (None, 52, 44, 20)   980         input[0][0]                      
__________________________________________________________________________________________________
pool_1 (MaxPooling2D)           (None, 26, 22, 20)   0           conv_1[0][0]                     
__________________________________________________________________________________________________
conv_2 (Conv2D)                 (None, 24, 20, 40)   7240        pool_1[0][0]                     
_______________________________________________________________________

# Choosing bad net model and corresponding testing data

In [114]:
bd_net = sunglasses_bd_net
cut_num = 25
fit_epoch = 10
#poisoned_data_file_path = '/content/Multi_trigger_file/sunglasses_poisoned_data.h5'
poisoned_data_file_path = '/content/sunglasses_poisoned_data.h5'
test_data = '/content/clean_test_data.h5'
train_data = '/content/clean_validation_data.h5'

# Bad net proformance on clean data 

In [115]:
import keras
import keras.backend as K
from keras import initializers
import numpy as np
import tensorflow as tf

def data_loader(filepath):
  data = h5py.File(filepath)
  x_data = np.array(data['data'])
  y_data = np.array(data['label'])
  x_data = x_data.transpose((0,2,3,1))

  return x_data, y_data

def data_process(x_data):
  return x_data/255


x_data, y_data = data_loader(train_data)
x_data = data_process(x_data)

clean_label_p = np.argmax(bd_net.predict(x_data), axis=1)
class_accu = np.mean(np.equal(clean_label_p, y_data))*100
print('Classification accuracy:', class_accu)


  


Classification accuracy: 97.88689702953148


# Get the increasing order of average activations of neurons in the final convolutional layer of the face recognition network

In [116]:
layer = bd_net.get_layer('pool_3')
keras_function = K.function([bd_net.input], [layer.output])
layer_outs = keras_function([x_data])
out = np.array(layer_outs)
out.shape
activation = np.mean(out, axis=(0,1,2,3))


# Creating pruning position list for later using

In [117]:
pruning_position = np.ones(conv3_num, dtype=bool)
ascending = np.argsort(activation)
for i in range(cut_num):
  index = ascending[i]
  pruning_position[index] = 0

pruning_position


array([False, False, False,  True,  True,  True,  True,  True, False,
       False,  True,  True, False, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True, False,  True,  True, False, False, False, False,  True,
       False, False,  True,  True,  True,  True, False,  True, False,
        True,  True, False, False, False, False,  True, False,  True,
        True,  True,  True, False, False,  True])

# Create an empty model with spcify number of neurons in final convolutional layer for receiving weight and bias of each layer after pruning

In [118]:
def Net():
	# define input
	x = keras.Input(shape=(55, 47, 3), name='input')
	# feature extraction
	conv_1 = keras.layers.Conv2D(20, (4, 4), activation='relu', name='conv_1')(x)
	pool_1 = keras.layers.MaxPooling2D((2, 2), name='pool_1')(conv_1)
	conv_2 = keras.layers.Conv2D(40, (3, 3), activation='relu', name='conv_2')(pool_1)
	pool_2 = keras.layers.MaxPooling2D((2, 2), name='pool_2')(conv_2)
	conv_3 = keras.layers.Conv2D((60 - cut_num), (3, 3), activation='relu', name='conv_3')(pool_2)
	pool_3 = keras.layers.MaxPooling2D((2, 2), name='pool_3')(conv_3)
	# first interpretation model
	flatten_1 = keras.layers.Flatten(name='flatten_1')(pool_3)	
	fc_1 = keras.layers.Dense(160, name='fc_1')(flatten_1)
	# second interpretation model
	conv_4 = keras.layers.Conv2D(80, (2, 2), activation='relu', name='conv_4')(pool_3)
	flatten_2 = keras.layers.Flatten(name='flatten_2')(conv_4)
	fc_2 = keras.layers.Dense(160, name='fc_2')(flatten_2)
	# merge interpretation
	merge = keras.layers.Add(name='add_1')([fc_1, fc_2])
	add_1 = keras.layers.Activation(activation='relu', name='activation_1')(merge)
	drop = keras.layers.Dropout(0.5)
	# output
	y_hat = keras.layers.Dense(1283, activation='softmax', name='output')(add_1)
	model = keras.Model(inputs=x, outputs=y_hat)
	# summarize layers
	#print(model.summary())
	# plot graph
	#plot_model(model, to_file='model_architecture.png')

	return model


K.clear_session()
my_model = Net()

# Copy the weight to empty model and strip the weight of each layer that directly connecting with the last Pooling layer by using pruning position list

In [119]:
conv_3_weight = bd_net.get_layer('conv_3').get_weights()[0]
conv_3_bias = bd_net.get_layer('conv_3').get_weights()[1]
conv_4_weight = bd_net.get_layer('conv_4').get_weights()[0]
conv_4_bias = bd_net.get_layer('conv_4').get_weights()[1]
fc_1_weight = bd_net.get_layer('fc_1').get_weights()[0]
fc_1_bias = bd_net.get_layer('fc_1').get_weights()[1]

for layer in my_model.layers:
  if layer.name == 'conv_3':
    my_model.get_layer('conv_3').set_weights([conv_3_weight[:,:,:,pruning_position],conv_3_bias[pruning_position]])
  elif layer.name == 'conv_4':
    my_model.get_layer('conv_4').set_weights([conv_4_weight[:,:,pruning_position,:],conv_4_bias])
  elif layer.name == 'fc_1':
    my_model.get_layer('fc_1').set_weights([fc_1_weight.reshape(60,20,-1)[pruning_position,:,:].reshape((60-cut_num)*20,-1),fc_1_bias])
  else:
    my_model.get_layer(layer.name).set_weights(bd_net.get_layer(layer.name).get_weights())


# Fit new model with clean training data

In [120]:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
my_model.compile(optimizer='adam',
              loss=loss_fn,
              metrics=['accuracy'])
my_model.fit(x_data, y_data, epochs=fit_epoch)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0d38e79588>

# New model performance on clean training data

In [121]:
clean_label_p = np.argmax(my_model.predict(x_data), axis=1)
class_accu = np.mean(np.equal(clean_label_p, y_data))*100
print('Classification accuracy:', class_accu)

Classification accuracy: 97.46254438382263


# New model performance on posion data

In [122]:
x_posion_data, y_posion_data = data_loader(poisoned_data_file_path)
x_posion_data = data_process(x_posion_data)

clean_label_p = np.argmax(my_model.predict(x_posion_data), axis=1)
class_accu = np.mean(np.equal(clean_label_p, y_posion_data))*100
print('Classification accuracy:', class_accu)

  


Classification accuracy: 2.1745908028059238


# Bad net performance on clean testing data

In [123]:
x_test_data, y_test_data = data_loader(test_data)
x_test_data = data_process(x_test_data)

clean_label_p = np.argmax(bd_net.predict(x_test_data), axis=1)
class_accu = np.mean(np.equal(clean_label_p, y_test_data))*100
print('Classification accuracy:', class_accu)

  


Classification accuracy: 97.77864380358535


# New model performance on clean testing data

In [124]:
x_test_data, y_test_data = data_loader(test_data)
x_test_data = data_process(x_test_data)

clean_label_p = np.argmax(my_model.predict(x_test_data), axis=1)
class_accu = np.mean(np.equal(clean_label_p, y_test_data))*100
print('Classification accuracy:', class_accu)

  


Classification accuracy: 85.22213561964148


# Create new testing data set that combine the test data and posion data with N+1 as posion data label

In [125]:
y_posion_label = np.ones(len(y_posion_data))
y_posion_label = [x * 1283 for x in y_posion_label]
x_mix = np.concatenate((x_test_data, x_posion_data), axis=0)
y_mix = np.concatenate((y_test_data, y_posion_label), axis=0)

# Using the differ of bad net prediction and new model prediction to determine whether or not the input is backdoored, new model performance on mix testing data

In [126]:
badnet_label_p = np.argmax(bd_net.predict(x_mix), axis=1)
clean_label_p = np.argmax(my_model.predict(x_mix), axis=1)

for i in range(len(clean_label_p)):
  if badnet_label_p[i] != clean_label_p[i]:
    clean_label_p[i] = 1283

class_accu = np.mean(np.equal(clean_label_p, y_mix))*100
print('Classification accuracy:', class_accu)

Classification accuracy: 91.1925175370226


# Bad net performance on mix testing data

In [127]:
class_accu = np.mean(np.equal(badnet_label_p, y_mix))*100
print('Classification accuracy:', class_accu)

Classification accuracy: 48.88932190179268


# Save the model


In [131]:
my_model.save('sunglasses_new_model.h5') 

# Another approach to determine whether or not the input is backdoored, by checking the average max value of each prediction on test data and position data, since the poison data will have lower average max value of prediction, we can use find a threshold to distinguish the input type

In [129]:
number = 0
good_label = np.max(my_model.predict(x_test_data), axis=1)
print(np.mean(good_label))
for label in good_label:
  if label >= 0.99:
    number += 1
print(number/len(good_label))

0.91869336
0.6343725643024162


In [130]:
number = 0
bad_label = np.max(my_model.predict(x_posion_data), axis=1)
np.mean(bad_label)
print(np.mean(bad_label))
for label in bad_label:
  if label < 0.99:
    number += 1
print(number/len(bad_label))

0.75410485
0.8359314107560405
