# Deep Learning Project

## Milestone report

## Study of the impact on the ratio of labeled to unlabeled data on top-1 accuracy on the MNIST dataset

Pierre Andurand (pa2570)
Tzu Yi Chuang (tc3075)
Kuan Yu Ko (kk3376)


Below we train a simple model in supervised learning without data augmentation. We check its performance on the mnist dataset. It will be our un-noised teacher model. And we will compare its performance with 36 epochs to the semi supervised self-learning model in the following block
# Test 1

In [3]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))





from __future__ import print_function
import tensorflow.keras as keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
import os
import numpy as np

batch_size = 200
num_classes = 10
epochs = 36
data_augmentation = False
#num_predictions = 20
save_dir = os.path.join(os.getcwd(), 'saved_models')
teacher_name = 'keras_mnist_trained_teacher.h5'

# Load the data, split between train and test sets:
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train=x_train.reshape(x_train.shape[0],28,28,1).astype('float32')/255
x_test=x_test.reshape(x_test.shape[0],28,28,1).astype('float32')/255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# Convert class vectors to binary class matrices.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

teacher = Sequential()
teacher.add(Conv2D(32, (3, 3), padding='same',
                 input_shape=x_train.shape[1:]))
teacher.add(Activation('relu'))
teacher.add(Conv2D(32, (3, 3)))
teacher.add(Activation('relu'))
teacher.add(MaxPooling2D(pool_size=(2, 2)))
teacher.add(Dropout(0.25))

teacher.add(Conv2D(64, (3, 3), padding='same'))
teacher.add(Activation('relu'))
teacher.add(Conv2D(64, (3, 3)))
teacher.add(Activation('relu'))
teacher.add(MaxPooling2D(pool_size=(2, 2)))
teacher.add(Dropout(0.25))

teacher.add(Flatten())
teacher.add(Dense(512))
teacher.add(Activation('relu'))
#model.add(Dropout(0.5))
teacher.add(Dense(num_classes))
teacher.add(Activation('softmax'))


# Compiling the model using RMSprop
teacher.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

#Training the model

if not data_augmentation:
    print('Not using data augmentation.')
    teacher.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              shuffle=True)
else:
    print('Using real-time data augmentation.')
    # This will do preprocessing and realtime data augmentation:
    datagen = ImageDataGenerator(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        zca_epsilon=1e-06,  # epsilon for ZCA whitening
        rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
        # randomly shift images horizontally (fraction of total width)
        width_shift_range=0.1,
        # randomly shift images vertically (fraction of total height)
        height_shift_range=0.1,
        shear_range=0.,  # set range for random shear
        zoom_range=0.,  # set range for random zoom
        channel_shift_range=0.,  # set range for random channel shifts
        # set mode for filling points outside the input boundaries
        fill_mode='nearest',
        cval=0.,  # value used for fill_mode = "constant"
        horizontal_flip=True,  # randomly flip images
        vertical_flip=False,  # randomly flip images
        # set rescaling factor (applied before any other transformation)
        rescale=None,
        # set function that will be applied on each input
        preprocessing_function=None,
        # image data format, either "channels_first" or "channels_last"
        data_format=None,
        # fraction of images reserved for validation (strictly between 0 and 1)
        validation_split=0.0)

    # Compute quantities required for feature-wise normalization
    # (std, mean, and principal components if ZCA whitening is applied).
    datagen.fit(x_train)

    # Fit the model on the batches generated by datagen.flow().
    teacher.fit_generator(datagen.flow(x_train, y_train,
                                     batch_size=batch_size),
                        epochs=epochs,
                        validation_split=0.2,
                        workers=4)
    
# Save model and weights
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
teacher_path = os.path.join(save_dir, teacher_name)
teacher.save(teacher_path)
print('Saved trained model at %s ' % teacher_path)

# Score trained model.
scores = teacher.evaluate(x_test, y_test, verbose=1)
print('Supervised learning model with '+str(epochs)+'epochs \n')
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])


Found GPU at: /device:GPU:0
x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples
Not using data augmentation.
Epoch 1/36
Epoch 2/36
Epoch 3/36
Epoch 4/36
Epoch 5/36
Epoch 6/36
Epoch 7/36
Epoch 8/36
Epoch 9/36
Epoch 10/36
Epoch 11/36
Epoch 12/36
Epoch 13/36
Epoch 14/36
Epoch 15/36
Epoch 16/36
Epoch 17/36
Epoch 18/36
Epoch 19/36
Epoch 20/36
Epoch 21/36
Epoch 22/36
Epoch 23/36
Epoch 24/36
Epoch 25/36
Epoch 26/36
Epoch 27/36
Epoch 28/36
Epoch 29/36
Epoch 30/36
Epoch 31/36
Epoch 32/36
Epoch 33/36
Epoch 34/36
Epoch 35/36
Epoch 36/36
Saved trained model at /content/saved_models/keras_mnist_trained_teacher.h5 
Supervised learning model with 36epochs 

Test loss: 0.037561338394880295
Test accuracy: 0.9883999824523926


The accuracy of the fully supervised model is 0.9884. 
# Test 1
Below we would like to check if starting from the weights of the fully supervised model trained above, we would get an increase in accuracy if we run STNS on the full dataset with different ratios of labeled to unlabeled data. And if the accuracy goes up, we would like to see what is the optimal ratio. 
The STNS algorithm used is as follows:
We do a loop over different ratios of label to unlabelled data (rate). And each loop does the following:
1) take the weights from the fully supervised teacher model trained in box above 
2) Ten cycles of: un-noised model (teacher)->predict hard pseudolabel->training 10 epochs for noised model (student=teacher+dropout noise) on labeled+pseudo labeled->new weights. 

We check rates 0.1,0.25,0.5,1,2.5,5,10,20. The student model will be the teacher model noised by a Dropout(0.5) before the last layer

In [4]:


#batch_size = 32
num_classes = 10
#epochs = 50
#data_augmentation = False
#num_predictions = 20
save_dir = os.path.join(os.getcwd(), 'saved_models')
teacher10_name = 'keras_mnist_trained_teacher10.h5'

teacher10_path = os.path.join(save_dir, teacher10_name)

# Load the mnist data, split between train and test sets:
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train=x_train.reshape(x_train.shape[0],28,28,1).astype('float32')/255
x_test=x_test.reshape(x_test.shape[0],28,28,1).astype('float32')/255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# Convert class vectors to binary class matrices.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)




# rate= n_true/n_pseudo
rate=np.array([0.1,0.25,0.5,1,2.5,5,10,20])
n_total=x_train.shape[0]

# total number of train images (n_total) = number of true label images (n_true) + number of pseudo label images (n_pseudo)
#                                        = n_pseudo(rate+1)
# n_pseudo = n_total/(rate+1); n_true=n_total-n_pseudo

#loop over rate values in order to find the optimal rate value for the self-learning semi supervised learning, 
#ie one that will maximize accuracy
for r in rate:
    
    #making un-noised teacher model
    teacher = Sequential()
    teacher.add(Conv2D(32, (3, 3), padding='same',
                     input_shape=x_train.shape[1:]))
    teacher.add(Activation('relu'))
    teacher.add(Conv2D(32, (3, 3)))
    teacher.add(Activation('relu'))
    teacher.add(MaxPooling2D(pool_size=(2, 2)))
    teacher.add(Dropout(0.25))

    teacher.add(Conv2D(64, (3, 3), padding='same'))
    teacher.add(Activation('relu'))
    teacher.add(Conv2D(64, (3, 3)))
    teacher.add(Activation('relu'))
    teacher.add(MaxPooling2D(pool_size=(2, 2)))
    teacher.add(Dropout(0.25))

    teacher.add(Flatten())
    teacher.add(Dense(512))
    teacher.add(Activation('relu'))
    #model.add(Dropout(0.5)) #this will be uncommented for the noised student model
    teacher.add(Dense(num_classes))
    teacher.add(Activation('softmax'))

  

    # Compile the teacher model using RMSprop
    teacher.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])





    #noised student model

    student = Sequential()
    student.add(Conv2D(32, (3, 3), padding='same',
                 input_shape=x_train.shape[1:]))
    student.add(Activation('relu'))
    student.add(Conv2D(32, (3, 3)))
    student.add(Activation('relu'))
    student.add(MaxPooling2D(pool_size=(2, 2)))
    student.add(Dropout(0.25))

    student.add(Conv2D(64, (3, 3), padding='same'))
    student.add(Activation('relu'))
    student.add(Conv2D(64, (3, 3)))
    student.add(Activation('relu'))
    student.add(MaxPooling2D(pool_size=(2, 2)))
    student.add(Dropout(0.25))

    student.add(Flatten())
    student.add(Dense(512))
    student.add(Activation('relu'))
    student.add(Dropout(0.5))
    student.add(Dense(num_classes))
    student.add(Activation('softmax'))



    # Compiling the model using RMSprop
    student.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    
    
    print("rate="+str(r)+":\n")
    n_pseudo=n_total/(1+r)
    n_true=n_total-n_pseudo
    print(n_pseudo)
    print(n_true)
    mask_true=np.random.choice(int(n_total),int(n_true),replace=False) #generating n_true integers between 0 and n_total-1
    mask_pseudo=[item for item in range(n_total) if item not in mask_true] #all the other numbers between 0 and n_total-1 which are not in mask_true
    mask_pseudo=np.array(mask_pseudo)
    x_true=x_train[mask_true[:]] #x for the labeled data
    y_true=y_train[mask_true[:]] #y for the labeled data
    x_pseudo=x_train[mask_pseudo[:]] #x for the unlabeled data (pseudo)
    #taking weights from box above
    teacher.load_weights(teacher_path)
    #evaluating teacher model on test data    
    scores=teacher.evaluate(x_test,y_test,verbose=0)
    print("Original model with labelled data only predicting on test data: ",scores[1])

    x_true_pseudo=np.concatenate([x_true,x_pseudo]) #concatenating x for labeled and unlabeled data
    print('x_true_pseudo.shape: ',x_true_pseudo.shape)
    prediction=teacher.predict_classes(x_pseudo) #predicting labels on unlabeled data
    y_pseudo=keras.utils.to_categorical(prediction, num_classes)
    y_true_pseudo=np.concatenate([y_true,y_pseudo]) #concatenating y for labeled and pseudo labeled
    print('y_true_pseudo.shape: ', y_true_pseudo.shape)
    for i in range(10): 
        # 10 loops of 10 epochs of noised student training for labeled and pseudo labeled data (step 3 in article)
        # followed by generating predictions on unlabeled data with the teacher model (=un-noised student)
        # which uses the weights of the trained noised student (noise does not change the weights structure of models) (step 2 in article)
        print(i)
        training=student.fit(x_true_pseudo,y_true_pseudo,validation_split=0.,
                             epochs=10,batch_size=200,verbose=0)
        # Save weights
        student.save_weights(teacher10_path)
        # Load weights for teacher model (un-noised)
        teacher.load_weights(teacher10_path)
        prediction=teacher.predict_classes(x_pseudo)
        scores=teacher.evaluate(x_test,y_test,verbose=0) #evaluating model on test data
        print('iteration: ',i)
        print('Test loss:', scores[0])
        print('Test accuracy:', scores[1])
        y_pseudo=keras.utils.to_categorical(prediction, num_classes)
        y_true_pseudo=np.concatenate([y_true,y_pseudo]) #new y_true_pseudo to be used in next loop
        




x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples
rate=0.1:

54545.454545454544
5454.545454545456
Original model with labelled data only predicting on test data:  0.9666000008583069
x_true_pseudo.shape:  (60000, 28, 28, 1)
Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
y_true_pseudo.shape:  (60000, 10)
0
iteration:  0
Test loss: 0.05278007313609123
Test accuracy: 0.9828000068664551
1
iteration:  1
Test loss: 0.042510077357292175
Test accuracy: 0.987500011920929
2
iteration:  2
Test loss: 0.04345833882689476
Test accuracy: 0.9876000285148621
3
iteration:  3
Test loss: 0.04679572954773903
Test accuracy: 0.989799976348877
4
iteration:  4
Test loss: 0.04688131436705589
Test accuracy: 0

We can conclude that a ratio of labeled/unlabeled above 0.3 adds some accuracy on the full training set of MNIST. It means that having less than one third of the total training set kept as unlabeled data helps the accuracy relative to the fully supervised version. Below we will do the same experiment as above, but taking a small sample of the MNIST dataset, with only 100 images in total from the training dataset, and its accuracy calculated against the full test dataset. 
## Test 2

In [0]:
#supervised learning on small training dataset, testing on full testing dataset


from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
import os
import numpy as np

#batch_size = 32
num_classes = 10
#epochs = 50
#data_augmentation = False
#num_predictions = 20
save_dir = os.path.join(os.getcwd(), 'saved_models')
teacher2_name = 'keras_mnist_trained2_teacher.h5'

teacher2_path = os.path.join(save_dir, teacher_name)

#making un-noised teacher model
teacher2 = Sequential()
teacher2.add(Conv2D(32, (3, 3), padding='same',
                 input_shape=x_train.shape[1:]))
teacher2.add(Activation('relu'))
teacher2.add(Conv2D(32, (3, 3)))
teacher2.add(Activation('relu'))
teacher2.add(MaxPooling2D(pool_size=(2, 2)))
teacher2.add(Dropout(0.25))

teacher2.add(Conv2D(64, (3, 3), padding='same'))
teacher2.add(Activation('relu'))
teacher2.add(Conv2D(64, (3, 3)))
teacher2.add(Activation('relu'))
teacher2.add(MaxPooling2D(pool_size=(2, 2)))
teacher2.add(Dropout(0.25))

teacher2.add(Flatten())
teacher2.add(Dense(512))
teacher2.add(Activation('relu'))
#model.add(Dropout(0.5)) #this will be uncommented for the noised student model
teacher2.add(Dense(num_classes))
teacher2.add(Activation('softmax'))

# initiate RMSprop optimizer
opt = keras.optimizers.RMSprop(learning_rate=0.0001, decay=1e-6)

# Compile the teacher model using RMSprop
teacher2.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])





# Load the mnist data, split between train and test sets:
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train=x_train.reshape(x_train.shape[0],28,28,1).astype('float32')/255
x_test=x_test.reshape(x_test.shape[0],28,28,1).astype('float32')/255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')
print(y_train.shape)

#checking that classes are balanced

#print(np.unique(y_train))
#print(y_train[0:40])

sample_size=100
for i in range(10):
    print(str(i)+":",sum(y_train==i))

#Selecting 10 images of each class
k=0
x_small_train=np.zeros((sample_size,28,28,1))
y_small_train=np.full((sample_size,),-1)

for i in range(x_train.shape[0]):
    #print(i)
    for j in range(10):
        if sum(y_small_train==j)<sample_size/10:
            if y_train[i]==j:
                x_small_train[k,:]=x_train[i,:]
                y_small_train[k]=y_train[i]
                k+=1
                break
    #print('k=',k)
    if k==sample_size:
        break
        
#print(y_small_train[0:40])
print(x_small_train.shape)
print(y_small_train.shape)

#verifying that there are 10 images in each class
for i in range(10):
    print(str(i)+":",sum(y_small_train==i))
    
# Convert class vectors to binary class matrices.
y_small_train = keras.utils.to_categorical(y_small_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)    

#train teacher model
teacher2.fit(x_small_train, y_small_train,
            batch_size=10,
            epochs=100,
            validation_split=0.,
            shuffle=True)
teacher2.save_weights(teacher2_path)

# Score trained model.
scores = teacher2.evaluate(x_test, y_test, verbose=1)
print('Small sample of 100 training images, Supervised learning model with '+str(100)+'epochs \n')
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])


x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples
(60000,)
0: 5923
1: 6742
2: 5958
3: 6131
4: 5842
5: 5421
6: 5918
7: 6265
8: 5851
9: 5949
(100, 28, 28, 1)
(100,)
0: 10
1: 10
2: 10
3: 10
4: 10
5: 10
6: 10
7: 10
8: 10
9: 10
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Small sample of 100 training images, Supervised learning model with 100epochs 

Test loss: 2.1747830513879656
Test accuracy: 0.7692999839782715


We get an accuracy of 0.7693 for this small dataset. Now we keep the same labeled training set of 100 images, and add an unlabeled training set determined by the ratio and run STSN. Before starting the loop we use the results of the trained teacher on the full dataset done above.

In [0]:
# Varying the ratio for 100 labeled images. Rest of training dataset unlabeled with ratio determining size of total dataset.
# Testing on full test dataset
save_dir = os.path.join(os.getcwd(), 'saved_models')
teacher3_name = 'keras_mnist_trained3_teacher.h5'

teacher3_path = os.path.join(save_dir, teacher3_name)

# Load the mnist data, split between train and test sets:
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train=x_train.reshape(x_train.shape[0],28,28,1).astype('float32')/255
x_test=x_test.reshape(x_test.shape[0],28,28,1).astype('float32')/255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')
print(y_train.shape)

#checking that classes are balanced

#print(np.unique(y_train))
#print(y_train[0:40])

sample_size=100
for i in range(10):
    print(str(i)+":",sum(y_train==i))

#Selecting 10 images of each class
k=0
x_small_train=np.zeros((sample_size,28,28,1))
y_small_train=np.full((sample_size,),-1)

for i in range(x_train.shape[0]):
    #print(i)
    for j in range(10):
        if sum(y_small_train==j)<sample_size/10:
            if y_train[i]==j:
                x_small_train[k,:]=x_train[i,:]
                y_small_train[k]=y_train[i]
                k+=1
                break
    #print('k=',k)
    if k==sample_size:
        break
        
#print(y_small_train[0:40])
print(x_small_train.shape)
print(y_small_train.shape)

#verifying that there are 10 images in each class
for i in range(10):
    print(str(i)+":",sum(y_small_train==i))
    
# Convert class vectors to binary class matrices.
y_small_train = keras.utils.to_categorical(y_small_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)    





# rate= n_true/n_pseudo, n_true=sample_size
rate=np.array([0.1,0.25,0.5,0.75,1,2.5,5])
#n_total=sample_size*(1+1/rate)

# total number of train images (n_total) = number of true label images (sample_size) + number of pseudo label images (n_pseudo)
#                                        = sample_size(1/rate+1)
# n_pseudo = sample_size/rate

#loop over rate values in order to find the optimal rate value for the self-learning semi supervised learning, 
#ie one that will maximize accuracy
for r in rate:
    #making un-noised teacher model
    teacher = Sequential()
    teacher.add(Conv2D(32, (3, 3), padding='same',
                 input_shape=x_train.shape[1:]))
    teacher.add(Activation('relu'))
    teacher.add(Conv2D(32, (3, 3)))
    teacher.add(Activation('relu'))
    teacher.add(MaxPooling2D(pool_size=(2, 2)))
    teacher.add(Dropout(0.25))

    teacher.add(Conv2D(64, (3, 3), padding='same'))
    teacher.add(Activation('relu'))
    teacher.add(Conv2D(64, (3, 3)))
    teacher.add(Activation('relu'))
    teacher.add(MaxPooling2D(pool_size=(2, 2)))
    teacher.add(Dropout(0.25))

    teacher.add(Flatten())
    teacher.add(Dense(512))
    teacher.add(Activation('relu'))
    #model.add(Dropout(0.5)) #this will be uncommented for the noised student model
    teacher.add(Dense(num_classes))
    teacher.add(Activation('softmax'))


    # Compile the teacher model using RMSprop
    teacher.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    #noised student model

    student = Sequential()
    student.add(Conv2D(32, (3, 3), padding='same',
                 input_shape=x_train.shape[1:]))
    student.add(Activation('relu'))
    student.add(Conv2D(32, (3, 3)))
    student.add(Activation('relu'))
    student.add(MaxPooling2D(pool_size=(2, 2)))
    student.add(Dropout(0.25))

    student.add(Conv2D(64, (3, 3), padding='same'))
    student.add(Activation('relu'))
    student.add(Conv2D(64, (3, 3)))
    student.add(Activation('relu'))
    student.add(MaxPooling2D(pool_size=(2, 2)))
    student.add(Dropout(0.25))

    student.add(Flatten())
    student.add(Dense(512))
    student.add(Activation('relu'))
    student.add(Dropout(0.5))
    student.add(Dense(num_classes))
    student.add(Activation('softmax'))

   
    # Compiling the model using RMSprop
    student.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    print("rate="+str(r)+":\n")
    n_pseudo=int(sample_size/r)
    n_true=sample_size
    n_total=n_true+n_pseudo
    print(n_pseudo, n_true, n_total)
    x_true=x_small_train
    y_true=y_small_train
    x_pseudo=x_train[500:500+n_pseudo,:] #x for the unlabeled data (pseudo)
    teacher.load_weights(teacher2_path) #we run it in the block above
    #evaluating teacher model on test data    
    scores=teacher.evaluate(x_test,y_test,verbose=0)
    print("Original model with labelled data only predicting on test data: ",scores[1])

    x_true_pseudo=np.concatenate([x_true,x_pseudo]) #concatenating x for labeled and unlabeled data
    print('x_true_pseudo.shape: ',x_true_pseudo.shape)
    prediction=teacher.predict_classes(x_pseudo) #predicting labels on unlabeled data
    y_pseudo=keras.utils.to_categorical(prediction, num_classes)
    y_true_pseudo=np.concatenate([y_true,y_pseudo]) #concatenating y for labeled and pseudo labeled
    print('y_true_pseudo.shape: ', y_true_pseudo.shape)
    for i in range(10): 
        # 10 loops of 10 epochs of noised student training for labeled and pseudo labeled data (step 3 in article)
        # followed by generating predictions on unlabeled data with the teacher model (=un-noised student)
        # which uses the weights of the trained noised student (noise does not change the weights structure of models) (step 2 in article)
        print(i)
        training=student.fit(x_true_pseudo,y_true_pseudo,validation_split=0.,
                             epochs=10,batch_size=int(n_total/10),verbose=0)
        # Save weights
        student.save_weights(teacher3_path)
        # Load weights for teacher model (un-noised)
        teacher.load_weights(teacher3_path)
        prediction=teacher.predict_classes(x_pseudo)
        scores=teacher.evaluate(x_test,y_test,verbose=0) #evaluating model on test data
        print('iteration: ',i)
        print('Test loss:', scores[0])
        print('Test accuracy:', scores[1])
        y_pseudo=keras.utils.to_categorical(prediction, num_classes)
        y_true_pseudo=np.concatenate([y_true,y_pseudo]) #new y_true_pseudo to be used in next loop
        


#from keras.utils import plot_model
#plot_model(teacher,to_file='teacher.png')
#plot_model(student,to_file='student.png')

x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples
(60000,)
0: 5923
1: 6742
2: 5958
3: 6131
4: 5842
5: 5421
6: 5918
7: 6265
8: 5851
9: 5949
(100, 28, 28, 1)
(100,)
0: 10
1: 10
2: 10
3: 10
4: 10
5: 10
6: 10
7: 10
8: 10
9: 10
rate=0.1:

1000 100 1100
Original model with labelled data only predicting on test data:  0.7692999839782715
x_true_pseudo.shape:  (1100, 28, 28, 1)
y_true_pseudo.shape:  (1100, 10)
0
iteration:  0
Test loss: 0.8755774250030518
Test accuracy: 0.7418000102043152
1
iteration:  1
Test loss: 1.1031837633371353
Test accuracy: 0.7556999921798706
2
iteration:  2
Test loss: 1.25390972969532
Test accuracy: 0.7583000063896179
3
iteration:  3
Test loss: 1.270622627222538
Test accuracy: 0.769599974155426
4
iteration:  4
Test loss: 1.4328455243647098
Test accuracy: 0.7674000263214111
5
iteration:  5
Test loss: 1.5072111678063869
Test accuracy: 0.7696999907493591
6
iteration:  6
Test loss: 1.5471277669161558
Test accuracy: 0.7713000178337097
7
iteration:  7


The ratio from 1 to 2.5 helps the accuracy by around 3%. The error rate is reduced by around 13%. Let's try below with 1000 images

In [0]:
#supervised learning on small training dataset, testing on full testing dataset


from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
import os
import numpy as np

#batch_size = 32
num_classes = 10
#epochs = 50
#data_augmentation = False
#num_predictions = 20
save_dir = os.path.join(os.getcwd(), 'saved_models')
teacher4_name = 'keras_mnist_trained4_teacher.h5'

teacher4_path = os.path.join(save_dir, teacher4_name)

#making un-noised teacher model
teacher4 = Sequential()
teacher4.add(Conv2D(32, (3, 3), padding='same',
                 input_shape=x_train.shape[1:]))
teacher4.add(Activation('relu'))
teacher4.add(Conv2D(32, (3, 3)))
teacher4.add(Activation('relu'))
teacher4.add(MaxPooling2D(pool_size=(2, 2)))
teacher4.add(Dropout(0.25))

teacher4.add(Conv2D(64, (3, 3), padding='same'))
teacher4.add(Activation('relu'))
teacher4.add(Conv2D(64, (3, 3)))
teacher4.add(Activation('relu'))
teacher4.add(MaxPooling2D(pool_size=(2, 2)))
teacher4.add(Dropout(0.25))

teacher4.add(Flatten())
teacher4.add(Dense(512))
teacher4.add(Activation('relu'))
#model.add(Dropout(0.5)) #this will be uncommented for the noised student model
teacher4.add(Dense(num_classes))
teacher4.add(Activation('softmax'))

# Compile the teacher model using RMSprop
teacher4.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


# Load the mnist data, split between train and test sets:
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train=x_train.reshape(x_train.shape[0],28,28,1).astype('float32')/255
x_test=x_test.reshape(x_test.shape[0],28,28,1).astype('float32')/255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')
print(y_train.shape)

#checking that classes are balanced

#print(np.unique(y_train))
#print(y_train[0:40])

sample_size=1000
for i in range(10):
    print(str(i)+":",sum(y_train==i))

#Selecting 10 images of each class
k=0
x_small_train=np.zeros((sample_size,28,28,1))
y_small_train=np.full((sample_size,),-1)

for i in range(x_train.shape[0]):
    #print(i)
    for j in range(10):
        if sum(y_small_train==j)<sample_size/10:
            if y_train[i]==j:
                x_small_train[k,:]=x_train[i,:]
                y_small_train[k]=y_train[i]
                k+=1
                break
    #print('k=',k)
    if k==sample_size:
        break
        
#print(y_small_train[0:40])
print(x_small_train.shape)
print(y_small_train.shape)

#verifying that there are 10 images in each class
for i in range(10):
    print(str(i)+":",sum(y_small_train==i))
    
# Convert class vectors to binary class matrices.
y_small_train = keras.utils.to_categorical(y_small_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)    

#train teacher model
teacher4.fit(x_small_train, y_small_train,
            batch_size=10,
            epochs=100,
            validation_split=0.,
            shuffle=True)
teacher4.save_weights(teacher4_path)

# Score trained model.
scores = teacher4.evaluate(x_test, y_test, verbose=1)
print('Small sample of 1000 training images, Supervised learning model with '+str(100)+'epochs \n')
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])

x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples
(60000,)
0: 5923
1: 6742
2: 5958
3: 6131
4: 5842
5: 5421
6: 5918
7: 6265
8: 5851
9: 5949
(1000, 28, 28, 1)
(1000,)
0: 100
1: 100
2: 100
3: 100
4: 100
5: 100
6: 100
7: 100
8: 100
9: 100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/1

Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Small sample of 1000 training images, Supervised learning model with 100epochs 

Test loss: 0.417880268697592
Test accuracy: 0.9559000134468079


In [0]:
# Varying the ratio for 1000 labeled images. Rest of training dataset unlabeled with ratio determining size of total dataset.
# Testing on full test dataset
save_dir = os.path.join(os.getcwd(), 'saved_models')
teacher5_name = 'keras_mnist_trained5_teacher.h5'

teacher5_path = os.path.join(save_dir, teacher5_name)


# rate= n_true/n_pseudo, n_true=sample_size
rate=np.array([0.1,0.25,0.5,0.75,1,2.5,5])
#n_total=sample_size*(1+1/rate)

# total number of train images (n_total) = number of true label images (sample_size) + number of pseudo label images (n_pseudo)
#                                        = sample_size(1/rate+1)
# n_pseudo = sample_size/rate

#loop over rate values in order to find the optimal rate value for the self-learning semi supervised learning, 
#ie one that will maximize accuracy
for r in rate:
    #making un-noised teacher model
    teacher = Sequential()
    teacher.add(Conv2D(32, (3, 3), padding='same',
                 input_shape=x_train.shape[1:]))
    teacher.add(Activation('relu'))
    teacher.add(Conv2D(32, (3, 3)))
    teacher.add(Activation('relu'))
    teacher.add(MaxPooling2D(pool_size=(2, 2)))
    teacher.add(Dropout(0.25))

    teacher.add(Conv2D(64, (3, 3), padding='same'))
    teacher.add(Activation('relu'))
    teacher.add(Conv2D(64, (3, 3)))
    teacher.add(Activation('relu'))
    teacher.add(MaxPooling2D(pool_size=(2, 2)))
    teacher.add(Dropout(0.25))

    teacher.add(Flatten())
    teacher.add(Dense(512))
    teacher.add(Activation('relu'))
    #model.add(Dropout(0.5)) #this will be uncommented for the noised student model
    teacher.add(Dense(num_classes))
    teacher.add(Activation('softmax'))

    
    # Compile the teacher model using RMSprop
    teacher.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    #noised student model

    student = Sequential()
    student.add(Conv2D(32, (3, 3), padding='same',
                 input_shape=x_train.shape[1:]))
    student.add(Activation('relu'))
    student.add(Conv2D(32, (3, 3)))
    student.add(Activation('relu'))
    student.add(MaxPooling2D(pool_size=(2, 2)))
    student.add(Dropout(0.25))

    student.add(Conv2D(64, (3, 3), padding='same'))
    student.add(Activation('relu'))
    student.add(Conv2D(64, (3, 3)))
    student.add(Activation('relu'))
    student.add(MaxPooling2D(pool_size=(2, 2)))
    student.add(Dropout(0.25))

    student.add(Flatten())
    student.add(Dense(512))
    student.add(Activation('relu'))
    student.add(Dropout(0.5))
    student.add(Dense(num_classes))
    student.add(Activation('softmax'))

    

    # Compiling the model using RMSprop
    student.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    print("rate="+str(r)+":\n")
    n_pseudo=int(sample_size/r)
    n_true=sample_size
    n_total=n_true+n_pseudo
    print(n_pseudo, n_true, n_total)
    x_true=x_small_train
    y_true=y_small_train
    x_pseudo=x_train[500:500+n_pseudo,:] #x for the unlabeled data (pseudo)
    teacher.load_weights(teacher4_path) #we run it in the block above
    #evaluating teacher model on test data    
    scores=teacher.evaluate(x_test,y_test,verbose=0)
    print("Original model with labelled data only predicting on test data: ",scores[1])

    x_true_pseudo=np.concatenate([x_true,x_pseudo]) #concatenating x for labeled and unlabeled data
    print('x_true_pseudo.shape: ',x_true_pseudo.shape)
    prediction=teacher.predict_classes(x_pseudo) #predicting labels on unlabeled data
    y_pseudo=keras.utils.to_categorical(prediction, num_classes)
    y_true_pseudo=np.concatenate([y_true,y_pseudo]) #concatenating y for labeled and pseudo labeled
    print('y_true_pseudo.shape: ', y_true_pseudo.shape)
    for i in range(10): 
        # 10 loops of 10 epochs of noised student training for labeled and pseudo labeled data (step 3 in article)
        # followed by generating predictions on unlabeled data with the teacher model (=un-noised student)
        # which uses the weights of the trained noised student (noise does not change the weights structure of models) (step 2 in article)
        print(i)
        training=student.fit(x_true_pseudo,y_true_pseudo,validation_split=0.,
                             epochs=10,batch_size=int(n_total/10),verbose=0)
        # Save weights
        student.save_weights(teacher5_path)
        # Load weights for teacher model (un-noised)
        teacher.load_weights(teacher5_path)
        prediction=teacher.predict_classes(x_pseudo)
        scores=teacher.evaluate(x_test,y_test,verbose=0) #evaluating model on test data
        print('iteration: ',i)
        print('Test loss:', scores[0])
        print('Test accuracy:', scores[1])
        y_pseudo=keras.utils.to_categorical(prediction, num_classes)
        y_true_pseudo=np.concatenate([y_true,y_pseudo]) #new y_true_pseudo to be used in next loop
        


#from keras.utils import plot_model
#plot_model(teacher,to_file='teacher.png')
#plot_model(student,to_file='student.png')

x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples
(60000,)
0: 5923
1: 6742
2: 5958
3: 6131
4: 5842
5: 5421
6: 5918
7: 6265
8: 5851
9: 5949
(1000, 28, 28, 1)
(1000,)
0: 100
1: 100
2: 100
3: 100
4: 100
5: 100
6: 100
7: 100
8: 100
9: 100
rate=0.1:

10000 1000 11000
Original model with labelled data only predicting on test data:  0.9559000134468079
x_true_pseudo.shape:  (11000, 28, 28, 1)
y_true_pseudo.shape:  (11000, 10)
0
iteration:  0
Test loss: 0.3740111562013626
Test accuracy: 0.8914999961853027
1
iteration:  1
Test loss: 0.3431748177520931
Test accuracy: 0.9049000144004822
2
iteration:  2
Test loss: 0.30878793215807526
Test accuracy: 0.9174000024795532
3
iteration:  3
Test loss: 0.3005758320230059
Test accuracy: 0.9239000082015991
4
iteration:  4
Test loss: 0.276111581508792
Test accuracy: 0.9297999739646912
5
iteration:  5
Test loss: 0.28249621306058254
Test accuracy: 0.9332000017166138
6
iteration:  6
Test loss: 0.27533213199888124
Test accuracy: 0.9344000220

For 1000 labeled images, the improvement is not obvious, probably due to the high accuracy of the model

In [0]:
pip install pydot

Collecting pydotNote: you may need to restart the kernel to use updated packages.
  Downloading pydot-1.4.1-py2.py3-none-any.whl (19 kB)
Installing collected packages: pydot
Successfully installed pydot-1.4.1



In [0]:
pip install graphviz

Collecting graphviz
  Downloading graphviz-0.13.2-py2.py3-none-any.whl (17 kB)
Installing collected packages: graphviz
Successfully installed graphviz-0.13.2
Note: you may need to restart the kernel to use updated packages.
