# Testing AlexNet with convolution approach

This is the same notebook as `Generalization_CIFAR10_AlexNet`, except that the following regularization scheme is implemented:

1. Each real data point is replaced by $K$ Gaussian samples around that data point.
2. Of the $K$ samples, some proportion $p_a$ of them are labeled incorrectly (hence "adversarial" regularization).
3. New samples are generated each epoch

This means the following functions (which include all the ones that depend on $K$ during graph building) have been changed:

- `loss`
- `acc`
- `train`
- `append_noisy_samples`
- `graph_builder_wrapper`


In [1]:
from __future__ import print_function
import tensorflow as tf
import numpy as np
import time,os,pickle
import matplotlib.pyplot as plt
from chiyuan_cifar10_jz import chiyuan,cifar10_parameters 
from sklearn.utils import shuffle
np.set_printoptions(precision=2,suppress=True)
%matplotlib inline

In [2]:
# The small Alexnet is constructed by two (convolution 5x5
# → max-pool 3x3 → local-response-normalization) modules followed by two fully connected layers
# with 384 and 192 hidden units, respectively. Finally a 10-way linear layer is used for prediction

def _conv_pool_norm(inputs, index, in_channels, out_channels):
    # One "conv 5x5 -> ReLU -> max-pool 3x3 (stride 2) -> LRN" module.
    # Variables live in scope 'conv<index>' as 'weights' and 'biases'.
    with tf.variable_scope('conv%d' % index) as scope:
        kernel = tf.get_variable('weights', [5, 5, in_channels, out_channels],
                                 initializer=tf.truncated_normal_initializer(stddev=1e-2))
        conv = tf.nn.conv2d(inputs, kernel, [1, 1, 1, 1], padding='SAME')
        biases = tf.get_variable('biases', [out_channels],
                                 initializer=tf.truncated_normal_initializer(stddev=1e-2))
        activated = tf.nn.relu(tf.nn.bias_add(conv, biases), name=scope.name)

    pooled = tf.nn.max_pool(activated, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                            padding='VALID', name='pool%d' % index)
    return tf.nn.lrn(pooled, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                     name='norm%d' % index)


def _dense_relu(inputs, scope_name, n_in, n_out):
    # Fully connected layer followed by ReLU; variables live in scope `scope_name`.
    with tf.variable_scope(scope_name) as scope:
        weights = tf.get_variable('weights', [n_in, n_out],
                                  initializer=tf.truncated_normal_initializer(stddev=4e-2))
        biases = tf.get_variable('biases', [n_out],
                                 initializer=tf.truncated_normal_initializer(stddev=4e-2))
        return tf.nn.relu(tf.matmul(inputs, weights) + biases, name=scope.name)


def alexnet_arch(NUM_CLASSES):
    """Build the small AlexNet graph and return its placeholders and output.

    Architecture: two (conv 5x5 -> max-pool 3x3 -> LRN) modules with 64
    channels each, two fully connected ReLU layers (384 and 192 units), and a
    final linear layer with NUM_CLASSES outputs.

    Args:
        NUM_CLASSES: width of the final linear layer and of the label
            placeholder.

    Returns:
        (input_data, input_labels, linear): the float32 image placeholder of
        shape [None, 28, 28, 3], the float32 label placeholder of shape
        [None, NUM_CLASSES], and the un-normalized output of the linear layer.
    """
    input_data = tf.placeholder(tf.float32, shape=[None, 28, 28, 3])
    # The label width follows NUM_CLASSES so it always matches the output layer
    # (it used to be hard-coded to 10).
    input_labels = tf.placeholder(tf.float32, shape=[None, NUM_CLASSES])

    # conv1 / pool1 / norm1
    norm1 = _conv_pool_norm(input_data, 1, 3, 64)

    # conv2 / pool2 / norm2
    norm2 = _conv_pool_norm(norm1, 2, 64, 64)

    # local3
    # Move everything into depth so we can perform a single matrix multiply.
    # The flattened size is fixed to a 6x6x64 feature map (28 -> 13 -> 6 after
    # the two stride-2 VALID pools on 28x28 inputs).
    reshape = tf.reshape(norm2, [-1, 6*6*64])
    dim = reshape.get_shape()[1].value
    local3 = _dense_relu(reshape, 'local3', dim, 384)

    # local4
    local4 = _dense_relu(local3, 'local4', 384, 192)

    # linear layer(WX + b)
    with tf.variable_scope('linear') as scope:
        weights = tf.get_variable('weights', [192, NUM_CLASSES],
                                  initializer=tf.truncated_normal_initializer(stddev=1/192.0))
        biases = tf.get_variable('biases', [NUM_CLASSES],
                                 initializer=tf.truncated_normal_initializer(stddev=1/192.0))
        linear = tf.add(tf.matmul(local4, weights), biases, name=scope.name)

    return input_data, input_labels, linear

def loss(g, Y):
    """Mean over rows of the squared Euclidean distance between g and Y."""
    squared_error = tf.pow(g - Y, 2)
    per_sample = tf.reduce_sum(squared_error, 1)
    return tf.reduce_mean(per_sample)

# Accuracy
def acc(g, Y):
    """Fraction of rows whose argmax in g equals the argmax in Y."""
    target_class = tf.argmax(Y, 1)
    predicted_class = tf.argmax(g, 1)
    hits = tf.cast(tf.equal(target_class, predicted_class), tf.float32)
    return tf.reduce_mean(hits)

In [None]:
# Use some image preprocessing stuff from chiyuan
#   - images are cropped to be 28x28x3 from 32x32x3
#   - images are also whitened 

def cifar_one_hot(i, num_classes=10):
    """Return a one-hot float vector of length `num_classes` with a 1 at index `i`.

    `num_classes` defaults to 10 (CIFAR-10), so the function can still be used
    directly as `map(cifar_one_hot, labels)`.
    """
    v = np.zeros(num_classes)
    v[i] = 1
    return v
    
def get_cifar10_dataset(p_corrupt_label,n_samps=50000,rand_seed=None):
    """Load CIFAR-10 through the `chiyuan` helper and one-hot encode the labels.

    Args:
        p_corrupt_label: label-corruption percentage; truncated with int() and
            placed in the 'RndL:trtt:<p>' part of the dataset spec.
        n_samps: size of the random training subset ('SubS:tr:<n>').
        rand_seed: stored on the parameter object as `rand_seed`.

    Returns:
        (X0, Y0, X1, Y1): inputs and one-hot labels taken from `datasets[0]`
        and `datasets[1]` as returned by `prepare_inputs()`. The caller treats
        these as train and test respectively -- presumably that is the order
        `chiyuan` produces; confirm against its source.
    """
    #  'SubS:tr:50000' means draw a random subset of 50000 samples for training (no replace)
    #  'RndL:trtt:100' means corrupt the train and test set labels with 100% probability
    class params(cifar10_parameters):
        def __init__(self,p,rand_seed,n_samp=50000):
            self.dataset = 'cifar10|SubS:tr:%s|RndL:trtt:%s'%(int(n_samp),int(p))
            self.rand_seed = rand_seed
            # self.per_image_whitening = False

    def one_hot_rows(labels):
        # A list comprehension (rather than np.array(map(...))) produces a
        # 2-D array on both Python 2 and Python 3.
        return np.array([cifar_one_hot(label) for label in labels])

    p = params(p_corrupt_label,rand_seed,n_samp=n_samps)
    c = chiyuan(p)
    _, datasets = c.prepare_inputs()
    return datasets[0][0],one_hot_rows(datasets[0][1]), \
           datasets[1][0],one_hot_rows(datasets[1][1])

def graph_builder_wrapper(num_classes,sd,save_dir,lr_initial=0.01):
    """Assemble the training graph and return its handles in a dict.

    Builds the AlexNet model, the loss and accuracy tensors, a momentum (0.9)
    optimizer step driven by a `learning_rate` variable, merged summaries, and
    summary writers under `save_dir + '/train'` and `save_dir + '/validation'`.

    `sd` is accepted but not used while building the graph.

    Returns:
        dict with keys input_data, input_labels, total_loss, total_acc, g_out,
        opt_step, learning_rate, merged, train_writer, valid_writer, saver.
    """
    images_ph, labels_ph, network_out = alexnet_arch(num_classes)
    # NOTE(review): the Saver is constructed before the learning-rate variable
    # and the optimizer exist, so it presumably tracks only the network
    # variables -- confirm against tf.train.Saver's default var_list.
    checkpoint_saver = tf.train.Saver(max_to_keep=20)

    # Loss and optimizer
    objective = loss(network_out, labels_ph)
    lr_variable = tf.Variable(lr_initial, name='learning_rate')
    optimizer = tf.train.MomentumOptimizer(lr_variable,0.9)
    update_op = optimizer.minimize(objective)
    tf.summary.scalar('loss', objective)

    # Accuracy
    accuracy = acc(network_out, labels_ph)
    tf.summary.scalar('accuracy', accuracy)

    # One histogram summary per trainable variable.
    for trainable in tf.trainable_variables():
        tf.summary.histogram(trainable.op.name, trainable)

    # Merge every summary and set up the writers below save_dir
    all_summaries = tf.summary.merge_all()
    writer_train = tf.summary.FileWriter(save_dir + '/train')
    writer_valid = tf.summary.FileWriter(save_dir + '/validation')

    return {
        'input_data': images_ph,
        'input_labels': labels_ph,
        'total_loss': objective,
        'total_acc': accuracy,
        'g_out': network_out,
        'opt_step': update_op,
        'learning_rate': lr_variable,
        'merged': all_summaries,
        'train_writer': writer_train,
        'valid_writer': writer_valid,
        'saver': checkpoint_saver,
    }

# Augment dataset with perturbed samples and labels
def append_noisy_samples(X,Y,n_classes,K=10,pa=0.0,stddev=0.1):
    """Replace every sample by K Gaussian-perturbed copies with partly wrong labels.

    Args:
        X: array of samples, first axis indexes samples.
        Y: one-hot label matrix with one row per sample in X.
        n_classes: kept for interface compatibility. The number of classes is
            taken from the one-hot width of Y, because the returned labels
            must have the same width as Y.
        K: number of noisy copies generated per sample.
        pa: proportion of the K copies that get an incorrect label;
            int(pa*K) copies per sample are relabeled.
        stddev: standard deviation of the zero-mean Gaussian perturbation.

    Returns:
        (X_, Y_): X_ has len(X)*K rows, where rows [i*K, (i+1)*K) are noisy
        copies of X[i]; Y_ holds the matching one-hot labels.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    width = Y.shape[1]
    n_flip = int(pa*K)

    perturbations = np.random.normal(0,stddev,[len(X)*K]+list(X.shape[1:]))
    X_ = np.repeat(X,K,axis=0)+perturbations

    def adversarial_label_expand(y):
        true_class = np.argmax(y)
        labels = np.repeat(true_class,K)
        if n_flip > 0:
            # Choose which copies to corrupt, then draw a wrong class
            # uniformly from every class except the true one.
            inds = np.random.choice(K,n_flip,replace=False)
            p = np.ones(width)
            p[true_class] = 0
            p /= width-1.
            labels[inds] = np.random.choice(width,n_flip,p=p)
        return np.eye(width)[labels]

    if len(Y) == 0:
        return X_,np.zeros((0,width))

    # A list (not a bare map) keeps np.vstack working on Python 2 and 3.
    Y_ = np.vstack([adversarial_label_expand(y) for y in Y])
    return X_,Y_

def train(Xtr,Ytr,Xva,Yva,graph,sd,K,pa,save_dir,num_epochs=100,batch_size=100,save_every=1,verbose=True):
    """Train the graph on freshly generated noisy samples every epoch.

    Each mini-batch of real data is expanded with `append_noisy_samples`
    (K copies per sample, noise std `sd`, mislabel proportion `pa`) before it
    is fed to the optimizer. Only full batches are used; a trailing partial
    batch is dropped.

    Args:
        Xtr, Ytr: training inputs and one-hot labels.
        Xva, Yva: validation inputs and one-hot labels, evaluated in a single
            feed without added noise.
        graph: dict returned by `graph_builder_wrapper`.
        sd, K, pa: noise parameters forwarded to `append_noisy_samples`.
        save_dir: directory prefix; checkpoints go to save_dir+'checkpoints/'.
        num_epochs: number of passes over the training data.
        batch_size: number of real samples per step (before K-fold expansion).
        save_every: validate, write summaries and checkpoint every this many
            epochs.
        verbose: print per-batch and per-epoch progress.

    Returns:
        (training_losses, training_accs, validation_losses, validation_accs).
        The training lists have one per-epoch batch average each; the
        validation lists have one entry per epoch where epoch % save_every == 0.

    Raises:
        ValueError: if there are fewer than `batch_size` training samples, so
            no full batch can be formed.
    """
    training_losses,training_accs = [],[]
    validation_losses,validation_accs = [],[]

    # Ytr is one-hot, so the class count is its width. len(np.unique(Ytr))
    # would count the distinct entries of the matrix (0 and 1) instead.
    num_classes = np.shape(Ytr)[1]

    num_batches = len(Xtr)//batch_size
    if num_batches == 0:
        raise ValueError('Need at least batch_size=%s training samples, got %s'
                         %(batch_size,len(Xtr)))

    checkpoint_dir = save_dir+'checkpoints/'

    # Build the learning-rate assignment once; calling .assign(lr) inside the
    # epoch loop would add a new op to the graph every epoch.
    lr_value = tf.placeholder(tf.float32, shape=[])
    assign_lr = graph['learning_rate'].assign(lr_value)

    with tf.Session() as sess:

        sess.run(tf.global_variables_initializer())

        for epoch in range(num_epochs):

            # Exponential decay from a fixed 0.01; the exponent uses the epoch
            # index. This overrides the initial value of the graph's
            # learning_rate variable.
            lr = 0.01*0.95**(epoch/390.)
            sess.run(assign_lr, feed_dict={lr_value: lr})

            t = time.time()
            training_loss = 0
            training_acc = 0
            steps = 0.
            Xtr_, Ytr_ = shuffle(Xtr,Ytr)

            for i in range(0,num_batches*batch_size,batch_size):

                # Generate noisy samples for convolution estimation..
                x,y = append_noisy_samples(Xtr_[i:i+batch_size],
                                           Ytr_[i:i+batch_size],
                                           num_classes, K=K, pa=pa, stddev=sd)

                feed_dict = {graph['input_data']: x, graph['input_labels']: y}

                summary,training_loss_,training_acc_,_ = sess.run([graph['merged'],graph['total_loss'],
                                                                   graph['total_acc'],graph['opt_step']],
                                                                  feed_dict=feed_dict)
                training_loss += training_loss_
                training_acc += training_acc_
                steps += 1.

                if verbose:
                    print('\rTraining batch %s/%s (%.3f s): loss %.3f, acc %.3f' \
                          %(steps,num_batches,time.time()-t,training_loss_,training_acc_),end='')

            if epoch%save_every == 0:
                # `summary` is the one from the last training batch of this epoch.
                graph['train_writer'].add_summary(summary, epoch)

                # Get results on test set
                feed_dict = {graph['input_data']: Xva, graph['input_labels']: Yva}
                summary,validation_loss,validation_acc = sess.run([graph['merged'],graph['total_loss'],
                                                                   graph['total_acc']],
                                                                  feed_dict=feed_dict)
                graph['valid_writer'].add_summary(summary, epoch)
                validation_losses.append(validation_loss)
                validation_accs.append(validation_acc)

                # makedirs also creates save_dir itself if it is missing.
                if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir)
                graph['saver'].save(sess,checkpoint_dir+'epoch'+str(epoch))

            training_losses.append(training_loss/steps)
            training_accs.append(training_acc/steps)

            if verbose:
                print('\rEpoch: %s/%s done (Learning Rate: %.6f  Training Loss: %.3f)' \
                      %(epoch+1,num_epochs,lr,training_losses[-1]))

    return training_losses,training_accs,validation_losses,validation_accs


# Use trained model to predict
def predict(X,Y,graph,save_dir,batch_size=100,verbose=False,return_all=False):
    """Evaluate the latest checkpoint in save_dir+'checkpoints/' on (X, Y).

    Args:
        X, Y: inputs and one-hot labels, processed in slices of `batch_size`.
        graph: dict returned by `graph_builder_wrapper`.
        save_dir: directory prefix that contains 'checkpoints/'.
        batch_size: number of samples per forward pass.
        verbose: print progress and the final accuracy.
        return_all: also return the loss and the raw network outputs.

    Returns:
        overall_acc, or (overall_acc, overall_loss, embedding) when
        `return_all` is true. `embedding` holds the network output for every
        row of X. Loss and accuracy are sample-weighted averages, so a shorter
        final batch does not skew them.
    """
    checkpoint_dir = save_dir+'checkpoints/'

    # Load from checkpoint corresponding to latest epoch
    max_epoch = max(int(f.split('epoch')[1].split('.')[0])
                    for f in os.listdir(checkpoint_dir) if 'epoch' in f)

    # Output width comes from the graph itself rather than a notebook global.
    out_dim = graph['g_out'].get_shape().as_list()[-1]

    t = time.time()
    with tf.Session() as sess:
        graph['saver'].restore(sess,checkpoint_dir+'epoch%s'%(max_epoch))

        embedding = np.zeros((len(X),out_dim))
        weighted_loss = 0.
        weighted_acc = 0.
        for i in range(0,len(X),batch_size):
            x_batch = X[i:i+batch_size]
            y_batch = Y[i:i+batch_size]
            g_,loss_,acc_ = sess.run([graph['g_out'],graph['total_loss'],graph['total_acc']],
                                      feed_dict = {graph['input_data']:x_batch,
                                                   graph['input_labels']:y_batch})
            embedding[i:i+batch_size,:] = g_
            # Per-batch means are weighted by batch length so that the totals
            # are exact means over all samples.
            weighted_loss += loss_*len(x_batch)
            weighted_acc += acc_*len(x_batch)
            if verbose: print('\r%s/%s samples processed.. (%.3f s elapsed)'%(i+1,len(X),time.time()-t),end='')

        overall_loss = weighted_loss/float(len(X))
        overall_acc = weighted_acc/float(len(X))

    if verbose: print('\nOverall accuracy (argmax of embeddings as labels): %.3f'%(overall_acc))
    if return_all: return overall_acc,overall_loss,embedding
    return overall_acc

In [None]:
# Sweep over label corruption, noise standard deviation and adversarial
# label proportion; train (or re-evaluate a finished run) for each setting.
num_classes = 10
num_epochs = 100
batch_size = 100
K = 20
n_samps = 5000

list_p_corrupt_label = [0,100]
list_sd = [0.1,0.5,1,2]
list_pa = [0.0,0.2,0.5]

# Result grids indexed as [p_corrupt_label, sd, pa].
train_accs = np.zeros((len(list_p_corrupt_label),len(list_sd),len(list_pa)))
test_accs = np.zeros((len(list_p_corrupt_label),len(list_sd),len(list_pa)))

start = time.time()
for i,p_corrupt_label in enumerate(list_p_corrupt_label):

    # The dataset only depends on the corruption level, so load it once here.
    Xtr,Ytr,Xtt,Ytt = get_cifar10_dataset(p_corrupt_label,n_samps=n_samps)

    for j,sd in enumerate(list_sd):
        for k,pa in enumerate(list_pa):

            save_dir = './temp/alexnet_K%s_sd%s_p%s_pa%s/'%(K,sd,p_corrupt_label,pa)

            tf.reset_default_graph()

            graph = graph_builder_wrapper(num_classes,sd,save_dir,lr_initial=0.01)

            # Check for the checkpoint directory directly; os.listdir(save_dir)
            # would raise if save_dir had not been created yet.
            # NOTE: an interrupted run also leaves a 'checkpoints' directory
            # and is then treated as finished.
            if not os.path.isdir(os.path.join(save_dir,'checkpoints')):
                tr_losses,tr_accs,va_losses,va_accs = train(Xtr,Ytr,Xtt,Ytt,graph,sd,K,pa,save_dir,
                                                            num_epochs=num_epochs,batch_size=batch_size)
                # Last-epoch training accuracy is measured on the noisy,
                # partly mislabeled samples ...
                train_accs[i,j,k] = tr_accs[-1]
                test_accs[i,j,k] = va_accs[-1]
            else:
                # ... whereas a re-evaluated run is scored on the clean data.
                train_accs[i,j,k] = predict(Xtr,Ytr,graph,save_dir,batch_size=batch_size)
                test_accs[i,j,k] = predict(Xtt,Ytt,graph,save_dir,batch_size=batch_size)

            print('-'*80)
            print('Finished with K = %s, p_corrupt_label = %.3f, sd = %.2f, pa = %.2f (%.3f s elapsed)' \
                  %(K,p_corrupt_label,sd,pa,time.time()-start))
            print('  Train acc: %.2f, Test acc: %.2f'%(train_accs[i,j,k],test_accs[i,j,k]))
            print('-'*80)

Epoch: 1/100 done (Learning Rate: 0.010000  Training Loss: 0.913)
Epoch: 2/100 done (Learning Rate: 0.009999  Training Loss: 0.889)
Epoch: 3/100 done (Learning Rate: 0.009997  Training Loss: 0.846)
Training batch 41.0/50 (16.152 s): loss 0.808, acc 0.373

In [None]:
# plt.figure(figsize=(16,4))
# plt.subplot(1,2,1)
# plt.plot(tr_losses)
# plt.plot(va_losses)
# plt.grid()
# plt.title('Losses')
# plt.subplot(1,2,2)
# plt.plot(tr_accs,label='Train')
# plt.plot(va_accs,label='Valid')
# plt.grid()
# plt.title('Accuracies')
# plt.legend()

### Misc

In [None]:
# # For Debugging
# tf.reset_default_graph()

# x = tf.constant([[10.,10.],[10.,10.]])
# y = tf.constant([[10.2,10.1],[10.2,10.1]])
# l = tf.reduce_sum(tf.pow(x-y,2),1)
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     print(sess.run(l))

In [None]:
# # Testing to make sure the loss function is written correctly..
# K = 40
# n = 10
# d = 2
# Z = np.random.choice(range(10),(n,d))
# Zp = append_noisy_samples(Z,K=K,stddev=0.01)
# Z_ = np.mean(Zp.T.reshape(-1,K),1).reshape(d,-1).T
# print(Z)
# print(Z_)

# z = tf.placeholder(tf.float32, shape=[n*K,d])
# z_ = tf.transpose(tf.reshape(tf.reduce_mean(tf.reshape(tf.transpose(z),[-1,K]),1),[d,-1]))

# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     test = sess.run(z_, feed_dict = {z: Zp})
    
# print(test)