Load Modules

In [5]:
import random 
import numpy as np
import os
import pickle

LOAD DATA

In [6]:
# Load Train Data
# Reads the five CIFAR-10 training batches (data_batch_1 .. data_batch_5) and
# concatenates them into x_train / y_train.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load batch files that come from a trusted source.
x = []
y = []
filename = 'cifar-10-batches-py/data_batch_%d'
for b in range(1, 6):
    batch_path = filename % (b, )
    # Pickle data is binary: open with 'rb' so the bytes are never subject to
    # text-mode newline translation (text mode corrupts the stream on Windows).
    with open(batch_path, 'rb') as f:
        datadict = pickle.load(f)
    X = datadict['data']
    Y = datadict['labels']
    # Reshape every row to (3, 32, 32), then move that leading axis of size 3
    # to the end, giving an array of shape (N, 32, 32, 3) stored as floats.
    # -1 lets NumPy infer the number of rows instead of hard-coding 10000.
    X = X.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
    Y = np.array(Y)
    x.append(X)
    y.append(Y)

x_train = np.concatenate(x)
y_train = np.concatenate(y)

# Load Test Data (same layout and conversion as the training batches)
test_path = 'cifar-10-batches-py/test_batch'
with open(test_path, 'rb') as f:
    datadict = pickle.load(f)
x_test = datadict['data']
y_test = datadict['labels']
x_test = x_test.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
y_test = np.array(y_test)

Get RAW Data

In [8]:
# Sizes of the three splits carved out of the loaded arrays.
num_training = 49000
num_val = 1000
num_test = 10000

# Validation split: the num_val samples that follow the training samples.
# Indexing with an integer index array yields a copy, not a view.
mask = np.arange(num_training, num_training + num_val)
x_val_raw, y_val_raw = x_train[mask], y_train[mask]

# Training split: the first num_training samples.
mask = np.arange(num_training)
x_train_raw, y_train_raw = x_train[mask], y_train[mask]

# Test split: the first num_test samples of the test batch.
mask = np.arange(num_test)
x_test_raw, y_test_raw = x_test[mask], y_test[mask]

Preprocess raw data

In [9]:
# Preprocessing: reshape the image data into rows.
# np.reshape returns a *view* of the contiguous *_raw arrays, so these
# flattened arrays share memory with x_train_raw / x_val_raw / x_test_raw.
x_train_rows = np.reshape(x_train_raw, (x_train_raw.shape[0], -1)) # [49000, 3072]
x_val_rows = np.reshape(x_val_raw, (x_val_raw.shape[0], -1)) # [1000, 3072]
x_test_rows = np.reshape(x_test_raw, (x_test_raw.shape[0], -1)) # [10000, 3072]

# Normalize the data: subtract the mean image (computed on the training split
# only, then applied to all three splits).
# The subtraction is deliberately NOT in place: `rows -= mean_image` would
# write through the views above and silently mean-centre the "raw" arrays,
# so they would no longer hold the unprocessed pixels after this cell runs.
mean_image = np.mean(x_train_rows, axis = 0)
x_train = x_train_rows - mean_image
x_val = x_val_rows - mean_image
x_test = x_test_rows - mean_image

# Add bias dimension (a trailing column of ones) and transform into columns:
# after the transpose each sample is one column, i.e. shape [3073, num_samples].
x_train = np.hstack([x_train, np.ones((x_train.shape[0], 1))]).T
x_val = np.hstack([x_val, np.ones((x_val.shape[0], 1))]).T
x_test = np.hstack([x_test, np.ones((x_test.shape[0], 1))]).T

# Labels need no preprocessing; expose them under the names used for training.
y_train = y_train_raw
y_val = y_val_raw
y_test = y_test_raw

Training Softmax

In [14]:
from linear_classifier import Softmax

# Settings for the baseline SGD run, gathered in one place and forwarded to
# Softmax.train as keyword arguments.
sgd_options = dict(method='sgd', batch_size=200, learning_rate=1e-6,
                   reg=1e5, num_iters=1000, verbose=True, vectorized=True)

softmax_sgd = Softmax()
losses_sgd = softmax_sgd.train(x_train, y_train, **sgd_options)

# Element 0 of the predict() result is what gets compared with the labels;
# accuracy is the fraction of positions where the two agree.
y_train_pred_sgd = softmax_sgd.predict(x_train)[0]
train_accuracy_sgd = np.mean(y_train == y_train_pred_sgd)
print('Training accuracy: %f' % train_accuracy_sgd)

y_val_pred_sgd = softmax_sgd.predict(x_val)[0]
val_accuracy_sgd = np.mean(y_val == y_val_pred_sgd)
print('Validation accuracy: %f' % val_accuracy_sgd)

iteration 0/1000: loss 1553.288803
iteration 100/1000: loss 2.129240
iteration 200/1000: loss 2.174545
iteration 300/1000: loss 2.176543
iteration 400/1000: loss 2.130676
iteration 500/1000: loss 2.074817
iteration 600/1000: loss 2.153908
iteration 700/1000: loss 2.188529
iteration 800/1000: loss 2.166894
iteration 900/1000: loss 2.202306
Training accuracy: 0.296388
Validation accuracy: 0.306000


In [19]:
# Use the validation set to tune hyperparameters, i.e., learning rate and
# regularization strength. Each list holds the two endpoints of the range
# that is searched.
learning_rates = [1e-5, 1e-8]
regularization_strengths = [1e3, 1e5]

# Result is a dictionary mapping tuples of the form (learning_rate, regularization_strength)
# to tuples of the form (training_accuracy, validation_accuracy). The accuracy is simply the fraction
# of data points that are correctly classified.
results = {}
best_val = -1
best_softmax = None

# Number of grid points per hyperparameter (interval x interval models are trained).
interval = 5

# Both grids are loop-invariant, so build them once up front.
# NOTE(review): np.linspace spaces the points linearly between the endpoints;
# for ranges spanning several orders of magnitude a logarithmic grid may
# cover the range more evenly -- confirm linear spacing is intended.
learning_rate_grid = np.linspace(learning_rates[0], learning_rates[1], num=interval)
reg_grid = np.linspace(regularization_strengths[0], regularization_strengths[1], num=interval)

# Choose the best hyperparameters by tuning on the validation set
for i, learning_rate in enumerate(learning_rate_grid, 1):
    print('The current iteration is %d/%d' % (i, interval))
    for reg in reg_grid:
        softmax = Softmax()
        softmax.train(x_train, y_train, method='sgd', batch_size=200, learning_rate=learning_rate,
              reg = reg, num_iters=700, verbose=False, vectorized=True)
        y_train_pred = softmax.predict(x_train)[0]
        y_val_pred = softmax.predict(x_val)[0]
        train_accuracy = np.mean(y_train == y_train_pred)
        val_accuracy = np.mean(y_val == y_val_pred)
        results[(learning_rate, reg)] = (train_accuracy, val_accuracy)
        # Keep the model with the highest validation accuracy seen so far
        # (strict '>' means the earliest model wins a tie).
        if val_accuracy > best_val:
            best_val = val_accuracy
            best_softmax = softmax

# Print out the results, ordered by (learning_rate, reg)
for learning_rate, reg in sorted(results):
    train_accuracy, val_accuracy = results[(learning_rate, reg)]
    print('learning rate %e and regularization %e, \n '
          '    the training accuracy is: %f and validation accuracy is: %f.\n'
          % (learning_rate, reg, train_accuracy, val_accuracy))

The current iteration is 1/5
The current iteration is 2/5
The current iteration is 3/5
The current iteration is 4/5
The current iteration is 5/5
learning rate 1.000000e-08 and regularization 1.000000e+03, 
     the training accuracy is: 0.125122 and validation accuracy is: 0.117000.

learning rate 1.000000e-08 and regularization 2.575000e+04, 
     the training accuracy is: 0.147918 and validation accuracy is: 0.156000.

learning rate 1.000000e-08 and regularization 5.050000e+04, 
     the training accuracy is: 0.142571 and validation accuracy is: 0.149000.

learning rate 1.000000e-08 and regularization 7.525000e+04, 
     the training accuracy is: 0.147837 and validation accuracy is: 0.150000.

learning rate 1.000000e-08 and regularization 1.000000e+05, 
     the training accuracy is: 0.149082 and validation accuracy is: 0.154000.

learning rate 2.507500e-06 and regularization 1.000000e+03, 
     the training accuracy is: 0.394918 and validation accuracy is: 0.398000.

learning rate 2

Test the best softmax classifier on the test dataset

In [18]:
# Score the model chosen on the validation set against the held-out test split.
y_test_predict_result = best_softmax.predict(x_test)
# Element 0 of the predict() result is what gets compared with the labels.
y_test_predict = y_test_predict_result[0]
is_correct = (y_test == y_test_predict)
test_accuracy = np.mean(is_correct)
print('The test accuracy is: %f' % test_accuracy)

The test accuracy is: 0.392300
