In [1]:
import random
import numpy as np

In [2]:
class CrossEntropyCost(object):
    """Cross-entropy cost, exposed as a pair of static helpers."""

    @staticmethod
    def fn(a, y):
        """Return the summed cross-entropy between output ``a`` and target ``y``.

        NaN terms (e.g. ``0 * log(0)`` when ``a`` and ``y`` are both exactly
        1.0 in the same slot) are replaced through ``np.nan_to_num`` before
        summing.
        """
        target_on_term = -y * np.log(a)
        target_off_term = (1 - y) * np.log(1 - a)
        per_element = np.nan_to_num(target_on_term - target_off_term)
        return np.sum(per_element)

    @staticmethod
    def delta(z, a, y):
        """Return the output-layer error ``a - y``.

        ``z`` is accepted so the signature matches the other cost classes;
        it is not used here.
        """
        output_error = a - y
        return output_error

In [3]:
class QuadraticCost(object):
    """Quadratic (squared-error) cost, exposed as a pair of static helpers."""

    @staticmethod
    def fn(a, y):
        """Return half the squared Euclidean norm of ``a - y``."""
        residual_norm = np.linalg.norm(a - y)
        return 0.5 * residual_norm ** 2

    @staticmethod
    def delta(z, a, y):
        """Return the output-layer error ``(a - y) * sigmoid_prime(z)``."""
        output_error = a - y
        return output_error * sigmoid_prime(z)

In [4]:
class Network(object):
    """Fully connected feedforward network of sigmoid layers, trained with
    mini-batch stochastic gradient descent and backpropagation.

    ``sizes`` lists the number of neurons in each layer, input layer first.
    One bias column vector and one weight matrix are kept for every layer
    after the input layer.
    """

    def __init__(self, sizes, cost=CrossEntropyCost):
        """Build a network with layer sizes ``sizes``.

        ``cost`` is a class providing static ``fn(a, y)`` and
        ``delta(z, a, y)``. Weights and biases are set up by
        ``default_weight_initializer``.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.default_weight_initializer()
        self.cost = cost

    def default_weight_initializer(self):
        """Draw biases from ``np.random.randn`` and weights from
        ``np.random.randn`` divided by the square root of the number of
        inputs feeding the layer."""
        self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
        self.weights = [np.random.randn(y, x)/np.sqrt(x)
                        for x, y in zip(self.sizes[:-1], self.sizes[1:])]

    def large_weight_initializer(self):
        """Draw biases and weights from ``np.random.randn`` without the
        ``1/sqrt(inputs)`` scaling of the default initializer."""
        self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(self.sizes[:-1], self.sizes[1:])]

    def feedforward(self, a):
        """Return the network output for input column vector ``a``."""
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a)+b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta,
            lmbda=0.0,
            evaluation_data=None,
            monitor_evaluation_cost=False,
            monitor_evaluation_accuracy=False,
            monitor_training_cost=False,
            monitor_training_accuracy=False):
        """Train the network with mini-batch stochastic gradient descent.

        ``training_data`` is a sequence of ``(x, y)`` pairs; training
        accuracy is computed with ``convert=True``, i.e. ``y`` is compared
        through ``np.argmax``. ``evaluation_data`` is scored the other way
        round: accuracy compares ``y`` directly and cost passes ``y``
        through ``vectorized_result``.

        ``eta`` is the learning rate and ``lmbda`` the regularization
        parameter. Progress is printed after every epoch according to the
        ``monitor_*`` flags.

        Returns the tuple ``(evaluation_cost, evaluation_accuracy,
        training_cost, training_accuracy)`` of per-epoch lists; a list stays
        empty when its flag is off.

        Raises ``ValueError`` when the training data is empty or when an
        evaluation monitor is requested without evaluation data.
        """
        # Work on private copies: shuffling must not reorder the caller's
        # list, and this also accepts any iterable of pairs.
        training_data = list(training_data)
        if evaluation_data is not None:
            evaluation_data = list(evaluation_data)
        if not training_data:
            raise ValueError("training_data must not be empty")
        if (monitor_evaluation_cost or monitor_evaluation_accuracy) \
                and not evaluation_data:
            # Fail before training rather than after the first epoch.
            raise ValueError(
                "evaluation_data is required when an evaluation monitor "
                "is enabled")
        n = len(training_data)
        n_data = len(evaluation_data) if evaluation_data else 0
        evaluation_cost, evaluation_accuracy = [], []
        training_cost, training_accuracy = [], []
        # Single-argument print(...) behaves the same on Python 2 and 3.
        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta, lmbda, n)
            print("Epoch %s training complete" % j)
            if monitor_training_cost:
                cost = self.total_cost(training_data, lmbda)
                training_cost.append(cost)
                print("Cost on training data: {}".format(cost))
            if monitor_training_accuracy:
                accuracy = self.accuracy(training_data, convert=True)
                training_accuracy.append(accuracy)
                print("Accuracy on training data: {} / {} = {} %".format(
                    accuracy, n, float(accuracy)*100/n))
            if monitor_evaluation_cost:
                cost = self.total_cost(evaluation_data, lmbda, convert=True)
                evaluation_cost.append(cost)
                print("Cost on evaluation data: {}".format(cost))
            if monitor_evaluation_accuracy:
                accuracy = self.accuracy(evaluation_data)
                evaluation_accuracy.append(accuracy)
                # Reuse the value just computed instead of a second pass.
                print("Accuracy on evaluation data: {} / {}".format(
                    accuracy, n_data))
            print("")
        # The summary only exists when training accuracy was monitored.
        if training_accuracy:
            print("Max accuracy on training data : {} ".format(
                float(max(training_accuracy))*100/n))
        return (evaluation_cost, evaluation_accuracy,
                training_cost, training_accuracy)

    def update_mini_batch(self, mini_batch, eta, lmbda, n):
        """Apply one gradient-descent step computed from ``mini_batch``.

        ``mini_batch`` is a list of ``(x, y)`` pairs, ``eta`` the learning
        rate, ``lmbda`` the regularization parameter and ``n`` the size of
        the full training set. Weights are scaled by
        ``1 - eta*lmbda/n`` before the gradient step; biases are not.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        # float(...) keeps integer eta/lmbda from truncating to 0 under
        # Python 2 integer division.
        step = float(eta)/len(mini_batch)
        decay = 1-eta*(float(lmbda)/n)
        self.weights = [decay*w-step*nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-step*nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return ``(nabla_b, nabla_w)``, the gradient of the cost for the
        single example ``(x, y)``.

        Both are lists with one array per layer, shaped like the matching
        entries of ``self.biases`` and ``self.weights``.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # Forward pass, remembering every weighted input and activation.
        activation = x
        activations = [x]  # activations, layer by layer
        zs = []  # weighted inputs, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        # Output-layer error comes from the configured cost class.
        delta = (self.cost).delta(zs[-1], activations[-1], y)
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())

        # Walk backwards: l = 2 is the second-to-last layer, and so on.
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)

    def accuracy(self, data, convert=False):
        """Return how many ``(x, y)`` pairs in ``data`` are classified
        correctly, taking the prediction as ``np.argmax`` of the output.

        With ``convert=True`` the target is ``np.argmax(y)``; otherwise
        ``y`` itself is compared with the predicted index.
        """
        if convert:
            results = [(np.argmax(self.feedforward(x)), np.argmax(y))
                       for (x, y) in data]
        else:
            results = [(np.argmax(self.feedforward(x)), y)
                       for (x, y) in data]
        return sum(int(x == y) for (x, y) in results)

    def total_cost(self, data, lmbda, convert=False):
        """Return the mean cost over ``data`` plus the regularization term
        ``0.5*(lmbda/len(data))`` times the sum of squared weight norms.

        With ``convert=True`` each ``y`` is first passed through
        ``vectorized_result``.
        """
        cost = 0.0
        for x, y in data:
            a = self.feedforward(x)
            if convert:
                y = vectorized_result(y)
            cost += self.cost.fn(a, y)/len(data)
        # float(...) avoids Python 2 integer truncation for integer lmbda.
        cost += 0.5*(float(lmbda)/len(data))*sum(
            np.linalg.norm(w)**2 for w in self.weights)
        return cost

    def evaluate(self, test_data):
        """Return the number of correctly classified ``(x, y)`` pairs in
        ``test_data``, comparing ``y`` directly with the predicted index."""
        return self.accuracy(test_data)

    def save(self, filename):
        """Write sizes, weights, biases and the cost class name to
        ``filename`` as JSON."""
        # Imported here because no visible cell imports json.
        import json
        data = {"sizes": self.sizes,
                "weights": [w.tolist() for w in self.weights],
                "biases": [b.tolist() for b in self.biases],
                "cost": str(self.cost.__name__)}
        # The context manager closes the file even if dumping fails.
        with open(filename, "w") as f:
            json.dump(data, f)

In [5]:
def vectorized_result(j, num_classes=10):
    """Return a ``(num_classes, 1)`` column vector that is 1.0 at row ``j``
    and 0.0 everywhere else.

    ``num_classes`` defaults to 10, so existing single-argument calls are
    unchanged.
    """
    e = np.zeros((num_classes, 1))
    e[j] = 1.0
    return e

def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + exp(-z))``, applied
    element-wise when ``z`` is an array."""
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

def sigmoid_prime(z):
    """Return the derivative of the sigmoid at ``z``, computed as
    ``sigmoid(z) * (1 - sigmoid(z))``."""
    # Evaluate the sigmoid once and reuse it for both factors.
    s = sigmoid(z)
    return s*(1-s)

Training...

In [6]:
import csv
def load_data(path, from_=0):
    """Read a CSV of digit images into a list of ``(input, result)`` pairs.

    The first line of the file is skipped as a header. Each remaining
    non-empty row is parsed as integers, and columns ``from_`` onwards are
    reshaped into a ``(784, 1)`` column vector.

    When ``from_`` is greater than 0, column 0 is the class label and
    ``result`` is its one-hot vector from ``vectorized_result``. When
    ``from_`` is 0, column 0 is itself a pixel and there is no label
    column, so ``result`` is ``None``.
    """
    samples = []
    has_label = from_ > 0
    with open(path) as data_file:
        reader = csv.reader(data_file)
        next(reader, None)  # skip the header; tolerate an empty file

        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to parse.
                continue
            # A real list, so indexing and slicing work on Python 2 and 3.
            values = [int(cell) for cell in row]

            label = vectorized_result(values[0]) if has_label else None
            pixels = np.reshape(np.array(values[from_:]), (784, 1))

            samples.append((pixels, label))

    return samples

In [7]:
# Column 0 of each row is used as the label; the pixel columns start at 1.
training_data = load_data('./data/train.csv', from_=1)

In [8]:
# Seed both random sources used by training (numpy for the weight
# initialisation, `random` for the per-epoch shuffle) so that a fresh
# Restart & Run All reproduces the same run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# 784 inputs, one hidden layer of 100 neurons, 10 outputs.
net = Network([784, 100, 10])

In [9]:
# Train and report cost and accuracy on the training set after every epoch.
# The returned history tuple is displayed as the cell output.
net.SGD(
    training_data,
    epochs=65,
    mini_batch_size=30,
    eta=0.01,
    lmbda=1.0,
    evaluation_data=training_data,
    monitor_evaluation_cost=False,
    monitor_evaluation_accuracy=False,
    monitor_training_cost=True,
    monitor_training_accuracy=True
)

  import sys


Epoch 0 training complete
Cost on training data: 0.969787833801
Accuracy on training data: 37193 / 42000 = 88.5547619048 %

Epoch 1 training complete
Cost on training data: 0.736549726984
Accuracy on training data: 38010 / 42000 = 90.5 %

Epoch 2 training complete
Cost on training data: 0.700123485087
Accuracy on training data: 38112 / 42000 = 90.7428571429 %

Epoch 3 training complete
Cost on training data: 0.646662012775
Accuracy on training data: 38464 / 42000 = 91.580952381 %

Epoch 4 training complete
Cost on training data: 0.594270943328
Accuracy on training data: 38590 / 42000 = 91.880952381 %

Epoch 5 training complete
Cost on training data: 0.552572874667
Accuracy on training data: 38776 / 42000 = 92.3238095238 %

Epoch 6 training complete
Cost on training data: 0.550816034182
Accuracy on training data: 38682 / 42000 = 92.1 %

Epoch 7 training complete
Cost on training data: 0.551707286649
Accuracy on training data: 38883 / 42000 = 92.5785714286 %

Epoch 8 training complete
Co

Cost on training data: 0.341236427123
Accuracy on training data: 40062 / 42000 = 95.3857142857 %

Epoch 67 training complete
Cost on training data: 0.336289764574
Accuracy on training data: 40095 / 42000 = 95.4642857143 %

Epoch 68 training complete
Cost on training data: 0.332495584245
Accuracy on training data: 40146 / 42000 = 95.5857142857 %

Epoch 69 training complete
Cost on training data: 0.33613482535
Accuracy on training data: 40128 / 42000 = 95.5428571429 %

Epoch 70 training complete
Cost on training data: 0.336134198676
Accuracy on training data: 40115 / 42000 = 95.5119047619 %

Epoch 71 training complete
Cost on training data: 0.32377074681
Accuracy on training data: 40211 / 42000 = 95.7404761905 %

Epoch 72 training complete
Cost on training data: 0.32601634443
Accuracy on training data: 40243 / 42000 = 95.8166666667 %

Epoch 73 training complete
Cost on training data: 0.316081490275
Accuracy on training data: 40184 / 42000 = 95.6761904762 %

Epoch 74 training complete
Cos

([],
 [],
 [0.9697878338007592,
  0.7365497269844945,
  0.7001234850874752,
  0.6466620127752469,
  0.5942709433281335,
  0.5525728746671702,
  0.5508160341818499,
  0.5517072866485305,
  0.5025392522679404,
  0.5112303247617035,
  0.5119482486577766,
  0.5043027750342501,
  0.4696194672115899,
  0.48619613694933733,
  0.4639168499873496,
  0.4469855431084868,
  0.43841442713737955,
  0.43533958738035755,
  0.43995042822159375,
  0.44671604760696915,
  0.44331957877542094,
  0.43671846684863025,
  0.42200631010222883,
  0.4226898452005009,
  0.39680713633923786,
  0.4143142120226303,
  0.40843747004727937,
  0.41265916909696787,
  0.387162483699972,
  0.37374095003930713,
  0.3780087805190951,
  0.3770784708722602,
  0.4025527943907117,
  0.3841762077501722,
  0.3692268507906754,
  0.3738729131576916,
  0.39097916563552243,
  0.36294166524620836,
  0.3720911202506947,
  0.3624258184422081,
  0.36255534565736336,
  0.3589326040767418,
  0.36519290326151843,
  0.35405382125366985,
  0.36

In [10]:
import csv
# Pixel columns start at column 0 here (the default `from_`).
test_data = load_data('./data/test.csv', from_=0)

In [13]:
# Write one predicted digit per test image as `ImageId,Label` rows.
# NOTE: 'wb' is the mode the Python 2 csv module expects; on Python 3 open
# the file with mode 'w' and newline='' instead.
with open('data/submission.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(['ImageId', 'Label'])
    # Cover every loaded test image rather than a hard-coded first 2800,
    # which truncated the submission and failed on shorter test sets.
    for image_id, (pixels, _) in enumerate(test_data, 1):
        digit_decision = np.argmax(net.feedforward(pixels))
        csvwriter.writerow([image_id, digit_decision])

  import sys
