In [10]:
# example of loading the mnist dataset
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import Flatten, Dense, BatchNormalization
from tensorflow.keras.optimizers import SGD
from matplotlib import pyplot as plt
import numpy as np
import math
# load dataset
(trainX, trainY), (testX, testY) = mnist.load_data()
# summarize loaded dataset
print('Train: X=%s, y=%s' % (trainX.shape, trainY.shape))
print('Test: X=%s, y=%s' % (testX.shape, testY.shape))

Train: X=(60000, 28, 28), y=(60000,)
Test: X=(10000, 28, 28), y=(10000,)


In [11]:
information = 0
for i in range(10):
    cnt = sum(trainY == i)
    information += - cnt * math.log(cnt / len(trainY))
print("dataset information bits:", math.ceil(information))

dataset information bits: 138070


In [12]:
def memorize(data, labels):
    threshold = 0
    table = []
    for image, label in zip(data, labels):
        table.append([np.sum(image), label])
    table.sort()
    cur_class = table[0][1]
    for row in table:
        if row[1] != cur_class:
            threshold += 1
            cur_class = row[1]
    min_threshold = math.log(threshold + 1, 2)
    d = len(data[0])
    mec = min_threshold * (d + 1) + min_threshold
    return mec
print("maximum bound of mec by algorithm 8 :", memorize(trainX, trainY))

maximum bound of mec by algorithm 8 : 466.63766555032913


## Discussion of maximum bound of MEC
The algorithm 8 in book assumes attributes are in equilibrium. However, our dataset is not. Therefore, maximum bound of MEC is between 466 and 138070. We can show from below. Model with MEC = 785 does not have the same performance of other more complicated models.


# Preprocess

In [13]:
trainX = trainX.reshape((trainX.shape[0], 28, 28, 1))
testX = testX.reshape((testX.shape[0], 28, 28, 1))
# convert from integers to floats
trainX = trainX.astype('float32')
testX = testX.astype('float32')
# normalize to range 0-1
trainX = trainX / 255.0
testX = testX / 255.0
trainY = to_categorical(trainY)
testY = to_categorical(testY)

### Exp1: Model with MEC greater than information of dataset
- model1 mec = 785 * 200 + 200 = 157200 > 138070
- model2 mec = 785 * 175 + 175 = 137550 $\approx$ 138070

In [23]:
def define_model1():
    model = Sequential()
    model.add(Input(shape=(28, 28)))
    model.add(Flatten())  
    model.add(Dense(200, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(10, activation='softmax'))
    opt = SGD(learning_rate=0.01, momentum=0.9)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return model
model1 = define_model1()
model1.summary()
history1 = model1.fit(trainX, trainY, epochs=20, batch_size=32, validation_data=(testX, testY), verbose=0)
print("Train acc:", history1.history["accuracy"][-3:])
print("Val Acc:", history1.history["val_accuracy"][-3:])

Train acc: [0.9991499781608582, 0.9994999766349792, 0.9994999766349792]
Val Acc: [0.9815000295639038, 0.9818000197410583, 0.9815000295639038]


In [24]:
def define_model2():
    model = Sequential()
    model.add(Input(shape=(28, 28)))
    model.add(Flatten())  
    model.add(Dense(175, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(10, activation='softmax'))
    opt = SGD(learning_rate=0.01, momentum=0.9)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return model
model2 = define_model2()
model2.summary()
history2 = model2.fit(trainX, trainY, epochs=20, batch_size=32, validation_data=(testX, testY), verbose=0)
print("Train acc:", history2.history["accuracy"][-3:])
print("Val Acc:", history2.history["val_accuracy"][-3:])


Train acc: [0.9991666674613953, 0.9991999864578247, 0.9995333552360535]
Val Acc: [0.98089998960495, 0.9815999865531921, 0.9807000160217285]


#### Conclusion
We can see that model1 and model2 have the same high accuracy on training data. Model1 has a better performance on validation set than model2, but the difference is not significant. Therefore, the model with MEC greater than dataset information has similar performance. <br/>
Note that these 2 models have validation accuracy greater than 98%.

### Exp2: Smallest Model 
MEC = 785 * 10 = 7850

In [25]:
def define_model3():
    model = Sequential()
    model.add(Input(shape=(28, 28)))
    model.add(Flatten())  
    model.add(Dense(10, activation='softmax'))
    opt = SGD(learning_rate=0.01, momentum=0.9)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return model
model3 = define_model3()
model3.summary()
history3 = model3.fit(trainX, trainY, epochs=20, batch_size=32, validation_data=(testX, testY), verbose=0)
print("Train acc:", history3.history["accuracy"][-3:])
print("Val Acc:", history3.history["val_accuracy"][-3:])


Train acc: [0.9283833503723145, 0.9288333058357239, 0.929016649723053]
Val Acc: [0.9261000156402588, 0.925000011920929, 0.9230999946594238]


#### Conclusion
Baseline model accuracy is between 92.5% - 93%. It is lower than the performance of large model.
Therefore, we can see that the bound calculate by algorithm 8 from book is not correct here (670 < 7850 but the model performance does not meet the larger model). It is bacause the attributes are in equilibrium.

### Exp3: Model with similar MEC but different layers
- model4 mec: 785 * 50 + 50 = 39300
- model5 mec: 785 * 50 + 50 + 50 = 39350
- model6 mec: 785 * 50 + 50 + 25 + 25 = 39350 

In [34]:
def define_model4():
    model = Sequential()
    model.add(Input(shape=(28, 28)))
    model.add(Flatten())  
    model.add(Dense(50, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(10, activation='softmax'))
    opt = SGD(learning_rate=0.01, momentum=0.9)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return model
model4 = define_model4()
model4.summary()
history4 = model4.fit(trainX, trainY, epochs=30, batch_size=32, validation_data=(testX, testY), verbose=0)
print("Train acc:", history4.history["accuracy"][-3:])
print("Val Acc:", history4.history["val_accuracy"][-3:])



Train acc: [0.9974166750907898, 0.997866690158844, 0.9984833598136902]
Val Acc: [0.9740999937057495, 0.9747999906539917, 0.9750000238418579]


In [36]:
def define_model5():
    model = Sequential()
    model.add(Input(shape=(28, 28)))
    model.add(Flatten())  
    model.add(Dense(50, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(50, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(10, activation='softmax'))
    opt = SGD(learning_rate=0.01, momentum=0.9)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return model
model5 = define_model5()
model5.summary()
history5 = model5.fit(trainX, trainY, epochs=30, batch_size=32, validation_data=(testX, testY), verbose=0)
print("Train acc:", history5.history["accuracy"][-3:])
print("Val Acc:", history5.history["val_accuracy"][-3:])



Train acc: [0.9994166493415833, 0.9998666644096375, 0.9999666810035706]
Val Acc: [0.9757999777793884, 0.9767000079154968, 0.9765999913215637]


In [37]:
def define_model6():
    model = Sequential()
    model.add(Input(shape=(28, 28)))
    model.add(Flatten())  
    model.add(Dense(50, activation='relu', kernel_initializer='he_uniform'))
    # model.add(BatchNormalization())
    model.add(Dense(25, activation='relu', kernel_initializer='he_uniform'))
    # model.add(BatchNormalization())
    model.add(Dense(25, activation='relu', kernel_initializer='he_uniform'))
    # model.add(BatchNormalization())
    model.add(Dense(10, activation='softmax'))
    opt = SGD(learning_rate=0.01, momentum=0.9)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return model
model6 = define_model6()
model6.summary()
history6 = model6.fit(trainX, trainY, epochs=30, batch_size=32, validation_data=(testX, testY), verbose = 0)
print("Train acc:", history6.history["accuracy"][-3:])
print("Val Acc:", history6.history["val_accuracy"][-3:])



Train acc: [0.9918166399002075, 0.9928333163261414, 0.9936333298683167]
Val Acc: [0.974399983882904, 0.9656000137329102, 0.9697999954223633]


#### Conclusion
The performance of 3 models is similar. The accuracies of model 4 and model 5 are better than model 6. The model 6 is a deeper version of model, which is harder to train. The performance on validation data of 3 models is 97% - 97.5%, which is a little bit lower than the accuracy of large model (model1, model2).

### Exp4: Different Size of Hidden Layer

In [38]:
# size 30
def define_model7():
    model = Sequential()
    model.add(Input(shape=(28, 28)))
    model.add(Flatten())  
    model.add(Dense(30, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(10, activation='softmax'))
    opt = SGD(learning_rate=0.01, momentum=0.9)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return model
model7 = define_model7()
model7.summary()
history7 = model7.fit(trainX, trainY, epochs=30, batch_size=32, validation_data=(testX, testY), verbose=0)
print("Train acc:", history7.history["accuracy"][-3:])
print("Val Acc:", history7.history["val_accuracy"][-3:])

Train acc: [0.9860333204269409, 0.9871666431427002, 0.9872333407402039]
Val Acc: [0.9659000039100647, 0.9678000211715698, 0.9668999910354614]


In [40]:
# size 10
def define_model8():
    model = Sequential()
    model.add(Input(shape=(28, 28)))
    model.add(Flatten())  
    model.add(Dense(10, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(10, activation='softmax'))
    opt = SGD(learning_rate=0.01, momentum=0.9)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return model
model8 = define_model8()
model8.summary()
history8 = model8.fit(trainX, trainY, epochs=30, batch_size=32, validation_data=(testX, testY), verbose=0)
print("Train acc:", history8.history["accuracy"][-3:])
print("Val Acc:", history8.history["val_accuracy"][-3:])

Train acc: [0.9437166452407837, 0.9453999996185303, 0.9443333148956299]
Val Acc: [0.9386000037193298, 0.9325000047683716, 0.9279999732971191]


### Overall Conclusion
The model mec is largely affected by the dimension of 1st hidden layer. Because any number greater than 176 as the size of 1st hidden layer is considered over large model, adding additional layer would at most add 176 to mec. Therefore, for this dataset, I only need to tune the size of 1st hidden layer. We can see the result of different size of hidden layer (200, 175, 50, 30, 10).