## Reading Data

In [1]:
import torch
import torchvision.datasets as dsets
import torchvision.transforms as transforms

In [2]:
mnist_train = dsets.MNIST(root="data/MNIST/", train=True, transform=transforms.ToTensor(), download=True)
mnist_test = dsets.MNIST(root="data/MNIST/", train=False, transform=transforms.ToTensor(), download=True)

## Train

In [3]:
training_epochs = 15
batch_size = 100
learning_rate = 0.001
data_loader = torch.utils.data.DataLoader(dataset=mnist_train, 
                                          batch_size=batch_size, 
                                          shuffle=True, 
                                          drop_last=True)

In [11]:
device = 'cpu' # 'cpu' or 'cuda'?
linear = torch.nn.Linear(28 * 28, 10, bias=True).to(device)
torch.nn.init.normal_(linear.weight)
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(linear.parameters(), lr=learning_rate)

for epoch in range(training_epochs):
    avg_cost = 0
    total_batch = len(data_loader)
    
    for X, Y in data_loader:
        X = X.view(-1, 28 * 28).to(device)
        Y = Y.to(device)
        
        hypothesis = linear(X)
        cost = criterion(hypothesis, Y)
        
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()
        
        avg_cost += cost / total_batch

    with torch.no_grad():
        X_test = mnist_test.data.view(-1, 28 * 28).float().to(device) # data <-- test_data
        Y_test = mnist_test.targets.to(device) # targets <-- test_labels

        prediction = linear(X_test)
        correct_prediction = torch.argmax(prediction, 1) == Y_test
        accuracy = correct_prediction.float().mean()
    print("Epoch: {:04d}/{} Cost: {:.6f} Accuracy: {:.2f}%".format(epoch + 1, training_epochs, avg_cost, accuracy.item() * 100))

Epoch: 0001/15 Cost: 4.646300 Accuracy: 63.53%
Epoch: 0002/15 Cost: 1.489968 Accuracy: 76.58%
Epoch: 0003/15 Cost: 1.021626 Accuracy: 81.46%
Epoch: 0004/15 Cost: 0.823346 Accuracy: 84.05%
Epoch: 0005/15 Cost: 0.707713 Accuracy: 85.61%
Epoch: 0006/15 Cost: 0.631166 Accuracy: 86.80%
Epoch: 0007/15 Cost: 0.575449 Accuracy: 87.58%
Epoch: 0008/15 Cost: 0.533720 Accuracy: 87.74%
Epoch: 0009/15 Cost: 0.501321 Accuracy: 87.82%
Epoch: 0010/15 Cost: 0.475027 Accuracy: 88.23%
Epoch: 0011/15 Cost: 0.453282 Accuracy: 88.59%
Epoch: 0012/15 Cost: 0.435329 Accuracy: 88.75%
Epoch: 0013/15 Cost: 0.419511 Accuracy: 88.33%
Epoch: 0014/15 Cost: 0.406036 Accuracy: 88.54%
Epoch: 0015/15 Cost: 0.394308 Accuracy: 89.01%


## mlp with relu and cuda

In [9]:
device = 'cuda' # 'cpu' or 'cuda'?
linear1 = torch.nn.Linear(28 * 28, 256, bias=True).to(device)
linear2 = torch.nn.Linear(256, 256, bias=True).to(device)
linear3 = torch.nn.Linear(256, 10, bias=True).to(device)
relu = torch.nn.ReLU()

torch.nn.init.normal_(linear1.weight)
torch.nn.init.normal_(linear2.weight)
torch.nn.init.normal_(linear3.weight)

model = torch.nn.Sequential(linear1, relu, linear2, relu, linear3).to(device)
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(training_epochs):
    avg_cost = 0
    total_batch = len(data_loader)
    
    for X, Y in data_loader:
        X = X.view(-1, 28 * 28).to(device)
        Y = Y.to(device)
        
        hypothesis = model(X)
        cost = criterion(hypothesis, Y)
        
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()
        
        avg_cost += cost / total_batch
        
    print("Epoch: {:04d}/{} cost = {:.6f}".format(epoch + 1, training_epochs, avg_cost))

Epoch: 0001/15 cost = 148.772842
Epoch: 0002/15 cost = 35.280933
Epoch: 0003/15 cost = 22.122944
Epoch: 0004/15 cost = 15.287751
Epoch: 0005/15 cost = 11.041275
Epoch: 0006/15 cost = 8.200020
Epoch: 0007/15 cost = 6.009705
Epoch: 0008/15 cost = 4.554917
Epoch: 0009/15 cost = 3.447356
Epoch: 0010/15 cost = 2.445161
Epoch: 0011/15 cost = 1.934303
Epoch: 0012/15 cost = 1.445817
Epoch: 0013/15 cost = 1.059533
Epoch: 0014/15 cost = 0.868608
Epoch: 0015/15 cost = 0.696072


In [10]:
with torch.no_grad():
    X_test = mnist_test.data.view(-1, 28 * 28).float().to(device) # data <-- test_data
    Y_test = mnist_test.targets.to(device) # targets <-- test_labels
    
    prediction = model(X_test)
    correct_prediction = torch.argmax(prediction, 1) == Y_test
    accuracy = correct_prediction.float().mean()
    print("Accuracy: ", accuracy.item())

Accuracy:  0.9532999992370605


## initialization Xavier / He

In [12]:
device = 'cuda' # 'cpu' or 'cuda'?
linear1 = torch.nn.Linear(28 * 28, 256, bias=True).to(device)
linear2 = torch.nn.Linear(256, 256, bias=True).to(device)
linear3 = torch.nn.Linear(256, 10, bias=True).to(device)
relu = torch.nn.ReLU()

torch.nn.init.xavier_uniform_(linear1.weight)
torch.nn.init.xavier_uniform_(linear2.weight)
torch.nn.init.xavier_uniform_(linear3.weight)

model = torch.nn.Sequential(linear1, relu, linear2, relu, linear3).to(device)
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(training_epochs):
    avg_cost = 0
    total_batch = len(data_loader)
    
    for X, Y in data_loader:
        X = X.view(-1, 28 * 28).to(device)
        Y = Y.to(device)
        
        hypothesis = model(X)
        cost = criterion(hypothesis, Y)
        
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()
        
        avg_cost += cost / total_batch
        
    with torch.no_grad():
        X_test = mnist_test.data.view(-1, 28 * 28).float().to(device) # data <-- test_data
        Y_test = mnist_test.targets.to(device) # targets <-- test_labels

        prediction = model(X_test)
        correct_prediction = torch.argmax(prediction, 1) == Y_test
        accuracy = correct_prediction.float().mean()
    print("Epoch: {:04d}/{} Cost: {:.6f} Accuracy: {:.2f}%".format(epoch + 1, training_epochs, avg_cost, accuracy.item() * 100))

Epoch: 0001/15 Cost: 0.241805 Accuracy: 96.56%
Epoch: 0002/15 Cost: 0.092317 Accuracy: 97.58%
Epoch: 0003/15 Cost: 0.060258 Accuracy: 97.59%
Epoch: 0004/15 Cost: 0.043647 Accuracy: 97.66%
Epoch: 0005/15 Cost: 0.033376 Accuracy: 97.71%
Epoch: 0006/15 Cost: 0.026777 Accuracy: 97.57%
Epoch: 0007/15 Cost: 0.020749 Accuracy: 97.87%
Epoch: 0008/15 Cost: 0.017008 Accuracy: 97.76%
Epoch: 0009/15 Cost: 0.016517 Accuracy: 97.50%
Epoch: 0010/15 Cost: 0.015579 Accuracy: 97.70%
Epoch: 0011/15 Cost: 0.010295 Accuracy: 97.94%
Epoch: 0012/15 Cost: 0.010483 Accuracy: 97.97%
Epoch: 0013/15 Cost: 0.012657 Accuracy: 98.05%
Epoch: 0014/15 Cost: 0.009605 Accuracy: 97.69%
Epoch: 0015/15 Cost: 0.010982 Accuracy: 97.66%


## dropout

In [15]:
device = 'cuda' # 'cpu' or 'cuda'?
linear1 = torch.nn.Linear(28 * 28, 512, bias=True).to(device)
linear2 = torch.nn.Linear(512, 512, bias=True).to(device)
linear3 = torch.nn.Linear(512, 512, bias=True).to(device)
linear4 = torch.nn.Linear(512, 512, bias=True).to(device)
linear5 = torch.nn.Linear(512, 10, bias=True).to(device)
relu = torch.nn.ReLU()
dropout = torch.nn.Dropout(p=0.5)

torch.nn.init.xavier_uniform_(linear1.weight)
torch.nn.init.xavier_uniform_(linear2.weight)
torch.nn.init.xavier_uniform_(linear3.weight)
torch.nn.init.xavier_uniform_(linear4.weight)
torch.nn.init.xavier_uniform_(linear5.weight)

model = torch.nn.Sequential(linear1, relu, dropout, 
                            linear2, relu, dropout, 
                            linear3, relu, dropout,
                            linear4, relu, dropout,
                            linear5).to(device)
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(training_epochs):
    avg_cost = 0
    total_batch = len(data_loader)
    model.train()
    for X, Y in data_loader:
        X = X.view(-1, 28 * 28).to(device)
        Y = Y.to(device)
        
        hypothesis = model(X)
        cost = criterion(hypothesis, Y)
        
        optimizer.zero_grad()
        cost.backward()
        optimizer.step()
        
        avg_cost += cost / total_batch
        
    with torch.no_grad():
        model.eval()
        X_test = mnist_test.data.view(-1, 28 * 28).float().to(device) # data <-- test_data
        Y_test = mnist_test.targets.to(device) # targets <-- test_labels

        prediction = model(X_test)
        correct_prediction = torch.argmax(prediction, 1) == Y_test
        accuracy = correct_prediction.float().mean()
    print("Epoch: {:04d}/{} Cost: {:.6f} Accuracy: {:.2f}%".format(epoch + 1, training_epochs, avg_cost, accuracy.item() * 100))

Epoch: 0001/15 Cost: 0.484289 Accuracy: 94.99%
Epoch: 0002/15 Cost: 0.222181 Accuracy: 96.19%
Epoch: 0003/15 Cost: 0.173728 Accuracy: 96.96%
Epoch: 0004/15 Cost: 0.156650 Accuracy: 96.67%
Epoch: 0005/15 Cost: 0.143594 Accuracy: 96.96%
Epoch: 0006/15 Cost: 0.133348 Accuracy: 97.39%
Epoch: 0007/15 Cost: 0.122592 Accuracy: 97.21%
Epoch: 0008/15 Cost: 0.118362 Accuracy: 97.15%
Epoch: 0009/15 Cost: 0.110881 Accuracy: 97.66%
Epoch: 0010/15 Cost: 0.107140 Accuracy: 97.66%
Epoch: 0011/15 Cost: 0.102363 Accuracy: 97.70%
Epoch: 0012/15 Cost: 0.100835 Accuracy: 97.86%
Epoch: 0013/15 Cost: 0.097960 Accuracy: 97.68%
Epoch: 0014/15 Cost: 0.093695 Accuracy: 97.56%
Epoch: 0015/15 Cost: 0.093288 Accuracy: 97.81%


## batchnormalization

In [18]:
device = 'cuda' # 'cpu' or 'cuda'?
linear1 = torch.nn.Linear(28 * 28, 32, bias=True).to(device)
linear2 = torch.nn.Linear(32, 32, bias=True).to(device)
linear3 = torch.nn.Linear(32, 10, bias=True).to(device)
relu = torch.nn.ReLU()
bn1 = torch.nn.BatchNorm1d(num_features=32) # gamma and beta
bn2 = torch.nn.BatchNorm1d(num_features=32)

torch.nn.init.xavier_uniform_(linear1.weight)
torch.nn.init.xavier_uniform_(linear2.weight)
torch.nn.init.xavier_uniform_(linear3.weight)

bn_model = torch.nn.Sequential(linear1, bn1, relu,
                               linear2, bn2, relu,
                               linear3).to(device)

bn_optimizer = torch.optim.Adam(bn_model.parameters(), lr=learning_rate)

linear4 = torch.nn.Linear(28 * 28, 32, bias=True).to(device)
linear5 = torch.nn.Linear(32, 32, bias=True).to(device)
linear6 = torch.nn.Linear(32, 10, bias=True).to(device)
relu = torch.nn.ReLU()

torch.nn.init.xavier_uniform_(linear4.weight)
torch.nn.init.xavier_uniform_(linear5.weight)
torch.nn.init.xavier_uniform_(linear6.weight)

nn_model = torch.nn.Sequential(linear4, relu,
                               linear5, relu,
                               linear6).to(device)

nn_optimizer = torch.optim.Adam(nn_model.parameters(), lr=learning_rate)

criterion = torch.nn.CrossEntropyLoss().to(device)


for epoch in range(training_epochs):
    model.train()
    for X, Y in data_loader:
        X = X.view(-1, 28 * 28).to(device)
        Y = Y.to(device)
        
        bn_hypothesis = bn_model(X)
        bn_cost = criterion(bn_hypothesis, Y)
        bn_optimizer.zero_grad()
        bn_cost.backward()
        bn_optimizer.step()
        
        nn_hypothesis = nn_model(X)
        nn_cost = criterion(nn_hypothesis, Y)
        nn_optimizer.zero_grad()
        nn_cost.backward()
        nn_optimizer.step()

    with torch.no_grad():
        model.eval()
        X_test = mnist_test.data.view(-1, 28 * 28).float().to(device) # data <-- test_data
        Y_test = mnist_test.targets.to(device) # targets <-- test_labels

        bn_prediction = bn_model(X_test)
        bn_correct_prediction = torch.argmax(bn_prediction, 1) == Y_test
        bn_accuracy = bn_correct_prediction.float().mean()
        
        nn_prediction = nn_model(X_test)
        nn_correct_prediction = torch.argmax(nn_prediction, 1) == Y_test
        nn_accuracy = nn_correct_prediction.float().mean()
    print("Epoch: {:04d}/{} BN_Accuracy: {:.2f}% /  NN_Accuracy: {:.2f}%".format(
        epoch + 1, training_epochs, bn_accuracy.item() * 100, nn_accuracy.item() * 100))

Epoch: 0001/15 BN_Accuracy: 94.53% /  NN_Accuracy: 93.10%
Epoch: 0002/15 BN_Accuracy: 95.96% /  NN_Accuracy: 94.03%
Epoch: 0003/15 BN_Accuracy: 96.50% /  NN_Accuracy: 95.57%
Epoch: 0004/15 BN_Accuracy: 96.74% /  NN_Accuracy: 95.46%
Epoch: 0005/15 BN_Accuracy: 96.97% /  NN_Accuracy: 96.18%
Epoch: 0006/15 BN_Accuracy: 96.89% /  NN_Accuracy: 96.47%
Epoch: 0007/15 BN_Accuracy: 97.00% /  NN_Accuracy: 96.72%
Epoch: 0008/15 BN_Accuracy: 97.01% /  NN_Accuracy: 96.67%
Epoch: 0009/15 BN_Accuracy: 97.26% /  NN_Accuracy: 96.69%
Epoch: 0010/15 BN_Accuracy: 97.15% /  NN_Accuracy: 96.84%
Epoch: 0011/15 BN_Accuracy: 96.95% /  NN_Accuracy: 96.80%
Epoch: 0012/15 BN_Accuracy: 97.14% /  NN_Accuracy: 96.90%
Epoch: 0013/15 BN_Accuracy: 97.19% /  NN_Accuracy: 96.75%
Epoch: 0014/15 BN_Accuracy: 97.15% /  NN_Accuracy: 96.38%
Epoch: 0015/15 BN_Accuracy: 97.20% /  NN_Accuracy: 97.16%
