Load the dataset

In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# image transformations can be chained together using Compose
# transform the PILImage to Tensors of normalized range [-1, 1]
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])


trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                       download=False, transform=transform)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
                                         shuffle=True)
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                      download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4,
                                        shuffle=False)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified


Define neural networks.   
Define loss function.   
Train the network on the training data{ Compute Loss and use the optimizer to update the weights of the network during training process}.  

In [3]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=1e-4, momentum=0.9)
# train the model
for epoch in range(10):  
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        inputs, labels = data
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 2000 == 1999:
            print('[%d, %5d] loss: %.3f' % (epoch+1, i+1, running_loss/2000))
            running_loss = 0.0
            
print('Finished Training')
        


[1,  2000] loss: 2.304
[1,  4000] loss: 2.302
[1,  6000] loss: 2.300
[1,  8000] loss: 2.296
[1, 10000] loss: 2.287
[1, 12000] loss: 2.258
[2,  2000] loss: 2.133
[2,  4000] loss: 2.045
[2,  6000] loss: 1.978
[2,  8000] loss: 1.929
[2, 10000] loss: 1.882
[2, 12000] loss: 1.852
[3,  2000] loss: 1.795
[3,  4000] loss: 1.751
[3,  6000] loss: 1.696
[3,  8000] loss: 1.671
[3, 10000] loss: 1.631
[3, 12000] loss: 1.625
[4,  2000] loss: 1.587
[4,  4000] loss: 1.586
[4,  6000] loss: 1.576
[4,  8000] loss: 1.554
[4, 10000] loss: 1.536
[4, 12000] loss: 1.519
[5,  2000] loss: 1.505
[5,  4000] loss: 1.510
[5,  6000] loss: 1.486
[5,  8000] loss: 1.490
[5, 10000] loss: 1.461
[5, 12000] loss: 1.436
[6,  2000] loss: 1.432
[6,  4000] loss: 1.444
[6,  6000] loss: 1.406
[6,  8000] loss: 1.409
[6, 10000] loss: 1.385
[6, 12000] loss: 1.385
[7,  2000] loss: 1.388
[7,  4000] loss: 1.335
[7,  6000] loss: 1.347
[7,  8000] loss: 1.343
[7, 10000] loss: 1.337
[7, 12000] loss: 1.334
[8,  2000] loss: 1.320
[8,  4000] 

In [4]:
PATH = './cifar_net.pth'
torch.save(net.state_dict(), PATH)

Test the network on the test data

In [5]:
dataiter = iter(testloader)
images, labels = dataiter.next()
print(labels.size())
net = Net()
net.load_state_dict(torch.load(PATH))
outputs = net(images)

# check the predicted against the ground-truth.
# if the prediction is correct, add the sample to the list of correct predictions

# torch.max(a, 1) 返回input tensor中每一行中最大的元素，且返回最大元素在这一行的列索引
# predicted 即为 列索引
_, predicted = torch.max(outputs, 1)
for j in range(labels.size(0)):
    print(''.join('%5s' % predicted[j]))
    
# print('Predicted:', ' '.join('%5s' % classes))


torch.Size([4])
tensor(3)
tensor(1)
tensor(1)
tensor(0)


test on the whole dataset

In [6]:
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data
        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print("accuracy is %d %%" %(100 * correct/ total))

accuracy is 56 %


In [7]:
class_correct = list(0. for i in range(10))
class_total = list(0. for i in range(10))
with torch.no_grad():
    for data in testloader:
        images, labels = data
        output = net(images)
        _, predicted = torch.max(outputs, 1)
        # squeeze 对数据维度进行压缩
        c = (predicted == labels).squeeze()
        for i in range(4):
            label = labels[i]
            class_correct[label] += c[i].item()
            class_total[label] += 1
            
for i in range(10):
    print(classes[i], 100*class_correct[i]/class_total[i])
        

plane 0.0
car 0.0
bird 0.0
cat 24.3
deer 25.7
dog 25.6
frog 0.0
horse 25.6
ship 0.0
truck 0.0


Train-evaluate Loop  

Train-evaluate every iteration and tracking best accuracy or validation  
Save weights if the training iteration produces the best accuracy or validation 

In [None]:
def train_and_evaluate(model, trainloader, valloader, optimizer, 
                       loss_fn, metrics, params, model_dir, restore_file=None):
    """
    Args:
        trainloader: DataLoader for training data
        valloader: DataLoader for validation data
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        model_dir: (string) directory containing config, weights, and log
    """
    


Checking point

Checkpoint is the term to describe saving a snapshot of the model parameters after every epoch of training. Create checkpoints while training the model and then it allows you to load the saved weights and resume training from any epoch that has a checkpoint.

In [8]:
import os
import torch
import shutil

def save_checkpoint(state, is_best, checkpoint):
    """
    Args:
        state: (dict) contains model's state_dict, may contain optimizer's state_dict
        is_best: (bool) True if it is the best model seen till now
        checkpoint: (string) folder where parameters are to be saved
    """
    filepath = os.path.join(checkpoint, 'last.pth.tar')
    if not os.path.exists(checkpoint):
        print('checkpoint directory does not exist')
        os.mkdir(checkpoint)
    else:
        print('checkpoint directory exists')
    torch.save(state, filepath)
    if is_best:
        # 移动文件
        # shutil.copyfile("old file", "new file")
        shutil.copyfile(filepath, os.path.join(checkpoint, 'best.pth.tar'))
        
def load_checkpoint(checkpoint, model, optimizer=None):    
    """
    Args:
        checkpoint: (string) filename to be loaded
        model: (torch.nn.module) model for which the parameters are loaded
        optimizer:(torch.optim) resume optimizer from checkpoint
    """
    if not os.path.exists(checkpoint):
        raise("File doesn't exist {}".format(checkpoint))
    checkpoint = torch.load(checkpoint)
    model.load_state_dict(checkpoint['state_dict'])
    
    if optimizer:
        optimizer.load_state_dict(checkpoint['optim_dict'])
    
    return checkpoint