From c96a0b4aa118d15326d4de29817e02b132a955d3 Mon Sep 17 00:00:00 2001 From: laubonghaudoi Date: Sun, 2 Dec 2018 00:24:34 -0800 Subject: [PATCH 1/4] Update README and comments --- CapsNet.py | 4 ++-- Decoder.py | 4 ++-- DigitCaps.py | 6 +++--- PrimaryCaps.py | 4 ++-- README.md | 4 ++-- main.py | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/CapsNet.py b/CapsNet.py index 88aaac1..2a611f6 100644 --- a/CapsNet.py +++ b/CapsNet.py @@ -34,10 +34,10 @@ def __init__(self, opt): def forward(self, x): ''' Args: - `x`: [batch_size, 1, 28, 28] A MNIST sample + `x`: [batch_size, 1, 28, 28] MNIST samples Return: - `v`: [batch_size, 10, 16] CapsNet outputs, 16D rediction vectors of + `v`: [batch_size, 10, 16] CapsNet outputs, 16D prediction vectors of 10 digit capsules The dimension transformation procedure of an input tensor in each layer: diff --git a/Decoder.py b/Decoder.py index 2bfbc58..974ec47 100644 --- a/Decoder.py +++ b/Decoder.py @@ -28,8 +28,8 @@ def __init__(self, opt): def forward(self, v, target): ''' Args: - v: [batch_size, 10, 16] - target: [batch_size, 10] + `v`: [batch_size, 10, 16] + `target`: [batch_size, 10] Return: `reconstruction`: [batch_size, 784] diff --git a/DigitCaps.py b/DigitCaps.py index 17b5eac..bc2d46c 100644 --- a/DigitCaps.py +++ b/DigitCaps.py @@ -7,8 +7,8 @@ class DigitCaps(nn.Module): ''' The `DigitCaps` layer consists of 10 16D capsules. Compared to the traditional - scalar output neurons in fully connected layers(FCN), the `DigitCaps` layer - can be seen as an FCN with 16-dimensional output neurons, where we call + scalar output neurons in fully connected networks(FCN), the `DigitCaps` layer + can be seen as an FCN with ten 16-dimensional output neurons, which we call these neurons "capsules". In this layer, we take the input `[1152, 8]` tensor `u` as 1152 [8,] vectors @@ -34,7 +34,7 @@ def __init__(self, opt): The the coupling coefficients `b` [1152, 10] is a temporary variable which does NOT belong to the layer's parameters. In other words, `b` is not updated by gradient back-propagations. Instead, we update `b` by Dynamic Routing - in every forward propagation. See docstring of `self.forward` for details. + in every forward propagation. See the docstring of `self.forward` for details. ''' super(DigitCaps, self).__init__() self.opt = opt diff --git a/PrimaryCaps.py b/PrimaryCaps.py index d688519..9269d63 100644 --- a/PrimaryCaps.py +++ b/PrimaryCaps.py @@ -7,7 +7,7 @@ class PrimaryCaps(nn.Module): ''' The `PrimaryCaps` layer consists of 32 capsule units. Each unit takes the output of the `Conv1` layer, which is a `[256, 20, 20]` feature - tensor (ignoring `batch_size`), and performs a 2D convolution with 8 + tensor (omitting `batch_size`), and performs a 2D convolution with 8 output channels, kernel size 9 and stride 2, thus outputing a [8, 6, 6] tensor. In other words, you can see these 32 capsules as 32 paralleled 2D convolutional layers. Then we concatenate these 32 capsules' outputs and @@ -16,7 +16,7 @@ class PrimaryCaps(nn.Module): As indicated in Section 4, Page 4 in the paper, *One can see PrimaryCaps as a Convolution layer with Eq.1 as its block non-linearity.*, outputs of - the `PrimaryCaps` layer are squashed before passing to the next layer. + the `PrimaryCaps` layer are squashed before being passed to the next layer. Reference: Section 4, Fig. 1 ''' diff --git a/README.md b/README.md index 13226c7..831295b 100644 --- a/README.md +++ b/README.md @@ -13,13 +13,13 @@ As I am busy these days, I might not have time to checkout and fix every issue. ## Requirements -- pytorch 0.2.0 +- pytorch 0.4.1 - torchvision - pytorch-extras (For one-hot vector conversion) - tensorboard-pytorch - tqdm -All codes are tested under Python 3.6.3. +All codes are tested under Python 3.6. ## Get Started diff --git a/main.py b/main.py index e8cf27c..aaa5fd7 100644 --- a/main.py +++ b/main.py @@ -17,7 +17,7 @@ 8. `train()` and `test()` in `main.py` You might find helpful with the paper *Dynamic Routing Between Capsules* -at your hand for referencing. +at your hand for referencing when reading these codes. """ import os From 45f60844c3efb60ec0315506c5ddd9b97c91895f Mon Sep 17 00:00:00 2001 From: laubonghaudoi Date: Mon, 3 Dec 2018 08:04:37 -0800 Subject: [PATCH 2/4] FIx Decoder --- Decoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Decoder.py b/Decoder.py index 974ec47..7ecda61 100644 --- a/Decoder.py +++ b/Decoder.py @@ -53,8 +53,8 @@ def forward(self, v, target): assert v_masked.size() == torch.Size([batch_size, 16]) # Forward - v = self.fc1(v_masked) - v = self.fc2(v) + v = F.relu(self.fc1(v_masked)) + v = F.relu(self.fc2(v)) reconstruction = torch.sigmoid(self.fc3(v)) assert reconstruction.size() == torch.Size([batch_size, 784]) From 21614ec084292de38f682e1ab9ddbef970684340 Mon Sep 17 00:00:00 2001 From: Alexbanana19 Date: Tue, 4 Dec 2018 04:59:56 +0000 Subject: [PATCH 3/4] fix cuda memory error --- .gitignore | 1 + "\\" | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++++ main.py | 24 ++++--- 3 files changed, 206 insertions(+), 11 deletions(-) create mode 100644 "\\" diff --git a/.gitignore b/.gitignore index 6a564d6..f0d97cf 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ __pycache__/ # data directory data/ ckpt/ +runs/ diff --git "a/\\" "b/\\" new file mode 100644 index 0000000..5da0456 --- /dev/null +++ "b/\\" @@ -0,0 +1,192 @@ +""" +A tutorial-style implementation of CapsNet in PyTorch. + +Paper link: https://arxiv.org/abs/1710.09829v2 + +@author laubonghaudoi + +For better understanding, read the codes and comments in the following order: + +1. `__main__` in `main.py` +2. `utils.py` +3. `CapsNet.__init__()` and `CapsNet.forward()` in `CapsNet.py` +4. `PrimaryCaps.py` +5. `DigitCaps.py` +6. `Decoder.py` +7. `CapsNet.marginal_loss()`, `CapsNet.reconstruction_loss()` and` CapsNet.loss()` in `CapsNet.py` +8. `train()` and `test()` in `main.py` + +You might find helpful with the paper *Dynamic Routing Between Capsules* +at your hand for referencing when reading these codes. +""" + +import os +import time +from tqdm import * + +import torch +import torch_extras +import torchvision.utils as vutils +from tensorboardX import SummaryWriter +from torch.autograd import Variable + +from CapsNet import CapsNet +from utils import get_opts, get_dataloader + +# PyTorch does not provide one-hot vector conversion, we achieve this +# by pytorch-extras +setattr(torch, 'one_hot', torch_extras.one_hot) + + +def train(opt, train_loader, test_loader, model, writer): + num_data = len(train_loader.dataset) + num_batches = len(train_loader) + + optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr) + scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5) + + model.train() + for epoch in range(opt.epochs): + # Update learning rate + scheduler.step() + print('Learning rate: {}'.format(scheduler.get_lr()[0])) + + start_time = time.time() + for batch_idx, (data, target) in enumerate(tqdm(train_loader)): + batch_size = data.size(0) + global_step = batch_idx + epoch * num_batches + + # Transform to one-hot indices: [batch_size, 10] + target = torch.one_hot((batch_size, 10), target.view(-1, 1)) + assert target.size() == torch.Size([batch_size, 10]) + + # Use GPU if available + data, target = Variable(data), Variable(target) + if opt.use_cuda & torch.cuda.is_available(): + data, target = data.cuda(), target.cuda() + + # Train step + optimizer.zero_grad() + output = model(data) + + L, m_loss, r_loss = model.loss(output, target, data) + L.backward() + + optimizer.step() + + # Log losses + writer.add_scalar('train/loss', L.item(), global_step) + writer.add_scalar('train/marginal_loss', m_loss.item(), global_step) + writer.add_scalar('train/reconstruction_loss', r_loss.item(), global_step) + + # Print losses + if batch_idx % opt.print_every == 0: + tqdm.write('Epoch: {} Loss: {:.6f} Marginal loss: {:.6f} Recons. loss: {:.6f}'.format( + epoch, L.item(), m_loss.item(), r_loss.item())) + + # Print time elapsed for every epoch + end_time = time.time() + print('Epoch {} takes {:.0f} seconds.'.format( + epoch, end_time - start_time)) + + # Test model + test(opt, test_loader, model, writer, epoch, num_batches) + + + +def test(opt, test_loader, model, writer, epoch, num_batches): + loss = 0 + margin_loss = 0 + recons_loss = 0 + + correct = 0 + + step = epoch * num_batches + num_batches + model.eval() + for data, target in test_loader: + # Store the indices for calculating accuracy + label = target.unsqueeze(0).type(torch.LongTensor) + + batch_size = data.size(0) + # Transform to one-hot indices: [batch_size, 10] + target = torch.one_hot((batch_size, 10), target.view(-1, 1)) + assert target.size() == torch.Size([batch_size, 10]) + + # Use GPU if available + data, target = Variable(data, volatile=True), Variable(target) + if opt.use_cuda & torch.cuda.is_available(): + data, target = data.cuda(), target.cuda() + + # Output predictions + output = model(data) + L, m_loss, r_loss = model.loss(output, target, data) + loss += L.item() + margin_loss += m_loss.item() + recons_loss += r_loss.item() + + # Count correct numbers + # norms: [batch_size, 10, 16] + norms = torch.sqrt(torch.sum(output**2, dim=2)) + # pred: [batch_size,] + pred = norms.data.max(1, keepdim=True)[1].type(torch.LongTensor) + correct += pred.eq(label.view_as(pred)).cpu().sum() + + # Visualize reconstructed images of the last batch + recons = model.Decoder(output, target) + recons = recons.view(batch_size, 1, 28, 28) + recons = vutils.make_grid(recons.data, normalize=True, scale_each=True) + writer.add_image('Image-{}'.format(step), recons, step) + + # Log test losses + loss /= len(test_loader) + margin_loss /= len(test_loader) + recons_loss /= len(test_loader) + acc = correct / len(test_loader.dataset) + writer.add_scalar('test/loss', loss.item(), step) + writer.add_scalar('test/marginal_loss', margin_loss.item(), step) + writer.add_scalar('test/reconstruction_loss', recons_loss.item(), step) + writer.add_scalar('test/accuracy', acc, step) + + # Print test losses + print('\nTest loss: {:.4f} Marginal loss: {:.4f} Recons loss: {:.4f}'.format( + loss.item(), margin_loss.item(), recons_loss.item())) + print('Accuracy: {}/{} ({:.0f}%)\n'.format(correct, len(test_loader.dataset), + 100. * correct / len(test_loader.dataset))) + + # Checkpoint model + torch.save(model, './ckpt/epoch_{}-loss_{:.6f}-acc_{:.6f}.pt'.format( + epoch, loss.item(), acc)) + + +if __name__ == "__main__": + # Default configurations + opt = get_opts() + train_loader, test_loader = get_dataloader(opt) + + # Initialize CapsNet + model = CapsNet(opt) + + # Enable GPU usage + if opt.use_cuda & torch.cuda.is_available(): + model.cuda() + + # Print the model architecture and parameters + print("Model architectures: ") + print(model) + + print("\nSizes of parameters: ") + for name, param in model.named_parameters(): + print("{}: {}".format(name, list(param.size()))) + n_params = sum([p.nelement() for p in model.parameters()]) + # The coupling coefficients b_ij are not included in the parameter list, + # we need to add them mannually, which is 1152 * 10 = 11520. + print('\nTotal number of parameters: %d \n' % (n_params+11520)) + + # Make model checkpoint directory + if not os.path.exists('ckpt'): + os.makedirs('ckpt') + + # Start training + writer = SummaryWriter() + train(opt, train_loader, test_loader, model, writer) + writer.close() diff --git a/main.py b/main.py index aaa5fd7..90755c7 100644 --- a/main.py +++ b/main.py @@ -61,7 +61,8 @@ def train(opt, train_loader, test_loader, model, writer): assert target.size() == torch.Size([batch_size, 10]) # Use GPU if available - data, target = Variable(data), Variable(target) + with torch.no_grad(): + data, target = Variable(data), Variable(target) if opt.use_cuda & torch.cuda.is_available(): data, target = data.cuda(), target.cuda() @@ -113,23 +114,24 @@ def test(opt, test_loader, model, writer, epoch, num_batches): assert target.size() == torch.Size([batch_size, 10]) # Use GPU if available - data, target = Variable(data, volatile=True), Variable(target) + with torch.no_grad(): + data, target = Variable(data), Variable(target) if opt.use_cuda & torch.cuda.is_available(): data, target = data.cuda(), target.cuda() # Output predictions output = model(data) L, m_loss, r_loss = model.loss(output, target, data) - loss += L - margin_loss += m_loss - recons_loss += r_loss + loss += L.item() + margin_loss += m_loss.item() + recons_loss += r_loss.item() # Count correct numbers # norms: [batch_size, 10, 16] norms = torch.sqrt(torch.sum(output**2, dim=2)) # pred: [batch_size,] pred = norms.data.max(1, keepdim=True)[1].type(torch.LongTensor) - correct += pred.eq(label.view_as(pred)).cpu().sum() + correct += pred.eq(label.view_as(pred)).cpu().sum().item() # Visualize reconstructed images of the last batch recons = model.Decoder(output, target) @@ -142,20 +144,20 @@ def test(opt, test_loader, model, writer, epoch, num_batches): margin_loss /= len(test_loader) recons_loss /= len(test_loader) acc = correct / len(test_loader.dataset) - writer.add_scalar('test/loss', loss.item(), step) - writer.add_scalar('test/marginal_loss', margin_loss.item(), step) - writer.add_scalar('test/reconstruction_loss', recons_loss.item(), step) + writer.add_scalar('test/loss', loss, step) + writer.add_scalar('test/marginal_loss', margin_loss, step) + writer.add_scalar('test/reconstruction_loss', recons_loss, step) writer.add_scalar('test/accuracy', acc, step) # Print test losses print('\nTest loss: {:.4f} Marginal loss: {:.4f} Recons loss: {:.4f}'.format( - loss.item(), margin_loss.item(), recons_loss.item())) + loss, margin_loss, recons_loss)) print('Accuracy: {}/{} ({:.0f}%)\n'.format(correct, len(test_loader.dataset), 100. * correct / len(test_loader.dataset))) # Checkpoint model torch.save(model, './ckpt/epoch_{}-loss_{:.6f}-acc_{:.6f}.pt'.format( - epoch, loss.item(), acc)) + epoch, loss, acc)) if __name__ == "__main__": From ea9d7527e288cc09f345eca60fc50c448f97e5a8 Mon Sep 17 00:00:00 2001 From: laubonghaudoi Date: Tue, 4 Dec 2018 00:02:56 -0800 Subject: [PATCH 4/4] Remove trash --- "\\" | 192 ----------------------------------------------------------- 1 file changed, 192 deletions(-) delete mode 100644 "\\" diff --git "a/\\" "b/\\" deleted file mode 100644 index 5da0456..0000000 --- "a/\\" +++ /dev/null @@ -1,192 +0,0 @@ -""" -A tutorial-style implementation of CapsNet in PyTorch. - -Paper link: https://arxiv.org/abs/1710.09829v2 - -@author laubonghaudoi - -For better understanding, read the codes and comments in the following order: - -1. `__main__` in `main.py` -2. `utils.py` -3. `CapsNet.__init__()` and `CapsNet.forward()` in `CapsNet.py` -4. `PrimaryCaps.py` -5. `DigitCaps.py` -6. `Decoder.py` -7. `CapsNet.marginal_loss()`, `CapsNet.reconstruction_loss()` and` CapsNet.loss()` in `CapsNet.py` -8. `train()` and `test()` in `main.py` - -You might find helpful with the paper *Dynamic Routing Between Capsules* -at your hand for referencing when reading these codes. -""" - -import os -import time -from tqdm import * - -import torch -import torch_extras -import torchvision.utils as vutils -from tensorboardX import SummaryWriter -from torch.autograd import Variable - -from CapsNet import CapsNet -from utils import get_opts, get_dataloader - -# PyTorch does not provide one-hot vector conversion, we achieve this -# by pytorch-extras -setattr(torch, 'one_hot', torch_extras.one_hot) - - -def train(opt, train_loader, test_loader, model, writer): - num_data = len(train_loader.dataset) - num_batches = len(train_loader) - - optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr) - scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5) - - model.train() - for epoch in range(opt.epochs): - # Update learning rate - scheduler.step() - print('Learning rate: {}'.format(scheduler.get_lr()[0])) - - start_time = time.time() - for batch_idx, (data, target) in enumerate(tqdm(train_loader)): - batch_size = data.size(0) - global_step = batch_idx + epoch * num_batches - - # Transform to one-hot indices: [batch_size, 10] - target = torch.one_hot((batch_size, 10), target.view(-1, 1)) - assert target.size() == torch.Size([batch_size, 10]) - - # Use GPU if available - data, target = Variable(data), Variable(target) - if opt.use_cuda & torch.cuda.is_available(): - data, target = data.cuda(), target.cuda() - - # Train step - optimizer.zero_grad() - output = model(data) - - L, m_loss, r_loss = model.loss(output, target, data) - L.backward() - - optimizer.step() - - # Log losses - writer.add_scalar('train/loss', L.item(), global_step) - writer.add_scalar('train/marginal_loss', m_loss.item(), global_step) - writer.add_scalar('train/reconstruction_loss', r_loss.item(), global_step) - - # Print losses - if batch_idx % opt.print_every == 0: - tqdm.write('Epoch: {} Loss: {:.6f} Marginal loss: {:.6f} Recons. loss: {:.6f}'.format( - epoch, L.item(), m_loss.item(), r_loss.item())) - - # Print time elapsed for every epoch - end_time = time.time() - print('Epoch {} takes {:.0f} seconds.'.format( - epoch, end_time - start_time)) - - # Test model - test(opt, test_loader, model, writer, epoch, num_batches) - - - -def test(opt, test_loader, model, writer, epoch, num_batches): - loss = 0 - margin_loss = 0 - recons_loss = 0 - - correct = 0 - - step = epoch * num_batches + num_batches - model.eval() - for data, target in test_loader: - # Store the indices for calculating accuracy - label = target.unsqueeze(0).type(torch.LongTensor) - - batch_size = data.size(0) - # Transform to one-hot indices: [batch_size, 10] - target = torch.one_hot((batch_size, 10), target.view(-1, 1)) - assert target.size() == torch.Size([batch_size, 10]) - - # Use GPU if available - data, target = Variable(data, volatile=True), Variable(target) - if opt.use_cuda & torch.cuda.is_available(): - data, target = data.cuda(), target.cuda() - - # Output predictions - output = model(data) - L, m_loss, r_loss = model.loss(output, target, data) - loss += L.item() - margin_loss += m_loss.item() - recons_loss += r_loss.item() - - # Count correct numbers - # norms: [batch_size, 10, 16] - norms = torch.sqrt(torch.sum(output**2, dim=2)) - # pred: [batch_size,] - pred = norms.data.max(1, keepdim=True)[1].type(torch.LongTensor) - correct += pred.eq(label.view_as(pred)).cpu().sum() - - # Visualize reconstructed images of the last batch - recons = model.Decoder(output, target) - recons = recons.view(batch_size, 1, 28, 28) - recons = vutils.make_grid(recons.data, normalize=True, scale_each=True) - writer.add_image('Image-{}'.format(step), recons, step) - - # Log test losses - loss /= len(test_loader) - margin_loss /= len(test_loader) - recons_loss /= len(test_loader) - acc = correct / len(test_loader.dataset) - writer.add_scalar('test/loss', loss.item(), step) - writer.add_scalar('test/marginal_loss', margin_loss.item(), step) - writer.add_scalar('test/reconstruction_loss', recons_loss.item(), step) - writer.add_scalar('test/accuracy', acc, step) - - # Print test losses - print('\nTest loss: {:.4f} Marginal loss: {:.4f} Recons loss: {:.4f}'.format( - loss.item(), margin_loss.item(), recons_loss.item())) - print('Accuracy: {}/{} ({:.0f}%)\n'.format(correct, len(test_loader.dataset), - 100. * correct / len(test_loader.dataset))) - - # Checkpoint model - torch.save(model, './ckpt/epoch_{}-loss_{:.6f}-acc_{:.6f}.pt'.format( - epoch, loss.item(), acc)) - - -if __name__ == "__main__": - # Default configurations - opt = get_opts() - train_loader, test_loader = get_dataloader(opt) - - # Initialize CapsNet - model = CapsNet(opt) - - # Enable GPU usage - if opt.use_cuda & torch.cuda.is_available(): - model.cuda() - - # Print the model architecture and parameters - print("Model architectures: ") - print(model) - - print("\nSizes of parameters: ") - for name, param in model.named_parameters(): - print("{}: {}".format(name, list(param.size()))) - n_params = sum([p.nelement() for p in model.parameters()]) - # The coupling coefficients b_ij are not included in the parameter list, - # we need to add them mannually, which is 1152 * 10 = 11520. - print('\nTotal number of parameters: %d \n' % (n_params+11520)) - - # Make model checkpoint directory - if not os.path.exists('ckpt'): - os.makedirs('ckpt') - - # Start training - writer = SummaryWriter() - train(opt, train_loader, test_loader, model, writer) - writer.close()