pose matrix
4x3 rotation [3x3 with zeros below]
3x1 translation [t1,t2,t3,1]
The above described mechanism is not very good, because max pooling loses valuable 
information and also does not encode relative spatial relationships between features

activities equivariance

## What is a capsule

* length of it's output vector is prob detection of feature it represents
* state of detected feature is direction vector points to instantiation parameters
* say capsule says 99% of seeing an object and it moves across image, instantiation parameters change  p(detection don't change
*  This is what Hinton refers to as activities equivariance: neuronal activities will change when an object “moves over the manifold of possible appearances” in the picture

### What are its inputs and outputs

In [30]:
#from fastai.vision.all import *
#matplotlib.rc('image', cmap='Greys')

In [31]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.optim import Adam
from torchvision import datasets, transforms

USE_CUDA = True

In [32]:
class Mnist:
    def __init__(self, batch_size):
        dataset_transform = transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ])

        train_dataset = datasets.MNIST('../data', train=True, download=True, transform=dataset_transform)
        test_dataset = datasets.MNIST('../data', train=False, download=True, transform=dataset_transform)
        
        self.train_loader  = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        self.test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)        

In [33]:
class ConvLayer(nn.Module):
    def __init__(self, in_channels=1, out_channels=256, kernel_size=9):
        super(ConvLayer, self).__init__()

        self.conv = nn.Conv2d(in_channels=in_channels,
                               out_channels=out_channels,
                               kernel_size=kernel_size,
                               stride=1
                             )

    def forward(self, x):
        return F.relu(self.conv(x))

In [34]:
# #hide
# from fastai.vision.all import *
# #from utils import *

# matplotlib.rc('image', cmap='Greys')


# a = tensor([[1,2,3],
#             [4,5,6]])

# b = tensor([[7,8,9],
#             [10,11,12]])
# torch.stack([a,b],dim=1)

In [35]:
class PrimaryCaps(nn.Module):
    def __init__(self, num_capsules=8, in_channels=256, out_channels=32, kernel_size=9):
        super(PrimaryCaps, self).__init__()
        
        #ModuleList lets you use arbitrary numbers of moduls,
        # they are registered with the nn.module and thus will show up in model.parameters() for optimizers
        # they don't get forward automatically made like nn.Sequential
        self.capsules = nn.ModuleList([
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=2, padding=0) 
                          for _ in range(num_capsules)])
        print(len(self.capsules) )
    
    def forward(self, x):
        # forward propagating through each conv layer
        # (width - filtersize + 2*padding)/stride + 1 
        u = [capsule(x) for capsule in self.capsules]
        # stack on dim = 1 because dim 0 would combine batches
        u = torch.stack(u, dim=1)
        print(u.shape,"first")
        u = u.view(x.size(0), 32 * 6 * 6, -1)
        #result is 100,1152,8
        #now we squash each vector
        print(u.shape,"second")
        
        squashed = self.squash(u)
        print("squashed.shape",squashed.shape)
        #expect x.size(0) to be batch_size
        #so squashed is probability of seeing a feature
        return squashed
    
    def squash(self, input_tensor):
        squared_norm = (input_tensor ** 2).sum(-1, keepdim=True)
        print(squared_norm.shape, "sqn shape,length of output vectors, s_j**2")
        output_tensor = squared_norm *  input_tensor / ((1. + squared_norm) * torch.sqrt(squared_norm))
        return output_tensor

In [36]:
batch_size = 100
mnist = Mnist(batch_size)
adata,atarget = next(iter(mnist.train_loader) )
adata.shape,atarget.shape

(torch.Size([100, 1, 28, 28]), torch.Size([100]))

In [37]:
cl = ConvLayer()
res1 = cl(adata)
res1.shape

torch.Size([100, 256, 20, 20])

In [38]:
pc = PrimaryCaps()
res2 = pc(res1)
res2.shape

8
torch.Size([100, 8, 32, 6, 6]) first
torch.Size([100, 1152, 8]) second
torch.Size([100, 1152, 1]) sqn shape,length of output vectors, s_j**2
squashed.shape torch.Size([100, 1152, 8])


torch.Size([100, 1152, 8])

In [39]:
th = torch.tensor([[[1,1,1],[2,2,2]]]);

th.shape

torch.Size([1, 2, 3])

In [28]:
squared_norm = (th ** 2).sum(-1, keepdim=True)
squared_norm, squared_norm.shape

(tensor([[[ 3],
          [12]]]),
 torch.Size([1, 2, 1]))

In [40]:
# so each of the 1152 have an associated sum from the 8
# w.t.f does the 8 represent?

In [89]:
dc = DigitCaps()
res3 = dc(res2)

self.W.shape torch.Size([1, 1152, 10, 16, 8])
before unsq torch.Size([100, 1152, 8])
post unsq torch.Size([100, 1152, 10, 8, 1])
uhat shape torch.Size([100, 1152, 10, 16, 1])


RuntimeError: expected device cuda:0 but got device cpu

In [67]:
a = torch.randn(100, 1152, 8);
#think of this as putting the 10 stacks at dim index 2
a=torch.stack([a]*10,dim=2);a.shape

torch.Size([100, 1152, 10, 8])

In [68]:
a[0,0,:,0]

tensor([-1.2514, -1.2514, -1.2514, -1.2514, -1.2514, -1.2514, -1.2514, -1.2514,
        -1.2514, -1.2514])

In [69]:
#think of this as putting dim 1 at index 4
a.unsqueeze(4).shape

torch.Size([100, 1152, 10, 8, 1])

In [71]:
sample = torch.randn(100,1152)
sample.shape

torch.Size([100, 1152])

In [72]:
sample.unsqueeze(-1).shape

torch.Size([100, 1152, 1])

In [73]:
samplec = torch.tensor([3,4,5,6])
acat = torch.cat([samplec]*3,dim=0)
acat, acat.shape

(tensor([3, 4, 5, 6, 3, 4, 5, 6, 3, 4, 5, 6]), torch.Size([12]))

In [56]:
# whatever you're stacking, it'll be number things stacking as the new highest dim
# followed by lower dims
astack = torch.stack([samplec]*4,dim=0)
astack, astack.shape

(tensor([[3, 4, 5, 6],
         [3, 4, 5, 6],
         [3, 4, 5, 6],
         [3, 4, 5, 6]]),
 torch.Size([4, 4]))

In [80]:
bssm = F.softmax(astack.float(),dim=1)

In [83]:
torch.cat([bssm] * 7,dim=0).unsqueeze(-1).shape

torch.Size([28, 4, 1])

In [88]:
class DigitCaps(nn.Module):
    def __init__(self, num_capsules=10, num_routes=32 * 6 * 6, in_channels=8, out_channels=16):
        super(DigitCaps, self).__init__()

        self.in_channels = in_channels
        self.num_routes = num_routes
        self.num_capsules = num_capsules

        # Parameter means register it as module parameters as opposed to nn.Variable
        self.W = nn.Parameter(torch.randn(1, num_routes, num_capsules, out_channels, in_channels))
        print("self.W.shape",self.W.shape)
        # 1 x 1152 x 10 x 16 x 8
    def forward(self, x):
        batch_size = x.size(0)
        
        print("before unsq",x.shape)
        #prepping data for multiplying
        # 100 x 1152 x 8
        x = torch.stack([x] * self.num_capsules, dim=2).unsqueeze(4)
        # 100 x 1152 x 10 x 8 x 1
        print("post unsq",x.shape)
        
        #sizing this for a batch
        W = torch.cat([self.W] * batch_size, dim=0)
        # W shape 100, 1152, 10,16, 8
        # x shape 100, 1152, 10, 8, 1
        u_hat = torch.matmul(W, x)
        #u_hat shape 100,1152,16,1
        
        print("uhat shape",u_hat.shape)
        # 1 x 1152 x 10 x 1
        b_ij = Variable(torch.zeros(1, self.num_routes, self.num_capsules, 1))
        if USE_CUDA:
            b_ij = b_ij.cuda()

        num_iterations = 3
        for iteration in range(num_iterations):
            # 1
            # c_ij is the weight that multiplies by output vector i from a lower lvl capsule 
            # and sends to capspule at layer j
            # for each lower level capsule, c_ij add to 1
            # number of weigths equals number of higher level capsules
            # 2
            # C_ij is calcs for routing weights for a capsule, softmax ensures probability
            # if we have 3 lower level capsules and 2 higher level, at first iteration they'll be .5
            # this represents maximim confusion over where it'll be routed
            
            c_ij = F.softmax(b_ij,dim=1)
            #res shape we think 1 x 1152 x 10 x 1 (same shape)
             
            c_ij = torch.cat([c_ij] * batch_size, dim=0).unsqueeze(4)
            # cat makes it 100 x 1152 x 10
            # unsqueeze the catted thing 100 x 1152 x 10 x 1
            c_ij.cpu()
            u_hat.cpu()
            
            s_j = (c_ij * u_hat).sum(dim=1, keepdim=True)
            print("s_j.shape", s_j.shape)
            v_j = self.squash(s_j)
            
            if iteration < num_iterations - 1:
                #super important
                a_ij = torch.matmul(u_hat.transpose(3, 4), torch.cat([v_j] * self.num_routes, dim=1))
                b_ij = b_ij + a_ij.squeeze(4).mean(dim=0, keepdim=True)
        print("can see this")
        return v_j.squeeze(1)
    
    def squash(self, input_tensor):
        squared_norm = (input_tensor ** 2).sum(-1, keepdim=True)
        output_tensor = squared_norm *  input_tensor / ((1. + squared_norm) * torch.sqrt(squared_norm))
        return output_tensor

In [62]:
tosum = torch.tensor([[1,2,3],[2,2,2]])
tosum.sum(1)

tensor([6, 6])

In [57]:
class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        
        self.reconstraction_layers = nn.Sequential(
            nn.Linear(16 * 10, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, 784),
            nn.Sigmoid()
        )
        
    def forward(self, x, data):
        classes = torch.sqrt((x ** 2).sum(2))
        classes = F.softmax(classes,dim=1)
        
        _, max_length_indices = classes.max(dim=1)
        masked = Variable(torch.sparse.torch.eye(10))
        if USE_CUDA:
            masked = masked.cuda()
        masked = masked.index_select(dim=0, index=max_length_indices.squeeze(1).data)
        
        reconstructions = self.reconstraction_layers((x * masked[:, :, None, None]).view(x.size(0), -1))
        reconstructions = reconstructions.view(-1, 1, 28, 28)
        
        return reconstructions, masked

In [58]:
class CapsNet(nn.Module):
    def __init__(self):
        super(CapsNet, self).__init__()
        self.conv_layer = ConvLayer()
        self.primary_capsules = PrimaryCaps()
        self.digit_capsules = DigitCaps()
        self.decoder = Decoder()
        
        self.mse_loss = nn.MSELoss()
        
    def forward(self, data):
        output = self.digit_capsules(self.primary_capsules(self.conv_layer(data)))
        reconstructions, masked = self.decoder(output, data)
        return output, reconstructions, masked
    
    def loss(self, data, x, target, reconstructions):
        return self.margin_loss(x, target) + self.reconstruction_loss(data, reconstructions)
    
    def margin_loss(self, x, labels, size_average=True):
        batch_size = x.size(0)

        v_c = torch.sqrt((x**2).sum(dim=2, keepdim=True))

        left = F.relu(0.9 - v_c).view(batch_size, -1)
        right = F.relu(v_c - 0.1).view(batch_size, -1)

        loss = labels * left + 0.5 * (1.0 - labels) * right
        loss = loss.sum(dim=1).mean()

        return loss
    
    def reconstruction_loss(self, data, reconstructions):
        loss = self.mse_loss(reconstructions.view(reconstructions.size(0), -1), data.view(reconstructions.size(0), -1))
        return loss * 0.0005

In [None]:
capsule_net = CapsNet()
if USE_CUDA:
    capsule_net = capsule_net.cuda()
optimizer = Adam(capsule_net.parameters())

### capsule_net.train works recursively

In [None]:
#capsule_net.train??

In [None]:
capsule_net.children

In [11]:
batch_size = 100
mnist = Mnist(batch_size)

In [12]:
adata,atarget = first(mnist.train_loader)
adata.shape,atarget.shape

(torch.Size([100, 1, 28, 28]), torch.Size([100]))

In [15]:
adata,atarget = first(mnist.train_loader)
adata.shape,atarget.shape
cl = ConvLayer()
res1 = cl(adata)
res1.shape

torch.Size([100, 256, 20, 20])

In [None]:
show_image(adata[0]),atarget[0]

In [None]:
#somehow this works
torch.eye(4).index_select(dim=0,index=tensor([2,3,2,3]))

In [None]:
target = torch.sparse.torch.eye(10).index_select(dim=0, index=atarget); target[:5]

### Difference between nn.Parameter and torch.autograd.Variable

nn.Parameter is a subclass of nn.Variable so most behaviors are the same.
The most important difference is that if you use nn.Parameter in a nn.Module's constructor, it will be added into the modules parameters just like nn.Module object do. Here is an example:

In [None]:
import torch

class MyModule(torch.nn.Module):

    def __init__(self):
        super().__init__()
        self.variable = torch.autograd.Variable(torch.Tensor([5]))
        self.parameter = torch.nn.Parameter(torch.Tensor([10]))

net = MyModule()
for param in net.parameters():
    print(param)
    print("Notice self.varible didn't get added to the parameters list")

In [None]:



n_epochs = 5


for epoch in range(n_epochs):
    capsule_net.train()
    train_loss = 0
    for batch_id, (data, target) in enumerate(mnist.train_loader):
        #makes the batch of one hot encoded targets
        target = torch.sparse.torch.eye(10).index_select(dim=0, index=target)
        data, target = tensor(data), tensor(target)

        if USE_CUDA:
            data, target = data.cuda(), target.cuda()

        optimizer.zero_grad()
        output, reconstructions, masked = capsule_net(data)
        loss = capsule_net.loss(data, output, target, reconstructions)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        
        if batch_id % 100 == 0:
            print("train accuracy:", sum(np.argmax(masked.data.cpu().numpy(), 1) == 
                                   np.argmax(target.data.cpu().numpy(), 1)) / float(batch_size) )
        
    print (train_loss / len(mnist.train_loader) )
        
    capsule_net.eval()
    test_loss = 0
    for batch_id, (data, target) in enumerate(mnist.test_loader):

        target = torch.sparse.torch.eye(10).index_select(dim=0, index=target)
        data, target = tensor(data), tensor(target)

        if USE_CUDA:
            #all the parameters and buffers of the module are moved to the gpu.
            data, target = data.cuda(), target.cuda()

        output, reconstructions, masked = capsule_net(data)
        loss = capsule_net.loss(data, output, target, reconstructions)

        test_loss += loss.item()
        
        if batch_id % 100 == 0:
            print("test accuracy:", sum(np.argmax(masked.data.cpu().numpy(), 1) == 
                                   np.argmax(target.data.cpu().numpy(), 1)) / float(batch_size) )
    
    print(test_loss / len(mnist.test_loader) )

In [None]:
import matplotlib
import matplotlib.pyplot as plt

def plot_images_separately(images):
    "Plot the six MNIST images separately."
    fig = plt.figure()
    for j in range(1, 7):
        ax = fig.add_subplot(1, 6, j)
        ax.matshow(images[j-1], cmap = matplotlib.cm.binary)
        plt.xticks(np.array([]))
        plt.yticks(np.array([]))
    plt.show()

In [None]:
plot_images_separately(data[:6,0].data.cpu().numpy())

In [None]:
plot_images_separately(reconstructions[:6,0].data.cpu().numpy())