# Relational RNNs by Adam Santoro et al. in PyTorch



In [1]:
import numpy as np

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [154]:
class RelationalMemory(nn.Module):
    """
    Args:
        input_size: 
        mem_slots: The total number of memory slots to use.
        head_size: The size of an attention head.
        num_heads: The number of attention heads to use. Defaults to 1.
        num_blocks: Number of times to compute attention per time step. Defaults to 1.
        forget_bias:
        input_bias:
        gate_style:
        attention_mlp_layers:
        key_size:
    """
    def __init__(self, input_size,
                 mem_slots, head_size, num_heads=1, num_blocks=1,
                 forget_bias=1.0, input_bias=0.0, gate_style='unit',
                 attention_mlp_layers=2, key_size=None):
        super(RelationalMemory, self).__init__()
        
        self._mem_slots = mem_slots
        self._head_size = head_size
        self._num_heads = num_heads
        self._mem_size = self._head_size * self._num_heads

        if num_blocks < 1:
            raise ValueError('num_blocks must be >= 1. Got: {}.'.format(num_blocks))
        self._num_blocks = num_blocks

        self._forget_bias = forget_bias
        self._input_bias = input_bias

        if gate_style not in ['unit', 'memory', None]:
            raise ValueError(
                'gate_style must be one of [\'unit\', \'memory\', None]. Got: '
                '{}.'.format(gate_style))
        self._gate_style = gate_style

        if attention_mlp_layers < 1:
            raise ValueError('attention_mlp_layers must be >= 1. Got: {}.'.format(
                attention_mlp_layers))
        self._attention_mlp_layers = attention_mlp_layers

        self._key_size = key_size if key_size else self._head_size

        self._linear = nn.Linear(in_features=input_size,
                                 out_features=self._mem_size)
        
        qkv_size = 2 * self._key_size + self._head_size
        total_size = qkv_size * self._num_heads
        self._attention_linear = nn.Linear(in_features=self._mem_size,
                                           out_features=total_size)
        self._attention_layer_norm = nn.LayerNorm(total_size)
        
        attention_mlp_module = nn.ModuleList([nn.Sequential(
                nn.Linear(in_features=self._mem_size,
                          out_features=self._mem_size),
                nn.ReLU())] * (self._attention_mlp_layers - 1) +
                [nn.Linear(in_features=self._mem_size,
                           out_features=self._mem_size)]
        )
        self._attention_mlp = nn.Sequential(*attention_mlp_module)
        
        self._attend_layer_norm_1 = nn.LayerNorm(self._mem_size)
        self._attend_layer_norm_2 = nn.LayerNorm(self._mem_size)
        
        num_gates = 2 * self._calculate_gate_size()
        self._gate_inputs_linear = nn.Linear(in_features=self._mem_size,
                                             out_features=num_gates)
        
        self._gate_memory_linear = nn.Linear(in_features=self._mem_size,
                                             out_features=num_gates)
        
        
    def initial_state(self, batch_size):
        """Creates the initial memory.

        We should ensure each row of the memory is initialized to be unique,
        so initialize the matrix to be the identity. We then pad or truncate
        as necessary so that init_state is of size (batch_size, self._mem_slots, self._mem_size).

        Returns:
            init_state: A truncated or padded matrix of size
            (batch_size, self._mem_slots, self._mem_size).
        """    
        init_state = torch.eye(n=self._mem_slots).repeat(batch_size, 1, 1)
        
        # Pad the matrix with zeros.
        if self._mem_size > self._mem_slots:
            difference = self._mem_size - self._mem_slots
            pad = torch.zeros((batch_size, self._mem_slots, difference))
            init_state = torch.cat([init_state, pad], dim=-1)
        # Truncation. Take the first `self._mem_size` components.
        elif self._mem_size < self._mem_slots:
            init_state = init_state[:, :, :self._mem_size]
        
        return init_state
        

    def _multihead_attention(self, memory): # memory: [B, MEM_SLOT, MEM_SIZE]
        # F = total_size
        # mem_slots = MEM_SLOT = N
        mem_slots = memory.size(1)
        
        # [B, MEM_SLOT, MEM_SIZE] -> [B*MEM_SLOT, MEM_SIZE] -> Linear -> [B*MEM_SLOT, F]
        qkv = self._attention_linear(memory.view(-1, memory.size(2)))
        
        # [B*MEM_SLOT, F] -> Layer Norm -> [B*MEM_SLOT, F] -> [B, MEM_SLOT, F]
        qkv = self._attention_layer_norm(qkv).view(memory.size(0), mem_slots, -1)
        
        # H = num_heads
        qkv_size = 2 * self._key_size + self._head_size
        
        # [B, N, F] -> [B, N, H, F/H]
        qkv_reshape = qkv.view(-1, mem_slots, self._num_heads, qkv_size)
        
        # [B, N, H, F/H] -> [B, H, N, F/H]
        qkv_transpose = qkv_reshape.permute(0, 2, 1, 3)
        # split q, k, v
        q, k, v = torch.split(qkv_transpose, [self._key_size, self._key_size, self._head_size], dim=-1)
        
        q *= qkv_size ** -0.5
        dot_product = torch.matmul(q, torch.transpose(k, 2, 3)) # [B, H, N, N]
        weights = F.softmax(dot_product, dim=-1)
        
        #[B, H, N, V]
        output = torch.matmul(weights, v)
        
        # [B, H, N, V] -> [B, N, H, V]
        output_transpose = output.permute(0, 2, 1, 3)
        
        # [B, N, H, V] -> [B, N, H * V]
        new_memory = output_transpose.contiguous().view(-1, output_transpose.size(1), 
                                                        output_transpose.size(2)*output_transpose.size(3))
        
        return new_memory #[B, MEM_SLOTS, MEM_SIZE]
    
    
    def _attend_over_memory(self, memory):
        # memory: [B, MEM_SLOT, MEM_SIZE]
        for _ in range(self._num_blocks):
            attended_memory = self._multihead_attention(memory) # [B, MEM_SLOT, MEM_SIZE]
            
            # add a skip connection the multiheaded attention's input.
            # memory = LN_1(memory + attended_memory) [B*MEM_SLOT, MEM_SIZE]
            memory = self._attend_layer_norm_1((memory + attended_memory).view(-1, memory.size(2)))
            
            # add a skip connection to the attention_mlp's input.
            # memory = LN_2( MLP(memory) + memory)
            memory = self._attend_layer_norm_2(self._attention_mlp(memory) + memory).view(-1, 
                                                                                          attended_memory.size(1),
                                                                                          attended_memory.size(2))
    
        return memory
    
    def _calculate_gate_size(self):
        if self._gate_style == 'unit':
            return self._mem_size
        elif self._gate_style == 'memory':
          return 1
        else:
            return 0
        
    def _create_gates(self, inputs, memory):
        memory = torch.tanh(memory)
        
        #inputs [B, 1, MEM_SIZE] -> [B, 1*MEM_SIZE]
        inputs = inputs.view(inputs.size(0), -1)
        
        # [B, 1*MEM_SIZE] -> Linear -> [B, num_gates] -> [B, 1, num_gates]
        gate_inputs = self._gate_inputs_linear(inputs).unsqueeze(1)
        
        # memory [B, MEM_SLOT, MEM_SIZE] -> [B*MEM_SLOT, MEM_SIZE] -> Linear -> [B*MEM_SLOT, 2*num_gates]
        # -> [B, MEM_SLOT, 2*num_gates]
        gate_memory = self._gate_memory_linear(memory.view(-1, memory.size(2))).view(memory.size(0),
                                                                                     memory.size(1),
                                                                                     -1)
        
        input_gate, forget_gate = torch.chunk(gate_memory + gate_inputs, 2, dim=2)
        
        input_gate = torch.sigmoid(input_gate + self._input_bias)
        forget_gate = torch.sigmoid(forget_gate + self._forget_bias)
        
        return input_gate, forget_gate #[B, MEM_SLOT, num_gate], [B, MEM_SLOT, num_gate]
        
                                              
    def forward(self, x, memory, treat_input_as_matrix=False):
        # x: [B, T, F=input_size]
        # memory: [B, MEM_SLOTS, MEM_SIZE]
        batch_size = x.size(0)
        total_timesteps = x.size(1)
        
        output_accumulator = x.new_zeros(batch_size, total_timesteps, self._mem_slots*self._mem_size)
        
        for index in range(total_timesteps):
            # For each time-step
            # inputs: [B, 1, F=input_size]
            inputs = x[:,index].unsqueeze(1)
            
            if treat_input_as_matrix:
                # [B, 1, F] -> [B*1, F] -> linear ->[B*1, MEM_SIZE] -> [B, 1, MEM_SIZE]
                inputs_reshape =  self._linear(inputs.view(-1, inputs.size(2))).view(inputs.size(0), 
                                                                                     -1, 
                                                                                     self._mem_size)
            else:
                # [B, 1, F] -> [B, 1*F] -> linear -> [B, MEM_SIZE] -> [B, 1, MEM_SIZE]
                inputs = inputs.view(inputs.size(0), -1)
                inputs = self._linear(inputs)
                inputs_reshape = inputs.unsqueeze(1)

            # [B, MEM_SLOTS, MEM_SIZE] -> [B, MEM_SLOT+1, MEM_SIZE]
            memory_plus_input = torch.cat([memory, inputs_reshape], dim=1)

            next_memory = self._attend_over_memory(memory_plus_input)
            n = inputs_reshape.size(1)
            # [B, MEM_SLOT+1, MEM_SIZE] -> [B, MEM_SLOT, MEM_SIZE]
            next_memory = next_memory[:, :-n, :]

            if self._gate_style == 'unit' or self._gate_style == 'memory':
                input_gate, forget_gate = self._create_gates(inputs_reshape, memory) #[B, MEM_SLOT, num_gate] 
                next_memory = input_gate * torch.tanh(next_memory)
                next_memory += forget_gate * memory
            
            
            # output: [B, MEM_SLOT, MEM_SIZE] -> [B, MEM_SLOT*MEM_SIZE]
            output = next_memory.view(next_memory.size(0), -1)
            
            import pdb
            pdb.set_trace()
            
            output_accumulator[:,index] = output.clone()
            memory = next_memory.clone()
        
            
        return output_accumulator, memory

In [155]:
mem_slots = 4
head_size = 32
num_heads = 2
batch_size = 5
input_size = 3

In [156]:
input_shape = (batch_size, 3, 3)
inputs = torch.Tensor(batch_size, 3, 3)

In [157]:
mem = RelationalMemory(input_size, mem_slots, head_size, num_heads)

In [158]:
init_state = mem.initial_state(batch_size)

In [159]:
init_state.shape

torch.Size([5, 4, 64])

In [160]:
output, next_memory = mem(inputs, init_state, treat_input_as_matrix=True)

> <ipython-input-154-df4d928e84b0>(231)forward()
-> output_accumulator[:,index] = output.clone()
(Pdb) output
tensor([[ 8.1520e-39,  2.4029e-39, -3.1129e-39,  ...,  1.9256e-39,
          6.0111e-01,  3.5943e-39],
        [ 1.0000e+00,  2.3586e-01, -3.4278e-39,  ...,  4.2129e-01,
          6.6003e-01,  8.1420e-01],
        [ 1.2292e+00,  1.6309e-01, -2.9620e-01,  ...,  7.5471e-02,
          4.4334e-01,  4.8905e-01],
        [ 1.0000e+00,  2.3586e-01, -3.4278e-39,  ...,  4.2129e-01,
          6.6003e-01,  8.1420e-01],
        [ 1.0000e+00,  2.4029e-39, -3.1129e-39,  ...,  1.9256e-39,
          2.4982e-39,  3.5943e-39]], grad_fn=<ViewBackward>)
(Pdb) next_memory
tensor([[[ 8.1520e-39,  2.4029e-39, -3.1129e-39,  ...,  1.9006e-39,
           5.7813e-01,  3.5945e-39],
         [ 1.3520e-39,  1.0000e+00, -3.1828e-39,  ...,  1.9116e-39,
           5.4240e-01,  3.5344e-39],
         [ 1.5484e-39,  2.8382e-39,  1.0000e+00,  ...,  2.6885e-39,
           7.0168e-01,  3.7286e-39],
         [ 5.3364

In [161]:
output.shape

torch.Size([5, 3, 256])

In [162]:
next_memory.shape

torch.Size([5, 4, 64])

In [151]:
init_state.shape

torch.Size([5, 4, 64])

In [152]:
output

tensor([[[ 1.1466e+00, -4.0538e-02, -6.4860e-02,  ...,  3.7660e-01,
           4.0484e-01, -3.2260e-01],
         [ 1.3230e+00, -2.9083e-01, -4.1117e-01,  ...,  3.6210e-01,
           5.4113e-01, -5.0793e-01],
         [ 1.4667e+00, -3.7545e-01, -6.5924e-01,  ...,  1.9481e-01,
           5.6986e-01, -7.0246e-01]],

        [[ 9.6494e-01,  1.2526e-01,  3.2965e-01,  ...,  7.4273e-40,
           3.8232e-39, -7.1376e-01],
         [ 1.1437e+00, -1.1511e-01,  6.7471e-03,  ..., -4.6366e-01,
          -2.0088e-01, -8.3691e-01],
         [ 1.3293e+00, -3.5907e-01, -2.7712e-01,  ..., -9.8271e-01,
          -3.6156e-01, -9.1689e-01]],

        [[ 1.1466e+00, -4.0538e-02, -6.4860e-02,  ...,  3.7660e-01,
           4.0484e-01, -3.2260e-01],
         [ 1.3230e+00, -2.9083e-01, -4.1117e-01,  ...,  3.6210e-01,
           5.4113e-01, -5.0793e-01],
         [ 1.4667e+00, -3.7545e-01, -6.5924e-01,  ...,  1.9481e-01,
           5.6986e-01, -7.0246e-01]],

        [[ 1.1466e+00, -4.0538e-02, -6.4860e-02, 

In [153]:
next_memory

tensor([[[ 1.4667, -0.3754, -0.6592,  ...,  0.1249,  0.5104, -0.7486],
         [ 1.0975,  0.7981, -0.7753,  ...,  0.2788,  0.5345, -0.7651],
         [ 1.0546, -0.4408,  0.8206,  ...,  0.0380,  0.4739, -0.7498],
         [ 1.0674, -0.3984, -0.7312,  ...,  0.1948,  0.5699, -0.7025]],

        [[ 1.3293, -0.3591, -0.2771,  ..., -0.9504, -0.3630, -0.9444],
         [ 1.1170,  0.5842, -0.5348,  ..., -0.9578, -0.4060, -0.9993],
         [ 0.9862, -0.2343,  0.4565,  ..., -0.9901, -0.3731, -1.0200],
         [ 0.9264, -0.2506, -0.4436,  ..., -0.9827, -0.3616, -0.9169]],

        [[ 1.4667, -0.3754, -0.6592,  ...,  0.1249,  0.5104, -0.7486],
         [ 1.0975,  0.7981, -0.7753,  ...,  0.2788,  0.5345, -0.7651],
         [ 1.0546, -0.4408,  0.8206,  ...,  0.0380,  0.4739, -0.7498],
         [ 1.0674, -0.3984, -0.7312,  ...,  0.1948,  0.5699, -0.7025]],

        [[ 1.4667, -0.3754, -0.6592,  ...,  0.1249,  0.5104, -0.7486],
         [ 1.0975,  0.7981, -0.7753,  ...,  0.2788,  0.5345, -0.7651],
