## Import modules

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from torch.autograd import Variable

## Data generation

In [60]:
def generate_data(size, length):
    data = torch.rand(size, length).round()
    target = torch.fmod(torch.sum(data, 1), 2).squeeze()
    target = target.long()

    return data, target

## Model Definition

In [91]:
class Net(nn.Module):
    def __init__(self, length, num_units, num_layers, num_classes):
        super(Net, self).__init__()
        self.m0 = nn.Linear(length, num_units)
        self.h0 = nn.Linear(length, num_units)
        self.num_layers = num_layers

        # create the forget, input, candidate, output linearities for each layer

        self.forget_lin = [nn.Linear(num_units, num_units).cuda() for i in range(num_layers)]
        self.input_lin = [nn.Linear(num_units, num_units).cuda() for i in range(num_layers)]
        self.candidate_lin = [nn.Linear(num_units, num_units).cuda() for i in range(num_layers)]
        self.output_lin = [nn.Linear(num_units, num_units).cuda() for i in range(num_layers)]

        self.activation = nn.Linear(num_units + num_units, num_classes)

    def step(self, h, m, forget_lin_, input_lin_, candidate_lin_, output_lin_):
        # create the gates for each linearites, using the sigmoid as the activation function
        forget_gate = F.sigmoid(forget_lin_(h))
        input_gate = F.sigmoid(input_lin_(h))
        candidate = F.tanh(candidate_lin_(h))
        output_gate = F.sigmoid(output_lin_(h))

        # memory vector m, and a hidden vector h
        m = (input_gate * candidate) + (forget_gate * m)
        h = F.tanh(output_gate * m)

        return h, m

    def forward(self, inputs, hidden=None, force=True, steps=0):
        if force or steps == 0: steps = len(inputs)
        outputs = Variable(torch.zeros(steps, 1, 1))

        m = self.m0(inputs)
        h = self.h0(inputs)

        for i in range(self.num_layers):
            # for each layer, compute the vectors h, m, which are the inputs fed into the following layer
            forget_lin_ = self.forget_lin[i]
            input_lin_ = self.input_lin[i]
            candidate_lin_ = self.candidate_lin[i]
            output_lin_ = self.output_lin[i]
            h, m = self.step(h, m, forget_lin_, input_lin_, candidate_lin_, output_lin_)

        outputs = F.log_softmax(self.activation(torch.cat((h, m), 1)), dim=1)
        return outputs

## Model Init

In [136]:
LEARNING_RATE = 0.5
NUM_UNITS = 1
NUM_CLASSES = 2
NUM_LAYERS = 1
LENGTH = 30

model = Net(length=LENGTH, num_units=NUM_UNITS, num_layers=NUM_LAYERS, num_classes=NUM_CLASSES)
model.cuda()
opt = optim.Adam(model.parameters(), lr=LEARNING_RATE)

## Model Train

In [151]:
def train(epoch, log_interval):
    TOTAL_SIZE = 100000
    BATCH_SIZE = 10000
    NUM_BATCHES = TOTAL_SIZE // BATCH_SIZE
    
    d, t = generate_data(size=TOTAL_SIZE, length=LENGTH)
    print("shape of data:", d.shape)
    print("hidden units:", NUM_UNITS, ", layers:", NUM_LAYERS, ", sequence length:", LENGTH)
    idx = torch.randperm(TOTAL_SIZE)
    
    print("Running on train data")
    for epoch in range(1, EPOCH + 1):
        model.train()
        epoch_loss = 0
        idx = torch.randperm(TOTAL_SIZE)
        for i in range(NUM_BATCHES):
            batch_idx = idx[i * BATCH_SIZE:(i + 1) * BATCH_SIZE]
            data, target = d[batch_idx], t[batch_idx]

            data, target = data.cuda(), target.cuda()
            data, target = Variable(data), Variable(target)

            opt.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            opt.step()
            epoch_loss = loss.data.item()
            
        if epoch % log_interval == 0:
            print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch, epoch_loss))
    print('Final Epoch: {} \tLoss: {:.6f}'.format(epoch, epoch_loss))


def test():
    print("Running on test data")
    TEST_SIZE = 5000
    BATCH_SIZE = 100
    NUM_BATCHES = TEST_SIZE // BATCH_SIZE
    model.eval()
    test_loss = 0
    correct = 0
    data, target = generate_data(size = TEST_SIZE, length=LENGTH)
    batches = int(TEST_SIZE / BATCH_SIZE)

    for i in range(batches):
        d = data[i * BATCH_SIZE:(i + 1) * BATCH_SIZE]
        t = target[i * BATCH_SIZE:(i + 1) * BATCH_SIZE]
        d, t = d.cuda(), t.cuda()
        d, t = Variable(d, volatile=True), Variable(t, volatile=True)
        out = model(d)
        test_loss += F.nll_loss(out, t).data.item()
        pred = out.data.max(1, keepdim=True)[1]
        correct += pred.eq(t.data.view_as(pred)).cpu().sum().item()
    print("Test loss: ", test_loss / batches)
    print("Number correct: ", correct / TEST_SIZE)


# Experiments

### NUM_UNITS=100, LENGTH=15

In [147]:
LEARNING_RATE = 0.005
NUM_UNITS = 100
NUM_CLASSES = 2
NUM_LAYERS = 1
LENGTH = 15
EPOCH = 1000
LOG_EVERY_EPOCH = 100

model = Net(length=LENGTH, num_units=NUM_UNITS, num_layers=NUM_LAYERS, num_classes=NUM_CLASSES)
model.cuda()
opt = optim.Adam(model.parameters(), lr=LEARNING_RATE)

train(EPOCH, LOG_EVERY_EPOCH)

shape of data: torch.Size([100000, 15])
hidden units: 100 layers 1 sequence length: 15
Running on train data
Train Epoch: 100 	Loss: 0.693226
Train Epoch: 200 	Loss: 0.691268
Train Epoch: 300 	Loss: 0.041477
Train Epoch: 400 	Loss: 0.013026
Train Epoch: 500 	Loss: 0.005603
Train Epoch: 600 	Loss: 0.002506
Train Epoch: 700 	Loss: 0.002005
Train Epoch: 800 	Loss: 0.000566
Train Epoch: 900 	Loss: 0.000219
Train Epoch: 1000 	Loss: 0.000282
Final Epoch: 1000 	Loss: 0.000282


## NUM_UNITS=100, LENGTH=20

In [154]:
LEARNING_RATE = 0.005
NUM_UNITS = 100
NUM_CLASSES = 2
NUM_LAYERS = 1
LENGTH = 20
EPOCH = 1000
LOG_EVERY_EPOCH = 100

model = Net(length=LENGTH, num_units=NUM_UNITS, num_layers=NUM_LAYERS, num_classes=NUM_CLASSES)
model.cuda()
opt = optim.Adam(model.parameters(), lr=LEARNING_RATE)

train(EPOCH, LOG_EVERY_EPOCH)
test()

shape of data: torch.Size([100000, 20])
hidden units: 100 , layers: 1 , sequence length: 20
Running on train data
Train Epoch: 100 	Loss: 0.691040
Train Epoch: 200 	Loss: 0.687722
Train Epoch: 300 	Loss: 0.680695
Train Epoch: 400 	Loss: 0.671677
Train Epoch: 500 	Loss: 0.427290
Train Epoch: 600 	Loss: 0.147723
Train Epoch: 700 	Loss: 0.050909
Train Epoch: 800 	Loss: 0.024202
Train Epoch: 900 	Loss: 0.010777
Train Epoch: 1000 	Loss: 0.006203
Train Epoch: 1100 	Loss: 0.004822
Train Epoch: 1200 	Loss: 0.002946
Train Epoch: 1300 	Loss: 0.001825
Train Epoch: 1400 	Loss: 0.000892
Train Epoch: 1500 	Loss: 0.000643
Train Epoch: 1600 	Loss: 0.000374


KeyboardInterrupt: 

## NUM_UNITS=500, LENGTH=25

In [None]:
LEARNING_RATE = 0.005
NUM_UNITS = 500
NUM_CLASSES = 2
NUM_LAYERS = 1
LENGTH = 25
EPOCH = 5000
LOG_EVERY_EPOCH = 100

model = Net(length=LENGTH, num_units=NUM_UNITS, num_layers=NUM_LAYERS, num_classes=NUM_CLASSES)
model.cuda()
opt = optim.Adam(model.parameters(), lr=LEARNING_RATE)

train(EPOCH, LOG_EVERY_EPOCH)
test()

shape of data: torch.Size([100000, 25])
hidden units: 500 , layers: 1 , sequence length: 25
Running on train data
Train Epoch: 100 	Loss: 0.690813
Train Epoch: 200 	Loss: 0.684318
Train Epoch: 300 	Loss: 0.668866
Train Epoch: 400 	Loss: 0.637986
Train Epoch: 500 	Loss: 0.610248
Train Epoch: 600 	Loss: 0.570222
Train Epoch: 700 	Loss: 0.550168
Train Epoch: 800 	Loss: 0.508000
Train Epoch: 900 	Loss: 0.495314
Train Epoch: 1000 	Loss: 0.469450
Train Epoch: 1100 	Loss: 0.458772
Train Epoch: 1200 	Loss: 0.442051
Train Epoch: 1300 	Loss: 0.422570
Train Epoch: 1400 	Loss: 0.414792
Train Epoch: 1500 	Loss: 0.406297
Train Epoch: 1600 	Loss: 0.388922
Train Epoch: 1700 	Loss: 0.376663
Train Epoch: 1800 	Loss: 0.373813
Train Epoch: 1900 	Loss: 0.375763
Train Epoch: 2000 	Loss: 0.357228
Train Epoch: 2100 	Loss: 0.368847
Train Epoch: 2200 	Loss: 0.360956
Train Epoch: 2300 	Loss: 0.338571
Train Epoch: 2400 	Loss: 0.344475
Train Epoch: 2500 	Loss: 0.333713
Train Epoch: 2600 	Loss: 0.320933
Train Epoch