In [1]:
import numpy as np
import pickle as pkl
import torch
import torch.nn as nn
import torch.nn.functional as F

These are data generation. Don't change. Training data are pairs of neighbor numbers. Validation data are pairs of random numbers.  
The purpose of this setup is that if the model can generalize the relationship of the numbers from simple training data, then it is possible to do numerical reasoning whose training is on limited combination of numbers. 

In [2]:
x1 = [idx for idx in range(0, 15000)] * 2 + [idx+1 for idx in range(0, 15000)]
x2 = [idx for idx in range(0, 15000)] + [idx+1 for idx in range(0, 15000)] + [idx for idx in range(0, 15000)]
x1, x2 = np.array(x1), np.array(x2)
y = (x1 >= x2) * 1 + (x1 > x2) * 1
train_data = [x1, x2, y]
train_data = zip(*train_data)
train_data = sorted(train_data, key=lambda x: x[0])
train_data = list(zip(*train_data))

In [3]:
x1 = np.random.randint(0, 15000, 10000)
x2 = np.random.randint(0, 15000, 10000)
y = (x1 >= x2) * 1 + (x1 > x2) * 1
val_data = [x1, x2, y]

In [4]:
import itertools
import random
int_pair_list = list(itertools.permutations(list(range(300)), 2)) + [(i,i) for i in range(300)]*30
random.shuffle(int_pair_list)
x1 = [x[0] for x in int_pair_list]
x2 = [x[1] for x in int_pair_list]
x1, x2 = np.array(x1), np.array(x2)
y = (x1 >= x2) * 1 + (x1 > x2) * 1
train_data_JP = [x1[:70000], x2[:70000], y[:70000]]
val_data_JP = [x1[70000:], x2[70000:], y[70000:]]

In [71]:
x1 = [idx for idx in range(0, 150)] * 2 + [idx+1 for idx in range(0, 150)]
x2 = [idx for idx in range(0, 150)] + [idx+1 for idx in range(0, 150)] + [idx for idx in range(0, 150)]
x1, x2 = np.array(x1), np.array(x2)
y = (x1 >= x2) * 1 + (x1 > x2) * 1
train_data_short = [x1, x2, y]
train_data_short = zip(*train_data_short)
train_data_short = sorted(train_data_short, key=lambda x: x[0])
train_data_short = list(zip(*train_data_short))

In [6]:
DEVICE = "cuda"
BSZ = 450

These are the train and validate functions. Don't need to change unless want to try variations of training paradigm such as using multiclass SVM loss.

In [56]:
import torch
import torch.nn as nn
import torch.nn.functional as F

def train(model, train_loader, val_loader, fail_tol, learning_rate=3e-4, label=""):

    num_epochs = 1000

#     criterion = torch.nn.MultiMarginLoss()
    criterion = torch.nn.CrossEntropyLoss()
#     optimizer = torch.optim.LBFGS(model.parameters(), lr=learning_rate)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
#     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=3, factor=0.1)
    
    def closure():
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, labels)
            for param in model.parameters():
                loss += 1e-3 * torch.norm(param)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 3)
            return loss
    
    total_step = len(train_loader)
    loss_list, val_acc_list = [], []
    
    fail_cnt, cur_best = 0, 0
    for epoch in range(num_epochs+1):
        
        avg_loss = 0.
        if epoch > 0:
            for i, (x, labels) in enumerate(train_loader):
                model.train()

                with torch.no_grad():
                    outputs = model(x)
                    obj = criterion(outputs, labels)                
                optimizer.step(closure)

                avg_loss += obj.item() / len(train_loader)
            
        val_acc = test_model(val_loader, model)
        train_acc = test_model(train_loader, model)
        val_acc_list.append(val_acc)
        loss_list.append(avg_loss)

        if (val_acc > cur_best):
            print('Epoch: [{}/{}], Loss: {:.4}, Train acc: {:.4}, Val acc: {:.4}'.format(
                epoch, num_epochs, avg_loss, train_acc, val_acc))
            print("found best! save model...")
            torch.save(model.state_dict(), 'model' + "-" + label + '.ckpt')
            cur_best = val_acc
            fail_cnt = 0
        else:
            fail_cnt += 1
            print('Epoch: [{}/{}], Loss: {:.4}, Train acc: {:.4}, Val acc: {:.4} ({}/{})'.format(
                epoch, num_epochs, avg_loss, train_acc, val_acc, fail_cnt, fail_tol))
        if fail_cnt > fail_tol:
            return loss_list, val_acc_list

#         scheduler.step(val_acc)
    return loss_list, val_acc_list

def test_model(loader, model):
    from collections import Counter
    res_cnt = Counter()
    correct = 0
    total = 0
    model.eval()
    for x, labels in loader:
        outputs = model(x)
        predicted = outputs.max(1, keepdim=True)[1]
#         labels = labels.max(1)[1]
        res_cnt.update(list(predicted.squeeze().cpu().numpy()))
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    print(res_cnt)
    return (100 * correct / total)

There are two ways to input the pairs. Concatenation ($N\times 2$) and subtraction ($N\times 1$). Although subtraction should presumably be learned (with concatenation), it doesn't seem to work that way.

In [8]:
from torch.utils.data import Dataset
class numDataset(Dataset):
    def __init__(self, data_list, device=DEVICE):
        self.s1_list, self.s2_list, self.target_list = data_list
        self.device = device
        assert (len(self.s1_list) == len(self.target_list))

    def __len__(self):
        return len(self.target_list)
        
    def __getitem__(self, key):    

        s1_idx = self.s1_list[key]
        s2_idx = self.s2_list[key]       
        label = self.target_list[key]

        return [s1_idx, s2_idx, label, self.device]
    
def collate_func(batch):
    device = batch[0][3]
    data_list, label_list = [], []
    for datum in batch:
        # Can change comma to minus (or minus to comma) in the next line
        data_list.append([datum[0] , datum[1]])
        label_list.append(datum[2])

    return [torch.FloatTensor(np.array(data_list)).to(device), 
            torch.LongTensor(label_list).to(device)]

In [72]:
train_dataset = numDataset(train_data)
val_dataset = numDataset(val_data)
train_dataset_short = numDataset(train_data_short)
train_dataset_JP = numDataset(train_data_JP)
val_dataset_JP = numDataset(val_data_JP)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BSZ,
                                           collate_fn=collate_func,
                                           shuffle=True)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BSZ,
                                           collate_fn=collate_func,
                                           shuffle=False)
train_loader_short = torch.utils.data.DataLoader(dataset=train_dataset_short,
                                           batch_size=BSZ,
                                           collate_fn=collate_func,
                                           shuffle=True)
train_loader_JP = torch.utils.data.DataLoader(dataset=train_dataset_JP,
                                           batch_size=BSZ,
                                           collate_fn=collate_func,
                                           shuffle=True)
val_loader_JP = torch.utils.data.DataLoader(dataset=val_dataset_JP,
                                           batch_size=BSZ,
                                           collate_fn=collate_func,
                                           shuffle=False)

If using subtraction, input is 1d. Concatenation is 2d.

In [57]:
class fcNet(nn.Module):
    def __init__(self, n_layers, fc_hid_dim, device=DEVICE):
        super(fcNet, self).__init__()
        self.device = device
        self.fc_hid_dim = fc_hid_dim
        self.n_layers = n_layers

        # Specify the input dimension here
        if n_layers == 1:
            self.linears = nn.ModuleList([nn.Linear(2,3)])
        else:
            self.linears = nn.ModuleList([nn.Sequential(nn.Linear(2, fc_hid_dim), nn.Tanh())]+
                                         [nn.Sequential(nn.Linear(fc_hid_dim, fc_hid_dim), nn.Tanh())] * (n_layers-2)+
                                         [nn.Linear(fc_hid_dim, 3)])
        self.init_weights()
    def forward(self, x):
        for linear in self.linears:
            x = linear(x)
        return x
    
    def init_weights(self):
        initrange = 0.1
        lin_layers = [layer if type(layer) == torch.nn.modules.linear.Linear else layer[0] for layer in self.linears]
     
        for layer in lin_layers:
            layer.weight.data.uniform_(-initrange, initrange)
#             layer.weight.data.zero_()
            if layer in lin_layers:
                layer.bias.data.fill_(0)

Training is here. Feel free to change the hyperparameters.

In [78]:
n_layers = 4
fc_hid_dim = 32
model = fcNet(n_layers, fc_hid_dim).to(DEVICE)
# model.linears[0].weight.data = torch.tensor([[-5.,5.],
#                                             [1.,-1.],
#                                             [5.,-5.]]).to(DEVICE)
# model.linears[0].bias.data = torch.tensor([0.,2.,0.]).to(DEVICE)
print(model)
res = train(model, train_loader_JP, train_loader, 66, learning_rate=3e-4, label="")

fcNet(
  (linears): ModuleList(
    (0): Sequential(
      (0): Linear(in_features=2, out_features=32, bias=True)
      (1): Tanh()
    )
    (1): Sequential(
      (0): Linear(in_features=32, out_features=32, bias=True)
      (1): Tanh()
    )
    (2): Sequential(
      (0): Linear(in_features=32, out_features=32, bias=True)
      (1): Tanh()
    )
    (3): Linear(in_features=32, out_features=3, bias=True)
  )
)
Counter({2: 44995, 0: 5})
Counter({2: 54040, 0: 11785, 1: 4175})
Epoch: [0/1000], Loss: 0.0, Train acc: 35.08, Val acc: 33.32
found best! save model...
Counter({0: 44689, 2: 311})
Counter({0: 37546, 2: 32454})
Epoch: [1/1000], Loss: 0.7384, Train acc: 90.81, Val acc: 33.96
found best! save model...
Counter({2: 44275, 1: 645, 0: 80})
Counter({2: 32593, 0: 31370, 1: 6037})
Epoch: [2/1000], Loss: 0.212, Train acc: 98.54, Val acc: 34.13
found best! save model...
Counter({2: 41749, 1: 3242, 0: 9})
Counter({2: 31676, 0: 31274, 1: 7050})
Epoch: [3/1000], Loss: 0.1239, Train acc: 98.7

Counter({0: 44268, 1: 634, 2: 98})
Counter({0: 33731, 2: 31744, 1: 4525})
Epoch: [52/1000], Loss: 0.004582, Train acc: 96.88, Val acc: 34.0 (8/66)
Counter({1: 23778, 2: 20025, 0: 1197})
Counter({2: 31888, 0: 31688, 1: 6424})
Epoch: [53/1000], Loss: 0.003513, Train acc: 100.0, Val acc: 58.16 (9/66)
Counter({1: 42413, 0: 1979, 2: 608})
Counter({2: 31888, 0: 31688, 1: 6424})
Epoch: [54/1000], Loss: 0.001526, Train acc: 100.0, Val acc: 39.08 (10/66)
Counter({2: 35378, 1: 8380, 0: 1242})
Counter({2: 31888, 0: 31688, 1: 6424})
Epoch: [55/1000], Loss: 0.001129, Train acc: 100.0, Val acc: 43.28 (11/66)
Counter({2: 39559, 1: 4814, 0: 627})
Counter({2: 31888, 0: 31688, 1: 6424})
Epoch: [56/1000], Loss: 0.00235, Train acc: 100.0, Val acc: 39.07 (12/66)
Counter({2: 39028, 1: 5306, 0: 666})
Counter({2: 31888, 0: 31688, 1: 6424})
Epoch: [57/1000], Loss: 0.002132, Train acc: 100.0, Val acc: 39.55 (13/66)
Counter({2: 42238, 1: 2498, 0: 264})
Counter({2: 31888, 0: 31664, 1: 6448})
Epoch: [58/1000], Los

Counter({2: 31888, 0: 31688, 1: 6424})
Epoch: [106/1000], Loss: 0.001306, Train acc: 100.0, Val acc: 52.88 (62/66)
Counter({2: 39141, 1: 5294, 0: 565})
Counter({2: 31888, 0: 31688, 1: 6424})
Epoch: [107/1000], Loss: 0.001106, Train acc: 100.0, Val acc: 39.29 (63/66)
Counter({2: 41504, 1: 3142, 0: 354})
Counter({2: 31888, 0: 31688, 1: 6424})
Epoch: [108/1000], Loss: 0.001411, Train acc: 100.0, Val acc: 36.63 (64/66)
Counter({1: 25655, 0: 18936, 2: 409})
Counter({2: 31888, 0: 31688, 1: 6424})
Epoch: [109/1000], Loss: 0.001028, Train acc: 100.0, Val acc: 58.83 (65/66)
Counter({2: 41327, 1: 3196, 0: 477})
Counter({2: 31888, 0: 31688, 1: 6424})
Epoch: [110/1000], Loss: 0.001032, Train acc: 100.0, Val acc: 36.9 (66/66)
Counter({2: 41543, 1: 3006, 0: 451})
Counter({2: 31888, 0: 31688, 1: 6424})
Epoch: [111/1000], Loss: 0.001111, Train acc: 100.0, Val acc: 36.68 (67/66)


In [73]:
model.load_state_dict(torch.load('model' + "-" + "" + '.ckpt'))
# model = fcNet(n_layers, fc_hid_dim).to(DEVICE)
print(test_model(train_loader_JP, model))
print(test_model(val_loader_JP, model))
print(test_model(train_loader, model))
print(test_model(val_loader, model))
print(test_model(train_loader_short, model))

Counter({2: 31888, 0: 31688, 1: 6424})
100.0
Counter({0: 13162, 2: 12962, 1: 2576})
100.0
Counter({1: 21801, 0: 15000, 2: 8199})
84.88666666666667
Counter({2: 5027, 0: 4972, 1: 1})
100.0
Counter({1: 150, 0: 150, 2: 150})
100.0


In [77]:
for p in model.parameters():
    print(p)

In [93]:
X = np.array([train_data_short[0], train_data_short[1]]).T
y = train_data_short[2]

In [102]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1, multi_class='multinomial', solver='sag')
# clf = LinearSVC(random_state=0, tol=1e-5, max_iter=100000)
clf.fit(X, y)



LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=0, warm_start=False)

# Results:
lbfgs + C=1 learns the (almost) correct model.  
C=1e30, sag fail, similar to 1 layer NN?

In [106]:
X_val = np.array([train_data[0], train_data[1]]).T
y_val = train_data[2]

In [104]:
X_val = np.array([val_data[0], val_data[1]]).T
y_val = val_data[2]

In [107]:
print(clf.score(X, y), clf.score(X_val, y_val))

1.0 0.366711111111


In [34]:
y_hat = clf.predict(X_val)
clf.score(X_val, y_val)

0.87457777777777779

In [37]:
lab = 0
sum(yh == yi for yh, yi in zip(y_hat, y_val) if yi == lab ) / sum(yi == lab for yi in y_val)

0.62373333333333336

In [28]:
clf.coef_

array([[ -3.18961850e-01,   3.18943580e-01],
       [  7.21459410e-06,   8.22490433e-06],
       [  3.18954636e-01,  -3.18951804e-01]])

In [29]:
clf.intercept_

array([ -1.82958187e-03,   1.73274617e-03,   9.68357056e-05])