<a href="https://colab.research.google.com/github/jhmuller/nn_sort/blob/main/sorting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Teaching a Neural Net to Sort


# Teaching a Neural Net to Sort
Can I make a Neural Net that can sort?
It seemed like an interesting question to me.

The first challenge is how to model the data.
I am sorting integers and the net outputs floats.
I did not think that rounding and comparing
the results to the true value would work well.

So I devised a way to model the problem using 
the moves that bubble sort would make when 
sorting the list.  Each move is encoded
as a vector of length N, where N is the length 
of the input list. All the entries in the vector 
are 0 except the one that tells the position
of the left number to be swapped in this step of Bubble sort.  There are at most N*(N-1)/2 steps.
Once the list is sorted, the remaining steps are encoded with a 1 in the last position, indicating a no-op (nothing to do).

The Net will output probabilities for each vector.
Then I will compare the predicted values to the actual and that will be the loss.

Let's see if it works.

In [1]:
import torch
import numpy as np
from torch import nn
import math
from itertools import permutations
from torch.utils.data import Dataset, DataLoader


## Generating the input data.


In [129]:
# Length N of the lists to sort. generate_input_data(m=LIST_LEN) enumerates
# every permutation of range(N), so the dataset has N! items.
# NOTE(review): the recorded training output further down ("6 90", 144 batches
# of 4) looks like it came from a run with LIST_LEN = 6 — confirm.
LIST_LEN = 7

def sort_seq(x):
  """Bubble-sort a copy of ``x`` and record which swaps were made.

  Args:
    x: list of mutually comparable items. It is copied, not modified.

  Returns:
    A ``(sorted_list, moves)`` tuple. ``moves`` holds the left index of every
    adjacent swap in the order it was performed, padded at the end with the
    no-op marker ``len(x) - 1`` until it has ``len(x) * (len(x) - 1) / 2``
    entries.
  """
  work = x.copy()
  size = len(work)
  moves = []
  dirty = True
  while dirty:
    dirty = False
    for left in range(size - 1):
      right = left + 1
      if work[left] > work[right]:
        work[left], work[right] = work[right], work[left]
        moves.append(left)
        dirty = True
  # Pad with the no-op index so every sequence has the same fixed length.
  max_moves = size * (size - 1) // 2
  moves.extend([size - 1] * (max_moves - len(moves)))
  return work, moves

def apply_seq(x_in, seq):
  """Replay a sequence of adjacent swaps on a copy of ``x_in``.

  Args:
    x_in: list to transform. It is copied, not modified.
    seq: iterable of left indices; index ``i`` swaps positions ``i`` and
      ``i + 1``. Any index ``>= len(x_in) - 1`` is treated as a no-op.

  Returns:
    The new list after all swaps have been applied in order.
  """
  result = x_in.copy()
  noop_from = len(x_in) - 1
  for pos in seq:
    if pos < noop_from:
      result[pos], result[pos + 1] = result[pos + 1], result[pos]
  return result

def lst_diff(true, pred):
  """Return the sum of absolute element-wise differences.

  Iterates over ``pred`` and looks up the matching position in ``true``, so
  ``true`` must be at least as long as ``pred``.
  """
  return sum(abs(value - true[idx]) for idx, value in enumerate(pred))

def factorial(n):
  """Return ``n!`` for a non-negative integer ``n``.

  Args:
    n: non-negative integer.

  Returns:
    The product ``1 * 2 * ... * n``; ``factorial(0)`` is 1.

  Raises:
    ValueError: if ``n`` is negative.
  """
  if n < 0:
    raise ValueError("factorial() not defined for negative values")
  # Start from 1 (the empty product) so that 0! and 1! both come out as 1.
  res = 1
  for i in range(2, n + 1):
    res *= i
  return res

def generate_input_data(m=5):
  """Build one item for every permutation of ``range(m)``.

  Each item is a dict with two keys:
    "list": the permutation as a torch tensor.
    "mat":  a numpy array of shape (m*(m-1)/2, m) in which row j is the
            one-hot encoding of the j-th entry of the move sequence that
            ``sort_seq`` returns for that permutation.

  Prints the number of items and returns them as a list, in the order
  produced by ``itertools.permutations``.
  """
  n_steps = int((m * (m - 1)) / 2)
  items = []
  for perm in permutations(range(m)):
    _, moves = sort_seq(list(perm))
    one_hot = np.zeros((n_steps, m))
    for row, col in enumerate(moves):
      one_hot[row, col] = 1
    items.append({"list": torch.Tensor(perm), "mat": one_hot})
  print(f" # items: {len(items)}")
  return items


class SortDataset(Dataset):
    """Map-style dataset that serves pre-generated sort items by index."""

    def __init__(self, data):
        # Indexable collection of items; stored as given, without copying.
        self.data = data

    def __len__(self):
        # Number of items available.
        return len(self.data)

    def __getitem__(self, idx):
        # Delegate straight to the underlying collection.
        return self.data[idx]

# Build the full dataset: one item per permutation of range(LIST_LEN).
sdata = generate_input_data(m=LIST_LEN)

AttributeError: ignored

In [89]:
# Sanity check: shows the sorted list and the padded swap sequence.
sort_seq([3, 4, 0])

([0, 3, 4], [1, 0, 2])

## Split into Train and Test
I put 80% of the permutations in train and the rest in test.  I do this simply by taking the first 80% generated above for train and the rest for test.

Also here I create the train and test dataloaders.

In [43]:
# Split the items 80/20 by position.
# NOTE(review): sdata keeps the order produced by itertools.permutations, so
# the test set is the tail of that ordering rather than a random sample —
# confirm this is the intended evaluation split.
N = len(sdata)
cutoff = int(N*.8)
train = sdata[:cutoff]
test = sdata[cutoff:]
# Wrap each split in a Dataset and a DataLoader that yields shuffled batches
# of 4 items, loaded in the main process (num_workers=0).
train_dataset = SortDataset(train)
train_dataloader = DataLoader(train_dataset, batch_size=4,
                        shuffle=True, num_workers=0)
test_dataset = SortDataset(test)
test_dataloader = DataLoader(test_dataset, batch_size=4,
                        shuffle=True, num_workers=0)

## Define the Net
I will use a stack of hidden linear layers with leaky-ReLU activations (10 hidden layers of size 100 in the training run below).

I reshape the output into a matrix and do softmax on the rows to make probabilities.


In [118]:
import torch.nn.functional as F

class SortNet(nn.Module):
    """Fully connected net that predicts a move sequence for a list.

    The input of length ``lst_len`` goes through an input layer, a stack of
    equally sized hidden layers (all with leaky-ReLU activations) and a linear
    output layer. The output is reshaped to ``(..., seq_len, lst_len)`` with
    ``seq_len = lst_len * (lst_len - 1) / 2`` and soft-maxed over the last
    axis, so each row is a probability distribution over positions.

    Args:
        lst_len: length of the lists fed to the net.
        num_hidden_layers: number of hidden ``nn.Linear`` layers.
        hidden_size: width of the input and hidden layers.
    """

    def __init__(self, lst_len, num_hidden_layers = 10, hidden_size=200):
        # Initialise nn.Module first so submodule registration is active for
        # everything assigned below.
        super().__init__()
        self.lst_len = lst_len
        self.num_hidden_layers = num_hidden_layers
        self.seq_len = int((lst_len * (lst_len-1))/2)
        out_len = lst_len * self.seq_len
        print(lst_len, out_len)
        self.input = nn.Linear(lst_len, hidden_size)
        # nn.ModuleList (not a plain list) registers the hidden layers, so
        # they appear in parameters() and state_dict() and follow .to(device).
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(hidden_size, hidden_size) for _ in range(num_hidden_layers)]
        )
        self.output = nn.Linear(hidden_size, out_len)

    def forward(self, x):
        """Return move probabilities of shape ``(..., seq_len, lst_len)``.

        ``x`` is a float tensor whose last dimension is ``lst_len``; any
        leading (batch) dimensions are preserved.
        """
        y = F.leaky_relu(self.input(x))
        for layer in self.hidden_layers:
            y = F.leaky_relu(layer(y))
        y = self.output(y)
        # Split the flat output into one row per step of the sequence.
        y_pred = y.view(*y.shape[:-1], self.seq_len, self.lst_len)
        return torch.softmax(y_pred, dim=-1)

## Train loop
Nothing too unusual here, mostly a standard torch train loop.

I do have to convert the input to tensors and to float32.

In [127]:
# Train the net on the training split and print the loss every
# `print_freq` epochs.
print_freq = 20
model = SortNet(LIST_LEN, num_hidden_layers=10, hidden_size=100)
loss_fn =  nn.MSELoss() # nn.L1Loss() #
optimizer = torch.optim.Adam(model.parameters(), lr = 0.005)
for ei in range(181):
  model.train()
  for bi, sample in enumerate(train_dataloader):
    # The DataLoader already yields batched tensors; only the dtype needs
    # fixing ("mat" starts out as a float64 numpy array).
    X = sample["list"].to(torch.float32)
    y = sample["mat"].to(torch.float32)
    y_pred = model(X)

    optimizer.zero_grad()
    # Loss modules take (input, target) in that order.
    loss = loss_fn(y_pred, y)

    loss.backward()
    optimizer.step()
  # Reports the loss of the last batch of the epoch only.
  if ei % print_freq == 0:
    print(f"epoch {ei} batch {bi} loss {loss}")

6 90
epoch 0 batch 143 loss 0.06909219920635223
epoch 20 batch 143 loss 0.0753786638379097
epoch 40 batch 143 loss 0.06060944125056267
epoch 60 batch 143 loss 0.06364498287439346
epoch 80 batch 143 loss 0.06556541472673416
epoch 100 batch 143 loss 0.06278382986783981
epoch 120 batch 143 loss 0.044715266674757004
epoch 140 batch 143 loss 0.0627627745270729
epoch 160 batch 143 loss 0.03545656055212021
epoch 180 batch 143 loss 0.03620775416493416


In [46]:
# Evaluate on the test split without tracking gradients, accumulating the
# per-batch loss in `sum_loss`.
with torch.no_grad():
  sum_loss = 0
  model.eval()
  for bi, sample in enumerate(test_dataloader):
    X = sample["list"].to(torch.float32)
    y = sample["mat"].to(torch.float32)
    y_pred = model(X)           # compute model output
    # Loss modules take (input, target) in that order.
    loss = loss_fn(y_pred, y)   # calculate loss
    sum_loss += loss
    if bi % (print_freq*10) == 0:
      print(f"batch {bi} loss {loss} sum_loss {sum_loss}")

batch 0 loss 0.11207365244626999 sum_loss 0.11207365244626999


In [106]:
# Run the model on one test item and compute its true move sequence.
# Names are chosen so they do not clobber the `test` split, the `np` module
# alias or the builtin `sorted`.
example_input = torch.Tensor([1, 2, 4, 3])  # hand-made input, unused below
example_input = example_input.unsqueeze(0)
sample = test_dataset[10]
X = sample["list"]
print(X)
X = X.unsqueeze(0)  # add a batch dimension of 1 for the model
res = model(X)
X_arr = X.numpy()
print(X_arr)
lst = list(X_arr[0])
print(type(lst))
print(lst)
sorted_lst, true_seq = sort_seq(lst)
print(true_seq)

tensor([4., 5., 1., 3., 0., 2.])
[[4. 5. 1. 3. 0. 2.]]
<class 'list'>
[4.0, 5.0, 1.0, 3.0, 0.0, 2.0]
[1, 2, 3, 4, 0, 1, 2, 3, 1, 2, 0, 5, 5, 5, 5]


In [107]:
# Sum of the model output over its last axis; the result is discarded here.
torch.sum(res, dim=2)
# Index of the highest-probability position at every step of the sequence.
am = torch.argmax(res, dim=2)
print(am)
pred_seq = list(am.numpy()[0])
print(lst, pred_seq)
# NOTE(review): this applies the ground-truth sequence; pred_seq is only
# printed above — confirm whether apply_seq(lst, pred_seq) was intended.
apply_seq(lst, true_seq)

tensor([[0, 2, 3, 4, 0, 2, 3, 1, 2, 1, 0, 5, 5, 5, 5]])
[4.0, 5.0, 1.0, 3.0, 0.0, 2.0] [0, 2, 3, 4, 0, 2, 3, 1, 2, 1, 0, 5, 5, 5, 5]


[0.0, 1.0, 2.0, 3.0, 4.0, 5.0]

In [11]:
class Square(nn.Module):
    """Custom linear layer that mimics a standard linear layer.

    Computes ``x @ weights.T + bias`` for a 2-D input ``x`` of shape
    ``(batch, size_in)`` and returns a tensor of shape ``(batch, size_out)``.

    Args:
        size_in: number of input features.
        size_out: number of output features.
    """

    def __init__(self, size_in, size_out):
        super().__init__()
        self.size_in, self.size_out = size_in, size_out
        # Allocate uninitialised storage; the init calls below fill it.
        # nn.Parameter marks the tensors as trainable module parameters.
        self.weights = nn.Parameter(torch.empty(size_out, size_in))
        self.bias = nn.Parameter(torch.empty(size_out))

        # initialize weights and biases
        nn.init.kaiming_uniform_(self.weights, a=math.sqrt(5))  # weight init
        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weights)
        # Guard against a zero-width input to avoid dividing by zero.
        bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
        nn.init.uniform_(self.bias, -bound, bound)  # bias init

    def forward(self, x):
        """Apply the affine map to ``x`` and print the shapes involved."""
        w_times_x = torch.mm(x, self.weights.t())
        print(f" x shape {x.shape} weights shape {self.weights.shape} res shape {w_times_x.shape}")
        return torch.add(w_times_x, self.bias)