# Notes

- All numeric values (integer, decimal, timestamp) and boolean values should be stored as floats that can be used by the model directly.
- String values might be categorical, where we can create a vocabulary, map to integers, and use as numeric values.
- String values might be free text, where we can tokenize them and create a vocabulary.
- The model should accept a primary key or row number.

# Assumptions

- <1 million rows, single machines
    - Will eventually remove size limitation and support distributed training
    - Will also need better way to store embeddings (partitions?)

# Initialization

In [21]:
from summon.model import NumericModel

# Build the model used by every later cell.
# NOTE(review): columns=1 presumably means one numeric input feature per row
# (the training cell feeds a (batch, 1) tensor) — confirm against NumericModel.
model = NumericModel(columns=1)

# Data

TODO: add feature and output scaling/normalization (large raw integer values are hard for neural networks to train on).

In [22]:
import torch
import pandas as pd
import numpy as np
from pathlib import Path

# Directory holding the local dataset files.
data_dir = Path("/tmp/data/")
parquet_path = data_dir / "fever.snappy.parquet"

# Read the full table into a dataframe.
df = pd.read_parquet(str(parquet_path))


def _int32_column(frame, column):
    """Return the named dataframe column as an int32 tensor."""
    values = frame[column].to_numpy(dtype=np.int32)
    return torch.tensor(values, dtype=torch.int32)


# X is the model input column, Y the column the model is trained to predict.
X = _int32_column(df, "id")
Y = _int32_column(df, "evidence_annotation_id")

# Last expression of the cell: shown as the cell output.
X.shape, Y.shape

(torch.Size([426559]), torch.Size([426559]))

In [23]:
X[:10]

tensor([ 75397,  75397, 150448, 150448, 214861, 156709,  83235, 129629, 129629,
        149579], dtype=torch.int32)

In [24]:
Y[:10]

tensor([ 92206,  92206, 174271, 174271, 255136, 180804, 100277, 151831, 151831,
        173384], dtype=torch.int32)

# Training

In [25]:
from torch.optim import SGD
from torch.nn import L1Loss

# Put the model in training mode before optimizing.
model.train()

# Plain SGD over all of the model's parameters.
# NOTE(review): lr=1 combined with unscaled inputs/targets in the ~1e5 range is
# a likely reason the logged loss never settles — confirm, and revisit once
# feature/target scaling is in place.
optimizer = SGD(model.parameters(), lr=1)
# L1 loss with default settings, i.e. mean absolute error over the batch.
mae_loss = L1Loss()

# Number of rows sampled for each optimization step.
batch_size = 32

In [42]:
import torch

# Seeded generator so the sequence of sampled mini-batches is reproducible.
g = torch.Generator().manual_seed(2147483647)

iterations = 100_000

for i in range(iterations):
    # sample batch_size random row indices (duplicates are possible)
    ix = torch.randint(0, len(X), (batch_size, ), generator=g)
    uX = X[ix]
    uY = Y[ix]

    # forward pass: feed the batch as a float column of shape (batch_size, 1)
    x = model(uX.unsqueeze(1).to(torch.float32))

    # mean absolute error between flattened predictions and sampled targets
    loss = mae_loss(x.view(-1), uY)

    # clear stale gradients, backpropagate, then take one SGD step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # periodic progress report (loss of the current mini-batch only)
    if not i % 20_000:
        print(f"{i} / {iterations}: {loss.item():.3f}")

0 / 100000: 131352.188
20000 / 100000: 106876.969
40000 / 100000: 129963.844
60000 / 100000: 120019.625
80000 / 100000: 102921.617


In [46]:
@torch.no_grad()
def total_loss(sample_size: int = 1000) -> float:
    """Estimate the model's mean absolute error on a random sample of rows.

    Despite the name, this does not scan the whole dataset: it draws
    ``sample_size`` row indices (duplicates possible) from ``X`` / ``Y`` using
    the shared generator ``g`` and returns the L1 loss on that sample. Each
    call therefore advances ``g`` and returns a slightly different estimate.

    The model is switched to eval mode for the forward pass so that layers
    that behave differently while training (e.g. dropout) do not distort the
    estimate, and its previous mode is restored afterwards.
    NOTE(review): assumes ``model`` is a ``torch.nn.Module`` — confirm.

    Args:
        sample_size: Number of rows to sample. Defaults to 1000.

    Returns:
        The mean absolute error over the sampled rows as a Python float.

    Raises:
        ValueError: If ``sample_size`` is less than 1 (an empty sample would
            yield a NaN loss).
    """
    if sample_size < 1:
        raise ValueError(f"sample_size must be >= 1, got {sample_size}")

    was_training = model.training
    model.eval()
    try:
        # random sample of rows
        ix = torch.randint(0, len(X), (sample_size, ), generator=g)
        uX, uY = X[ix], Y[ix]

        # forward pass on a (sample_size, 1) float column
        x = model(uX.view(-1, 1).float())

        # loss
        loss = mae_loss(x.view(-1), uY)
    finally:
        # leave the model in whatever mode the caller had it in
        model.train(was_training)

    return loss.item()

# Show the loss returned by total_loss() as the cell's displayed value.
f"loss: {total_loss()}"

'loss: 114605.2109375'