model/core.py

In [1]:
import math
import numpy as np
import pandas as pd

In [2]:
# replace(torchtable, ..custom_types)
from torchtable import *

In [3]:
from torchtable.utils import *
from torchtable.field import Field, FieldCollection, CategoricalField, NumericField
from torchtable.dataset import TabularDataset

In [4]:
import torch
import torch.nn as nn

In [15]:
class BatchHandlerModel(nn.Module):
    """Turn a batch of tabular fields into a single concatenated tensor.

    Every categorical input is looked up in its own embedding module, every
    continuous input gets an extra dimension via ``unsqueeze(1)``, and all
    pieces are concatenated along dimension 1 (embeddings first, numeric
    columns after).

    Args:
        embs: one embedding module per categorical input.
        batch_cat_field_getters: one function per entry of ``embs`` that
            extracts the matching categorical tensor from a batch.
        batch_num_field_getters: functions that each extract one continuous
            tensor from a batch.
    """

    def __init__(self, embs: List[nn.Module],
                 batch_cat_field_getters: List[Callable[[Dict], torch.Tensor]],
                 batch_num_field_getters: List[Callable[[Dict], torch.Tensor]]):
        super().__init__()
        # forward() zips embeddings with their getters, so a length mismatch
        # would silently drop inputs; fail loudly instead.
        assert len(embs) == len(batch_cat_field_getters), (
            "expected one getter per embedding, got "
            f"{len(embs)} embeddings and {len(batch_cat_field_getters)} getters"
        )
        # ModuleList registers the embeddings as sub-modules of this model.
        self.embs = nn.ModuleList(embs)
        self.batch_cat_field_getters = batch_cat_field_getters
        self.batch_num_field_getters = batch_num_field_getters

    @staticmethod
    def field_to_embedding(fld: 'CategoricalField') -> nn.Module:
        """Create the ``nn.Embedding`` used for the categorical field ``fld``.

        The table has ``fld.cardinality`` rows. Its width is resolved by
        ``with_default`` from ``fld.metadata["embedding_dim"]`` and the
        fallback ``min(n * (n - 1) // 2, 50)`` where ``n`` is the cardinality.
        ``fld.metadata.get("padding_idx")`` is forwarded as ``padding_idx``.
        """
        num_embeddings = fld.cardinality
        fallback_dim = min((num_embeddings * (num_embeddings - 1)) // 2, 50)
        embedding_dim = with_default(fld.metadata.get("embedding_dim"), fallback_dim)
        return nn.Embedding(num_embeddings, embedding_dim,
                            padding_idx=fld.metadata.get("padding_idx"))

    @classmethod
    def from_dataset(cls, dataset: 'TabularDataset') -> 'BatchHandlerModel':
        """Build a model whose inputs mirror the fields of ``dataset``.

        Categorical fields receive an embedding (see ``field_to_embedding``);
        continuous fields are registered as numeric inputs; fields that are
        neither are skipped.
        """
        embs = []
        batch_cat_field_getters: List[Callable[[Dict], torch.Tensor]] = []
        batch_num_field_getters: List[Callable[[Dict], torch.Tensor]] = []

        def register_field(k: str, fld: 'Field', i: int):
            # ``k`` and ``i`` are parameters of this call, so each lambda
            # captures its own values (no late-binding problem).
            # An index > -1 selects one element of the batch entry stored
            # under ``k``; otherwise the entry itself is used.
            # NOTE(review): -1 presumably marks a field that is not addressed
            # by position inside a collection -- confirm against flatmap.
            getter = (lambda b: b[k][i]) if i > -1 else (lambda b: b[k])
            if fld.categorical:
                embs.append(cls.field_to_embedding(fld))
                batch_cat_field_getters.append(getter)
            elif fld.continuous:
                batch_num_field_getters.append(getter)

        # Drain the result so register_field runs for every field even if
        # flatmap evaluates lazily; the produced values are not needed.
        for _ in dataset.fields.flatmap(register_field, with_index=True):
            pass
        return cls(embs, batch_cat_field_getters, batch_num_field_getters)

    def forward(self, batch: Dict) -> torch.Tensor:
        """Embed the categorical inputs, append the numeric ones, concatenate.

        Returns the concatenation along dimension 1 of all embedding outputs
        followed by all numeric inputs.
        """
        cat_data = [emb(getter(batch))
                    for emb, getter in zip(self.embs, self.batch_cat_field_getters)]
        # unsqueeze(1) adds the dimension along which the pieces are joined
        # (assumes each numeric tensor is 1-D -- TODO confirm).
        num_data = [getter(batch).unsqueeze(1) for getter in self.batch_num_field_getters]
        return torch.cat(cat_data + num_data, dim=1)

    def out_dim(self) -> int:
        """Return the summed embedding widths plus one per numeric getter."""
        return sum(e.embedding_dim for e in self.embs) + len(self.batch_num_field_getters)

# Tests

test_model.py

In [6]:
import pytest
import pandas as pd

In [7]:
# ignore
from torchtable.operator import LambdaOperator
from torchtable.field import *
from torchtable.dataset import TabularDataset
from torchtable.loader import DefaultLoader

In [8]:
# uncomment
# from torchtable.utils import *
# from torchtable.operator import LambdaOperator
# from torchtable.field import Field, FieldCollection, CategoricalField, NumericField
# from torchtable.dataset import TabularDataset
# from torchtable.loader import DefaultLoader
# from torchtable.model import BatchHandlerModel

In [8]:
# test_from_dataset
# Smoke test: one categorical column plus one numeric column declared with
# normalization="Gaussian".
frame_columns = {
    "a": [1, 2, 3, 4, 5],
    "b": [-0.4, -2.1, 3.3, 4.4, 5.5],
}
df = pd.DataFrame(frame_columns)
field_spec = {
    "a": CategoricalField(max_features=100),
    "b": NumericField(normalization="Gaussian"),
}
ds = TabularDataset.from_df(df, fields=field_spec)
dl = DefaultLoader.from_dataset(ds, 5)
model = BatchHandlerModel.from_dataset(ds)
# Each loader item unpacks into two parts; only the first is fed to the model.
batch, _ = next(iter(dl))
# The output's leading dimension should equal the 5 passed to the loader.
assert model(batch).shape[0] == 5

In [16]:
# test_from_dataset_metadata
# An explicit embedding_dim in the field metadata must be honoured.
frame_columns = {
    "a": [1, 2, 3, 4, 5],
    "b": [-0.4, -2.1, 3.3, 4.4, 5.5],
    "c": [0.1, 0.2, 0.3, 0.4, 0.5],
}
df = pd.DataFrame(frame_columns)
field_spec = {
    "a": CategoricalField(max_features=100, metadata={"embedding_dim": 10}),
    "b": NumericField(normalization="Gaussian"),
    "c": NumericField(normalization=None),
}
ds = TabularDataset.from_df(df, fields=field_spec)
dl = DefaultLoader.from_dataset(ds, 5)
model = BatchHandlerModel.from_dataset(ds)
# Each loader item unpacks into two parts; only the first is fed to the model.
batch, _ = next(iter(dl))
assert model(batch).shape[0] == 5
# 12 = embedding width of 10 for "a" + one column each for "b" and "c".
assert model(batch).shape[1] == 12
assert model.out_dim() == 12

In [11]:
# test_from_dataset_field_collection
# A FieldCollection mixing continuous and categorical fields on one column.
frame_columns = {
    "a": [1, 2, 3, 4, 5],
    "b": [-0.4, -2.1, 3.3, 4.4, 5.5],
}
df = pd.DataFrame(frame_columns)
b_fields = FieldCollection(
    NumericField(normalization="Gaussian"),
    Field(LambdaOperator(lambda x: x * 2), continuous=True),
    CategoricalField(),
)
field_spec = {
    "a": CategoricalField(max_features=100),
    "b": b_fields,
}
ds = TabularDataset.from_df(df, fields=field_spec)
dl = DefaultLoader.from_dataset(ds, 3)
model = BatchHandlerModel.from_dataset(ds)
# Two categorical fields in total: "a" and the one inside the collection.
assert len(model.embs) == 2
# Each loader item unpacks into two parts; only the first is fed to the model.
batch, _ = next(iter(dl))
assert model(batch).shape[0] == 3

In [20]:
# test_from_dataset_only_categorical
# Only a categorical field is declared; column "b" is mapped to None.
frame_columns = {
    "a": [1, 2, 3, 4, 5],
    "b": [-0.4, -2.1, 3.3, 4.4, 5.5],
}
df = pd.DataFrame(frame_columns)
field_spec = {
    "a": CategoricalField(max_features=100),
    "b": None,
}
ds = TabularDataset.from_df(df, fields=field_spec)
dl = DefaultLoader.from_dataset(ds, 4)
model = BatchHandlerModel.from_dataset(ds)
# Each loader item unpacks into two parts; only the first is fed to the model.
batch, _ = next(iter(dl))
assert model(batch).shape[0] == 4
# With no numeric inputs, out_dim is just the default embedding width of "a".
assert model.out_dim() == 15

In [21]:
# test_from_dataset_only_numerical
# Only numeric fields: one on "a" and a collection of two on "b".
frame_columns = {
    "a": [1, 2, 3, 4, 5],
    "b": [-0.4, -2.1, 3.3, 4.4, 5.5],
}
df = pd.DataFrame(frame_columns)
field_spec = {
    "a": NumericField(),
    "b": FieldCollection(NumericField(), NumericField()),
}
ds = TabularDataset.from_df(df, fields=field_spec)
dl = DefaultLoader.from_dataset(ds, 4)
model = BatchHandlerModel.from_dataset(ds)
# Each loader item unpacks into two parts; only the first is fed to the model.
batch, _ = next(iter(dl))
assert model(batch).shape[0] == 4
# No embeddings, so out_dim is simply the count of numeric fields.
assert model.out_dim() == 3

In [14]:
# test_from_dataset_flattened
# A FieldCollection created with flatten=True must still be usable by the model.
frame_columns = {
    "a": [1, 2, 3, 4, 5],
    "b": [-0.4, -2.1, 3.3, 4.4, 5.5],
}
df = pd.DataFrame(frame_columns)
flattened_b = FieldCollection(
    NumericField(),
    CategoricalField(handle_unk=False),
    flatten=True,
)
field_spec = {
    "a": NumericField(),
    "b": flattened_b,
}
ds = TabularDataset.from_df(df, fields=field_spec)
dl = DefaultLoader.from_dataset(ds, 4)
model = BatchHandlerModel.from_dataset(ds)
# Each loader item unpacks into two parts; only the first is fed to the model.
batch, _ = next(iter(dl))
assert model(batch).shape[0] == 4