field/core.py

In [1]:
import logging
import numpy as np
import pandas as pd

import torch.utils.data
from pathlib import Path
import warnings

In [2]:
# ignore
import sys; sys.path.append("..")

In [3]:
# replace(torchtable, ..custom_types)
from torchtable import *

In [4]:
# replace(torchtable, .)
from torchtable.utils import *
from torchtable.operator import Operator, LambdaOperator, FillMissing, Categorize, Normalize, ToTensor, UnknownCategoryError

In [5]:
# Module-level logger named after this module; Field.__init__ uses it to emit warnings.
logger = logging.getLogger(__name__)

In [6]:
class Field:
    """
    A single field in the output mini batch. A Field acts as a container for all relevant information regarding an output in the output mini batch.
    Primarily, it stores a pipeline to apply to a column/set of columns in the input.
    It also stores a pipeline for converting the input batch to an appropriate type for the downstream model (generally a torch.tensor).
    This class can directly be instantiated with a custom pipeline but is generally used as a subclass for other fields.

    Example:
        >>> fld = Field(LambdaOperator(lambda x: x + 1) > LambdaOperator(lambda x: x ** 2))
        >>> fld.transform(1)
        4

    Args:
        pipeline: An operator representing the set of operations mapping the input column to the output.
            This transformation will be applied during the construction of the dataset.
            If the pipeline is resource intensive and applying it all at once is unrealistic, consider deferring some of the processing to `batch_pipeline`.
        name: Optional name of the field. Shown in the field's `repr`.
        is_target: Whether the field is an input or target field. Affects default batching behavior.
        continuous: Whether the output is continuous.
        categorical: Whether the output is categorical/discrete.
        batch_pipeline: The transformation to apply to this field during batching.
            By default, this will simply be an operation to transform the input to a tensor to feed to the model.
            This can be set to any Operator that the user wishes so that arbitrary transformations (e.g. padding, noising) can be applied during data loading.
        dtype: The output tensor dtype. Only relevant when batch_pipeline is None (using the default pipeline).
        metadata: Additional data about the field to store.
            Use cases include adding data about model parameters (e.g. size of embeddings for this field).
            When omitted, every instance gets its own fresh empty dict.

    Raises:
        ValueError: If both `continuous` and `categorical` are True.
    """
    def __init__(self, pipeline: Operator, name: Optional[str]=None,
                 is_target: bool=False, continuous: bool=True,
                 categorical: bool=False, batch_pipeline: Optional[Operator]=None,
                 dtype: Optional[torch.dtype]=None, metadata: Optional[dict]=None):
        self.pipeline = pipeline
        self.name = name
        self.is_target = is_target
        if categorical and continuous:
            raise ValueError("""A field cannot be both continuous and categorical. 
            If you want both a categorical and continuous representation, consider using multiple fields.""")
        self.continuous, self.categorical = continuous, categorical

        if dtype is not None and batch_pipeline is not None:
            logger.warning("""Setting a custom batch pipeline will cause this field to ignore the dtype argument.
            If you want to manually set the dtype, consider attaching a ToTensor operation to the pipeline.""")
        # Categorical outputs default to integer tensors, everything else to floats.
        dtype = with_default(dtype, torch.long if self.categorical else torch.float)
        self.batch_pipeline = with_default(batch_pipeline, ToTensor(dtype))
        # A `{}` literal in the signature would be one dict shared by every Field
        # created without metadata; build a new dict per instance instead.
        self.metadata = {} if metadata is None else metadata

    def transform(self, x: pd.Series, train=True) -> ArrayLike:
        """
        Method to process the input column during construction of the dataset.
        Kwargs:
            train: If true, this transformation may change some internal parameters of the pipeline.
                For instance, if there is a normalization step in the pipeline, 
                the mean and std will be computed on the current input.
                Otherwise, the pipeline will use statistics computed in the past.
        """
        return self.pipeline(x, train=train)

    def __repr__(self):
        """Return `ClassName[field name]`."""
        return f"{self.__class__.__name__}[{self.name}]"

    def transform_batch(self, x: ArrayLike, device: Optional[torch.device]=None, 
                        train: bool=True) -> torch.tensor:
        """Method to process batch input during loading of the dataset."""
        return self.batch_pipeline(x, device=device, train=train)

    def index(self, example: ArrayLike, idx) -> ArrayLike:
        """
        Wrapper for indexing. The field must provide the ability to index via a list for batching later on.
        """
        # pandas Series are indexed by position (`iloc`) so that the Series' own
        # index labels do not affect which rows are selected.
        if isinstance(example, pd.Series):
            return example.iloc[idx]
        else:
            return example[idx]

In [7]:
class IdentityField(Field):
    """
    A field that does not modify the input.

    Args:
        name: Optional name of the field.
        is_target: Whether the field is an input or target field.
        continuous: Whether the output is continuous.
        categorical: Whether the output is categorical/discrete.
        metadata: Additional data about the field to store.
            When omitted, every instance gets its own fresh empty dict.
    """
    def __init__(self, name=None, is_target=False, continuous=True, categorical=False, metadata=None):
        # Avoid a mutable `{}` default, which would be shared by every instance.
        metadata = {} if metadata is None else metadata
        super().__init__(LambdaOperator(lambda x: x), name=name,
                         is_target=is_target, continuous=continuous, categorical=categorical, metadata=metadata)

In [8]:
class NumericField(Field):
    """
    A field corresponding to a continuous, numerical output (e.g. price, distance, etc.)

    The pipeline fills missing values first and normalizes afterwards.

    Args:
        name: Optional name of the field.
        fill_missing: The method of filling missing values. See the `FillMissing` operator for details.
        normalization: The method of normalization. See the `Normalize` operator for details.
        is_target: Whether the field is an input or target field.
        metadata: Additional data about the field to store.
            When omitted, every instance gets its own fresh empty dict.
    """
    def __init__(self, name=None,
                 fill_missing="median", normalization="Gaussian",
                 is_target=False, metadata=None):
        # Avoid a mutable `{}` default, which would be shared by every instance.
        metadata = {} if metadata is None else metadata
        pipeline = FillMissing(fill_missing) > Normalize(normalization)
        super().__init__(pipeline, name, is_target, continuous=True, categorical=False, metadata=metadata)

In [9]:
class CategoricalField(Field):
    """
    A field corresponding to a categorical, discrete output (e.g. id, group, gender)

    Args:
        name: Optional name of the field.
        min_freq: Forwarded to the `Categorize` operator; see it for details.
        max_features: Forwarded to the `Categorize` operator; see it for details.
        handle_unk: Forwarded to the `Categorize` operator; see it for details.
        is_target: Whether the field is an input or target field.
        metadata: Additional data about the field to store.
            When omitted, every instance gets its own fresh empty dict.
    """
    def __init__(self, name=None, min_freq=0, max_features=None,
                 handle_unk=None, is_target=False, metadata: Optional[dict]=None):
        # Avoid a mutable `{}` default, which would be shared by every instance.
        metadata = {} if metadata is None else metadata
        pipeline = Categorize(min_freq=min_freq, max_features=max_features,
                              handle_unk=handle_unk)
        # Keep a handle on the operator's transformer; `cardinality` reports its length.
        self.vocab = pipeline.transformer
        super().__init__(pipeline, name, is_target, continuous=False, categorical=True, metadata=metadata)

    def transform(self, x: pd.Series, train=True) -> ArrayLike:
        """
        Apply the categorization pipeline to `x`.

        Raises:
            UnknownCategoryError: Re-raised with the field name in the message when the
                pipeline raises it; the original error is kept as `__cause__`.
        """
        try:
            return super().transform(x, train=train)
        except UnknownCategoryError as err:
            raise UnknownCategoryError(f"Unknown category encountered in {self.name}. Consider setting handle_unk=True.") from err

    @property
    def cardinality(self):
        """The number of unique outputs."""
        return len(self.vocab)

In [10]:
# ignore
class DatetimeFeatureField(Field):
    """
    A generic field for constructing features from datetime columns.

    The pipeline converts the input with `pd.to_datetime`, fills missing values and
    finally calls `func` on the `.dt` accessor of the resulting series.

    Args:
        func: Feature construction function. It is called with the `.dt` accessor of
            the converted series (e.g. `lambda dt: dt.hour`).
        fill_missing: Method passed to the `FillMissing` operator.
        name: Optional name of the field.
        is_target: Whether the field is an input or target field.
        continuous: Whether the output is continuous. The field is marked categorical
            exactly when this is False.
        metadata: Additional data about the field to store.
            When omitted, every instance gets its own fresh empty dict.
    """
    def __init__(self, func: Callable[[pd.Series], pd.Series], fill_missing: Optional[str]=None,
                 name=None, is_target=False, continuous=False, metadata: Optional[dict]=None):
        # Avoid a mutable `{}` default, which would be shared by every instance.
        metadata = {} if metadata is None else metadata
        pipeline = (LambdaOperator(lambda s: pd.to_datetime(s))
                    > FillMissing(method=fill_missing) 
                    > LambdaOperator(lambda s: func(s.dt)))
        super().__init__(pipeline, name=name, is_target=is_target, continuous=continuous,
                         categorical=not continuous, metadata=metadata)

In [11]:
# ignore
class DayofWeekField(DatetimeFeatureField):
    """Datetime feature field that reads the `dayofweek` attribute of the datetime accessor."""
    def __init__(self, **kwargs):
        def _extract(dt_accessor):
            return dt_accessor.dayofweek

        # All keyword arguments are forwarded unchanged to DatetimeFeatureField.
        super().__init__(_extract, **kwargs)

In [12]:
# ignore
class DayField(DatetimeFeatureField):
    """Datetime feature field that reads the `day` attribute of the datetime accessor."""
    def __init__(self, **kwargs):
        def _extract(dt_accessor):
            return dt_accessor.day

        # All keyword arguments are forwarded unchanged to DatetimeFeatureField.
        super().__init__(_extract, **kwargs)

In [13]:
# ignore
class MonthStartField(DatetimeFeatureField):
    """
    Datetime feature field that reads the `is_month_start` attribute of the datetime accessor.

    `continuous=False` is always passed explicitly, so `continuous` must not be supplied by the caller.
    """
    def __init__(self, **kwargs):
        def _extract(dt_accessor):
            return dt_accessor.is_month_start

        super().__init__(_extract, continuous=False, **kwargs)

In [14]:
# ignore
class MonthEndField(DatetimeFeatureField):
    """Datetime feature field that reads the `is_month_end` attribute of the datetime accessor."""
    def __init__(self, **kwargs):
        def _extract(dt_accessor):
            return dt_accessor.is_month_end

        # All keyword arguments are forwarded unchanged to DatetimeFeatureField.
        super().__init__(_extract, **kwargs)

In [15]:
# ignore
class HourField(DatetimeFeatureField):
    """Datetime feature field that reads the `hour` attribute of the datetime accessor."""
    def __init__(self, **kwargs):
        def _extract(dt_accessor):
            return dt_accessor.hour

        # All keyword arguments are forwarded unchanged to DatetimeFeatureField.
        super().__init__(_extract, **kwargs)

In [16]:
# ignore
def date_fields(**kwargs) -> List[DatetimeFeatureField]:
    """
    The default set of fields for feature engineering using a field with date information.

    Builds one instance each of DayofWeekField, DayField, MonthStartField and MonthEndField
    (in that order), passing the same keyword arguments to every constructor.
    """
    field_types = (DayofWeekField, DayField, MonthStartField, MonthEndField)
    return [field_type(**kwargs) for field_type in field_types]

In [17]:
# ignore
def datetime_fields(**kwargs) -> List[DatetimeFeatureField]:
    """
    The default set of fields for feature engineering using a field with date and time information.

    Builds one instance each of DayofWeekField, DayField, MonthStartField, MonthEndField and
    HourField (in that order), passing the same keyword arguments to every constructor.
    """
    field_types = (DayofWeekField, DayField, MonthStartField, MonthEndField, HourField)
    return [field_type(**kwargs) for field_type in field_types]

In [18]:
class FieldCollection(list):
    """
    A list of fields with some auxiliary methods.

    Args:
        *args: The fields to store, in order.
        flatten: If set to True, each field in this collection will be mapped to one key in the batch/dataset.
            Otherwise, each field in this collection will be mapped to an entry in a list for the same key in the batch/dataset.
        namespace: Optional namespace. When given, every inner field is renamed to
            `<namespace>/<field name>` (or `<namespace>/_<position>` for unnamed fields).
    """
    def __init__(self, *args, flatten: bool=False, namespace: Optional[str]=None):
        super().__init__(args)
        self.flatten = flatten
        # Start without a namespace so that set_namespace sees a real change below.
        self.namespace = None
        self.set_namespace(namespace)

    def index(self, examples: List[ArrayLike], idx) -> List[ArrayLike]:
        """Index each example with its corresponding field and return the results as a list."""
        return [fld.index(ex, idx) for fld, ex in zip(self, examples)]

    @property
    def name(self) -> str:
        """The namespace of this collection."""
        return self.namespace

    @name.setter
    def name(self, nm: str):
        self.set_namespace(nm)

    def set_namespace(self, nm: Optional[str]) -> None:
        """
        Set the namespace and update the names of the inner fields as well.

        A prefix written by the previous namespace is removed before the new one is
        applied. Passing None clears the namespace: the old prefix is stripped and no
        new prefix is added (previously the literal string "None/" was written).
        """
        old_namespace = self.namespace
        if old_namespace == nm: return
        self.namespace = nm
        for i, fld in enumerate(self):
            base = fld.name
            # Drop the prefix added by the previous namespace, if there is one.
            if (base is not None and old_namespace is not None
                    and base.startswith(f"{old_namespace}/")):
                base = base[len(old_namespace)+1:]
            if nm is None:
                fld.name = base
            elif base is None:
                # Unnamed fields are named after their position in the collection.
                fld.name = f"{nm}/_{i}"
            else:
                fld.name = f"{nm}/{base}"

    def transform(self, *args, **kwargs) -> list:
        """Applies transform with each field and returns a list"""
        return [fld.transform(*args, **kwargs) for fld in self]

# Tests

test_field.py

In [20]:
import pytest

In [21]:
# ignore
from torchtable import *
from torchtable.operator import *

In [22]:
# uncomment
# from torchtable import *
# from torchtable.operator import *
# from torchtable.field import *

In [23]:
# test_field
# The chained operators run left to right: (1 + 1) ** 2 == 4.
fld = Field(LambdaOperator(lambda x: x + 1) > LambdaOperator(lambda x: x ** 2))
assert fld.transform(1) == 4

In [24]:
# test_numeric_field
rng = np.random.RandomState(21)
x = pd.Series(data=rng.normal(0, 1, (100, )))
# Turn every negative draw into NaN so there are missing values to fill.
x[x < 0] = np.nan
# Each fill method must leave no missing values in the output.
for mthd in ["median", "mean", "mode"]:
    fld = NumericField(fill_missing=mthd)
    assert not pd.isnull(fld.transform(x)).any()

# With filling disabled, missing values are expected to remain in the output.
fld = NumericField(fill_missing=None)
assert pd.isnull(fld.transform(x)).any()

In [25]:
# test_numeric_field_norm
rng = np.random.RandomState(21)
# Input deliberately has non-zero mean and non-unit scale.
x = pd.Series(data=rng.normal(-1, 4, (100, )))
# Gaussian normalization: output is expected to have mean ~0 and std ~1.
fld = NumericField(fill_missing=None, normalization="Gaussian")
np.testing.assert_almost_equal(fld.transform(x).mean(), 0.)
np.testing.assert_almost_equal(fld.transform(x).std(), 1.)

# RankGaussian normalization: only the mean is checked.
fld = NumericField(fill_missing=None, normalization="RankGaussian")
np.testing.assert_almost_equal(fld.transform(x).mean(), 0.)

In [26]:
# test_numeric_joint
"""Smoke test for NumericField with various settings"""
rng = np.random.RandomState(21)
x = pd.Series(data=rng.normal(2, 0.5, (100, )))
# Every fill/normalization combination only has to run without raising.
for fill_mthd in ["median", "mean", "mode"]:
    for norm_mthd in [None, "Gaussian", "RankGaussian"]:
        fld = NumericField(fill_missing=fill_mthd, normalization=norm_mthd)
        fld.transform(x)

In [27]:
# test_categorical_field
"""Smoke test for categorical field with default settings"""
rng = np.random.RandomState(21)
x = pd.Series(data=rng.randint(-3, 15, (100, )))
fld = CategoricalField(handle_unk=False)
# The number of distinct outputs must match both the vocabulary size and `cardinality`.
assert fld.transform(x).nunique() == len(fld.vocab)
assert fld.transform(x).nunique() == fld.cardinality

In [28]:
# test_datetime_fields
"""Smoke test for fields"""
# Build a datetime series (with an hour component) from year/month/day/hour columns.
x = pd.to_datetime(pd.DataFrame({'year': [2015, 2016, 2015, 2017, 2020], 'month': [2, 3, 4, 5, 1], 
                             'day': [4, 5, 10, 29, 30], 'hour': [2, 3, 12, 11, 5]}))
# Each datetime feature field with default arguments must produce no missing values.
for fld_type in [DayofWeekField, DayField, MonthStartField, MonthEndField, HourField]:
    assert not pd.isnull(fld_type().transform(x)).any()

In [29]:
# test_date_fields
# Build a date-only series from year/month/day columns (no hour column here).
x = pd.to_datetime(pd.DataFrame({'year': [2011, 1995, 2015, 2017, 2030], 'month': [12, 9, 7, 5, 10], 
                                 'day': [14, 13, 9, 19, 1]}))
# Each date feature field with default arguments must produce no missing values.
for fld_type in [DayofWeekField, DayField, MonthStartField, MonthEndField]:
    assert not pd.isnull(fld_type().transform(x)).any()

In [30]:
# test_batch_transform
"""Smoke test for batch transformations"""
rng = np.random.RandomState(21)
a = pd.Series(data=rng.normal(0, 1, (100, )))
fld = NumericField()
# Run the dataset-level transform, then the default batch pipeline; only checks that nothing raises.
tsr = fld.transform_batch(fld.transform(a))

In [31]:
# test_field_metadata
# Metadata passed to the constructor must be readable from the field.
fld = NumericField(metadata={"foo": "bar"})
assert fld.metadata["foo"] == "bar"

In [32]:
# test_field_collection
fld0 = NumericField()
fld1 = CategoricalField()
# A FieldCollection behaves like a list holding the fields in the order given.
flds = FieldCollection(fld0, fld1)
assert len(flds) == 2
assert flds[0] == fld0
assert flds[1] == fld1

In [33]:
# test_namespace
fld0 = NumericField()
fld1 = CategoricalField(name="bar")
# A namespace given at construction prefixes named fields and names unnamed ones by position.
flds = FieldCollection(fld0, fld1, namespace="foo")
assert fld0.name == "foo/_0"
assert fld1.name == "foo/bar"
# Renaming the collection replaces the old prefix rather than stacking a new one on top.
flds.name = "hoge"
assert fld0.name == "hoge/_0"
assert fld1.name == "hoge/bar"

fld0 = NumericField()
fld1 = CategoricalField(name="bar")
flds = FieldCollection(fld0, fld1)
# Assigning the same name twice must not apply the prefix twice.
flds.name = "hoge"
flds.name = "hoge"
assert fld0.name == "hoge/_0"
assert fld1.name == "hoge/bar"

In [34]:
# test_index
fld = NumericField()
# Indexing a numpy array with a list of positions returns the elements at those positions.
np.testing.assert_almost_equal(fld.index(np.arange(10), [0, 3, 5]), np.array([0, 3, 5]))

In [35]:
# test_index_series
fld = NumericField()
# Same check as for arrays, but with a pandas Series as the example.
np.testing.assert_almost_equal(fld.index(pd.Series(data=np.arange(10)), [0, 3, 5]), np.array([0, 3, 5]))

In [36]:
# test_index_fieldcollection
flds = FieldCollection(NumericField(), NumericField())
# The collection applies the same positions to each example and returns one result per field.
arr1, arr2 = flds.index([np.array([5, 4, 2, 3, 1]), np.array([1, 2, 3, 4, 5])], [1, 4, 2])
np.testing.assert_almost_equal(arr1, np.array([4, 1, 2]))
np.testing.assert_almost_equal(arr2, np.array([2, 5, 3]))

In [37]:
# test_fieldcollection_transform
flds = FieldCollection(Field(LambdaOperator(lambda x: x * 2)), Field(LambdaOperator(lambda x: x + 3)))
# The same input goes through every field: 1 * 2 == 2 and 1 + 3 == 4.
assert flds.transform(1) == [2, 4]

In [38]:
# test_unknown_cat
fld = CategoricalField(name="hoge", handle_unk=False)
a = pd.Series(data=np.array([1, 2, 3]))
# First pass runs with the default train=True on the values 1, 2 and 3.
fld.transform(a)
# The value 4 was not in the first series, so a train=False pass is expected to raise.
b = pd.Series(data=np.array([3, 4]))
with pytest.raises(UnknownCategoryError):
    fld.transform(b, train=False)