field/core.py

In [1]:
import logging
import numpy as np
import pandas as pd

import torch.utils.data
from pathlib import Path
import warnings

In [2]:
# ignore
import sys; sys.path.append("..")

In [3]:
# replace(torchtable, ..custom_types)
from torchtable import *

In [4]:
# replace(torchtable, .)
from torchtable.utils import *
from torchtable.operator import Operator, LambdaOperator, FillMissing, Categorize, Normalize, ToTensor

In [5]:
# Module-level logger, named after this module via `__name__`.
logger = logging.getLogger(__name__)

In [6]:
class Field:
    """
    A single field in the output mini batch.
    A Field object wraps a pipeline to apply to a column/set of columns in the input.
    This class can directly be instantiated with a custom pipeline.
    Example:
        >>> fld = Field(LambdaOperator(lambda x: x + 1) > LambdaOperator(lambda x: x ** 2))
        >>> fld.transform(1)
        4
    Args:
        pipeline: An operator representing the set of operations mapping the input column to the output.
        This transformation will be applied during the construction of the dataset.
        If the pipeline is resource intensive and applying it all at once is unrealistic, consider deferring some of the processing to `batch_pipeline`.
    Kwargs:
        name: Optional identifier of the field. Within this class it is only used by `__repr__`.
        is_target: Whether the field is an input or target field. Affects default batching behavior.
        continuous: Whether the output is continuous. Defaults to True, so it must be
        explicitly set to False when `categorical=True`.
        categorical: Whether the output is categorical/discrete.
        batch_pipeline: The transformation to apply to this field during batching.
        By default, this will simply be an operation to transform the input to a tensor to feed to the model.
        This can be set to any Operator that the user wishes so that arbitrary transformations (e.g. padding, noising) can be applied during data loading.
        dtype: The output tensor dtype. Only relevant when batch_pipeline is None (using the default pipeline).
        When omitted, torch.long is used for categorical fields and torch.float otherwise.
    Raises:
        ValueError: If both `continuous` and `categorical` are true.
    """
    def __init__(self, pipeline: Operator, name: Optional[str]=None,
                 is_target: bool=False, continuous: bool=True,
                 categorical: bool=False, batch_pipeline: Optional[Operator]=None,
                 dtype: Optional[torch.dtype]=None):
        self.pipeline = pipeline
        self.name = name
        self.is_target = is_target
        if categorical and continuous:
            raise ValueError("""A field cannot be both continuous and categorical. 
            If you want both a categorical and continuous representation, consider using multiple fields.""")
        self.continuous = continuous
        self.categorical = categorical
        # An explicit dtype only feeds the default ToTensor step below, so it is
        # silently unused when the caller supplies their own batch pipeline.
        if dtype is not None and batch_pipeline is not None:
            logger.warning("""Setting a custom batch pipeline will cause this field to ignore the dtype argument.
            If you want to manually set the dtype, consider attaching a ToTensor operation to the pipeline.""")
        dtype = with_default(dtype, torch.long if self.categorical else torch.float)
        self.batch_pipeline = with_default(batch_pipeline, ToTensor(dtype))

    def transform(self, x: pd.Series, train: bool=True) -> ArrayLike:
        """
        Method to process the input column during construction of the dataset.
        Args:
            x: The input to run through `self.pipeline`.
        Kwargs:
            train: If true, this transformation may change some internal parameters of the pipeline.
            For instance, if there is a normalization step in the pipeline, the mean and std will be computed on the current input.
            Otherwise, the pipeline will use statistics computed in the past.
        Returns:
            Whatever `self.pipeline` returns for `x`.
        """
        return self.pipeline(x, train=train)

    def __repr__(self):
        """Render as `ClassName[name]`, using the concrete (sub)class name."""
        return f"{self.__class__.__name__}[{self.name}]"

    def transform_batch(self, x: ArrayLike, device: Optional[torch.device]=None,
                        train: bool=True) -> torch.Tensor:
        """
        Method to process batch input during loading of the dataset.
        Forwards `x`, `device` and `train` to `self.batch_pipeline` and returns its result.
        """
        return self.batch_pipeline(x, device=device, train=train)

In [7]:
class IdentityField(Field):
    """
    A pass-through field: its pipeline is a single operator wrapping the identity function.
    Kwargs:
        name, is_target, continuous, categorical: Forwarded unchanged to `Field`.
    """
    def __init__(self, name=None, is_target=False, continuous=True, categorical=False):
        passthrough = LambdaOperator(lambda value: value)
        super().__init__(
            passthrough,
            name=name,
            is_target=is_target,
            continuous=continuous,
            categorical=categorical,
        )

In [8]:
class NumericField(Field):
    """
    A field corresponding to a continuous, numerical output (e.g. price, distance, etc.)
    The pipeline fills missing values first and normalizes second.
    Args:
        fill_missing: The method of filling missing values. See the `FillMissing` operator for details.
        normalization: The method of normalization. See the `Normalize` operator for details.
    """
    def __init__(self, name=None,
                 fill_missing="median", normalization="Gaussian",
                 is_target=False):
        imputer = FillMissing(fill_missing)
        scaler = Normalize(normalization)
        super().__init__(
            imputer > scaler,
            name=name,
            is_target=is_target,
            continuous=True,
            categorical=False,
        )

In [9]:
class CategoricalField(Field):
    """
    A field corresponding to a categorical, discrete output (e.g. id, group, gender)
    Args:
        min_freq, max_features, handle_unk: Passed by keyword to the `Categorize` operator;
        see that operator for more details.
    """
    def __init__(self, name=None, min_freq=0, max_features=None,
                 handle_unk=None, is_target=False):
        categorizer = Categorize(
            min_freq=min_freq,
            max_features=max_features,
            handle_unk=handle_unk,
        )
        # Keep a handle on the operator's transformer so `cardinality` can measure it.
        self.vocab = categorizer.transformer
        super().__init__(
            categorizer,
            name=name,
            is_target=is_target,
            continuous=False,
            categorical=True,
        )

    @property
    def cardinality(self):
        """The number of unique outputs, measured as `len(self.vocab)`."""
        vocab_size = len(self.vocab)
        return vocab_size

In [10]:
class DatetimeFeatureField(Field):
    """
    A generic field for constructing features from datetime columns.
    The pipeline converts the input with `pd.to_datetime`, fills missing values,
    and finally calls `func` on the `.dt` accessor of the result.
    Args:
        func: Feature construction function. It receives the `.dt` accessor of the
        column (not the column itself).
    Kwargs:
        fill_missing: Passed as `method` to the `FillMissing` operator.
        name, is_target: Forwarded unchanged to `Field`.
        continuous: Whether the feature is continuous; the field is marked categorical otherwise.
    """
    def __init__(self, func: Callable[[pd.Series], pd.Series], fill_missing: Optional[str]=None,
                 name=None, is_target=False, continuous=False):
        parse_step = LambdaOperator(lambda col: pd.to_datetime(col))
        fill_step = FillMissing(method=fill_missing)
        feature_step = LambdaOperator(lambda col: func(col.dt))
        # NOTE(review): `a > b > c` is a Python chained comparison; keep this exact form,
        # as the composition presumably relies on how Operator implements `>` - confirm.
        pipeline = parse_step > fill_step > feature_step
        super().__init__(
            pipeline,
            name=name,
            is_target=is_target,
            continuous=continuous,
            categorical=not continuous,
        )

In [11]:
class DayofWeekField(DatetimeFeatureField):
    """Datetime feature field whose feature function reads the `dayofweek` attribute."""
    def __init__(self, **kwargs):
        # All keyword arguments go straight to DatetimeFeatureField.
        super().__init__(lambda dt: dt.dayofweek, **kwargs)

In [12]:
class DayField(DatetimeFeatureField):
    """Datetime feature field whose feature function reads the `day` attribute."""
    def __init__(self, **kwargs):
        # All keyword arguments go straight to DatetimeFeatureField.
        super().__init__(lambda dt: dt.day, **kwargs)

In [13]:
class MonthStartField(DatetimeFeatureField):
    """
    Datetime feature field whose feature function reads the `is_month_start` attribute.
    Kwargs:
        **kwargs: Forwarded unchanged to `DatetimeFeatureField` (whose `continuous`
        default is already False).
    """
    def __init__(self, **kwargs):
        # Do not hard-code `continuous=False` here: combined with **kwargs it raised
        # TypeError (duplicate keyword) whenever a caller passed `continuous`,
        # e.g. through `date_fields(continuous=...)`.
        super().__init__(lambda dt: dt.is_month_start, **kwargs)

In [14]:
class MonthEndField(DatetimeFeatureField):
    """Datetime feature field whose feature function reads the `is_month_end` attribute."""
    def __init__(self, **kwargs):
        # All keyword arguments go straight to DatetimeFeatureField.
        super().__init__(lambda dt: dt.is_month_end, **kwargs)

In [15]:
class HourField(DatetimeFeatureField):
    """Datetime feature field whose feature function reads the `hour` attribute."""
    def __init__(self, **kwargs):
        # All keyword arguments go straight to DatetimeFeatureField.
        super().__init__(lambda dt: dt.hour, **kwargs)

In [16]:
def date_fields(**kwargs) -> List[DatetimeFeatureField]:
    """
    The default set of fields for feature engineering using a field with date information.
    Returns one new instance each of DayofWeekField, DayField, MonthStartField and
    MonthEndField (in that order), every one built with the same `kwargs`.
    """
    field_types = (DayofWeekField, DayField, MonthStartField, MonthEndField)
    return [field_type(**kwargs) for field_type in field_types]

In [17]:
def datetime_fields(**kwargs) -> List[DatetimeFeatureField]:
    """
    The default set of fields for feature engineering using a field with date and time information.
    Returns one new instance each of DayofWeekField, DayField, MonthStartField,
    MonthEndField and HourField (in that order), every one built with the same `kwargs`.
    """
    field_types = (DayofWeekField, DayField, MonthStartField, MonthEndField, HourField)
    return [field_type(**kwargs) for field_type in field_types]

# Tests

test_field.py

In [18]:
import pytest

In [19]:
from torchtable import *
from torchtable.operator import *
from torchtable.field import *

In [20]:
# test_field
# Two operators composed with `>`; the expected result is (1 + 1) ** 2 == 4,
# i.e. the left-hand operator is applied first.
fld = Field(LambdaOperator(lambda x: x + 1) > LambdaOperator(lambda x: x ** 2))
assert fld.transform(1) == 4

In [21]:
# test_numeric_field
# Seeded draws, with every negative value replaced by NaN to act as missing data.
rng = np.random.RandomState(21)
x = pd.Series(data=rng.normal(0, 1, (100, )))
x[x < 0] = np.nan
# Every fill method is expected to leave no nulls in the transformed output.
for mthd in ["median", "mean", "mode"]:
    fld = NumericField(fill_missing=mthd)
    assert not pd.isnull(fld.transform(x)).any()

# With fill_missing=None the nulls are expected to survive the pipeline.
fld = NumericField(fill_missing=None)
assert pd.isnull(fld.transform(x)).any()

In [22]:
# test_numeric_field_norm
# Seeded input drawn with mean -1 and std 4, with no missing values.
rng = np.random.RandomState(21)
x = pd.Series(data=rng.normal(-1, 4, (100, )))
# "Gaussian" normalization is expected to yield (approximately) zero mean and unit std.
fld = NumericField(fill_missing=None, normalization="Gaussian")
np.testing.assert_almost_equal(fld.transform(x).mean(), 0.)
np.testing.assert_almost_equal(fld.transform(x).std(), 1.)

# For "RankGaussian" only the (approximately) zero mean is checked.
fld = NumericField(fill_missing=None, normalization="RankGaussian")
np.testing.assert_almost_equal(fld.transform(x).mean(), 0.)

In [23]:
# test_numeric_joint
"""Smoke test for NumericField with various settings"""
rng = np.random.RandomState(21)
x = pd.Series(data=rng.normal(2, 0.5, (100, )))
# Try every fill/normalization combination; there are no assertions, so the
# check is only that transform() does not raise.
for fill_mthd in ["median", "mean", "mode"]:
    for norm_mthd in [None, "Gaussian", "RankGaussian"]:
        fld = NumericField(fill_missing=fill_mthd, normalization=norm_mthd)
        fld.transform(x)

In [24]:
# test_categorical_field
"""Smoke test for categorical field with default settings"""
# Seeded integer draws from [-3, 15).
rng = np.random.RandomState(21)
x = pd.Series(data=rng.randint(-3, 15, (100, )))
# NOTE: handle_unk is passed explicitly as False here (the constructor default is None).
fld = CategoricalField(handle_unk=False)
# The number of distinct transformed values must match both the vocab size and `cardinality`.
assert fld.transform(x).nunique() == len(fld.vocab)
assert fld.transform(x).nunique() == fld.cardinality

In [25]:
# test_datetime_fields
"""Smoke test for fields"""
# Five timestamps assembled from year/month/day/hour columns.
x = pd.to_datetime(pd.DataFrame({'year': [2015, 2016, 2015, 2017, 2020], 'month': [2, 3, 4, 5, 1], 
                             'day': [4, 5, 10, 29, 30], 'hour': [2, 3, 12, 11, 5]}))
# Each field type, built with default arguments, must produce no nulls.
for fld_type in [DayofWeekField, DayField, MonthStartField, MonthEndField, HourField]:
    assert not pd.isnull(fld_type().transform(x)).any()

In [26]:
# test_date_fields
# Five dates assembled from year/month/day columns only (no time component).
x = pd.to_datetime(pd.DataFrame({'year': [2011, 1995, 2015, 2017, 2030], 'month': [12, 9, 7, 5, 10], 
                                 'day': [14, 13, 9, 19, 1]}))
# Each date-level field type, built with default arguments, must produce no nulls.
for fld_type in [DayofWeekField, DayField, MonthStartField, MonthEndField]:
    assert not pd.isnull(fld_type().transform(x)).any()

In [27]:
# test_batch_transform
"""Smoke test for batch transformations"""
rng = np.random.RandomState(21)
a = pd.Series(data=rng.normal(0, 1, (100, )))
fld = NumericField()
# Feed the dataset-level transform output into the batch-level transform;
# there is no assertion, so the check is only that neither call raises.
fld.transform_batch(fld.transform(a))

tensor([-0.1459, -0.2012,  0.8748, -1.2703,  0.5982, -1.6943, -0.2896, -0.3164,
         0.9554, -0.1092, -0.6697,  1.1846,  1.4060, -0.7407,  0.5483, -0.5157,
         0.0541,  0.1426, -1.2878, -0.0377, -1.0885, -1.0208, -0.5246, -1.9492,
        -1.4754,  0.1189,  0.5037,  0.6978, -0.5305,  0.5532,  1.3658,  1.0025,
        -1.3891, -1.7282,  0.2784,  1.0640, -0.1197,  0.7915,  0.1312,  0.1117,
         0.1794,  0.1086, -0.4925,  1.6250,  0.7613, -0.6181, -0.3636, -1.0692,
         0.3509, -1.4296, -0.4085,  0.6000,  0.4254,  0.4380, -1.1995,  0.6162,
        -0.2229,  1.1388, -0.3554,  1.4700,  0.2900,  0.6003, -0.4638, -0.4767,
        -0.4576,  0.8367, -0.5153, -0.2058,  0.9936,  0.3838, -0.3381,  0.1686,
        -1.4083, -1.5058,  0.3079,  2.4737,  1.7792,  2.2543,  0.4349, -0.5888,
         0.8995, -1.2123, -2.7663,  0.2984,  0.8672,  0.5607,  1.3494,  0.6169,
         0.2257, -1.0809, -0.2183, -1.2000, -0.4417,  1.6783,  0.0987,  0.2541,
        -0.7451,  2.3317, -1.9484, -0.96