operator.py

In [1]:
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_categorical_dtype
from collections import Counter, defaultdict
from scipy.special import erfinv
from scipy.stats import rankdata

from typing import *

In [2]:
# Column-like inputs a Field can be pointed at: one column or a whole frame.
SeriesLike = Union[pd.Series, pd.DataFrame]
# np.ndarray is the array *type*; np.array is only the factory function and
# does not describe a type, so the alias must name the former.
ArrayLike = Union[pd.Series, np.ndarray]
T = TypeVar("T")
# Either a single T or any iterable of T.
OneorMore = Union[T, Iterable[T]]
# A column is addressed by one name or by a list of names.
ColumnName = Union[str, List[str]]

# Implementation

At the core of torchtable is the Operator. We compose pipelines from Operators to process columns in the dataset.

In [3]:
# utils
def _most_frequent(x: np.ndarray):
    c = Counter(x)
    return c.most_common(1)[0][0]

In [4]:
class Operator:
    """A single stage of a processing pipeline.

    Stages are linked backwards: each operator keeps a reference to the
    operator that runs before it (`before`). Calling an operator first
    runs everything upstream and then applies this stage to the result.
    """

    def __init__(self):
        self.before = None
        self.built = False

    def __gt__(self, op: 'Operator') -> 'Operator':
        """`a > b` is shorthand for `a.pipe(b)`."""
        return self.pipe(op)

    def __lt__(self, op: 'Operator') -> 'Operator':
        """`a < b` is shorthand for `a.hook(b)`."""
        return self.hook(op)

    def pipe(self, op: 'Operator') -> 'Operator':
        """Attach `op` downstream of this operator.

        `op.before` is overwritten with `self`, and `op` (the new tail of
        the pipeline) is returned.
        """
        op.before = self
        return op

    def hook(self, op: 'Operator') -> 'Operator':
        """Attach `op` at the very start of this pipeline and return self.

        If this operator already has a predecessor, the request is passed
        upstream until an operator without a predecessor is reached.
        """
        if self.before is None:
            self.before = op
            return self
        self.before.hook(op)
        return self

    def apply(self, x: Any, test=False) -> Any:
        """Transform the output of the previous stage.

        The base implementation returns None; subclasses override it.
        """
        return None

    def __call__(self, x, **kwargs):
        # Resolve the upstream value first (if any), then run this stage.
        # The same keyword arguments are forwarded to every stage.
        upstream = x if self.before is None else self.before(x, **kwargs)
        return self.apply(upstream, **kwargs)

In [5]:
class LambdaOperator(Operator):
    """Operator that delegates its work to a user-supplied callable."""

    def __init__(self, func: Callable[[T], T]=None):
        super().__init__()
        self.func = func

    def apply(self, x: Any, test=False) -> Any:
        # `test` is accepted to match the Operator interface but is not
        # forwarded to the wrapped callable.
        wrapped = self.func
        return wrapped(x)

In [6]:
class TransformerOperator(Operator):
    """Adapter that exposes a fit/transform-style object as an Operator.

    The wrapped `transformer` must provide `fit(x)` and `transform(x)`.
    """

    def __init__(self, transformer):
        super().__init__()
        self.transformer = transformer

    def build(self, x: Any) -> None:
        """Fit the wrapped transformer on `x`."""
        self.transformer.fit(x)

    def apply(self, x: Any, test=False):
        """Transform `x`, fitting on it first unless `test` is truthy."""
        if test:
            # Test mode: reuse whatever state an earlier fit produced.
            return self.transformer.transform(x)
        self.build(x)
        return self.transformer.transform(x)

In [7]:
class _Normalizer:
    _methods = set(["Gaussian", "RankGaussian"])

    def __init__(self, method):
        self.method = method
        if method is not None and method not in self._methods:
            raise ValueError(f"Invalid normalization method {method}")
    
    def fit(self, x: pd.Series):
        if self.method == "Gaussian":
            self.mean, self.std = x.mean(), x.std()
        elif self.method == "RankGaussian":
            # TODO: store state
            pass
        return self

    def transform(self, x: pd.Series) -> pd.Series:
        if self.method == "Gaussian":
            return (x - self.mean) / (self.std + 1e-8)
        elif self.method == "RankGaussian":
            # TODO: store state
            # prevent divergence to infinity by restricting normalized ranks to range[-0.99, 0.99]
            x = (rankdata(x) / len(x) - 0.5) * 0.99 * 2
            x = erfinv(x)
            return (x - x.mean())
        else:
            return x

class Normalize(TransformerOperator):
    """Pipeline operator that normalizes its input with a `_Normalizer`."""

    def __init__(self, method):
        normalizer = _Normalizer(method)
        super().__init__(normalizer)

In [8]:
class _MissingFiller:
    _method_mapping = {
        "median": lambda x: x.median(),
        "mean": lambda x: x.mean(),
        "mode": lambda x: _most_frequent(x.dropna()),
    }
    
    def __init__(self, method):
        if callable(method):
            self.method = method
        elif method in self._method_mapping:
            self.method = self._method_mapping[method]
        elif method is None:
            self.method = None
        else:
            raise ValueError(f"Invalid method of filling missing data: {method}")
        self.na_mapping = {}

    def fit(self, x: pd.Series) -> '_MissingFiller':
        self.fill_value = self.method(x)
        return self
    
    def transform(self, x: pd.Series, test=False) -> pd.Series:
        if self.method is not None:
            return x.fillna(self.fill_value)
        else:
            return x
        
class FillMissing(TransformerOperator):
    """Pipeline operator that fills missing values with a `_MissingFiller`."""

    def __init__(self, method):
        filler = _MissingFiller(method)
        super().__init__(filler)

In [9]:
class Vocab:
    """Mapping from category values to integer ids, learnt from a column.

    Args:
        min_freq: categories seen fewer than this many times get no id.
        max_features: if given, only the this-many most common categories
            are considered for an id.
        handle_unk: if True, id 0 is reserved for categories without an id
            and known categories are numbered from 1. If False, known
            categories are numbered from 0 and looking up a category
            without an id raises `ValueError`.

    `len(vocab)` is the number of categories that received an id at fit
    time; the reserved unknown id is not counted.
    """

    def __init__(self, min_freq=0, max_features=None,
                 handle_unk=False):
        # Imported here so the class does not depend on a file-level import
        # having been executed before the first instance is created.
        import warnings
        if not handle_unk and (max_features is not None or min_freq > 0):
            warnings.warn("""Setting max_features or min_freq will potentially cause some categories to become unknown.
            Set handle_unk to True to handle categories left out due to max_features or min_freq being set.
            """)
        self.min_freq = min_freq
        self.max_features = max_features
        self.handle_unk = handle_unk

    def fit(self, x: pd.Series) -> 'Vocab':
        """Build the category -> id table from the values of `x`."""
        counter = Counter(v for v in x)

        # Kept as a defaultdict for backward compatibility with code that
        # indexes `vocab.index` directly; lookups in this class never rely
        # on the default.
        self.index = defaultdict(int)
        # if handle unknown category, reserve 0 for unseen categories
        idx = 1 if self.handle_unk else 0
        for k, c in counter.most_common(self.max_features):
            # most_common is sorted by descending count, so everything after
            # the first too-rare category is too rare as well.
            if c < self.min_freq:
                break
            self.index[k] = idx
            idx += 1
        return self

    def _get_index(self, x):
        """Return the id of category `x`.

        Raises:
            ValueError: if `x` has no id and `handle_unk` is False.
        """
        if x in self.index:
            return self.index[x]
        if not self.handle_unk:
            raise ValueError("Found category not in vocabulary. Try setting handle_unk to True.")
        # Return the reserved id directly. Reading `self.index[x]` would
        # insert `x` into the defaultdict and silently grow len(self) on
        # every unseen category.
        return 0

    def transform(self, x: pd.Series, test=False) -> pd.Series:
        """Map every value of `x` to its integer id."""
        return x.apply(self._get_index)

    def __len__(self):
        return len(self.index)
    
class Categorize(TransformerOperator):
    """Pipeline operator that maps categorical values to integer ids.

    All arguments are forwarded unchanged to the underlying `Vocab`.
    """

    def __init__(self, min_freq=0, max_features=None,
                 handle_unk=False):
        vocab = Vocab(
            min_freq=min_freq,
            max_features=max_features,
            handle_unk=handle_unk,
        )
        super().__init__(vocab)

    @property
    def vocab_size(self):
        """Length of the wrapped vocabulary, as reported by its `__len__`."""
        vocab = self.transformer
        return len(vocab)

In [10]:
# Scratch check: np.r_ applied to a list of two equal-length 1-D arrays gives
# a 2-D array with one row per input (see the output below).
np.r_[[np.arange(10), np.arange(10)]]

array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])

In [11]:
class FeatureEngineeringOperator(Operator):
    """Operator that derives several features from one input.

    Every callable in `funcs` is applied to the same input, and the
    results are combined into a single array with `np.r_`.
    """

    def __init__(self, funcs: List[Callable]):
        super().__init__()
        self.funcs = funcs

    def apply(self, x: pd.Series, test=False) -> np.array:
        # One result per feature function, in the order the functions were given.
        per_feature = [extract(x) for extract in self.funcs]
        return np.r_[per_feature]

In [12]:
class Field:
    """Base class for other fields. Can also be instantiated by passing a pipeline.

    Args:
        pipeline: callable invoked as `pipeline(source, test=...)` to
            produce the processed column (normally an Operator).
        name: name of the field; may be left as None and assigned later.
        is_target: whether this field is a prediction target.
        continuous: whether the field's output is continuous.
    """
    def __init__(self, pipeline: 'Operator', name=None,
                 is_target=False, continuous=True):
        self.pipeline = pipeline
        self.name = name
        self.is_target = is_target
        # Store the flag: subclasses pass it explicitly, and it was
        # previously accepted but silently dropped.
        self.continuous = continuous

    def set_source(self, column: 'SeriesLike') -> None:
        """Remember the raw column that `compute` will process."""
        self.source = column

    def compute(self, test=False) -> 'ArrayLike':
        """Run the pipeline over the stored source column.

        `set_source` must have been called first; `test` is forwarded to
        the pipeline as a keyword argument.
        """
        return self.pipeline(self.source, test=test)

In [13]:
class IdentityField(Field):
    """Field whose pipeline returns the source column unchanged."""

    def __init__(self, name=None, is_target=False, continuous=True):
        # The keyword was misspelt as `continous`, which Field.__init__ does
        # not accept, so every instantiation raised TypeError.
        super().__init__(LambdaOperator(lambda x: x), name=name,
                         is_target=is_target, continuous=continuous)

In [14]:
class NumericField(Field):
    """Continuous field: fills missing values, then normalizes.

    `fill_missing` is handed to `FillMissing` and `normalization` to
    `Normalize`; the two operators are chained in that order.
    """

    def __init__(self, name=None,
                 fill_missing="median", normalization="Gaussian",
                 is_target=False):
        filler = FillMissing(fill_missing)
        normalizer = Normalize(normalization)
        # Equivalent to `filler > normalizer`: the normalizer runs second.
        chained = filler.pipe(normalizer)
        super().__init__(chained, name, is_target, continuous=True)

    def __repr__(self):
        return f"NumericField[{self.name}]"

In [15]:
class CategoricalField(Field):
    """Non-continuous field that encodes categories as integer ids.

    `min_freq`, `max_features` and `handle_unk` are forwarded to
    `Categorize`.
    """

    def __init__(self, name=None, min_freq=0, max_features=None,
                 handle_unk=False, is_target=False):
        categorizer = Categorize(
            min_freq=min_freq,
            max_features=max_features,
            handle_unk=handle_unk,
        )
        super().__init__(categorizer, name, is_target, continuous=False)

    def __repr__(self):
        return f"CategoricalField[{self.name}]"

In [16]:
class DateField(Field):
    """Non-continuous field that expands a date column into calendar features.

    The pipeline takes the `.dt` accessor of the source column and derives
    from it, in order: day of week, is-month-end and is-month-start.
    """

    # Must derive from Field. Without a base class, `super().__init__` below
    # resolved to `object.__init__`, which rejects the arguments with
    # TypeError, and `set_source` / `compute` were missing.
    def __init__(self, name=None, is_target=False):
        pipeline = LambdaOperator(lambda s: s.dt) > FeatureEngineeringOperator([
            lambda dt: dt.dayofweek,
            lambda dt: dt.is_month_end,
            lambda dt: dt.is_month_start,
        ])
        super().__init__(pipeline, name, is_target, continuous=False)

    def __repr__(self):
        return f"DateField[{self.name}]"

In [17]:
class DatetimeField(Field):
    """Non-continuous field that expands a datetime column into features.

    The pipeline takes the `.dt` accessor of the source column and derives
    from it, in order: day of week, day of month, is-month-end,
    is-month-start and hour.
    """

    def __init__(self, name=None, is_target=False):
        to_accessor = LambdaOperator(lambda s: s.dt)
        extractors = [
            lambda dt: dt.dayofweek,
            lambda dt: dt.day,
            lambda dt: dt.is_month_end,
            lambda dt: dt.is_month_start,
            lambda dt: dt.hour,
        ]
        features = FeatureEngineeringOperator(extractors)
        # Equivalent to `to_accessor > features`.
        chained = to_accessor.pipe(features)
        super().__init__(chained, name, is_target, continuous=False)

    def __repr__(self):
        return f"DatetimeField[{self.name}]"

Now for the meat of the implementation.

In [18]:
import torch.utils.data
import warnings

In [19]:
class TabularDataset(torch.utils.data.Dataset):
    """Dataset of processed columns produced by a set of Fields.

    `examples` maps a field name to that field's computed column (this is
    what `from_df` builds). A `pd.DataFrame` is also accepted and is
    indexed by row position.

    Args:
        examples: processed columns keyed by field name.
        fields: the Field objects that produced `examples`.
        test: whether the columns were computed in test mode.
    """

    def __init__(self, examples: 'Dict[ColumnName, ArrayLike]',
                 fields: 'Dict[ColumnName, Field]', test=False):
        self.examples = examples
        self.fields = fields
        self.test = test

    def __len__(self):
        """Number of rows (not number of fields)."""
        if isinstance(self.examples, pd.DataFrame):
            return len(self.examples)
        if not self.examples:
            return 0
        # `examples` is a dict of columns, so len(dict) would count fields.
        # NOTE(review): assumes every column is row-major with the same
        # length; multidimensional outputs are still a TODO in from_df.
        first_column = next(iter(self.examples.values()))
        return len(first_column)

    def __getitem__(self, idx):
        """Return row `idx` as a `{field name: value}` dict.

        For a DataFrame-backed dataset the row is returned via `.iloc`.
        """
        if isinstance(self.examples, pd.DataFrame):
            return self.examples.iloc[idx]
        return {name: self._value_at(column, idx)
                for name, column in self.examples.items()}

    @staticmethod
    def _value_at(column, idx):
        """Positional lookup that works for both pandas objects and arrays."""
        if hasattr(column, "iloc"):
            return column.iloc[idx]
        return column[idx]

    @staticmethod
    def _without_targets(fields):
        """Copy of a column -> field(s) mapping with target fields removed.

        Keys stay the original DataFrame column names, so the result can be
        passed straight back to `from_df`.
        """
        kept = {}
        for column, fld in fields.items():
            if fld is None:
                kept[column] = None
            elif isinstance(fld, Field):
                if not fld.is_target:
                    kept[column] = fld
            else:
                kept[column] = [f for f in fld if not f.is_target]
        return kept

    @classmethod
    def from_df(cls, df: pd.DataFrame, fields: 'Dict[ColumnName, OneorMore[Field]]',
                test=False) -> 'TabularDataset':
        """Build a dataset by running every field over its column of `df`.

        `fields` maps a column name to a Field, to an iterable of Fields
        (all fed from that column), or to None (column ignored). Fields
        without a name are named after their column, with an `_{i}` suffix
        when several share a column. A warning is issued for columns of
        `df` that `fields` does not mention.
        """
        # TODO: implement auto handling of fields
        missing_cols = set(df.columns) - set(fields.keys())
        if len(missing_cols) > 0:
            warnings.warn(f"The following columns are missing from the fields list: {missing_cols}")

        additional_fields = {}
        for k, fld in fields.items():
            if fld is None:
                continue
            if isinstance(fld, Field):
                fld.set_source(df[k])
                if fld.name is None:
                    fld.name = k
            else:
                # if multiple fields are specified, hook them all to the same column
                for i, f in enumerate(fld):
                    f.set_source(df[k])
                    if f.name is None:
                        f.name = f"{k}_{i}"
                    additional_fields[f.name] = f

        fields = {k: v for k, v in fields.items() if isinstance(v, Field)}
        fields.update(additional_fields)
        examples = {}
        for fld in fields.values():
            # TODO: Handle multidimensional outputs
            examples[fld.name] = fld.compute(test=test)
        return cls(examples, fields, test=test)

    @classmethod
    def from_dfs(cls, train_df: pd.DataFrame,
                 val_df: pd.DataFrame=None, test_df: pd.DataFrame=None,
                 fields: 'Dict[ColumnName, OneorMore[Field]]'=None) -> Iterable['TabularDataset']:
        """Yield the train dataset, then validation and test ones if given.

        The training frame is processed with `test=False`; the validation
        and test frames reuse the same Field objects with `test=True`.
        Target fields are left out for the test frame.
        """
        train = cls.from_df(train_df, fields, test=False)
        yield train
        if val_df is not None:
            yield cls.from_df(val_df, fields, test=True)
        if test_df is not None:
            # Filter the caller's column -> field mapping instead of
            # `train.fields`. The latter is keyed by generated field names
            # for multi-field columns, which are not columns of test_df.
            # The old code also iterated it without `.items()`.
            non_target_fields = cls._without_targets(fields)
            yield cls.from_df(test_df, non_target_fields, test=True)

# Tests

In [20]:
import pytest

In [21]:
# test_pipe()
# `a > b` makes `a` the predecessor of `b` and evaluates to `b`.
op1 = LambdaOperator(lambda x: x + 1)
op2 = op1 > LambdaOperator(lambda x: x ** 2)
# (1 + 1) ** 2
assert op2(1) == 4
op3 = LambdaOperator(lambda x: x + 3)
# Used for its side effect only: op3 now runs after op2.
op2 > op3
# ((2 + 1) ** 2) + 3
assert op3(2) == 12

In [22]:
# test_hook()
# `a < b` attaches `b` at the start of a's pipeline and evaluates to `a`.
op1 = LambdaOperator(lambda x: x + 3)
op2 = LambdaOperator(lambda x: x * 2)
op2 < op1
# (1 + 3) * 2
assert op2(1) == 8
op3 = LambdaOperator(lambda x: x ** 2)
op3 < op2
# ((1 + 3) * 2) ** 2
assert op3(1) == 64

In [23]:
# test_normalizer_gaussian
# Calling the operator without test=True fits on `a` and transforms it, so
# the result should have (approximately) zero mean and unit std.
norm = Normalize("Gaussian")
rng = np.random.RandomState(21)
a = rng.normal(4, 10, (200, ))
a_normed = norm(a)
np.testing.assert_almost_equal(a_normed.mean(), 0.)
np.testing.assert_almost_equal(a_normed.std(), 1.)

In [25]:
# test_normalizer_rank_gaussian
# Only the centring is checked: the transform subtracts its own mean.
norm = Normalize("RankGaussian")
rng = np.random.RandomState(21)
a = rng.normal(4, 10, (200, ))
a_normed = norm(a)
np.testing.assert_almost_equal(a_normed.mean(), 0.)

In [28]:
# test_missing_filler
rng = np.random.RandomState(21)
x = pd.Series(data=rng.normal(0, 1, (100, )))
# Blank out every negative draw so the column contains missing values.
x[x < 0] = np.nan
# Each built-in strategy must leave no missing values behind.
for mthd in ["median", "mean", "mode"]:
    filler = FillMissing(mthd)
    assert not pd.isnull(filler(x)).any()

In [29]:
# test_categorize
# Smoke test: default settings must fit and transform without raising.
# No assertion is made on the resulting ids.
rng = np.random.RandomState(21)
a = pd.Series(data=rng.randint(0, 20, (100, )))
cat = Categorize()
a_transformed = cat(a)

In [30]:
# test_categorize_min_max_freq
# NOTE: `rng` is created but not used in this cell.
rng = np.random.RandomState(21)
a = pd.Series(data=np.array([1, 2, 1, 4, 1, 2, 3, 3, 5]))
cat = Categorize(min_freq=2, max_features=None, handle_unk=True)
a_transformed = cat(a)
# 4 and 5 occur once (< min_freq), so they map to the reserved unknown id 0;
# 1 occurs three times and gets a regular, non-zero id.
assert (a_transformed[a == 4] == 0).all()
assert (a_transformed[a == 5] == 0).all()
assert (a_transformed[a == 1] != 0).all()

In [31]:
# test_categorize_unknown
# NOTE: `rng` is created but not used in this cell.
rng = np.random.RandomState(21)
a = pd.Series(data=np.array([0, 6, 7, 8, 9, 6, 3, 1, 2, 4]))
cat = Categorize(min_freq=0, max_features=None, handle_unk=True)
# Fit on the values 0..5 only (the result of this call is discarded).
cat(pd.Series(data=np.arange(6)))
# test=True: transform with the fitted vocabulary, without refitting.
a_transformed = cat(a, test=True)
# Values never seen at fit time map to 0; seen values get ids above 0.
assert (a_transformed[a > 5] == 0).all()
assert (a_transformed[a <= 5] > 0).all()

In [33]:
# Load the sample data, parsing the purchase_date column as datetimes.
df = pd.read_csv("sample.csv", parse_dates=["purchase_date"])

In [34]:
# Preview the first two rows.
df.head(2)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16


In [35]:
# Whole days between the earliest and the latest purchase_date.
(df["purchase_date"].max() - df["purchase_date"].min()).days

423

In [36]:
# Per-row number of whole days elapsed since the earliest purchase_date.
(df["purchase_date"] - df["purchase_date"].min()).apply(lambda s: s.days)

0       175
1       195
2       220
3       243
4        67
5       418
6        78
7       321
8       151
9        74
10      128
11      403
12      158
13      194
14      296
15      163
16      402
17      187
18      372
19      222
20      241
21      181
22      194
23      125
24       18
25      421
26      118
27      173
28       68
29      216
       ... 
9969    308
9970      0
9971    312
9972     28
9973    216
9974    101
9975    313
9976     64
9977    197
9978     55
9979     95
9980    258
9981    300
9982     82
9983    205
9984    301
9985    191
9986    170
9987    119
9988    291
9989    287
9990    198
9991      1
9992     27
9993    286
9994    312
9995    100
9996    102
9997    327
9998    202
Name: purchase_date, Length: 9999, dtype: int64

In [37]:
def register_start(x, op):
    """Store the minimum of `x` on `op` as its `start_date` attribute."""
    earliest = x.min()
    op.start_date = earliest

In [38]:
# Build the training dataset from the sample frame. Columns of `df` that are
# not listed here are reported by from_df in a warning.
train = TabularDataset.from_df(df, fields={
    "installments": NumericField(normalization=None),
    "category_2": NumericField(normalization=None),
    # min_freq > 0 without handle_unk triggers the Vocab warning shown below.
    "authorized_flag": CategoricalField(min_freq=3),
    # A list hooks several fields to the same source column.
    "purchase_date": [
        DatetimeField(),
        # Disabled sketch of an "elapsed days" field. It relies on a
        # `build_func` argument that LambdaOperator does not accept.
#         Field(LambdaOperator(lambda x,op: (x - op.start_date).apply(lambda d: d.days),
#                              build_func=register_start),
#               name="elapsed_time", continuous=True)
    ]
}, test=False)

            Set handle_unk to True to handle categories left out due to max_features or min_freq being set.
            
  import sys


In [39]:
# Datetime accessor of the purchase_date column, kept for exploration.
dt = df["purchase_date"].dt

In [40]:
# Minute component of each purchase timestamp.
dt.minute

0       33
1       10
2        4
3        6
4       14
5       45
6       10
7        5
8        2
9       41
10      42
11       5
12       2
13      59
14      29
15      40
16      19
17       3
18      54
19      53
20      53
21      27
22       6
23      29
24      16
25      54
26      30
27      43
28      46
29      10
        ..
9969    38
9970    46
9971    11
9972    17
9973    47
9974    10
9975    54
9976    10
9977    53
9978    47
9979    34
9980     4
9981     1
9982    52
9983    40
9984    58
9985     4
9986    10
9987    46
9988     0
9989    22
9990    47
9991    57
9992     3
9993     9
9994     0
9995    37
9996    51
9997     0
9998     2
Name: purchase_date, Length: 9999, dtype: int64