Idea: add an option to turn labelling on and off on `TfmdLists`.  
Known issue: calling `parallel` on `tls` to get the labels returns a list of `None`s.

In [None]:
from fastai2.basics import *
from pigboat.basics import *

In [None]:
class AttrProxy(GetAttr):
    "`GetAttr` subclass that keeps the object it is given in `self.default`"
    # NOTE(review): presumably `GetAttr` forwards missing attribute lookups to `default` -- confirm against fastcore
    def __init__(self, default):
        self.default = default

In [None]:
def _get_proxy(x):
    if x.__class__.__module__ != 'builtins': raise ValueError('Use only with builtins')
    name = 'Proxy' + x.__class__.__name__.capitalize()
    return type(name, (x.__class__,), {})(x)

In [None]:
def _add_attr(obj, name, value):
    try:                   
        setattr(obj, name, getattr(obj,'labels',value))
        return obj
    # It's not possible to set attributes on builtin types, so we wrap with a proxy
    except AttributeError: return _add_attr(_get_proxy(obj), name, value)

In [None]:
def _maintain_labels(old, new):
    if hasattr(old, 'labels'): new = _add_attr(new, 'labels', old.labels)
    return new

In [None]:
def maintain_labels(f):
    "Wrap `f(fn, x, **kwargs)` so its result goes through `_maintain_labels(x, result)`"
    def _inner(fn, x, **kwargs):
        out = f(fn, x, **kwargs)
        return _maintain_labels(x, out)
    return _inner

In [None]:
# figure out delegates
# Keep a handle on the current `Pipeline.__init__` before the replacement below
# is defined; the replacement calls it first.
# NOTE(review): re-running this cell would presumably capture the already-patched
# `__init__` here and wrap `_do_call` twice -- confirm.
_old_init = Pipeline.__init__
# NOTE(review): `@patch` presumably installs this function as `Pipeline.__init__`
# based on the `self:Pipeline` annotation -- confirm against fastcore.
@patch
def __init__(self:Pipeline, *args, **kwargs):
    # Run the saved constructor with the same arguments.
    _old_init(self, *args, **kwargs)
    # Wrap `_do_call` on every entry of `self.fs` so results pass through `_maintain_labels`.
    for o in self.fs: o._do_call = maintain_labels(o._do_call)

In [None]:
# Open question: can we confirm the transform was actually applied without relying on "res is not x"?
@typedispatch
def subscribe(tfm):
    "Decorator factory: replace `tfm.__call__` so that `f` is applied to every result that is not the input object"
    orig_call = tfm.__call__
    def _inner(f):
        def _call(self, x, **kwargs):
            out = _maintain_labels(x, orig_call(self, x, **kwargs))
            # Getting the very same object back is treated as "nothing happened", so `f` is skipped
            return out if out is x else f(out)
        tfm.__call__ = _call
        return f
    return _inner

In [None]:
class Labeller:
    "Builds decorators that hook labelling functions onto a transform through `subscribe`"
    def __init__(self, abstain='abstain'):
        # Fallback label handed to `ifnone` in `_add_label`
        self.abstain = abstain

    def __call__(self, tfm):
        "Return a decorator that subscribes the label-appending wrapper of `f` to `tfm`"
        def _inner(f):
            register = subscribe(tfm)
            return register(self._add_label(f))
        return _inner

    def _add_label(self, f):
        "Wrap `f` so that calling it on `x` appends a label to `x.labels` and returns the labelled object"
        def _inner(x):
            # `f` sees the incoming object, before `_add_attr` possibly swaps it for a proxy.
            # NOTE(review): `ifnone` presumably substitutes `self.abstain` when `f(x)` is None -- confirm.
            new_label = ifnone(f(x), self.abstain)
            labelled = _add_attr(x, 'labels', [])
            labelled.labels.append(new_label)
            return labelled
        return _inner

In [None]:
def labeller(tfm):
    """Decorator factory: subscribe labelling function `f` to transform `tfm`.

    Uses a `Labeller` with its default abstain label to build the
    label-appending wrapper around `f`, then registers it with `subscribe`.
    """
    def _inner(f):
        # `_add_label` exists only as a `Labeller` method; a bare module-level
        # `_add_label(f)` would raise NameError when the decorator is applied.
        return subscribe(tfm)(Labeller()._add_label(f))
    return _inner

In [None]:
# Does not work with wrapper style
# @Transform
# def neg(x:Tensor): return -x
# @Transform
# def add_greeting(x:str): return 'hello ' + x

In [None]:
class Neg(Transform):
    "Transform whose `encodes` returns `-x` for `x` annotated as `Tensor`"
    def encodes(self, x:Tensor):
        return -x

In [None]:
def func(x):
    "Return `x + 3`"
    return x + 3

In [None]:
# @labeller(neg)
# def labeller_cat1(x): return CAT1
# @labeller(neg)
# def labeller_cat2(x): return CAT2

In [None]:
# @labeller(add_greeting)
# def labeller_greeting(x): return CAT1

In [None]:
# Example label values referenced by the commented-out labeller cells
CAT1 = 'cat1'
CAT2 = 'cat2'

In [None]:
# @labeller(Neg)
# def func1(x): return CAT2

In [None]:
# x = tensor(2)
# tfms = Pipeline([Neg])
# res = tfms(x)

In [None]:
# res.labels

In [None]:
# @Transform
# def str2int(x:MyStr): return MyInt(float(x))
# @Transform
# def int2tensor(x:MyInt): return tensor(x)

In [None]:
# @labeller(str2int)
# def func1(x): return 'str2int'
# @labeller(int2tensor)
# def func2(x):
#     if x>2: return 'int2tensor'

In [None]:
# pipe = Pipeline([str2int, int2tensor])

In [None]:
# res = pipe(MyStr('1.2'))
# res.labels

## IMDB tests

In [None]:
from fastai2.text.all import *

In [None]:
# `source` is the directory `texts.csv` is read from in the next cell.
# NOTE(review): `untar_data` presumably downloads/unpacks the IMDB sample if needed -- confirm.
source = untar_data(URLs.IMDB_SAMPLE)

In [None]:
# Load `texts.csv` from the sample directory into a DataFrame
df = pd.read_csv(source/'texts.csv')

In [None]:
# Apply a `ColSplitter` built on the `is_valid` column to `df`.
# NOTE(review): `splits` is not used by any of the cells visible in this file.
splits = ColSplitter('is_valid')(df)

In [None]:
# Copy the `text` column into a new `original` column; `ColReader('original')` reads it further down
df['original'] = df['text']

In [None]:
# Stand-alone transform objects.
# NOTE(review): the `TfmdLists` cell further down builds its own instances; these
# three names are not referenced again in the visible cells.
col_reader = mk_transform(ColReader('original'))
tkzer = Tokenizer.from_df('text')
nmzer = Numericalize()

In [None]:
# Label values used by the labelling functions and the vocab below
ABSTAIN = 'abstain'
POS = 'positive'
NEG = 'negative'

In [None]:
class Noop(Transform):
    "Transform whose `encodes` returns `x.copy()`"
    def encodes(self, x):
        # NOTE(review): presumably a copy is returned so the result is a different object than the input -- confirm
        return x.copy()

In [None]:
# NOTE: this rebinds the name `labeller`, replacing the `labeller(tfm)` function
# defined earlier in the file with a `Labeller` instance (default abstain label).
labeller = Labeller()

In [None]:
# NOTE(review): the name `test` may shadow a helper of the same name brought in by the star imports -- confirm
@labeller(Tokenizer)
def test(x):
    "Labelling function registered on `Tokenizer`; returns `POS` for every input"
    return POS

In [None]:
@labeller(ColReader)
def liked(x):
    "Labelling function registered on `ColReader`; returns `NEG` for every input"
    return NEG

In [None]:
@labeller(Noop)
def nothing(x):
    "Labelling function registered on `Noop`; returns 'nada' for every input"
    return 'nada'

In [None]:
# Root directory used for the text-file listing in the next cell.
# NOTE(review): `untar_data` presumably downloads/unpacks the full IMDB dataset if needed -- confirm.
source2 = untar_data(URLs.IMDB)

In [None]:
# List the text files under the given sub-folders of `source2`.
# NOTE(review): `fns` is only consumed by the commented-out `TfmdLists` cell that follows.
folders = ['test', 'train', 'unsup']
fns = get_text_files(source2, folders=folders)

In [None]:
# tls = TfmdLists(fns, tfms=[Tokenizer.from_folder(source2), Numericalize])

In [None]:
# Label vocabulary handed to `CategorizeTaskLabels` in the `TfmdLists` cell
vocab = [ABSTAIN,POS,NEG]

In [None]:
# dset = Datasets(df, tfms=[[ColReader('original'), Tokenizer.from_df('text'), Numericalize()]])
# Transformed list over `df`: read the `original` column, tokenize, numericalize,
# then apply `CategorizeTaskLabels` with `vocab`.
# NOTE(review): `CategorizeTaskLabels` is not defined in this file -- presumably it comes from `pigboat`.
tls = TfmdLists(df, tfms=[ColReader('original'), Tokenizer.from_df('text'), Numericalize(), CategorizeTaskLabels(vocab=vocab)])

In [None]:
# Collect the `labels` attribute of every item of `tls`, iterating under a progress bar
labels = [o.labels for o in progress_bar(tls)]

In [None]:
# Show the labels attached to the first item (the recorded output follows this cell)
tls[0].labels

TensorMultiCategory([1, 2])

In [None]:
# Show the labels of the first item after passing it through `tls.decode`
tls.decode(tls[0]).labels

(#2) ['negative','positive']

In [None]:
# Display the per-item label lists collected above (the recorded output is truncated)
labels

[[TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory(1), TensorCategory(2)],
 [TensorCategory

In [None]:
# `Categorize` with a fixed two-entry vocab and `add_na=False`, used to probe an unknown value in the next cell
cat = Categorize(vocab=['pos','neg'], add_na=False)

In [None]:
# 'adafd' is not in the vocab; the recorded output for this cell is `KeyError: 'adafd'`
cat('adafd')

KeyError: 'adafd'