In [None]:
#export
from local.imports import *
from local.test import *
from local.core import *
from local.data.all import *
from local.tabular.core import *
from local.notebook.showdoc import show_doc

In [None]:
#default_exp tabular.rapids

# Tabular with rapids

> Basic functions to preprocess tabular data before assembling it in a `DataBunch` on the GPU.

In [None]:
#export
try: import cudf,nvcategory
except: print("This requires rapids, see https://rapids.ai/ for installation details")

## TabularProcessors

In [None]:
#export
class CategorifyGPU(TabularProc):
    "Encode the categorical columns as integer codes, using the categories collected at setup."
    order = 1
    def setup(self, df, trn_idx=None):
        "Collect the categories of each column in `cat_names` from `df` (restricted to `trn_idx` when given)."
        self.categories = {}
        for name in self.cat_names:
            source = df.loc[trn_idx, name] if trn_idx is not None else df[name]
            # Categories are built from strings, so anything else is cast first
            if source.dtype != "object": source = source.astype("str")
            self.categories[name] = nvcategory.from_strings(source.data).keys()

    def __call__(self, df):
        "Replace, in place, each column in `cat_names` by its codes w.r.t. the categories stored at setup."
        for name in self.cat_names:
            if df[name].dtype != "object": df[name] = df[name].astype("str")
            encoded = nvcategory.from_strings(df[name].data).set_keys(self.categories[name])
            df[name] = encoded.values()

In [None]:
show_doc(CategorifyGPU, title_level=3)

<h3 id="<code>class</code> <code>CategorifyGPU</code>" class="doc_header"><code>class</code> <code>CategorifyGPU</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>CategorifyGPU</code>(**`cat_names`**=*`None`*, **`cont_names`**=*`None`*, **`func`**=*`None`*) :: [`TabularProc`](/tabular.core.html#TabularProc)

Transform the categorical variables to that type.

In [None]:
# Setup on the full frame: the integer column is cast to strings, so the categories are '0','1','2'
cat = CategorifyGPU(cat_names='a')
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,0,2]}))
cat.setup(df)
test_eq(list(cat.categories['a'].to_host()), ['0','1','2'])
# Applying the proc replaces the column in place by the index of each value in the categories
cat(df)
test_eq(df['a'].to_array(), np.array([0,1,2,0,2]))
df1 = cudf.from_pandas(pd.DataFrame({'a':[1,0,3,-1,2]}))
cat(df1)
#Values that weren't in the training df are sent to -1 (na)
test_eq(df1['a'].to_array(), np.array([1,0,-1,-1,2]))

In [None]:
# Setup on rows 0-2 only: the value 3 (row 3) is not part of the categories, so it is encoded as -1
cat = CategorifyGPU(cat_names='a')
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,3,2]}))
cat.setup(df, trn_idx=[0,1,2])
test_eq(list(cat.categories['a'].to_host()), ["0","1","2"])
cat(df)
test_eq(df['a'].to_array(), np.array([0,1,2,-1,2]))

In [None]:
#export
class NormalizeGPU(TabularProc):
    "Standardize the continuous columns with the mean and std computed at setup."
    order = 2
    def setup(self, df, trn_idx=None):
        "Store the mean and population std (`ddof=0`) of each column in `cont_names` (on `trn_idx` rows when given)."
        self.means, self.stds = {}, {}
        for name in self.cont_names:
            train_col = df.loc[trn_idx, name] if trn_idx is not None else df[name]
            mean, std = train_col.mean(), train_col.std(ddof=0)
            self.means[name], self.stds[name] = mean, std

    def __call__(self, df):
        "Subtract the stored mean and divide by the stored std, in place."
        for name in self.cont_names:
            centered = df[name] - self.means[name]
            # The small constant avoids a division by zero on constant columns
            df[name] = centered / (1e-7 + self.stds[name])

In [None]:
show_doc(NormalizeGPU, title_level=3)

<h3 id="<code>class</code> <code>NormalizeGPU</code>" class="doc_header"><code>class</code> <code>NormalizeGPU</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>NormalizeGPU</code>(**`cat_names`**=*`None`*, **`cont_names`**=*`None`*, **`func`**=*`None`*) :: [`TabularProc`](/tabular.core.html#TabularProc)

Normalize the continuous variables.

In [None]:
# Setup on the full column; numpy's default `std` is the population std, like `ddof=0` in `NormalizeGPU.setup`
norm = NormalizeGPU(cont_names='a')
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,3,4]}))
norm.setup(df)
x = np.array([0,1,2,3,4])
m,s = x.mean(),x.std()
test_eq(norm.means, {'a': m})
test_close(norm.stds['a'], s)
norm(df)
test_close(df['a'].to_array(), (x-m)/s)
# A new frame is normalized with the statistics stored at setup, not its own
df1 = cudf.from_pandas(pd.DataFrame({'a':[5,6,7]}))
norm(df1)
test_close(df1['a'].to_array(), (np.array([5,6,7])-m)/s)

In [None]:
# Statistics are computed on rows 0-2 only, but the normalization is applied to every row
norm = NormalizeGPU(cont_names='a')
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,3,4]}))
norm.setup(df, trn_idx=[0,1,2])
x = np.array([0,1,2])
m,s = x.mean(),x.std()
test_eq(norm.means, {'a': m})
test_close(norm.stds['a'], s)
norm(df)
test_close(df['a'].to_array(), (np.array([0,1,2,3,4])-m)/s)

In [None]:
#export
def get_median(col):
    """Get the median of a cudf Series `col`, ignoring missing values.

    For an even number of non-null values this returns the upper of the two middle values
    (no averaging). Indexing fails if `col` has no non-null value.
    """
    # Sort first and reset the index afterwards: the index is then 0..n-1 in sorted order, so
    # `[n//2]` picks the middle element whether integer indexing is label-based or positional.
    # (Resetting before sorting leaves the labels in the original order, and a label-based
    # lookup would then return the element at original position n//2 instead of the median.)
    ordered = col.dropna().sort_values().reset_index(drop=True)
    return ordered[len(ordered)//2]

In [None]:
#export
class FillMissingGPU(TabularProc):
    "Replace the missing values of the continuous columns, optionally flagging them in an extra `_na` column."
    def __init__(self, cat_names=None, cont_names=None, fill_strategy=FillStrategy.median, add_col=True, fill_val=0.):
        super().__init__(cat_names, cont_names)
        self.fill_strategy = fill_strategy
        self.add_col = add_col
        self.fill_val = fill_val

    def setup(self, df, trn_idx=None):
        "Compute the filler of every column in `cont_names` that has missing values (on `trn_idx` rows when given)."
        self.na_dict = {}
        for name in self.cont_names:
            train_col = df.loc[trn_idx, name] if trn_idx is not None else df[name]
            # Columns without missing values at setup get no filler
            if not train_col.isnull().any(): continue
            if self.fill_strategy == FillStrategy.median:     filler = get_median(train_col)
            elif self.fill_strategy == FillStrategy.constant: filler = self.fill_val
            else: filler = train_col.dropna().value_counts().index[0]
            self.na_dict[name] = filler
            if not self.add_col: continue
            # The indicator column is registered as a categorical variable
            flag = name+'_na'
            df[flag] = df[name].isnull()
            if flag not in self.cat_names: self.cat_names.append(flag)

    def __call__(self, df):
        "Fill `df` in place with the fillers found at setup; raise on missing values in a column that had none."
        for n in self.cont_names:
            if n not in self.na_dict:
                if df[n].isnull().sum() != 0:
                    raise Exception(f"""There are nan values in field {n} but there were none in the training set given at setup. 
                Please fix those manually.""")
                continue
            # The indicator has to be computed before the missing values are replaced
            if self.add_col: df[n+'_na'] = df[n].isnull()
            df[n] = df[n].fillna(self.na_dict[n])

In [None]:
show_doc(FillMissingGPU, title_level=3)

<h3 id="<code>class</code> <code>FillMissingGPU</code>" class="doc_header"><code>class</code> <code>FillMissingGPU</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>FillMissingGPU</code>(**`cat_names`**=*`None`*, **`cont_names`**=*`None`*, **`fill_strategy`**=*`'median'`*, **`add_col`**=*`True`*, **`fill_val`**=*`0.0`*) :: [`TabularProc`](/tabular.core.html#TabularProc)

Fill the missing values in continuous columns.

In [None]:
# One proc per fill strategy, each set up on its own copy of the same column
fill1,fill2,fill3 = (FillMissingGPU(cont_names='a', fill_strategy=s) 
                     for s in [FillStrategy.median, FillStrategy.constant, FillStrategy.most_common])
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,np.nan,1,2,3,4]}))
df1 = df.copy(); df2 = df.copy()
fill1.setup(df); fill2.setup(df1); fill3.setup(df2)
test_eq(fill1.na_dict, {'a': 2.})
test_eq(fill2.na_dict, {'a': 0})
test_eq(fill3.na_dict, {'a': 1.0})
# With the default `add_col=True`, setup registers the indicator column as categorical
for f in [fill1, fill2, fill3]: test_eq(f.cat_names, ['a_na'])

fill1(df); fill2(df1); fill3(df2)
for df_,v in zip([df, df1, df2], [2., 0., 1.]):
    test_eq(df_['a'].to_array(), np.array([0, 1, v, 1, 2, 3, 4]))
    test_eq(df_['a_na'].to_array(), np.array([0, 0, 1, 0, 0, 0, 0]))
    
# New data is filled with the values stored at setup, not with its own statistics
dfa = cudf.from_pandas(pd.DataFrame({'a':[np.nan,0,np.nan]}))
dfa1 = dfa.copy(); dfa2 = dfa.copy()
fill1(dfa); fill2(dfa1); fill3(dfa2)
for df_,v in zip([dfa, dfa1, dfa2], [2., 0., 1.]):
    test_eq(df_['a'].to_array(), np.array([v, 0, v]))
    test_eq(df_['a_na'].to_array(), np.array([1, 0, 1]))

## TabularProcessor -

In [None]:
#export
class TabularPreprocessorGPU():
    "An object that will preprocess dataframes using `procs`"
    def __init__(self, procs, cat_names=None, cont_names=None, cat_y=None, inplace=True):
        self.cat_names,self.cont_names,self.cat_y,self.inplace = L(cat_names),L(cont_names),L(cat_y),inplace
        # Classes are kept as they are, anything else is wrapped in a `TabularProc` partial; all are sorted by `order`
        self.procs = L(p if isinstance(p, type) else partial(TabularProc, func=p) for p in procs).sorted(key='order')
    
    def __call__(self, df, trn_idx=None):
        "Call each of `self.procs` on `df`, setup on `df[trn_idx]` if not None"
        df = df if self.inplace else df.copy()
        if trn_idx is None:
            # Apply mode: `self.procs` are expected to be the instances built by a previous call with `trn_idx`
            for p in self.procs: p(df)
        else:
            # Setup mode: instantiate each proc, set it up on the training rows, apply it, and keep the instance
            self.procs,procs = [],self.procs
            for p in procs: 
                # Only the categorify step also processes the categorical targets
                p_ = p(cat_names=self.cat_names + self.cat_y if p==CategorifyGPU else self.cat_names, cont_names=self.cont_names)
                p_.setup(df, trn_idx=trn_idx)
                p_(df)
                # Other procs may have changed the column names (e.g. `FillMissingGPU` adds `_na` columns)
                if p!= CategorifyGPU: self.cat_names,self.cont_names = p_.cat_names,p_.cont_names
                else:
                    self.classes = {n:'#na#'+L(p_.categories[n].to_host(), use_list=True) for n in self.cat_names + self.cat_y}
                self.procs.append(p_)
            for p in self.procs:
                # `NormalizeGPU` derives from `TabularProc`, not `Normalize`: checking `Normalize` alone never
                # matched it, so the statistics used to undo the normalization at decode time were never stored.
                if isinstance(p, (NormalizeGPU, Normalize)): self.means,self.stds = p.means,p.stds
        return df

In [None]:
# `noop` is a plain function, so it is wrapped in a `TabularProc` partial by the preprocessor
procs = [NormalizeGPU, CategorifyGPU, FillMissingGPU, noop]
proc = TabularPreprocessorGPU(procs, 'a', 'b', inplace=False)

#Test reordering and partialize
test_eq(proc.procs, [FillMissingGPU, proc.procs[1], CategorifyGPU, NormalizeGPU])
test_eq(proc.procs[1].func, TabularProc)
test_eq(proc.procs[1].keywords, {'func': noop})

df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4]}))

#Test setup and apply on df_trn
df1 = proc(df, trn_idx=range_of(df))
test_eq(df1['a'].to_array(), [0,1,2,1,1,2,0])
test_eq(df1['b_na'].to_array(), [0,0,1,0,0,0,0])
# The missing value of `b` is filled with the median (2) before the normalization
x = np.array([0,1,2,1,2,3,4])
m,s = x.mean(),x.std()
test_close(df1['b'].to_array(), (x-m)/s)
test_eq(proc.classes, {'a': ['#na#','0','1','2'], 'b_na': ['#na#','False','True']})

#Test apply on df_val
df = cudf.from_pandas(pd.DataFrame({'a':[2,1,3], 'b':[4,5,np.nan]}))
df1 = proc(df)
test_eq(proc.classes, {'a': ['#na#','0','1','2'], 'b_na': ['#na#','False','True']})
test_eq(df1['a'].to_array(), [2,1,-1])
test_eq(df1['b_na'].to_array(), [0,0,1])
# The filler and the normalization statistics are the ones computed on the training frame
x = np.array([4, 5, 2])
test_close(df1['b'].to_array(), (x-m)/s)

#Test apply on cat_y
procs = [NormalizeGPU, CategorifyGPU, FillMissingGPU, noop]
proc = TabularPreprocessorGPU(procs, 'a', 'b', cat_y='c', inplace=False)

df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']}))
df1 = proc(df, trn_idx=range_of(df))
# The target is categorified but is not added to `cat_names`
test_eq(proc.cat_names, ['a', 'b_na'])
test_eq(df1['a'].to_array(), [0,1,2,1,1,2,0])
test_eq(df1['b_na'].to_array(), [0,0,1,0,0,0,0])
test_eq(df1['c'].to_array(), [1,0,1,0,0,1,0])
x = np.array([0,1,2,1,2,3,4])
m,s = x.mean(),x.std()
test_close(df1['b'].to_array(), (x-m)/s)
test_eq(proc.classes, {'a': ['#na#','0','1','2'], 'b_na': ['#na#','False','True'], 'c': ['#na#','a','b']})

In [None]:
#export
def process_df_gpu(df, splits, procs, cat_names=None, cont_names=None, cat_y=None, inplace=True):
    "Process `df` with `procs` (set up on `splits[0]`); return the processed dataframe and its `TabularPreprocessorGPU`"
    preproc = TabularPreprocessorGPU(procs, cat_names=cat_names, cont_names=cont_names, cat_y=cat_y, inplace=inplace)
    # The first split holds the training indices used for the setup
    return preproc(df, trn_idx=splits[0]), preproc

Pass the same `splits` as you will use for splitting the data, so that the setup is only done on the training set. `cat_names` are the names of the categorical variables, `cont_names` the continuous ones, `cat_y` are the names of the dependent variables that are categories. If `inplace=True`, processing is applied inplace, otherwise it creates a copy of `df`.

In [None]:
#export
class TabularLine(pd.Series):
    "A line of a dataframe that knows how to show itself"
    def show(self, ctx=None, **kwargs):
        "Return `self` when there is no `ctx`, otherwise the result of appending `self` to `ctx`"
        if ctx is None: return self
        # `Series.append` was removed in pandas 2.0; it was a thin wrapper around `pd.concat`,
        # which gives the same result on every pandas version.
        if isinstance(ctx, pd.Series): return pd.concat([ctx, self])
        # Any other kind of context keeps using its own `append`
        return ctx.append(self)

In [None]:
#export
class TensorTabular(tuple):
    "A tuple of batched tabular tensors that knows how to build and display its show contexts"

    def get_ctxs(self, max_samples=10, **kwargs):
        "One empty row per sample to show, at most `max_samples` and no more than the first dimension of `self[0]`"
        count = min(self[0].shape[0], max_samples)
        empty = pd.DataFrame(index = range(count))
        return [empty.iloc[pos] for pos in range(count)]

    def display(self, ctxs):
        "Gather the filled `ctxs` in a dataframe and display it"
        display_df(pd.DataFrame(ctxs))

In [None]:
#export
class ReadTabLine(ItemTransform):
    "Read the categorical and continuous variables of a row, using the state stored in `proc`"
    def __init__(self, proc):
        self.proc = proc
        # One lookup table per categorical variable; values that are not in the classes map to 0
        self.o2is = {}
        for name in proc.cat_names:
            lookup = defaultdict(int)
            for idx, label in enumerate(proc.classes[name]): lookup[label] = idx
            self.o2is[name] = lookup

    def encodes(self, row):
        "Turn `row` into a `TensorTabular` of (categorical indices, continuous values)"
        cat_codes = [self.o2is[name][row[name]] for name in self.proc.cat_names]
        cont_vals = [row[name] for name in self.proc.cont_names]
        return TensorTabular((tensor(cat_codes).long(), tensor(cont_vals).float()))

    def decodes(self, o) -> TabularLine:
        "Map the indices in `o[0]` back to their classes and undo the normalization of `o[1]`"
        cat_codes, cont_vals = o[0], o[1]
        cont_names = self.proc.cont_names
        line = {}
        for code, name in zip(cat_codes, self.proc.cat_names):
            line[name] = self.proc.classes[name][code]
        # Without stored statistics on `proc`, the continuous values are left unchanged (mean 0, std 1)
        means = getattr(self.proc, 'means', {c:0 for c in cont_names})
        stds  = getattr(self.proc, 'stds',  {c:1 for c in cont_names})
        for val, name in zip(cont_vals, cont_names):
            line[name] = (val*stds[name] + means[name]).item()
        return pd.Series(line)

In [None]:
#export
class ReadTabTarget(ItemTransform):
    "Read the single categorical target of a row, using the classes stored in `proc`"
    def __init__(self, proc):
        self.proc = proc
        assert len(proc.cat_y) == 1
        vocab = proc.classes[proc.cat_y[0]]
        # Values that are not in the classes map to 0
        self.o2i = defaultdict(int, ((label, idx) for idx, label in enumerate(vocab)))

    def encodes(self, row):
        "Index of the target of `row` in its classes, shifted down by one"
        target = self.proc.cat_y[0]
        return self.o2i[row[target]] - 1

    def decodes(self, o) -> Category:
        "Class of the encoded target `o` (undoes the shift applied in `encodes`)"
        target = self.proc.cat_y[0]
        return self.proc.classes[target][o+1]

In [None]:
# Dataset over the processed frame: the first transform list reads the features of a row, the second its target
tds = TfmdDS(df1, tfms=[[ReadTabLine(proc)], ReadTabTarget(proc)], use_list=None)

In [None]:
# Encoded item: a (categorical indices, continuous values) pair, then the target
enc = tds[1]
test_eq(enc[0][0], tensor([2,1]))
test_close(enc[0][1], tensor([-0.628828]))
test_eq(enc[1], 0)

# Decoding gives back the class names and the de-normalized continuous value
dec = tds.decode(enc)
assert isinstance(dec[0], TabularLine)
test_close(dec[0], pd.Series({'a': 1, 'b_na': False, 'b': 1}))
test_eq(dec[1], 'a')

test_stdout(lambda: print(tds.show_at(1)), """a               1
b_na        False
b               1
category        a
dtype: object""")

## Integration example

In [None]:
# Read the Adult sample csv into a pandas (CPU) dataframe and preview its first rows
path = decompress_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [None]:
# NOTE(review): these look like the CPU processors (presumably from `local.tabular.core`), not the
# `CategorifyGPU`/`FillMissingGPU`/`NormalizeGPU` classes defined in this notebook — confirm this is intended
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]

In [None]:
# NOTE(review): this calls `process_df` on a pandas dataframe rather than `process_df_gpu` on a cudf one,
# so the GPU code defined above is not exercised by this example — confirm this is intended
splits = RandomSplitter()(range_of(df))
df1,proc = process_df(df, splits, procs=procs, cat_names=cat_names, cont_names=cont_names, cat_y="salary", inplace=False)

In [None]:
# Same two transform lists as before (features, target), with `splits` passed as the filters of the `DataSource`
dsrc = DataSource(df1, filts=splits, tfms=[[ReadTabLine(proc)], [ReadTabTarget(proc)]])

In [None]:
# Build a databunch with batches of 64 and show a decoded batch
dbch = dsrc.databunch(bs=64)
dbch.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,category
0,Private,Assoc-voc,Married-civ-spouse,Craft-repair,Husband,White,False,51.0,99063.992188,11.0,<50k
1,Local-gov,Assoc-voc,Divorced,Tech-support,Unmarried,Asian-Pac-Islander,False,44.0,73199.0,11.0,<50k
2,Private,Assoc-voc,Married-civ-spouse,Craft-repair,Husband,White,False,39.0,53569.0,11.0,<50k
3,Private,Some-college,Married-civ-spouse,Adm-clerical,Husband,White,False,49.0,280525.0,10.0,>=50k
4,Private,Some-college,Never-married,Tech-support,Own-child,White,False,25.0,245628.0,10.0,<50k
5,State-gov,Doctorate,Married-civ-spouse,Prof-specialty,Husband,White,False,41.0,116520.0,16.0,>=50k
6,Private,HS-grad,Married-civ-spouse,Sales,Husband,Other,False,25.0,195201.0,9.0,<50k
7,Private,HS-grad,Divorced,Craft-repair,Not-in-family,White,False,19.0,517036.0,9.0,<50k
8,Private,Masters,Married-civ-spouse,Prof-specialty,Husband,White,False,48.0,47343.0,14.0,>=50k
9,Self-emp-not-inc,Masters,Married-civ-spouse,Sales,Wife,White,False,41.0,186909.0,14.0,>=50k


## Export -

In [None]:
#hide
from local.notebook.export import notebook2script
notebook2script(all_fs=True)

Converted 00_test.ipynb.
Converted 01_core.ipynb.
Converted 01a_dataloader.ipynb.
Converted 01a_script.ipynb.
Converted 02_transforms.ipynb.
Converted 03_pipeline.ipynb.
Converted 04_data_external.ipynb.
Converted 05_data_core.ipynb.
Converted 06_data_source.ipynb.
Converted 07_vision_core.ipynb.
Converted 08_pets_tutorial.ipynb.
Converted 09_vision_augment.ipynb.
Converted 11_layers.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_learner.ipynb.
Converted 14_callback_schedule.ipynb.
Converted 15_callback_hook.ipynb.
Converted 16_callback_progress.ipynb.
Converted 17_callback_tracker.ipynb.
Converted 18_callback_fp16.ipynb.
Converted 19_callback_mixup.ipynb.
Converted 20_metrics.ipynb.
Converted 21_tutorial_imagenette.ipynb.
Converted 30_text_core.ipynb.
Converted 31_text_data.ipynb.
Converted 32_text_models_awdlstm.ipynb.
Converted 33_test_models_core.ipynb.
Converted 34_callback_rnn.ipynb.
Converted 35_tutorial_wikitext.ipynb.
Converted 36_text_models_qrnn.ipynb.
Converted 40_tabula