In [None]:
#export
from local.imports import *
from local.test import *
from local.core import *
from local.data.all import *
from local.notebook.showdoc import show_doc

In [None]:
# `decompress_data` returns the folder used below (presumably after fetching and
# extracting the ADULT_SAMPLE archive -- confirm in local.data); the CSV inside it
# is read into a pandas DataFrame.
adult_source = decompress_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(adult_source/'adult.csv')

In [None]:
# Columns treated as categorical inputs; the preprocessing loop may append
# "<col>_na" marker columns to this list.
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
# Columns treated as continuous inputs (imputed and standardised below).
cont_names = ['age', 'fnlwgt', 'education-num']

In [None]:
# Preview the raw table; note the missing values in `occupation` and `education-num`.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


## Preprocessing

In [None]:
# Impute and standardise every continuous column of `df` in place.
# `means` / `stds` keep the per-column statistics (computed after imputation) so the
# same transform can be re-applied later.
means,stds = {},{}
for n in cont_names:
    if pd.isnull(df[n]).any(): 
        # Remember which rows were missing before filling them with the median.
        df[n+"_na"] = pd.isnull(df[n])
        # Guard the registration: if `df` is reloaded and this cell is run again
        # without resetting `cat_names`, the marker must not be listed twice.
        if n+'_na' not in cat_names: cat_names.append(n+'_na')
        df[n] = df[n].fillna(df[n].median())
    means[n],stds[n] = (df[n].mean(),df[n].std())
    df[n] = (df[n] - means[n]) / stds[n]

In [None]:
# Show the categorical column list after preprocessing (it now includes any "_na" markers).
cat_names

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'education-num_na']

In [None]:
# Convert each categorical input, and the target, to an ordered pandas Categorical.
for n in [*cat_names, "salary"]:
    df[n] = pd.Categorical(df.loc[:, n], ordered=True)

In [None]:
# Split the frame into categorical inputs, continuous inputs and the target column.
cats, conts, y = df[cat_names], df[cont_names], df["salary"]

In [None]:
# Preview the categorical inputs (still shown by label at this point).
cats.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na
0,Private,Assoc-acdm,Married-civ-spouse,,Wife,White,False
1,Private,Masters,Divorced,Exec-managerial,Not-in-family,White,False
2,Private,HS-grad,Divorced,,Unmarried,Black,True
3,Self-emp-inc,Prof-school,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,False
4,Self-emp-not-inc,7th-8th,Married-civ-spouse,Other-service,Wife,Black,True


In [None]:
# Preview the continuous inputs after imputation and standardisation.
conts.head()

Unnamed: 0,age,fnlwgt,education-num
0,0.763785,-0.838071,0.752389
1,0.397227,0.44498,1.535566
2,-0.042641,-0.886721,-0.030787
3,-0.042641,-0.728862,1.927154
4,0.250604,-1.018298,-0.030787


## DataBunch

In [None]:
class DFDataset(GetAttr):
    """Row-indexable dataset over categorical codes, continuous features and targets.

    `ds[i]` returns `((cat_values, cont_values), target)` for the row at *position* `i`.
    """
    # NOTE(review): presumably `GetAttr` forwards the names listed in `_xtra` to
    # `self.default` (the target container here) -- confirm against local.core.
    _xtra = ["__len__"]
    def __init__(self, cats, conts, y): 
        self.y,self.conts,self.cats = y,conts,cats
        self.default = y
    def __getitem__(self, i): 
        # `cats` and `conts` are read by position, so the target must be too:
        # plain `y[i]` on a pandas Series is a *label* lookup and would pick a
        # different row (or raise) whenever the index is not 0..n-1.
        # Containers without `.iloc` (e.g. arrays) keep the original `y[i]` access.
        target = self.y.iloc[i] if hasattr(self.y, "iloc") else self.y[i]
        return ((self.cats.iloc[i].values,self.conts.iloc[i].values), target)

In [None]:
# Build the integer-coded categorical frame on an explicit copy, so the column
# assignments below never write into (or warn about) a slice of `df`.
cats = df.loc[:, cat_names].copy()
# Replace each Categorical column with its integer codes (pandas uses -1 for missing values).
for n in cat_names: cats[n] = cats[n].cat.codes
conts = df[cont_names]

In [None]:
# Targets are passed as the integer codes of the `salary` Categorical.
ds = DFDataset(cats, conts, y.cat.codes)

In [None]:
# First item: ((categorical codes, continuous values), target). The -1 code
# corresponds to the missing `occupation` of row 0.
ds[0]

((array([ 4,  7,  2, -1,  5,  4,  0], dtype=int8),
  array([ 0.76378467, -0.83807092,  0.75238915])),
 1)

In [None]:
# Wrap the dataset in a TfmdDL using its default arguments.
dl = TfmdDL(ds)

In [None]:
# Fetch one collated batch; the categorical tensor comes out as int8 and still contains -1 codes.
dl.one_batch()

((tensor([[ 4,  7,  2, -1,  5,  4,  0],
          [ 4, 12,  0,  4,  1,  4,  0],
          [ 4, 11,  0, -1,  4,  2,  1],
          [ 5, 14,  2, 10,  0,  1,  0],
          [ 6,  5,  2,  8,  5,  2,  1],
          [ 4, 11,  4,  6,  3,  4,  0],
          [ 4, 15,  0, -1,  2,  4,  0],
          [ 4,  1,  2, -1,  0,  4,  0],
          [ 4, 11,  2,  3,  0,  4,  0],
          [ 5, 11,  2, -1,  0,  4,  1],
          [ 4,  9,  4, -1,  3,  2,  0],
          [ 4,  1,  4,  1,  3,  4,  1],
          [ 4,  8,  2, -1,  5,  4,  1],
          [ 4,  9,  2, -1,  0,  4,  1],
          [ 4,  8,  2, 12,  0,  4,  1],
          [ 4, 11,  6, -1,  4,  4,  0]], dtype=torch.int8),
  tensor([[ 0.7638, -0.8381,  0.7524],
          [ 0.3972,  0.4450,  1.5356],
          [-0.0426, -0.8867, -0.0308],
          [-0.0426, -0.7289,  1.9272],
          [ 0.2506, -1.0183, -0.0308],
          [-1.3622, -1.1991, -0.4224],
          [ 0.7638, -1.3770, -0.0308],
          [-0.1160, -0.4817, -1.2056],
          [ 0.5439,  1.3116,

## With rapids

In [None]:
import cudf
import nvcategory

In [None]:
# Reload the raw CSV with pandas and move it straight into a cudf DataFrame.
df = cudf.from_pandas(pd.read_csv(adult_source/'adult.csv'))

In [None]:
# Reset the column lists: the pandas preprocessing above appended an "_na" entry
# to `cat_names`, and the rapids pipeline starts again from the raw columns.
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']

In [None]:
# Impute and standardise every continuous column of the cudf frame in place.
# `means` / `stds` keep the per-column statistics computed after imputation.
means,stds = {},{}
for n in cont_names:
    if df[n].isnull().any(): 
        # Remember which rows were missing before filling them.
        df[n+"_na"] = df[n].isnull()
        # Guard the registration: if `df` is reloaded and this cell is run again
        # without resetting `cat_names`, the marker must not be listed twice.
        if n+'_na' not in cat_names: cat_names.append(n+'_na')
        # Mean imputation here, whereas the pandas section uses the median.
        df[n] = df[n].fillna(df[n].mean()) #TODO: request median
    means[n],stds[n] = (df[n].mean(),df[n].std())
    df[n] = (df[n] - means[n]) / stds[n]

In [None]:
# Show the categorical column list after the rapids preprocessing loop.
cat_names

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'education-num_na']

In [None]:
# Encode every categorical column (and the target) as integer codes with nvcategory.
# `classes[n]` keeps the keys (vocabulary) that the codes of column `n` refer to.
classes = {}
for n in cat_names + ["salary"]: 
    # Non-string columns (e.g. the boolean "_na" markers) are cast to strings first.
    if df[n].dtype != "object": df[n] = df[n].astype("str")
    # Build the category object once and reuse it for both the keys and the
    # codes, instead of constructing it from the same strings twice.
    nvcat = nvcategory.from_strings(df[n].data)
    classes[n] = nvcat.keys()
    df[n] = nvcat.set_keys(classes[n]).values()

In [None]:
# Carve the encoded frame into categorical inputs, continuous inputs and the target.
cats, conts, y = df[cat_names], df[cont_names], df["salary"]

In [None]:
# Printed as plain text; the categorical columns now hold integer codes rather than labels.
print(cats.head())

   workclass  education  marital-status  occupation  relationship  race  education-num_na
0          4         13               2           0             5     4                 0
1          4          4               0          11             1     4                 0
2          4          3               0           0             4     2                 1
3          5          6               2           3             0     1                 0
4          6         11               2          15             5     2                 1


In [None]:
# Rows whose `education-num` was missing were filled with the mean, so they show as 0.0 after standardisation.
print(conts.head())

                    age              fnlwgt       education-num
0    0.7637846676602527 -0.8380709161872288  0.7519271022067199
1    0.3972273770201378 0.44498004180288914  1.5351092115844664
2  -0.04264137174799999 -0.8867208554107108                 0.0
3  -0.04264137174799999 -0.7288619873792298  1.9267002662733397
4    0.2506044607640919 -1.0182983346581724                 0.0


## DataBunch TODO

In [None]:
class DFDataset(GetAttr):
    """Row-indexable dataset over categorical codes, continuous features and targets.

    `ds[i]` returns `((cat_values, cont_values), target)` for the row at *position* `i`.
    """
    # NOTE(review): presumably `GetAttr` forwards the names listed in `_xtra` to
    # `self.default` (the target container here) -- confirm against local.core.
    _xtra = ["__len__"]
    def __init__(self, cats, conts, y): 
        self.y,self.conts,self.cats = y,conts,cats
        self.default = y
    def __getitem__(self, i): 
        # `cats` and `conts` are read by position, so the target must be too:
        # plain `y[i]` on a Series is a *label* lookup and would pick a different
        # row (or raise) whenever the index is not 0..n-1.
        # Containers without `.iloc` (e.g. arrays) keep the original `y[i]` access.
        target = self.y.iloc[i] if hasattr(self.y, "iloc") else self.y[i]
        return ((self.cats.iloc[i].values,self.conts.iloc[i].values), target)

In [None]:
# NOTE(review): this mirrors the pandas section. In the rapids pipeline the
# categorical columns of `df` were already overwritten with integer codes by the
# nvcategory cell, so the `.cat.codes` step below presumably does not apply to
# them -- confirm and port as part of this TODO section.
cats = df.loc[:, cat_names]
for n in cat_names: cats[n] = cats[n].cat.codes
conts = df[cont_names]

In [None]:
# NOTE(review): `y` is the rapids "salary" column, which the nvcategory cell already
# replaced with integer codes; `.cat.codes` likely needs revisiting here -- confirm.
ds = DFDataset(cats, conts, y.cat.codes)

In [None]:
# NOTE(review): the output shown for this cell is identical to the pandas section's
# `ds[0]` -- presumably carried over rather than produced by the rapids pipeline; confirm.
ds[0]

((array([ 4,  7,  2, -1,  5,  4,  0], dtype=int8),
  array([ 0.76378467, -0.83807092,  0.75238915])),
 1)

In [None]:
# Wrap the dataset in a TfmdDL using its default arguments.
dl = TfmdDL(ds)

In [None]:
# NOTE(review): the batch shown below matches the pandas section's output exactly --
# presumably stale; re-run once the rapids DataBunch is implemented.
dl.one_batch()

((tensor([[ 4,  7,  2, -1,  5,  4,  0],
          [ 4, 12,  0,  4,  1,  4,  0],
          [ 4, 11,  0, -1,  4,  2,  1],
          [ 5, 14,  2, 10,  0,  1,  0],
          [ 6,  5,  2,  8,  5,  2,  1],
          [ 4, 11,  4,  6,  3,  4,  0],
          [ 4, 15,  0, -1,  2,  4,  0],
          [ 4,  1,  2, -1,  0,  4,  0],
          [ 4, 11,  2,  3,  0,  4,  0],
          [ 5, 11,  2, -1,  0,  4,  1],
          [ 4,  9,  4, -1,  3,  2,  0],
          [ 4,  1,  4,  1,  3,  4,  1],
          [ 4,  8,  2, -1,  5,  4,  1],
          [ 4,  9,  2, -1,  0,  4,  1],
          [ 4,  8,  2, 12,  0,  4,  1],
          [ 4, 11,  6, -1,  4,  4,  0]], dtype=torch.int8),
  tensor([[ 0.7638, -0.8381,  0.7524],
          [ 0.3972,  0.4450,  1.5356],
          [-0.0426, -0.8867, -0.0308],
          [-0.0426, -0.7289,  1.9272],
          [ 0.2506, -1.0183, -0.0308],
          [-1.3622, -1.1991, -0.4224],
          [ 0.7638, -1.3770, -0.0308],
          [-0.1160, -0.4817, -1.2056],
          [ 0.5439,  1.3116,