# Using Neural Nets to Predict from Tabular Data

This is the most relevant application for most folks, since most businesses work with tabular (structured) data more than with unstructured (visual, audio, etc.) data.

In [1]:
from fastai import *
from fastai.tabular import *

We always use Pandas DataFrames to handle tabular data.

In [2]:
# `untar_data` hands back the folder of the Adult sample dataset; the
# same `path` is reused below when the fastai item lists are built.
path = untar_data(URLs.ADULT_SAMPLE)

# Read the CSV inside that folder into a pandas DataFrame.
adult_csv = path/'adult.csv'
df = pd.read_csv(adult_csv)

Our dataset is a simple spreadsheet of demographic info for adults. Our task will be to predict which adults have an income that's greater than or equal to $50,000.

In [3]:
# Target column: flags adults whose income is greater than or equal
# to $50K.
dep_var = '>=50k'

# Columns handled as categorical variables.
cat_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
]

# Columns handled as continuous variables.
cont_names = ['age', 'fnlwgt', 'education-num']

# Preprocessing steps that ship with fastai, applied in this order:
#   FillMissing - fills gaps in continuous columns (it is the source of the
#                 extra `education-num_na` column in the outputs below).
#   Categorify  - encodes the categorical columns as categories. It does not
#                 build embeddings; those are layers of the model.
#   Normalize   - rescales the continuous columns.
procs = [FillMissing, Categorify, Normalize]

In [4]:
# Rows at positions 800-999 become the test set. Note that the same rows
# are also used as the validation split when `data` is built.
test_rows = df.iloc[800:1000].copy()
test = TabularList.from_df(test_rows, path=path, cat_names=cat_names, cont_names=cont_names)

In [5]:
# Batch size for the DataBunch.
bs = 64

# Rows 800-999 are held out for validation.
valid_idx = list(range(800, 1000))

# Item list over the full DataFrame, with the preprocessing steps attached.
tabular_items = TabularList.from_df(df, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)

# split -> label -> attach test set -> DataBunch.
# `label=0` is presumably just a placeholder label for the test rows
# (TODO confirm against the fastai `add_test` docs).
data = (tabular_items
        .split_by_idx(valid_idx)
        .label_from_df(cols=dep_var)
        .add_test(test, label=0)
        .databunch(bs=bs))

In [6]:
# Preview 10 rows of a batch as the model will see them: continuous columns
# appear rescaled and an `education-num_na` column has been added.
data.show_batch(rows=10)

workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,target
Private,HS-grad,Never-married,Exec-managerial,Other-relative,White,False,-1.2158,0.7112,-0.4224,0
?,HS-grad,Married-civ-spouse,?,Husband,White,False,-0.1896,0.2516,-0.4224,0
Private,Prof-school,Married-civ-spouse,Prof-specialty,Husband,White,False,0.6899,-0.7576,1.9245,1
Private,Doctorate,Never-married,Prof-specialty,Not-in-family,White,False,0.4701,-0.3861,2.3157,1
Private,Masters,Never-married,Prof-specialty,Not-in-family,White,False,-0.7760,-0.7638,1.5334,0
Local-gov,Bachelors,Married-civ-spouse,Handlers-cleaners,Husband,White,False,-0.7760,-0.0896,1.1422,1
Private,Bachelors,Divorced,Exec-managerial,Unmarried,White,False,-0.5561,-1.5596,1.1422,0
Private,Prof-school,Married-civ-spouse,Prof-specialty,Husband,White,False,-0.2629,0.6506,1.9245,1
Private,Some-college,Never-married,Adm-clerical,Not-in-family,Black,False,-1.3624,2.2946,-0.0312,0
?,Bachelors,Married-spouse-absent,?,Not-in-family,White,False,-0.7027,-1.4073,1.1422,0


In [8]:
# Build the tabular learner: two hidden layers of 200 and 100 units,
# reporting accuracy as the metric during training.
learn = tabular_learner(data, layers=[200,100], metrics=accuracy)

In [9]:
# Display the network architecture (embedding layers, batch norm, and the
# linear layers).
learn.model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(10, 6)
    (1): Embedding(17, 9)
    (2): Embedding(8, 5)
    (3): Embedding(16, 9)
    (4): Embedding(7, 4)
    (5): Embedding(6, 4)
    (6): Embedding(3, 2)
  )
  (emb_drop): Dropout(p=0.0)
  (bn_cont): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=42, out_features=200, bias=True)
    (1): ReLU(inplace)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Linear(in_features=200, out_features=100, bias=True)
    (4): ReLU(inplace)
    (5): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Linear(in_features=100, out_features=2, bias=True)
  )
)

In [10]:
# Train for 1 epoch; the second argument (1e-2) is presumably the learning
# rate (TODO confirm against the fastai `Learner.fit` docs).
learn.fit(1, 1e-2)

Total time: 00:03
epoch  train_loss  valid_loss  accuracy
1      0.367535    0.381236    0.820000  (00:03)



## Inference

In [11]:
# Take the record with index label 2 from the raw DataFrame and run it
# through the trained model. The displayed result is a 3-tuple; it looks like
# (predicted class, class index, class probabilities) - TODO confirm against
# the fastai `Learner.predict` docs.
row = df.loc[2]
learn.predict(row)

(0, tensor(1), tensor([0.0532, 0.9468]))

In [12]:
# Show the raw record that was passed to `learn.predict`, including its
# actual `>=50k` label, for comparison with the prediction.
row

age                           38
workclass                Private
fnlwgt                     96185
education                HS-grad
education-num                NaN
marital-status          Divorced
occupation                   NaN
relationship           Unmarried
race                       Black
sex                       Female
capital-gain                   0
capital-loss                   0
hours-per-week                32
native-country     United-States
>=50k                          0
Name: 2, dtype: object

Our model predicts a very low probability of `0.0532` that the individual in the third row (index `2`) has an income greater than or equal to $50K.