<a href="https://colab.research.google.com/github/harnalashok/deeplearning/blob/main/fastai_tabular_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Handling tabular data using deep learning

In [None]:
# Last amended: 30th April, 2019
# Myfolder: /home/ashok/Documents/3.tabular_data
# lubuntu_deeplearning_III
# Ref: i)  As given in this document
#      ii) On moodle in fastai section
#     iii) https://arxiv.org/pdf/1803.09820.pdf
# Objective: 
#            i)  Using deeplearning for tabular data
#            ii) Categorical transformation to embeddings
#           iii) What are weight decay, cyclical learning rate
#                and other regularization techniques including 
#                good batch size selection

In [None]:
# Transforming categories to embeddings:
# Read in this order:
# https://www.fast.ai/2018/04/29/categorical-embeddings/
# https://developers.google.com/machine-learning/crash-course/embeddings/obtaining-embeddings
# https://forums.fast.ai/t/confused-on-categorical-variable-embedding/24797/12
# Abstract of: https://arxiv.org/pdf/1604.06737.pdf
# https://www.avanwyk.com/cdc-mortality-fastai-tabular/

In [1]:
# Imports
from fastai.tabular import * 
import pandas as pd
import numpy as np

In [3]:
# Display the value of every top-level expression in a cell, not only the
# last one. The cells below rely on this to show several results at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


In [4]:
# untar_data() is a function of fastai.datasets.
# Print its signature and docstring.
help(untar_data)


Help on function untar_data in module fastai.datasets:

untar_data(url: str, fname: Union[pathlib.Path, str] = None, dest: Union[pathlib.Path, str] = None, data=True, force_download=False, verbose=False) -> pathlib.Path
    Download `url` to `fname` if `dest` doesn't exist, and un-tgz to folder `dest`.



In [5]:
# URLs is a class having a number of attributes.
# To see some of those, type `URLs.` and press Tab.
# Each attribute holds the Internet address of a dataset.

#URLs.
URLs.AMAZON_REVIEWS
URLs.ADULT_SAMPLE
URLs.CARS

'https://s3.amazonaws.com/fast-ai-nlp/amazon_review_full_csv'

'http://files.fast.ai/data/examples/adult_sample'

'https://s3.amazonaws.com/fast-ai-imageclas/stanford-cars'

In [6]:
# Download the adult sample data into fastai's data folder
# (on this machine: /root/.fastai/data/adult_sample).
# No download is performed if the dataset is already available.
# The function returns the dataset folder as a pathlib.Path.
path = untar_data(URLs.ADULT_SAMPLE)

Downloading http://files.fast.ai/data/examples/adult_sample.tgz


In [7]:
# Understanding 'path' and its behaviour
path
type(path)            # It is a pathlib path object, not a plain string
path/"abc.csv"        # The `/` operator appends a component and yields a new path object

PosixPath('/root/.fastai/data/adult_sample')

pathlib.PosixPath

PosixPath('/root/.fastai/data/adult_sample/abc.csv')

In [8]:
# Read the data file from the dataset folder into a DataFrame
df = pd.read_csv(path/'adult.csv')
df.head()       # First five rows
df.shape        # (32561, 15)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


(32561, 15)

In [None]:
# dtype of every column, followed by how many columns share each dtype
df.dtypes
df.dtypes.value_counts()

In [10]:
# Per-column memory footprint in bytes, computed once and reused below.
# NOTE: memory_usage() is shallow by default (deep=False), so for 'object'
# columns it counts the 8-byte references only, not the strings they point to.
mem_raw = df.memory_usage()
mem_raw
# Grand total, expressed in MB (1 MB taken as 1,000,000 bytes)
f'Total memory usage in MBs: {mem_raw.sum()/1000000}'


Index                128
age               260488
workclass         260488
fnlwgt            260488
education         260488
education-num     260488
marital-status    260488
occupation        260488
relationship      260488
race              260488
sex               260488
capital-gain      260488
capital-loss      260488
hours-per-week    260488
native-country    260488
salary            260488
dtype: int64

'Total memory usage in MBs: 3.907448'

In [11]:
# Reducing memory usage: give columns narrower dtypes.
# The three casts are grouped first; none of them displays anything.
df['age'] = df['age'].astype(np.uint8)                        # Ages fit comfortably in uint8
df['hours-per-week'] = df['hours-per-week'].astype('uint8')   # Max hours per week can be: 7 X 24 = 168
df['education'] = df['education'].astype('category')          # 'object' -> pandas 'category' dtype

# What about a few other fields? Inspect their value ranges.
df['capital-gain'].min()
df['capital-gain'].max()
df['capital-loss'].min()
df['capital-loss'].max()

# Limits of uint8 and uint16, to compare against the ranges above
np.iinfo(np.uint8)
np.iinfo(np.uint16)

0

99999

0

4356

iinfo(min=0, max=255, dtype=uint8)

iinfo(min=0, max=65535, dtype=uint16)

In [12]:
# Current memory usage after the dtype changes:
# per column first, then the total in bytes.
mem_downcast = df.memory_usage()
mem_downcast
mem_downcast.sum()

Index                128
age                32561
workclass         260488
fnlwgt            260488
education          33245
education-num     260488
marital-status    260488
occupation        260488
relationship      260488
race              260488
sex               260488
capital-gain      260488
capital-loss      260488
hours-per-week     32561
native-country    260488
salary            260488
dtype: int64

3224351

In [13]:
# Preprocessing steps to carry out on our data.
# The classes that perform them are listed at:
#   https://docs.fast.ai/tabular.transform.html
# While we could invoke methods of these classes ourselves, we will
#  let class TabularDataBunch invoke all of them on our dataset.
#
# FillMissing: Fill the missing values in continuous columns.
#   https://docs.fast.ai/tabular.transform.html#FillMissing
# Categorify: Transform the categorical variables to that type.
#   https://docs.fast.ai/tabular.transform.html#Categorify
# Normalize: Normalize the continuous variables.
#   https://docs.fast.ai/tabular.transform.html#Normalize

procs = [FillMissing, Categorify, Normalize]


In [14]:
# Row positions of the validation set: the last 2000 datapoints of df
n_rows = len(df)
valid_idx = range(n_rows - 2000, n_rows)

In [18]:
# Segregate our fields: the target column and the categorical predictors
dep_var = 'salary'
cat_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

In [16]:
# Write the type-reduced frame to a file named "adult_sample" in the current
# working directory, leaving out the index column.
# NOTE(review): CSV does not store dtypes, and no visible cell reads this
# file back (the DataBunch is built from the in-memory `df`) — confirm it is needed.
df.to_csv("adult_sample", index = False)

In [17]:
# https://docs.fast.ai/tabular.data.html
# Create a DataBunch object suitable for tabular data.
# A single dataframe is passed here; `valid_idx` marks the rows that form
#  the validation set and the remaining rows are used for training.
# The first argument is the folder associated with the DataBunch. We pass
#  the dataset folder returned by untar_data(), so the notebook does not
#  depend on a machine-specific absolute path.
# Columns that are neither in `cat_names` nor `dep_var` are treated as
#  continuous, since `cont_names` is not given.

data = TabularDataBunch.from_df(path,
                                df,
                                dep_var,
                                valid_idx=valid_idx,
                                procs=procs,
                                cat_names=cat_names
                               )

In [19]:
# Some properties of DataBunch object
data.train_ds.cont_names  # `cont_names` defaults to: set(df)-set(cat_names)-{dep_var}
data.train_ds.cat_names   # Holds 'education-num_na' in addition to the names we passed in
data.batch_size
# show_batch() renders a few processed rows; whatever it returns is kept in `dx`.
# NOTE(review): `dx` is not used in the visible cells — confirm it is needed.
dx = data.show_batch()


['age',
 'education-num',
 'hours-per-week',
 'fnlwgt',
 'capital-loss',
 'capital-gain']

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country',
 'education-num_na']

64

workclass,education,marital-status,occupation,relationship,race,sex,native-country,education-num_na,age,education-num,hours-per-week,fnlwgt,capital-loss,capital-gain,target
Private,9th,Married-civ-spouse,Transport-moving,Husband,White,Male,United-States,False,-0.9976,-1.9889,0.3682,3.058,-0.2168,-0.1459,<50k
Private,5th-6th,Never-married,Handlers-cleaners,Not-in-family,White,Female,United-States,False,0.9795,-2.7725,-0.0358,0.9691,-0.2168,-0.1459,<50k
Local-gov,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,United-States,False,-0.0457,1.5376,1.1761,-1.5075,3.9281,-0.1459,<50k
Private,10th,Married-civ-spouse,Sales,Husband,White,Male,United-States,False,2.444,-1.597,-2.0556,-0.1134,-0.2168,-0.1459,<50k
Local-gov,Assoc-acdm,Never-married,Prof-specialty,Own-child,White,Female,United-States,False,-1.0709,0.7539,-3.1059,-0.1284,-0.2168,-0.1459,<50k


In [20]:
# Create a learner object and learn.
# Specify the data, the hidden-layer sizes, the embedding sizes and the metrics.
# emb_szs is a dict mapping categorical column names to embedding sizes;
#  you only need to pass sizes for columns where you want to override
#  the default behaviour of the model.
# Generally the embedding vector size is decided by some heuristic
#  (see: https://forums.fast.ai/t/wiki-lesson-5/9403/14).
# The rule of thumb for determining the embedding size is the cardinality
#  divided by 2, but no bigger than 50.
# See this: https://medium.com/@hiromi_suenaga/deep-learning-2-part-1-lesson-4-2048a26d58aa

# For metrics, please see: https://docs.fast.ai/metrics.html

learn = tabular_learner(data,
                        layers=[200,100],
                        emb_szs={'native-country': 10},
                        metrics=[accuracy]
                       )

# Next, start learning
# TODO: What is 1cycle policy?
# NOTE(review): no random seed is set in the visible cells, so the reported
#  losses and accuracy will presumably vary between runs — confirm.
epochs = 2
learn.fit_one_cycle(epochs, 1e-2)   # 1e-2 is presumably the maximum learning rate — TODO confirm


epoch,train_loss,valid_loss,accuracy,time
0,0.335528,0.324225,0.846,00:03
1,0.321627,0.316807,0.851,00:03


In [21]:
# Predict for the first row of df. Row 0 lies outside valid_idx (the last
# 2000 rows), so it is a row the model was trained on.
# The result is a 3-tuple: the predicted category, a tensor holding its
# index, and a tensor with one score per class (apparently probabilities).
learn.predict(df.iloc[0])


(Category tensor(1), tensor(1), tensor([0.2010, 0.7990]))

In [22]:
# The same first row as a Series, to compare its actual 'salary' with the prediction
df.iloc[0]

age                                49
workclass                     Private
fnlwgt                         101320
education                  Assoc-acdm
education-num                    12.0
marital-status     Married-civ-spouse
occupation                        NaN
relationship                     Wife
race                            White
sex                            Female
capital-gain                        0
capital-loss                     1902
hours-per-week                     40
native-country          United-States
salary                          >=50k
Name: 0, dtype: object