<a href="https://colab.research.google.com/github/harnalashok/deeplearning/blob/main/fastai_tabular_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Handling tabular data using deep learning

In [None]:
# Last amended: 30th April, 2019
# Myfolder: /home/ashok/Documents/3.tabular_data
# lubuntu_deeplearning_III
# Ref: i)  As given in this document
#      ii) On moodle in fastai section
#     iii) https://arxiv.org/pdf/1803.09820.pdf
# Objective: 
#            i)  Using deep learning for tabular data
#            ii) Categorical transformation to embeddings
#           iii) What are weight decay, cyclical learning rate
#                and other regularization techniques including 
#                good batch size selection

In [None]:
# Transforming categories to embeddings:
# Read in this order:
# https://www.fast.ai/2018/04/29/categorical-embeddings/
# https://developers.google.com/machine-learning/crash-course/embeddings/obtaining-embeddings
# https://forums.fast.ai/t/confused-on-categorical-variable-embedding/24797/12
# Abstract of: https://arxiv.org/pdf/1604.06737.pdf
# https://www.avanwyk.com/cdc-mortality-fastai-tabular/

In [None]:
# Imports
from fastai.tabular import * 
import pandas as pd
import numpy as np

In [None]:
# untar_data() is a function of fastai.datasets.
# Print its built-in help before it is used to fetch the dataset below.
help(untar_data)


In [None]:
# URLs is a Python class having a number of attributes.
# To see some of those, type `URLs.` in a cell and press tab.
# Each attribute refers to a dataset on the Internet.

# By default a notebook cell renders only the value of its last bare
# expression, so all three attributes are passed to display() (a builtin
# in the IPython kernel) to make every one of them visible.
display(URLs.AMAZON_REVIEWS, URLs.ADULT_SAMPLE, URLs.CARS)

In [None]:
# Download adult data; on the author's machine it lands in the
# folder: /home/ashok/.fastai/data/adult_sample
# No download is performed if the dataset is already available
# The function returns the dataset folder, which is kept in `path`
path = untar_data(URLs.ADULT_SAMPLE)

In [None]:
# Understanding 'path' and its behaviour
# display() is used so that all three values are rendered; as bare
# expressions only the last one would be shown by default.
display(
    path,
    type(path),            # It is a special type
    path/"abc.csv",        # Also see this special syntax
)

In [None]:
# Read datafile now
df = pd.read_csv(path/'adult.csv')
# Render the first rows explicitly: as a bare expression in the middle of
# the cell, df.head() would be evaluated and then discarded.
display(df.head())
df.shape        # (32561, 15)

In [None]:
# Data type of every column; display() is needed because this is not the
# last expression of the cell and would otherwise not be rendered.
display(df.dtypes)
# How many columns there are of each data type
df.dtypes.value_counts()

In [None]:
# How much memory per column (pandas reports bytes).
# display() is needed because this is not the last expression of the cell.
display(df.memory_usage())
# And total memory usage
f'Total memory usage in MBs: {df.memory_usage().sum()/1000000}'


In [None]:
# Reducing memory usage
# Transform numeric 'int64' columns to a smaller, sufficient datatype.
# Columns are assigned with df[...] rather than attribute access, which is
# the form pandas recommends for setting a column.
df['age'] = df['age'].astype(np.uint8)  # After all age varies between [0,100)
df['hours-per-week'] = df['hours-per-week'].astype(np.uint8)  # Max hours per week can be: 7 X 24 = 168

# What about a few other fields? Show their ranges in one small table.
# (Bare expressions in the middle of a cell are not rendered, so the
#  result is passed to display().)
display(df[['capital-gain', 'capital-loss']].agg(['min', 'max']))

# Get some details about uint8 and uint16, to compare with the ranges above
display(np.iinfo(np.uint8), np.iinfo(np.uint16))

# Transform 'object' to pandas 'category' datatype
df['education'] = df['education'].astype('category')

In [None]:
# So what is the current memory usage, per column (pandas reports bytes).
# display() is needed because this is not the last expression of the cell.
display(df.memory_usage())
# Total, reported in MBs with the same expression as the total computed
# before the datatype changes, so that the two figures compare directly
f'Total memory usage in MBs: {df.memory_usage().sum()/1000000}'

In [None]:
# Pre-processing steps to be carried out on our data.
# The classes that perform them are listed at:
#   https://docs.fast.ai/tabular.transform.html
# While the methods of these classes can be invoked directly,
#  here the list is handed to Class TabularDataBunch, which
#   invokes all of them on our dataset.
procs = [
    FillMissing,    # Fill the missing values in continuous columns.
                    #  https://docs.fast.ai/tabular.transform.html#FillMissing
    Categorify,     # Transform the categorical variables to that type.
                    #  https://docs.fast.ai/tabular.transform.html#Categorify
    Normalize,      # Normalize the continuous variables.
                    #  https://docs.fast.ai/tabular.transform.html#Normalize
]


In [None]:
# Create an index for validation dataset
N_VALID = 2000      # Number of trailing rows held out for validation
# With fewer rows than N_VALID the range below would start at a negative
# number and no longer describe "the last N_VALID rows"; fail early instead.
assert len(df) > N_VALID, "dataframe has too few rows for the validation split"
valid_idx = range(len(df) - N_VALID, len(df))    # That is, the last N_VALID datapoints

In [None]:
# Segregate our fields: the target column and the categorical predictors
dep_var = 'salary'
cat_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

In [None]:
# https://docs.fast.ai/tabular.data.html
# Create a DataBunch object suitable for tabular data.
# from_df() is given a single dataframe together with `valid_idx`,
#  the index of the rows that make up the validation dataset.

# List the continuous columns explicitly, in dataframe column order.
# When `cont_names` is omitted it defaults to
#  set(df)-set(cat_names)-{dep_var}, and the iteration order of a set
#   is not guaranteed to be the same from one run to the next.
cont_names = [col for col in df.columns if col not in cat_names and col != dep_var]

# Use the dataset folder returned by untar_data() instead of a hard-coded
#  /home/<user>/... location, so that the notebook also runs on other
#   machines (for example on Colab).
data = TabularDataBunch.from_df(path,
                                df,
                                dep_var,
                                valid_idx=valid_idx,
                                procs=procs,
                                cat_names=cat_names,
                                cont_names=cont_names
                               )

In [None]:
# Some properties of DataBunch object
# Each value is passed to display() because a bare expression that is not
# the last statement of a cell is not rendered by default.
display(data.train_ds.cont_names)  # `cont_names` defaults to: set(df)-set(cat_names)-{dep_var}
display(data.train_ds.cat_names)
display(data.batch_size)
# Show a sample batch. The result is no longer bound to a name: the
# variable it was assigned to was never read anywhere in the notebook.
data.show_batch()


In [None]:
# Create a learner object and learn.
# Specify metrics, including a TabularModel.
#
# emb_szs: a dict mapping categorical column names to embedding sizes.
#   Sizes need to be passed only for the columns where the default
#   behaviour of the model is to be overridden.
#   Generally the embedding vector size is decided by some heuristic
#   (see: https://forums.fast.ai/t/wiki-lesson-5/9403/14).
#   The rule of thumb for determining the embedding size is the
#   cardinality size divided by 2, but no bigger than 50.
#   See this: https://medium.com/@hiromi_suenaga/deep-learning-2-part-1-lesson-4-2048a26d58aa
#
# metrics: please see https://docs.fast.ai/metrics.html

learn = tabular_learner(
    data,
    layers=[200, 100],
    emb_szs={'native-country': 10},
    metrics=[accuracy],
)

# Next, start learning
# TODO: What is 1cycle policy?
epochs = 2
learn.fit_one_cycle(epochs, 1e-2)


In [None]:
# Ask the trained learner for a prediction on a single record:
# the first row of df
learn.predict(df.iloc[0])


In [None]:
# The first row of df itself, i.e. the record passed to learn.predict()
df.iloc[0]