In this implementation:
- Missing values are treated as a separate category, i.e. they are not replaced with NaN.
- The train-to-validation split ratio is 99:1.


In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and reload all modules before each cell runs,
# so edits to imported source files are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2

In [None]:
from fastai.structured import *
from fastai.column_data import *

In [None]:
# Set printing options.
# These options determine the way floating point numbers, arrays and
# other NumPy objects are displayed.
#   threshold=50 : arrays with more than 50 elements are printed summarised.
#   edgeitems=20 : a summarised array shows 20 items at each end of a dimension.
np.set_printoptions(threshold=50, edgeitems=20)

### Set data path

In [None]:
# Root directory of the dataset; used to build the CSV paths and passed to
# ColumnarModelData as `path`.
PATH = 'data/porto-seguro/'

### Read csv data into dataframe

In [None]:
# Locate the train and test CSV files under PATH, then load each one into
# its own DataFrame.
train_csv = f'{PATH}train/train.csv'
test_csv = f'{PATH}test/test.csv'

train_data_df = pd.read_csv(train_csv)
test_data_df = pd.read_csv(test_csv)

### List of column names

In [None]:
# Plain Python list holding every column name of the training frame.
all_var = list(train_data_df.columns)

In [None]:
# Echo the full list of column names for inspection.
all_var

### Create list of categorical and continuous variables

In [None]:
# Columns whose name ends in 'cat' (categorical) or 'bin' (binary) are all
# handled as categorical features; the 'cat' columns come first, then 'bin'.
cat_vars = [col for col in train_data_df.columns if col.endswith('cat')]
bin_vars = [col for col in train_data_df.columns if col.endswith('bin')]
# Candidate 'ind' columns; defined here but currently not added to cat_vars.
ind_vars = ['ps_ind_01', 'ps_ind_03', 'ps_ind_14', 'ps_ind_15']
cat_vars = [*cat_vars, *bin_vars]
cat_vars

In [None]:
# Everything that is not categorical is treated as continuous. At this point
# the list still contains the response column, which a later cell removes.
cont_vars = [col for col in train_data_df.columns if col not in cat_vars]
cont_vars

In [None]:
# Re-type each categorical/binary column as an ordered pandas categorical.
for var in cat_vars:
    as_category = train_data_df[var].astype('category')
    train_data_df[var] = as_category.cat.as_ordered()

In [None]:
# Store every column listed in cont_vars as 32-bit floats.
for var in cont_vars:
    column = train_data_df[var]
    train_data_df[var] = column.astype('float32')

In [None]:
# The response column is not a feature, so take it out of cont_vars.
# Removing it by name (instead of by position) keeps this cell idempotent:
# re-running it no longer deletes whichever feature now sits at index 1.
if 'target' in cont_vars:
    cont_vars.remove('target')

In [None]:
# Echo the continuous-feature list after the removal above.
cont_vars

In [None]:
# Summarise the training frame to check the dtypes set by the casts above.
train_data_df.info()

### Display training data sample

In [None]:
# Show the first rows of the training frame.
train_data_df.head()

### Display test data sample

In [None]:
# Show the first rows of the test frame (loaded as-is; no casts applied to it here).
test_data_df.head()

### Create validation set indices

In [None]:
# Tunable hyperparameters:
#   VAL_FRAC  - passed to get_cv_idxs as val_pct (validation share of the rows)
#   MAX_FOLDS - number of validation index sets to generate
VAL_FRAC = 0.01
MAX_FOLDS = 5

num_obs = len(train_data_df)

# One set of validation indices per fold, keyed 'ids_fold_<fold number>'.
cv_dict = {
    f'ids_fold_{num_fold}': get_cv_idxs(n=num_obs, cv_idx=num_fold, val_pct=VAL_FRAC)
    for num_fold in range(MAX_FOLDS)
}

In [None]:
# Number of validation indices in the first fold.
len(cv_dict['ids_fold_0'])

### Set response variable

In [None]:
# Name of the response column; handed to proc_df as y_fld.
TARGET_VAR = "target"

### Train set, response variable, feature scaling

In [None]:
# Split the response column off from the features and request scaling
# (do_scale=True). The four results are unpacked as: feature frame,
# response values, `nas` and `mapper` (see fastai's proc_df for the latter two).
train, target, nas, mapper = proc_df(
    df=train_data_df,
    y_fld=TARGET_VAR,
    do_scale=True,
)

### Convert categorical values to type `category`

In [None]:
# Re-type the categorical columns of the processed frame as ordered categoricals.
for var in cat_vars:
    as_category = train[var].astype('category')
    train[var] = as_category.cat.as_ordered()

### Convert remaining variables to `float32`

In [None]:
# Cast the continuous feature columns of the processed frame to 32-bit floats.
for var in cont_vars:
    train[var] = train[var].astype('float32')

# astype returns a new object rather than casting in place, so the result has
# to be assigned back for the conversion to take effect. The trailing bare
# expression keeps the cell's original display of the converted values.
target = target.astype('float32')
target

In [None]:
# Summarise the processed frame to verify the category/float32 dtypes.
train.info()

### Gini metric

In [None]:
from sklearn.metrics import roc_auc_score

def inv_log(x):
    """Return ``exp(x)``, i.e. the inverse of the natural logarithm."""
    return np.exp(x)


def gini_metric(preds, targs):
    """Return the normalised Gini coefficient, ``2 * AUC - 1``.

    Args:
        preds: Predicted scores. They are exponentiated before scoring; since
            ``exp`` is strictly increasing this does not change their ranking.
        targs: True labels, passed to ``roc_auc_score`` as ``y_true``.

    Returns:
        ``2 * roc_auc_score(targs, exp(preds)) - 1``.
    """
    scores = inv_log(preds)
    auc = roc_auc_score(y_true=targs, y_score=scores)
    return 2 * auc - 1

### Compute embedding sizes

In [None]:
# (column name, number of categories + 1) for each categorical column.
# NOTE(review): the extra slot presumably leaves room for an index outside
# the observed categories - confirm against how the codes are generated.
cat_sz = [(name, len(train[name].cat.categories) + 1) for name in cat_vars]

In [None]:
# Echo the (column, cardinality) pairs.
cat_sz

In [None]:
# (cardinality, embedding width) per categorical column; the width is half
# the cardinality rounded up, capped at 100.
emb_szs = [(card, min(100, (card + 1) // 2)) for _, card in cat_sz]

In [None]:
# Echo the (cardinality, embedding width) pairs.
emb_szs

### Model Architecture / Hyperparameters

In [None]:
# Hidden-layer sizes handed to get_learner as `szs`.
layer_1 = 1024
layer_2 = 512
# NOTE(review): layer_3 is defined but the learner cell in this notebook only
# passes layer_1 and layer_2 - confirm whether a third layer was intended.
layer_3 = 256
# Batch size, passed to ColumnarModelData as `bs`.
batch_size = 256
# Dropout rates, passed to get_learner as `emb_drop` and (per layer) `drops`.
embed_dropout = 0.01
layer_dropout = 0.15

In [None]:
# Build the model data from the processed frame, treating every column in
# cat_vars as categorical. Fold 0 supplies the validation indices.
val_idxs = cv_dict['ids_fold_0']
md = ColumnarModelData.from_data_frame(
    path=PATH,
    val_idxs=val_idxs,
    df=train,
    y=target,
    cat_flds=cat_vars,
    bs=batch_size,
)

### Model learner

In [None]:
# Build the learner using all categorical variables.
# Every column of `train` that is not categorical counts as continuous.
n_cont = len(train.columns) - len(cat_vars)
hidden_sizes = [layer_1, layer_2]
# The same dropout rate is used after each hidden layer.
hidden_drops = [layer_dropout] * len(hidden_sizes)

m = md.get_learner(
    emb_szs=emb_szs,
    n_cont=n_cont,
    emb_drop=embed_dropout,
    out_sz=1,
    szs=hidden_sizes,
    drops=hidden_drops,
    y_range=None,
)


In [None]:
# Run the learner's learning-rate finder.
m.lr_find()

In [None]:
# Plot what the learning-rate finder recorded.
m.sched.plot()

In [None]:
# Learning rate used by the fit calls below
# (presumably picked from the lr_find plot - confirm).
lr = 0.0001

In [None]:
# First training run: 12 cycles at the fixed learning rate, reporting accuracy
# and the Gini metric.
# NOTE(review): the learner has out_sz=1; check that `accuracy` is meaningful
# for a single-output model.
m.fit(lrs=lr, n_cycle=12, metrics=[accuracy, gini_metric])

In [None]:
# Second training run on the same learner: 6 cycles with cycle_len=2
# (presumably 2 epochs per cycle with learning-rate restarts - confirm).
m.fit(lrs=lr, n_cycle=6, cycle_len=2, metrics=[accuracy, gini_metric])

In [None]:
# Get predictions from the trained learner
# (with no arguments, presumably for the validation set - confirm).
m.predict()

In [None]:
# Echo the learner to inspect the model it wraps.
m