In [None]:
#hide
from fastbook import *
setup_book()
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import dtreeviz
from IPython.display import Image, display_svg, SVG

pd.options.display.max_rows = 20
pd.options.display.max_columns = 8

# Tabular Modeling Deep Dive

## Categorical Embeddings

## Beyond Deep Learning

## The Dataset

### Kaggle Competitions

In [None]:
creds = ''

In [None]:
# set up kaggle creds

cred_path = Path('~/.kaggle/kaggle.json').expanduser()
# Only write the file when credentials were actually supplied: an empty
# kaggle.json is not valid JSON, and because this cell skips existing files,
# filling in `creds` later would never replace the empty file.
if creds and not cred_path.exists():
    cred_path.parent.mkdir(exist_ok=True)
    cred_path.write_text(creds)
    # restrict the credentials file to owner read/write only
    cred_path.chmod(0o600)

In [None]:
comp = 'bluebook-for-bulldozers'
path = URLs.path(comp)
path

In [None]:
#hide
Path.BASE_PATH = path

In [None]:
# download and unzip data
from kaggle import api

if not path.exists():
    # `parents=True` (the Python boolean; lowercase `true` is an undefined name)
    # also creates any missing parent directories of `path`
    path.mkdir(parents=True)
    # download the competition archive into `path`, then extract it in place
    api.competition_download_cli(comp, path=path)
    shutil.unpack_archive(str(path/f'{comp}.zip'), str(path))

# the ls() method is provided by fastcore, and file_type= filters by mimetype primary type (e.g. 'text' in 'text/csv') where the mimetype of a file is inferred by the file extension
path.ls(file_type='text')

### Look at the Data

In [None]:
# default has low_memory=True, but that risks using different data types for some rows
df = pd.read_csv(path/'TrainAndValid.csv', low_memory=False)

In [None]:
df.columns

In [None]:
# observe that ProductSize is categorical, but has a natural ordering
df['ProductSize'].unique()

In [None]:
# after inspecting the categories of this column, we define an ordering by a tuple of strings
sizes = 'Large','Large / Medium','Medium','Small','Mini','Compact'

In [None]:
# and then convert the column to a categorical type with the ordering
df['ProductSize'] = df['ProductSize'].astype('category').cat.set_categories(sizes, ordered=True)

In [None]:
# here we need to do some preprocessing regarding the format of the output; we are interested in predicting the log of the sale price, not the actual sale price, since we are evaluated on the RMSLE (root mean squared log error) wrt the actual sale price
dep_var = 'SalePrice'

In [None]:
df[dep_var] = np.log(df[dep_var])

## Decision Trees

### Handling Dates

In [None]:
# many dates are meaningful in more than their sequential relationship with each other; for example, the day of the week is meaningful, as is whether the date falls on a holiday
# add_datepart replaces the date column with a bunch of more appropriate columns
# 'Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start'
df = add_datepart(df, 'saledate')

In [None]:
df_test = pd.read_csv(path/'Test.csv', low_memory=False)
df_test = add_datepart(df_test, 'saledate')

In [None]:
' '.join(o for o in df.columns if o.startswith('sale'))

### Using TabularPandas and TabularProc

In [None]:
# we define a chain of TabularProcs (supplied by fastai) to be applied to the data
# to further clean it up. Categorify converts strings to pandas categories, and FillMissing replaces missing values with the median of the column and adds a new column indicating which rows were missing
# TabularProcs differ from Transforms in that they modify the dataframe in place and return it, rather than returning a new dataframe; and that it is eager. Transforms are lazy, and are applied only when the data is accessed via it.
procs = [Categorify, FillMissing]

In [None]:
# the data is a time series, and the task is to predict future sale prices given historical information
# we therefore split the data by date

cond = (df.saleYear<2011) | (df.saleMonth<10)
train_idx = np.where( cond)[0]
valid_idx = np.where(~cond)[0]

# splits is a tuple of two lists of indices corresponding to df, one for the training set and one for the validation set
splits = (list(train_idx),list(valid_idx))

In [None]:
# we also extract the names of the categorical and continuous columns
cont,cat = cont_cat_split(df, 1, dep_var=dep_var)

In [None]:
# then reform the dataframe into a TabularPandas object; we have passed in the procs, the categorical and continuous column names, the name of the dependent variable, and the splits
to = TabularPandas(df, procs, cat, cont, y_names=dep_var, splits=splits)

In [None]:
# we see that a TabularPandas object contains a train and a valid set
len(to.train),len(to.valid)

In [None]:
# one sees that the categorical data is still displayed as strings
to.show(3)

In [None]:
# here is a version where we have specified by hand certain columns to be categorical and no columns continuous
to1 = TabularPandas(df, procs, ['state', 'ProductGroup', 'Drive_System', 'Enclosure'], [], y_names=dep_var, splits=splits)
to1.show(3)

In [None]:
# observe that the underlying categorical data is now reindexed to integers
to.items.head(3)

In [None]:
# note that we have projected the data into 4 columns of interest here
to1.items[['state', 'ProductGroup', 'Drive_System', 'Enclosure']].head(3)

In [None]:
to.classes['ProductSize']

In [None]:
# it is possible to save a TabularPandas to a pickle file
save_pickle(path/'to.pkl',to)

### Creating the Decision Tree

In [None]:
#hide
# and to read it back in
to = load_pickle(path/'to.pkl')

In [None]:
xs,y = to.train.xs,to.train.y
valid_xs,valid_y = to.valid.xs,to.valid.y

In [None]:
# we fit an sklearn decision tree regressor to the data provided by the TabularPandas object to a max of 4 leaf nodes
m = DecisionTreeRegressor(max_leaf_nodes=4)
m.fit(xs, y);

In [None]:
# fastbook has a helper function to draw a decision tree using graphviz
draw_tree(m, xs, size=10, leaves_parallel=True, precision=2)

In [None]:
# we use dtreeviz to illustrate the decision tree
# we first sample 500 rows of the data without replacement
# then we pass the model, the sample input, the expected (dependent variable) label, the sample input feature names, and the name of the dependent variable
# finally, we view the tree
samp_idx = np.random.permutation(len(y))[:500]
viz_model = dtreeviz.model(m, xs.iloc[samp_idx], y.iloc[samp_idx], None, xs.columns, dep_var)
viz_model.view(fontname='DejaVu Sans', scale=1.6, label_fontsize=10, orientation='LR')

In [None]:
# in our data 1000 seems to be a placeholder for a missing value in the YearMade column
# this is not an issue for training our decision tree, but it affects the visualization
# we set it to 1950 for the purposes of visualization, which is still far below the range of valid values, but we can now better see the distribution of the valid values
xs.loc[xs['YearMade']<1900, 'YearMade'] = 1950
valid_xs.loc[valid_xs['YearMade']<1900, 'YearMade'] = 1950

In [None]:
# we retrain and re-visualize with the replaced values
m = DecisionTreeRegressor(max_leaf_nodes=4).fit(xs, y)

viz_model = dtreeviz.model(m, xs.iloc[samp_idx], y.iloc[samp_idx], None, xs.columns, dep_var)
viz_model.view(fontname='DejaVu Sans', scale=1.6, label_fontsize=10, orientation='LR')

In [None]:
# we illustrate what happens if we omit the max_leaf_nodes parameter
m = DecisionTreeRegressor()
m.fit(xs, y);

In [None]:
# we quickly define a function to compute the rmsle (assuming the output is already the log of the sale price)
def r_mse(pred,y):
    "Root mean squared error between `pred` and `y`, rounded to 6 decimal places."
    squared_err = (pred-y)**2
    return round(math.sqrt(squared_err.mean()), 6)
def m_rmse(m, xs, y):
    "Rounded RMSE (via `r_mse`) of `m.predict(xs)` against the targets `y`."
    predictions = m.predict(xs)
    return r_mse(predictions, y)

In [None]:
# the model is perfect on the training set
m_rmse(m, xs, y)

In [None]:
# but not on the validation set
m_rmse(m, valid_xs, valid_y)

In [None]:
# we can see that without any constraints, we have learned a model that memorizes the data
m.get_n_leaves(), len(xs)

In [None]:
# hence we may constrain the model, for example by ruling out any split that would leave fewer than 25 items in one of the leaves
#   cf. min_samples_leaf
m = DecisionTreeRegressor(min_samples_leaf=25)
# fit on the same `xs`/`y` that are scored on the next line: `xs` had its YearMade
# placeholder values replaced in an earlier cell, so fitting on `to.train.xs`
# could train and evaluate on differently-encoded features
m.fit(xs, y)
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)

In [None]:
# with a more reasonable number of leaf nodes
m.get_n_leaves()

### Categorical Variables

## Random Forests

In [None]:
#hide
# pip install --pre -f https://sklearn-nightly.scdn8.secure.raxcdn.com scikit-learn -U

### Creating a Random Forest

In [None]:
# here, n_jobs=-1 means to use all available cores
# n_estimators is the number of trees to use
# max_samples is the number of samples to use for each tree (recall that we sample with replacement)
# max_features is the number of features to consider for each split
# min_samples_leaf is the minimum number of samples that must be represented in each leaf
# oob_score=True has the model expose an oob_score_ attribute that is the R^2 score of the model on the out-of-bag samples
def rf(xs, y, n_estimators=40, max_samples=200_000,
       max_features=0.5, min_samples_leaf=5, **kwargs):
    """Fit and return a `RandomForestRegressor` on `xs` and `y`.

    `n_estimators`, `max_samples`, `max_features` and `min_samples_leaf` are
    passed straight to the regressor. Any extra keyword arguments (for example
    `random_state`) are forwarded as well instead of being silently dropped.
    `n_jobs=-1` and `oob_score=True` are applied as defaults and can be
    overridden through `kwargs`.
    """
    # setdefault keeps the notebook's defaults while avoiding a duplicate-keyword
    # error when the caller supplies one of these explicitly
    kwargs.setdefault('n_jobs', -1)
    kwargs.setdefault('oob_score', True)
    return RandomForestRegressor(n_estimators=n_estimators,
        max_samples=max_samples, max_features=max_features,
        min_samples_leaf=min_samples_leaf, **kwargs).fit(xs, y)

In [None]:
m = rf(xs, y);

In [None]:
# as expected, this model performs much better than the decision tree
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)

# it is a property of random forests that they are not very sensitive to hyperparameter choices; increasing n_estimators should result in a better model
# sklearn's defaults work well too
# the sklearn docs show that when we use many estimators, we can achieve lower error with a smaller max_features

In [None]:
# we illustrate the effect of the number of trees on the error
# we can get the predictions of the i-th (0-indexed) tree with m.estimators_[i].predict(xs)
preds = np.stack([t.predict(valid_xs) for t in m.estimators_])

In [None]:
# we verify that the mean of the predictions of the trees is the same as by the usual API, by comparing their errors
r_mse(preds.mean(0), valid_y)

In [None]:
# we plot the error as a function of the number of trees
# iterate over however many trees were stacked into `preds` (its first axis) rather than a hard-coded 40,
# so the plot stays correct if the forest is built with a different n_estimators
plt.plot([r_mse(preds[:i+1].mean(0), valid_y) for i in range(len(preds))]);

### Out-of-Bag Error

In [None]:
# m.oob_prediction_ is the prediction of the model on the out-of-bag samples
r_mse(m.oob_prediction_, y)

## Model Interpretation

### Tree Variance for Prediction Confidence

In [None]:
preds = np.stack([t.predict(valid_xs) for t in m.estimators_])

In [None]:
preds.shape

In [None]:
# this computes the standard deviation among the predictions between the trees
preds_std = preds.std(0)

In [None]:
# in most applications, this confidence informs us how much we can rely on the model's predictions
preds_std[:5]

### Feature Importance

In [None]:
# m.feature_importances_ gives the importance of each feature, as computed against the training data; the Gini importance, summing to 1
def rf_feat_importance(m, df):
    "DataFrame pairing each column of `df` (`cols`) with `m.feature_importances_` (`imp`), most important first."
    importances = pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_})
    return importances.sort_values('imp', ascending=False)

In [None]:
fi = rf_feat_importance(m, xs)
fi[:10]

In [None]:
def plot_fi(fi):
    "Horizontal bar chart of the `imp` column against the `cols` column of `fi`; returns whatever `fi.plot` returns."
    plot_opts = dict(figsize=(12,7), legend=False)
    return fi.plot(x='cols', y='imp', kind='barh', **plot_opts)

plot_fi(fi[:30]);

### Removing Low-Importance Variables

In [None]:
to_keep = fi[fi.imp>0.005].cols
len(to_keep)

In [None]:
# xs_imp is the training data projected onto the important features
xs_imp = xs[to_keep]
valid_xs_imp = valid_xs[to_keep]

In [None]:
m = rf(xs_imp, y)

In [None]:
m_rmse(m, xs_imp, y), m_rmse(m, valid_xs_imp, valid_y)

In [None]:
len(xs.columns), len(xs_imp.columns)

In [None]:
plot_fi(rf_feat_importance(m, xs_imp));

### Removing Redundant Features

In [None]:
# this fastai function draws a dendrogram that merges the most similar columns first (to the right)
# we see that saleYear and saleElapsed are very similar, as are ProductGroup and ProductGroupDesc
cluster_columns(xs_imp)

In [None]:
def get_oob(df):
    "Fit a 40-tree random forest on `df` against the notebook-level target `y` and return its `oob_score_`."
    forest = RandomForestRegressor(
        n_jobs=-1, oob_score=True, n_estimators=40,
        max_features=0.5, max_samples=50000, min_samples_leaf=15)
    forest.fit(df, y)
    return forest.oob_score_

In [None]:
get_oob(xs_imp)

In [None]:
# we see what happens when we drop one column suspected to be redundant; one by one
{c:get_oob(xs_imp.drop(c, axis=1)) for c in (
    'saleYear', 'saleElapsed', 'ProductGroupDesc','ProductGroup',
    'fiModelDesc', 'fiBaseModel',
    'Hydraulics_Flow','Grouser_Tracks', 'Coupler_System')}

In [None]:
# now we try dropping multiple
to_drop = ['saleYear', 'ProductGroupDesc', 'fiBaseModel', 'Grouser_Tracks']
get_oob(xs_imp.drop(to_drop, axis=1))

In [None]:
# again we create a dataset without the dropped columns
xs_final = xs_imp.drop(to_drop, axis=1)
valid_xs_final = valid_xs_imp.drop(to_drop, axis=1)

In [None]:
save_pickle(path/'xs_final.pkl', xs_final)
save_pickle(path/'valid_xs_final.pkl', valid_xs_final)

In [None]:
xs_final = load_pickle(path/'xs_final.pkl')
valid_xs_final = load_pickle(path/'valid_xs_final.pkl')

In [None]:
# by doing this, we have greatly simplified our dataset and model without increasing the error
m = rf(xs_final, y)
m_rmse(m, xs_final, y), m_rmse(m, valid_xs_final, valid_y)

### Partial Dependence

In [None]:
# we now investigate the relationship between ProductSize and SalePrice
# we first plot a horizontal bar plot of the number of items in each category
# these come from the pandas APIs: valid_xs_final is a DataFrame
# pandas uses matplotlib to plot
# p is of type matplotlib.axes.Axes
# c is of type fastai.data.transforms.CategoryMap , that behaves like an array of strings
# plt.yticks sets the y-axis tick labels
p = valid_xs_final['ProductSize'].value_counts(sort=False).plot.barh()
c = to.classes['ProductSize']
plt.yticks(range(len(c)), c);

In [None]:
# ax is of type matplotlib.AxesSubplot
ax = valid_xs_final['YearMade'].hist()

In [None]:
# to see the relationship between YearMade and SalePrice, we use the PartialDependenceDisplay class from sklearn
# we can't just naively take e.g. the mean of the SalePrice for each YearMade, because we have to take into account the other features
# what we do is to replace the YearMade column in e.g. valid_xs_final with a single value, and then see how the model's predictions change
# one then observes a nearly linear relationship between YearMade and SalePrice
# for ProductSize, we see that prices vanish for small sizes. Where data is missing, we see that the model predicts a price close to medium sizes.
from sklearn.inspection import PartialDependenceDisplay

fig,ax = plt.subplots(figsize=(12, 4))
PartialDependenceDisplay.from_estimator(m, valid_xs_final, ['YearMade','ProductSize'], 
                                        grid_resolution=20, ax=ax)

### Data Leakage

### Tree Interpreter

In [None]:
#hide
#Data leakage refers to information about the target of a model that shouldn't be available to it during training, but is. To detect data leakage, try to
# - Check whether the accuracy of the model is *too good to be true*.
# - Look for important predictors that don't make sense in practice.
# - Look for partial dependence plot results that don't make sense in practice.

# we now consider explaining the model's prediction for a single row
import warnings
warnings.simplefilter('ignore', FutureWarning)

from treeinterpreter import treeinterpreter
from waterfall_chart import plot as waterfall

In [None]:
# first consider 5 rows
row = valid_xs_final.iloc[:5]

In [None]:
# obtain prediction data about the model on these rows
# prediction is the prediction of the model
# bias is the average prediction of the model
# contributions is the contribution of each feature to the prediction; its sum is the difference between the prediction and the bias
prediction,bias,contributions = treeinterpreter.predict(m, row.values)

In [None]:
prediction[0], bias[0], contributions[0].sum()

In [None]:
# the waterfall chart shows the contribution of each feature to the value of the (regression) prediction
waterfall(valid_xs_final.columns, contributions[0], threshold=0.08, 
          rotation_value=45,formatting='{:,.3f}');

## Extrapolation and Neural Networks

### The Extrapolation Problem

In [None]:
#hide
# an issue with tree-based models is that they do not extrapolate well beyond the range of their training data
# seed both PyTorch and NumPy: the noise in the next cell is drawn with torch.randn_like,
# which np.random.seed alone does not make reproducible
torch.manual_seed(42)
np.random.seed(42)

In [None]:
# plot a y=x plot with some normal noise
x_lin = torch.linspace(0,20, steps=40)
y_lin = x_lin + torch.randn_like(x_lin)
plt.scatter(x_lin, y_lin);

In [None]:
# convert to column vector
xs_lin = x_lin.unsqueeze(1)
x_lin.shape,xs_lin.shape

In [None]:
# alternate method to convert to column vector
x_lin[:,None].shape

In [None]:
# we now train a random forest on part of this data having small values
m_lin = RandomForestRegressor().fit(xs_lin[:30],y_lin[:30])

In [None]:
# and when we predict, we see that the model hasn't learned the linear relationship outside the domain of the training data
plt.scatter(x_lin, y_lin, 20)
plt.scatter(x_lin, m_lin.predict(xs_lin), color='red', alpha=0.5);

### Finding Out-of-Domain Data

In [None]:
# to see if your test set is different from your training set, and in what way, you can train a random forest to predict whether a row is in the training set or the validation set
# we concatenate the training and validation sets
# and initialize a dependent variable that is 0 for the training set and 1 for the validation set
# here, rf_feat_importance is the helper defined earlier in this notebook; it returns a dataframe of the feature importances
# we see that saleElapsed encodes the date, so it contributes greatly since the training and validation sets are split by date
# we see that SalesID and MachineID may be too.

df_dom = pd.concat([xs_final, valid_xs_final])
is_valid = np.array([0]*len(xs_final) + [1]*len(valid_xs_final))

m = rf(df_dom, is_valid)
rf_feat_importance(m, df_dom)[:6]

In [None]:
# we compute how removing these columns might affect the model's performance; and removing the SalesID and MachineID seems like it might actually improve predictions
m = rf(xs_final, y)
print('orig', m_rmse(m, valid_xs_final, valid_y))

for c in ('SalesID','saleElapsed','MachineID'):
    m = rf(xs_final.drop(c,axis=1), y)
    print(c, m_rmse(m, valid_xs_final.drop(c,axis=1), valid_y))

In [None]:
# we now remove them; the resultant model should be more resilient wrt differences like in that between the training and validation sets
time_vars = ['SalesID','MachineID']
xs_final_time = xs_final.drop(time_vars, axis=1)
valid_xs_time = valid_xs_final.drop(time_vars, axis=1)

m = rf(xs_final_time, y)
m_rmse(m, valid_xs_time, valid_y)

In [None]:
# otoh we may consider removing old data that is not representative of the current data
# intuition: for a random tree, it is unable to learn to not trust old data wrt certain relationships
# we plot the histogram and see that we have most data from 2002 to 2011 and so it is sensible to just consider data from this subset
xs['saleYear'].hist();

In [None]:
filt = xs['saleYear']>2004
xs_filt = xs_final_time[filt]
y_filt = y[filt]

In [None]:
m = rf(xs_filt, y_filt)
m_rmse(m, xs_filt, y_filt), m_rmse(m, valid_xs_time, valid_y)

### Using a Neural Network

In [None]:
# we now consider using a neural network to model the data instead.
# we again perform the same preprocessing as before
df_nn = pd.read_csv(path/'TrainAndValid.csv', low_memory=False)
# apply the ordered ProductSize categories to df_nn itself (not to the earlier `df`),
# so the frame fed to the neural network gets the same preprocessing as before
df_nn['ProductSize'] = df_nn['ProductSize'].astype('category').cat.set_categories(sizes, ordered=True)
# the target is the log of the sale price, as for the tree models
df_nn[dep_var] = np.log(df_nn[dep_var])
df_nn = add_datepart(df_nn, 'saledate')

In [None]:
# we restrict the features to the same ones as before
df_nn_final = df_nn[list(xs_final_time.columns) + [dep_var]]

In [None]:
# we use max_card to determine which variables should be treated as categorical
# categorical variables typically should not be more than 10000, and if they are, consider first preprocessing and compressing
cont_nn,cat_nn = cont_cat_split(df_nn_final, max_card=9000, dep_var=dep_var)

In [None]:
# view which variables are continuous
cont_nn

In [None]:
# view the categories of our categorical variables
# ModelID seems to be co-redundant alongside fiModelDescriptor
df_nn_final[cat_nn].nunique()

In [None]:
# we validate that this does not cause a drop in performance by evaluating a random forest 
xs_filt2 = xs_filt.drop('fiModelDescriptor', axis=1)
valid_xs_time2 = valid_xs_time.drop('fiModelDescriptor', axis=1)
m2 = rf(xs_filt2, y_filt)
m_rmse(m2, xs_filt2, y_filt), m_rmse(m2, valid_xs_time2, valid_y)

In [None]:
cat_nn.remove('fiModelDescriptor')

In [None]:
# we define the TabularPandas for the neural network
procs_nn = [Categorify, FillMissing, Normalize]
to_nn = TabularPandas(df_nn_final, procs_nn, cat_nn, cont_nn,
                      splits=splits, y_names=dep_var)

In [None]:
# we can typically use larger batch sizes for tabular data compared to images since each item is typically smaller
dls = to_nn.dataloaders(1024)

In [None]:
y = to_nn.train.y
y.min(),y.max()

In [None]:
# here the log price is restricted to the range 8 to 12; the dense layers are 500 and 250
# here we are using the library's mse loss function
learn = tabular_learner(dls, y_range=(8,12), layers=[500,250],
                        n_out=1, loss_func=F.mse_loss)

In [None]:
# recall our procedure for selecting an appropriate lr
learn.lr_find()

In [None]:
# there's no pretrained model here, so no fine_tune. We just fit for 5 epochs
learn.fit_one_cycle(5, 1e-2)

In [None]:
# compute loss on validation set
# we have achieved a lower loss than with the random forest, though more training time and hyperparameter tuning was required
preds,targs = learn.get_preds()
r_mse(preds,targs)

In [None]:
learn.save('nn')

### Sidebar: fastai's Tabular Classes

### End sidebar

## Ensembling

In [None]:
# ensembling the rf model and the nn model together
# note that we need to do a little processing on the output
rf_preds = m.predict(valid_xs_time)
ens_preds = (to_np(preds.squeeze()) + rf_preds) /2

In [None]:
r_mse(ens_preds,valid_y)

### Boosting

Boosting is an ensemble method that adds models together rather than averaging them: we train a small model that underfits, then train a new model on the residuals (using the differences between the targets and the current predictions as the new target), and repeat. Models composed of boosted trees include gradient boosting machines (GBMs) and gradient boosted decision trees (GBDTs). But GBMs require hyperparameter tuning and can overfit.

### Combining Embeddings with Other Methods

Consider using embeddings learned by an NN as the input to other ML models; this technique has frequently gotten great results (the advantage is that while the embedding mapping still needs to be run at inference, the later layers may be replaced by a simpler model).

## Conclusion: Our Advice for Tabular Modeling

## Questionnaire

1. What is a continuous variable?
1. What is a categorical variable?
1. Provide two of the words that are used for the possible values of a categorical variable.
1. What is a "dense layer"?
1. How do entity embeddings reduce memory usage and speed up neural networks?
1. What kinds of datasets are entity embeddings especially useful for?
1. What are the two main families of machine learning algorithms?
1. Why do some categorical columns need a special ordering in their classes? How do you do this in Pandas?
1. Summarize what a decision tree algorithm does.
1. Why is a date different from a regular categorical or continuous variable, and how can you preprocess it to allow it to be used in a model?
1. Should you pick a random validation set in the bulldozer competition? If no, what kind of validation set should you pick?
1. What is pickle and what is it useful for?
1. How are `mse`, `samples`, and `values` calculated in the decision tree drawn in this chapter?
1. How do we deal with outliers, before building a decision tree?
1. How do we handle categorical variables in a decision tree?
1. What is bagging?
1. What is the difference between `max_samples` and `max_features` when creating a random forest?
1. If you increase `n_estimators` to a very high value, can that lead to overfitting? Why or why not?
1. In the section "Creating a Random Forest", just after <<max_features>>, why did `preds.mean(0)` give the same result as our random forest?
1. What is "out-of-bag-error"?
1. Make a list of reasons why a model's validation set error might be worse than the OOB error. How could you test your hypotheses?
1. Explain why random forests are well suited to answering each of the following questions:
   - How confident are we in our predictions using a particular row of data?
   - For predicting with a particular row of data, what were the most important factors, and how did they influence that prediction?
   - Which columns are the strongest predictors?
   - How do predictions vary as we vary these columns?
1. What's the purpose of removing unimportant variables?
1. What's a good type of plot for showing tree interpreter results?
1. What is the "extrapolation problem"?
1. How can you tell if your test or validation set is distributed in a different way than your training set?
1. Why do we ensure `saleElapsed` is a continuous variable, even though it has fewer than 9,000 distinct values?
1. What is "boosting"?
1. How could we use embeddings with a random forest? Would we expect this to help?
1. Why might we not always use a neural net for tabular modeling?

### Further Research

1. Pick a competition on Kaggle with tabular data (current or past) and try to adapt the techniques seen in this chapter to get the best possible results. Compare your results to the private leaderboard.
1. Implement the decision tree algorithm in this chapter from scratch yourself, and try it on the dataset you used in the first exercise.
1. Use the embeddings from the neural net in this chapter in a random forest, and see if you can improve on the random forest results we saw.
1. Explain what each line of the source of `TabularModel` does (with the exception of the `BatchNorm1d` and `Dropout` layers).