In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from pathlib import Path
from sklearn.ensemble import ExtraTreesRegressor

from sklearn import metrics

# Lesson 3

## 00:00:00 - Lesson 2 recap

* Common refrain = things like Random Forests hide meaning from us - not the case: can understand data deeper and quicker.
* Going to look at larger datasets.

## 00:02:32 - Audience questions

* Q1: When should I use Random Forests?
* A1: Almost always for structured data: good starting point.
  * Deep learning for unstructured data.

## 00:04:42 - proc_df function bug

* `proc_df` didn't have a way to reuse the missing data values in the test set.
  * Now returns `nas` as a third element in the returned tuple.

In [2]:
# Point PATH at the bluebook data and read the training CSV, parsing
# `saledate` as a datetime column.
PATH = Path('./data/bluebook/')
df_raw = pd.read_csv(
    f'{PATH}/Train.csv',
    parse_dates=['saledate'],
    low_memory=False,
)
# Both helpers are called only for their effect on df_raw; their return
# values are not used.
add_datepart(df_raw, 'saledate')
train_cats(df_raw)

In [3]:
# proc_df returns three values; the third (bound to na_dict here) is the `nas`
# element described in the notes above, i.e. the missing-data fill values.
df, y, na_dict = proc_df(df_raw, 'SalePrice')

In [4]:
# Display the third value returned by proc_df.
na_dict

{'auctioneerID': 2.0, 'MachineHoursCurrentMeter': 0.0}

## 00:09:25 - Bigger datasets and Groceries competition

* Ability to explain problem key to machine learning.
* Favorita Grocery Sales Forecasting:
  * Goal: Predict how much stock would be sold on each day for each item during a 2 week period.
  * Data provided:
    * How many units of each product were sold in each store on each day, over the past few years.
    * Metadata for store (location, class of store)
    * Metadata for each product (category of product)
    * Metadata for each date (e.g. the oil price on that date)

* "Relational dataset" - number of things you can join together.
* Star schema:
  * Central transactions table `train.csv`
    * Includes `unit_sales` by `date`, `store_nbr` and `item_nbr`
  * Can join metadata to central table.
* Snowflake schema:
  * Tables join other tables.

In [5]:
# Rebind PATH to the grocery-sales data directory (it pointed at
# ./data/bluebook/ earlier in the notebook).
PATH = Path('./data/grocery-sales')

In [6]:
# Download the favorita-grocery-sales-forecasting competition files into PATH
# using the Kaggle command-line client.
!kaggle competitions download -c favorita-grocery-sales-forecasting --path {PATH}

test.csv.7z: Downloaded 5MB of 5MB to data/grocery-sales
stores.csv.7z: Downloaded 648B of 648B to data/grocery-sales
oil.csv.7z: Downloaded 4KB of 4KB to data/grocery-sales
items.csv.7z: Downloaded 14KB of 14KB to data/grocery-sales
holidays_events.csv.7z: Downloaded 2KB of 2KB to data/grocery-sales
transactions.csv.7z: Downloaded 214KB of 214KB to data/grocery-sales
train.csv.7z: Downloaded 452MB of 452MB to data/grocery-sales
sample_submission.csv.7z: Downloaded 651KB of 651KB to data/grocery-sales


In [22]:
for file in PATH.iterdir():
    if not str(file).endswith('7z'):
        continue

    !7z x {file} -o{PATH} -aoa > /dev/null

In [30]:
# Create the scratch directory used for intermediate files. Done with pathlib
# rather than a shell escape so it does not depend on a POSIX `mkdir`;
# parents/exist_ok reproduce the behaviour of `mkdir -p`.
(PATH / 'tmp').mkdir(parents=True, exist_ok=True)

### 00:15:14 - `low_memory` in `read_csv`

* Setting `low_memory=False` will run out of memory on big datasets.
  * Instead, tell Pandas the column types up front by passing a `dtype` argument to `read_csv`.

In [23]:
# Explicit per-column dtypes handed to read_csv, so pandas does not have to
# infer them while parsing.
types = dict(
    id='int64',
    item_nbr='int32',
    store_nbr='int8',
    unit_sales='float32',
    onpromotion='object',
)

In [25]:
%%time
df_all = pd.read_csv(
    PATH / 'train.csv', parse_dates=['date'], dtype=types,
    infer_datetime_format=True
)

CPU times: user 2min 56s, sys: 34.9 s, total: 3min 31s
Wall time: 4min


In [31]:
df_all.onpromotion.fillna(False, inplace=True)
df_all.onpromotion = df_all.onpromotion.map({'False': False, 'True': True})
df_all.onpromotion = df_all.onpromotion.astype(bool)

%time df_all.to_feather(PATH / 'tmp/raw_groceries')

CPU times: user 2.03 s, sys: 31.5 s, total: 33.5 s
Wall time: 55.6 s


In [32]:
# Summarise every column of df_all (include='all' adds the non-numeric
# columns to the summary) and time the call.
%time df_all.describe(include='all')

CPU times: user 48.7 s, sys: 24.5 s, total: 1min 13s
Wall time: 1min 8s


Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
count,125497000.0,125497040,125497000.0,125497000.0,125497000.0,125497040
unique,,1684,,,,1
top,,2017-07-01 00:00:00,,,,True
freq,,118194,,,,125497040
first,,2013-01-01 00:00:00,,,,
last,,2017-08-15 00:00:00,,,,
mean,62748520.0,,27.46458,972769.2,8.554856,
std,36227880.0,,16.33051,520533.6,23.60515,
min,0.0,,1.0,96995.0,-15372.0,
25%,31374260.0,,12.0,522383.0,2.0,
