# Lesson 1 and 2 Notes
- Intro to RFs, how to pre-process, hyperparamters, general fiddling around 

### Imports and Setup

In [73]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
import math

In [None]:
import sys
import os
sys.path.insert(0, "/Users/JI/Documents/Github/fastai/old/")
# print(sys.path)
import fastai
print(sys.modules['fastai'])

In [33]:
from fastai.structured import add_datepart,train_cats,proc_df,fix_missing,numericalize,set_rf_samples
import pandas as pd
import numpy as np
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics


In [6]:
PATH = "./data/bulldozers/"
# !ls {PATH}

### Load Dataset

In [37]:
df_raw = pd.read_csv(f'{PATH}Train.csv', low_memory=False, 
                     parse_dates=["saledate"])

### Look at the Data

#### Display all cols in a df

In [39]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

In [41]:
# functions to look inside data
df_raw.head()
df_raw.dtypes
df_raw.columns
df_summ = DataFrameSummary(df_raw)
display_all(df_summ.summary())

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,saledate,fiModelDesc,fiBaseModel,fiSecondaryDesc,fiModelSeries,fiModelDescriptor,ProductSize,fiProductClassDesc,state,ProductGroup,ProductGroupDesc,Drive_System,Enclosure,Forks,Pad_Type,Ride_Control,Stick,Transmission,Turbocharged,Blade_Extension,Blade_Width,Enclosure_Type,Engine_Horsepower,Hydraulics,Pushblock,Ripper,Scarifier,Tip_Control,Tire_Size,Coupler,Coupler_System,Grouser_Tracks,Hydraulics_Flow,Track_Type,Undercarriage_Pad_Width,Stick_Length,Thumb,Pattern_Changer,Grouser_Type,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
count,401125,401125,401125,401125,401125,380989,401125,142765,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
mean,1.91971e+06,31099.7,1.2179e+06,6889.7,134.666,6.55604,1899.16,3457.96,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
std,909021,23036.9,440992,6221.78,8.96224,16.9768,291.797,27590.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
min,1.13925e+06,4750,0,28,121,0,1000,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25%,1.41837e+06,14500,1.0887e+06,3259,132,1,1985,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
50%,1.63942e+06,24000,1.27949e+06,4604,132,2,1995,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
75%,2.24271e+06,40000,1.46807e+06,8724,136,4,2000,3025,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
max,6.33334e+06,142000,2.48633e+06,37198,172,99,2013,2.4833e+06,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
counts,401125,401125,401125,401125,401125,380989,401125,142765,69639,401125,401125,401125,263934,56908,71919,190350,401125,401125,401125,401125,104361,400800,192077,79134,148606,79134,183230,79134,25219,25219,25219,25219,320570,25219,104137,25230,25219,94718,213952,43458,43362,43362,99153,99872,99218,99288,99218,99153,78672,79833,79834,69411,69369
uniques,401125,899,341027,5218,5,30,72,15152,3,3919,4999,1950,175,122,139,6,74,53,6,6,4,6,2,4,3,2,8,2,2,6,3,2,12,2,4,2,3,17,3,2,2,3,2,19,29,3,3,3,2,10,7,4,5


### Pre-Processing Steps
Need to make cols numeric
1. change dates to numerics (add_datepart)
2. change all string names to categorical variables (train_cats). To apply the same categorical mappings to test set, use (apply_cats). Make sure the categorical mappings make sense, i.e. Low, Med, High instead of High, Low, Med etc.
3. Take care of missing/null values (proc_df does all of below)
    - if numeric, add new col(_na) with 1 or 0, and fill na with median value (fix_missing)
    - pandas auto sets null categorical values to -1, so add 1 to all codes using (numericalize)

#### Transform output
- the project uses RMSLE (root mean squared log error), therefore transform the output variable SalePrice by taking the log

In [78]:
df_raw.SalePrice = np.log(df_raw.SalePrice)

In [79]:
df_raw.SalePrice

0         11.097410
1         10.950807
2          9.210340
3         10.558414
4          9.305651
            ...    
401120     9.259131
401121     9.305651
401122     9.350102
401123     9.104980
401124     8.955448
Name: SalePrice, Length: 401125, dtype: float64

#### Change dates to numerics and add interesting date info

In [42]:
add_datepart(df_raw,'saledate')

#### Change strings to categoricals

In [43]:
train_cats(df_raw)

In [46]:
display(df_raw.UsageBand.cat.categories)
df_raw.UsageBand.cat.set_categories(['High', 'Medium', 'Low'], ordered=True, inplace=True)
display(df_raw.UsageBand.cat.categories)

Index(['High', 'Medium', 'Low'], dtype='object')

Index(['High', 'Medium', 'Low'], dtype='object')

In [47]:
# or replace with numbers
df_raw.UsageBand = df_raw.UsageBand.cat.codes

#### Save Pre-Processed data in feather format

In [80]:
os.makedirs('tmp',exist_ok=True)
df_raw.to_feather('tmp/bulldozers-raw')

#### Take care of null values

In [81]:
df_raw = pd.read_feather('tmp/bulldozers-raw')

In [48]:
display_all(df_raw.isnull().sum().sort_index()/len(df_raw))

Backhoe_Mounting            0.803872
Blade_Extension             0.937129
Blade_Type                  0.800977
Blade_Width                 0.937129
Coupler                     0.466620
Coupler_System              0.891660
Differential_Type           0.826959
Drive_System                0.739829
Enclosure                   0.000810
Enclosure_Type              0.937129
Engine_Horsepower           0.937129
Forks                       0.521154
Grouser_Tracks              0.891899
Grouser_Type                0.752813
Hydraulics                  0.200823
Hydraulics_Flow             0.891899
MachineHoursCurrentMeter    0.644089
MachineID                   0.000000
ModelID                     0.000000
Pad_Type                    0.802720
Pattern_Changer             0.752651
ProductGroup                0.000000
ProductGroupDesc            0.000000
ProductSize                 0.525460
Pushblock                   0.937129
Ride_Control                0.629527
Ripper                      0.740388
S

In [85]:
df, y, nas = proc_df(df_raw, 'SalePrice')
df

Unnamed: 0,SalesID,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,fiModelDesc,fiBaseModel,...,saleDayofyear,saleIs_month_end,saleIs_month_start,saleIs_quarter_end,saleIs_quarter_start,saleIs_year_end,saleIs_year_start,saleElapsed,auctioneerID_na,MachineHoursCurrentMeter_na
0,1139246,999089,3157,121,3.0,2004,68.0,2,950,296,...,320,False,False,False,False,False,False,1163635200,False,False
1,1139248,117657,77,121,3.0,1996,4640.0,2,1725,527,...,86,False,False,False,False,False,False,1080259200,False,False
2,1139249,434808,7009,121,3.0,2001,2838.0,0,331,110,...,57,False,False,False,False,False,False,1077753600,False,False
3,1139251,1026470,332,121,3.0,2001,3486.0,0,3674,1375,...,139,False,False,False,False,False,False,1305763200,False,False
4,1139253,1057373,17311,121,3.0,2007,722.0,1,4208,1529,...,204,False,False,False,False,False,False,1248307200,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401120,6333336,1840702,21439,149,1.0,2005,0.0,-1,657,207,...,306,False,False,False,False,False,False,1320192000,False,True
401121,6333337,1830472,21439,149,1.0,2005,0.0,-1,657,207,...,306,False,False,False,False,False,False,1320192000,False,True
401122,6333338,1887659,21439,149,1.0,2005,0.0,-1,657,207,...,306,False,False,False,False,False,False,1320192000,False,True
401123,6333341,1903570,21435,149,2.0,2005,0.0,-1,483,159,...,298,False,False,False,False,False,False,1319500800,False,True


### Random Forests
- A tree consists of a sequence of binary decisions/splits
- How do you find the most simple basic split (which variable, which split point)?
    - for every feature and for every split within that feature, we find the weighted avg of the mse, which one had the best mse and we picked that
    - split when you hit a limit, or when leaf nodes have only one decision left
- How can you make a decision tree better?
    - **FORESTS!** RFs are simply a way of *Bagging* trees
- What is Bagging?
    - construct multiple uncorelated models whose errors are close to random
- What are some RF hyperparameters?
    - num_estimators(trees) - as many as you have time to fit, give you good r2 (mostly testing using 20,30 - finally using 1k or so)
    - min_samples_leaf - min # of samples
    - 

#### Basic Model

In [60]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(df,y)
%time m.score(df,y)



CPU times: user 2.93 s, sys: 394 ms, total: 3.32 s
Wall time: 2.56 s


0.9825350346119319

#### Creating a validation set

In [86]:
# split raw df into train and val sets, val set contains most recent values
def split_vals(a,n):
    return a[:n].copy(),a[n:].copy()

n_valid = 12000 # same as Kaggle's test set
n_train = len(df) - n_valid
raw_train, raw_valid = split_vals(df_raw,n_train)
X_train, X_valid = split_vals(df,n_train)
y_train, y_valid = split_vals(y,n_train)

X_train.shape, y_train.shape, X_valid.shape
# a = np.array([1,2,3,4,5,6,7,8,9]) # figuring out what split_vals does
# split_vals(a,3)

((389125, 66), (389125,), (12000, 66))

#### Re-training with validation set

In [87]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m,'oob_score_'): res.append(m.oob_score_)
    print(res)

In [88]:
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train,y_train)
print_score(m)



CPU times: user 1min 40s, sys: 877 ms, total: 1min 41s
Wall time: 33.9 s
[0.09050009786847515, 0.25139148711010156, 0.982882792166662, 0.8871376449956733]


#### Speeding things up
If something takes longer than 10s to run, it's to slow to be interactive. Ideally, you want to create a model and tune hyperparameters quickly, then run on entire dataset when you head home
- Run your models on subsamples of the data, will give you most of the insights you can get, rather than training on all of huge dataset

## Notes

- It's good not knowing about the dataset before hand, will keep you open-minded as to what the data is saying

- Kaggle API: https://github.com/Kaggle/kaggle-api

- you can open a terminal within jupyter

- can run shell commands from jupyter with ! before, eg. **!ls /path** or **!ls {PATH}**

- read_csv low_memory=False reads in more of the file

- look at the evaluation metric to determine how to modify the variables (log loss, etc.)

- RFs are very robust, great place to start

- Scikit learn steps: 
        1. Create Instance of model you want
        2. call fit, pass in independent variables, and dependent variable


- ? docs, ?? source code

- pandas has a Category data type but doesn't change anything into it by default, use train_cats(df) to do so. Stores a mapping from integers to the strings

- RFs are trivially parallelizable (it will split up data across CPUs and linearly scale) n_jobs=-1

- the fastai library in this directory is a symlink (file that points to another file) pointing to the original fastai folder ../../old/fastai
ln -s ../../old/fastai

- R^2 is useful, ratio of how good your model is (root mean squared error) versus how good the naive mean model is (root squared error)

- creating validation set is most important thing to do in a ML model. Test set should never be touched until you are done done with modeling.

- when dealing with time series data, you want your test set to be of a different time (future) than your training set. So set your validation set to be different as well, not randomized.

- an effective ML model is accurate for the training set and also generalizes well

### Figuring out what add_datepart does

In [19]:
test_df = df_raw[['saledate']]

In [26]:
test_df.columns = ['testing']
test_df

Unnamed: 0,testing
0,2006-11-16
1,2004-03-26
2,2004-02-26
3,2011-05-19
4,2009-07-23
...,...
401120,2011-11-02
401121,2011-11-02
401122,2011-11-02
401123,2011-10-25


In [27]:
add_datepart(test_df,'testing')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [28]:
test_df

Unnamed: 0,testingYear,testingMonth,testingWeek,testingDay,testingDayofweek,testingDayofyear,testingIs_month_end,testingIs_month_start,testingIs_quarter_end,testingIs_quarter_start,testingIs_year_end,testingIs_year_start,testingElapsed
0,2006,11,46,16,3,320,False,False,False,False,False,False,1163635200
1,2004,3,13,26,4,86,False,False,False,False,False,False,1080259200
2,2004,2,9,26,3,57,False,False,False,False,False,False,1077753600
3,2011,5,20,19,3,139,False,False,False,False,False,False,1305763200
4,2009,7,30,23,3,204,False,False,False,False,False,False,1248307200
...,...,...,...,...,...,...,...,...,...,...,...,...,...
401120,2011,11,44,2,2,306,False,False,False,False,False,False,1320192000
401121,2011,11,44,2,2,306,False,False,False,False,False,False,1320192000
401122,2011,11,44,2,2,306,False,False,False,False,False,False,1320192000
401123,2011,10,43,25,1,298,False,False,False,False,False,False,1319500800


In [29]:
add_datepart(df_raw,'saledate')

In [30]:
df_raw

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter,UsageBand,fiModelDesc,...,saleDay,saleDayofweek,saleDayofyear,saleIs_month_end,saleIs_month_start,saleIs_quarter_end,saleIs_quarter_start,saleIs_year_end,saleIs_year_start,saleElapsed
0,1139246,66000,999089,3157,121,3.0,2004,68.0,Low,521D,...,16,3,320,False,False,False,False,False,False,1163635200
1,1139248,57000,117657,77,121,3.0,1996,4640.0,Low,950FII,...,26,4,86,False,False,False,False,False,False,1080259200
2,1139249,10000,434808,7009,121,3.0,2001,2838.0,High,226,...,26,3,57,False,False,False,False,False,False,1077753600
3,1139251,38500,1026470,332,121,3.0,2001,3486.0,High,PC120-6E,...,19,3,139,False,False,False,False,False,False,1305763200
4,1139253,11000,1057373,17311,121,3.0,2007,722.0,Medium,S175,...,23,3,204,False,False,False,False,False,False,1248307200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401120,6333336,10500,1840702,21439,149,1.0,2005,,,35NX2,...,2,2,306,False,False,False,False,False,False,1320192000
401121,6333337,11000,1830472,21439,149,1.0,2005,,,35NX2,...,2,2,306,False,False,False,False,False,False,1320192000
401122,6333338,11500,1887659,21439,149,1.0,2005,,,35NX2,...,2,2,306,False,False,False,False,False,False,1320192000
401123,6333341,9000,1903570,21435,149,2.0,2005,,,30NX,...,25,1,298,False,False,False,False,False,False,1319500800
