### Read Data from Training Files

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

from fastai.imports import *
from fastai.structured import *

In [2]:
# Directory that contains the training CSV (Train.csv).
PATH = 'dataset'

In [6]:
# One-time conversion: uncomment and run once to read the CSV and cache it in
# feather format; later runs can then load the much faster feather file.
#df_raw = pd.read_csv(f'{PATH}/Train.csv',low_memory = False,parse_dates=["saledate"])
#os.makedirs('tmp',exist_ok=True)
#df_raw.to_feather('tmp/df_raw')

In [25]:
# Load the cached feather copy when it exists; otherwise build it from the CSV
# first, so the notebook also runs from a clean checkout (Restart Kernel -> Run All)
# instead of failing because 'tmp/df_raw' has never been written.
FEATHER_PATH = 'tmp/df_raw'
if os.path.exists(FEATHER_PATH):
    df_raw = pd.read_feather(FEATHER_PATH)
else:
    # `saledate` is parsed as a datetime so the date-part features can be derived later.
    df_raw = pd.read_csv(f'{PATH}/Train.csv', low_memory=False, parse_dates=["saledate"])
    os.makedirs('tmp', exist_ok=True)
    df_raw.to_feather(FEATHER_PATH)

In [26]:
### df_raw.columns: Investigate columns of the data

In [27]:
# Model the log of the sale price, so the RMSE computed later is an error in log-price.
# NOTE: this overwrites the column, so re-running this cell applies the log a second time.
df_raw['SalePrice'] = np.log(df_raw['SalePrice'])

##  Pre-Processing Steps

### Step 1: Split the Sale Date into Useful Columns (Year, Month, Week, etc.)

In [28]:
# Expand the `saledate` column of df_raw into date-part feature columns (saleYear,
# saleMonth, saleWeek, ...). The helper comes from the fastai star imports and is
# called for its effect on df_raw; no return value is used here.
# NOTE(review): presumably also drops the original `saledate` column — confirm against fastai.structured.
add_datepart(df_raw,'saledate')

In [18]:
### df_raw.columns :: inspect the newly added date-part columns

### Step 2: Convert String to Categorical Data Types in pandas

In [29]:
# Convert the string columns of df_raw to pandas categoricals (fastai helper, called
# for its effect on df_raw; no return value is used here).
train_cats(df_raw)

In [13]:
# Show the category labels of UsageBand in their current order.
df_raw['UsageBand'].cat.categories

Index(['High', 'Low', 'Medium'], dtype='object')

In [30]:
# Give UsageBand a meaningful ordered sequence (High, Medium, Low) instead of the
# alphabetical default (High, Low, Medium). The result is assigned back rather than
# using `inplace=True`, which pandas deprecated in 1.3 and removed in 2.0.
df_raw['UsageBand'] = df_raw['UsageBand'].cat.set_categories(["High","Medium","Low"], ordered=True)

### Step 3: Fill Null Values

In [33]:
#Get Null Value Statistics
#df_raw.info()
#(df_raw.isnull().sum().sort_index())/len(df_raw)

In [None]:
# pandas encodes missing values in a categorical column with the code -1.
# Adding 1 to the category codes makes 0 represent missing values instead.

In [41]:
# Fix missing values: fill numeric columns with their median and add a separate boolean column, suffixed _na, that flags the rows that were missing

In [36]:
# Build the model inputs from df_raw with 'SalePrice' as the target column:
# df is the processed feature frame and y the target (both are split into
# training and validation parts later).
# NOTE(review): na_dict presumably records the fill values used for missing numeric columns — confirm against fastai.structured.proc_df.
df,y,na_dict =proc_df(df_raw,'SalePrice')

In [37]:
df.columns # This adds two new "_na" columns for the numerical columns

Index(['SalesID', 'MachineID', 'ModelID', 'datasource', 'auctioneerID',
       'YearMade', 'MachineHoursCurrentMeter', 'UsageBand', 'fiModelDesc',
       'fiBaseModel', 'fiSecondaryDesc', 'fiModelSeries', 'fiModelDescriptor',
       'ProductSize', 'fiProductClassDesc', 'state', 'ProductGroup',
       'ProductGroupDesc', 'Drive_System', 'Enclosure', 'Forks', 'Pad_Type',
       'Ride_Control', 'Stick', 'Transmission', 'Turbocharged',
       'Blade_Extension', 'Blade_Width', 'Enclosure_Type', 'Engine_Horsepower',
       'Hydraulics', 'Pushblock', 'Ripper', 'Scarifier', 'Tip_Control',
       'Tire_Size', 'Coupler', 'Coupler_System', 'Grouser_Tracks',
       'Hydraulics_Flow', 'Track_Type', 'Undercarriage_Pad_Width',
       'Stick_Length', 'Thumb', 'Pattern_Changer', 'Grouser_Type',
       'Backhoe_Mounting', 'Blade_Type', 'Travel_Controls',
       'Differential_Type', 'Steering_Controls', 'saleYear', 'saleMonth',
       'saleWeek', 'saleDay', 'saleDayofweek', 'saleDayofyear',
       'saleI

## Fitting the model

### Step 4: Split the Data into Training and Validation Sets

In [39]:
#Split the data into train and validation

In [43]:
def split_vals(a,n):
    """Split `a` at position `n`: return copies of the first `n` items and of the rest."""
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail

In [44]:
# Hold out the last n_valid rows for validation and train on everything before them.
# NOTE(review): this is only a time-based holdout if the rows are ordered by sale date — confirm.
n_valid = 12000  # same as Kaggle's test set size
n_trn = len(df) - n_valid

# Cut the processed features, the target and the raw frame at the same row.
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)
raw_train, raw_valid = split_vals(df_raw, n_trn)

# Shapes of the four modelling pieces, shown as a sanity check.
tuple(part.shape for part in (X_train, y_train, X_valid, y_valid))

((389125, 66), (389125,), (12000, 66), (12000,))

### Step 5: Define the Metrics and Train the Model

In [45]:
def rmse(x,y):
    """Return the root-mean-squared error between `x` and `y` as a Python float."""
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

In [52]:
def prettyprint_score(m):
    """Print the RMSE and `m.score` of a fitted model on the train and validation sets.

    Reads the notebook globals X_train, y_train, X_valid and y_valid. The
    `*_acc` entries hold whatever `m.score` returns (for scikit-learn
    regressors that is R^2, not a classification accuracy). `oob_score` is
    included only when the model exposes an `oob_score_` attribute.
    """
    train_error = rmse(m.predict(X_train), y_train)
    valid_error = rmse(m.predict(X_valid), y_valid)
    res = {'train_error': train_error, 'valid_error': valid_error}
    res['train_acc'] = m.score(X_train, y_train)
    res['valid_acc'] = m.score(X_valid, y_valid)
    if hasattr(m, 'oob_score_'):
        res['oob_score'] = m.oob_score_
    print(res)

In [53]:
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train,y_train)

CPU times: user 2min 7s, sys: 272 ms, total: 2min 8s
Wall time: 46 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

### Step 6: Get Accuracy of the Model

In [54]:
prettyprint_score(m)

{'train_error': 0.0901574986999728, 'valid_error': 0.24676202387934568, 'train_acc': 0.983012145420834, 'valid_acc': 0.8912561709638892}
