### Read Data from Training Files

In [1]:
#Kaggle Competition: https://www.kaggle.com/c/avito-demand-prediction/

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

from fastai.imports import *
from fastai.structured import *

In [2]:
# Directory that holds the competition data files (train.csv is read from here).
PATH= 'dataset'

In [3]:
### Run this cell once to read the CSV and cache it in feather format; afterwards load the feather file, which is much faster.
#df_raw = pd.read_csv(f'{PATH}/train.csv',low_memory = False,parse_dates=['activation_date'])
#os.makedirs('tmp',exist_ok=True)
#df_raw.to_feather('tmp/df_raw')

In [4]:
# Load the training data from the feather cache, which is much faster than CSV.
# On a fresh checkout the cache does not exist yet, so build it once from the
# raw competition CSV instead of failing with a missing-file error.
if os.path.exists('tmp/df_raw'):
    df_raw = pd.read_feather('tmp/df_raw')
else:
    df_raw = pd.read_csv(f'{PATH}/train.csv', low_memory=False, parse_dates=['activation_date'])
    os.makedirs('tmp', exist_ok=True)
    df_raw.to_feather('tmp/df_raw')

In [5]:
df_raw.columns

Index(['item_id', 'user_id', 'region', 'city', 'parent_category_name',
       'category_name', 'param_1', 'param_2', 'param_3', 'title',
       'description', 'price', 'item_seq_number', 'activation_date',
       'user_type', 'image', 'image_top_1', 'deal_probability'],
      dtype='object')

In [6]:
df_raw.head().transpose()

Unnamed: 0,0,1,2,3,4
item_id,b912c3c6a6ad,2dac0150717d,ba83aefab5dc,02996f1dd2ea,7c90be56d2ab
user_id,e00f8ff2eaf9,39aeb48f0017,91e2f88dd6e3,bf5cccea572d,ef50846afc0b
region,Свердловская область,Самарская область,Ростовская область,Татарстан,Волгоградская область
city,Екатеринбург,Самара,Ростов-на-Дону,Набережные Челны,Волгоград
parent_category_name,Личные вещи,Для дома и дачи,Бытовая электроника,Личные вещи,Транспорт
category_name,Товары для детей и игрушки,Мебель и интерьер,Аудио и видео,Товары для детей и игрушки,Автомобили
param_1,Постельные принадлежности,Другое,"Видео, DVD и Blu-ray плееры",Автомобильные кресла,С пробегом
param_2,,,,,ВАЗ (LADA)
param_3,,,,,2110
title,Кокоби(кокон для сна),Стойка для Одежды,Philips bluray,Автокресло,"ВАЗ 2110, 2003"


In [7]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503424 entries, 0 to 1503423
Data columns (total 18 columns):
item_id                 1503424 non-null object
user_id                 1503424 non-null object
region                  1503424 non-null object
city                    1503424 non-null object
parent_category_name    1503424 non-null object
category_name           1503424 non-null object
param_1                 1441848 non-null object
param_2                 848882 non-null object
param_3                 640859 non-null object
title                   1503424 non-null object
description             1387148 non-null object
price                   1418062 non-null float64
item_seq_number         1503424 non-null int64
activation_date         1503424 non-null datetime64[ns]
user_type               1503424 non-null object
image                   1390836 non-null object
image_top_1             1390836 non-null float64
deal_probability        1503424 non-null float64
dtypes: datetim

In [26]:
### df_raw.columns: Investigate columns of the data

### Step 1: Split the date-time column into useful columns such as year, month, week, etc.

In [8]:
# Mutates df_raw in place: the activation_date column is replaced by derived
# activation_* columns (Year, Month, Week, ..., Elapsed), as the later column
# listings show.
# NOTE(review): presumably not re-runnable on the same frame, since the source
# column is gone afterwards — reload df_raw before re-running.
add_datepart(df_raw,'activation_date')

In [9]:
### df_raw.columns :: Investigate the newly added columns

### Step 2: Convert String to Categorical Data Types in pandas

In [10]:
# Converts the string (object) columns of df_raw to pandas categoricals in
# place; the following df_raw.info() output shows them with dtype `category`.
train_cats(df_raw)

In [11]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503424 entries, 0 to 1503423
Data columns (total 30 columns):
item_id                        1503424 non-null category
user_id                        1503424 non-null category
region                         1503424 non-null category
city                           1503424 non-null category
parent_category_name           1503424 non-null category
category_name                  1503424 non-null category
param_1                        1441848 non-null category
param_2                        848882 non-null category
param_3                        640859 non-null category
title                          1503424 non-null category
description                    1387148 non-null category
price                          1418062 non-null float64
item_seq_number                1503424 non-null int64
user_type                      1503424 non-null category
image                          1390836 non-null category
image_top_1                    1390836

In [12]:
df_raw.param_1.cat.categories

Index(['ASUS', 'Acer', 'Alcatel', 'BQ', 'BlackBerry', 'DEXP', 'Explay', 'Fly',
       'HTC', 'Highscreen',
       ...
       'Шпиц', 'Экзотическая', 'Экскаваторы', 'Электронные книги',
       'Эрдельтерьер', 'Этикетки, бутылки, пробки', 'Ювелирные изделия',
       'Ягдтерьер', 'Японский бобтейл', 'Японский хин'],
      dtype='object', length=371)

### Step 3: Fill Null Values

In [33]:
#Get Null Value Statistics
#df_raw.info()
#(df_raw.isnull().sum().sort_index())/len(df_raw)

In [None]:
# Pandas encodes missing values in categorical columns with the code -1.
# We can add 1 to the category codes so that 0 represents missing values.

In [41]:
# Fix missing values: fill missing entries in numeric columns with the column median and add a separate boolean <col>_na column indicating which rows were missing

In [13]:
# Separate the target ('deal_probability') from the features.
# df      -> processed feature frame (the target column is no longer present,
#            and *_na indicator columns are added; see df.columns below)
# y       -> target values, later split into y_train / y_valid
# na_dict -> NOTE(review): presumably records the fill values used for
#            missing data — confirm against the proc_df documentation
df,y,na_dict =proc_df(df_raw,'deal_probability')

In [14]:
df.columns  # proc_df added two indicator columns (price_na, image_top_1_na) for the numeric columns that had missing values

Index(['item_id', 'user_id', 'region', 'city', 'parent_category_name',
       'category_name', 'param_1', 'param_2', 'param_3', 'title',
       'description', 'price', 'item_seq_number', 'user_type', 'image',
       'image_top_1', 'activation_Year', 'activation_Month', 'activation_Week',
       'activation_Day', 'activation_Dayofweek', 'activation_Dayofyear',
       'activation_Is_month_end', 'activation_Is_month_start',
       'activation_Is_quarter_end', 'activation_Is_quarter_start',
       'activation_Is_year_end', 'activation_Is_year_start',
       'activation_Elapsed', 'price_na', 'image_top_1_na'],
      dtype='object')

## Fitting the model

### Step 4: Split the Data into Training and Validation Sets

In [15]:
#Split the data into train and validation

In [16]:
def split_vals(a, n):
    """Split `a` at position `n` and return independent copies of both parts.

    Returns a tuple `(head, tail)`: `head` is a copy of `a[:n]` and `tail` is
    a copy of `a[n:]`. `a` must support slicing, and its slices must provide
    a `.copy()` method (e.g. DataFrames, NumPy arrays, lists).
    """
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail

In [18]:
# Hold out the last 20% of rows (by position) as the validation set.
# NOTE(review): the earlier comment claimed this matches Kaggle's test set
# size — confirm; the size used here is simply 0.2 * len(df).
n_valid = int(0.2*len(df))
n_trn = len(df) - n_valid
# Split the raw frame, the processed features and the target at the same row
# position so that all three stay aligned.
raw_train, raw_valid = split_vals(df_raw,n_trn)
X_train, X_valid = split_vals(df,n_trn)
y_train, y_valid = split_vals(y ,n_trn)

# Sanity check: feature and target row counts must match within each split.
X_train.shape, y_train.shape, X_valid.shape,y_valid.shape

((1202740, 31), (1202740,), (300684, 31), (300684,))

### Step 5: Train the Model

In [19]:
def rmse(x, y):
    """Return the root-mean-squared error between `x` and `y`.

    Both arguments must support element-wise subtraction and squaring, and
    the squared difference must expose `.mean()` (e.g. NumPy arrays or
    pandas Series).
    """
    squared_diff = (x - y) ** 2
    return math.sqrt(squared_diff.mean())

In [20]:
def prettyprint_score(m):
    """Print error and score metrics for the fitted model `m`.

    Reads the notebook-level X_train/y_train and X_valid/y_valid splits.
    Reports the RMSE of the predictions and `m.score(...)` on both splits,
    plus the out-of-bag score when the model exposes `oob_score_`.
    Note: for scikit-learn regressors `score` is the R^2 coefficient, so the
    '*_acc' entries are R^2 values rather than classification accuracies.
    """
    res = {}
    res['train_error'] = rmse(m.predict(X_train), y_train)
    res['valid_error'] = rmse(m.predict(X_valid), y_valid)
    res['train_acc'] = m.score(X_train, y_train)
    res['valid_acc'] = m.score(X_valid, y_valid)
    # Only present when the forest was built with oob_score=True.
    if hasattr(m, 'oob_score_'):
        res['oob_score'] = m.oob_score_
    print(res)

In [21]:
# Baseline random forest with default hyper-parameters; n_jobs=-1 uses all
# available CPU cores.
# NOTE(review): no random_state is set, so the fitted model and the scores
# below will differ between runs — consider fixing a seed.
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train,y_train)

CPU times: user 8min 52s, sys: 516 ms, total: 8min 53s
Wall time: 3min 6s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

### Step 6: Evaluate the Model (RMSE and R²)

In [22]:
prettyprint_score(m)

{'train_error': 0.10139743538441366, 'valid_error': 0.23963856093369665, 'train_acc': 0.8478775525692439, 'valid_acc': 0.15371757497114424}
