In [2]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [1]:
from fastai.structured import *
from fastai.column_data import *
from IPython.display import HTML
np.set_printoptions(threshold=50, edgeitems=20)

PATH='../../csv/hwkc/'

In [3]:
import glob

In [4]:
glob.glob(f'{PATH}/*')

['../../csv/hwkc/models',
 '../../csv/hwkc/test.csv.gz',
 '../../csv/hwkc/tmp',
 '../../csv/hwkc/df_model',
 '../../csv/hwkc/sample_submission.csv.gz',
 '../../csv/hwkc/items.csv',
 '../../csv/hwkc/item_categories.csv',
 '../../csv/hwkc/shops.csv',
 '../../csv/hwkc/sales_train.csv.gz']

## Importing data

In [None]:
df_st = pd.read_csv(f'{PATH}/sales_train.csv.gz')
df_st['date'] = pd.to_datetime(df_st.date,dayfirst=True)

In [6]:
print(df_st.shape)
df_st.head()

(2935849, 6)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.0,1.0
1,2013-01-03,0,25,2552,899.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0
3,2013-01-06,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


In [7]:
df_i = pd.read_csv(f'{PATH}/items.csv')
df_i.head()
df_st = df_st.merge(df_i[['item_category_id']],left_on='item_id',right_index=True,copy=False)

In [8]:
from datetime import date
df = df_st.copy()

In [9]:
var_cols = ['date','shop_id','item_id','item_price']
df = df[var_cols]
add_datepart(df,'date',drop=False)


In [10]:
df.head()

Unnamed: 0,date,shop_id,item_id,item_price,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,2013-01-02,59,22154,999.0,2013,1,1,2,2,2,False,False,False,False,False,False,1357084800
3270,2013-01-23,24,22154,999.0,2013,1,4,23,2,23,False,False,False,False,False,False,1358899200
17081,2013-01-20,27,22154,999.0,2013,1,3,20,6,20,False,False,False,False,False,False,1358640000
25918,2013-01-02,25,22154,999.0,2013,1,1,2,2,2,False,False,False,False,False,False,1357084800
25919,2013-01-03,25,22154,999.0,2013,1,1,3,3,3,False,False,False,False,False,False,1357171200


## Adding Russian Official ho

In [11]:
rus_hol = [
    {'dia' : 23,'mes' : 2},
    {'dia' : 8,'mes' : 3},
    {'dia' : 22,'mes' : 8},
    {'dia' : 1,'mes' : 3},
    {'dia' : 9,'mes' : 5},
    {'dia' : 12,'mes' : 6},
    {'dia' : 4,'mes' : 11}
]

dt_hols = np.array([np.datetime64(date(y,h['mes'],h['dia'])) for y in list(df.Year.unique()) + [2012,2016] for h in rus_hol])
df['is_hollyday'] = df.date.isin(dt_hols)


In [12]:
dt_dif = pd.DataFrame( [ {'dt' : d, 
       'last_hol_before' : (d - max([h for h in dt_hols if h <= d])).astype('timedelta64[D]')/ np.timedelta64(1, 'D'), 
       'first_hol_after' : (min([h for h in dt_hols if h > d]) - d).astype('timedelta64[D]')/ np.timedelta64(1, 'D') 
      } for d in df.date.unique() ])
df = df.merge(dt_dif,left_on='date',right_on='dt',copy=False)
df.head()

Unnamed: 0,date,shop_id,item_id,item_price,Year,Month,Week,Day,Dayofweek,Dayofyear,...,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed,is_hollyday,dt,first_hol_after,last_hol_before
0,2013-01-02,59,22154,999.0,2013,1,1,2,2,2,...,False,False,False,False,False,1357084800,False,2013-01-02,52.0,59.0
1,2013-01-02,25,22154,999.0,2013,1,1,2,2,2,...,False,False,False,False,False,1357084800,False,2013-01-02,52.0,59.0
2,2013-01-02,54,22154,999.0,2013,1,1,2,2,2,...,False,False,False,False,False,1357084800,False,2013-01-02,52.0,59.0
3,2013-01-02,46,22154,999.0,2013,1,1,2,2,2,...,False,False,False,False,False,1357084800,False,2013-01-02,52.0,59.0
4,2013-01-02,25,2565,549.0,2013,1,1,2,2,2,...,False,False,False,False,False,1357084800,False,2013-01-02,52.0,59.0


In [13]:
df.set_index('date',inplace=True)
cat_vars = ['shop_id','item_id','Year','Month',
               'Week','Day','Dayofweek','Dayofyear',
               'Is_month_end','Is_month_start','Is_quarter_end','Is_quarter_start',
               'Is_year_end','Is_year_start','is_hollyday']
contin_vars = ['item_price','first_hol_after','last_hol_before']

In [14]:
for v in cat_vars: df[v] = df[v].astype('category').cat.as_ordered()
for v in contin_vars: df[v] = df[v].astype('float32')
df = df[cat_vars + contin_vars]

In [15]:
df.reset_index().to_feather(f'{PATH}df_model')

In [16]:
val_idx = np.flatnonzero(
    (df.index<=datetime.datetime(2015,10,31)) & (df.index>=datetime.datetime(2015,10,1)))
y = df_st.item_cnt_day.clip(lower=0,upper=20).values
yl = np.log(y)



## DL

In [17]:
def inv_y(a): return np.exp(a)

def exp_rmspe(y_pred, targ):
    targ = inv_y(targ)
    pct_var = (targ - inv_y(y_pred))/targ
    return math.sqrt((pct_var**2).mean())

max_log_y = np.max(yl)
y_range = (0, max_log_y*1.2)

In [18]:
md = ColumnarModelData.from_data_frame(PATH, val_idx, df, yl.astype(np.float32), 
                                   cat_flds=cat_vars, bs=128)

In [19]:
cat_sz = [(c, len(df[c].cat.categories)+1) for c in cat_vars]

In [20]:
cat_sz

[('shop_id', 61),
 ('item_id', 21808),
 ('Year', 4),
 ('Month', 13),
 ('Week', 53),
 ('Day', 32),
 ('Dayofweek', 8),
 ('Dayofyear', 366),
 ('Is_month_end', 3),
 ('Is_month_start', 3),
 ('Is_quarter_end', 3),
 ('Is_quarter_start', 3),
 ('Is_year_end', 3),
 ('Is_year_start', 3),
 ('is_hollyday', 3)]

In [21]:
emb_szs = [(c, min(50, (c+1)//2)) for _,c in cat_sz]

In [22]:
emb_szs

[(61, 31),
 (21808, 50),
 (4, 2),
 (13, 7),
 (53, 27),
 (32, 16),
 (8, 4),
 (366, 50),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2)]

In [23]:
display(df.columns)
display(cat_vars)



Index(['shop_id', 'item_id', 'Year', 'Month', 'Week', 'Day', 'Dayofweek',
       'Dayofyear', 'Is_month_end', 'Is_month_start', 'Is_quarter_end',
       'Is_quarter_start', 'Is_year_end', 'Is_year_start', 'is_hollyday',
       'item_price', 'first_hol_after', 'last_hol_before'],
      dtype='object')

['shop_id',
 'item_id',
 'Year',
 'Month',
 'Week',
 'Day',
 'Dayofweek',
 'Dayofyear',
 'Is_month_end',
 'Is_month_start',
 'Is_quarter_end',
 'Is_quarter_start',
 'Is_year_end',
 'Is_year_start',
 'is_hollyday']

In [24]:
m = md.get_learner(emb_szs, len(df.columns)-len(cat_vars),
                   0.04, 1, [1000,500], [0.001,0.01])
lr = 1e-3

  for o in self.lins: kaiming_normal(o.weight.data)
  kaiming_normal(self.outp.weight.data)


In [25]:
m.lr_find()

  0%|          | 0/22519 [00:00<?, ?it/s]


RuntimeError: index out of range at /opt/conda/conda-bld/pytorch_1524584710464/work/aten/src/TH/generic/THTensorMath.c:343



In [None]:
m.sched.plot(100)

In [None]:
m = md.get_learner(emb_szs, len(df.columns)-len(cat_vars),
                   0.04, 1, [1000,500], [0.001,0.01], y_range=y_range)
lr = 1e-3

In [None]:
m.fit(lr, 3, metrics=[exp_rmspe])