In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 

In [2]:
DATA_FOLDER = './competitive-data-science-predict-future-sales/'

transactions    = pd.read_csv(os.path.join(DATA_FOLDER, 'sales_train.csv'), parse_dates=['date'])
items           = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
item_categories = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
shops           = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))
test = pd.read_csv(DATA_FOLDER + 'test.csv')
sample_sub = pd.read_csv(DATA_FOLDER + 'sample_submission.csv')

In [46]:
transactions.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-02-01,0,59,22154,999.0,1.0
1,2013-03-01,0,25,2552,899.0,1.0
2,2013-05-01,0,25,2552,899.0,-1.0
3,2013-06-01,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


In [47]:
items.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [48]:
item_categories.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [49]:
shops.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


The test dataset is for November 2015!

In [50]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [51]:
sample_sub.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


## Cleaning data

In [52]:
# nan
transactions.isna().sum()

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

In [53]:
items.isna().sum()

item_name           0
item_id             0
item_category_id    0
dtype: int64

In [54]:
item_categories.isna().sum()

item_category_name    0
item_category_id      0
dtype: int64

In [55]:
# duplicates
transactions.duplicated().value_counts()

False    2935843
True           6
dtype: int64

In [56]:
transactions.drop_duplicates(inplace=True)

In [57]:
items.duplicated().value_counts()

False    22170
dtype: int64

In [58]:
item_categories.duplicated().value_counts()

False    84
dtype: int64

## Joining forms

In [59]:
data = pd.merge(transactions, items, how='left', on='item_id')
data = pd.merge(data, item_categories, how='left', on='item_category_id')
data = pd.merge(data, shops, how='left', on='shop_id')

In [60]:
data.isnull().sum()

date                  0
date_block_num        0
shop_id               0
item_id               0
item_price            0
item_cnt_day          0
item_name             0
item_category_id      0
item_category_name    0
shop_name             0
dtype: int64

In [61]:
# add month, year
data = data.assign(month=data.date.map(lambda x: x.month), year=data.date.map(lambda x: x.year))

In [62]:
# save processed data
data.to_hdf(DATA_FOLDER + 'train_data.hf', 'df')

## Load data

In [2]:
DATA_FOLDER = './competitive-data-science-predict-future-sales/'
data = pd.read_hdf(DATA_FOLDER + 'train_data.hf', 'df')
items           = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
item_categories = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
shops           = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))
test = pd.read_csv(DATA_FOLDER + 'test.csv')
sample_sub = pd.read_csv(DATA_FOLDER + 'sample_submission.csv')

In [4]:
data.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,item_category_name,shop_name,month,year
0,2013-02-01,0,59,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир""",2,2013
1,2013-03-01,0,25,2552,899.0,1.0,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил,"Москва ТРК ""Атриум""",3,2013
2,2013-05-01,0,25,2552,899.0,-1.0,DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил,"Москва ТРК ""Атриум""",5,2013
3,2013-06-01,0,25,2554,1709.05,1.0,DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил,"Москва ТРК ""Атриум""",6,2013
4,2013-01-15,0,25,2555,1099.0,1.0,DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства,"Москва ТРК ""Атриум""",1,2013


## aggregating

In [3]:
columns = ['date_block_num', 'shop_id', 'item_id', 'item_name', 'item_category_id', 'item_category_name',
           'shop_name', 'month', 'year']
data_monthly = data.groupby(columns).agg(order_cnt=('item_cnt_day', 'count'), item_cnt=('item_cnt_day', 'sum'), item_mean_price=('item_price', 'mean')).reset_index()
data_monthly.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_name,item_category_id,item_category_name,shop_name,month,year,order_cnt,item_cnt,item_mean_price
0,0,0,32,1+1,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран",1,2013,3,4.0,221.0
1,0,0,32,1+1,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран",3,2013,1,2.0,221.0
2,0,0,33,1+1 (BD),37,Кино - Blu-Ray,"!Якутск Орджоникидзе, 56 фран",1,2013,2,2.0,347.0
3,0,0,33,1+1 (BD),37,Кино - Blu-Ray,"!Якутск Орджоникидзе, 56 фран",3,2013,1,1.0,347.0
4,0,0,35,10 ЛЕТ СПУСТЯ,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран",1,2013,1,1.0,247.0


## Benchmarks

### Lagged values

In [5]:
bench = pd.merge(test, data_monthly.loc[(data_monthly.month==10) & (data_monthly.year==2015)][['shop_id', 'item_id', 'item_cnt']], how='left', on=['shop_id', 'item_id'])
bench = bench.rename({'item_cnt': 'item_cnt_month'}, axis=1)[['ID', 'item_cnt_month']]
bench = bench.fillna(0)
def cap(x):
    if x < 0:
        return 0
    if x > 20:
        return 20
    return x
bench = bench.assign(item_cnt_month=bench.item_cnt_month.map(cap))
bench.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0


In [66]:
bench.to_csv('bench_lag_sub.csv', index=False)

In [67]:
!kaggle competitions submit -c competitive-data-science-predict-future-sales -f bench_lag_sub.csv -m "benchmark submission test"

100%|██████████████████████████████████████| 2.14M/2.14M [00:01<00:00, 1.18MB/s]
Successfully submitted to Predict Future Sales

In [69]:
!kaggle competitions submissions competitive-data-science-predict-future-sales

fileName            date                 description                 status    publicScore  privateScore  
------------------  -------------------  --------------------------  --------  -----------  ------------  
bench_lag_sub.csv   2020-07-10 22:51:22  benchmark submission test   complete  1.14178      None          
bench2_lag_sub.csv  2020-07-10 22:27:16  benchmark2 submission test  complete  1.14178      None          
bench_lag_sub.csv   2020-07-10 21:47:56  benchmark submission test   complete  1.14178      None          
bench_lag_sub.csv   2020-07-10 21:46:14  benchmark submission test   error     None         None          


Strange, tips says the score would be 1.16777.
Maybe the result is without duplication removal?

In [35]:
DATA_FOLDER = './competitive-data-science-predict-future-sales/'

transactions    = pd.read_csv(os.path.join(DATA_FOLDER, 'sales_train.csv'), parse_dates=['date'])
items           = pd.read_csv(os.path.join(DATA_FOLDER, 'items.csv'))
item_categories = pd.read_csv(os.path.join(DATA_FOLDER, 'item_categories.csv'))
shops           = pd.read_csv(os.path.join(DATA_FOLDER, 'shops.csv'))
test = pd.read_csv(DATA_FOLDER + 'test.csv')
sample_sub = pd.read_csv(DATA_FOLDER + 'sample_submission.csv')

data2 = pd.merge(transactions, items, how='left', on='item_id')
data2 = pd.merge(data2, item_categories, how='left', on='item_category_id')
data2 = pd.merge(data2, shops, how='left', on='shop_id')

data2 = data2.assign(month=data2.date.map(lambda x: x.month), year=data2.date.map(lambda x: x.year))

columns = ['shop_id', 'item_id', 'item_name', 'item_category_id', 'item_category_name',
           'shop_name', 'month', 'year']
data_monthly2 = data2.groupby(columns).agg(order_cnt=('item_cnt_day', 'count'), item_cnt=('item_cnt_day', 'sum'), item_mean_price=('item_price', 'mean')).reset_index()

bench2 = pd.merge(test, data_monthly2.loc[(data_monthly.month==10) & (data_monthly2.year==2015)][['shop_id', 'item_id', 'item_cnt']], how='left', on=['shop_id', 'item_id'])
bench2 = bench2.rename({'item_cnt': 'item_cnt_month'}, axis=1)[['ID', 'item_cnt_month']]
bench2 = bench2.fillna(0)
def cap(x):
    if x < 0:
        return 0
    if x > 20:
        return 20
    return x
bench2 = bench2.assign(item_cnt_month=bench2.item_cnt_month.map(cap))

bench2.to_csv('bench2_lag_sub.csv', index=False)

In [36]:
!kaggle competitions submit -c competitive-data-science-predict-future-sales -f bench2_lag_sub.csv -m "benchmark2 submission test"

100%|██████████████████████████████████████| 2.14M/2.14M [00:01<00:00, 1.35MB/s]
Successfully submitted to Predict Future Sales

In [70]:
!kaggle competitions submissions competitive-data-science-predict-future-sales

fileName            date                 description                 status    publicScore  privateScore  
------------------  -------------------  --------------------------  --------  -----------  ------------  
bench_lag_sub.csv   2020-07-10 22:51:22  benchmark submission test   complete  1.14178      None          
bench2_lag_sub.csv  2020-07-10 22:27:16  benchmark2 submission test  complete  1.14178      None          
bench_lag_sub.csv   2020-07-10 21:47:56  benchmark submission test   complete  1.14178      None          
bench_lag_sub.csv   2020-07-10 21:46:14  benchmark submission test   error     None         None          


Strange!

## Basic feature engineering

### shop and categories

In [19]:
data_monthly.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_name,item_category_id,item_category_name,shop_name,month,year,order_cnt,item_cnt,item_mean_price
0,0,0,32,1+1,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран",1,2013,3,4.0,221.0
1,0,0,32,1+1,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран",3,2013,1,2.0,221.0
2,0,0,33,1+1 (BD),37,Кино - Blu-Ray,"!Якутск Орджоникидзе, 56 фран",1,2013,2,2.0,347.0
3,0,0,33,1+1 (BD),37,Кино - Blu-Ray,"!Якутск Орджоникидзе, 56 фран",3,2013,1,1.0,347.0
4,0,0,35,10 ЛЕТ СПУСТЯ,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран",1,2013,1,1.0,247.0


In [4]:
data_monthly = data_monthly.assign(city=data_monthly.shop_name.str.split(' ').map(lambda x: x[0]))
data_monthly.loc[data_monthly.city=='!Якутск', 'city'] = 'Якутск'
data_monthly.city.unique()

array(['Якутск', 'Адыгея', 'Балашиха', 'Волжский', 'Воронеж', 'Жуковский',
       'Интернет-магазин', 'Казань', 'Калуга', 'Коломна', 'Красноярск',
       'Курск', 'Москва', 'Н.Новгород', 'Новосибирск', 'Омск',
       'РостовНаДону', 'СПб', 'Самара', 'Сергиев', 'Сургут', 'Тюмень',
       'Уфа', 'Химки', 'Чехов', 'Ярославль', 'Вологда', 'Цифровой',
       'Выездная', 'Томск', 'Мытищи'], dtype=object)

In [5]:
category_split = data_monthly.item_category_name.str.split('-')
data_monthly = data_monthly.assign(category=category_split.map(lambda x: x[0].strip()), 
                                   subcategory=category_split.map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip()))

In [8]:
data_monthly.category.unique()

array(['Кино', 'Музыка', 'Книги', 'Программы', 'Игры PC', 'Подарки',
       'Игры', 'Аксессуары', 'Игровые консоли', 'Карты оплаты',
       'Чистые носители (шпиль)', 'Чистые носители (штучные)',
       'Элементы питания', 'Служебные', 'Доставка товара', 'PC',
       'Карты оплаты (Кино, Музыка, Игры)', 'Билеты (Цифра)', 'Игры MAC',
       'Игры Android'], dtype=object)

In [9]:
data_monthly.subcategory.unique()

array(['DVD', 'Blu', 'MP3', 'Аудиокниги', 'Аудиокниги 1С',
       '1С:Предприятие 8', 'Методические материалы 1С', 'Обучающие',
       'Стандартные издания', 'Развитие', 'CD локального производства',
       'Музыкальное видео', 'Для дома и офиса', 'PS3', 'XBOX 360',
       'PSVita', 'Дополнительные издания', 'PSP', 'Коллекционные издания',
       'Гаджеты, роботы, спорт', 'CD фирменного производства', 'PSN',
       'Аксессуары для игр', 'Сувениры', 'Live!', 'Коллекционное',
       'Мягкие игрушки', 'Сувениры (в навеску)', 'Подарочные издания',
       'PS4', 'Чистые носители (шпиль)', 'Чистые носители (штучные)',
       'Настольные игры (компактные)', 'Настольные игры', 'Фигурки',
       'Элементы питания', 'Служебные', 'Винил', 'Открытки, наклейки',
       'Атрибутика', 'Бизнес литература', 'Доставка товара',
       'Сумки, Альбомы, Коврики д/мыши', 'PS2', 'Сертификаты, услуги',
       'Гарнитуры/Наушники', 'Компьютерная литература', 'Путеводители',
       'Аудиокниги (Цифра)', 'Для до

### Lagged feature

In [6]:
data_lag = data_monthly[['date_block_num','shop_id','item_id', 'item_cnt']]
data_lag['date_block_num'] += 1
data_lag = data_lag.rename({'item_cnt': 'item_cnt_lag'}, axis=1)
data_lag.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_lag
0,1,0,32,4.0
1,1,0,32,2.0
2,1,0,33,2.0
3,1,0,33,1.0
4,1,0,35,1.0


In [7]:
data_monthly = pd.merge(data_monthly, data_lag, on=['date_block_num','shop_id','item_id'], how='left')
data_monthly.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_name,item_category_id,item_category_name,shop_name,month,year,order_cnt,item_cnt,item_mean_price,city,category,subcategory,item_cnt_lag
0,0,0,32,1+1,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран",1,2013,3,4.0,221.0,Якутск,Кино,DVD,
1,0,0,32,1+1,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран",3,2013,1,2.0,221.0,Якутск,Кино,DVD,
2,0,0,33,1+1 (BD),37,Кино - Blu-Ray,"!Якутск Орджоникидзе, 56 фран",1,2013,2,2.0,347.0,Якутск,Кино,Blu,
3,0,0,33,1+1 (BD),37,Кино - Blu-Ray,"!Якутск Орджоникидзе, 56 фран",3,2013,1,1.0,347.0,Якутск,Кино,Blu,
4,0,0,35,10 ЛЕТ СПУСТЯ,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран",1,2013,1,1.0,247.0,Якутск,Кино,DVD,


In [10]:
data_monthly_lagged = data_monthly.loc[data_monthly.date_block_num > 0]
data_monthly_lagged.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_name,item_category_id,item_category_name,shop_name,month,year,order_cnt,item_cnt,item_mean_price,city,category,subcategory,item_cnt_lag
93463,1,0,30,007: КООРДИНАТЫ «СКАЙФОЛЛ»,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран",2,2013,9,31.0,265.0,Якутск,Кино,DVD,
93464,1,0,31,007: КООРДИНАТЫ «СКАЙФОЛЛ» (BD),37,Кино - Blu-Ray,"!Якутск Орджоникидзе, 56 фран",2,2013,7,11.0,434.0,Якутск,Кино,Blu,
93465,1,0,32,1+1,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран",2,2013,4,6.0,221.0,Якутск,Кино,DVD,4.0
93466,1,0,32,1+1,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран",2,2013,4,6.0,221.0,Якутск,Кино,DVD,2.0
93467,1,0,32,1+1,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран",5,2013,1,1.0,221.0,Якутск,Кино,DVD,4.0


In [13]:
data_monthly_lagged.isna().sum()

date_block_num              0
shop_id                     0
item_id                     0
item_name                   0
item_category_id            0
item_category_name          0
shop_name                   0
month                       0
year                        0
order_cnt                   0
item_cnt                    0
item_mean_price             0
city                        0
category                    0
subcategory                 0
item_cnt_lag          1016880
dtype: int64

In [11]:
data_monthly_lagged = data_monthly_lagged.fillna(0)

In [12]:
data_monthly_lagged.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_name,item_category_id,item_category_name,shop_name,month,year,order_cnt,item_cnt,item_mean_price,city,category,subcategory,item_cnt_lag
93463,1,0,30,007: КООРДИНАТЫ «СКАЙФОЛЛ»,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран",2,2013,9,31.0,265.0,Якутск,Кино,DVD,0.0
93464,1,0,31,007: КООРДИНАТЫ «СКАЙФОЛЛ» (BD),37,Кино - Blu-Ray,"!Якутск Орджоникидзе, 56 фран",2,2013,7,11.0,434.0,Якутск,Кино,Blu,0.0
93465,1,0,32,1+1,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран",2,2013,4,6.0,221.0,Якутск,Кино,DVD,4.0
93466,1,0,32,1+1,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран",2,2013,4,6.0,221.0,Якутск,Кино,DVD,2.0
93467,1,0,32,1+1,40,Кино - DVD,"!Якутск Орджоникидзе, 56 фран",5,2013,1,1.0,221.0,Якутск,Кино,DVD,4.0


### target encoding
Here using expanding mean

In [29]:
# train/val split
features = ['item_cnt_lag', 'month', 'year', 'item_mean_price', 'city', 'category', 'subcategory', 'shop_id', 'item_id', 'item_cnt']
train_X = data_monthly_lagged.loc[data_monthly_lagged.date_block_num < 33][features[:-1]]
train_y = data_monthly_lagged.loc[data_monthly_lagged.date_block_num < 33][features[-1]].values
val_X = data_monthly_lagged.loc[data_monthly_lagged.date_block_num == 33][features[:-1]]
val_y = data_monthly_lagged.loc[data_monthly_lagged.date_block_num == 33][features[-1]].values

In [30]:
train_X.head()

Unnamed: 0,item_cnt_lag,month,year,item_mean_price,city,category,subcategory,shop_id,item_id
93463,0.0,2,2013,265.0,Якутск,Кино,DVD,0,30
93464,0.0,2,2013,434.0,Якутск,Кино,Blu,0,31
93465,4.0,2,2013,221.0,Якутск,Кино,DVD,0,32
93466,2.0,2,2013,221.0,Якутск,Кино,DVD,0,32
93467,4.0,5,2013,221.0,Якутск,Кино,DVD,0,32


In [31]:
train_mean = train_y.mean()

In [32]:
# encode on train_X and map to val_X
columns_encode = ['city', 'category', 'subcategory', 'shop_id', 'item_id']
for col in columns_encode:
    encoded = data_monthly_lagged.loc[data_monthly_lagged.date_block_num < 33][[col, 'item_cnt']].groupby(col).agg(**{col+'_encoded': ('item_cnt', 'mean')}).reset_index()
    val_X = pd.merge(val_X, encoded, on=col, how='left')
val_X.head()

Unnamed: 0,item_cnt_lag,month,year,item_mean_price,city,category,subcategory,shop_id,item_id,city_encoded,category_encoded,subcategory_encoded,shop_id_encoded,item_id_encoded
0,0.0,10,2015,399.0,Адыгея,Кино,Blu,2,31,1.874067,1.456171,1.259678,1.874067,1.852427
1,1.0,4,2015,300.0,Адыгея,Программы,1С:Предприятие 8,2,486,1.874067,2.204295,2.022894,1.874067,2.567901
2,1.0,10,2015,300.0,Адыгея,Программы,1С:Предприятие 8,2,486,1.874067,2.204295,2.022894,1.874067,2.567901
3,0.0,10,2015,420.0,Адыгея,Книги,Методические материалы 1С,2,787,1.874067,1.641077,1.774115,1.874067,2.039216
4,0.0,4,2015,3300.0,Адыгея,Программы,1С:Предприятие 8,2,794,1.874067,2.204295,2.022894,1.874067,1.294118


In [33]:
val_X = val_X.fillna(train_mean)

In [34]:
train_mean

2.3503225446150804

In [35]:
# regularize train_X
columns_encode = ['city', 'category', 'subcategory', 'shop_id', 'item_id']
train_data = data_monthly_lagged.loc[data_monthly_lagged.date_block_num < 33]
for col in columns_encode:
    cumsum = train_data.groupby(col)['item_cnt'].cumsum() - train_data['item_cnt']
    cumcnt = train_data.groupby(col).cumcount()
    train_X[col+'_encoded']= cumsum/cumcnt
train_X = train_X.fillna(train_mean)
train_X.head()

Unnamed: 0,item_cnt_lag,month,year,item_mean_price,city,category,subcategory,shop_id,item_id,city_encoded,category_encoded,subcategory_encoded,shop_id_encoded,item_id_encoded
93463,0.0,2,2013,265.0,Якутск,Кино,DVD,0,30,2.350323,2.350323,2.350323,2.350323,2.350323
93464,0.0,2,2013,434.0,Якутск,Кино,Blu,0,31,31.0,31.0,2.350323,31.0,2.350323
93465,4.0,2,2013,221.0,Якутск,Кино,DVD,0,32,21.0,21.0,31.0,21.0,2.350323
93466,2.0,2,2013,221.0,Якутск,Кино,DVD,0,32,16.0,16.0,18.5,16.0,6.0
93467,4.0,5,2013,221.0,Якутск,Кино,DVD,0,32,13.5,13.5,14.333333,13.5,6.0


In [36]:
%store train_X train_y val_X val_y data_monthly_lagged test shops items item_categories

Stored 'train_X' (DataFrame)
Stored 'train_y' (ndarray)
Stored 'val_X' (DataFrame)
Stored 'val_y' (ndarray)
Stored 'data_monthly_lagged' (DataFrame)
Stored 'test' (DataFrame)
Stored 'shops' (DataFrame)
Stored 'items' (DataFrame)
Stored 'item_categories' (DataFrame)


### xgboost

In [None]:
% store -r

In [28]:
import xgboost as xgb

In [37]:
dtrain = xgb.DMatrix(train_X[['city_encoded', 'category_encoded', 'subcategory_encoded', 'shop_id_encoded', 'item_id_encoded', 'item_cnt_lag', 'month', 'year', 'item_mean_price']].values, label=train_y)
dval = xgb.DMatrix(val_X[['city_encoded', 'category_encoded', 'subcategory_encoded', 'shop_id_encoded', 'item_id_encoded', 'item_cnt_lag', 'month', 'year', 'item_mean_price']].values, label=val_y)

In [38]:
param = {'max_depth': 10, 'eta': 0.1, 'nthread': 4, 'seed': 2020, 'objective': 'reg:squarederror', 'early_stopping_rounds': 100}
evallist = [(dval, 'eval'), (dtrain, 'train')]

In [39]:
num_round = 20
bst = xgb.train(param, dtrain, num_round, evallist)

[0]	eval-rmse:28.2952	train-rmse:11.1264
[1]	eval-rmse:28.4272	train-rmse:10.6662
[2]	eval-rmse:28.3579	train-rmse:10.2668
[3]	eval-rmse:28.5626	train-rmse:9.9233
[4]	eval-rmse:28.7501	train-rmse:9.61927
[5]	eval-rmse:28.7235	train-rmse:9.36243
[6]	eval-rmse:28.7238	train-rmse:9.1206
[7]	eval-rmse:28.7092	train-rmse:8.90513
[8]	eval-rmse:28.7062	train-rmse:8.7223
[9]	eval-rmse:28.719	train-rmse:8.55874
[10]	eval-rmse:28.7302	train-rmse:8.37068
[11]	eval-rmse:28.7614	train-rmse:8.19898
[12]	eval-rmse:28.7728	train-rmse:8.03428
[13]	eval-rmse:28.7895	train-rmse:7.93008
[14]	eval-rmse:28.7879	train-rmse:7.81357
[15]	eval-rmse:28.8108	train-rmse:7.66384
[16]	eval-rmse:28.9605	train-rmse:7.53143
[17]	eval-rmse:28.9595	train-rmse:7.42112
[18]	eval-rmse:28.9593	train-rmse:7.35026
[19]	eval-rmse:28.9475	train-rmse:7.23194


In [40]:
np.sqrt(((train_X['item_cnt_lag'].values-train_y)**2).mean())

15.530080948938716

In [41]:
np.sqrt(((val_X['item_cnt_lag'].values-val_y)**2).mean())

29.005139565736513

#### Outliers!!!
We need to deal with outliers to reduce rmse