# Predict Future Sales in Russia

- https://www.kaggle.com/c/competitive-data-science-predict-future-sales/

### Data files
- item_categories.csv
- items.csv
- sales_train.csv
- sample_submission.csv
- shops.csv
- test.csv

In [123]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [124]:
# definir parâmetros extras
pd.set_option('precision', 4)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [125]:
# importar pacotes usados na seleção do modelo e na medição da precisão
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# importar os pacotes necessários para os algoritmos de regressão
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lars
from sklearn.linear_model import LassoLars
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import TheilSenRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.svm import NuSVR
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

from xgboost import XGBRegressor

In [126]:
input_prefix = 'data/'

translate = False # original em russo
#translate = True # traduzir para inglês

trfile = '-translated' if translate else ''
trcol = '_translated' if translate else ''

## Shops

In [127]:
df_shops = pd.read_csv(input_prefix + 'shops' + trfile + '.csv', index_col='shop_id')
print('shape:', df_shops.shape)
df_shops.head()

shape: (60, 1)


Unnamed: 0_level_0,shop_name
shop_id,Unnamed: 1_level_1
0,"!Якутск Орджоникидзе, 56 фран"
1,"!Якутск ТЦ ""Центральный"" фран"
2,"Адыгея ТЦ ""Мега"""
3,"Балашиха ТРК ""Октябрь-Киномир"""
4,"Волжский ТЦ ""Волга Молл"""


In [128]:
df_shops.describe()

Unnamed: 0,shop_name
count,60
unique,60
top,"РостовНаДону ТРК ""Мегацентр Горизонт"" Островной"
freq,1


In [129]:
df_shops.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60 entries, 0 to 59
Data columns (total 1 columns):
shop_name    60 non-null object
dtypes: object(1)
memory usage: 720.0+ bytes


### Extract city name from the shop name

- 'Москва ТРК "Атриум"' => 'Москва'
- 'Н.Новгород ТРЦ "Фантастика" => 'Н.Новгород'

In [130]:
df_shops['city_name'] = df_shops['shop_name' + trcol].apply(lambda s: s.split()[0])

df_shops.head()

Unnamed: 0_level_0,shop_name,city_name
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"!Якутск Орджоникидзе, 56 фран",!Якутск
1,"!Якутск ТЦ ""Центральный"" фран",!Якутск
2,"Адыгея ТЦ ""Мега""",Адыгея
3,"Балашиха ТРК ""Октябрь-Киномир""",Балашиха
4,"Волжский ТЦ ""Волга Молл""",Волжский


In [131]:
df_shops.groupby('city_name').count().head(10)

Unnamed: 0_level_0,shop_name
city_name,Unnamed: 1_level_1
!Якутск,2
Адыгея,1
Балашиха,1
Волжский,1
Вологда,1
Воронеж,3
Выездная,1
Жуковский,2
Интернет-магазин,1
Казань,2


In [132]:
df_shops['city_code'] = (
    df_shops['city_name'].astype('category').cat.codes + 1
).astype('category')

df_shops.head()

Unnamed: 0_level_0,shop_name,city_name,city_code
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"!Якутск Орджоникидзе, 56 фран",!Якутск,1
1,"!Якутск ТЦ ""Центральный"" фран",!Якутск,1
2,"Адыгея ТЦ ""Мега""",Адыгея,2
3,"Балашиха ТРК ""Октябрь-Киномир""",Балашиха,3
4,"Волжский ТЦ ""Волга Молл""",Волжский,4


In [133]:
df_shops.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60 entries, 0 to 59
Data columns (total 3 columns):
shop_name    60 non-null object
city_name    60 non-null object
city_code    60 non-null category
dtypes: category(1), object(2)
memory usage: 2.2+ KB


In [134]:
df_shops.describe()

Unnamed: 0,shop_name,city_name,city_code
count,60,60,60
unique,60,32,32
top,"РостовНаДону ТРК ""Мегацентр Горизонт"" Островной",Москва,15
freq,1,13,13


## Item categories

In [135]:
df_categories = pd.read_csv(input_prefix + 'item_categories' + trfile + '.csv', index_col='item_category_id')
print('shape:', df_categories.shape)
df_categories.head()

shape: (84, 1)


Unnamed: 0_level_0,item_category_name
item_category_id,Unnamed: 1_level_1
0,PC - Гарнитуры/Наушники
1,Аксессуары - PS2
2,Аксессуары - PS3
3,Аксессуары - PS4
4,Аксессуары - PSP


In [136]:
df_categories.describe()

Unnamed: 0,item_category_name
count,84
unique,84
top,Книги - Аудиокниги (Цифра)
freq,1


### Extract group and subgroup names from item category name

- 'Игровые консоли - PS4' => 'Игровые консоли'
- 'Карты оплаты - Windows (Цифра)' => 'Карты оплаты'
- 'Книги - Комиксы, манга' => 'Книги'

In [137]:
df_categories['group_name'] = \
    df_categories['item_category_name' + trcol].apply(
        lambda s: s.split(' - ')[0].split(' (')[0].upper())

df_categories.head()

Unnamed: 0_level_0,item_category_name,group_name
item_category_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,PC - Гарнитуры/Наушники,PC
1,Аксессуары - PS2,АКСЕССУАРЫ
2,Аксессуары - PS3,АКСЕССУАРЫ
3,Аксессуары - PS4,АКСЕССУАРЫ
4,Аксессуары - PSP,АКСЕССУАРЫ


In [138]:
df_categories.groupby('group_name').count().head(10)

Unnamed: 0_level_0,item_category_name
group_name,Unnamed: 1_level_1
PC,1
АКСЕССУАРЫ,7
БИЛЕТЫ,1
ДОСТАВКА ТОВАРА,1
ИГРОВЫЕ КОНСОЛИ,8
ИГРЫ,8
ИГРЫ ANDROID,1
ИГРЫ MAC,1
ИГРЫ PC,4
КАРТЫ ОПЛАТЫ,5


In [139]:
def extract_subgroup(s):
    gs = s.split(' - ')
    if len(gs) > 1:
        gs2 = gs[1].split(' (')
        return gs2[0].upper()
    else:
        return ''
    #return gs[1] if len(gs) > 1 else ''
    
df_categories['subgroup_name'] = \
    df_categories['item_category_name' + trcol].apply(
        lambda s: extract_subgroup(s))

df_categories.head()

Unnamed: 0_level_0,item_category_name,group_name,subgroup_name
item_category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,PC - Гарнитуры/Наушники,PC,ГАРНИТУРЫ/НАУШНИКИ
1,Аксессуары - PS2,АКСЕССУАРЫ,PS2
2,Аксессуары - PS3,АКСЕССУАРЫ,PS3
3,Аксессуары - PS4,АКСЕССУАРЫ,PS4
4,Аксессуары - PSP,АКСЕССУАРЫ,PSP


In [140]:
df_categories.groupby('subgroup_name').count().head(10)

Unnamed: 0_level_0,item_category_name,group_name
subgroup_name,Unnamed: 1_level_1,Unnamed: 2_level_1
,7,7
1С:ПРЕДПРИЯТИЕ 8,1,1
BLU-RAY,1,1
BLU-RAY 3D,1,1
BLU-RAY 4K,1,1
CD ЛОКАЛЬНОГО ПРОИЗВОДСТВА,1,1
CD ФИРМЕННОГО ПРОИЗВОДСТВА,1,1
DVD,1,1
LIVE!,2,2
MAC,1,1


In [141]:
df_categories['group_code'] = (
    df_categories['group_name'].astype('category').cat.codes + 1).astype('category')

df_categories['subgroup_code'] = (
    df_categories['subgroup_name'].astype('category').cat.codes + 1).astype('category')

df_categories.head()

Unnamed: 0_level_0,item_category_name,group_name,subgroup_name,group_code,subgroup_code
item_category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,PC - Гарнитуры/Наушники,PC,ГАРНИТУРЫ/НАУШНИКИ,1,30
1,Аксессуары - PS2,АКСЕССУАРЫ,PS2,2,12
2,Аксессуары - PS3,АКСЕССУАРЫ,PS3,2,13
3,Аксессуары - PS4,АКСЕССУАРЫ,PS4,2,14
4,Аксессуары - PSP,АКСЕССУАРЫ,PSP,2,16


In [142]:
df_categories.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84 entries, 0 to 83
Data columns (total 5 columns):
item_category_name    84 non-null object
group_name            84 non-null object
subgroup_name         84 non-null object
group_code            84 non-null category
subgroup_code         84 non-null category
dtypes: category(2), object(3)
memory usage: 4.9+ KB


In [143]:
df_categories.describe()

Unnamed: 0,item_category_name,group_name,subgroup_name,group_code,subgroup_code
count,84,84,84.0,84,84
unique,84,18,55.0,18,55
top,Книги - Аудиокниги (Цифра),КНИГИ,,12,1
freq,1,13,7.0,13,7


## Items

In [144]:
df_items = pd.read_csv(input_prefix + 'items' + trfile + '.csv', index_col='item_id')
print('shape:', df_items.shape)
df_items.head()

shape: (22170, 2)


Unnamed: 0_level_0,item_name,item_category_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40
1,!ABBYY FineReader 12 Professional Edition Full...,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40
4,***КОРОБКА (СТЕКЛО) D,40


In [145]:
#if not translate:
#    df_items['item_category_id'] = df_items['item_category_id'].astype('category')

In [146]:
df_items.describe()

Unnamed: 0,item_category_id
count,22170.0
mean,46.2908
std,15.9415
min,0.0
25%,37.0
50%,40.0
75%,58.0
max,83.0


In [147]:
import re

def extract_main_subject(str):
    s = str.upper()
    # remover caracteres do começo => !"*/
    s = re.sub("^[!*/\"]+ ?", "", s)
    # remover termo "1C:" do começo do nome
    s = re.sub("^1C.", "", s)
    # remover termo "THE" do começo do nome
    s = re.sub("^THE ", "", s)
    # obter primeira palavra em maiúsculo
    s = s.split()[0]
    # substituir caracteres => '`’
    s = re.sub("['`’]", "_", s)
    # remover caracteres do fim da palavra => :.®,!
    s = re.sub("[:.,!®]$", "", s)
    return s
    
df_items['subject_name'] = \
    df_items['item_name' + trcol].apply(
        lambda s: extract_main_subject(s))

df_items.head()

Unnamed: 0_level_0,item_name,item_category_id,subject_name
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,ВО
1,!ABBYY FineReader 12 Professional Edition Full...,76,ABBYY
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40,В
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40,ГОЛУБАЯ
4,***КОРОБКА (СТЕКЛО) D,40,КОРОБКА


In [148]:
df_items.tail()

Unnamed: 0_level_0,item_name,item_category_id,subject_name
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
22165,"Ядерный титбит 2 [PC, Цифровая версия]",31,ЯДЕРНЫЙ
22166,Язык запросов 1С:Предприятия [Цифровая версия],54,ЯЗЫК
22167,Язык запросов 1С:Предприятия 8 (+CD). Хрустале...,49,ЯЗЫК
22168,Яйцо для Little Inu,62,ЯЙЦО
22169,Яйцо дракона (Игра престолов),69,ЯЙЦО


In [149]:
df_items.groupby('subject_name').count().head(10)

Unnamed: 0_level_0,item_name,item_category_id
subject_name,Unnamed: 1_level_1,Unnamed: 2_level_1
007,5,5
1+1,2,2
10,6,6
100,20,20
1000,2,2
10000,1,1
101,8,8
11,2,2
11-11-11,1,1
12,8,8


In [150]:
df_items['subject_code'] = (
    df_items['subject_name'].astype('category').cat.codes + 1).astype('category')

df_items.head()

Unnamed: 0_level_0,item_name,item_category_id,subject_name,subject_code
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,ВО,1803
1,!ABBYY FineReader 12 Professional Edition Full...,76,ABBYY,111
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40,В,1614
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40,ГОЛУБАЯ,1958
4,***КОРОБКА (СТЕКЛО) D,40,КОРОБКА,2630


In [151]:
df_items.tail()

Unnamed: 0_level_0,item_name,item_category_id,subject_name,subject_code
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
22165,"Ядерный титбит 2 [PC, Цифровая версия]",31,ЯДЕРНЫЙ,4646
22166,Язык запросов 1С:Предприятия [Цифровая версия],54,ЯЗЫК,4648
22167,Язык запросов 1С:Предприятия 8 (+CD). Хрустале...,49,ЯЗЫК,4648
22168,Яйцо для Little Inu,62,ЯЙЦО,4649
22169,Яйцо дракона (Игра престолов),69,ЯЙЦО,4649


In [152]:
df_items.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22170 entries, 0 to 22169
Data columns (total 4 columns):
item_name           22170 non-null object
item_category_id    22170 non-null int64
subject_name        22170 non-null object
subject_code        22170 non-null category
dtypes: category(1), int64(1), object(2)
memory usage: 727.3+ KB


In [153]:
df_items.describe()

Unnamed: 0,item_category_id
count,22170.0
mean,46.2908
std,15.9415
min,0.0
25%,37.0
50%,40.0
75%,58.0
max,83.0


In [154]:
# join items + categories:item_category_id => group_code, subgroup_code
df_items2 = pd.merge(df_items, df_categories, how='left', on='item_category_id')
df_items2.head()

Unnamed: 0,item_name,item_category_id,subject_name,subject_code,item_category_name,group_name,subgroup_name,group_code,subgroup_code
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,ВО,1803,Кино - DVD,КИНО,DVD,11,8
1,!ABBYY FineReader 12 Professional Edition Full...,76,ABBYY,111,Программы - Для дома и офиса (Цифра),ПРОГРАММЫ,ДЛЯ ДОМА И ОФИСА,15,31
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40,В,1614,Кино - DVD,КИНО,DVD,11,8
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40,ГОЛУБАЯ,1958,Кино - DVD,КИНО,DVD,11,8
4,***КОРОБКА (СТЕКЛО) D,40,КОРОБКА,2630,Кино - DVD,КИНО,DVD,11,8


In [155]:
del(df_items)
del(df_categories)

In [156]:
df_items2['item_category_id'] = df_items2['item_category_id'].astype('category')

In [157]:
df_items2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22170 entries, 0 to 22169
Data columns (total 9 columns):
item_name             22170 non-null object
item_category_id      22170 non-null category
subject_name          22170 non-null object
subject_code          22170 non-null category
item_category_name    22170 non-null object
group_name            22170 non-null object
subgroup_name         22170 non-null object
group_code            22170 non-null category
subgroup_code         22170 non-null category
dtypes: category(4), object(5)
memory usage: 884.6+ KB


In [158]:
df_items2.describe()

Unnamed: 0,item_name,item_category_id,subject_name,subject_code,item_category_name,group_name,subgroup_name,group_code,subgroup_code
count,22170,22170,22170,22170,22170,22170,22170,22170,22170
unique,22170,84,4655,4655,84,18,55,18,55
top,CHRISTOPHE GOZE A Day In Ibiza 2 (digipack),40,ФИГУРКА,4348,Кино - DVD,КИНО,DVD,11,8
freq,1,5035,599,599,5035,7464,5035,7464,5035


## Sales (training)

In [159]:
dateparse = lambda x: pd.datetime.strptime(x, '%d.%m.%Y')
df_train = pd.read_csv(input_prefix + 'sales_train.csv', nrows=100,
                       parse_dates=['date'], date_parser=dateparse)
print('shape:', df_train.shape)
df_train.head()

shape: (100, 6)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.0,1.0
1,2013-01-03,0,25,2552,899.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0
3,2013-01-06,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


In [39]:
#for col in ('shop_id', 'item_id'):
#    df_train[col] = df_train[col].astype('category')

In [160]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
date              100 non-null datetime64[ns]
date_block_num    100 non-null int64
shop_id           100 non-null int64
item_id           100 non-null int64
item_price        100 non-null float64
item_cnt_day      100 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(3)
memory usage: 4.7 KB


In [161]:
df_train.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,100.0,100.0,100.0,100.0,100.0
mean,0.0,25.34,2912.82,856.575,1.12
std,0.0,3.4,1947.1627,624.8042,0.4981
min,0.0,25.0,2473.0,58.0,-1.0
25%,0.0,25.0,2588.25,399.0,1.0
50%,0.0,25.0,2746.0,599.0,1.0
75%,0.0,25.0,2833.0,999.0,1.0
max,0.0,59.0,22154.0,2699.0,4.0


In [165]:
# obter última data de compra para cada par (loja, produto)
df_maxdates = (df_train.groupby(['shop_id', 'item_id'], sort=False)['date'].max()).to_frame()
df_maxdates.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date
shop_id,item_id,Unnamed: 2_level_1
59,22154,2013-01-02
25,2552,2013-01-05
25,2554,2013-01-06
25,2555,2013-01-15
25,2564,2013-01-10


In [166]:
df_maxdates.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 42 entries, (59, 22154) to (25, 2808)
Data columns (total 1 columns):
date    42 non-null datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 868.0 bytes


In [168]:
#TODO: obter último valor de venda de cada par (loja, produto)
#df_train['last_sale'] = df_maxdates
#df_train['last_price'] = ??
df_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.0,1.0
1,2013-01-03,0,25,2552,899.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0
3,2013-01-06,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


In [175]:
pd.__version__

'0.23.4'

In [183]:
# calcular quantidade total de produtos vendidos para cada loja por mês
df_sumitems = (df_train.groupby(['date_block_num', 'shop_id', 'item_id'])['item_cnt_day'].sum()).to_frame()
df_sumitems.columns = ['item_cnt_month']
df_sumitems.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_cnt_month
date_block_num,shop_id,item_id,Unnamed: 3_level_1
0,25,2473,1.0
0,25,2480,1.0
0,25,2515,1.0
0,25,2522,1.0
0,25,2546,1.0


In [42]:
#TODO: converter 'date' para tipo data - OK

#TODO: mesclar com tabelas de lojas, itens e categorias

#TODO: verificar o que significam contagens negativas - OK
# It means the item is returned. You should predict that too.
# Or in the other words: it is not missing data or mistake.

In [43]:
# join items + categories:item_category_id => group_code, subgroup_code
df_train2 = pd.merge(df_train, df_shops, how='left', on='shop_id')
df_train2.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,shop_name,city_name,city_code
0,2013-01-02,0,59,22154,999.0,1.0,"Ярославль ТЦ ""Альтаир""",Ярославль,32
1,2013-01-03,0,25,2552,899.0,1.0,"Москва ТРК ""Атриум""",Москва,15
2,2013-01-05,0,25,2552,899.0,-1.0,"Москва ТРК ""Атриум""",Москва,15
3,2013-01-06,0,25,2554,1709.05,1.0,"Москва ТРК ""Атриум""",Москва,15
4,2013-01-15,0,25,2555,1099.0,1.0,"Москва ТРК ""Атриум""",Москва,15


In [44]:
del(df_train)

In [45]:
df_train2['shop_id'] = df_train2['shop_id'].astype('category')

In [46]:
df_train2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 9 columns):
date              100 non-null datetime64[ns]
date_block_num    100 non-null int64
shop_id           100 non-null category
item_id           100 non-null int64
item_price        100 non-null float64
item_cnt_day      100 non-null float64
shop_name         100 non-null object
city_name         100 non-null object
city_code         100 non-null category
dtypes: category(2), datetime64[ns](1), float64(2), int64(2), object(2)
memory usage: 7.0+ KB


In [47]:
df_train2.describe()

Unnamed: 0,date_block_num,item_id,item_price,item_cnt_day
count,100.0,100.0,100.0,100.0
mean,0.0,2912.82,856.575,1.12
std,0.0,1947.1627,624.8042,0.4981
min,0.0,2473.0,58.0,-1.0
25%,0.0,2588.25,399.0,1.0
50%,0.0,2746.0,599.0,1.0
75%,0.0,2833.0,999.0,1.0
max,0.0,22154.0,2699.0,4.0


In [48]:
# join sales + items:item_id => subject_code, (group_code, subgroup_code)
#df_train3 = pd.merge(df_train2, df_items2, how='left', right_on='item_id', left_index=True)
#df_train3.head()

In [49]:
#FIXME: remover isso depois!
df_train3 = df_train2

In [50]:
# criar colunas contendo dia da semana a partir de 'date'
df_train3['year'] = df_train3['date'].dt.year
df_train3['month'] = df_train3['date'].dt.month
df_train3['day'] = df_train3['date'].dt.day
df_train3['dow'] = df_train3['date'].dt.dayofweek
df_train3['woy'] = df_train3['date'].dt.weekofyear

In [51]:
#TODO: baixar calendário de feriados na Rússia:
#      https://www.google.com/search?q=holidays+calendar+in+russia+format%3Acsv
#TODO: criar colunas indicando feriado
#TODO: criar colunas indicando véspera ou pós-feriado

In [52]:
df_train3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 14 columns):
date              100 non-null datetime64[ns]
date_block_num    100 non-null int64
shop_id           100 non-null category
item_id           100 non-null int64
item_price        100 non-null float64
item_cnt_day      100 non-null float64
shop_name         100 non-null object
city_name         100 non-null object
city_code         100 non-null category
year              100 non-null int64
month             100 non-null int64
day               100 non-null int64
dow               100 non-null int64
woy               100 non-null int64
dtypes: category(2), datetime64[ns](1), float64(2), int64(7), object(2)
memory usage: 10.9+ KB


In [53]:
df_train3.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,shop_name,city_name,city_code,year,month,day,dow,woy
0,2013-01-02,0,59,22154,999.0,1.0,"Ярославль ТЦ ""Альтаир""",Ярославль,32,2013,1,2,2,1
1,2013-01-03,0,25,2552,899.0,1.0,"Москва ТРК ""Атриум""",Москва,15,2013,1,3,3,1
2,2013-01-05,0,25,2552,899.0,-1.0,"Москва ТРК ""Атриум""",Москва,15,2013,1,5,5,1
3,2013-01-06,0,25,2554,1709.05,1.0,"Москва ТРК ""Атриум""",Москва,15,2013,1,6,6,1
4,2013-01-15,0,25,2555,1099.0,1.0,"Москва ТРК ""Атриум""",Москва,15,2013,1,15,1,3


In [54]:
df_train3.drop(['date', 'shop_name', 'city_name'], axis=1, inplace=True)
df_train3.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,city_code,year,month,day,dow,woy
0,0,59,22154,999.0,1.0,32,2013,1,2,2,1
1,0,25,2552,899.0,1.0,15,2013,1,3,3,1
2,0,25,2552,899.0,-1.0,15,2013,1,5,5,1
3,0,25,2554,1709.05,1.0,15,2013,1,6,6,1
4,0,25,2555,1099.0,1.0,15,2013,1,15,1,3


In [55]:
df_train3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 11 columns):
date_block_num    100 non-null int64
shop_id           100 non-null category
item_id           100 non-null int64
item_price        100 non-null float64
item_cnt_day      100 non-null float64
city_code         100 non-null category
year              100 non-null int64
month             100 non-null int64
day               100 non-null int64
dow               100 non-null int64
woy               100 non-null int64
dtypes: category(2), float64(2), int64(7)
memory usage: 9.3 KB


In [56]:
#TODO: treino: todos os meses, exceto o último / teste: último mês
#TODO: avaliar diversos algoritmos de regressão e escolher o de melhor escore (métrica: MSE)

In [57]:
#TODO: usando o melhor algoritmo avaliado, treinar novamente usando todos os meses disponíveis
#TODO: montar dataframe contendo lojas e itens presentes na base de testes (test.csv)
#TODO: usar todas as datas do mês seguinte (01/11/2015 a 30/11/2015)
#TODO: incrementar dataframe fazendo JOINs com tabelas de lojas, itens e categorias
#TODO: incrementar dataframe produzindo campos extras de datas (year, month, day, dow, woy)

In [58]:
#TODO: submeter o dataframe de entrada ao modelo treinado a fim de obter previsões
#TODO: calcular somas das quantidades de produtos agrupadas por loja e produto
#TODO: fazer JOIN da base de testes com esse dataframe final, produzindo o arquivo de submissão final

In [59]:
data = df_train3

In [60]:
# realizar normalização nos dados numéricos contínuos
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
#FIXME: normalizar os valores da coluna preço
#data['item_price'] = scaler.fit_transform(data['item_price'])

#data.head()

In [61]:
# definir dados de entrada
X = data.drop(['item_cnt_day'], axis=1) # tudo, exceto a coluna alvo
y = data['item_cnt_day'] # apenas a coluna alvo

print('Forma dos dados originais:', X.shape, y.shape)

Forma dos dados originais: (100, 10) (100,)


## Treinamento dos modelos preditivos

In [62]:
MSE = 'neg_mean_squared_error'

models = {}

In [63]:
from datetime import datetime

# avalia o desempenho do modelo, retornando o valor do MSE
def evaluate_model_cv(name, model, X=X, y=y):
    start = datetime.now()
    kfold = KFold(n_splits=10, random_state=42)
    results = cross_val_score(model, X, y, cv=kfold, scoring=MSE, verbose=1)
    end = datetime.now()
    elapsed = int((end - start).total_seconds() * 1000)
    score = (-1) * results.mean()
    stddev = results.std()
    models[name] = {'model': model, 'score': score, 'stddev': stddev, 'elapsed': elapsed}
    print(model, '\nScore: %.2f (+/- %.2f) [%5s ms]' % (score, stddev, elapsed))
    return score, stddev, elapsed

In [64]:
# faz o ajuste fino do modelo, calculando os melhores hiperparâmetros
def fine_tune_model(model, params, X=X, y=y):
  print('\nFine Tuning Model:')
  print(model, "\nparams:", params)
  kfold = KFold(n_splits=10, random_state=42)
  grid = GridSearchCV(estimator=model, param_grid=params, scoring=MSE, cv=kfold, verbose=1)
  grid.fit(X, y)
  print('\nGrid Best Score: %.2f' % (grid.best_score_ * (-1)))
  print('Best Params:', grid.best_params_)
  return grid

### Avaliação e ajuste fino de cada modelo preditivo

- https://scikit-learn.org/stable/modules/classes.html

In [65]:
model = LogisticRegression(n_jobs=-1, random_state=42, multi_class='auto', C=1000, solver='sag')
evaluate_model_cv('LR', model)

params = dict(
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    C=np.logspace(-3, 3, 7)
)
#fine_tune_model(model, params)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=42,
                   solver='sag', tol=0.0001, verbose=0, warm_start=False) 
Score: 0.27 (+/- 0.43) [17223 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   17.2s finished


In [66]:
model = KNeighborsRegressor(n_jobs=-1, n_neighbors=13, weights='distance')
evaluate_model_cv('KNN', model)

#n_neighbors=5, weights=’uniform’, algorithm=’auto’, leaf_size=30, p=2, metric=’minkowski’,
#metric_params=None, n_jobs=None

params = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 13],
    'weights': ['uniform', 'distance']
}
#fine_tune_model(model, params)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=-1, n_neighbors=13, p=2,
                    weights='distance') 
Score: 0.29 (+/- 0.45) [ 1116 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.1s finished


In [67]:
model = RandomForestRegressor(random_state=42, n_jobs=-1, n_estimators=10, max_depth=5)
evaluate_model_cv('RF', model)

#n_estimators=’warn’, criterion=’mse’, max_depth=None, min_samples_split=2, min_samples_leaf=1, 
#min_weight_fraction_leaf=0.0, max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0, 
#min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, 
#verbose=0, warm_start=False

params = {
    'n_estimators': [5, 10, 25, 50, 75, 100],
    'max_depth': [None, 3, 5, 7, 9, 11, 13]
}
#fine_tune_model(model, params)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                      oob_score=False, random_state=42, verbose=0,
                      warm_start=False) 
Score: 0.35 (+/- 0.37) [ 3651 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.6s finished


## Comparação final entre os algoritmos

In [68]:
results = []
names = []
scores = []
stddevs = []
times = []

best_model = None
highest_score = None

for name, model in models.items():
    score, stddev, elapsed = model['score'], model['stddev'], model['elapsed']
    results.append((score, stddev))
    names.append(name)
    scores.append(score)
    stddevs.append(stddev)
    times.append(elapsed)
    
    if not highest_score or score < highest_score:
        best_model = model['model']
        highest_score = score

In [69]:
results_df = pd.DataFrame({'Model Name': names, 'Score': scores, 'Std Dev': stddevs, 'Time (ms)': times})
results_df.sort_values(by=['Score', 'Time (ms)'], ascending=[True, False])

Unnamed: 0,Model Name,Score,Std Dev,Time (ms)
0,LR,0.27,0.4337,17223
1,KNN,0.2947,0.4516,1116
2,RF,0.3506,0.3748,3651


In [70]:
print('Best Model:\n\n%s' % (best_model))

Best Model:

LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=42,
                   solver='sag', tol=0.0001, verbose=0, warm_start=False)


In [71]:
model = best_model
model.fit(X, y)



LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=42,
                   solver='sag', tol=0.0001, verbose=0, warm_start=False)

In [72]:
X.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,city_code,year,month,day,dow,woy
0,0,59,22154,999.0,32,2013,1,2,2,1
1,0,25,2552,899.0,15,2013,1,3,3,1
2,0,25,2552,899.0,15,2013,1,5,5,1
3,0,25,2554,1709.05,15,2013,1,6,6,1
4,0,25,2555,1099.0,15,2013,1,15,1,3


In [73]:
del(X)
del(y)

## Sales (testing)

In [90]:
df_test = pd.read_csv(input_prefix + 'test.csv', nrows=10000) #, index_col='ID')
print('shape:', df_test.shape)
df_test.head()

shape: (10000, 3)


Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [91]:
#TODO: verificar o que está sendo esperado (acumulado mensal por loja e produto?)
# item_cnt_day: number of products sold. You are predicting a monthly amount of this measure

In [92]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
ID         10000 non-null int64
shop_id    10000 non-null int64
item_id    10000 non-null int64
dtypes: int64(3)
memory usage: 234.4 KB


In [93]:
df_test.describe()

Unnamed: 0,ID,shop_id,item_id
count,10000.0,10000.0,10000.0
mean,4999.5,4.51,10984.9873
std,2886.8957,0.4999,6256.3121
min,0.0,4.0,30.0
25%,2499.75,4.0,5339.0
50%,4999.5,5.0,11147.0
75%,7499.25,5.0,16068.25
max,9999.0,5.0,22167.0


In [94]:
#TODO: usar todas as datas do mês seguinte (01/11/2015 a 30/11/2015)
df_dates = pd.DataFrame({'date': pd.date_range(start='2015-11-01', end='2015-11-30', freq='D')})
df_dates.head()

Unnamed: 0,date
0,2015-11-01
1,2015-11-02
2,2015-11-03
3,2015-11-04
4,2015-11-05


In [95]:
# gerar dataframe a partir do produto cartesiano
df_test['key'] = 0
df_dates['key'] = 0
df_test2 = df_test.merge(df_dates, how='left', on='key')
for df in [df_test, df_dates, df_test2]:
    df.drop('key', axis=1, inplace=True)
df_test2.head()

Unnamed: 0,ID,shop_id,item_id,date
0,0,5,5037,2015-11-01
1,0,5,5037,2015-11-02
2,0,5,5037,2015-11-03
3,0,5,5037,2015-11-04
4,0,5,5037,2015-11-05


In [96]:
del(df_test)
del(df_dates)

In [97]:
# definir campo fixo
df_test2['date_block_num'] = 34 # Novembro/2015

In [98]:
#TODO: obter último preço disponível de cada item
df_test2['item_price'] = 908.5

In [99]:
# incrementar dataframe fazendo JOINs com tabelas de lojas, itens e categorias
# incluir coluna com código da cidade
df_test3 = pd.merge(df_test2, df_shops[['city_code']], how='left', on='shop_id')
df_test3.head()
del(df_test2)

In [100]:
# incrementar dataframe produzindo campos extras de datas (year, month, day, dow, woy)
df_test3['year'] = df_test3['date'].dt.year
df_test3['month'] = df_test3['date'].dt.month
df_test3['day'] = df_test3['date'].dt.day
df_test3['dow'] = df_test3['date'].dt.dayofweek
df_test3['woy'] = df_test3['date'].dt.weekofyear

In [101]:
df_test3.drop(['date'], axis=1, inplace=True)
df_test3.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,item_price,city_code,year,month,day,dow,woy
0,0,5,5037,34,908.5,5,2015,11,1,6,44
1,0,5,5037,34,908.5,5,2015,11,2,0,45
2,0,5,5037,34,908.5,5,2015,11,3,1,45
3,0,5,5037,34,908.5,5,2015,11,4,2,45
4,0,5,5037,34,908.5,5,2015,11,5,3,45


In [102]:
X_pred = df_test3.drop(['ID'], axis=1)
y_pred = best_model.predict(X_pred)

In [103]:
X_pred.info()
X_pred.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 0 to 299999
Data columns (total 10 columns):
shop_id           300000 non-null int64
item_id           300000 non-null int64
date_block_num    300000 non-null int64
item_price        300000 non-null float64
city_code         300000 non-null category
year              300000 non-null int64
month             300000 non-null int64
day               300000 non-null int64
dow               300000 non-null int64
woy               300000 non-null int64
dtypes: category(1), float64(1), int64(8)
memory usage: 23.2 MB


Unnamed: 0,shop_id,item_id,date_block_num,item_price,city_code,year,month,day,dow,woy
0,5,5037,34,908.5,5,2015,11,1,6,44
1,5,5037,34,908.5,5,2015,11,2,0,45
2,5,5037,34,908.5,5,2015,11,3,1,45
3,5,5037,34,908.5,5,2015,11,4,2,45
4,5,5037,34,908.5,5,2015,11,5,3,45


In [104]:
y_pred[:20]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1.])

In [118]:
df_pred = pd.DataFrame({'ID': df_test3['ID'], 'item_cnt_day': y_pred})
#df_pred = df_test3.drop(['shop_id', 'item_id', 'date_block_num', 'item_price', 'city_code',
#       'year', 'month', 'day', 'dow', 'woy'], axis=1)
#df_pred['item_cnt_day'] = y_pred
df_pred.head()

Unnamed: 0,ID,item_cnt_day
0,0,1.0
1,0,1.0
2,0,1.0
3,0,1.0
4,0,1.0


In [119]:
df_final = df_pred.groupby(['ID']).sum()
df_final.columns = ['item_cnt_month']
df_final.head()

Unnamed: 0_level_0,item_cnt_month
ID,Unnamed: 1_level_1
0,30.0
1,30.0
2,30.0
3,30.0
4,30.0


In [120]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 1 columns):
item_cnt_month    10000 non-null float64
dtypes: float64(1)
memory usage: 156.2 KB


## Sample submission

In [121]:
df_sample = pd.read_csv(input_prefix + 'sample_submission.csv', index_col='ID')
print('shape:', df_sample.shape)
df_sample.head()

shape: (214200, 1)


Unnamed: 0_level_0,item_cnt_month
ID,Unnamed: 1_level_1
0,0.5
1,0.5
2,0.5
3,0.5
4,0.5


In [122]:
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214200 entries, 0 to 214199
Data columns (total 1 columns):
item_cnt_month    214200 non-null float64
dtypes: float64(1)
memory usage: 3.3 MB


In [None]:
!mkdir submissions

In [None]:
df_sample['item_cnt_month'] = 0.3
df_sample.to_csv('submissions/submission-0.3.csv')