# Predict Future Sales in Russia

- https://www.kaggle.com/c/competitive-data-science-predict-future-sales/

### Data files
- item_categories.csv
- items.csv
- sales_train.csv
- sample_submission.csv
- shops.csv
- test.csv

In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# definir parâmetros extras
pd.set_option('precision', 4)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [3]:
# importar pacotes usados na seleção do modelo e na medição da precisão
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# importar os pacotes necessários para os algoritmos de regressão
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lars
from sklearn.linear_model import LassoLars
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import TheilSenRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.svm import NuSVR
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

from xgboost import XGBRegressor

In [4]:
input_prefix = 'data/'
#input_prefix = 'https://github.com/hjort/ai-labs/raw/master/jupyter/future-sales/data/'

translate = False # original em russo
#translate = True # traduzir para inglês

trfile = '-translated' if translate else ''
trcol = '_translated' if translate else ''

bzfile = '.bz2' # ''

## Shops

In [5]:
df_shops = pd.read_csv(
    input_prefix + 'shops' + trfile + '.csv' + bzfile,
    index_col='shop_id')
print('shape:', df_shops.shape)
df_shops.head()

shape: (60, 1)


Unnamed: 0_level_0,shop_name
shop_id,Unnamed: 1_level_1
0,"!Якутск Орджоникидзе, 56 фран"
1,"!Якутск ТЦ ""Центральный"" фран"
2,"Адыгея ТЦ ""Мега"""
3,"Балашиха ТРК ""Октябрь-Киномир"""
4,"Волжский ТЦ ""Волга Молл"""


In [6]:
df_shops.describe()

Unnamed: 0,shop_name
count,60
unique,60
top,"Казань ТЦ ""Бехетле"""
freq,1


In [7]:
df_shops.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60 entries, 0 to 59
Data columns (total 1 columns):
shop_name    60 non-null object
dtypes: object(1)
memory usage: 960.0+ bytes


### Extract city name from the shop name

- 'Москва ТРК "Атриум"' => 'Москва'
- 'Н.Новгород ТРЦ "Фантастика" => 'Н.Новгород'

In [8]:
df_shops['city_name'] = df_shops['shop_name' + trcol].apply(lambda s: s.split()[0])

df_shops.head()

Unnamed: 0_level_0,shop_name,city_name
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"!Якутск Орджоникидзе, 56 фран",!Якутск
1,"!Якутск ТЦ ""Центральный"" фран",!Якутск
2,"Адыгея ТЦ ""Мега""",Адыгея
3,"Балашиха ТРК ""Октябрь-Киномир""",Балашиха
4,"Волжский ТЦ ""Волга Молл""",Волжский


In [9]:
df_shops.groupby('city_name').count().\
    sort_values(by='shop_name', ascending=False).head()

Unnamed: 0_level_0,shop_name
city_name,Unnamed: 1_level_1
Москва,13
Тюмень,3
Воронеж,3
РостовНаДону,3
!Якутск,2


In [10]:
df_shops['city_code'] = (
    df_shops['city_name'].astype('category').cat.codes + 1
).astype('category')

df_shops.head()

Unnamed: 0_level_0,shop_name,city_name,city_code
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"!Якутск Орджоникидзе, 56 фран",!Якутск,1
1,"!Якутск ТЦ ""Центральный"" фран",!Якутск,1
2,"Адыгея ТЦ ""Мега""",Адыгея,2
3,"Балашиха ТРК ""Октябрь-Киномир""",Балашиха,3
4,"Волжский ТЦ ""Волга Молл""",Волжский,4


In [11]:
df_shops.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60 entries, 0 to 59
Data columns (total 3 columns):
shop_name    60 non-null object
city_name    60 non-null object
city_code    60 non-null category
dtypes: category(1), object(2)
memory usage: 3.0+ KB


In [12]:
df_shops.describe()

Unnamed: 0,shop_name,city_name,city_code
count,60,60,60
unique,60,32,32
top,"Казань ТЦ ""Бехетле""",Москва,15
freq,1,13,13


## Item categories

In [13]:
df_categories = pd.read_csv(
    input_prefix + 'item_categories' + trfile + '.csv' + bzfile,
    index_col='item_category_id')
print('shape:', df_categories.shape)
df_categories.head()

shape: (84, 1)


Unnamed: 0_level_0,item_category_name
item_category_id,Unnamed: 1_level_1
0,PC - Гарнитуры/Наушники
1,Аксессуары - PS2
2,Аксессуары - PS3
3,Аксессуары - PS4
4,Аксессуары - PSP


In [14]:
df_categories.describe()

Unnamed: 0,item_category_name
count,84
unique,84
top,Книги - Бизнес литература
freq,1


### Extract group and subgroup names from item category name

- 'Игровые консоли - PS4' => 'Игровые консоли'
- 'Карты оплаты - Windows (Цифра)' => 'Карты оплаты'
- 'Книги - Комиксы, манга' => 'Книги'

In [15]:
df_categories['group_name'] = \
    df_categories['item_category_name' + trcol].apply(
        lambda s: s.split(' - ')[0].split(' (')[0].upper())

df_categories.head()

Unnamed: 0_level_0,item_category_name,group_name
item_category_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,PC - Гарнитуры/Наушники,PC
1,Аксессуары - PS2,АКСЕССУАРЫ
2,Аксессуары - PS3,АКСЕССУАРЫ
3,Аксессуары - PS4,АКСЕССУАРЫ
4,Аксессуары - PSP,АКСЕССУАРЫ


In [16]:
df_categories.groupby('group_name').count().\
    sort_values(by='item_category_name', ascending=False).head()

Unnamed: 0_level_0,item_category_name
group_name,Unnamed: 1_level_1
КНИГИ,13
ПОДАРКИ,12
ИГРОВЫЕ КОНСОЛИ,8
ИГРЫ,8
АКСЕССУАРЫ,7


In [17]:
def extract_subgroup(s):
    gs = s.split(' - ')
    if len(gs) > 1:
        gs2 = gs[1].split(' (')
        return gs2[0].upper()
    else:
        return ''
    #return gs[1] if len(gs) > 1 else ''
    
df_categories['subgroup_name'] = \
    df_categories['item_category_name' + trcol].apply(
        lambda s: extract_subgroup(s))

df_categories.head()

Unnamed: 0_level_0,item_category_name,group_name,subgroup_name
item_category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,PC - Гарнитуры/Наушники,PC,ГАРНИТУРЫ/НАУШНИКИ
1,Аксессуары - PS2,АКСЕССУАРЫ,PS2
2,Аксессуары - PS3,АКСЕССУАРЫ,PS3
3,Аксессуары - PS4,АКСЕССУАРЫ,PS4
4,Аксессуары - PSP,АКСЕССУАРЫ,PSP


In [18]:
df_categories.groupby('subgroup_name').count().head(10)

Unnamed: 0_level_0,item_category_name,group_name
subgroup_name,Unnamed: 1_level_1,Unnamed: 2_level_1
,7,7
1С:ПРЕДПРИЯТИЕ 8,1,1
BLU-RAY,1,1
BLU-RAY 3D,1,1
BLU-RAY 4K,1,1
CD ЛОКАЛЬНОГО ПРОИЗВОДСТВА,1,1
CD ФИРМЕННОГО ПРОИЗВОДСТВА,1,1
DVD,1,1
LIVE!,2,2
MAC,1,1


In [None]:
df_categories['group_code'] = (
    df_categories['group_name'].astype('category').cat.codes + 1).astype('category')

df_categories['subgroup_code'] = (
    df_categories['subgroup_name'].astype('category').cat.codes + 1).astype('category')

df_categories.head()

Unnamed: 0_level_0,item_category_name,group_name,subgroup_name,group_code,subgroup_code
item_category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,PC - Гарнитуры/Наушники,PC,ГАРНИТУРЫ/НАУШНИКИ,1,30
1,Аксессуары - PS2,АКСЕССУАРЫ,PS2,2,12
2,Аксессуары - PS3,АКСЕССУАРЫ,PS3,2,13
3,Аксессуары - PS4,АКСЕССУАРЫ,PS4,2,14
4,Аксессуары - PSP,АКСЕССУАРЫ,PSP,2,16


In [None]:
df_categories.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84 entries, 0 to 83
Data columns (total 5 columns):
item_category_name    84 non-null object
group_name            84 non-null object
subgroup_name         84 non-null object
group_code            84 non-null category
subgroup_code         84 non-null category
dtypes: category(2), object(3)
memory usage: 6.5+ KB


In [None]:
df_categories.describe()

Unnamed: 0,item_category_name,group_name,subgroup_name,group_code,subgroup_code
count,84,84,84.0,84,84
unique,84,18,55.0,18,55
top,Книги - Бизнес литература,КНИГИ,,12,1
freq,1,13,7.0,13,7


## Items

In [None]:
df_items = pd.read_csv(
    input_prefix + 'items' + trfile + '.csv' + bzfile,
    index_col='item_id')
print('shape:', df_items.shape)
df_items.head()

shape: (22170, 2)


Unnamed: 0_level_0,item_name,item_category_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40
1,!ABBYY FineReader 12 Professional Edition Full...,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40
4,***КОРОБКА (СТЕКЛО) D,40


In [None]:
#if not translate:
#    df_items['item_category_id'] = df_items['item_category_id'].astype('category')

In [None]:
df_items.describe()

Unnamed: 0,item_category_id
count,22170.0
mean,46.2908
std,15.9415
min,0.0
25%,37.0
50%,40.0
75%,58.0
max,83.0


In [None]:
import re

def extract_main_subject(str):
    s = str.upper()
    # remover caracteres do começo => !"*/
    s = re.sub("^[!*/\"]+ ?", "", s)
    # remover termo "1C:" do começo do nome
    s = re.sub("^1C.", "", s)
    # remover termo "THE" do começo do nome
    s = re.sub("^THE ", "", s)
    # obter primeira palavra em maiúsculo
    s = s.split()[0]
    # substituir caracteres => '`’
    s = re.sub("['`’]", "_", s)
    # remover caracteres do fim da palavra => :.®,!
    s = re.sub("[:.,!®]$", "", s)
    return s
    
df_items['subject_name'] = \
    df_items['item_name' + trcol].apply(
        lambda s: extract_main_subject(s))

df_items.head()

Unnamed: 0_level_0,item_name,item_category_id,subject_name
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,ВО
1,!ABBYY FineReader 12 Professional Edition Full...,76,ABBYY
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40,В
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40,ГОЛУБАЯ
4,***КОРОБКА (СТЕКЛО) D,40,КОРОБКА


In [None]:
df_items.tail()

Unnamed: 0_level_0,item_name,item_category_id,subject_name
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
22165,"Ядерный титбит 2 [PC, Цифровая версия]",31,ЯДЕРНЫЙ
22166,Язык запросов 1С:Предприятия [Цифровая версия],54,ЯЗЫК
22167,Язык запросов 1С:Предприятия 8 (+CD). Хрустале...,49,ЯЗЫК
22168,Яйцо для Little Inu,62,ЯЙЦО
22169,Яйцо дракона (Игра престолов),69,ЯЙЦО


In [None]:
df_items.groupby('subject_name').count().head(10)

Unnamed: 0_level_0,item_name,item_category_id
subject_name,Unnamed: 1_level_1,Unnamed: 2_level_1
007,5,5
1+1,2,2
10,6,6
100,20,20
1000,2,2
10000,1,1
101,8,8
11,2,2
11-11-11,1,1
12,8,8


In [None]:
df_items['subject_code'] = (
    df_items['subject_name'].astype('category').cat.codes + 1).astype('category')

df_items.head()

Unnamed: 0_level_0,item_name,item_category_id,subject_name,subject_code
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,ВО,1803
1,!ABBYY FineReader 12 Professional Edition Full...,76,ABBYY,111
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40,В,1614
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40,ГОЛУБАЯ,1958
4,***КОРОБКА (СТЕКЛО) D,40,КОРОБКА,2630


In [None]:
df_items.tail()

Unnamed: 0_level_0,item_name,item_category_id,subject_name,subject_code
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
22165,"Ядерный титбит 2 [PC, Цифровая версия]",31,ЯДЕРНЫЙ,4646
22166,Язык запросов 1С:Предприятия [Цифровая версия],54,ЯЗЫК,4648
22167,Язык запросов 1С:Предприятия 8 (+CD). Хрустале...,49,ЯЗЫК,4648
22168,Яйцо для Little Inu,62,ЯЙЦО,4649
22169,Яйцо дракона (Игра престолов),69,ЯЙЦО,4649


In [None]:
df_items.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22170 entries, 0 to 22169
Data columns (total 4 columns):
item_name           22170 non-null object
item_category_id    22170 non-null int64
subject_name        22170 non-null object
subject_code        22170 non-null category
dtypes: category(1), int64(1), object(2)
memory usage: 932.5+ KB


In [None]:
df_items.describe()

Unnamed: 0,item_category_id
count,22170.0
mean,46.2908
std,15.9415
min,0.0
25%,37.0
50%,40.0
75%,58.0
max,83.0


In [None]:
# join items + categories:item_category_id => group_code, subgroup_code
df_items2 = pd.merge(df_items, #.reset_index(),
                     df_categories, how='left', on='item_category_id')
df_items2.index.names = ['item_id']
df_items2.head()

Unnamed: 0_level_0,item_name,item_category_id,subject_name,subject_code,item_category_name,group_name,subgroup_name,group_code,subgroup_code
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,ВО,1803,Кино - DVD,КИНО,DVD,11,8
1,!ABBYY FineReader 12 Professional Edition Full...,76,ABBYY,111,Программы - Для дома и офиса (Цифра),ПРОГРАММЫ,ДЛЯ ДОМА И ОФИСА,15,31
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40,В,1614,Кино - DVD,КИНО,DVD,11,8
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40,ГОЛУБАЯ,1958,Кино - DVD,КИНО,DVD,11,8
4,***КОРОБКА (СТЕКЛО) D,40,КОРОБКА,2630,Кино - DVD,КИНО,DVD,11,8


In [None]:
del(df_items)
del(df_categories)

In [None]:
#df_items2.set_index(['item_id'], inplace=True)
df_items2['item_category_id'] = df_items2['item_category_id'].astype('category')

In [None]:
df_items2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22170 entries, 0 to 22169
Data columns (total 9 columns):
item_name             22170 non-null object
item_category_id      22170 non-null category
subject_name          22170 non-null object
subject_code          22170 non-null category
item_category_name    22170 non-null object
group_name            22170 non-null object
subgroup_name         22170 non-null object
group_code            22170 non-null category
subgroup_code         22170 non-null category
dtypes: category(4), object(5)
memory usage: 1.3+ MB


In [None]:
df_items2.describe()

Unnamed: 0,item_name,item_category_id,subject_name,subject_code,item_category_name,group_name,subgroup_name,group_code,subgroup_code
count,22170,22170,22170,22170,22170,22170,22170,22170,22170
unique,22170,84,4655,4655,84,18,55,18,55
top,ТОМ И ДЖЕРРИ: ЗИМНИЕ СКАЗКИ (регион),40,ФИГУРКА,4348,Кино - DVD,КИНО,DVD,11,8
freq,1,5035,599,599,5035,7464,5035,7464,5035


## Sales (testing)

In [30]:
df_test = pd.read_csv(
    input_prefix + 'test.csv' + bzfile, #nrows=10000,
    index_col='ID')
print('shape:', df_test.shape)
df_test.head()

shape: (214200, 2)


Unnamed: 0_level_0,shop_id,item_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5,5037
1,5,5320
2,5,5233
3,5,5232
4,5,5268


In [31]:
#TODO: verificar o que está sendo esperado (acumulado mensal por loja e produto?)
# item_cnt_day: number of products sold. You are predicting a monthly amount of this measure

In [32]:
for col in ['shop_id', 'item_id']:
    df_test[col] = df_test[col].astype('category')

In [33]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214200 entries, 0 to 214199
Data columns (total 2 columns):
shop_id    214200 non-null category
item_id    214200 non-null category
dtypes: category(2)
memory usage: 2.4 MB


In [34]:
df_test.describe()

Unnamed: 0,shop_id,item_id
count,214200,214200
unique,42,5100
top,59,22167
freq,5100,42


In [35]:
# criar dataframe sem quantidade de itens
df_zeroed = df_test.copy()
df_zeroed['key'] = 0
df_zeroed = pd.merge(df_zeroed,
                     pd.DataFrame({'date_block_num': np.arange(0, 34), 'key': np.zeros(34, dtype=int)}),
                     how='left', on='key').\
                set_index(['shop_id', 'item_id', 'date_block_num']).\
                drop(['key'], axis=1)
df_zeroed.head()

shop_id,item_id,date_block_num
5,5037,0
5,5037,1
5,5037,2
5,5037,3
5,5037,4


In [36]:
df_zeroed.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 7282800 entries, (5, 5037, 0) to (45, 969, 33)
Empty DataFrame

## Sales (training)

In [37]:
dateparse = lambda x: pd.datetime.strptime(x, '%d.%m.%Y')
df_train = pd.read_csv(
    input_prefix + 'sales_train.csv' + bzfile, #nrows=300, # somente primeiras linhas!
    parse_dates=['date'], date_parser=dateparse)
print('shape:', df_train.shape)
df_train.head()

shape: (2935849, 6)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.0,1.0
1,2013-01-03,0,25,2552,899.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0
3,2013-01-06,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


In [38]:
#for col in ('shop_id', 'item_id'):
#    df_train[col] = df_train[col].astype('category')

In [39]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
date              datetime64[ns]
date_block_num    int64
shop_id           int64
item_id           int64
item_price        float64
item_cnt_day      float64
dtypes: datetime64[ns](1), float64(2), int64(3)
memory usage: 134.4 MB


In [40]:
df_train.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935800.0,2935800.0,2935800.0,2935800.0,2935800.0
mean,14.57,33.002,10197.0,890.85,1.2426
std,9.423,16.227,6324.3,1729.8,2.6188
min,0.0,0.0,0.0,-1.0,-22.0
25%,7.0,22.0,4476.0,249.0,1.0
50%,14.0,31.0,9343.0,399.0,1.0
75%,23.0,47.0,15684.0,999.0,1.0
max,33.0,59.0,22169.0,307980.0,2169.0


In [41]:
# calcular quantidade total de vendas para cada produto e loja por mês
df_sumitems = (df_train.groupby(['date_block_num', 'shop_id', 'item_id'])['item_cnt_day'].sum()).to_frame()
df_sumitems.columns = ['item_cnt_month']
df_sumitems.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_cnt_month
date_block_num,shop_id,item_id,Unnamed: 3_level_1
0,0,32,6.0
0,0,33,3.0
0,0,35,1.0
0,0,43,1.0
0,0,51,2.0


In [42]:
del(df_train)

In [48]:
df_zeroed.info()
df_zeroed.index.names

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 7282800 entries, (5, 5037, 0) to (45, 969, 33)
Empty DataFrame

FrozenList(['shop_id', 'item_id', 'date_block_num'])

In [49]:
df_sumitems.info()
df_sumitems.index.names

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1609124 entries, (0, 0, 32) to (33, 59, 22102)
Data columns (total 1 columns):
item_cnt_month    1609124 non-null float64
dtypes: float64(1)
memory usage: 18.6 MB


FrozenList(['date_block_num', 'shop_id', 'item_id'])

In [None]:
df_train2 = pd.merge(df_zeroed, #.reset_index(),
                     df_sumitems[df_zeroed.index], #.reset_index(),
                     how='right', on=['date_block_num', 'shop_id', 'item_id'])
df_train2.head()

In [None]:
df_train2.info()

In [None]:
# join items + categories:item_category_id => group_code, subgroup_code
df_train3 = pd.merge(df_train2,
                     df_shops[['city_code']],
                     how='left', on='shop_id')
df_train3.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code
0,0,0,32,6.0,1
1,0,0,33,3.0,1
2,0,0,35,1.0,1
3,0,0,43,1.0,1
4,0,0,51,2.0,1


In [None]:
del(df_sumitems)
del(df_train2)
#del(df_shops)

In [None]:
df_train3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1609124 entries, 0 to 1609123
Data columns (total 5 columns):
date_block_num    1609124 non-null int64
shop_id           1609124 non-null int64
item_id           1609124 non-null int64
item_cnt_month    1609124 non-null float64
city_code         1609124 non-null category
dtypes: category(1), float64(1), int64(3)
memory usage: 62.9 MB


In [None]:
df_train3.describe(include='category')

Unnamed: 0,city_code
count,1609124
unique,32
top,15
freq,492643


In [None]:
# join sales + items:item_id => subject_code, (group_code, subgroup_code)
df_train4 = pd.merge(df_train3,
                     df_items2.drop(['item_name', 'subject_name', 'item_category_name',
                                     'group_name', 'subgroup_name'], axis=1),
                     how='left', on='item_id')
df_train4.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,item_category_id,subject_code,group_code,subgroup_code
0,0,0,32,6.0,1,40,2,11,8
1,0,0,33,3.0,1,37,2,11,3
2,0,0,35,1.0,1,40,3,11,8
3,0,0,43,1.0,1,40,4,11,8
4,0,0,51,2.0,1,57,4,13,11


In [None]:
del(df_train3)
#del(df_items2)

In [None]:
for col in ['shop_id', 'item_id']:
    df_train4[col] = df_train4[col].astype('category')

In [None]:
# criar dataframe com possíveis meses
df_dates = pd.DataFrame({'date': pd.date_range(start='2013-01-01', periods=35, freq='M')})
df_dates.index.names = ['date_block_num']
df_dates.head()

Unnamed: 0_level_0,date
date_block_num,Unnamed: 1_level_1
0,2013-01-31
1,2013-02-28
2,2013-03-31
3,2013-04-30
4,2013-05-31


In [None]:
df_dates.tail()

Unnamed: 0_level_0,date
date_block_num,Unnamed: 1_level_1
30,2015-07-31
31,2015-08-31
32,2015-09-30
33,2015-10-31
34,2015-11-30


In [None]:
# criar colunas ano e mês
df_dates['year'] = df_dates['date'].dt.year
df_dates['month'] = df_dates['date'].dt.month
df_dates.head()

Unnamed: 0_level_0,date,year,month
date_block_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2013-01-31,2013,1
1,2013-02-28,2013,2
2,2013-03-31,2013,3
3,2013-04-30,2013,4
4,2013-05-31,2013,5


In [None]:
#TODO: baixar calendário de feriados na Rússia:
#      https://www.google.com/search?q=holidays+calendar+in+russia+format%3Acsv
#TODO: criar colunas indicando feriado
#TODO: criar colunas indicando véspera ou pós-feriado

In [None]:
df_train5 = pd.merge(df_train4,
                     df_dates.drop(['date'], axis=1),
                     how='left', on='date_block_num')
df_train5.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,item_category_id,subject_code,group_code,subgroup_code,year,month
0,0,0,32,6.0,1,40,2,11,8,2013,1
1,0,0,33,3.0,1,37,2,11,3,2013,1
2,0,0,35,1.0,1,40,3,11,8,2013,1
3,0,0,43,1.0,1,40,4,11,8,2013,1
4,0,0,51,2.0,1,57,4,13,11,2013,1


In [None]:
#del(df_dates)

In [None]:
df_train5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1609124 entries, 0 to 1609123
Data columns (total 11 columns):
date_block_num      1609124 non-null int64
shop_id             1609124 non-null category
item_id             1609124 non-null category
item_cnt_month      1609124 non-null float64
city_code           1609124 non-null category
item_category_id    1609124 non-null category
subject_code        1609124 non-null category
group_code          1609124 non-null category
subgroup_code       1609124 non-null category
year                1609124 non-null int64
month               1609124 non-null int64
dtypes: category(7), float64(1), int64(3)
memory usage: 76.2 MB


In [None]:
#TODO: treino: todos os meses, exceto o último / teste: último mês
#TODO: avaliar diversos algoritmos de regressão e escolher o de melhor escore (métrica: MSE)

In [None]:
#TODO: usando o melhor algoritmo avaliado, treinar novamente usando todos os meses disponíveis
#TODO: montar dataframe contendo lojas e itens presentes na base de testes (test.csv)
#TODO: usar todas as datas do mês seguinte (01/11/2015 a 30/11/2015)
#TODO: incrementar dataframe fazendo JOINs com tabelas de lojas, itens e categorias
#TODO: incrementar dataframe produzindo campos extras de datas (year, month, day, dow, woy)

In [None]:
#TODO: submeter o dataframe de entrada ao modelo treinado a fim de obter previsões
#TODO: calcular somas das quantidades de produtos agrupadas por loja e produto
#TODO: fazer JOIN da base de testes com esse dataframe final, produzindo o arquivo de submissão final

In [None]:
df_train_final = df_train5
df_train_final.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month', 'city_code',
       'item_category_id', 'subject_code', 'group_code', 'subgroup_code',
       'year', 'month'],
      dtype='object')

In [None]:
# gravar dados de treino em arquivo
df_train_final.to_csv('train-data.csv', index_label='id')

In [None]:
data = df_train_final
#data = df_train_final.sample(frac=1.0).head(20000)

In [None]:
# definir dados de entrada
X = data.drop(['item_cnt_month'], axis=1) # tudo, exceto a coluna alvo
y = data['item_cnt_month'] # apenas a coluna alvo

print('Forma dos dados originais:', X.shape, y.shape)

Forma dos dados originais: (1609124, 10) (1609124,)


## Treinamento dos modelos preditivos

In [None]:
MSE = 'neg_mean_squared_error'

models = {}

parallel_jobs = 8 # paralelização

In [None]:
from datetime import datetime

# avalia o desempenho do modelo, retornando o valor do MSE
def evaluate_model_cv(name, model, X=X, y=y):
    start = datetime.now()
    kfold = KFold(n_splits=10, random_state=42)
    results = cross_val_score(model, X, y, cv=kfold, scoring=MSE, verbose=1, n_jobs=parallel_jobs)
    end = datetime.now()
    elapsed = int((end - start).total_seconds() * 1000)
    score = (-1) * results.mean()
    stddev = results.std()
    models[name] = {'model': model, 'score': score, 'stddev': stddev, 'elapsed': elapsed}
    print(model, '\nScore: %.2f (+/- %.2f) [%5s ms]' % (score, stddev, elapsed))
    return score, stddev, elapsed

In [None]:
# faz o ajuste fino do modelo, calculando os melhores hiperparâmetros
def fine_tune_model(model, params, X=X, y=y):
  print('\nFine Tuning Model:')
  print(model, "\nparams:", params)
  kfold = KFold(n_splits=10, random_state=42)
  grid = GridSearchCV(estimator=model, param_grid=params, scoring=MSE, cv=kfold, verbose=1, n_jobs=parallel_jobs)
  grid.fit(X, y)
  print('\nGrid Best Score: %.2f' % (grid.best_score_ * (-1)))
  print('Best Params:', grid.best_params_)
  return grid

### Avaliação e ajuste fino de cada modelo preditivo

- https://scikit-learn.org/stable/modules/classes.html

In [None]:
model = KNeighborsRegressor(n_jobs=-1, n_neighbors=11, weights='distance')
%time evaluate_model_cv('KNN', model)

#n_neighbors=5, weights=’uniform’, algorithm=’auto’, leaf_size=30, p=2, metric=’minkowski’,
#metric_params=None, n_jobs=None

params = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 13],
    'weights': ['uniform', 'distance']
}
#fine_tune_model(model, params)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:   41.3s remaining:   27.5s


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=-1, n_neighbors=11, p=2,
                    weights='distance') 
Score: 51.03 (+/- 32.22) [49682 ms]
CPU times: user 209 ms, sys: 217 ms, total: 426 ms
Wall time: 49.7 s


[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:   49.6s finished


In [None]:
model = RandomForestRegressor(random_state=42, n_jobs=-1, n_estimators=100, max_depth=3)
%time evaluate_model_cv('RF', model)

#n_estimators=’warn’, criterion=’mse’, max_depth=None, min_samples_split=2, min_samples_leaf=1, 
#min_weight_fraction_leaf=0.0, max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0, 
#min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, 
#verbose=0, warm_start=False

params = {
    'n_estimators': [5, 10, 25, 50, 75, 100],
    'max_depth': [None, 3, 5, 7, 9, 11, 13]
}
#fine_tune_model(model, params)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:  3.4min remaining:  2.3min


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                      oob_score=False, random_state=42, verbose=0,
                      warm_start=False) 
Score: 57.13 (+/- 28.91) [250635 ms]
CPU times: user 170 ms, sys: 262 ms, total: 432 ms
Wall time: 4min 10s


[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:  4.2min finished


In [None]:
model = GradientBoostingRegressor(random_state=42,
    learning_rate=0.05, n_estimators=100, max_depth=4, max_features=0.85)
%time evaluate_model_cv('GB', model)

#loss=’ls’, learning_rate=0.1, n_estimators=100, subsample=1.0, criterion=’friedman_mse’, min_samples_split=2,
#min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, 
#min_impurity_split=None, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, 
#max_leaf_nodes=None, warm_start=False, presort=’auto’, validation_fraction=0.1, n_iter_no_change=None, 
#tol=0.0001

params = dict(
    n_estimators=[100, 250, 500],
    max_features=[0.75, 0.85, 1.0],
    max_depth=[4, 8, 12],
    learning_rate=[0.05, 0.1, 0.15],
    #subsample=[0.4, 0.6, 0.8]
)
#fine_tune_model(model, params)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:  4.2min remaining:  2.8min


In [None]:
model = ExtraTreesRegressor(random_state=42, n_jobs=-1, n_estimators=75, max_features=0.85)
%time evaluate_model_cv('ET', model)

#n_estimators=’warn’, criterion=’mse’, max_depth=None, min_samples_split=2, min_samples_leaf=1, 
#min_weight_fraction_leaf=0.0, max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0, 
#min_impurity_split=None, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0,
#warm_start=False

params = dict(
    n_estimators=[50, 75, 100, 200],
    max_features=['auto', 0.75, 0.85, 1.0]
)
#fine_tune_model(model, params)

In [None]:
model = BaggingRegressor(random_state=42, n_jobs=-1, base_estimator=DecisionTreeRegressor(), max_features=0.5, n_estimators=200)
%time evaluate_model_cv('BG', model)

#base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, 
#bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0

params = dict(
    n_estimators=[50, 75, 100, 200],
    max_features=[0.5, 0.75, 1.0]
)
#fine_tune_model(model, params)

In [None]:
model = AdaBoostRegressor(random_state=42, n_estimators=100, base_estimator=DecisionTreeRegressor())
%time evaluate_model_cv('ABDT', model)

# base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm=’SAMME.R’, random_state=None

params = dict(
    n_estimators=[50, 75, 100, 200]
)
#fine_tune_model(model, params)

## Comparação final entre os algoritmos

In [None]:
results = []
names = []
scores = []
stddevs = []
times = []

best_model = None
highest_score = None

for name, model in models.items():
    score, stddev, elapsed = model['score'], model['stddev'], model['elapsed']
    results.append((score, stddev))
    names.append(name)
    scores.append(score)
    stddevs.append(stddev)
    times.append(elapsed)
    
    if not highest_score or score < highest_score:
        best_model = model['model']
        highest_score = score

In [None]:
results_df = pd.DataFrame({'Model Name': names, 'Score': scores, 'Std Dev': stddevs, 'Time (ms)': times})
results_df.sort_values(by=['Score', 'Time (ms)'], ascending=[True, False])

In [None]:
print('Best Model:\n\n%s' % (best_model))

In [None]:
model = best_model
model.fit(X, y)

In [None]:
X.head()

In [None]:
del(X)
del(y)

## ....

In [None]:
df_test.head()

In [None]:
# definir campo fixo
df_test['date_block_num'] = 34 # Novembro/2015

In [None]:
# incrementar colunas de data
df_test2 = pd.merge(df_test,
                    df_dates.drop(['date'], axis=1),
                    how='left', on='date_block_num')
df_test2.head()

In [None]:
# incrementar dataframe fazendo JOINs com tabelas de lojas, itens e categorias
# incluir coluna com código da cidade
df_test3 = pd.merge(df_test2, df_shops[['city_code']], how='left', on='shop_id')
df_test3.head()
#del(df_test2)

In [None]:
df_test4 = pd.merge(df_test3,
                     df_items2.drop(['item_name', 'subject_name', 'item_category_name',
                                     'group_name', 'subgroup_name'], axis=1),
                     how='left', on='item_id')
df_test4.head()

In [None]:
for col in ['shop_id', 'item_id']:
    df_test4[col] = df_test4[col].astype('category')

In [None]:
df_test_final = df_test4
df_test_final.columns

In [None]:
df_test_final.info()

In [None]:
# colocar as colunas na mesma ordem dos dados de treino
cols = df_train_final.columns.drop(['item_cnt_month'])
cols

In [None]:
# realizar a previsão
X_pred = df_test_final[cols]
#X_pred = df_test4.drop(['ID'], axis=1)
y_pred = best_model.predict(X_pred)

In [None]:
# gravar dados de teste em arquivo
X_pred.to_csv('test-data.csv', index_label='id')

In [None]:
X_pred.info()
X_pred.head()

In [None]:
y_pred[:20]

In [None]:
df_final = pd.DataFrame(data=y_pred, columns=['item_cnt_month'], index=df_test4.index)
df_final.index.names = ['ID']
df_final.head()

In [None]:
df_final.info()

In [None]:
!test -d submissions/ || mkdir submissions/

In [None]:
df_final.to_csv('submissions/submission.csv')

## Sample submission

In [None]:
df_sample = pd.read_csv(input_prefix + 'sample_submission.csv', index_col='ID')
print('shape:', df_sample.shape)
df_sample.head()

In [None]:
df_sample.info()

In [None]:
del(df_sample)