# Predict Future Sales in Russia

- https://www.kaggle.com/c/competitive-data-science-predict-future-sales/

### Data files
- item_categories.csv
- items.csv
- sales_train.csv
- sample_submission.csv
- shops.csv
- test.csv

In [1]:
import pandas as pd

In [2]:
# definir parâmetros extras
pd.set_option('precision', 4)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [3]:
input_prefix = 'data/'

translate = False # original em russo
#translate = True # traduzir para inglês

trfile = '-translated' if translate else ''
trcol = '_translated' if translate else ''

## Shops

In [4]:
df_shops = pd.read_csv(input_prefix + 'shops' + trfile + '.csv', index_col='shop_id')
print('shape:', df_shops.shape)
df_shops.head()

shape: (60, 1)


Unnamed: 0_level_0,shop_name
shop_id,Unnamed: 1_level_1
0,"!Якутск Орджоникидзе, 56 фран"
1,"!Якутск ТЦ ""Центральный"" фран"
2,"Адыгея ТЦ ""Мега"""
3,"Балашиха ТРК ""Октябрь-Киномир"""
4,"Волжский ТЦ ""Волга Молл"""


In [5]:
df_shops.describe()

Unnamed: 0,shop_name
count,60
unique,60
top,"РостовНаДону ТЦ ""Мега"""
freq,1


In [6]:
df_shops.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60 entries, 0 to 59
Data columns (total 1 columns):
shop_name    60 non-null object
dtypes: object(1)
memory usage: 960.0+ bytes


### Extract city name from the shop name

- 'Москва ТРК "Атриум"' => 'Москва'
- 'Н.Новгород ТРЦ "Фантастика" => 'Н.Новгород'

In [7]:
df_shops['city_name'] = df_shops['shop_name' + trcol].apply(lambda s: s.split()[0])

df_shops.head()

Unnamed: 0_level_0,shop_name,city_name
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"!Якутск Орджоникидзе, 56 фран",!Якутск
1,"!Якутск ТЦ ""Центральный"" фран",!Якутск
2,"Адыгея ТЦ ""Мега""",Адыгея
3,"Балашиха ТРК ""Октябрь-Киномир""",Балашиха
4,"Волжский ТЦ ""Волга Молл""",Волжский


In [8]:
df_shops.groupby('city_name').count().head(10)

Unnamed: 0_level_0,shop_name
city_name,Unnamed: 1_level_1
!Якутск,2
Адыгея,1
Балашиха,1
Волжский,1
Вологда,1
Воронеж,3
Выездная,1
Жуковский,2
Интернет-магазин,1
Казань,2


In [9]:
df_shops['city_code'] = (
    df_shops['city_name'].astype('category').cat.codes + 1
).astype('category')

df_shops.head()

Unnamed: 0_level_0,shop_name,city_name,city_code
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"!Якутск Орджоникидзе, 56 фран",!Якутск,1
1,"!Якутск ТЦ ""Центральный"" фран",!Якутск,1
2,"Адыгея ТЦ ""Мега""",Адыгея,2
3,"Балашиха ТРК ""Октябрь-Киномир""",Балашиха,3
4,"Волжский ТЦ ""Волга Молл""",Волжский,4


In [10]:
df_shops.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60 entries, 0 to 59
Data columns (total 3 columns):
shop_name    60 non-null object
city_name    60 non-null object
city_code    60 non-null category
dtypes: category(1), object(2)
memory usage: 3.0+ KB


In [11]:
df_shops.describe()

Unnamed: 0,shop_name,city_name,city_code
count,60,60,60
unique,60,32,32
top,"РостовНаДону ТЦ ""Мега""",Москва,15
freq,1,13,13


## Item categories

In [12]:
df_categories = pd.read_csv(input_prefix + 'item_categories' + trfile + '.csv', index_col='item_category_id')
print('shape:', df_categories.shape)
df_categories.head()

shape: (84, 1)


Unnamed: 0_level_0,item_category_name
item_category_id,Unnamed: 1_level_1
0,PC - Гарнитуры/Наушники
1,Аксессуары - PS2
2,Аксессуары - PS3
3,Аксессуары - PS4
4,Аксессуары - PSP


In [13]:
df_categories.describe()

Unnamed: 0,item_category_name
count,84
unique,84
top,Книги - Аудиокниги 1С
freq,1


### Extract group and subgroup names from item category name

- 'Игровые консоли - PS4' => 'Игровые консоли'
- 'Карты оплаты - Windows (Цифра)' => 'Карты оплаты'
- 'Книги - Комиксы, манга' => 'Книги'

In [14]:
df_categories['group_name'] = \
    df_categories['item_category_name' + trcol].apply(
        lambda s: s.split(' - ')[0].split(' (')[0].upper())

df_categories.head()

Unnamed: 0_level_0,item_category_name,group_name
item_category_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,PC - Гарнитуры/Наушники,PC
1,Аксессуары - PS2,АКСЕССУАРЫ
2,Аксессуары - PS3,АКСЕССУАРЫ
3,Аксессуары - PS4,АКСЕССУАРЫ
4,Аксессуары - PSP,АКСЕССУАРЫ


In [15]:
df_categories.groupby('group_name').count().head(10)

Unnamed: 0_level_0,item_category_name
group_name,Unnamed: 1_level_1
PC,1
АКСЕССУАРЫ,7
БИЛЕТЫ,1
ДОСТАВКА ТОВАРА,1
ИГРОВЫЕ КОНСОЛИ,8
ИГРЫ,8
ИГРЫ ANDROID,1
ИГРЫ MAC,1
ИГРЫ PC,4
КАРТЫ ОПЛАТЫ,5


In [16]:
def extract_subgroup(s):
    gs = s.split(' - ')
    if len(gs) > 1:
        gs2 = gs[1].split(' (')
        return gs2[0].upper()
    else:
        return ''
    #return gs[1] if len(gs) > 1 else ''
    
df_categories['subgroup_name'] = \
    df_categories['item_category_name' + trcol].apply(
        lambda s: extract_subgroup(s))

df_categories.head()

Unnamed: 0_level_0,item_category_name,group_name,subgroup_name
item_category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,PC - Гарнитуры/Наушники,PC,ГАРНИТУРЫ/НАУШНИКИ
1,Аксессуары - PS2,АКСЕССУАРЫ,PS2
2,Аксессуары - PS3,АКСЕССУАРЫ,PS3
3,Аксессуары - PS4,АКСЕССУАРЫ,PS4
4,Аксессуары - PSP,АКСЕССУАРЫ,PSP


In [17]:
df_categories.groupby('subgroup_name').count().head(10)

Unnamed: 0_level_0,item_category_name,group_name
subgroup_name,Unnamed: 1_level_1,Unnamed: 2_level_1
,7,7
1С:ПРЕДПРИЯТИЕ 8,1,1
BLU-RAY,1,1
BLU-RAY 3D,1,1
BLU-RAY 4K,1,1
CD ЛОКАЛЬНОГО ПРОИЗВОДСТВА,1,1
CD ФИРМЕННОГО ПРОИЗВОДСТВА,1,1
DVD,1,1
LIVE!,2,2
MAC,1,1


In [18]:
df_categories['group_code'] = (
    df_categories['group_name'].astype('category').cat.codes + 1).astype('category')

df_categories['subgroup_code'] = (
    df_categories['subgroup_name'].astype('category').cat.codes + 1).astype('category')

df_categories.head()

Unnamed: 0_level_0,item_category_name,group_name,subgroup_name,group_code,subgroup_code
item_category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,PC - Гарнитуры/Наушники,PC,ГАРНИТУРЫ/НАУШНИКИ,1,30
1,Аксессуары - PS2,АКСЕССУАРЫ,PS2,2,12
2,Аксессуары - PS3,АКСЕССУАРЫ,PS3,2,13
3,Аксессуары - PS4,АКСЕССУАРЫ,PS4,2,14
4,Аксессуары - PSP,АКСЕССУАРЫ,PSP,2,16


In [19]:
df_categories.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84 entries, 0 to 83
Data columns (total 5 columns):
item_category_name    84 non-null object
group_name            84 non-null object
subgroup_name         84 non-null object
group_code            84 non-null category
subgroup_code         84 non-null category
dtypes: category(2), object(3)
memory usage: 6.5+ KB


In [20]:
df_categories.describe()

Unnamed: 0,item_category_name,group_name,subgroup_name,group_code,subgroup_code
count,84,84,84.0,84,84
unique,84,18,55.0,18,55
top,Книги - Аудиокниги 1С,КНИГИ,,12,1
freq,1,13,7.0,13,7


## Items

In [46]:
df_items = pd.read_csv(input_prefix + 'items' + trfile + '.csv', index_col='item_id')
print('shape:', df_items.shape)
df_items.head()

shape: (22170, 2)


Unnamed: 0_level_0,item_name,item_category_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40
1,!ABBYY FineReader 12 Professional Edition Full...,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40
4,***КОРОБКА (СТЕКЛО) D,40


In [47]:
#if not translate:
#    df_items['item_category_id'] = df_items['item_category_id'].astype('category')

In [48]:
df_items.describe()

Unnamed: 0,item_category_id
count,22170.0
mean,46.2908
std,15.9415
min,0.0
25%,37.0
50%,40.0
75%,58.0
max,83.0


In [49]:
import re

def extract_main_subject(str):
    s = str.upper()
    # remover caracteres do começo => !"*/
    s = re.sub("^[!*/\"]+ ?", "", s)
    # remover termo "THE" do nome
    s = re.sub("^THE ", "", s) # translated
    # obter primeira palavra em maiúsculo
    s = s.split()[0]
    # substituir caracteres => '`’
    s = re.sub("['`’]", "_", s)
    # remover caracteres do fim da palavra => :.®,!
    s = re.sub("[:.,!®]$", "", s)
    return s
    
df_items['subject_name'] = \
    df_items['item_name' + trcol].apply(
        lambda s: extract_main_subject(s))

df_items.head()

Unnamed: 0_level_0,item_name,item_category_id,subject_name
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,ВО
1,!ABBYY FineReader 12 Professional Edition Full...,76,ABBYY
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40,В
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40,ГОЛУБАЯ
4,***КОРОБКА (СТЕКЛО) D,40,КОРОБКА


In [50]:
df_items.tail()

Unnamed: 0_level_0,item_name,item_category_id,subject_name
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
22165,"Ядерный титбит 2 [PC, Цифровая версия]",31,ЯДЕРНЫЙ
22166,Язык запросов 1С:Предприятия [Цифровая версия],54,ЯЗЫК
22167,Язык запросов 1С:Предприятия 8 (+CD). Хрустале...,49,ЯЗЫК
22168,Яйцо для Little Inu,62,ЯЙЦО
22169,Яйцо дракона (Игра престолов),69,ЯЙЦО


In [51]:
df_items.groupby('subject_name').count().head(10)

Unnamed: 0_level_0,item_name,item_category_id
subject_name,Unnamed: 1_level_1,Unnamed: 2_level_1
007,5,5
1+1,2,2
10,6,6
100,20,20
1000,2,2
10000,1,1
101,8,8
11,2,2
11-11-11,1,1
12,8,8


In [52]:
df_items['subject_code'] = (
    df_items['subject_name'].astype('category').cat.codes + 1).astype('category')

df_items.head()

Unnamed: 0_level_0,item_name,item_category_id,subject_name,subject_code
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,ВО,1804
1,!ABBYY FineReader 12 Professional Edition Full...,76,ABBYY,112
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40,В,1615
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40,ГОЛУБАЯ,1959
4,***КОРОБКА (СТЕКЛО) D,40,КОРОБКА,2631


In [53]:
df_items.tail()

Unnamed: 0_level_0,item_name,item_category_id,subject_name,subject_code
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
22165,"Ядерный титбит 2 [PC, Цифровая версия]",31,ЯДЕРНЫЙ,4647
22166,Язык запросов 1С:Предприятия [Цифровая версия],54,ЯЗЫК,4649
22167,Язык запросов 1С:Предприятия 8 (+CD). Хрустале...,49,ЯЗЫК,4649
22168,Яйцо для Little Inu,62,ЯЙЦО,4650
22169,Яйцо дракона (Игра престолов),69,ЯЙЦО,4650


In [54]:
df_items.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22170 entries, 0 to 22169
Data columns (total 4 columns):
item_name           22170 non-null object
item_category_id    22170 non-null int64
subject_name        22170 non-null object
subject_code        22170 non-null category
dtypes: category(1), int64(1), object(2)
memory usage: 932.5+ KB


In [55]:
df_items.describe()

Unnamed: 0,item_category_id
count,22170.0
mean,46.2908
std,15.9415
min,0.0
25%,37.0
50%,40.0
75%,58.0
max,83.0


In [59]:
# join items + categories:item_category_id => group_code, subgroup_code
df_items2 = pd.merge(df_items, df_categories, how='left', on='item_category_id')
df_items2.head()

Unnamed: 0,item_name,item_category_id,subject_name,subject_code,item_category_name,group_name,subgroup_name,group_code,subgroup_code
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,ВО,1804,Кино - DVD,КИНО,DVD,11,8
1,!ABBYY FineReader 12 Professional Edition Full...,76,ABBYY,112,Программы - Для дома и офиса (Цифра),ПРОГРАММЫ,ДЛЯ ДОМА И ОФИСА,15,31
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40,В,1615,Кино - DVD,КИНО,DVD,11,8
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40,ГОЛУБАЯ,1959,Кино - DVD,КИНО,DVD,11,8
4,***КОРОБКА (СТЕКЛО) D,40,КОРОБКА,2631,Кино - DVD,КИНО,DVD,11,8


In [62]:
df_items2['item_category_id'] = df_items2['item_category_id'].astype('category')

In [63]:
df_items2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22170 entries, 0 to 22169
Data columns (total 9 columns):
item_name             22170 non-null object
item_category_id      22170 non-null category
subject_name          22170 non-null object
subject_code          22170 non-null category
item_category_name    22170 non-null object
group_name            22170 non-null object
subgroup_name         22170 non-null object
group_code            22170 non-null category
subgroup_code         22170 non-null category
dtypes: category(4), object(5)
memory usage: 1.3+ MB


In [64]:
df_items2.describe()

Unnamed: 0,item_name,item_category_id,subject_name,subject_code,item_category_name,group_name,subgroup_name,group_code,subgroup_code
count,22170,22170,22170,22170,22170,22170,22170,22170,22170
unique,22170,84,4656,4656,84,18,55,18,55
top,СУМЕРКИ. САГА. РАССВЕТ: ЧАСТЬ 2 (2DVD),40,ФИГУРКА,4349,Кино - DVD,КИНО,DVD,11,8
freq,1,5035,599,599,5035,7464,5035,7464,5035


## Sales (training)

In [157]:
dateparse = lambda x: pd.datetime.strptime(x, '%d.%m.%Y')
df_train = pd.read_csv(input_prefix + 'sales_train-1k.csv', parse_dates=['date'], date_parser=dateparse)
print('shape:', df_items.shape)
df_train.head()

shape: (22170, 4)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.0,1.0
1,2013-01-03,0,25,2552,899.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0
3,2013-01-06,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


In [158]:
#for col in ('shop_id', 'item_id'):
#    df_train[col] = df_train[col].astype('category')

In [159]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
date              1000 non-null datetime64[ns]
date_block_num    1000 non-null int64
shop_id           1000 non-null int64
item_id           1000 non-null int64
item_price        1000 non-null float64
item_cnt_day      1000 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(3)
memory usage: 47.0 KB


In [160]:
df_train.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.0,25.034,2408.107,908.4799,1.19
std,0.0,1.0752,1205.2495,822.763,0.7671
min,0.0,25.0,785.0,58.0,-1.0
25%,0.0,25.0,1524.0,299.0,1.0
50%,0.0,25.0,2308.0,599.0,1.0
75%,0.0,25.0,2973.0,1290.0,1.0
max,0.0,59.0,22154.0,5490.0,13.0


In [161]:
#TODO: converter 'date' para tipo data - OK

#TODO: mesclar com tabelas de lojas, itens e categorias

#TODO: verificar o que significam contagens negativas - OK
# It means the item is returned. You should predict that too.
# Or in the other words: it is not missing data or mistake.

In [162]:
# join items + categories:item_category_id => group_code, subgroup_code
df_train2 = pd.merge(df_train, df_shops, how='left', on='shop_id')
df_train2.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,shop_name,city_name,city_code
0,2013-01-02,0,59,22154,999.0,1.0,"Ярославль ТЦ ""Альтаир""",Ярославль,32
1,2013-01-03,0,25,2552,899.0,1.0,"Москва ТРК ""Атриум""",Москва,15
2,2013-01-05,0,25,2552,899.0,-1.0,"Москва ТРК ""Атриум""",Москва,15
3,2013-01-06,0,25,2554,1709.05,1.0,"Москва ТРК ""Атриум""",Москва,15
4,2013-01-15,0,25,2555,1099.0,1.0,"Москва ТРК ""Атриум""",Москва,15


In [163]:
df_train2['shop_id'] = df_train2['shop_id'].astype('category')

In [164]:
df_train2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 9 columns):
date              1000 non-null datetime64[ns]
date_block_num    1000 non-null int64
shop_id           1000 non-null category
item_id           1000 non-null int64
item_price        1000 non-null float64
item_cnt_day      1000 non-null float64
shop_name         1000 non-null object
city_name         1000 non-null object
city_code         1000 non-null category
dtypes: category(2), datetime64[ns](1), float64(2), int64(2), object(2)
memory usage: 66.0+ KB


In [165]:
df_train2.describe()

Unnamed: 0,date_block_num,item_id,item_price,item_cnt_day
count,1000.0,1000.0,1000.0,1000.0
mean,0.0,2408.107,908.4799,1.19
std,0.0,1205.2495,822.763,0.7671
min,0.0,785.0,58.0,-1.0
25%,0.0,1524.0,299.0,1.0
50%,0.0,2308.0,599.0,1.0
75%,0.0,2973.0,1290.0,1.0
max,0.0,22154.0,5490.0,13.0


In [166]:
# join sales + items:item_id => subject_code, (group_code, subgroup_code)
#df_train3 = pd.merge(df_train2, df_items2, how='left', right_on='item_id', left_index=True)
#df_train3.head()

In [167]:
#FIXME: remover isso depois!
df_train3 = df_train2

In [168]:
# criar colunas contendo dia da semana a partir de 'date'
df_train3['year'] = df_train3['date'].dt.year
df_train3['month'] = df_train3['date'].dt.month
df_train3['day'] = df_train3['date'].dt.day
df_train3['dow'] = df_train3['date'].dt.dayofweek
df_train3['woy'] = df_train3['date'].dt.weekofyear

In [169]:
#TODO: baixar calendário de feriados na Rússia:
#      https://www.google.com/search?q=holidays+calendar+in+russia+format%3Acsv
#TODO: criar colunas indicando feriado
#TODO: criar colunas indicando véspera ou pós-feriado

In [170]:
df_train3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 14 columns):
date              1000 non-null datetime64[ns]
date_block_num    1000 non-null int64
shop_id           1000 non-null category
item_id           1000 non-null int64
item_price        1000 non-null float64
item_cnt_day      1000 non-null float64
shop_name         1000 non-null object
city_name         1000 non-null object
city_code         1000 non-null category
year              1000 non-null int64
month             1000 non-null int64
day               1000 non-null int64
dow               1000 non-null int64
woy               1000 non-null int64
dtypes: category(2), datetime64[ns](1), float64(2), int64(7), object(2)
memory usage: 105.1+ KB


In [171]:
df_train3.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,shop_name,city_name,city_code,year,month,day,dow,woy
0,2013-01-02,0,59,22154,999.0,1.0,"Ярославль ТЦ ""Альтаир""",Ярославль,32,2013,1,2,2,1
1,2013-01-03,0,25,2552,899.0,1.0,"Москва ТРК ""Атриум""",Москва,15,2013,1,3,3,1
2,2013-01-05,0,25,2552,899.0,-1.0,"Москва ТРК ""Атриум""",Москва,15,2013,1,5,5,1
3,2013-01-06,0,25,2554,1709.05,1.0,"Москва ТРК ""Атриум""",Москва,15,2013,1,6,6,1
4,2013-01-15,0,25,2555,1099.0,1.0,"Москва ТРК ""Атриум""",Москва,15,2013,1,15,1,3


In [172]:
df_train3.drop(['date', 'shop_name', 'city_name'], axis=1, inplace=True)
df_train3.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,city_code,year,month,day,dow,woy
0,0,59,22154,999.0,1.0,32,2013,1,2,2,1
1,0,25,2552,899.0,1.0,15,2013,1,3,3,1
2,0,25,2552,899.0,-1.0,15,2013,1,5,5,1
3,0,25,2554,1709.05,1.0,15,2013,1,6,6,1
4,0,25,2555,1099.0,1.0,15,2013,1,15,1,3


In [173]:
df_train3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 11 columns):
date_block_num    1000 non-null int64
shop_id           1000 non-null category
item_id           1000 non-null int64
item_price        1000 non-null float64
item_cnt_day      1000 non-null float64
city_code         1000 non-null category
year              1000 non-null int64
month             1000 non-null int64
day               1000 non-null int64
dow               1000 non-null int64
woy               1000 non-null int64
dtypes: category(2), float64(2), int64(7)
memory usage: 81.7 KB


In [None]:
#TODO: treino: todos os meses, exceto o último / teste: último mês
#TODO: avaliar diversos algoritmos de regressão e escolher o de melhor escore (métrica: MSE)

In [None]:
#TODO: usando o melhor algoritmo avaliado, treinar novamente usando todos os meses disponíveis
#TODO: montar dataframe contendo lojas e itens presentes na base de testes (test.csv)
#TODO: usar todas as datas do mês seguinte (01/11/2015 a 30/11/2015)
#TODO: incrementar dataframe fazendo JOINs com tabelas de lojas, itens e categorias
#TODO: incrementar dataframe produzindo campos extras de datas (year, month, day, dow, woy)

In [None]:
#TODO: submeter o dataframe de entrada ao modelo treinado a fim de obter previsões
#TODO: calcular somas das quantidades de produtos agrupadas por loja e produto
#TODO: fazer JOIN da base de testes com esse dataframe final, produzindo o arquivo de submissão final

## Sales (testing)

In [145]:
df_test = pd.read_csv(input_prefix + 'test.csv', index_col='ID')
print('shape:', df_items.shape)
df_test.head()

shape: (22170, 4)


Unnamed: 0_level_0,shop_id,item_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5,5037
1,5,5320
2,5,5233
3,5,5232
4,5,5268


In [146]:
#TODO: verificar o que está sendo esperado (acumulado mensal por loja e produto?)
# item_cnt_day: number of products sold. You are predicting a monthly amount of this measure

In [147]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214200 entries, 0 to 214199
Data columns (total 2 columns):
shop_id    214200 non-null int64
item_id    214200 non-null int64
dtypes: int64(2)
memory usage: 4.9 MB


In [148]:
df_test.describe()

Unnamed: 0,shop_id,item_id
count,214200.0,214200.0
mean,31.6429,11019.3986
std,17.5619,6252.6446
min,2.0,30.0
25%,16.0,5381.5
50%,34.5,11203.0
75%,47.0,16071.5
max,59.0,22167.0


## Sample submission

In [149]:
df_sample = pd.read_csv(input_prefix + 'sample_submission.csv', index_col='ID')
print('shape:', df_sample.shape)
df_sample.head()

shape: (214200, 1)


Unnamed: 0_level_0,item_cnt_month
ID,Unnamed: 1_level_1
0,0.5
1,0.5
2,0.5
3,0.5
4,0.5


In [150]:
!mkdir submissions

In [156]:
df_sample['item_cnt_month'] = 0.3
df_sample.to_csv('submissions/submission-0.3.csv')