# Predict Future Sales in Russia

- https://www.kaggle.com/c/competitive-data-science-predict-future-sales/

### Data files
- item_categories.csv
- items.csv
- sales_train.csv
- sample_submission.csv
- shops.csv
- test.csv

In [2]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [3]:
# definir parâmetros extras
pd.set_option('precision', 4)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [3]:
input_prefix = 'data/'
#input_prefix = 'https://github.com/hjort/ai-labs/raw/master/jupyter/future-sales/data/'

translate = False # original em russo
#translate = True # traduzir para inglês

trfile = '-translated' if translate else ''
trcol = '_translated' if translate else ''

bzfile = '.bz2' # ''

## Shops

In [4]:
df_shops = pd.read_csv(
    input_prefix + 'shops' + trfile + '.csv' + bzfile,
    index_col='shop_id')
print('shape:', df_shops.shape)
df_shops.head()

shape: (60, 1)


Unnamed: 0_level_0,shop_name
shop_id,Unnamed: 1_level_1
0,"!Якутск Орджоникидзе, 56 фран"
1,"!Якутск ТЦ ""Центральный"" фран"
2,"Адыгея ТЦ ""Мега"""
3,"Балашиха ТРК ""Октябрь-Киномир"""
4,"Волжский ТЦ ""Волга Молл"""


In [5]:
df_shops.dtypes

shop_name    object
dtype: object

In [6]:
df_shops.describe()

Unnamed: 0,shop_name
count,60
unique,60
top,"Волжский ТЦ ""Волга Молл"""
freq,1


In [7]:
df_shops.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60 entries, 0 to 59
Data columns (total 1 columns):
shop_name    60 non-null object
dtypes: object(1)
memory usage: 960.0+ bytes


### Extract city name from the shop name

- 'Москва ТРК "Атриум"' => 'Москва'
- 'Н.Новгород ТРЦ "Фантастика" => 'Н.Новгород'

In [8]:
df_shops['city_name'] = df_shops['shop_name' + trcol].apply(lambda s: s.split()[0])

df_shops.head()

Unnamed: 0_level_0,shop_name,city_name
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"!Якутск Орджоникидзе, 56 фран",!Якутск
1,"!Якутск ТЦ ""Центральный"" фран",!Якутск
2,"Адыгея ТЦ ""Мега""",Адыгея
3,"Балашиха ТРК ""Октябрь-Киномир""",Балашиха
4,"Волжский ТЦ ""Волга Молл""",Волжский


In [9]:
df_shops.groupby('city_name').count().\
    sort_values(by='shop_name', ascending=False).head()

Unnamed: 0_level_0,shop_name
city_name,Unnamed: 1_level_1
Москва,13
Тюмень,3
Воронеж,3
РостовНаДону,3
!Якутск,2


In [10]:
df_shops['city_code'] = (
    df_shops['city_name'].astype('category').cat.codes + 1
).astype('category')

df_shops.head()

Unnamed: 0_level_0,shop_name,city_name,city_code
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"!Якутск Орджоникидзе, 56 фран",!Якутск,1
1,"!Якутск ТЦ ""Центральный"" фран",!Якутск,1
2,"Адыгея ТЦ ""Мега""",Адыгея,2
3,"Балашиха ТРК ""Октябрь-Киномир""",Балашиха,3
4,"Волжский ТЦ ""Волга Молл""",Волжский,4


In [11]:
df_shops.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60 entries, 0 to 59
Data columns (total 3 columns):
shop_name    60 non-null object
city_name    60 non-null object
city_code    60 non-null category
dtypes: category(1), object(2)
memory usage: 3.0+ KB


In [12]:
df_shops.describe()

Unnamed: 0,shop_name,city_name,city_code
count,60,60,60
unique,60,32,32
top,"Волжский ТЦ ""Волга Молл""",Москва,15
freq,1,13,13


In [13]:
df_shops.to_csv('shops_full.csv')

In [14]:
!head shops_full.csv

shop_id,shop_name,city_name,city_code
0,"!Якутск Орджоникидзе, 56 фран",!Якутск,1
1,"!Якутск ТЦ ""Центральный"" фран",!Якутск,1
2,"Адыгея ТЦ ""Мега""",Адыгея,2
3,"Балашиха ТРК ""Октябрь-Киномир""",Балашиха,3
4,"Волжский ТЦ ""Волга Молл""",Волжский,4
5,"Вологда ТРЦ ""Мармелад""",Вологда,5
6,"Воронеж (Плехановская, 13)",Воронеж,6
7,"Воронеж ТРЦ ""Максимир""",Воронеж,6
8,"Воронеж ТРЦ Сити-Парк ""Град""",Воронеж,6


In [15]:
!rm -f shops_full.csv.bz2 && bzip2 -9 shops_full.csv

## Item categories

In [16]:
df_categories = pd.read_csv(
    input_prefix + 'item_categories' + trfile + '.csv' + bzfile,
    index_col='item_category_id')
print('shape:', df_categories.shape)
df_categories.head()

shape: (84, 1)


Unnamed: 0_level_0,item_category_name
item_category_id,Unnamed: 1_level_1
0,PC - Гарнитуры/Наушники
1,Аксессуары - PS2
2,Аксессуары - PS3
3,Аксессуары - PS4
4,Аксессуары - PSP


In [17]:
df_categories.describe()

Unnamed: 0,item_category_name
count,84
unique,84
top,Книги - Аудиокниги 1С
freq,1


### Extract group and subgroup names from item category name

- 'Игровые консоли - PS4' => 'Игровые консоли'
- 'Карты оплаты - Windows (Цифра)' => 'Карты оплаты'
- 'Книги - Комиксы, манга' => 'Книги'

In [18]:
df_categories['group_name'] = \
    df_categories['item_category_name' + trcol].apply(
        lambda s: s.split(' - ')[0].split(' (')[0].upper())

df_categories.head()

Unnamed: 0_level_0,item_category_name,group_name
item_category_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,PC - Гарнитуры/Наушники,PC
1,Аксессуары - PS2,АКСЕССУАРЫ
2,Аксессуары - PS3,АКСЕССУАРЫ
3,Аксессуары - PS4,АКСЕССУАРЫ
4,Аксессуары - PSP,АКСЕССУАРЫ


In [19]:
df_categories.groupby('group_name').count().\
    sort_values(by='item_category_name', ascending=False).head()

Unnamed: 0_level_0,item_category_name
group_name,Unnamed: 1_level_1
КНИГИ,13
ПОДАРКИ,12
ИГРОВЫЕ КОНСОЛИ,8
ИГРЫ,8
АКСЕССУАРЫ,7


In [20]:
def extract_subgroup(s):
    gs = s.split(' - ')
    if len(gs) > 1:
        gs2 = gs[1].split(' (')
        return gs2[0].upper()
    else:
        return ''
    #return gs[1] if len(gs) > 1 else ''
    
df_categories['subgroup_name'] = \
    df_categories['item_category_name' + trcol].apply(
        lambda s: extract_subgroup(s))

df_categories.head()

Unnamed: 0_level_0,item_category_name,group_name,subgroup_name
item_category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,PC - Гарнитуры/Наушники,PC,ГАРНИТУРЫ/НАУШНИКИ
1,Аксессуары - PS2,АКСЕССУАРЫ,PS2
2,Аксессуары - PS3,АКСЕССУАРЫ,PS3
3,Аксессуары - PS4,АКСЕССУАРЫ,PS4
4,Аксессуары - PSP,АКСЕССУАРЫ,PSP


In [21]:
df_categories.groupby('subgroup_name').count().head(10)

Unnamed: 0_level_0,item_category_name,group_name
subgroup_name,Unnamed: 1_level_1,Unnamed: 2_level_1
,7,7
1С:ПРЕДПРИЯТИЕ 8,1,1
BLU-RAY,1,1
BLU-RAY 3D,1,1
BLU-RAY 4K,1,1
CD ЛОКАЛЬНОГО ПРОИЗВОДСТВА,1,1
CD ФИРМЕННОГО ПРОИЗВОДСТВА,1,1
DVD,1,1
LIVE!,2,2
MAC,1,1


In [22]:
df_categories['group_code'] = (
    df_categories['group_name'].astype('category').cat.codes + 1).astype('category')

df_categories['subgroup_code'] = (
    df_categories['subgroup_name'].astype('category').cat.codes + 1).astype('category')

df_categories.head()

Unnamed: 0_level_0,item_category_name,group_name,subgroup_name,group_code,subgroup_code
item_category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,PC - Гарнитуры/Наушники,PC,ГАРНИТУРЫ/НАУШНИКИ,1,30
1,Аксессуары - PS2,АКСЕССУАРЫ,PS2,2,12
2,Аксессуары - PS3,АКСЕССУАРЫ,PS3,2,13
3,Аксессуары - PS4,АКСЕССУАРЫ,PS4,2,14
4,Аксессуары - PSP,АКСЕССУАРЫ,PSP,2,16


In [23]:
df_categories.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84 entries, 0 to 83
Data columns (total 5 columns):
item_category_name    84 non-null object
group_name            84 non-null object
subgroup_name         84 non-null object
group_code            84 non-null category
subgroup_code         84 non-null category
dtypes: category(2), object(3)
memory usage: 6.5+ KB


In [24]:
df_categories.describe()

Unnamed: 0,item_category_name,group_name,subgroup_name,group_code,subgroup_code
count,84,84,84.0,84,84
unique,84,18,55.0,18,55
top,Книги - Аудиокниги 1С,КНИГИ,,12,1
freq,1,13,7.0,13,7


## Items

In [25]:
df_items = pd.read_csv(
    input_prefix + 'items' + trfile + '.csv' + bzfile,
    index_col='item_id', dtype={'item_category_id': np.int8})
print('shape:', df_items.shape)
df_items.head()

shape: (22170, 2)


Unnamed: 0_level_0,item_name,item_category_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40
1,!ABBYY FineReader 12 Professional Edition Full...,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40
4,***КОРОБКА (СТЕКЛО) D,40


In [26]:
#if not translate:
#    df_items['item_category_id'] = df_items['item_category_id'].astype('category')

In [27]:
df_items.describe()

Unnamed: 0,item_category_id
count,22170.0
mean,46.2908
std,15.9415
min,0.0
25%,37.0
50%,40.0
75%,58.0
max,83.0


In [28]:
import re

def extract_main_subject(str):
    s = str.upper()
    # remover caracteres do começo => !"*/
    s = re.sub("^[!*/\"]+ ?", "", s)
    # remover termo "1C:" do começo do nome
    s = re.sub("^1C.", "", s)
    # remover termo "THE" do começo do nome
    s = re.sub("^THE ", "", s)
    # obter primeira palavra em maiúsculo
    s = s.split()[0]
    # substituir caracteres => '`’
    s = re.sub("['`’]", "_", s)
    # remover caracteres do fim da palavra => :.®,!
    s = re.sub("[:.,!®]$", "", s)
    return s
    
df_items['subject_name'] = \
    df_items['item_name' + trcol].apply(
        lambda s: extract_main_subject(s))

df_items.head()

Unnamed: 0_level_0,item_name,item_category_id,subject_name
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,ВО
1,!ABBYY FineReader 12 Professional Edition Full...,76,ABBYY
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40,В
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40,ГОЛУБАЯ
4,***КОРОБКА (СТЕКЛО) D,40,КОРОБКА


In [29]:
df_items.tail()

Unnamed: 0_level_0,item_name,item_category_id,subject_name
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
22165,"Ядерный титбит 2 [PC, Цифровая версия]",31,ЯДЕРНЫЙ
22166,Язык запросов 1С:Предприятия [Цифровая версия],54,ЯЗЫК
22167,Язык запросов 1С:Предприятия 8 (+CD). Хрустале...,49,ЯЗЫК
22168,Яйцо для Little Inu,62,ЯЙЦО
22169,Яйцо дракона (Игра престолов),69,ЯЙЦО


In [30]:
df_items.groupby('subject_name').count().head(10)

Unnamed: 0_level_0,item_name,item_category_id
subject_name,Unnamed: 1_level_1,Unnamed: 2_level_1
007,5,5
1+1,2,2
10,6,6
100,20,20
1000,2,2
10000,1,1
101,8,8
11,2,2
11-11-11,1,1
12,8,8


In [31]:
df_items['subject_code'] = (
    df_items['subject_name'].astype('category').cat.codes + 1).astype('category')

df_items.head()

Unnamed: 0_level_0,item_name,item_category_id,subject_name,subject_code
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,ВО,1803
1,!ABBYY FineReader 12 Professional Edition Full...,76,ABBYY,111
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40,В,1614
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40,ГОЛУБАЯ,1958
4,***КОРОБКА (СТЕКЛО) D,40,КОРОБКА,2630


In [32]:
df_items.tail()

Unnamed: 0_level_0,item_name,item_category_id,subject_name,subject_code
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
22165,"Ядерный титбит 2 [PC, Цифровая версия]",31,ЯДЕРНЫЙ,4646
22166,Язык запросов 1С:Предприятия [Цифровая версия],54,ЯЗЫК,4648
22167,Язык запросов 1С:Предприятия 8 (+CD). Хрустале...,49,ЯЗЫК,4648
22168,Яйцо для Little Inu,62,ЯЙЦО,4649
22169,Яйцо дракона (Игра престолов),69,ЯЙЦО,4649


In [33]:
df_items.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22170 entries, 0 to 22169
Data columns (total 4 columns):
item_name           22170 non-null object
item_category_id    22170 non-null int8
subject_name        22170 non-null object
subject_code        22170 non-null category
dtypes: category(1), int8(1), object(2)
memory usage: 780.9+ KB


In [34]:
df_items.describe()

Unnamed: 0,item_category_id
count,22170.0
mean,46.2908
std,15.9415
min,0.0
25%,37.0
50%,40.0
75%,58.0
max,83.0


In [35]:
# join items + categories:item_category_id => group_code, subgroup_code
df_items2 = pd.merge(df_items, #.reset_index(),
                     df_categories, how='left', on='item_category_id')
df_items2.index.names = ['item_id']
df_items2.head()

Unnamed: 0_level_0,item_name,item_category_id,subject_name,subject_code,item_category_name,group_name,subgroup_name,group_code,subgroup_code
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,ВО,1803,Кино - DVD,КИНО,DVD,11,8
1,!ABBYY FineReader 12 Professional Edition Full...,76,ABBYY,111,Программы - Для дома и офиса (Цифра),ПРОГРАММЫ,ДЛЯ ДОМА И ОФИСА,15,31
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40,В,1614,Кино - DVD,КИНО,DVD,11,8
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40,ГОЛУБАЯ,1958,Кино - DVD,КИНО,DVD,11,8
4,***КОРОБКА (СТЕКЛО) D,40,КОРОБКА,2630,Кино - DVD,КИНО,DVD,11,8


In [36]:
del(df_items)
del(df_categories)

In [37]:
#df_items2.set_index(['item_id'], inplace=True)
df_items2['item_category_id'] = df_items2['item_category_id'].astype('category')

In [38]:
df_items2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22170 entries, 0 to 22169
Data columns (total 9 columns):
item_name             22170 non-null object
item_category_id      22170 non-null category
subject_name          22170 non-null object
subject_code          22170 non-null category
item_category_name    22170 non-null object
group_name            22170 non-null object
subgroup_name         22170 non-null object
group_code            22170 non-null category
subgroup_code         22170 non-null category
dtypes: category(4), object(5)
memory usage: 1.3+ MB


In [39]:
df_items2.describe()

Unnamed: 0,item_name,item_category_id,subject_name,subject_code,item_category_name,group_name,subgroup_name,group_code,subgroup_code
count,22170,22170,22170,22170,22170,22170,22170,22170,22170
unique,22170,84,4655,4655,84,18,55,18,55
top,"Билет ""ИгроМир 2015"" - 2 октября 2015 (сайт) У...",40,ФИГУРКА,4348,Кино - DVD,КИНО,DVD,11,8
freq,1,5035,599,599,5035,7464,5035,7464,5035


In [40]:
df_items2.to_csv('items_full.csv')

In [41]:
!head items_full.csv

item_id,item_name,item_category_id,subject_name,subject_code,item_category_name,group_name,subgroup_name,group_code,subgroup_code
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.)         D,40,ВО,1803,Кино - DVD,КИНО,DVD,11,8
1,"!ABBYY FineReader 12 Professional Edition Full [PC, Цифровая версия]",76,ABBYY,111,Программы - Для дома и офиса (Цифра),ПРОГРАММЫ,ДЛЯ ДОМА И ОФИСА,15,31
2,***В ЛУЧАХ СЛАВЫ   (UNV)                    D,40,В,1614,Кино - DVD,КИНО,DVD,11,8
3,***ГОЛУБАЯ ВОЛНА  (Univ)                      D,40,ГОЛУБАЯ,1958,Кино - DVD,КИНО,DVD,11,8
4,***КОРОБКА (СТЕКЛО)                       D,40,КОРОБКА,2630,Кино - DVD,КИНО,DVD,11,8
5,***НОВЫЕ АМЕРИКАНСКИЕ ГРАФФИТИ  (UNI)             D,40,НОВЫЕ,3186,Кино - DVD,КИНО,DVD,11,8
6,***УДАР ПО ВОРОТАМ (UNI)               D,40,УДАР,4281,Кино - DVD,КИНО,DVD,11,8
7,***УДАР ПО ВОРОТАМ-2 (UNI)               D,40,УДАР,4281,Кино - DVD,КИНО,DVD,11,8
8,***ЧАЙ С МУССОЛИНИ                     D,40,ЧАЙ,4473,Кино - DVD,КИНО,DVD,11,8


In [42]:
!rm -f items_full.csv.bz2 && bzip2 -9 items_full.csv

## Date Blocks

In [34]:
# criar dataframe com possíveis meses
df_dates = pd.DataFrame({'last_date': pd.date_range(start='2013-01-01', periods=35, freq='M')})
df_dates.index.names = ['date_block_num']
df_dates['first_date'] = df_dates['last_date'].values.astype('datetime64[M]')
df_dates['days_count'] = ((df_dates['last_date'] - df_dates['first_date']).dt.days + 1).astype(np.int8)
df_dates['year'] = df_dates['last_date'].dt.year
df_dates['month'] = df_dates['last_date'].dt.month
df_dates.head()

Unnamed: 0_level_0,last_date,first_date,days_count,year,month
date_block_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2013-01-31,2013-01-01,31,2013,1
1,2013-02-28,2013-02-01,28,2013,2
2,2013-03-31,2013-03-01,31,2013,3
3,2013-04-30,2013-04-01,30,2013,4
4,2013-05-31,2013-05-01,31,2013,5


In [35]:
df_dates.tail()

Unnamed: 0_level_0,last_date,first_date,days_count,year,month
date_block_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
30,2015-07-31,2015-07-01,31,2015,7
31,2015-08-31,2015-08-01,31,2015,8
32,2015-09-30,2015-09-01,30,2015,9
33,2015-10-31,2015-10-01,31,2015,10
34,2015-11-30,2015-11-01,30,2015,11


In [36]:
df_dates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 5 columns):
last_date     35 non-null datetime64[ns]
first_date    35 non-null datetime64[ns]
days_count    35 non-null int8
year          35 non-null int64
month         35 non-null int64
dtypes: datetime64[ns](2), int64(2), int8(1)
memory usage: 1.2 KB


In [37]:
#TODO: criar colunas sun_cnt, mon_cnt, ..., sat_cnt

In [38]:
df_dates.to_csv('date_blocks.csv')

In [39]:
!head date_blocks.csv

date_block_num,last_date,first_date,days_count,year,month
0,2013-01-31,2013-01-01,31,2013,1
1,2013-02-28,2013-02-01,28,2013,2
2,2013-03-31,2013-03-01,31,2013,3
3,2013-04-30,2013-04-01,30,2013,4
4,2013-05-31,2013-05-01,31,2013,5
5,2013-06-30,2013-06-01,30,2013,6
6,2013-07-31,2013-07-01,31,2013,7
7,2013-08-31,2013-08-01,31,2013,8
8,2013-09-30,2013-09-01,30,2013,9
