# Predict Future Sales

- https://www.kaggle.com/c/competitive-data-science-predict-future-sales/

### Data files
- item_categories.csv
- items.csv
- sales_train.csv
- sample_submission.csv
- shops.csv
- test.csv

In [1]:
import pandas as pd

In [5]:
input_prefix = 'data/'

## Shops

In [14]:
df_shops = pd.read_csv(input_prefix + 'shops.csv', index_col='shop_id')
print(df_shops.shape)
print(df_shops.describe())
df_shops.head()

(60, 1)
                     shop_name
count                       60
unique                      60
top     СПб ТК "Невский Центр"
freq                         1


Unnamed: 0_level_0,shop_name
shop_id,Unnamed: 1_level_1
0,"!Якутск Орджоникидзе, 56 фран"
1,"!Якутск ТЦ ""Центральный"" фран"
2,"Адыгея ТЦ ""Мега"""
3,"Балашиха ТРК ""Октябрь-Киномир"""
4,"Волжский ТЦ ""Волга Молл"""


### Extract city name from the shop name

- 'Москва ТРК "Атриум"' => 'Москва'
- 'Н.Новгород ТРЦ "Фантастика" => 'Н.Новгород'

In [19]:
df_shops['city_name'] = df_shops['shop_name'].apply(lambda s: s.split()[0])
df_shops.head()

Unnamed: 0_level_0,shop_name,city_name
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"!Якутск Орджоникидзе, 56 фран",!Якутск
1,"!Якутск ТЦ ""Центральный"" фран",!Якутск
2,"Адыгея ТЦ ""Мега""",Адыгея
3,"Балашиха ТРК ""Октябрь-Киномир""",Балашиха
4,"Волжский ТЦ ""Волга Молл""",Волжский


In [28]:
df_shops.groupby('city_name').count().head(10)

Unnamed: 0_level_0,shop_name
city_name,Unnamed: 1_level_1
!Якутск,2
Адыгея,1
Балашиха,1
Волжский,1
Вологда,1
Воронеж,3
Выездная,1
Жуковский,2
Интернет-магазин,1
Казань,2


In [31]:
df_shops.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60 entries, 0 to 59
Data columns (total 2 columns):
shop_name    60 non-null object
city_name    60 non-null object
dtypes: object(2)
memory usage: 960.0+ bytes


## Item categories

In [42]:
df_categories = pd.read_csv(input_prefix + 'item_categories.csv', index_col='item_category_id')
print(df_categories.shape)
print(df_categories.describe())
df_categories.head()

(84, 1)
         item_category_name
count                    84
unique                   84
top     Аксессуары - PSVita
freq                      1


Unnamed: 0_level_0,item_category_name
item_category_id,Unnamed: 1_level_1
0,PC - Гарнитуры/Наушники
1,Аксессуары - PS2
2,Аксессуары - PS3
3,Аксессуары - PS4
4,Аксессуары - PSP


### Extract group and subgroup names from item category name

- 'Игровые консоли - PS4' => 'Игровые консоли'
- 'Карты оплаты - Windows (Цифра)' => 'Карты оплаты'
- 'Книги - Комиксы, манга' => 'Книги'

In [44]:
df_categories['group_name'] = df_categories['item_category_name'].apply(lambda s: s.split(' - ')[0])
df_categories.head()

Unnamed: 0_level_0,item_category_name,group_name
item_category_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,PC - Гарнитуры/Наушники,PC
1,Аксессуары - PS2,Аксессуары
2,Аксессуары - PS3,Аксессуары
3,Аксессуары - PS4,Аксессуары
4,Аксессуары - PSP,Аксессуары


In [45]:
df_categories.groupby('group_name').count().head(10)

Unnamed: 0_level_0,item_category_name
group_name,Unnamed: 1_level_1
PC,1
Аксессуары,7
Билеты (Цифра),1
Доставка товара,1
Игровые консоли,8
Игры,8
Игры Android,1
Игры MAC,1
Игры PC,4
Карты оплаты,4


In [46]:
def extract_subgroup(s):
    gs = s.split(' - ')
    return gs[1] if len(gs) > 1 else ''
    
df_categories['subgroup_name'] = df_categories['item_category_name'].apply(lambda s: extract_subgroup(s))
df_categories.head()

Unnamed: 0_level_0,item_category_name,group_name,subgroup_name
item_category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,PC - Гарнитуры/Наушники,PC,Гарнитуры/Наушники
1,Аксессуары - PS2,Аксессуары,PS2
2,Аксессуары - PS3,Аксессуары,PS3
3,Аксессуары - PS4,Аксессуары,PS4
4,Аксессуары - PSP,Аксессуары,PSP


In [47]:
df_categories.groupby('subgroup_name').count().head(10)

Unnamed: 0_level_0,item_category_name,group_name
subgroup_name,Unnamed: 1_level_1,Unnamed: 2_level_1
,7,7
1С:Предприятие 8,1,1
Blu-Ray,1,1
Blu-Ray 3D,1,1
Blu-Ray 4K,1,1
CD локального производства,1,1
CD фирменного производства,1,1
DVD,1,1
Live!,1,1
Live! (Цифра),1,1


In [48]:
df_categories.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84 entries, 0 to 83
Data columns (total 3 columns):
item_category_name    84 non-null object
group_name            84 non-null object
subgroup_name         84 non-null object
dtypes: object(3)
memory usage: 1.6+ KB


## Items

In [10]:
df_items = pd.read_csv(input_prefix + 'items.csv', index_col='item_id')
print(df_items.shape)
print(df_items.describe())
df_items.head()

(22170, 2)
       item_category_id
count      22170.000000
mean          46.290753
std           15.941486
min            0.000000
25%           37.000000
50%           40.000000
75%           58.000000
max           83.000000


Unnamed: 0_level_0,item_name,item_category_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40
1,!ABBYY FineReader 12 Professional Edition Full...,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40
4,***КОРОБКА (СТЕКЛО) D,40


In [52]:
df_items['item_category_id'] = df_items['item_category_id'].astype('category')
df_items.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22170 entries, 0 to 22169
Data columns (total 2 columns):
item_name           22170 non-null object
item_category_id    22170 non-null category
dtypes: category(1), object(1)
memory usage: 284.1+ KB


## Sales (training)

In [11]:
df_train = pd.read_csv(input_prefix + 'sales_train.csv')
print(df_train.shape)
print(df_train.describe())
df_train.head()

(2935849, 6)
       date_block_num       shop_id       item_id    item_price  item_cnt_day
count    2.935849e+06  2.935849e+06  2.935849e+06  2.935849e+06  2.935849e+06
mean     1.456991e+01  3.300173e+01  1.019723e+04  8.908532e+02  1.242641e+00
std      9.422988e+00  1.622697e+01  6.324297e+03  1.729800e+03  2.618834e+00
min      0.000000e+00  0.000000e+00  0.000000e+00 -1.000000e+00 -2.200000e+01
25%      7.000000e+00  2.200000e+01  4.476000e+03  2.490000e+02  1.000000e+00
50%      1.400000e+01  3.100000e+01  9.343000e+03  3.990000e+02  1.000000e+00
75%      2.300000e+01  4.700000e+01  1.568400e+04  9.990000e+02  1.000000e+00
max      3.300000e+01  5.900000e+01  2.216900e+04  3.079800e+05  2.169000e+03


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [56]:
#TODO: converter 'date' para tipo data
#TODO: mesclar com tabelas de lojas, itens e categorias
#TODO: verificar o que significam contagens negativas

In [55]:
#TODO: criar colunas contendo dia da semana a partir de 'date'
#TODO: baixar calendário de feriados na Rússia:
#      https://www.google.com/search?q=holidays+calendar+in+russia+format%3Acsv
#TODO: criar colunas indicando feriado
#TODO: criar colunas indicando véspera ou pós-feriado

In [53]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
date              object
date_block_num    int64
shop_id           int64
item_id           int64
item_price        float64
item_cnt_day      float64
dtypes: float64(2), int64(3), object(1)
memory usage: 123.2+ MB


## Sales (testing)

In [12]:
df_test = pd.read_csv(input_prefix + 'test.csv')
print(df_test.shape)
print(df_test.describe())
df_test.head()

(214200, 3)
                  ID        shop_id        item_id
count  214200.000000  214200.000000  214200.000000
mean   107099.500000      31.642857   11019.398627
std     61834.358168      17.561933    6252.644590
min         0.000000       2.000000      30.000000
25%     53549.750000      16.000000    5381.500000
50%    107099.500000      34.500000   11203.000000
75%    160649.250000      47.000000   16071.500000
max    214199.000000      59.000000   22167.000000


Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [None]:
#TODO: verificar o que está sendo esperado (acumulado mensal por loja e produto?)

In [54]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214200 entries, 0 to 214199
Data columns (total 3 columns):
ID         214200 non-null int64
shop_id    214200 non-null int64
item_id    214200 non-null int64
dtypes: int64(3)
memory usage: 4.9 MB
