# Module Importing

In [2]:
import os

# The data files below are addressed relative to the project root ('data/...'),
# which this cell reaches by moving one directory up from where the kernel starts.
# The root is remembered the first time the cell runs, so re-running the cell
# returns to that same directory instead of climbing one level higher each time.
if '_PROJECT_ROOT' not in globals():
    os.chdir('..')
    _PROJECT_ROOT = os.getcwd()
else:
    os.chdir(_PROJECT_ROOT)

In [54]:
import datetime

In [3]:
import pandas as pd
import numpy as np

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Select the inline matplotlib backend so figures are rendered in the cell output.
%matplotlib inline

# Data Loading

## Data fields
+ **ID** - an Id that represents a (Shop, Item) tuple within the test set
+ **shop_id** - unique identifier of a shop
+ **item_id** - unique identifier of a product
+ **item_category_id** - unique identifier of item category
+ **item_cnt_day** - number of products sold. You are predicting a monthly amount of this measure
+ **item_price** - current price of an item
+ **date** - date in format dd.mm.yyyy
+ **date_block_num** - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33
+ **item_name** - name of item
+ **shop_name** - name of shop
+ **item_category_name** - name of item category

## Sales_train
**(the training set. Daily historical data from January 2013 to October 2015)**

In [7]:
# Load the daily sales records (path is relative to the current working directory).
sales_train_df = pd.read_csv(filepath_or_buffer='data/sales_train.csv')

In [8]:
# Preview the first five rows of the sales table.
sales_train_df.head(n=5)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [9]:
# Report (row count, column count) of the sales table.
print('Shape of sales_train.csv is (%d, %d)' % (len(sales_train_df), len(sales_train_df.columns)))

Shape of sales_train.csv is (2935849, 6)


## Items
**(supplemental information about the items/products)**

In [10]:
# Load the item catalogue (path is relative to the current working directory).
items_df = pd.read_csv(filepath_or_buffer='data/items.csv')

In [11]:
# Preview the first five rows of the item catalogue.
items_df.head(n=5)

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [12]:
# Report (row count, column count) of the item catalogue.
print('Shape of items.csv is (%d, %d)' % (len(items_df), len(items_df.columns)))

Shape of items.csv is (22170, 3)


## Item_categories
**(supplemental information about the items categories)**

In [13]:
# Load the item-category lookup table (path is relative to the current working directory).
item_categories_df = pd.read_csv(filepath_or_buffer='data/item_categories.csv')

In [14]:
# Preview the first five rows of the category table.
item_categories_df.head(n=5)

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [15]:
# Report (row count, column count) of the category table.
print('Shape of item_categories.csv is (%d, %d)' % (len(item_categories_df), len(item_categories_df.columns)))

Shape of item_categories.csv is (84, 2)


## Shops
**(supplemental information about the shops)**

In [16]:
# Load the shop lookup table (path is relative to the current working directory).
shops_df = pd.read_csv(filepath_or_buffer='data/shops.csv')

In [17]:
# Preview the first five rows of the shop table.
shops_df.head(n=5)

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [18]:
# Report (row count, column count) of the shop table.
print('Shape of shops.csv is (%d, %d)' % (len(shops_df), len(shops_df.columns)))

Shape of shops.csv is (60, 2)


## test
**(the test set. You need to forecast the sales for these shops and products for November 2015)**

In [19]:
# Load the test set of (shop, item) pairs (path is relative to the current working directory).
test_df = pd.read_csv(filepath_or_buffer='data/test.csv')

In [20]:
# Preview the first five rows of the test set.
test_df.head(n=5)

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [21]:
# Report (row count, column count) of the test set.
print('Shape of test.csv is (%d, %d)' % (len(test_df), len(test_df.columns)))

Shape of test.csv is (214200, 3)


# Basic EDA

# Basic data preparation

## Sales train

### 1.  Adding revenue column

In [24]:
# revenue = item_price multiplied element-wise by item_cnt_day
# (rows with a negative count therefore get a negative revenue).
sales_train_df['revenue'] = sales_train_df['item_price'].mul(sales_train_df['item_cnt_day'])

In [28]:
# Preview the sales table again, now including the 'revenue' column.
sales_train_df.head(n=5)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,revenue
0,02.01.2013,0,59,22154,999.0,1.0,999.0
1,03.01.2013,0,25,2552,899.0,1.0,899.0
2,05.01.2013,0,25,2552,899.0,-1.0,-899.0
3,06.01.2013,0,25,2554,1709.05,1.0,1709.05
4,15.01.2013,0,25,2555,1099.0,1.0,1099.0


### 2.  Converting date column to datetime

In [55]:
# Parse the 'dd.mm.yyyy' strings into datetime values in a single vectorised call
# rather than one Python-level strptime call per row.
# pd.to_datetime also accepts a column that is already datetime, so this cell can
# be re-run safely (strptime would raise TypeError on the second run).
sales_train_df['date'] = pd.to_datetime(sales_train_df['date'], format='%d.%m.%Y')

### 3.  Adding information about category

In [57]:
# Attach each item's category id to the sales rows.
# This is an explicit key-to-key merge on 'item_id'. DataFrame.join(..., on='item_id')
# would instead match sales 'item_id' values against items_df's row index, which is
# only correct while items.csv is ordered so that row position equals item_id.
# - Dropping any existing 'item_category_id' first keeps the cell safe to re-run.
# - how='left' keeps every sales row, in its original order.
# - validate='many_to_one' raises if items_df lists an item_id more than once,
#   which would otherwise silently duplicate sales rows.
sales_train_df = (
    sales_train_df
    .drop('item_category_id', axis=1, errors='ignore')
    .merge(items_df[['item_id', 'item_category_id']],
           on='item_id', how='left', validate='many_to_one')
)

### 4.  Creating monthly sales table

In [63]:
# Collapse the daily rows into one row per (date_block_num, shop, category, item):
#   item_price   -> median and mean
#   item_cnt_day -> sum and number of rows
# The result has two-level column labels (source column, aggregation).
month_sales_df = (
    sales_train_df
    .sort_values(by='date')
    .groupby(by=['date_block_num', 'shop_id', 'item_category_id', 'item_id'],
             as_index=False)
    .agg({'item_price': ['median', 'mean'],
          'item_cnt_day': ['sum', 'count']})
)

In [59]:
# Preview the aggregated table (columns are still two-level at this point).
month_sales_df.head(n=5)

Unnamed: 0_level_0,date_block_num,shop_id,item_category_id,item_id,item_price,item_price,item_cnt_day,item_cnt_day
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,median,mean,sum,count
0,0,0,2,5572,1322.0,1322.0,10.0,6
1,0,0,2,5573,560.0,560.0,1.0,1
2,0,0,2,5575,806.0,806.0,4.0,3
3,0,0,2,5576,2231.0,2231.0,5.0,5
4,0,0,2,5609,2381.0,2381.0,1.0,1


In [64]:
# Replace the two-level column labels with flat names, assigned by position.
month_sales_df.columns = (
    ['date_block_num', 'shop_id', 'item_category_id', 'item_id']  # group keys
    + ['item_price_median', 'item_price_mean']                    # item_price: median, mean
    + ['item_cnt', 'transactions']                                # item_cnt_day: sum, count
)

In [65]:
# Preview the aggregated table with its flattened column names.
month_sales_df.head(n=5)

Unnamed: 0,date_block_num,shop_id,item_category_id,item_id,item_price_median,item_price_mean,item_cnt,transactions
0,0,0,2,5572,1322.0,1322.0,10.0,6
1,0,0,2,5573,560.0,560.0,1.0,1
2,0,0,2,5575,806.0,806.0,4.0,3
3,0,0,2,5576,2231.0,2231.0,5.0,5
4,0,0,2,5609,2381.0,2381.0,1.0,1
