In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# Display options for this notebook.
# The fully qualified option name is used on purpose: a bare 'precision'
# pattern is matched against every registered option and is rejected as
# ambiguous once more than one option name contains it
# (e.g. 'display.precision' and 'styler.format.precision').
pd.set_option('display.precision', 4)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [3]:
# Suffix appended to every input file name ('.bz2' for the compressed files,
# '' for plain CSVs).
bzfile = '.bz2'

# Base location of the input files: a local directory by default; the remote
# URL below is kept as a commented-out alternative.
input_prefix = 'data/'
#input_prefix = 'https://github.com/hjort/ai-labs/raw/master/jupyter/sales-russia/data/'

## Sales (testing)

In [4]:
# Load the test set, indexed by ID, with narrow integer dtypes for the two
# identifier columns. Add nrows=... to the call to read only a sample.
df_test = pd.read_csv(
    filepath_or_buffer=input_prefix + 'test.csv' + bzfile,
    index_col='ID',
    dtype={'shop_id': np.int8, 'item_id': np.int16},
)
print('shape:', df_test.shape)
df_test.head()

shape: (214200, 2)


Unnamed: 0_level_0,shop_id,item_id
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5,5037
1,5,5320
2,5,5233
3,5,5232
4,5,5268


In [5]:
# Show index range, column dtypes, non-null counts and memory usage of the test set.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214200 entries, 0 to 214199
Data columns (total 2 columns):
shop_id    214200 non-null int8
item_id    214200 non-null int16
dtypes: int16(1), int8(1)
memory usage: 2.2 MB


In [6]:
# Summary statistics of the numeric columns of the test set.
df_test.describe()

Unnamed: 0,shop_id,item_id
count,214200.0,214200.0
mean,31.6429,11019.3986
std,17.5619,6252.6446
min,2.0,30.0
25%,16.0,5381.5
50%,34.5,11203.0
75%,47.0,16071.5
max,59.0,22167.0


In [7]:
# Build a skeleton frame without any quantity columns: one row for every
# combination of a test row (shop_id, item_id) and a month number 0..33.
# Both sides carry a constant helper column 'key', so merging on it pairs
# every test row with every month; the helper is dropped afterwards and the
# three identifiers become the index.
df_zeroed = (
    df_test
    .assign(key=0)
    .merge(
        pd.DataFrame({'date_block_num': np.arange(0, 34), 'key': np.zeros(34, dtype=int)}),
        how='left',
        on='key',
    )
    .set_index(['shop_id', 'item_id', 'date_block_num'])
    .drop(['key'], axis=1)
)
df_zeroed.head()

shop_id,item_id,date_block_num
5,5037,0
5,5037,1
5,5037,2
5,5037,3
5,5037,4


In [8]:
# Check the size of the skeleton; it carries index levels only, no data columns.
df_zeroed.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 7282800 entries, (5, 5037, 0) to (45, 969, 33)
Empty DataFrame

In [9]:
# Confirm the names of the index levels of the skeleton.
df_zeroed.index.names

FrozenList(['shop_id', 'item_id', 'date_block_num'])

## Sales (monthly)

In [10]:
# Load the monthly sales file, indexed by (shop_id, item_id, date_block_num).
# The path goes one directory above input_prefix ('../').
# Add nrows=... to the call to read only the first rows while experimenting.
df_monthly = pd.read_csv(
    filepath_or_buffer=input_prefix + '../sales_monthly.csv' + bzfile,
    index_col=['shop_id', 'item_id', 'date_block_num'],
    dtype={
        'date_block_num': np.int8,
        'shop_id': np.int8,
        'item_id': np.int16,
        'item_price_mean': np.int32,
        'item_cnt_sum': np.int32,
    },
)
print('shape:', df_monthly.shape)
df_monthly.head()

  mask |= (ar1 == a)


shape: (1608226, 2)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_price_mean,item_cnt_sum
shop_id,item_id,date_block_num,Unnamed: 3_level_1,Unnamed: 4_level_1
0,30,1,265,31
0,31,1,434,11
0,32,0,221,6
0,32,1,221,10
0,33,0,347,3


In [11]:
# Show index range, column dtypes, non-null counts and memory usage of the monthly sales.
df_monthly.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1608226 entries, (0, 30, 1) to (59, 22167, 17)
Data columns (total 2 columns):
item_price_mean    1608226 non-null int32
item_cnt_sum       1608226 non-null int32
dtypes: int32(2)
memory usage: 18.6 MB


In [12]:
# Summary statistics of item_price_mean and item_cnt_sum.
df_monthly.describe()

Unnamed: 0,item_price_mean,item_cnt_sum
count,1608200.0,1608200.0
mean,790.2,2.2732
std,1549.3,8.6532
min,1.0,1.0
25%,199.0,1.0
50%,399.0,1.0
75%,895.0,2.0
max,307980.0,2253.0


In [13]:
# Confirm the names of the index levels of the monthly sales frame.
df_monthly.index.names

FrozenList(['shop_id', 'item_id', 'date_block_num'])

## Zeroed joined Monthly Sales

In [14]:
# Left-join the monthly sales onto the skeleton by index, then replace the
# NaNs left for rows without a matching sales record with 0.
df_train2 = (
    df_zeroed
    .merge(df_monthly, how='left', left_index=True, right_index=True)
    .fillna(0)
)
df_train2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_price_mean,item_cnt_sum
shop_id,item_id,date_block_num,Unnamed: 3_level_1,Unnamed: 4_level_1
5,5037,0,0.0,0.0
5,5037,1,0.0,0.0
5,5037,2,0.0,0.0
5,5037,3,0.0,0.0
5,5037,4,0.0,0.0


In [15]:
# Cast both value columns to int32 (the preceding preview shows them as floats).
df_train2 = df_train2.astype({
    'item_price_mean': np.int32,
    'item_cnt_sum': np.int32,
})

In [16]:
# Preview the first ten rows of the merged frame.
df_train2.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_price_mean,item_cnt_sum
shop_id,item_id,date_block_num,Unnamed: 3_level_1,Unnamed: 4_level_1
5,5037,0,0,0
5,5037,1,0,0
5,5037,2,0,0
5,5037,3,0,0
5,5037,4,0,0
5,5037,5,0,0
5,5037,6,0,0
5,5037,7,0,0
5,5037,8,0,0
5,5037,9,0,0


In [17]:
# Show index range, column dtypes and memory usage of the merged frame.
df_train2.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 7282800 entries, (5, 5037, 0) to (45, 969, 33)
Data columns (total 2 columns):
item_price_mean    int32
item_cnt_sum       int32
dtypes: int32(2)
memory usage: 339.4 MB


In [18]:
# Summary statistics of the merged frame, including the zero-filled rows.
df_train2.describe()

Unnamed: 0,item_price_mean,item_cnt_sum
count,7282800.0,7282800.0
mean,75.048,0.22255
std,513.99,3.3251
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,42990.0,2253.0


In [19]:
# Confirm the names of the index levels of the merged frame.
df_train2.index.names

FrozenList(['shop_id', 'item_id', 'date_block_num'])

## Shops

In [20]:
# Load the shops file, indexed by shop_id, reading city_code as a
# categorical column. The path goes one directory above input_prefix.
df_shops = pd.read_csv(
    filepath_or_buffer=input_prefix + '../shops_full' + '.csv' + bzfile,
    index_col='shop_id',
    dtype={'city_code': 'category'},
)
print('shape:', df_shops.shape)
df_shops.head()

shape: (60, 3)


Unnamed: 0_level_0,shop_name,city_name,city_code
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"!Якутск Орджоникидзе, 56 фран",!Якутск,1
1,"!Якутск ТЦ ""Центральный"" фран",!Якутск,1
2,"Адыгея ТЦ ""Мега""",Адыгея,2
3,"Балашиха ТРК ""Октябрь-Киномир""",Балашиха,3
4,"Волжский ТЦ ""Волга Молл""",Волжский,4


In [21]:
# Remove the free-text name columns from the shops frame.
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' skips columns that are already gone, so re-running this
# cell no longer raises KeyError.
df_shops = df_shops.drop(columns=['shop_name', 'city_name'], errors='ignore')
df_shops.head()

Unnamed: 0_level_0,city_code
shop_id,Unnamed: 1_level_1
0,1
1,1
2,2
3,3
4,4


In [22]:
# Show index range, column dtypes and memory usage of the shops frame.
df_shops.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60 entries, 0 to 59
Data columns (total 1 columns):
city_code    60 non-null category
dtypes: category(1)
memory usage: 1.4 KB


In [23]:
# Count, number of unique values, most frequent value and its frequency per column.
df_shops.describe()

Unnamed: 0,city_code
count,60
unique,32
top,15
freq,13


## Items and Categories

In [24]:
# Load the items file, indexed by item_id, reading the four code columns as
# categoricals. The path goes one directory above input_prefix.
df_items = pd.read_csv(
    filepath_or_buffer=input_prefix + '../items_full' + '.csv' + bzfile,
    index_col='item_id',
    dtype={
        'item_category_id': 'category',
        'subject_code': 'category',
        'group_code': 'category',
        'subgroup_code': 'category',
    },
)
print('shape:', df_items.shape)
df_items.head()

shape: (22170, 9)


Unnamed: 0_level_0,item_name,item_category_id,subject_name,subject_code,item_category_name,group_name,subgroup_name,group_code,subgroup_code
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40,ВО,1803,Кино - DVD,КИНО,DVD,11,8
1,!ABBYY FineReader 12 Professional Edition Full...,76,ABBYY,111,Программы - Для дома и офиса (Цифра),ПРОГРАММЫ,ДЛЯ ДОМА И ОФИСА,15,31
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40,В,1614,Кино - DVD,КИНО,DVD,11,8
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40,ГОЛУБАЯ,1958,Кино - DVD,КИНО,DVD,11,8
4,***КОРОБКА (СТЕКЛО) D,40,КОРОБКА,2630,Кино - DVD,КИНО,DVD,11,8


In [25]:
# Remove the free-text name columns from the items frame.
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' skips columns that are already gone, so re-running this
# cell no longer raises KeyError.
df_items = df_items.drop(
    columns=['item_name', 'subject_name', 'item_category_name', 'group_name', 'subgroup_name'],
    errors='ignore',
)
df_items.head()

Unnamed: 0_level_0,item_category_id,subject_code,group_code,subgroup_code
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,40,1803,11,8
1,76,111,15,31
2,40,1614,11,8
3,40,1958,11,8
4,40,2630,11,8


In [26]:
# Show index range, column dtypes and memory usage of the items frame.
df_items.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22170 entries, 0 to 22169
Data columns (total 4 columns):
item_category_id    22170 non-null category
subject_code        22170 non-null category
group_code          22170 non-null category
subgroup_code       22170 non-null category
dtypes: category(4)
memory usage: 399.6 KB


In [27]:
# Count, number of unique values, most frequent value and its frequency per column.
df_items.describe()

Unnamed: 0,item_category_id,subject_code,group_code,subgroup_code
count,22170,22170,22170,22170
unique,84,4655,18,55
top,40,4348,11,8
freq,5035,599,7464,5035


## Sales (full)

In [28]:
# Recall the layout of the sales frame before joining shop and item attributes.
df_train2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_price_mean,item_cnt_sum
shop_id,item_id,date_block_num,Unnamed: 3_level_1,Unnamed: 4_level_1
5,5037,0,0,0
5,5037,1,0,0
5,5037,2,0,0
5,5037,3,0,0
5,5037,4,0,0


In [29]:
# Same preview with the index levels turned into regular columns (display only; df_train2 is not modified).
df_train2.reset_index().head(10)

Unnamed: 0,shop_id,item_id,date_block_num,item_price_mean,item_cnt_sum
0,5,5037,0,0,0
1,5,5037,1,0,0
2,5,5037,2,0,0
3,5,5037,3,0,0
4,5,5037,4,0,0
5,5,5037,5,0,0
6,5,5037,6,0,0
7,5,5037,7,0,0
8,5,5037,8,0,0
9,5,5037,9,0,0


In [30]:
# index levels of df_train2: shop_id, item_id, date_block_num

In [31]:
# Attach the shop columns and then the item columns to every sales row.
# Both joins are left joins, so all rows of df_train2 are kept.
df_train3 = (
    df_train2
    .join(df_shops, how='left')
    .join(df_items, how='left')
)
df_train3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_price_mean,item_cnt_sum,city_code,item_category_id,subject_code,group_code,subgroup_code
shop_id,item_id,date_block_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5,5037,0,0,0,5,19,793,6,13
5,5037,1,0,0,5,19,793,6,13
5,5037,2,0,0,5,19,793,6,13
5,5037,3,0,0,5,19,793,6,13
5,5037,4,0,0,5,19,793,6,13


In [32]:
# Preview the first ten rows of the fully joined frame.
df_train3.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_price_mean,item_cnt_sum,city_code,item_category_id,subject_code,group_code,subgroup_code
shop_id,item_id,date_block_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5,5037,0,0,0,5,19,793,6,13
5,5037,1,0,0,5,19,793,6,13
5,5037,2,0,0,5,19,793,6,13
5,5037,3,0,0,5,19,793,6,13
5,5037,4,0,0,5,19,793,6,13
5,5037,5,0,0,5,19,793,6,13
5,5037,6,0,0,5,19,793,6,13
5,5037,7,0,0,5,19,793,6,13
5,5037,8,0,0,5,19,793,6,13
5,5037,9,0,0,5,19,793,6,13


In [33]:
# Write the fully joined frame to an uncompressed CSV in the working directory
# (the index levels are written too, since to_csv keeps the index by default).
df_train3.to_csv('train_full.csv')