In [1]:
# Run-level switches for this notebook.
# NOTE(review): `validation` and `reduce_size` are not read in the cells visible here —
# presumably they toggle a hold-out split and a reduced data sample; confirm downstream.
validation = False
reduce_size = False
# NOTE(review): likely the number of base models in a stacking setup — confirm where it is consumed.
num_first_level_models = 3
# NOTE(review): presumably the random seed for later stochastic steps — confirm.
seed = 0

In [2]:
import time
# Wall-clock reference point; the progress prints in later cells report minutes elapsed since this moment.
start_time = time.time()

In [3]:
import pandas as pd
import numpy as np
import gc
from tqdm import tqdm

In [4]:
# Raise pandas' display limits so larger frames are shown with fewer truncated rows/columns.
pd.set_option('display.max_rows', 99)
pd.set_option('display.max_columns', 50)
import warnings
# Installs a blanket "ignore" filter: every warning is hidden from here on, including
# deprecation notices from numpy/pandas. Remove this line when debugging or upgrading libraries.
warnings.filterwarnings('ignore')

In [5]:
import os
import subprocess
import sys

# Ask git for the repository top-level directory so the notebook can be run from any
# sub-directory of the checkout (requires `git` on PATH and a git working tree).
ROOT_DIR = subprocess.check_output(['git', 'rev-parse', '--show-toplevel']).rstrip().decode('utf-8')
# Make <repo>/code importable; the `config` module imported below is presumably located there.
sys.path.append(os.path.join(ROOT_DIR, 'code'))

import config

In [6]:
# Directory the input CSV files are read from, taken from the project config module.
data_path = config.DATA_DIR
# NOTE(review): empty string — presumably a path prefix for writing submissions
# (i.e. the current directory); confirm at the write site.
submission_path = ''

In [7]:
def downcast_dtypes(df):
    """Shrink wide numeric columns of ``df`` to narrower dtypes to save memory.

    ``float64`` columns become ``float32``. ``int64``/``int32`` columns become
    ``int16`` when every value fits in the int16 range; otherwise the narrowest
    of ``int32`` or the column's current dtype that can hold the data is kept,
    so out-of-range values are never silently wrapped around.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. Its columns are replaced in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    float_cols = [c for c in df if df[c].dtype == 'float64']
    int_cols   = [c for c in df if df[c].dtype in ['int64', 'int32']]

    for col in float_cols:
        df[col] = df[col].astype(np.float32)

    for col in int_cols:
        values = df[col]
        if values.empty:
            # Nothing can overflow in an empty column; use the smallest type.
            df[col] = values.astype(np.int16)
            continue

        lo, hi = values.min(), values.max()
        # Try the candidate types from narrowest to widest and stop at the
        # first one whose range covers the data. If none fits (values beyond
        # int32), the column is left untouched.
        for candidate in (np.int16, np.int32):
            limits = np.iinfo(candidate)
            if limits.min <= lo and hi <= limits.max:
                df[col] = values.astype(candidate)
                break

    return df

Load Data

In [8]:
# Read the raw sales log and the test table from data_path, reporting the
# elapsed minutes before and after so slow I/O shows up in the cell output.
_minutes_since_start = (time.time() - start_time) / 60
print('%0.2f min: Start loading data' % _minutes_since_start)

sale_train = pd.read_csv('%s/sales_train.csv.gz' % data_path)
test = pd.read_csv('%s/test.csv.gz' % data_path)

_minutes_since_start = (time.time() - start_time) / 60
print('%0.2f min: Finish loading data' % _minutes_since_start)

0.01 min: Start loading data
0.04 min: Finish loading data


In [9]:
# Parse the 'date' strings, formatted as day.month.four-digit-year, into datetime values.
sale_train['date'] = pd.to_datetime(sale_train['date'], format='%d.%m.%Y')

In [10]:
# TODO: correct sale_train values (placeholder cell — no corrections are implemented here yet)

Aggregate Data

In [11]:
from itertools import product

In [12]:
# Build the training grid: for every month (date_block_num), all combinations of
# the shops and the items that have at least one row in sale_train for that month.
grid = []
for block_num in sale_train['date_block_num'].unique():
    # Select the month's rows once and reuse them for both lookups, instead of
    # re-evaluating the same full-table boolean filter twice per iteration.
    cur_sales = sale_train[sale_train['date_block_num'] == block_num]
    cur_shops = cur_sales['shop_id'].unique()
    cur_items = cur_sales['item_id'].unique()
    # One (n_shops * n_items, 3) int32 array per month: shop_id, item_id, date_block_num.
    grid.append(np.array(list(product(cur_shops, cur_items, [block_num])), dtype='int32'))

In [13]:
# Key columns identifying one (shop, item, month) row of the grid.
index_cols = ['shop_id', 'item_id', 'date_block_num']
# Stack the per-month arrays into a single frame. Note that `grid` is rebound here from a
# list of arrays to a DataFrame, so re-running this cell requires re-running the grid-building cell first.
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

In [14]:
# Progress log: minutes elapsed since start_time.
print('%0.2f min: Finish creating the grid' % ((time.time() - start_time) / 60))

0.25 min: Finish creating the grid


In [15]:
# Monthly target: total units sold per (shop, item, month).
index_cols = ['shop_id', 'item_id', 'date_block_num']

# Cap each daily count to [0, 20] before summing, so negative counts are floored to 0.
# This overwrites the column in sale_train, so later cells see the clipped values.
sale_train['item_cnt_day'] = sale_train['item_cnt_day'].clip(0, 20)

gb_cnt = (
    sale_train.groupby(index_cols)['item_cnt_day']
    .sum()
    .reset_index(name='item_cnt_month')
)

# Cap the monthly total to the same [0, 20] range and store it as an integer.
# The builtin `int` is used because the `np.int` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
gb_cnt['item_cnt_month'] = gb_cnt['item_cnt_month'].clip(0, 20).astype(int)

In [16]:
# Attach the monthly targets to the full grid; grid rows with no matching
# aggregate get a target of 0.
train = grid.merge(gb_cnt, on=index_cols, how='left')
train = train.fillna(0)
# The left join leaves the target column as float; restore an integer dtype
# before shrinking all numeric columns.
train = downcast_dtypes(train.astype({'item_cnt_month': int}))

In [17]:
# Order rows by month, then shop, then item. The original index labels are kept.
train = train.sort_values(by=['date_block_num', 'shop_id', 'item_id'])
_minutes_since_start = (time.time() - start_time) / 60
print('%0.2f min: Finish joining gb_cnt' % _minutes_since_start)

0.39 min: Finish joining gb_cnt


In [18]:
# Sanity check on totals: clipped daily sales vs. the monthly targets in `train` and `gb_cnt`.
# The last two are expected to agree; the first can be larger because monthly sums were capped at 20.
sale_train['item_cnt_day'].sum(), train['item_cnt_month'].sum(), gb_cnt['item_cnt_month'].sum()

(3582136.0, 3261311, 3261311)

In [20]:
# Look up each item's category and attach it to both the train and test frames.
item = pd.read_csv('%s/items.csv' % data_path)

# Only the key and the category column are needed for the join.
_item_to_category = item[['item_id', 'item_category_id']]
train = train.merge(_item_to_category, how='left', on='item_id')
test = test.merge(_item_to_category, how='left', on='item_id')
del _item_to_category

_minutes_since_start = (time.time() - start_time) / 60
print('%0.2f min: Finish adding item_category_id' % _minutes_since_start)

3.95 min: Finish adding item_category_id


In [22]:
# Load the category table and pull the category names out as a plain Python list.
item_cat = pd.read_csv('%s/item_categories.csv' % data_path)
l_cat = list(item_cat.item_category_name)

In [24]:
from sklearn import preprocessing
# Encode each category name as an integer label and store it in a new 'item_cat_id_fix' column.
lb = preprocessing.LabelEncoder()
item_cat['item_cat_id_fix'] = lb.fit_transform(l_cat)

In [25]:
# Attach the re-encoded category id to both frames via item_category_id.
_cat_fix_lookup = item_cat[['item_cat_id_fix', 'item_category_id']]
train = train.merge(_cat_fix_lookup, how='left', on='item_category_id')
test = test.merge(_cat_fix_lookup, how='left', on='item_category_id')
del _cat_fix_lookup

In [26]:
# Drop the lookup tables and intermediates built above, then run the garbage collector
# so their memory can be reclaimed before the next stage.
del item, item_cat, grid, gb_cnt
gc.collect()
print('%0.2f min: Finish adding item_cat_id_fix' % ((time.time() - start_time) / 60))

48.06 min: Finish adding item_cat_id_fix


Add item/shop pair mean-encoding