In [1]:
import math
import os
import re
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
def read(file_name, data_dir='/Users/keiji/work/kaggle/sales1c/'):
    """Load a CSV from ``data_dir`` as a DataFrame, caching it as a pickle.

    The first call parses the CSV with ``pd.read_csv`` and writes the result
    next to it as ``<file_name>.pickle``. Later calls load that pickle instead
    of re-parsing the CSV. The cache is never invalidated: delete the
    ``.pickle`` file by hand if the CSV changes.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``data_dir`` (e.g. ``'items.csv'``).
    data_dir : str, optional
        Directory that holds the data files. The default is the original
        author's local directory; pass another directory to run elsewhere.
        A trailing path separator is optional.

    Returns
    -------
    pandas.DataFrame
        The table read from the pickle cache or, failing that, from the CSV.
    """
    csv_path = os.path.join(data_dir, file_name)
    cache_path = csv_path + '.pickle'
    if os.path.exists(cache_path):
        # NOTE: unpickling can execute arbitrary code. Only load cache files
        # that this function wrote itself; never a pickle from an untrusted source.
        return pd.read_pickle(cache_path)
    df = pd.read_csv(csv_path)
    df.to_pickle(cache_path)
    return df

# Load every raw table through the pickle-caching reader, in a fixed order.
df_icats, df_items, df_shops, df_test, df_sales = (
    read(csv_name)
    for csv_name in (
        'item_categories.csv',
        'items.csv',
        'shops.csv',
        'test.csv.gz',
        'sales_train.csv.gz',
    )
)

In [3]:
# Outlier removal: keep only rows with item_price below 60000 and
# item_cnt_day below 700. (Author's note: unclear whether this matters.)
is_regular_price = df_sales.item_price < 60000
is_regular_count = df_sales.item_cnt_day < 700
df_sales = df_sales[is_regular_price & is_regular_count]

In [4]:
# Revenue per sales row (price times quantity). Built with assign() so a new
# frame is bound to df_sales; writing a column directly into a frame that came
# out of a boolean filter makes pandas emit SettingWithCopyWarning.
df_sales = df_sales.assign(item_sales=df_sales.item_price * df_sales.item_cnt_day)

In [5]:
# Distinct (date_block_num, shop_id) and (date_block_num, item_id) pairs that
# occur in the sales data.
block_key = ['date_block_num']
unique_shop_id = df_sales[block_key + ['shop_id']].drop_duplicates().reset_index(drop=True)
unique_item_id = df_sales[block_key + ['item_id']].drop_duplicates().reset_index(drop=True)

# Training grid: for each of the 34 date blocks, every shop seen in that block
# paired with every item seen in that block.
df_train = pd.DataFrame({'date_block_num': np.arange(34)})
for per_block_ids in (unique_shop_id, unique_item_id):
    df_train = df_train.merge(per_block_ids, how='left', on='date_block_num')
df_train = df_train.sort_values(by=['date_block_num', 'shop_id', 'item_id'])
df_train = df_train.reset_index(drop=True)

# Totals per (date_block_num, shop_id, item_id): summed count, median price
# and summed revenue.
df_agg = (
    df_sales.groupby(['date_block_num', 'shop_id', 'item_id'])
    .agg({
        'item_cnt_day': 'sum',
        'item_price': 'median',
        'item_sales': 'sum'})
    .reset_index()
    .rename(columns={
        'item_cnt_day': 'item_cnt_month',
        'item_price': 'median_price',
        'item_sales': 'item_sales_month'})
)
df_train = df_train.merge(df_agg, how='left', on=['date_block_num', 'shop_id', 'item_id'])

# Grid rows without any sale come out of the left merge as NaN; count them as
# zero. The filled column is assigned back explicitly: the previous
# `df_train.col.fillna(..., inplace=True)` form fills a temporary Series, which
# pandas warns about and which leaves df_train unchanged under copy-on-write.
df_train['item_cnt_month'] = df_train['item_cnt_month'].fillna(0.0)
df_train['item_sales_month'] = df_train['item_sales_month'].fillna(0.0)

# Totals per (date_block_num, item_id) across all shops: summed count, median
# price and summed revenue.
df_agg = (
    df_sales.groupby(['date_block_num', 'item_id'])
    .agg({
        'item_cnt_day': 'sum',
        'item_price': 'median',
        'item_sales': 'sum'})
    .reset_index()
    .rename(columns={
        'item_cnt_day': 'item_cnt_month_allshops',
        'item_price': 'median_price_allshops',
        'item_sales': 'item_sales_month_allshops'})
)
df_train = df_train.merge(df_agg, how='left', on=['date_block_num', 'item_id'])

# Where a shop has no price of its own for the item in that block, fall back to
# the all-shops median. Assigned back explicitly because an in-place fillna on
# `df_train.median_price` fills a temporary Series and is not guaranteed to
# update df_train (it is a no-op under pandas copy-on-write).
df_train['median_price'] = df_train['median_price'].fillna(df_train['median_price_allshops'])

df_train.head().T

Unnamed: 0,0,1,2,3,4
date_block_num,0.0,0.0,0.0,0.0,0.0
shop_id,0.0,0.0,0.0,0.0,0.0
item_id,19.0,27.0,28.0,29.0,32.0
median_price,28.0,2499.0,549.0,2499.0,221.0
item_sales_month,0.0,0.0,0.0,0.0,1326.0
item_cnt_month,0.0,0.0,0.0,0.0,6.0
median_price_allshops,28.0,2499.0,549.0,2499.0,349.0
item_sales_month_allshops,28.0,16275.0,4392.0,9387.0,101515.86
item_cnt_month_allshops,1.0,7.0,8.0,4.0,299.0


In [6]:
# Shop-level price relative to the all-shops median price of the same item.
df_train['price_delta'] = df_train.median_price - df_train.median_price_allshops
df_train['price_ratio'] = df_train.median_price / df_train.median_price_allshops

# Shop's share of the item's all-shops count and revenue. A 0/0 division gives
# NaN, which is replaced by 0. The fill is applied to the computed Series before
# assignment; the former `df_train[col].fillna(..., inplace=True)` form fills a
# temporary Series and does nothing under pandas copy-on-write.
# NOTE(review): a non-zero numerator over a zero total would give +/-inf, which
# fillna does not touch -- confirm no such rows exist.
df_train['item_cnt_ratio'] = (
    df_train.item_cnt_month / df_train.item_cnt_month_allshops
).fillna(0.0)
df_train['item_sales_ratio'] = (
    df_train.item_sales_month / df_train.item_sales_month_allshops
).fillna(0.0)
df_train.head().T

Unnamed: 0,0,1,2,3,4
date_block_num,0.0,0.0,0.0,0.0,0.0
shop_id,0.0,0.0,0.0,0.0,0.0
item_id,19.0,27.0,28.0,29.0,32.0
median_price,28.0,2499.0,549.0,2499.0,221.0
item_sales_month,0.0,0.0,0.0,0.0,1326.0
item_cnt_month,0.0,0.0,0.0,0.0,6.0
median_price_allshops,28.0,2499.0,549.0,2499.0,349.0
item_sales_month_allshops,28.0,16275.0,4392.0,9387.0,101515.86
item_cnt_month_allshops,1.0,7.0,8.0,4.0,299.0
price_delta,0.0,0.0,0.0,0.0,-128.0


In [7]:
# Per-column count of missing values in the training grid.
df_train.isna().sum()

date_block_num               0
shop_id                      0
item_id                      0
median_price                 0
item_sales_month             0
item_cnt_month               0
median_price_allshops        0
item_sales_month_allshops    0
item_cnt_month_allshops      0
price_delta                  0
price_ratio                  0
item_cnt_ratio               0
item_sales_ratio             0
dtype: int64

In [20]:
from functools import partial

def rename_column(index, col_name):
    """Return ``col_name`` suffixed with ``_<index>``, except for join keys.

    The join keys ``shop_id`` and ``item_id`` are returned unchanged.
    """
    join_keys = ('shop_id', 'item_id')
    is_join_key = col_name in join_keys
    return col_name if is_join_key else '{}_{}'.format(col_name, index)
    
def make_features(label_block):
    """Build ``(X, y)`` for one date block of ``df_train``.

    Selects the ``shop_id``, ``item_id`` and ``item_cnt_month`` columns of the
    rows whose ``date_block_num`` equals ``label_block`` and hands them to
    ``make_features2``, whose result is returned.
    """
    in_label_block = df_train.date_block_num == label_block
    label_columns = ['shop_id', 'item_id', 'item_cnt_month']
    label_df = df_train.loc[in_label_block, label_columns]
    label_df = label_df.reset_index(drop=True)
    return make_features2(label_block, label_df)

def make_features2(label_block, label_df, lags=(1, 2)):
    """Attach lagged ``df_train`` columns to the rows of ``label_df``.

    For every lag ``k`` in ``lags``, the rows of ``df_train`` belonging to date
    block ``label_block - k`` are left-joined onto the (shop_id, item_id) pairs
    of ``label_df``. All joined columns except the two keys get the suffix
    ``_<k>`` (see ``rename_column``). Pairs with no row in that earlier block
    get NaN in the joined columns.

    Parameters
    ----------
    label_block : int
        Date block the labels belong to.
    label_df : pandas.DataFrame
        Must provide ``shop_id``, ``item_id`` and ``item_cnt_month`` columns.
    lags : iterable of int, optional
        How many blocks to look back. Defaults to ``(1, 2)``, i.e. the two
        blocks immediately before ``label_block``.

    Returns
    -------
    X : pandas.DataFrame
        ``shop_id``, ``item_id`` and the lagged feature columns.
    y : pandas.Series
        ``label_df.item_cnt_month``.
    """
    y = label_df.item_cnt_month
    X = label_df[['shop_id', 'item_id']]
    for lag in lags:
        feature_block = label_block - lag
        lagged = (
            df_train.loc[df_train.date_block_num == feature_block]
            .drop(['date_block_num'], axis=1)
            .rename(partial(rename_column, lag), axis='columns')
        )
        X = X.merge(lagged, how='left', on=['shop_id', 'item_id'])

    return X, y

In [None]:
# Build one (X, y) pair per label block 12..32 and stack them into a single
# training set.
train_blocks = range(12, 33)
Xs, ys = [], []
for label_block in train_blocks:
    X, y = make_features(label_block)
    Xs += [X]
    ys += [y]

X_train, y_train = pd.concat(Xs), pd.concat(ys)
X_train.head()

In [None]:
# Fit a default-parameter LightGBM regressor on the stacked training set, then
# predict block 33, which the training loop left out.
reg = LGBMRegressor().fit(X_train, y_train)

X_val, y_val = make_features(33)
y_pred = reg.predict(X_val)

In [None]:
def rmse(x, y):
    """Return the root of ``mean_squared_error(x, y)`` as a float."""
    mse = mean_squared_error(x, y)
    return math.sqrt(mse)

# Validation RMSE: predictions for block 33 scored against that block's labels.
rmse(y_val, y_pred)

In [None]:
# Training RMSE: score the fitted model on the data it was trained on, for
# comparison with the validation RMSE.
y_train_p = reg.predict(X_train)
rmse(y_train, y_train_p)