In [1]:
import math
import os
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
def read(file_name, data_dir='/Users/keiji/work/kaggle/sales1c/'):
    """Load a CSV file, caching the parsed frame as a pickle next to it.

    The first call for a given file parses ``<data_dir>/<file_name>`` with
    ``pd.read_csv`` and writes the result to ``<data_dir>/<file_name>.pickle``.
    Later calls load that pickle instead of re-parsing the CSV.

    Parameters
    ----------
    file_name : str
        File name relative to ``data_dir`` (e.g. ``'items.csv'``).
    data_dir : str, optional
        Directory that holds the data files. Defaults to the location the
        notebook was originally written against; pass another directory to
        run it elsewhere. A trailing slash is not required.

    Returns
    -------
    pandas.DataFrame
        The parsed (or cached) table.
    """
    csv_path = os.path.join(data_dir, file_name)
    cache_path = csv_path + '.pickle'
    if os.path.exists(cache_path):
        # The cache is never invalidated: delete the .pickle file if the CSV
        # changes. Unpickling can execute arbitrary code, so only load cache
        # files that this function created.
        return pd.read_pickle(cache_path)
    df = pd.read_csv(csv_path)
    df.to_pickle(cache_path)
    return df

# Load every table through read(), in a fixed order.
df_icats, df_items, df_shops, df_test, df_train_all = [
    read(table_file)
    for table_file in (
        'item_categories.csv',
        'items.csv',
        'shops.csv',
        'test.csv.gz',
        'sales_train.csv.gz',
    )
]

In [3]:
# Hold out month TARGET_DATE_BLOCK for validation; all earlier months are training data.
TARGET_DATE_BLOCK = 33
df_day_train = df_train_all.loc[df_train_all['date_block_num'] < TARGET_DATE_BLOCK]
df_day_val = df_train_all.loc[df_train_all['date_block_num'] == TARGET_DATE_BLOCK]

In [4]:
# Sanity check: show the last rows of the training split.
df_day_train.tail()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
2882330,27.09.2015,32,25,21022,1799.0,1.0
2882331,19.09.2015,32,25,21105,1799.0,1.0
2882332,30.09.2015,32,25,21088,1299.0,1.0
2882333,08.09.2015,32,25,20990,1299.0,1.0
2882334,25.09.2015,32,25,20303,399.0,1.0


In [5]:
# Sanity check: show the first rows of the validation split.
df_day_val.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
2882335,23.10.2015,33,45,13315,649.0,1.0
2882336,05.10.2015,33,45,13880,229.0,1.0
2882337,02.10.2015,33,45,13881,659.0,1.0
2882338,12.10.2015,33,45,13881,659.0,1.0
2882339,04.10.2015,33,45,13923,169.0,1.0


In [6]:
def make_monthly(df, df_pairs=None):
    """Sum daily counts per (shop_id, item_id) and add zero rows for extra pairs.

    All rows of ``df`` are summed together, so the caller is expected to pass
    the rows of a single month. Pairs listed in ``df_pairs`` that have no rows
    in ``df`` are added with a count of 0. Counts are clipped to ``[0, 20]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Daily sales with ``shop_id``, ``item_id`` and ``item_cnt_day`` columns.
    df_pairs : pandas.DataFrame, optional
        Frame with ``shop_id`` and ``item_id`` columns whose pairs must appear
        in the result. Defaults to the notebook-level ``df_test``, which is
        what this function always used before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        Columns ``shop_id``, ``item_id``, ``item_cnt_month``; sorted by
        (shop_id, item_id) with a fresh 0..n-1 index.
    """
    if df_pairs is None:
        # Backward-compatible default: fall back to the notebook global.
        df_pairs = df_test

    keys = ['shop_id', 'item_id']
    monthly = (
        df.groupby(keys, as_index=False)
        .item_cnt_day
        .sum()
        .rename(columns={'item_cnt_day': 'item_cnt_month'})
    )

    # Outer join so that requested pairs without any sales still get a row.
    monthly = monthly.merge(df_pairs[keys], on=keys, how='outer')

    # Pairs that only came from df_pairs have NaN counts: treat them as 0,
    # then clip to [0, 20] with a vectorised call instead of a per-row lambda.
    monthly['item_cnt_month'] = (
        monthly['item_cnt_month']
        .fillna(0.0)
        .clip(lower=0.0, upper=20.0)
    )
    return monthly.sort_values(by=keys).reset_index(drop=True)

# Per-pair totals of the validation-month rows, plus zero-count rows for pairs
# that only appear in the test set (added inside make_monthly).
df_month_val = make_monthly(df_day_val)
df_month_val.head()

Unnamed: 0,shop_id,item_id,item_cnt_month
0,2,30,0.0
1,2,31,1.0
2,2,32,0.0
3,2,33,0.0
4,2,38,0.0


In [8]:
def aggregate_by_month(df, date_block, col_name):
    """Sum ``item_cnt_day`` per (shop_id, item_id) within one month block.

    Only rows whose ``date_block_num`` equals ``date_block`` are used. The
    result has one row per (shop_id, item_id) pair present in that month and
    three columns: ``shop_id``, ``item_id`` and the summed count, which is
    named ``col_name``.
    """
    in_month = df['date_block_num'] == date_block
    month_rows = df[in_month]
    totals = (
        month_rows
        .groupby(['shop_id', 'item_id'], as_index=False)['item_cnt_day']
        .sum()
    )
    return totals.rename(columns={'item_cnt_day': col_name})

In [22]:
def make_label(df, label_month, clip_min=-10.0, clip_max=50.0):
    """Build the per-pair target for one month.

    Sums ``item_cnt_day`` per (shop_id, item_id) over the rows of month
    ``label_month`` (via ``aggregate_by_month``) and clips the totals. Only
    pairs that have at least one row in that month appear in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Daily sales with ``date_block_num``, ``shop_id``, ``item_id`` and
        ``item_cnt_day`` columns.
    label_month : int
        Value of ``date_block_num`` to build the target for.
    clip_min, clip_max : float or None, optional
        Bounds applied to the target. The defaults reproduce the original
        hard-coded ``[-10, 50]`` range; pass ``None`` to disable a bound.

    Returns
    -------
    pandas.DataFrame
        Columns ``shop_id``, ``item_id``, ``target``; sorted by
        (shop_id, item_id) with a fresh 0..n-1 index.
    """
    labels = aggregate_by_month(df, label_month, 'target')
    # TODO: Consider not clipping at all (now possible with
    # clip_min=None, clip_max=None).
    labels['target'] = labels['target'].clip(lower=clip_min, upper=clip_max)
    return (
        labels
        .sort_values(by=['shop_id', 'item_id'])
        .reset_index(drop=True)
    )

# Training target: per-pair totals for month 31, taken from the training split.
label = make_label(df_day_train, 31)
label.head()

Unnamed: 0,shop_id,item_id,target
0,2,32,1.0
1,2,70,1.0
2,2,482,1.0
3,2,792,2.0
4,2,806,1.0


In [71]:
def make_features(df, label_month, df_target, n_lags=3):
    """Build lagged monthly-sales features for the pairs in ``df_target``.

    For each lag ``k`` in ``1..n_lags`` the total ``item_cnt_day`` of month
    ``label_month - k`` is attached as column ``prev<k>``. Pairs with no rows
    in that month get 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Daily sales with ``date_block_num``, ``shop_id``, ``item_id`` and
        ``item_cnt_day`` columns.
    label_month : int
        Month the features are built for; only earlier months are read.
    df_target : pandas.DataFrame
        Frame with ``shop_id`` and ``item_id`` columns listing the pairs to
        build features for.
    n_lags : int, optional
        Number of previous months to use. The default of 3 reproduces the
        original ``prev1``, ``prev2``, ``prev3`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``prev1`` .. ``prev<n_lags>``. Rows are sorted by
        (shop_id, item_id) and re-indexed 0..n-1, so they line up
        positionally with ``df_target`` only when ``df_target`` is sorted the
        same way (as ``make_label`` output is).

    Raises
    ------
    ValueError
        If ``n_lags`` is smaller than 1.
    """
    if n_lags < 1:
        raise ValueError('n_lags must be at least 1, got {}'.format(n_lags))

    keys = ['shop_id', 'item_id']
    lag_cols = ['prev{}'.format(lag) for lag in range(1, n_lags + 1)]

    merged = df_target[keys]
    for lag, col in enumerate(lag_cols, start=1):
        lagged = aggregate_by_month(df, label_month - lag, col)
        merged = merged.merge(lagged, how='left', on=keys)

    # Assign the filled columns back instead of calling
    # `merged[col].fillna(..., inplace=True)`: that chained in-place form acts
    # on a temporary under pandas Copy-on-Write and would leave the NaNs in.
    merged[lag_cols] = merged[lag_cols].fillna(0.0)

    merged = merged.sort_values(by=keys).reset_index(drop=True)
    return merged.drop(keys, axis=1)

# Lag features for the pairs in `label`. Both `label` and X are sorted by
# (shop_id, item_id) and re-indexed, so their rows correspond by position.
X = make_features(df_day_train, 31, label)
X.info()
X.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33486 entries, 0 to 33485
Data columns (total 3 columns):
prev1    33486 non-null float64
prev2    33486 non-null float64
prev3    33486 non-null float64
dtypes: float64(3)
memory usage: 784.9 KB


Unnamed: 0,prev1,prev2,prev3
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,1.0,0.0,0.0
3,0.0,1.0,1.0
4,0.0,0.0,2.0


In [56]:
# Regression target: the clipped monthly totals produced by make_label.
y = label.target
y.head()

0    1.0
1    1.0
2    1.0
3    2.0
4    1.0
Name: target, dtype: float64

In [67]:
# Fit a LightGBM regressor on the lag features; only the objective is set
# explicitly, every other hyper-parameter is left at the library default.
reg = LGBMRegressor(
    objective='regression'
)
reg.fit(X, y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective='regression', random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

In [68]:
# Print each feature name next to the importance reported by the fitted model.
for feature_name, importance in zip(X.columns, reg.feature_importances_):
    print('{} => {}'.format(feature_name, importance))

prev1 => 937
prev2 => 1010
prev3 => 1053


In [69]:
def predict(df, target_month, df_target, model=None):
    """Predict clipped monthly counts for the pairs in ``df_target``.

    Features are built with ``make_features(df, target_month, df_target)``,
    so predictions come back in that function's row order (sorted by
    (shop_id, item_id)).

    Parameters
    ----------
    df : pandas.DataFrame
        Daily sales used to build the lag features.
    target_month : int
        Month to predict; only earlier months are read for features.
    df_target : pandas.DataFrame
        Frame with ``shop_id`` and ``item_id`` columns to predict for.
    model : object with a ``predict`` method, optional
        Fitted regressor to use. Defaults to the notebook-level ``reg``,
        which is what this function always used before the parameter existed.

    Returns
    -------
    pandas.Series
        Predictions named ``item_cnt_month``, clipped to ``[0, 20]``.
    """
    if model is None:
        # Backward-compatible default: fall back to the notebook global.
        model = reg
    features = make_features(df, target_month, df_target)
    raw_pred = pd.Series(model.predict(features), name='item_cnt_month')
    return raw_pred.clip(lower=0.0, upper=20.0)

# Validation target: totals for month TARGET_DATE_BLOCK, clipped to [0, 20],
# the same range predict() clips its output to.
# NOTE(review): make_label only returns (shop, item) pairs that have rows in
# that month, so pairs without sales are not scored here. df_month_val, which
# adds the test-set pairs as zeros, is not used in the visible cells --
# confirm whether that is intended.
val_label = make_label(df_train_all, TARGET_DATE_BLOCK)
y_val = val_label.target.map(lambda x: min(20.0, max(0.0, x)))
df_month_pred = predict(df_train_all, TARGET_DATE_BLOCK, val_label)
df_month_pred.head()

0    1.443326
1    1.431655
2    1.443326
3    1.443326
4    1.443326
Name: item_cnt_month, dtype: float64

In [70]:
def rmse(x, y):
    """Return the root-mean-squared error between ``x`` and ``y``.

    Computed as the square root of ``mean_squared_error(x, y)`` and returned
    as a Python float.
    """
    mse = mean_squared_error(x, y)
    return math.sqrt(mse)

# Validation RMSE between the clipped targets and the clipped predictions.
rmse(y_val, df_month_pred)

2.286696254249043