In [1]:
import math
import os
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
def read(file_name, data_dir='/Users/keiji/work/kaggle/sales1c/'):
    """Load a CSV file as a DataFrame, caching it as a pickle beside the CSV.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``data_dir`` (e.g. ``'items.csv'`` or
        ``'test.csv.gz'``).
    data_dir : str, optional
        Directory that holds the CSV files and receives the pickle caches.
        Defaults to the original hard-coded location so existing calls keep
        working; pass another directory to run the notebook elsewhere.

    Returns
    -------
    pandas.DataFrame
        The frame unpickled from ``<file_name>.pickle`` when that cache file
        exists; otherwise the frame parsed from the CSV, which is also written
        to the cache for the next call.
    """
    # os.path.join works whether or not data_dir ends with a path separator,
    # unlike plain string concatenation.
    csv_path = os.path.join(data_dir, file_name)
    cache_path = csv_path + '.pickle'

    # NOTE: the cache is never invalidated -- delete the .pickle file by hand
    # if the CSV changes. Only unpickle cache files written by this function.
    if os.path.exists(cache_path):
        return pd.read_pickle(cache_path)

    df = pd.read_csv(csv_path)
    df.to_pickle(cache_path)
    return df

# Load the five raw tables through `read`, in the same order as before.
(
    df_icats,
    df_items,
    df_shops,
    df_test,
    df_train_all,
) = (
    read(table_file)
    for table_file in (
        'item_categories.csv',
        'items.csv',
        'shops.csv',
        'test.csv.gz',
        'sales_train.csv.gz',
    )
)

In [3]:
# Hold out one month: rows from date blocks before TARGET_DATE_BLOCK form the
# training set, rows from block TARGET_DATE_BLOCK itself form the validation set.
TARGET_DATE_BLOCK = 33
df_day_train = df_train_all.loc[df_train_all['date_block_num'].lt(TARGET_DATE_BLOCK)]
df_day_val = df_train_all.loc[df_train_all['date_block_num'].eq(TARGET_DATE_BLOCK)]

In [4]:
# Peek at the last rows of the training split.
df_day_train.tail()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
2882330,27.09.2015,32,25,21022,1799.0,1.0
2882331,19.09.2015,32,25,21105,1799.0,1.0
2882332,30.09.2015,32,25,21088,1299.0,1.0
2882333,08.09.2015,32,25,20990,1299.0,1.0
2882334,25.09.2015,32,25,20303,399.0,1.0


In [5]:
# Peek at the first rows of the validation split.
df_day_val.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
2882335,23.10.2015,33,45,13315,649.0,1.0
2882336,05.10.2015,33,45,13880,229.0,1.0
2882337,02.10.2015,33,45,13881,659.0,1.0
2882338,12.10.2015,33,45,13881,659.0,1.0
2882339,04.10.2015,33,45,13923,169.0,1.0


In [6]:
def make_monthly(df, shopitem=None):
    """Aggregate daily sales into clipped totals per (shop_id, item_id).

    All rows of ``df`` are summed together regardless of their
    ``date_block_num``, so ``df`` should already be restricted to the single
    month of interest.

    Parameters
    ----------
    df : pandas.DataFrame
        Daily sales with at least the columns ``shop_id``, ``item_id`` and
        ``item_cnt_day``.
    shopitem : pandas.DataFrame, optional
        Frame with ``shop_id`` and ``item_id`` columns listing extra pairs
        that must appear in the result even if they have no sales in ``df``.
        When omitted, the pairs of the notebook-level ``df_test`` are used,
        which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``shop_id``, ``item_id``, ``item_cnt_month``; one row per pair
        found in ``df`` or in ``shopitem``, sorted by (shop_id, item_id) with
        a fresh 0..n-1 index. Pairs without sales get 0 and every total is
        clipped into the range [0, 20].
    """
    if shopitem is None:
        # Backward-compatible default: fall back to the notebook-level test set.
        shopitem = df_test
    pairs = shopitem[['shop_id', 'item_id']]

    totals = (
        df.groupby(['shop_id', 'item_id'], as_index=False)['item_cnt_day']
        .sum()
        .rename(columns={'item_cnt_day': 'item_cnt_month'})
    )

    # Outer merge keeps every pair that sold something AND every requested
    # pair; requested pairs without sales come out as NaN and are filled below.
    merged = totals.merge(pairs, on=['shop_id', 'item_id'], how='outer')

    # Vectorised clip instead of a per-element Python lambda.
    # NOTE(review): the [0, 20] range presumably mirrors the competition's
    # target clipping -- confirm against the task description.
    merged['item_cnt_month'] = (
        merged['item_cnt_month']
        .fillna(0.0)
        .clip(lower=0.0, upper=20.0)
    )

    return (
        merged
        .sort_values(by=['shop_id', 'item_id'])
        .reset_index(drop=True)
    )

# Ground truth for the validation month: clipped monthly totals per
# (shop_id, item_id), built from the daily validation rows.
df_month_val = make_monthly(df_day_val)
df_month_val.head()

Unnamed: 0,shop_id,item_id,item_cnt_month
0,2,30,0.0
1,2,31,1.0
2,2,32,0.0
3,2,33,0.0
4,2,38,0.0


In [7]:
def predict(df, target_block=None, shopitem=None):
    """Previous-month baseline: predict each pair's sales as last month's total.

    The daily rows of the month just before the target month are summed per
    (shop_id, item_id) and re-used, unchanged apart from clipping, as the
    prediction for the target month.

    Parameters
    ----------
    df : pandas.DataFrame
        Daily sales with at least the columns ``date_block_num``, ``shop_id``,
        ``item_id`` and ``item_cnt_day``.
    target_block : int, optional
        ``date_block_num`` of the month to predict; rows from
        ``target_block - 1`` are used as the prediction. Defaults to the
        notebook-level ``TARGET_DATE_BLOCK`` (the original behaviour).
    shopitem : pandas.DataFrame, optional
        Frame with ``shop_id`` and ``item_id`` columns listing the pairs to
        predict. Defaults to the pairs of the notebook-level ``df_month_val``
        (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        Columns ``shop_id``, ``item_id``, ``item_cnt_month``; one row per
        requested pair, sorted by (shop_id, item_id) with a fresh 0..n-1
        index. Pairs without sales in the previous month get 0 and every
        value is clipped into the range [0, 20].
    """
    if target_block is None:
        target_block = TARGET_DATE_BLOCK
    if shopitem is None:
        shopitem = df_month_val
    pairs = shopitem[['shop_id', 'item_id']]

    previous_month = df[df['date_block_num'] == (target_block - 1)]
    totals = (
        previous_month
        .groupby(['shop_id', 'item_id'], as_index=False)['item_cnt_day']
        .sum()
        .rename(columns={'item_cnt_day': 'item_cnt_month'})
    )

    # Right merge keeps exactly the requested pairs: pairs sold last month but
    # not requested are dropped, requested pairs without sales become NaN.
    merged = totals.merge(pairs, on=['shop_id', 'item_id'], how='right')

    # Vectorised clip instead of a per-element Python lambda.
    merged['item_cnt_month'] = (
        merged['item_cnt_month']
        .fillna(0.0)
        .clip(lower=0.0, upper=20.0)
    )

    return (
        merged
        .sort_values(by=['shop_id', 'item_id'])
        .reset_index(drop=True)
    )

# Baseline prediction for the validation month, derived from the training rows.
df_month_pred = predict(df_day_train)
df_month_pred.head()

Unnamed: 0,shop_id,item_id,item_cnt_month
0,2,30,0.0
1,2,31,0.0
2,2,32,0.0
3,2,33,1.0
4,2,38,0.0


In [8]:
# Compare the summaries of both frames (row count, dtypes, non-null counts)
# before scoring one against the other.
df_month_val.info()
df_month_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217051 entries, 0 to 217050
Data columns (total 3 columns):
shop_id           217051 non-null int64
item_id           217051 non-null int64
item_cnt_month    217051 non-null float64
dtypes: float64(1), int64(2)
memory usage: 5.0 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217051 entries, 0 to 217050
Data columns (total 3 columns):
shop_id           217051 non-null int64
item_id           217051 non-null int64
item_cnt_month    217051 non-null float64
dtypes: float64(1), int64(2)
memory usage: 5.0 MB


In [9]:
def rmse(x, y):
    """Return the root-mean-squared error between ``x`` and ``y``.

    Both arguments are passed unchanged to ``mean_squared_error``; the square
    root of its result is returned.
    """
    mse = mean_squared_error(x, y)
    return math.sqrt(mse)

# The two count columns are compared position by position, so check that
# row i of both frames refers to the same (shop_id, item_id) pair first.
assert np.array_equal(
    df_month_val[['shop_id', 'item_id']].values,
    df_month_pred[['shop_id', 'item_id']].values,
), 'validation and prediction rows are not aligned on (shop_id, item_id)'

rmse(df_month_val.item_cnt_month, df_month_pred.item_cnt_month)

1.1877253455528227