In [1]:
import math
import os
import re
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
def read(file_name, data_dir='/Users/keiji/work/kaggle/sales1c/'):
    """Load a CSV from ``data_dir`` as a DataFrame, caching it as a pickle.

    The first call parses ``<data_dir>/<file_name>`` with ``pd.read_csv`` and
    writes the result next to it as ``<file_name>.pickle``; later calls load
    that pickle instead of re-parsing the CSV.

    Parameters
    ----------
    file_name : str
        CSV file name (``.csv`` or ``.csv.gz``) relative to ``data_dir``.
    data_dir : str, optional
        Directory holding the competition files. Defaults to the path the
        notebook was originally written against; pass your own directory to
        run it elsewhere. A trailing slash is not required.

    Returns
    -------
    pandas.DataFrame
    """
    csv_path = os.path.join(data_dir, file_name)
    cache_path = csv_path + '.pickle'
    if os.path.exists(cache_path):
        # NOTE: unpickling executes arbitrary code. This is only acceptable
        # because the cache file is produced by this very function; the cache
        # is never invalidated, so delete it by hand if the CSV changes.
        return pd.read_pickle(cache_path)
    df = pd.read_csv(csv_path)
    df.to_pickle(cache_path)
    return df

# Load the five competition tables in one go (each goes through the pickle
# cache implemented by `read`).
df_icats, df_items, df_shops, df_test, df_sales = map(read, (
    'item_categories.csv',
    'items.csv',
    'shops.csv',
    'test.csv.gz',
    'sales_train.csv.gz',
))

In [3]:
# Drop outliers.
# I'm not sure if this really matters.
# Both thresholds are applied in a single boolean mask, and the result is an
# explicit copy: the next cell adds a column to `df_sales`, which would raise
# SettingWithCopyWarning if `df_sales` were still a slice of the raw frame.
df_sales = df_sales[
    (df_sales.item_price < 60000) & (df_sales.item_cnt_day < 700)
].copy()

In [4]:
# Revenue of each sales row: unit price times units sold that day.
df_sales['item_sales'] = df_sales['item_price'].mul(df_sales['item_cnt_day'])

In [None]:
# Shops and items that have at least one sales row in each month.
unique_shop_id = df_sales.loc[:, ['date_block_num', 'shop_id']].drop_duplicates().reset_index(drop=True)
unique_item_id = df_sales.loc[:, ['date_block_num', 'item_id']].drop_duplicates().reset_index(drop=True)

# Per-month grid of every active shop crossed with every active item, for
# month blocks 0..33, tagged with the item's category.
df_train = pd.DataFrame({'date_block_num': np.arange(34)})
df_train = df_train.merge(unique_shop_id, on='date_block_num', how='left')
df_train = df_train.merge(unique_item_id, on='date_block_num', how='left')
df_train = df_train.merge(df_items[['item_id', 'item_category_id']], on='item_id', how='left')
df_train = df_train.sort_values(by=['date_block_num', 'shop_id', 'item_id'])
df_train = df_train.reset_index(drop=True)

# Monthly totals per (month, shop, item): units sold, median price, revenue.
df_agg = (
    df_sales.groupby(['date_block_num', 'shop_id', 'item_id'])
    .agg({
        'item_cnt_day': 'sum',
        'item_price': 'median',
        'item_sales': 'sum',
    })
    .reset_index()
    .rename(columns={
        'item_cnt_day': 'item_cnt_month',
        'item_price': 'median_price',
        'item_sales': 'item_sales_month',
    })
)
df_train = df_train.merge(df_agg, how='left', on=['date_block_num', 'shop_id', 'item_id'])
# Grid rows without any sale get zero units / zero revenue.
# Assign the filled frame back instead of calling
# `df_train.<col>.fillna(..., inplace=True)`: that chained in-place form
# operates on a temporary Series and leaves `df_train` untouched under pandas
# Copy-on-Write (recent pandas 2.x versions already warn about it).
df_train = df_train.fillna({'item_cnt_month': 0.0, 'item_sales_month': 0.0})

# Monthly totals per (month, item) across all shops.
df_agg = (
    df_sales.groupby(['date_block_num', 'item_id'])
    .agg({
        'item_cnt_day': 'sum',
        'item_price': 'median',
        'item_sales': 'sum',
    })
    .reset_index()
    .rename(columns={
        'item_cnt_day': 'item_cnt_month_allshops',
        'item_price': 'median_price_allshops',
        'item_sales': 'item_sales_month_allshops',
    })
)
df_train = df_train.merge(df_agg, how='left', on=['date_block_num', 'item_id'])
# Where a shop did not sell the item that month, fall back to the item's
# median price across all shops. Assigned back explicitly: the chained
# `df_train.median_price.fillna(..., inplace=True)` form has no effect on
# `df_train` under pandas Copy-on-Write.
df_train['median_price'] = df_train['median_price'].fillna(df_train['median_price_allshops'])

# Mean encodings: for each key column, the average `item_cnt_month` of all
# grid rows sharing that key within the same month, stored as `meanenc_<key>`.
for col in ['shop_id', 'item_id', 'item_category_id']:
    group_keys = ['date_block_num', col]
    df_agg = (
        df_train.groupby(group_keys)['item_cnt_month']
        .mean()
        .rename('meanenc_' + col)
        .reset_index()
    )
    df_train = df_train.merge(df_agg, on=group_keys, how='left')

df_train.head().T

In [None]:
#df_train['price_delta'] = df_train.median_price - df_train.median_price_allshops
#df_train['price_ratio'] = df_train.median_price / df_train.median_price_allshops
# Shop's share of the item's monthly units across all shops. Undefined ratios
# (NaN, e.g. 0 / 0) become 0.0. The fill is part of the assignment rather than
# a chained `df_train[col].fillna(..., inplace=True)`, which would act on a
# temporary Series and not update `df_train` under pandas Copy-on-Write.
df_train['item_cnt_ratio'] = (
    df_train.item_cnt_month / df_train.item_cnt_month_allshops
).fillna(0.0)
#df_train['item_sales_ratio'] = (df_train.item_sales_month / df_train.item_sales_month_allshops).fillna(0.0)
df_train.head().T

In [None]:
# Missing values remaining per column of the monthly grid.
df_train.isna().sum()

In [None]:
from functools import partial

def rename_column(index, col_name):
    """Return ``col_name`` suffixed with ``_<index>``, except for join keys.

    The join keys ``shop_id`` and ``item_id`` are returned unchanged; any
    other column name becomes ``'<col_name>_<index>'``.
    """
    is_join_key = col_name in ('shop_id', 'item_id')
    return col_name if is_join_key else '{}_{}'.format(col_name, index)
    
def make_features(label_block):
    """Build the (features, target) pair for one month of the training grid.

    Selects the ``df_train`` rows whose ``date_block_num`` equals
    ``label_block``, derives lag features for them with ``make_features2``,
    and returns ``(X, y)`` where ``y`` is that month's unclipped
    ``item_cnt_month``.
    """
    in_label_block = df_train.date_block_num == label_block
    label_df = (
        df_train.loc[in_label_block, ['shop_id', 'item_id', 'item_cnt_month']]
        .reset_index(drop=True)
    )
    features = make_features2(label_block, label_df)
    return features, label_df.item_cnt_month

def make_features2(label_block, label_df):
    """Build lag features for the (shop_id, item_id) pairs in ``label_df``.

    For each of the two months preceding ``label_block``, the matching
    ``df_train`` rows (minus ``date_block_num`` and ``item_category_id``) are
    left-joined onto the pairs, with every non-key column suffixed ``_1``
    (one month back) or ``_2`` (two months back). Two month-over-month delta
    columns are then added, the key columns are removed, and all missing
    values are replaced with 0.0.

    Only the ``shop_id`` and ``item_id`` columns of ``label_df`` are read;
    rows of the result are in the same order as ``label_df``.
    """
    keys = ['shop_id', 'item_id']
    features = label_df[keys]
    # Only 1 month ago and 2 months ago
    for lag in (1, 2):
        in_lag_block = df_train.date_block_num == label_block - lag
        lagged = df_train.loc[in_lag_block].drop(columns=['date_block_num', 'item_category_id'])
        lagged = lagged.rename(columns=partial(rename_column, lag))
        features = features.merge(lagged, on=keys, how='left')
    features['item_cnt_month_delta'] = (
        features['item_cnt_month_1'] - features['item_cnt_month_2']
    )
    features['item_cnt_month_allshops_delta'] = (
        features['item_cnt_month_allshops_1'] - features['item_cnt_month_allshops_2']
    )
    return features.drop(columns=keys).fillna(0.0)

In [None]:
# Month block held out from training. With the value 34 every month present in
# the grid (0..33) is used for training, the `VALIDATION_BLOCK < 34` validation
# cell is skipped and the `VALIDATION_BLOCK == 34` test-prediction cell runs.
VALIDATION_BLOCK = 34
TRAINING_BLOCK_RANGE = range(0, VALIDATION_BLOCK)

# One (features, target) pair per training month.
# NOTE(review): blocks 0 and 1 have no (or only one) preceding month, so their
# lag columns are filled with 0.0 by make_features2 -- confirm that including
# them is intended.
Xs = []
ys = []
for label_block in TRAINING_BLOCK_RANGE:
    X, y = make_features(label_block)
    Xs.append(X)
    ys.append(y)

# Stack the months. Each block keeps its own 0..n-1 index, so index labels
# repeat across blocks in X_train / y_train. `X`, `y` and `label_block` remain
# bound to the last block after the loop.
X_train = pd.concat(Xs)
y_train = pd.concat(ys)
X_train.head(10).T

In [None]:
# Sanity check: count missing values per feature column.
X_train.isna().sum()

In [None]:
# Column dtypes, non-null counts and memory footprint of the training matrix.
X_train.info()

In [None]:
# Gradient-boosted regressor with the library's default hyper-parameters,
# fitted on all training months at once.
reg = LGBMRegressor()
reg.fit(X=X_train, y=y_train)

In [None]:
def clip(s, lower=0.0, upper=20.0):
    """Clamp every value of a numeric Series to ``[lower, upper]``.

    Uses the vectorised ``Series.clip`` instead of mapping a Python lambda
    over each element. Missing values stay missing (the previous
    ``max``/``min`` implementation silently turned NaN into the upper bound).

    Parameters
    ----------
    s : pandas.Series
        Numeric values (intended for float data) to clamp.
    lower, upper : float, optional
        Inclusive bounds; the defaults keep the original 0..20 range.

    Returns
    -------
    pandas.Series
        New Series with the same index as ``s``.
    """
    return s.clip(lower=lower, upper=upper)

def rmse(x, y):
    """Root-mean-squared error between ``x`` and ``y`` after clipping both.

    Both inputs are passed through ``clip`` before the squared error is
    averaged, so values outside the clip range do not contribute extra error.
    """
    clipped_x, clipped_y = clip(x), clip(y)
    mse = mean_squared_error(clipped_x, clipped_y)
    return math.sqrt(mse)

# Score the held-out month. This only runs when VALIDATION_BLOCK points at a
# month that exists in the training grid (i.e. it is below 34).
if VALIDATION_BLOCK < 34:
    X_val, y_val = make_features(VALIDATION_BLOCK)
    y_pred = pd.Series(reg.predict(X_val))
    # A bare expression inside an `if` block is not echoed by the notebook,
    # so the score has to be printed explicitly to be visible.
    print('validation RMSE:', rmse(y_val, y_pred))

In [None]:
# In-sample error: predict on the data the model was fitted on and score it
# with `rmse`, which clips both the targets and the predictions first.
y_train_p = pd.Series(reg.predict(X_train))
rmse(y_train, y_train_p)

In [None]:
# Feature importances of the fitted model, least important first so that the
# horizontal bar chart shows the most important feature at the top.
# Column names are taken from `X_train` (the frame the model was fitted on)
# rather than from `X`, a variable leaked by the training loop.
feature_importances = (
    pd.DataFrame({'feature': X_train.columns, 'importance': reg.feature_importances_})
    .sort_values(by='importance', ascending=True)
)
# `sort_columns` is no longer passed: the argument was removed from
# DataFrame.plot in pandas 2.0 and breaks this call on current pandas.
feature_importances.plot(
    x='feature', y='importance', kind='barh', figsize=(10, 10),
    title='Feature importance',
)

In [None]:
# Predict the month after the last training month for the (shop, item) pairs
# listed in the test set, then clip the predictions.
if VALIDATION_BLOCK == 34:
    X_test = make_features2(label_block=VALIDATION_BLOCK, label_df=df_test)
    y_test_pred = pd.Series(reg.predict(X_test))
    y_test_pred = clip(y_test_pred)

In [None]:
# Quick look at the test predictions: size/dtype summary and the first values.
pd.DataFrame(y_test_pred).info()
y_test_pred.head()

In [None]:
# Quick look at the test set the predictions have to line up with.
df_test.info()
df_test.head()

In [None]:
# Submission file: the test-set `ID` column plus the clipped predictions,
# matched on the row index, written without the index column.
df_submit = df_test.loc[:, ['ID']].assign(item_cnt_month=y_test_pred)
df_submit.to_csv('lgbm3.csv', index=False)