In [1]:
import math
import os
import re
import itertools
import time

import numpy as np
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import hyperopt
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering

%matplotlib inline

In [2]:
DATA_DIR = '/Users/keiji/work/kaggle/sales1c/'

def read(file_name):
    pickle = DATA_DIR + file_name + '.pickle'
    if os.path.exists(pickle):
        return pd.read_pickle(pickle)
    df = pd.read_csv(DATA_DIR + file_name)
    df.to_pickle(pickle)
    return df

df_icats = read('item_categories.csv')
df_items = read('items.csv')
df_shops = read('shops.csv')
df_test = read('test.csv.gz')
df_sales = read('sales_train.csv.gz')

X_train = read('X_train')
y_train = read('y_train')
X_val = read('X_val')
y_val = read('y_val')

In [3]:
df_sales['item_sales'] = df_sales.item_price * df_sales.item_cnt_day

In [4]:
unique_shop_id = df_sales[['date_block_num', 'shop_id']].drop_duplicates().reset_index(drop=True)
unique_item_id = df_sales[['date_block_num', 'item_id']].drop_duplicates().reset_index(drop=True)
df_train = (
    pd.DataFrame({'date_block_num': np.arange(34)})
    .merge(unique_shop_id, how='left', on='date_block_num')
    .merge(unique_item_id, how='left', on='date_block_num')
    #.merge(df_items[['item_id', 'item_category_id']], how='left', on='item_id')
    .sort_values(by=['date_block_num', 'shop_id', 'item_id'])
    .reset_index(drop=True)
)
df_train.head().T

Unnamed: 0,0,1,2,3,4
date_block_num,0,0,0,0,0
shop_id,0,0,0,0,0
item_id,19,27,28,29,32


In [5]:
df_agg = (
    df_sales.groupby(['date_block_num', 'shop_id', 'item_id'])
    .agg({
        'item_cnt_day': 'sum',
        #'item_price': 'median',
        #'item_sales': 'sum',
    })
    .reset_index()
    .rename(columns={
        'item_cnt_day': 'item_cnt_month',
        #'item_price': 'median_price',
        #'item_sales': 'item_sales_month',
    })
)
df_train = df_train.merge(df_agg, how='left', on=['date_block_num', 'shop_id', 'item_id'])
df_train.item_cnt_month.fillna(0.0, inplace=True)
#df_train.item_sales_month.fillna(0.0, inplace=True)
df_train.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month
0,0,0,19,0.0
1,0,0,27,0.0
2,0,0,28,0.0
3,0,0,29,0.0
4,0,0,32,6.0


In [6]:
df_train[df_train.item_cnt_month > 0]

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month
4,0,0,32,6.0
5,0,0,33,3.0
7,0,0,35,1.0
11,0,0,43,1.0
19,0,0,51,2.0
28,0,0,61,1.0
32,0,0,75,1.0
36,0,0,88,1.0
40,0,0,95,1.0
41,0,0,96,1.0


In [7]:
df_train.item_id.max()

22169

In [8]:
shops = df_train[(df_train.shop_id == 0) & (df_train.date_block_num == 0) & (df_train.item_cnt_month > 0)][['item_id', 'item_cnt_month']]
shops = shops.groupby('item_id').item_cnt_month.sum()

In [9]:
from scipy import sparse

row = shops.index.values
row.shape

col = np.zeros((len(row),), dtype=np.int32)
col.shape

data = shops.values
data.shape

a = sparse.csc_matrix((data, (row, col))).toarray()
a[98]

array([ 25.])

In [10]:
NUM_SHOPS = 60
NUM_MONTHS = 34

rows = np.array([], dtype=np.int32)
cols = np.array([], dtype=np.int32)
data = np.array([], dtype=np.int32)
for j in range(NUM_MONTHS):
    sales = (df_train[(df_train.shop_id == 0)
                      & (df_train.date_block_num == j)
                      & (df_train.item_cnt_month > 0)][['item_id', 'item_cnt_month']])
    sales = sales.groupby('item_id').sum()
    n = len(sales)
    rows = np.append(rows, j * np.ones(n, dtype=np.int32))
    cols = np.append(cols, sales.index.values)
    data = np.append(data, sales.values)

a = sparse.csr_matrix((data, (rows, cols))).toarray()
a

array([[ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.]])

In [11]:
a[0,98]

25.0

In [12]:
a.shape

(2, 22161)

In [15]:
import time
from scipy.sparse.linalg import norm

NUM_SHOPS = 60
NUM_MONTHS = 34
NUM_ITEMS = df_train.item_id.max()

t = time.time()
shops = []
for i in range(NUM_SHOPS):
    tnew = time.time()
    print('#{}: {}'.format(i, tnew - t))
    t = tnew

    rows = np.array([], dtype=np.int32)
    cols = np.array([], dtype=np.int32)
    data = np.array([], dtype=np.int32)
    for j in range(NUM_MONTHS):
        sales = (df_train[(df_train.shop_id == i)
                          & (df_train.date_block_num == j)
                          & (df_train.item_cnt_month > 0)]
                         [['item_id', 'item_cnt_month']])
        sales = sales.groupby('item_id').sum()
        n = len(sales)
        rows = np.append(rows, j * np.ones(n, dtype=np.int32))
        cols = np.append(cols, sales.index.values)
        data = np.append(data, sales.values)

    ary = sparse.csr_matrix((data, (rows, cols)))
    ary.resize(1, NUM_MONTHS * NUM_ITEMS)
    nary = norm(ary)
    if nary > 1e-9:
        ary = ary / nary
    shops.append(ary)

#0: 0.012861967086791992
#1: 2.426651954650879
#2: 2.413560152053833
#3: 2.4184060096740723
#4: 2.423499822616577
#5: 2.4365761280059814
#6: 2.4229798316955566
#7: 2.4363701343536377
#8: 2.3785059452056885
#9: 2.396512031555176
#10: 2.4021809101104736
#11: 2.4451911449432373
#12: 2.354702949523926
#13: 2.4692161083221436
#14: 2.4410808086395264
#15: 2.4848551750183105
#16: 2.4461829662323
#17: 2.4847099781036377
#18: 2.502794027328491
#19: 2.4767379760742188
#20: 2.4274349212646484
#21: 2.4170830249786377
#22: 2.4808340072631836
#23: 2.462739944458008
#24: 2.403114080429077
#25: 2.450747013092041
#26: 2.469064950942993
#27: 2.721938133239746
#28: 2.4437317848205566
#29: 2.469954013824463
#30: 2.6982951164245605
#31: 2.516530990600586
#32: 2.5217080116271973
#33: 2.408092975616455
#34: 2.4542160034179688
#35: 2.4473628997802734
#36: 2.4586710929870605
#37: 2.4031779766082764
#38: 2.449842929840088
#39: 2.505465030670166
#40: 2.446463108062744
#41: 2.4519829750061035
#42: 2.4372239112854

In [16]:
shops_array = scipy.sparse.vstack(shops)

In [17]:
shops_array.shape

(60, 753746)

In [19]:
sc = SpectralClustering(n_clusters=7)
pred = sc.fit_predict(shops_array)
pd.Series(pred).value_counts()

1    16
2    15
4     8
3     8
5     6
6     4
0     3
dtype: int64

In [27]:
import pickle

with open(DATA_DIR + 'shops_sparse.pickle', 'wb') as f:
    pickle.dump(shops_array, f)