In [1]:
import math
import os
import pickle
import re
import itertools
import time

import numpy as np
import scipy
from scipy import sparse
from scipy.sparse.linalg import norm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import hyperopt
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering

%matplotlib inline

In [2]:
DATA_DIR = '/Users/keiji/work/kaggle/sales1c/'

def read(file_name):
    pickle = DATA_DIR + file_name + '.pickle'
    if os.path.exists(pickle):
        return pd.read_pickle(pickle)
    df = pd.read_csv(DATA_DIR + file_name)
    df.to_pickle(pickle)
    return df

df_icats = read('item_categories.csv')
df_items = read('items.csv')
df_shops = read('shops.csv')
df_test = read('test.csv.gz')
df_sales = read('sales_train.csv.gz')

X_train = read('X_train')
y_train = read('y_train')
X_val = read('X_val')
y_val = read('y_val')

In [3]:
df_sales['item_sales'] = df_sales.item_price * df_sales.item_cnt_day

In [4]:
unique_shop_id = df_sales[['date_block_num', 'shop_id']].drop_duplicates().reset_index(drop=True)
unique_item_id = df_sales[['date_block_num', 'item_id']].drop_duplicates().reset_index(drop=True)
df_train = (
    pd.DataFrame({'date_block_num': np.arange(34)})
    .merge(unique_shop_id, how='left', on='date_block_num')
    .merge(unique_item_id, how='left', on='date_block_num')
    .merge(df_items[['item_id', 'item_category_id']], how='left', on='item_id')
    .sort_values(by=['date_block_num', 'shop_id', 'item_id'])
    .reset_index(drop=True)
)
df_train.head().T

Unnamed: 0,0,1,2,3,4
date_block_num,0,0,0,0,0
shop_id,0,0,0,0,0
item_id,19,27,28,29,32
item_category_id,40,19,30,23,40


In [5]:
df_agg = (
    df_sales.groupby(['date_block_num', 'shop_id', 'item_id'])
    .agg({
        'item_cnt_day': 'sum',
    })
    .reset_index()
    .rename(columns={
        'item_cnt_day': 'item_cnt_month',
    })
)
df_train = df_train.merge(df_agg, how='left', on=['date_block_num', 'shop_id', 'item_id'])
df_train.item_cnt_month.fillna(0.0, inplace=True)
df_train.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,item_cnt_month
0,0,0,19,40,0.0
1,0,0,27,19,0.0
2,0,0,28,30,0.0
3,0,0,29,23,0.0
4,0,0,32,40,6.0


In [6]:
df_train[df_train.item_cnt_month > 0]

Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,item_cnt_month
4,0,0,32,40,6.0
5,0,0,33,37,3.0
7,0,0,35,40,1.0
11,0,0,43,40,1.0
19,0,0,51,57,2.0
28,0,0,61,43,1.0
32,0,0,75,40,1.0
36,0,0,88,40,1.0
40,0,0,95,40,1.0
41,0,0,96,40,1.0


In [7]:
df_train.item_id.max()

22169

In [8]:
NUM_SHOPS = 60
NUM_MONTHS = 31  # drop recent ones to prevent leaks
NUM_ITEMS = df_train.item_id.max()

t = time.time()
shops = []
for i in range(NUM_SHOPS):
    rows = np.array([], dtype=np.int32)
    cols = np.array([], dtype=np.int32)
    data = np.array([], dtype=np.int32)
    for j in range(NUM_MONTHS):
        sales = (df_train[(df_train.shop_id == i)
                          & (df_train.date_block_num == j)
                          & (df_train.item_cnt_month > 0)]
                         [['item_id', 'item_cnt_month']])
        sales = sales.groupby('item_id').sum()
        n = len(sales)
        rows = np.append(rows, j * np.ones(n, dtype=np.int32))
        cols = np.append(cols, sales.index.values)
        data = np.append(data, sales.values)

    if len(rows) == 0 or len(cols) == 0:
        ary = sparse.eye(NUM_MONTHS, NUM_ITEMS, dtype=np.int32) * 0
    else:
        ary = sparse.csr_matrix((data, (rows, cols)))
    ary.resize(1, NUM_MONTHS * NUM_ITEMS)
    nary = norm(ary)
    if nary > 1e-9:
        ary = ary / nary
    shops.append(ary)

    tnew = time.time()
    print('#{}: {}'.format(i, tnew - t))
    t = tnew

shops_array = scipy.sparse.vstack(shops)
shops_array.shape

#0: 2.27766489982605
#1: 2.2784321308135986
#2: 2.3728599548339844
#3: 2.3689398765563965
#4: 2.3703761100769043
#5: 2.1806280612945557
#6: 2.1686389446258545
#7: 2.1553468704223633
#8: 2.132798194885254
#9: 2.1257519721984863
#10: 2.2049078941345215
#11: 2.131197929382324
#12: 2.1855480670928955
#13: 2.135329008102417
#14: 2.146336078643799
#15: 2.1550419330596924
#16: 2.166378974914551
#17: 2.1675779819488525
#18: 2.152194023132324
#19: 2.1813011169433594
#20: 2.11493182182312
#21: 2.154306173324585
#22: 2.1544299125671387
#23: 2.1690449714660645
#24: 2.2033841609954834
#25: 2.15881085395813
#26: 2.1569950580596924
#27: 2.16158390045166
#28: 2.187429189682007
#29: 2.157904863357544
#30: 2.1175200939178467
#31: 2.2270898818969727
#32: 2.1201541423797607
#33: 2.1271018981933594
#34: 2.147243022918701
#35: 2.2156739234924316
#36: 2.128960132598877
#37: 2.153069019317627
#38: 2.15480899810791
#39: 2.1431989669799805
#40: 2.159646987915039
#41: 2.167747974395752
#42: 2.2043559551239014
#4

(60, 687239)

In [52]:
df = pd.DataFrame()
for i in range(10):
    sc = SpectralClustering(n_clusters=5, random_state=40 + i)
    pred = sc.fit_predict(shops_array)
    df[40 + i] = pd.Series(pred).value_counts().reset_index(drop=True)

df

Unnamed: 0,40,41,42,43,44,45,46,47,48,49
0,16,16,16,16,16,17,16,16,16,16
1,14,14,14,14,14,16,14,14,14,14
2,11,12,12,12,11,12,12,12,12,12
3,10,9,9,9,10,9,9,9,9,9
4,9,9,9,9,9,6,9,9,9,9


In [10]:
import pickle

with open(DATA_DIR + 'shops_sparse.pickle', 'wb') as f:
    pickle.dump(shops_array, f)

In [11]:
NUM_ICATS = 84
NUM_MONTHS = 31  # drop recent ones to prevent leaks
NUM_ITEMS = df_train.item_id.max()

t = time.time()
icats = []
for i in range(NUM_ICATS):
    rows = np.array([], dtype=np.int32)
    cols = np.array([], dtype=np.int32)
    data = np.array([], dtype=np.int32)
    for j in range(NUM_MONTHS):
        sales = (df_train[(df_train.item_category_id == i)
                          & (df_train.date_block_num == j)
                          & (df_train.item_cnt_month > 0)]
                         [['item_id', 'item_cnt_month']])
        sales = sales.groupby('item_id').sum()
        n = len(sales)
        rows = np.append(rows, j * np.ones(n, dtype=np.int32))
        cols = np.append(cols, sales.index.values)
        data = np.append(data, sales.values)

    if len(rows) == 0 or len(cols) == 0:
        ary = sparse.eye(NUM_MONTHS, NUM_ITEMS, dtype=np.int32) * 0
    else:
        ary = sparse.csr_matrix((data, (rows, cols)))
    ary.resize(1, NUM_MONTHS * NUM_ITEMS)
    nary = norm(ary)
    if nary > 1e-9:
        ary = ary / nary
    icats.append(ary)

    tnew = time.time()
    print('#{}: {}'.format(i, tnew - t))
    t = tnew

icats_array = scipy.sparse.vstack(icats)
icats_array.shape

#0: 2.075252056121826
#1: 2.105881929397583
#2: 2.1237289905548096
#3: 2.1643528938293457
#4: 2.1371350288391113
#5: 2.1205451488494873
#6: 2.141018867492676
#7: 2.0875160694122314
#8: 2.0864548683166504
#9: 2.126915216445923
#10: 2.088488817214966
#11: 2.1187210083007812
#12: 2.1385409832000732
#13: 2.3269309997558594
#14: 2.122680187225342
#15: 2.1428489685058594
#16: 2.0610790252685547
#17: 2.10392689704895
#18: 2.091912031173706
#19: 2.135356903076172
#20: 2.13275408744812
#21: 2.1458189487457275
#22: 2.137531042098999
#23: 2.1258840560913086
#24: 2.1072988510131836
#25: 2.1442220211029053
#26: 2.0803911685943604
#27: 2.087533950805664
#28: 2.1028618812561035
#29: 2.1580910682678223
#30: 2.1342170238494873
#31: 2.0994699001312256
#32: 2.0983691215515137
#33: 2.126487970352173
#34: 2.1279220581054688
#35: 2.113886833190918
#36: 2.1064000129699707
#37: 2.136101007461548
#38: 2.438115119934082
#39: 2.094364881515503
#40: 2.193444013595581
#41: 2.2055020332336426
#42: 2.077935934066772

(84, 687239)

In [12]:
sc = SpectralClustering(n_clusters=7, random_state=42)
pred = sc.fit_predict(icats_array)
pd.Series(pred).value_counts()

2    32
5    21
0    15
3     6
1     6
6     3
4     1
dtype: int64

In [13]:
with open(DATA_DIR + 'icats_sparse.pickle', 'wb') as f:
    pickle.dump(icats_array, f)