In [1]:
import numpy as np
import pandas as pd
import glob

In [2]:
tables = {
    'orders': '/kaggle/input/test-recsys/kaggle_tab_1345/tab_1_orders.csv',
    'products': list(sorted(glob.glob('/kaggle/input/test-recsys/sbermarket_tab_2*/*'))),
    'categories': '/kaggle/input/test-recsys/kaggle_tab_1345/tab_3_categories.csv',
    'users': '/kaggle/input/test-recsys/kaggle_tab_1345/tab_4_user_profiles.csv',
    'product_properties': '/kaggle/input/test-recsys/kaggle_tab_1345/tab_5_product_properties.csv',
    'cities': '/kaggle/input/test-recsys/tab_6_city.csv',
    'sample_submission': '/kaggle/input/test-recsys/sample_submission.csv',
}

products_dtypes = {
    'user_id': 'int32',
    'order_id': 'int32',
    'line_item_id': 'int32',
    'price': 'float32',
    'quantity': 'int32',
    'discount': 'float32',
    'product_id': 'int32',
    'master_category_id': 'float32',
    'parent_category_id': 'int32',
}

# tables['products']

In [3]:
subm = pd.read_csv(tables['sample_submission'])
subm_ids = subm.Id.values
subm_ids.shape, subm_ids[:5]

((107068,), array([  51,   65,  766, 1132, 1578]))

In [4]:
products = pd.concat(list(map(lambda x: pd.read_csv(x, usecols=['user_id', 'order_id', 'line_item_id', 'product_id'], dtype='int32'), tables['products'])))
products.head(), products.shape

(   user_id  order_id  line_item_id  product_id
 0      525   5354800      28381452       37548
 1      525   5354800      29242911        5636
 2      525   5354800      29242919       22107
 3      525   5354800      29243785        2530
 4      525   5354800      29244246     3818486,
 (59596980, 4))

### Filtered products only for test users

In [5]:
products_subm = products[products.user_id.isin(subm_ids)]
print(products_subm.shape)

(21677070, 4)


In [6]:
products = products[products.product_id.isin(products_subm.product_id.unique())]
# products = products[products.product_id != 0]
products.head(), products.shape

(   user_id  order_id  line_item_id  product_id
 0      525   5354800      28381452       37548
 1      525   5354800      29242911        5636
 2      525   5354800      29242919       22107
 3      525   5354800      29243785        2530
 4      525   5354800      29244246     3818486,
 (59586459, 4))

In [None]:
# products_subm = products

### Encode `line_item_id` inside each order

In [7]:
products_subm = products_subm.sort_values(['user_id', 'order_id', 'line_item_id'], ascending=[True, True, True])
products_subm.head()

line_item_idx = products_subm.groupby(['user_id', 'order_id']).cumcount().reset_index()
products_subm['line_item_pos'] = 0
products_subm['line_item_pos'] = products_subm['line_item_pos'].astype('int32')
products_subm['line_item_pos'] = line_item_idx.values[:, 1]
products_subm.head()

Unnamed: 0,user_id,order_id,line_item_id,product_id,line_item_pos
0,51,5324015,29478760,5501,0
1,51,5324015,29478763,19562,1
2,51,5324015,29478764,50190,2
3,51,5324015,29478780,45435,3
4,51,5324015,29478782,3817487,4


### Take last n orders

In [11]:
# n_last_orders = 5

# last_user_orders = products_subm.groupby(['user_id', 'order_id']).count().reset_index().sort_values(['user_id', 'order_id'], ascending=[True, False])
# last_user_orders = last_user_orders.groupby('user_id').head(n_last_orders).reset_index(drop=True)[['user_id', 'order_id']]
# last_user_orders.head()

# products_subm = products_subm.merge(last_user_orders, how='inner', on=['user_id', 'order_id'])
# products_subm.shape

### Mean line item position

In [10]:
user_product_line_pos_mean = products_subm.groupby(['user_id', 'product_id'])['line_item_pos'].mean().reset_index()
user_product_line_pos_mean = user_product_line_pos_mean.astype({'user_id': 'int32', 'product_id': 'int32', 'line_item_pos': 'float32'})
user_product_line_pos_mean.head()

Unnamed: 0,user_id,product_id,line_item_pos
0,51,0,40.75
1,51,159,2.0
2,51,397,50.0
3,51,407,27.5
4,51,456,24.666666


### Last order id

In [11]:
last_order_id = products_subm.groupby(['user_id', 'product_id'])['order_id'].max().reset_index()
last_order_id = last_order_id.rename(columns={'order_id': 'last_order_id'})
last_order_id = last_order_id.astype('int32')
last_order_id.head()

Unnamed: 0,user_id,product_id,last_order_id
0,51,0,7767199
1,51,159,7767199
2,51,397,6905330
3,51,407,9096837
4,51,456,13911065


### User-product frequencies

sorted by `cnt`, `last_order_id`, `mean_line_idx`

In [12]:
user_product_freqs = products_subm.groupby(['user_id', 'product_id'])['order_id'].count().sort_values(ascending=False).reset_index()
user_product_freqs = user_product_freqs.rename(columns={'order_id':'cnt'})
user_product_freqs = user_product_freqs.astype('int32')
user_product_freqs.head()

Unnamed: 0,user_id,product_id,cnt
0,56037,0,471
1,222648,3818317,364
2,56037,6375406,360
3,222648,709,332
4,56037,12565,330


In [13]:
user_product_freqs = user_product_freqs.merge(last_order_id, on=['user_id', 'product_id'])
del last_order_id
user_product_freqs = user_product_freqs.merge(user_product_line_pos_mean, on=['user_id', 'product_id'])
del user_product_line_pos_mean

user_product_freqs = user_product_freqs.sort_values(['user_id', 'cnt', 'last_order_id', 'line_item_pos'], ascending=[True, False, False, True])
user_product_freqs = user_product_freqs.astype({'user_id': 'int32', 'product_id': 'int32', 'cnt': 'int32', 'last_order_id': 'int32', 'line_item_pos': 'float32'})
user_product_freqs.head()

# user_product_freqs = user_product_freqs.sort_values(['user_id', 'last_order_id', 'cnt', 'line_item_pos'], ascending=[True, False, False, True])
# user_product_freqs = user_product_freqs.astype({'user_id': 'int32', 'product_id': 'int32', 'cnt': 'int32', 'last_order_id': 'int32', 'line_item_pos': 'float32'})
# user_product_freqs.head()

# user_product_freqs['score'] = user_product_freqs['cnt'] / (1 + np.log1p(user_product_freqs['line_item_pos']))
# user_product_freqs = user_product_freqs.sort_values(['user_id', 'score', 'last_order_id'], ascending=[True, False, False,])
# user_product_freqs.head()

Unnamed: 0,user_id,product_id,cnt,last_order_id,line_item_pos
10830,51,14863,16,13911065,20.5
25694,51,63057,13,13911065,8.538462
25356,51,19562,13,13911065,24.153847
32099,51,3562687,12,13911065,27.416666
35209,51,709,11,13911065,7.727273


In [14]:
user_num_orders = products_subm.groupby(['user_id', 'order_id']).count().reset_index().groupby('user_id')['order_id'].count().reset_index()
user_num_orders = user_num_orders.rename(columns={'order_id': 'num_orders_total'})

user_product_freqs = user_product_freqs.merge(user_num_orders, on='user_id')
user_product_freqs['ratio_orders'] = user_product_freqs['cnt'] / user_product_freqs['num_orders_total']
user_product_freqs.head()

Unnamed: 0,user_id,product_id,cnt,last_order_id,line_item_pos,num_orders_total,ratio_orders
0,51,14863,16,13911065,20.5,17,0.941176
1,51,63057,13,13911065,8.538462,17,0.764706
2,51,19562,13,13911065,24.153847,17,0.764706
3,51,3562687,12,13911065,27.416666,17,0.705882
4,51,709,11,13911065,7.727273,17,0.647059


In [15]:
user_product_freqs[user_product_freqs.user_id == 51]

Unnamed: 0,user_id,product_id,cnt,last_order_id,line_item_pos,num_orders_total,ratio_orders
0,51,14863,16,13911065,20.500000,17,0.941176
1,51,63057,13,13911065,8.538462,17,0.764706
2,51,19562,13,13911065,24.153847,17,0.764706
3,51,3562687,12,13911065,27.416666,17,0.705882
4,51,709,11,13911065,7.727273,17,0.647059
...,...,...,...,...,...,...,...
496,51,5567,1,5324015,60.000000,17,0.058824
497,51,38597,1,5324015,61.000000,17,0.058824
498,51,5481691,1,5324015,62.000000,17,0.058824
499,51,21874,1,5324015,64.000000,17,0.058824


### User top products

In [16]:
user_top_products = user_product_freqs.groupby('user_id')['product_id'].apply(list).reset_index()
# user_top_products = user_product_freqs[user_product_freqs.ratio_orders >= 0.1].groupby('user_id')['product_id'].apply(list).reset_index()
# user_top_products = user_product_freqs[user_product_freqs.product_id != 0].groupby('user_id')['product_id'].apply(list).reset_index()
user_top_products.head()

Unnamed: 0,user_id,product_id
0,51,"[14863, 63057, 19562, 3562687, 709, 9979, 3817..."
1,65,"[54728, 18450, 3817542, 709, 18439, 26062, 105..."
2,187,"[88084, 88086]"
3,400,"[94333, 3817507, 0, 813, 6565232, 61053, 42032..."
4,576,"[177, 2600, 46469, 5046703, 102, 7103983, 1400..."


In [45]:
# k_products = 50
# user_top_products.product_id = user_top_products.product_id.map(lambda x: ' '.join(map(str, x[:k_products])))
# user_top_products.head()

Unnamed: 0,user_id,product_id
0,51,14863 63057 19562 3562687 709 9979 3817489 456...
1,65,54728 18450 3817542 709 18439 26062 105764 771...
2,83,26589 74993 38831 4593463 63102 198543 38837 5...
3,142,5051530
4,187,88084 88086


In [47]:
# user_top_products.to_csv('/kaggle/working/user_top_products.csv', index=False)
# user_top_products = pd.read_csv('/kaggle/working/user_top_products.csv')
# user_top_products.product_id = a.product_id.apply(lambda x: x.split(' '))
# user_top_products.head()

## Models

In [17]:
from scipy.sparse import coo_matrix, csr_matrix
import os
import implicit

In [20]:
user_ids = sorted(products.user_id.unique())
product_ids = sorted(products.product_id.unique())
mappings = {
    'user2idx': {user_id: i for i, user_id in enumerate(user_ids)},
    'idx2user': {i: user_id for i, user_id in enumerate(user_ids)},
    'product2idx': {product_id: i for i, product_id in enumerate(product_ids)},
    'idx2product': {i: product_id for i, product_id in enumerate(product_ids)},
}

In [21]:
user_num_orders = products.groupby(['user_id', 'order_id']).count().reset_index().groupby('user_id')['order_id'].count().reset_index()
user_num_orders = user_num_orders.rename(columns={'order_id': 'num_orders_total'})

user_num_products = products.groupby(['user_id', 'product_id'])['order_id'].count().reset_index()
user_num_products = user_num_products.rename(columns={'order_id': 'num_orders'})

user_product_counts = user_num_products.merge(user_num_orders, on='user_id')
user_product_counts['ratio_orders'] = user_product_counts['num_orders'] / user_product_counts['num_orders_total']
user_product_counts.head()

Unnamed: 0,user_id,product_id,num_orders,num_orders_total,ratio_orders
0,51,0,4,17,0.235294
1,51,159,1,17,0.058824
2,51,397,2,17,0.117647
3,51,407,6,17,0.352941
4,51,456,9,17,0.529412


In [19]:
user_idxs = user_product_counts.user_id.map(mappings['user2idx']).values
product_idxs = user_product_counts.product_id.map(mappings['product2idx']).values
values = user_product_counts.ratio_orders.values

user_item = csr_matrix((values, (user_idxs, product_idxs)))
del user_idxs, product_idxs, values

item_user = user_item.T

data_conf = (item_user * 1).astype('double')
data_conf

<91994x657480 sparse matrix of type '<class 'numpy.float64'>'
	with 46184547 stored elements in Compressed Sparse Column format>

In [43]:
# user_idxs = products.user_id.map(mappings['user2idx'])
# product_idxs = products.product_id.map(mappings['product2idx'])
# user_item = coo_matrix((np.ones_like(user_idxs), (user_idxs, product_idxs))).tocsr()


<91994x657480 sparse matrix of type '<class 'numpy.float64'>'
	with 46184547 stored elements in Compressed Sparse Column format>

In [23]:
# model = implicit.als.AlternatingLeastSquares(
#     factors=64, regularization=0.1, iterations=10,
#     calculate_training_loss=True, random_state=0
# )

# model = implicit.nearest_neighbours.ItemItemRecommender(K=10)

model = implicit.nearest_neighbours.CosineRecommender(K=10)
model.fit(data_conf)

HBox(children=(FloatProgress(value=0.0, max=91994.0), HTML(value='')))




In [20]:
model.recommend(0, data_conf, N=10, filter_already_liked_items=False)

[(194, 1.6546121920487693),
 (33742, 1.0053961422450421),
 (3925, 0.9552985187757539),
 (32932, 0.859725834876824),
 (5298, 0.7740475975074037),
 (13203, 0.7647058823529421),
 (10589, 0.7646196089377536),
 (0, 0.7549764714008546),
 (180, 0.7545409117857674),
 (2625, 0.6439530968353565)]

### Predictions #1

`Top Products + Model recommendations to fill K=50`

In [25]:
from tqdm import tqdm

k = 50
ne_cnt = 0
predictions = [None] * len(subm_ids)
for i, user_id in enumerate(tqdm(subm_ids)):
    user_tp = user_top_products[user_top_products.user_id == user_id].product_id.values
    items = user_tp[0][:k] if len(user_tp) else []
    if len(items) < k:
        items_idx = list(map(lambda x: mappings['product2idx'][x], items))
        user_idx = mappings['user2idx'][user_id]
        items_a = model.recommend(user_idx, data_conf.T, k - len(items), 
                                  filter_items=items_idx, filter_already_liked_items=False)
        items_a = list(map(lambda x: mappings['idx2product'][x[0]], items_a))
        items += items_a
        ne_cnt += 1   
        
    predictions[i] = (user_id, items)

ne_cnt

100%|██████████| 107068/107068 [02:16<00:00, 783.12it/s]


35054

In [27]:
_, preds = zip(*predictions)
preds = list(map(lambda x: ' '.join(map(str, x)), preds))
test_submission = pd.DataFrame({'Id': subm_ids, 'Predicted': preds})
# test_submission
test_submission.to_csv('/kaggle/working/submission_top_sort3cdp_cosi2i_ratios.csv', index=False)

### Predictions #2: 

`Top Products in Top-100 Model recommendations + Model recommendationsto fill K=50`

In [99]:
from tqdm import tqdm

k = 50
k_rec = 100
ne_cnt = 0
predictions = [None] * len(subm_ids)
for i, user_id in enumerate(tqdm(subm_ids)):
    user_tp = user_top_products[user_top_products.user_id == user_id].product_id.values
    items = user_tp[0][:k_rec] if len(user_tp) else []
    
    if len(items) > 0:
        user_idx = mappings['user2idx'][user_id]
        items_a = model.recommend(user_idx, data_conf.T, k_rec, filter_items=[0], filter_already_liked_items=False)      
        items_a = list(map(lambda x: mappings['idx2product'][x[0]], items_a))
        items_a_set = set(items_a)
        items = [x for x in items if x in items_a_set][:k]
    
    if len(items) < k:
        items_set = set(items)
        items_a = [x for x in items_a if x not in items_set][:k - len(items)]
        items += items_a
        ne_cnt += 1   
        
    predictions[i] = (user_id, items)

ne_cnt

100%|██████████| 107068/107068 [03:49<00:00, 467.10it/s]


37454