In [1]:
import gc
import heapq
import os
import pickle

import numba as nb
import numpy as np
import pandas as pd
from tqdm.auto import tqdm


In [2]:
# Record the NumPy version the notebook was executed with.
np.__version__

'1.24.1'

In [3]:
# Record the pandas version the notebook was executed with.
pd.__version__

'1.3.5'

In [43]:
%%time
df = pd.read_csv("../../allData/validationData/train_meta_data.csv")
df_test = pd.read_csv("../../allData/validationData/test_meta_data.csv")
df = pd.concat([df, df_test]).reset_index(drop = True)
npz = np.load("../../allData/validationData/train_core_data.npz")
npz_test = np.load("../../allData/validationData/test_core_data.npz")
aids = np.concatenate([npz['aids'], npz_test['aids']])
ts = np.concatenate([npz['ts'], npz_test['ts']])
ops = np.concatenate([npz['ops'], npz_test['ops']])

df["start_idx"] = df["total_action"].cumsum(axis = 0) - df["total_action"].iloc[0]
#df["end_time"] = df["session_start_time"] + ts[df["start_idx"] + df["total_action"] - 1]

In [42]:
# Display the combined metadata frame to inspect its columns.
df

Unnamed: 0,session,total_action,session_start_time,session_end_time,start_idx,end_time
0,0,147,1659304800,1661103727,0,3320408527
1,1,27,1659304800,1660857067,27,3319015460
2,2,13,1659304800,1660577379,40,3319015443
3,3,226,1659304800,1661109666,266,3318655453
4,4,3,1659304800,1659304900,269,3319160501
...,...,...,...,...,...,...
12368249,12899774,1,1661723968,1661723968,171021986,3323447813
12368250,12899775,1,1661723970,1661723970,171021987,3323447815
12368251,12899776,1,1661723972,1661723972,171021988,3323447817
12368252,12899777,1,1661723976,1661723976,171021989,3323447822


In [31]:
# Display the combined metadata frame to inspect its columns.
df

Unnamed: 0,session,total_action,session_start_time,session_end_time,start_idx,end_time
0,0,147,1659304800,1661103727,147,3320408527
1,1,27,1659304800,1660857067,174,3320161867
2,2,13,1659304800,1660577379,187,3319882179
3,3,226,1659304800,1661109666,413,3320414466
4,4,3,1659304800,1659304900,416,3318609700
...,...,...,...,...,...,...
12368249,12899774,1,1661723968,1661723968,171022133,3323447936
12368250,12899775,1,1661723970,1661723970,171022134,3323447940
12368251,12899776,1,1661723972,1661723972,171022135,3323447944
12368252,12899777,1,1661723976,1661723976,171022136,3323447952


In [4]:
# Number of df rows (sessions) handed to getSimScoreBatch per call; used as the
# step of the batching loop further down.
parallel = 1024

In [25]:
# Display the combined metadata frame to inspect its columns.
df

Unnamed: 0,session,total_action,session_start_time,session_end_time,start_idx
0,0,147,1659304800,1661103727,147
1,1,27,1659304800,1660857067,174
2,2,13,1659304800,1660577379,187
3,3,226,1659304800,1661109666,413
4,4,3,1659304800,1659304900,416
...,...,...,...,...,...
12368249,12899774,1,1661723968,1661723968,171022133
12368250,12899775,1,1661723970,1661723970,171022134
12368251,12899776,1,1661723972,1661723972,171022135
12368252,12899777,1,1661723976,1661723976,171022136


In [15]:
# Number of entries in the concatenated (train + test) ops array.
len(ops)

171022137

In [16]:
# Sum over all entries of ops.
# NOTE(review): the meaning of the op codes is not documented in this notebook.
np.sum(ops)

21196462

In [17]:
# Number of entries in the concatenated (train + test) aids array.
len(aids)

171022137

In [5]:
@nb.jit(nopython=True)
def getSimScoresSingleRow(pairs_this_row, start_time, start_idx, length, aids, ts, ops, mode):
    """Record the co-occurring item pairs of one session in ``pairs_this_row``.

    The session occupies positions ``start_idx`` .. ``start_idx + length - 1``
    of the flat ``aids`` / ``ts`` arrays; only its last 30 positions are
    examined. For every ordered pair of positions in that window whose ``ts``
    values differ by at most 24 * 60 * 60 and whose aids differ, mode
    ``"naive"`` stores the value 1 under both ``(aid_a, aid_b)`` and
    ``(aid_b, aid_a)``. Any other mode, including ``"iuf"``, records nothing.

    ``start_time`` and ``ops`` are accepted but not used.
    The function mutates ``pairs_this_row`` and returns nothing.
    """
    # 86400 -> one day, provided ts is expressed in seconds (assumed, not verified here).
    max_gap = 24 * 60 * 60
    window_end = start_idx + length
    window_begin = max(window_end - 30, start_idx)
    for earlier in range(window_begin, window_end):
        for later in range(earlier + 1, window_end):
            # Skip pairs that are too far apart or that repeat the same item.
            if ts[later] - ts[earlier] > max_gap or aids[earlier] == aids[later]:
                continue
            if mode == "naive":
                # Symmetric entry: the pair counts once per session in each direction.
                pairs_this_row[(aids[earlier], aids[later])] = 1
                pairs_this_row[(aids[later], aids[earlier])] = 1

@nb.jit(nopython=True, parallel=True, cache=True)
def getSimScoreBatch(aids, ts, ops, rows, fullSimMatrix, mode="naive"):
    """Accumulate the pair scores of a batch of sessions into ``fullSimMatrix``.

    ``rows`` is indexed by row; each row unpacks into four values of which the
    first is ignored and the remaining three are passed on as
    ``start_idx``, ``length`` and ``start_time``. Every row is scored into its
    own dict inside a ``prange`` loop, and the per-row dicts are then merged
    sequentially into the nested dict ``fullSimMatrix``
    (``{aid1: {aid2: summed score}}``), which is mutated in place.
    Nothing is returned.
    """
    batch_size = len(rows)
    # One dict per row so the prange iterations never write to a shared dict.
    # The empty comprehension presumably exists only to give numba the
    # key/value types of the dict.
    per_row_pairs = [{(0, 0): 0.0 for _ in range(0)} for _ in range(batch_size)]
    for r in nb.prange(batch_size):
        _, start_idx, length, start_time = rows[r]
        getSimScoresSingleRow(per_row_pairs[r], start_time, start_idx, length, aids, ts, ops, mode)
    # Sequential merge of the separate per-row dicts into the shared matrix.
    for r in range(batch_size):
        for (aid1, aid2), score in per_row_pairs[r].items():
            if aid1 not in fullSimMatrix:
                fullSimMatrix[aid1] = {0: 0.0 for _ in range(0)}
            neighbours = fullSimMatrix[aid1]
            if aid2 not in neighbours:
                neighbours[aid2] = 0.0
            neighbours[aid2] += score
    


In [6]:

@nb.jit(nopython = True, cache = True)
def heap_topk(cnt, overwrite, cap):
    """Return the keys of ``cnt`` with the ``cap`` largest values, best first.

    ``cnt`` maps key -> score. A min-heap of at most ``cap`` entries is kept
    while iterating, so the smallest entry is evicted whenever the heap grows
    past ``cap``. Ties on the score are broken by iteration position: with
    ``overwrite == 1`` the key seen later ranks higher, otherwise the key seen
    earlier ranks higher.
    """
    heap = [(0.0, 0, 0) for _ in range(0)]
    for position, (key, score) in enumerate(cnt.items()):
        tie_break = position if overwrite == 1 else -position
        heapq.heappush(heap, (score, tie_break, key))
        if len(heap) > cap:
            heapq.heappop(heap)
    # Popping yields the survivors from worst to best; reverse for best-first.
    worst_first = [heapq.heappop(heap)[2] for _ in range(len(heap))]
    return worst_first[::-1]
   
# For every aid1, keep only the k best-scoring aid2 of its score dict.
@nb.jit(nopython = True, cache = True)
def get_topk(cnts, topk, k):
    """Fill ``topk`` with the ``k`` best neighbour ids for each key of ``cnts``.

    ``cnts`` maps aid1 -> {aid2: score}. For each aid1 the neighbour ids are
    ranked by ``heap_topk`` (called with ``overwrite=1``) and stored in
    ``topk[aid1]`` as a NumPy array, best first. ``topk`` is mutated in place;
    nothing is returned.
    """
    for source_aid, neighbour_scores in cnts.items():
        best = heap_topk(neighbour_scores, 1, k)
        topk[source_aid] = np.array(best)

In [8]:
%%time
topks = {}

## the nested dict to store full sim matrix, {itemX: {itemY: score, itemZ: score, ...}}
fullSimMatrix = nb.typed.Dict.empty(
        key_type = nb.types.int64,
        value_type = nb.typeof(nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)))
max_idx = len(df)
for idx in tqdm(range(0, max_idx, parallel)):
    rows_batch = df.iloc[idx: min(idx + parallel, max_idx)][["session", "idx", "length", "start_time"]].values
    getSimScoreBatch(aids, ts, ops, rows_batch, fullSimMatrix)
    
 # get topk from counter
topk = nb.typed.Dict.empty(
        key_type = nb.types.int64,
        value_type = nb.types.int64[:])
get_topk(fullSimMatrix, topk, 20)

100%|██████████| 14231/14231 [1:41:11<00:00,  2.34it/s]     


CPU times: user 43min 8s, sys: 30min 24s, total: 1h 13min 32s
Wall time: 1h 57min 11s
