In [1]:
import os
import gc
import heapq
import pickle
import numba as nb
import numpy as np
import pandas as pd
from tqdm.auto import tqdm


In [5]:
%%time
df = pd.read_csv("../../otto_data/train.csv")
df_test = pd.read_csv("../../otto_data/test.csv")
df = pd.concat([df, df_test]).reset_index(drop = True)
npz = np.load("../../otto_data/train.npz")
npz_test = np.load("../../otto_data/test.npz")
aids = np.concatenate([npz['aids'], npz_test['aids']])
ts = np.concatenate([npz['ts'], npz_test['ts']])
ops = np.concatenate([npz['ops'], npz_test['ops']])

df["idx"] = np.cumsum(df.length) - df.length
df["end_time"] = df.start_time + ts[df.idx + df.length - 1]

CPU times: user 16.3 s, sys: 4.49 s, total: 20.7 s
Wall time: 21.9 s


In [10]:
parallel = 1024

In [6]:
df

Unnamed: 0,session,start_time,length,idx,end_time
0,0,1659304800,276,0,1661684983
1,1,1659304800,32,276,1661714854
2,2,1659304800,33,308,1661714215
3,3,1659304800,226,341,1661109666
4,4,1659304800,19,567,1661586681
...,...,...,...,...,...
14571577,14571577,1662328774,1,223644214,1662328774
14571578,14571578,1662328775,1,223644215,1662328775
14571579,14571579,1662328775,1,223644216,1662328775
14571580,14571580,1662328781,1,223644217,1662328781


In [7]:
aids

array([1517085, 1563459, 1309446, ...,  739876,  202353, 1100210])

In [8]:
@nb.jit(nopython=True)
def getSimScoresSingleRow(pairs_this_row, start_time, start_idx, length, aids, ts, ops, mode):
    max_idx = start_idx + length
    min_idx = max(max_idx - 30, start_idx)  
    for i in range(min_idx, max_idx):
        for j in range(i+1, max_idx):
            if ts[j] - ts[i] > 24 * 60 * 60: continue
            if aids[i] == aids[j]: continue
            if mode == "naive":
                pairs_this_row[(aids[i], aids[j])] = 1
                pairs_this_row[(aids[j], aids[i])] = 1
            elif mode == "iuf":
                pass

@nb.jit(nopython=True, parallel=True, cache=True)
def getSimScoreBatch(aids, ts, ops, rows, fullSimMatrix, mode="naive"):
    nrows = len(rows)
    pairs_this_batch = [{(0, 0): 0.0 for _ in range(0)} for _ in range(nrows)]
    ## get the sim scores of each batch in seperate sub dict in pairs_this_batch
    for row_i in nb.prange(nrows):  ## run each row of the batch in parallel
        _, start_idx, length, start_time = rows[row_i]
        getSimScoresSingleRow(pairs_this_batch[row_i], start_time, start_idx, length, aids, ts, ops, mode)
    ## merge pairs_this_batch into one big sim matrix for the batch.
    for row_i in range(nrows):
        for (aid1, aid2), score in pairs_this_batch[row_i].items():
            if aid1 not in fullSimMatrix: 
                fullSimMatrix[aid1] = {0: 0.0 for _ in range(0)}
            if aid2 not in fullSimMatrix[aid1]:
                fullSimMatrix[aid1][aid2] = 0.0
            fullSimMatrix[aid1][aid2] += score
    

@nb.jit(nopython = True, cache = True)
def heap_topk(cnt, overwrite, cap):
    q = [(0.0, 0, 0) for _ in range(0)]
    for i, (k, n) in enumerate(cnt.items()):
        if overwrite == 1:
            heapq.heappush(q, (n, i, k))
        else:
            heapq.heappush(q, (n, -i, k))
        if len(q) > cap:
            heapq.heappop(q)
    return [heapq.heappop(q)[2] for _ in range(len(q))][::-1]
   
# save top-k aid2 for each aid1's cnt
@nb.jit(nopython = True, cache = True)
def get_topk(cnts, topk, k):
    for aid1, cnt in cnts.items():
        topk[aid1] = np.array(heap_topk(cnt, 1, k))

In [14]:
%%time
topks = {}

## the nested dict to store full sim matrix, {itemX: {itemY: score, itemZ: score, ...}}
fullSimMatrix = nb.typed.Dict.empty(
        key_type = nb.types.int64,
        value_type = nb.typeof(nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)))
max_idx = len(df)
for idx in tqdm(range(0, max_idx, parallel)):
    rows_batch = df.iloc[idx: min(idx + parallel, max_idx)][["session", "idx", "length", "start_time"]].values
    getSimScoreBatch(aids, ts, ops, rows_batch, fullSimMatrix)
    
 # get topk from counter
topk = nb.typed.Dict.empty(
        key_type = nb.types.int64,
        value_type = nb.types.int64[:])
        
get_topk(fullSimMatrix, topk, 20)

  0%|          | 0/1 [00:00<?, ?it/s]


UnsupportedError: Failed in nopython mode pipeline (step: inline calls to locally defined closures)
[1mUse of unsupported opcode (MAP_ADD) found
[1m
File "<ipython-input-8-3a43a958dd06>", line 27:[0m
[1mdef getSimScoreBatch(aids, ts, ops, rows, fullSimMatrix, mode="naive"):
    <source elided>
            if aid1 not in fullSimMatrix: 
[1m                fullSimMatrix[aid1] = {0: 0.0 for _ in range(0)}
[0m                [1m^[0m[0m
[0m