In [1]:
import os
import gc
import heapq
import pickle
import numba as nb
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import math
import jit

In [2]:
# Outer typed dict: int64 key -> inner typed dict (int64 key -> float64 value).
# The inner value type is spelled out with DictType instead of being read off
# a throwaway empty dict via nb.typeof().
fullSimMatrix = nb.typed.Dict.empty(
    key_type=nb.types.int64,
    value_type=nb.types.DictType(nb.types.int64, nb.types.float64),
)

In [7]:
# Sample inner dict used to exercise the helpers below: int64 key -> float64 value.
inner_dict_1 = nb.typed.Dict.empty(
    key_type = nb.types.int64,
    value_type = nb.types.float64
)

# The integer literals are stored as float64 (see value_type above); the
# displayed dict shows them as 85.0, 100.0, ...
inner_dict_1[123454] = 85
inner_dict_1[432435] = 100
inner_dict_1[341451] = 25
inner_dict_1[165314] = 31

# Last expression of the cell: display the populated dict.
inner_dict_1

DictType[int64,float64]<iv=None>({123454: 85.0, 432435: 100.0, 341451: 25.0, 165314: 31.0})

In [8]:
# Store the sample inner dict as the row for key 134134.
fullSimMatrix[134134] = inner_dict_1

In [19]:
# Display the matrix: a single row keyed by 134134 holding all four entries.
fullSimMatrix

DictType[int64,DictType[int64,float64]<iv=None>]<iv=None>({134134: {123454: 85.0, 432435: 100.0, 341451: 25.0, 165314: 31.0}})

In [27]:
@nb.jit(nopython = True)
def heap_topk(cnt, cap):
    """
    Keep the `cap` entries of the dict `cnt` that have the largest values.

    A min-heap of (value, iteration_index, key) tuples is filled while
    iterating over `cnt`; whenever it grows beyond `cap` entries the smallest
    tuple is popped, so only the largest ones survive.

    Returns a new int64 -> float64 typed dict whose entries are inserted from
    the largest retained value down to the smallest.
    """
    # Zero-iteration comprehension: produces an empty list while still giving
    # numba a template tuple from which to infer the element type.
    heap = [(np.float64(0), 0, np.int64(0)) for _ in range(0)]
    for position, (key, value) in enumerate(cnt.items()):
        heapq.heappush(heap, (value, position, key))
        if len(heap) > cap:
            heapq.heappop(heap)

    # Popping a min-heap yields ascending order ...
    ascending = [heapq.heappop(heap) for _ in range(len(heap))]

    # ... so walk it backwards to insert the largest value first.
    topk = nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)
    for pos in range(len(ascending) - 1, -1, -1):
        entry = ascending[pos]
        topk[entry[2]] = entry[0]

    return topk
   
@nb.jit(nopython = True)
def get_topk(cnts, k = 2):
    """
    Trim every inner dict of `cnts` to its `k` highest-valued entries, in place.

    Example input:
        {item1: {item2: 102, item3: 203},
         item2: {item1: 100, item3: 5, item4: 10, item5: 1000},
         ...}
    Result with k = 1:
        {item1: {item3: 203},
         item2: {item5: 1000}}

    Nothing is returned: each value of `cnts` is replaced by the dict that
    heap_topk(value, k) produces.
    """
    # Only existing keys are reassigned, so the key set does not change while looping.
    for key, inner in cnts.items():
        cnts[key] = heap_topk(inner, k)

In [32]:
# Sanity check: slicing with [::-1] gives a reversed copy of the list
# (the same idiom heap_topk uses to flip its ascending result).
[1,5,6,4,1,3,1][::-1]

[1, 3, 1, 4, 6, 5, 1]

In [25]:
import heapq  # repeats the import from the first cell

# Sample unsorted list.
li = [5, 7, 9, 1, 3]

# heapify() rearranges li in place into min-heap order (li[0] becomes the
# smallest element). It returns None, so this cell displays nothing.
heapq.heapify(li)

In [28]:
# Trim every row of fullSimMatrix in place; k is left at its default of 2.
get_topk(fullSimMatrix)

In [29]:
# Display the matrix after trimming: the row for 134134 keeps only its two
# highest-valued entries.
fullSimMatrix

DictType[int64,DictType[int64,float64]<iv=None>]<iv=None>({134134: {432435: 100.0, 123454: 85.0}})

In [44]:
import heapq

# Empty heap built with the typed-empty-list idiom used in heap_topk: the
# comprehension runs zero times, so the tuple is only a template for the
# element layout. It is a 2-tuple (score, item_id) to match the tuples pushed
# below (the previous 3-tuple template did not match them).
q = [(np.float64(0), np.int64(0)) for _ in range(0)]

# Push (score, item_id) pairs. heapq compares tuples element by element, so
# the score is the primary ordering key and item_id only breaks ties.
heapq.heappush(q, (100, 3))
heapq.heappush(q, (101, 3))
heapq.heappush(q, (102, 2))

In [45]:
# Heap contents after the three pushes.
q

[(100, 3), (101, 3), (102, 2)]

In [47]:
# Remove and display the smallest tuple; the first element (the score) decides the order.
heapq.heappop(q)

(100, 3)

In [48]:
# Heap contents after the pop: the two larger tuples remain.
q

[(101, 3), (102, 2)]

In [54]:
@nb.jit(nopython = True)
def heap_topk(cnt, cap):
    """
    Return the `cap` highest-scoring entries of the dict `cnt`.

    Entries are pushed onto a min-heap as (sim_score, item_ref) tuples, so the
    score drives the ordering. Whenever the heap holds more than `cap` tuples
    the smallest one is discarded, which keeps the heap at `cap` entries at
    most.

    Returns a new int64 -> float64 typed dict mapping item_ref -> sim_score,
    with entries inserted from the highest retained score to the lowest.
    """
    # Zero-iteration comprehension: an empty list whose element type numba can
    # still infer from the (float64, int64) template tuple.
    heap = [(np.float64(0), np.int64(0)) for _ in range(0)]
    for item_ref, sim_score in cnt.items():
        heapq.heappush(heap, (sim_score, item_ref))
        # Evict the current minimum once the heap exceeds the cap.
        if len(heap) > cap:
            heapq.heappop(heap)

    # Draining a min-heap gives ascending scores; reverse for best-first order.
    ascending = [heapq.heappop(heap) for _ in range(len(heap))]

    topk = nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)
    for score, ref in ascending[::-1]:
        topk[ref] = score

    return topk

In [53]:
# Print the first (key, value) pair yielded by items(), then stop.
for i, j in inner_dict_1.items():
    print(i)  # key
    print(j)  # value
    break

123454
85.0


In [50]:
# inner_dict_1 still shows all four entries: get_topk replaced the row stored
# in fullSimMatrix with a new dict from heap_topk instead of trimming this one.
inner_dict_1

DictType[int64,float64]<iv=None>({123454: 85.0, 432435: 100.0, 341451: 25.0, 165314: 31.0})

In [56]:
# Top-2 entries of the sample dict; the result lists the highest score first.
heap_topk(inner_dict_1, 2)

DictType[int64,float64]<iv=None>({432435: 100.0, 123454: 85.0})

In [39]:
# Peek at the first element produced by enumerate(items()): the running index
# followed by the unpacked key and value. All three are printed, one per line,
# so the cell reproduces the three-line output recorded for it.
for i, (item_ref, sim_score) in enumerate(inner_dict_1.items()):
    print(i)
    print(item_ref)
    print(sim_score)
    break

0
123454
85.0


In [57]:
# Display the matrix again; the row for 134134 holds the two trimmed entries.
fullSimMatrix

DictType[int64,DictType[int64,float64]<iv=None>]<iv=None>({134134: {432435: 100.0, 123454: 85.0}})

In [None]:
## Steps to run inference
## 1. Read a test-set row and look at its aids and ops.
## 2. Assign a time weight and a sequence weight to each aid the row interacted with.
## 3. For each of those aids, look up its similar items in fullSimMatrix.
## 4. Combine the similarity scores with the sequence and time weights from step 2.
## 5. For each user, keep the top 20 items from step 4; use heap_topk so memory use stays bounded.