## Refactor the baseline itemCF_numba, and add feature store function to it

In [1]:
import os
import gc
import heapq
import pickle
import numba as nb
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import math


In [2]:
%%time
df = pd.read_csv("../../allData/validationData/train_meta_data.csv")
df_test = pd.read_csv("../../allData/validationData/test_meta_data.csv")
df = pd.concat([df, df_test]).reset_index(drop = True)
npz = np.load("../../allData/validationData/train_core_data.npz")
npz_test = np.load("../../allData/validationData/test_core_data.npz")
aids = np.concatenate([npz['aids'], npz_test['aids']])
ts = np.concatenate([npz['ts'], npz_test['ts']])
ops = np.concatenate([npz['ops'], npz_test['ops']])

df["start_idx"] = df['total_action'].cumsum().shift(1).fillna(0).astype(int)
df["end_time"] = ts[df["start_idx"] + df["total_action"] - 1]

CPU times: user 6.74 s, sys: 3.06 s, total: 9.8 s
Wall time: 10.3 s


In [3]:
df.head()

Unnamed: 0,session,total_action,session_start_time,session_end_time,start_idx,end_time
0,0,147,1659304800,1661103727,0,1661103727
1,1,27,1659304800,1660857067,147,1660857067
2,2,13,1659304800,1660577379,174,1660577379
3,3,226,1659304800,1661109666,187,1661109666
4,4,3,1659304800,1659304900,413,1659304900


## 1. Training -- Derive ItemCF similarity Matrix

#### CONSTANTS

In [3]:
## Define constants
PARALLEL = 1024
LOOKBACK_WINDOW = 200   ## only fit the latest LOOKBACK_WINDOW to train the sim matrix
#TOPN = 20
ACTION_WEIGHTS = np.array([1.0, 6.0, 3.0])

#### Section A: Utils Functions 
1. Count Item Total likes: The similary score will be normalized by "Item Total Like Scores". In theory, popular items should have less weight in simiarity score.
2. Trimming function: Helpful managing memoery usage. 
3. Method for normalization: Mostly item total like normalization, and max norm(make all sim score between 0 and 1) of the score. 

In [4]:
# ==================================
# Methods for counting Item Total Likes
# ==================================
@nb.jit(nopython=True)
def getItemTotalLikesNaive(aids, ops, item_total_likes, action_weights):
    """
    Stores the total like score of itemXXX in item_total_likes, based on action_weights parameter. np.array([X, Y, Z])
    """
    for idx, item in enumerate(aids):
        if item not in item_total_likes: 
            item_total_likes[item] = 0
        item_total_likes[item] += action_weights[ops[idx]]   ## TODO: For time decay, consider replace with 1, for iuf keep this. 

# ==================================
# Methods for rank and trim the sim score dict
# ==================================
@nb.jit(nopython = True)
def heap_topk(item_cnt_dict, cap):
    """
    get the top cap(k) elements of the cnt dict based on value, using a min-heap structure
    """
    dic = nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)
    q = [(np.float64(0), np.int64(0)) for _ in range(0)]  ## generate empty queue to implement a heap, 
    for item_ref, sim_score in item_cnt_dict.items():   ## read in the dict in heap structure
        heapq.heappush(q, (sim_score, item_ref))   ## push the <sim_score, item_ref_id> pair into min-heap, using sim_score for order
        if len(q) > cap:
            heapq.heappop(q)
            
    res = [heapq.heappop(q) for _ in range(len(q))][::-1]
    for i in range(len(res)):
        dic[res[i][1]] = res[i][0]
    
    return dic
   
@nb.jit(nopython = True)
def trim_simMatrix_topk(fullSimMatrix, k = 50):
    """
    trim top k items of each "itemX: {itemY: score1, ...}" pair in fullSimMatrix based on sim scores. 
    """
    for item, item_cnt_dict in fullSimMatrix.items():
        fullSimMatrix[item] = heap_topk(item_cnt_dict, k)

# ==================================
# Methods for score normalization
# ==================================

# @nb.jit(nopython=True)
# def itemTotalLikeNorm(fullSimMatrix, item_total_likes):
#     for aid_1, relations in fullSimMatrix.items():
#         for aid_2, sim_score in relations.items():
#             fullSimMatrix[aid_1][aid_2] = sim_score / (item_total_likes[aid_1] * item_total_likes[aid_2]) ** 0.1  ## TODO: consider 0.1 or other small number
            
@nb.jit(nopython=True)
def maxNormSimMatrix(fullSimMatrix):
    for aid_1, relations in fullSimMatrix.items():
        max_num = -np.inf
        for _, sim_score in relations.items():
            if sim_score > max_num:
                max_num = sim_score
        ## DEGUG use, delete later
        if max_num == 0:
            print(aid_1)
            print(fullSimMatrix[aid_1])
        for aid_2, sim_score in relations.items():
#             if max_num == 0:
#                 max_num += 0.001
            fullSimMatrix[aid_1][aid_2] = sim_score / max_num

#### Section B: Sim Score Computation functions

In [7]:

@nb.jit(nopython=True)
def getSimScoresSingleRow(pairs_this_row, start_time, start_idx, length, aids, ts, ops, item_total_likes, action_weights, mode):
    """
    Get the sim scores of items within single session, can be ran in parallel within each batch. 
    """
    max_idx = start_idx + length
    min_idx = max(max_idx - LOOKBACK_WINDOW, start_idx)  
    for i in range(min_idx, max_idx):
        for j in range(i+1, max_idx):
            if ts[j] - ts[i] > 2 * 60 * 60: continue  #TODO: try 2h only
            if aids[i] == aids[j]: continue
            
            if mode == "cosine":
                w_ij = action_weights[ops[j]] 
                w_ji = action_weights[ops[i]] 
            elif mode == "iuf":  ## penalize users that had lots of actions TODO: consider location weight
                
                loc_weight = 0.5**(abs(i-j))   #math.exp(-0.02 * abs(i-j)) 
                time_gap_weight = 0.5 ** (abs(ts[i]-ts[j]) / (1.5*60*60))  
                w_ij = action_weights[ops[j]] * time_gap_weight * loc_weight / math.log1p(length)
                w_ji = action_weights[ops[i]] * time_gap_weight * loc_weight / math.log1p(length)
            elif mode == "time_decay":
                ## calculate some time weights of each item, more weights are given when ts is later. #TODO: try adding (i-j) location weight, exponential weight, 0.5 ** (abs(i-j + 1)), 
                loc_weight = 0.5**(abs(i-j))   #math.exp(-0.02 * abs(i-j)) 
                #time_i = 1 + 0.1 ** ((1662328791-ts[i])/(1662328791-1659304800)) #1 + 3 * (ts[i] + start_time - 1659304800) / (1662328791 - 1659304800) #  #(1 - 0.8 *(TEST_END_TS - ts[i]) / TIME_SPAN) ** 0.5 # 0.2~1 #   ## time decay weight for item i 
                #time_j = 1 + 0.1 ** ((1662328791-ts[j])/(1662328791-1659304800))  # 1 + 3 * (ts[j] + start_time - 1659304800) / (1662328791 - 1659304800) # #  #(1 - 0.8 *(TEST_END_TS - ts[j]) / TIME_SPAN) ** 0.5   # 
                time_i = 1 + 1/(1 + math.exp(10*( ((1662328791-ts[i])/(1662328791-1659304800)) - 0.6  )))
                time_j = 1 + 1/(1 + math.exp(10*( ((1662328791-ts[j])/(1662328791-1659304800)) - 0.6  )))
                
                time_gap_weight = 0.5 ** (abs(ts[i]-ts[j]) / (1.5*60*60))  
                
                w_ij = action_weights[ops[j]] * loc_weight * time_gap_weight * time_i / math.log1p(length)
                w_ji = action_weights[ops[i]] * loc_weight * time_gap_weight * time_j / math.log1p(length)
            elif mode == "buy2buy":
                if (ops[i] == 0) or (ops[j] == 0):
                    continue
                loc_weight = 0.5**(abs(i-j))   #math.exp(-0.02 * abs(i-j)) 
                time_gap_weight = 0.5 ** (abs(ts[i]-ts[j]) / (1.5*60*60))  
                w_ij = action_weights[ops[j]] * time_gap_weight * loc_weight / math.log1p(length)
                w_ji = action_weights[ops[i]] * time_gap_weight * loc_weight / math.log1p(length)
                
            pairs_this_row[(aids[i], aids[j])] = w_ij / (item_total_likes[aids[i]] * item_total_likes[aids[j]]) ** 0.1
            pairs_this_row[(aids[j], aids[i])] = w_ji / (item_total_likes[aids[i]] * item_total_likes[aids[j]]) ** 0.1

@nb.jit(nopython=True, parallel=True, cache=True)
def getSimScoreBatch(aids, ts, ops, rows, fullSimMatrix, action_weights, item_total_likes, mode="cosine"):
    nrows = len(rows)
    pairs_this_batch = [{(0, 0): 0.0 for _ in range(0)} for _ in range(nrows)]
    ## get the sim scores of each batch in seperate sub dict in pairs_this_batch
    for row_i in nb.prange(nrows):  ## run each row of the batch in parallel
        _, start_idx, length, start_time = rows[row_i]
        getSimScoresSingleRow(pairs_this_batch[row_i], start_time, start_idx, length, aids, ts, ops, item_total_likes, action_weights, mode)
    ## merge pairs_this_batch into the fullSimMatrix
    for row_i in range(nrows):
        for (aid1, aid2), score in pairs_this_batch[row_i].items():
            if aid1 not in fullSimMatrix: 
                fullSimMatrix[aid1] = {0: 0.0 for _ in range(0)}
            if aid2 not in fullSimMatrix[aid1]:
                fullSimMatrix[aid1][aid2] = 0.0
            fullSimMatrix[aid1][aid2] += score


#### Section C: Train the similarity matrices
1. Derive the total like score first
2. Train 2 similarity matrices, one using iuf(Inverse User Frequence), the other using time_decay method. 

In [6]:
%%time
## get the Total Like matrix
item_total_likes = nb.typed.Dict.empty(
    key_type = nb.types.int64,
    value_type = nb.types.float64)

getItemTotalLikesNaive(aids, ops, item_total_likes, ACTION_WEIGHTS)

CPU times: user 18.9 s, sys: 766 ms, total: 19.7 s
Wall time: 19.9 s


In [8]:
%%time
simMatrices = {}   ## store a few different similarity matrices using different scoring system, for different prediction type
TRIM_CYCLES = 1000   ## trim full sim matrix every XX batches. 
MODES_TO_TRAIN = ["time_decay"] # "iuf", "buy2buy"] #, 

for mode in MODES_TO_TRAIN:
    ## the nested dict to store full sim matrix, {itemX: {itemY: score, itemZ: score, ...}}
    fullSimMatrix = nb.typed.Dict.empty(
            key_type = nb.types.int64,
            value_type = nb.typeof(nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)))
    max_idx = len(df)
    batch_idx = 1  ## compute sim matrix for PARALLEL # of rows per batch, have a total of max_idx/PARALLEL batches.
    for idx in tqdm(range(0, max_idx, PARALLEL)):
        rows = df.iloc[idx: min(idx + PARALLEL, max_idx)][['session', 'start_idx', 'total_action', 'session_start_time']].values
        getSimScoreBatch(aids, ts, ops, rows, fullSimMatrix, ACTION_WEIGHTS, item_total_likes, mode=mode)
        batch_idx += 1
        if batch_idx % TRIM_CYCLES == 0:
            print("batch_idx: ", batch_idx)
            trim_simMatrix_topk(fullSimMatrix, 150)
            gc.collect()
            # break

    
    ## trim top 50 when the training is complete
    trim_simMatrix_topk(fullSimMatrix, 150)   ## TODO: make this num small enough to reduce time for normalization, consider keeping 100, give more option for selection
    ## max norm of each score
    maxNormSimMatrix(fullSimMatrix)
    
    simMatrices[mode] = fullSimMatrix
    
    del fullSimMatrix
    gc.collect()

  8%|▊         | 998/12079 [02:29<37:24,  4.94it/s]

batch_idx:  1000


 17%|█▋        | 1998/12079 [05:29<28:44,  5.85it/s]   

batch_idx:  2000


 25%|██▍       | 2998/12079 [08:13<17:17,  8.75it/s]   

batch_idx:  3000


 33%|███▎      | 3999/12079 [11:53<36:03:08, 16.06s/it]

batch_idx:  4000


 41%|████▏     | 4999/12079 [14:41<33:35:59, 17.08s/it]

batch_idx:  5000


 50%|████▉     | 6000/12079 [17:11<10:27:17,  6.19s/it]

batch_idx:  6000


 58%|█████▊    | 6999/12079 [19:25<12:24:40,  8.80s/it]

batch_idx:  7000


 66%|██████▌   | 7999/12079 [21:33<19:30:16, 17.21s/it]

batch_idx:  8000


 75%|███████▍  | 9000/12079 [23:42<10:43:56, 12.55s/it]

batch_idx:  9000


 83%|████████▎ | 9998/12079 [25:43<02:20, 14.86it/s]   

batch_idx:  10000


 91%|█████████ | 10999/12079 [27:25<1:20:41,  4.48s/it]

batch_idx:  11000


 99%|█████████▉| 11999/12079 [28:54<23:24, 17.56s/it]  

batch_idx:  12000


100%|██████████| 12079/12079 [28:56<00:00,  6.96it/s]


CPU times: user 51min 14s, sys: 25min 54s, total: 1h 17min 9s
Wall time: 30min 7s


In [9]:
## A sanity check
simMatrices["buy2buy"][1517085]

DictType[int64,float64]<iv=None>({331941: 1.0, 371417: 0.5888439497177552, 32249: 0.42943762435324906, 303302: 0.2151606169331991, 461689: 0.21463847720057422, 1371202: 0.18500509000648885, 1775482: 0.09952497694341997, 1765072: 0.07861434634553799, 1853268: 0.008780961279884873, 1231891: 0.003965427852916738, 989590: 0.002005855472564233, 320601: 0.0011189360665819362, 1190046: 0.0001025840881962515, 1236142: 1.6958427526431777e-06})

In [10]:
print("len of iuf sim matrix" ,len(simMatrices["iuf"]))
print("len of iuf sim matrix" ,len(simMatrices["buy2buy"]))

len of iuf sim matrix 1818001
len of iuf sim matrix 999742


In [11]:
gc.collect()

4097

## 2. Inference -- Make prediction using the matrices derived from above. 

#### Section D: Utils for inference:
1. Select top items to recommend in re-ranking
2. Compute Real time importance of each action (Not in use currently).

In [14]:
@nb.jit(nopython = True)
def heap_topk_return_list(item_cnt_dict, cap):
    """
    get the top cap(k) elements of the cnt dict based on value, using a min-heap structure, return a list with top "cap" elements with highest score
    """
    q = [(np.float64(0), np.int64(0)) for _ in range(0)]  ## generate empty queue to implement a heap, 
    for item_ref, sim_score in item_cnt_dict.items():   ## read in the dict in heap structure
        heapq.heappush(q, (sim_score, item_ref))   ## push the <sim_score, item_ref_id> pair into min-heap, using sim_score for order
        if len(q) > cap:
            heapq.heappop(q)
            
    res = [heapq.heappop(q)[1] for _ in range(len(q))][::-1]
    
    return res

#### Section E: Main Logic in Making Inferences (DO NOT RUN in this notebook)
1. clicks_inferences: time_decay sim matrix + regular action weights <1, 6, 3>.
2. carts_inferencs: iuf sim matrix + weights <4, 2, 5> (as clicks actions tend to lead to cart action next).
3. orders_inferences: iuf sim matrix + regular action weights <1, 6, 3>.

In [9]:
@nb.jit(nopython=True)
def inference_single_session(session, starting_idx, length, start_time, aids, ops, ts, result, full_sim_matrix, test_ops_weights):
    PREV_INTERACT_BONUS = 10
    NEARBY_ACTION_BONUS = 1.5
    
    ending_idx = starting_idx + length
    end_time = ts[ending_idx]
    
    candidates = aids[starting_idx: ending_idx][::-1]
    candidates_ops = ops[starting_idx: ending_idx][::-1]
    
    ## record all potential aid that might be relevant
    potential_to_recommend = nb.typed.Dict.empty(key_type=nb.types.int64, value_type=nb.types.float64)
    
    ## get unique aid of each session 
    unique_aids = nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)
    for a in candidates:
        unique_aids[a] = 0
    
    ## Sequence weight to all the candidates, from near to far 
    sequence_weight = np.power(2, np.linspace(0.3, 1, len(candidates)))[::-1] - 1
    
    ## Time weight of all candidates, from near to far
    time_weights = []
    for idx in range(starting_idx, ending_idx):
        if end_time - ts[idx] < 2 * 60 * 60:   ## apply nearby action bonus
            time_weight = (1 + 0.5 ** ((end_time - ts[idx])/(end_time - start_time))) * NEARBY_ACTION_BONUS
        else:
            time_weight = 1 + 0.5 ** ((end_time - ts[idx])/(end_time - start_time))
        time_weights.append(time_weight)
    time_weights = time_weights[::-1]
    
    
    ## making inference
    if len(unique_aids) >= 20:  
        for aid, op, seq_w, time_w in zip(candidates, candidates_ops, sequence_weight, time_weights):
            if aid not in potential_to_recommend:
                potential_to_recommend[aid] = 0
            potential_to_recommend[aid] += seq_w * time_w * test_ops_weights[op] #* PREV_INTERACT_BONUS
    else:   ## otherwise, fill the rest with similar items.
        for aid, op, seq_w, time_w in zip(candidates, candidates_ops, sequence_weight, time_weights):
            if aid not in potential_to_recommend:
                potential_to_recommend[aid] = 0
            potential_to_recommend[aid] += np.inf #seq_w * time_w * test_ops_weights[op] * PREV_INTERACT_BONUS
            ## adding the similar items, if full_sim_matrix don't have such record, skip. 
            if aid not in full_sim_matrix:
                continue
            for similar_item in full_sim_matrix[aid]:
                ## if sim_item is in candidates, would be included above anyways, skip 
                if similar_item in candidates:
                    continue
                if similar_item not in potential_to_recommend:
                    potential_to_recommend[similar_item] = 0
                potential_to_recommend[similar_item] += seq_w * time_w * test_ops_weights[op] * full_sim_matrix[aid][similar_item]  ## no PREV_INTERACT_BONUS as expected, replaced with sim_matrix scores
    result[session] = np.array(heap_topk_return_list(potential_to_recommend, 20))
    
@nb.jit(nopython=True)
def run_inference_parallel(rows, aids, ops, ts, result, full_sim_matrix, test_ops_weights):
    for row_idx in nb.prange(len(rows)):
        session, starting_idx, length, start_time = rows[row_idx]
        inference_single_session(session, starting_idx, length, start_time, aids, ops, ts, result, full_sim_matrix, test_ops_weights)

In [12]:
# %%time
# result_iuf = nb.typed.Dict.empty(
#     key_type = nb.types.int64,
#     value_type = nb.types.int64[:])

# result_iuf_2 = nb.typed.Dict.empty(
#     key_type = nb.types.int64,
#     value_type = nb.types.int64[:])

# result_time_decay = nb.typed.Dict.empty(
#     key_type = nb.types.int64,
#     value_type = nb.types.int64[:])

# for row_idx in tqdm(range(len(df) - len(df_test), len(df), PARALLEL)):
#     start_row = row_idx
#     end_row = min(row_idx + PARALLEL, len(df))
#     rows = df.iloc[start_row: end_row][['session', 'start_idx', 'total_action', 'session_start_time']].values
#     run_inference_parallel(rows, aids, ops, ts, result_iuf, simMatrices["iuf"], np.array([2.0, 6.0, 6.0]))
#     run_inference_parallel(rows, aids, ops, ts, result_iuf_2, simMatrices["iuf"], np.array([4.0, 2.0, 5.0]))   ## considebly add the weights for click action in the real time.
#     run_inference_parallel(rows, aids, ops, ts, result_time_decay, simMatrices["time_decay"], np.array([3.0, 6.0, 3.0]))

### Submissions - Convert results to csv, get validation set result - DO NOT RUN in this notebook

In [13]:
# %%time
# subs = []
# op_names = ["clicks", "carts", "orders"]

# for result, op in zip([result_time_decay, result_iuf_2, result_iuf], op_names):
#     sub = pd.DataFrame({"session_type": result.keys(), "labels": result.values()})
#     sub.session_type = sub.session_type.astype(str) + f"_{op}"
#     sub.labels = sub.labels.apply(lambda x: " ".join(x.astype(str)))
#     subs.append(sub)
    
# submission = pd.concat(subs).reset_index(drop=True)
# #sub.sort_values(by=["session_type"])  ## optional
# #submission.to_csv('submission.csv', index = False)
# submission

In [14]:
# %%time
# submission.to_csv('../../allData/validationData/p_v_579_80_items.csv', index = False)

### Section F: Saving Feature, Run E or F, depends on mode, this will not only generate result, but the featues associated.
**Feature engineering are made here**. 

In [33]:
from datetime import datetime
from datetime import timezone
import pytz

In [65]:
@nb.jit(nopython=True)
def getLocalTsInfo(utc_ts, timezone):
    local_times_info = datetime.fromtimestamp(utc_ts, pytz.timezone(timezone))
    if (local_times_info.hour >= 18) or (local_times_info.hour <= 2):
        day_noon_night = 2
    elif (local_times_info.hour >= 3) and (local_times_info.hour <= 11):
        day_noon_night = 1
    else: ## (local_times_info.hour >= 12) and (local_times_info.hour <= 15)
        day_noon_night = 0
    return local_times_info.day, local_times_info.hour, day_noon_night

In [58]:
print("LA", getLocalTsInfo(1659367439, 'America/Los_Angeles'))
print("Berlin:", getLocalTsInfo(1659367439, 'Europe/Berlin'))

LA (1, 8, 1)
Berlin: (1, 17, 0)


##### Utils for features save

In [15]:
@nb.jit(nopython=True)
def update_feature_vec(aid, features_tuple_arr, features_idx_map, new_feat_tuple):
    ## append features
    if aid not in features_idx_map:
        features_tuple_arr.append(new_feat_tuple)
        new_pos = len(features_tuple_arr)-1
        ## save the position in the tuple arr
        features_idx_map[aid] = new_pos
    else: # <is_prev_int, seq_w, time_w, total_ops_w, session_len, # uniuqe aids, CF_score, aid's itemTotalLike >
        # ================== 8 ==
        if features_tuple_arr[features_idx_map[aid]][0]: 
            slot8_ref_time = features_tuple_arr[features_idx_map[aid]][8]
        else:
            slot8_ref_time = features_tuple_arr[features_idx_map[aid]][8] + new_feat_tuple[8]
        # ================== 9 ==
        if features_tuple_arr[features_idx_map[aid]][0]: 
            slot9_max_sim = 1
        else:
            slot9_max_sim = max(new_feat_tuple[9], features_tuple_arr[features_idx_map[aid]][9])
        # ================== 10 ==
        if features_tuple_arr[features_idx_map[aid]][0]: 
            slot10_mean_sim = 1
        else:
            slot10_mean_sim = ((features_tuple_arr[features_idx_map[aid]][10] * (features_tuple_arr[features_idx_map[aid]][8]-1) ) + new_feat_tuple[10] ) / features_tuple_arr[features_idx_map[aid]][8]
        # ================== 14 seq_w ==
        slot14_max_seq_w = max(new_feat_tuple[14], features_tuple_arr[features_idx_map[aid]][14])
        # ================== 15 ==
        if features_tuple_arr[features_idx_map[aid]][0]: 
            slot15_mean_seq_w = (features_tuple_arr[features_idx_map[aid]][1] + new_feat_tuple[1]) / (features_tuple_arr[features_idx_map[aid]][11] + new_feat_tuple[11])
        else:
            slot15_mean_seq_w = (features_tuple_arr[features_idx_map[aid]][1] + new_feat_tuple[1]) / (features_tuple_arr[features_idx_map[aid]][8] + new_feat_tuple[8])
        # ================== 16 ==
        slot16_min_seq_w = min(new_feat_tuple[16], features_tuple_arr[features_idx_map[aid]][16])
        # ================== 17 time_w == 
        slot17_max_time_w = max(new_feat_tuple[17], features_tuple_arr[features_idx_map[aid]][17])
        # ================== 18 == 
        if features_tuple_arr[features_idx_map[aid]][0]: 
            slot18_mean_time_w = (features_tuple_arr[features_idx_map[aid]][2] + new_feat_tuple[2]) / (features_tuple_arr[features_idx_map[aid]][11] + new_feat_tuple[11])
        else:
            slot18_mean_time_w = (features_tuple_arr[features_idx_map[aid]][2] + new_feat_tuple[2]) / (features_tuple_arr[features_idx_map[aid]][8] + new_feat_tuple[8])
        # ================= 19 ==
        slot19_min_time_w = min(new_feat_tuple[19], features_tuple_arr[features_idx_map[aid]][19])
        # ================= 20 ops_w ==
        slot20_max_ops_w = max(new_feat_tuple[20], features_tuple_arr[features_idx_map[aid]][20])
        # ================= 21 ==
        if features_tuple_arr[features_idx_map[aid]][0]: 
            slot21_mean_ops_w = (features_tuple_arr[features_idx_map[aid]][3] + new_feat_tuple[3]) / (features_tuple_arr[features_idx_map[aid]][11] + new_feat_tuple[11])
        else:
            slot21_mean_ops_w = (features_tuple_arr[features_idx_map[aid]][3] + new_feat_tuple[3]) / (features_tuple_arr[features_idx_map[aid]][8] + new_feat_tuple[8])
        # ================= 22 ==
        slot22_min_ops_w = min(new_feat_tuple[22], features_tuple_arr[features_idx_map[aid]][22]) #new_feat_tuple[22] if new_feat_tuple[22] < features_tuple_arr[features_idx_map[aid]][22] else features_tuple_arr[features_idx_map[aid]][22]
        # ================= 28 cf_incre ==
        slot28_max_cf_incre = max(new_feat_tuple[28], features_tuple_arr[features_idx_map[aid]][28])
        # ================= 29 ==
        if features_tuple_arr[features_idx_map[aid]][0]: 
            slot29_mean_cf_incre = new_feat_tuple[6] / (features_tuple_arr[features_idx_map[aid]][11] + new_feat_tuple[11])
        else:
            slot29_mean_cf_incre = new_feat_tuple[6] / (features_tuple_arr[features_idx_map[aid]][8] + new_feat_tuple[8])
        # ================= 30 ==
        slot30_min_cf_incre = min(new_feat_tuple[30], features_tuple_arr[features_idx_map[aid]][30])

        
        features_tuple_arr[features_idx_map[aid]] = (new_feat_tuple[0], 
                                                     features_tuple_arr[features_idx_map[aid]][1] + new_feat_tuple[1], 
                                                     features_tuple_arr[features_idx_map[aid]][2] + new_feat_tuple[2], 
                                                     features_tuple_arr[features_idx_map[aid]][3] + new_feat_tuple[3],
                                                     new_feat_tuple[4],
                                                     new_feat_tuple[5],
                                                     new_feat_tuple[6],
                                                     new_feat_tuple[7],
                                                     slot8_ref_time,
                                                     slot9_max_sim,
                                                     slot10_mean_sim,
                                                     features_tuple_arr[features_idx_map[aid]][11] + new_feat_tuple[11],
                                                     new_feat_tuple[12],
                                                     features_tuple_arr[features_idx_map[aid]][13],   ## should stay as the 1st iter
                                                     slot14_max_seq_w,
                                                     slot15_mean_seq_w,
                                                     slot16_min_seq_w,
                                                     slot17_max_time_w,
                                                     slot18_mean_time_w,
                                                     slot19_min_time_w,
                                                     slot20_max_ops_w,
                                                     slot21_mean_ops_w,
                                                     slot22_min_ops_w,
                                                     features_tuple_arr[features_idx_map[aid]][23] + new_feat_tuple[23], 
                                                     features_tuple_arr[features_idx_map[aid]][24] + new_feat_tuple[24],
                                                     features_tuple_arr[features_idx_map[aid]][25] + new_feat_tuple[25],
                                                     features_tuple_arr[features_idx_map[aid]][26], ## should not change once set at the 1st iter
                                                     features_tuple_arr[features_idx_map[aid]][27], ## should stay as the 1st iter
                                                     slot28_max_cf_incre,
                                                     slot29_mean_cf_incre,
                                                     slot30_min_cf_incre,
                                                     slot14_max_seq_w - slot16_min_seq_w,
                                                     slot17_max_time_w - slot19_min_time_w,
                                                     slot20_max_ops_w - slot22_min_ops_w,
                                                     slot28_max_cf_incre - slot30_min_cf_incre
                                                     )

# <
# slot_0: is_prev_interacted, 
# slot_1: seq_w_total, 
# slot_2: time_w_total, 
# slot_3: ops_w_total: for visited item, ops weight total in this session; for unvisited item, ops weight total of the item referencing this item. 
# slot_4: session_len, 
# slot_5: num uniuqe aids, 
# slot_6: CF_score, 
# slot_7: aid's itemTotalLike: total like score use for normalization.  
# slot_8: reference time by similar matrix(if aid visited, default 100; if aid not visited, 1-19(when all aid only interact once, could grow to very large if a lot of actions on one aid), depending how many aid reference this item)
# slot_9: max_sim_score:  (1 if it's a visited item)
# slot_10: mean_sim_score: (1 if it's a visited item)
# slot_11: num_interact, (0 for unvisited item; count of interaction for visited item)
# slot_12: time_span of the session 
# slot_13: action_recency: time to last action(end time), for unvisited items -> the time to of reference_aid to the last action
# slot_14: seq_w_max: 
# slot_15: seq_w_mean: for visited item -> seq_w_total / num_interact; for unvisited item -> seq_w_total / reference_time
## ========================= round 2 ================
# slot_16: seq_w_min: 
# slot_17: time_w_max:
# slot_18: time_w_mean: similar to slot_15
# slot_19: time_w_min
# slot_20: ops_w_max:
# slot_21: ops_w_mean:
# slot_22: ops_w_min: 
# slot_23: num_clicks: visited item, direct num; unvisited item, take the reference item's num
# slot_24: num_carts:
# slot_25: num_orders:
# slot_26: last_action_type: 0 -> clicks, 1-> carts, 2 -> orders
# slot_27: time_to_now: latest interaction time to now 
# slot_28: cf_increment_max: 
# slot_29: cf_increment_mean:
# slot_30: cf_increment_min:
##  ======================== Derivable ============================
# slot_31: seq_w max_min_gap: slot_14 - slot_16 
# slot_32: time_w_max_min_gap: slot_17 - slot_19
# slot_33: ops_w_max_min_gap: slot_20 - slot_22
# slot_34: cf_incre_max_min_gap: slot_28 - slot_30
## ====================== Round 3 features ========================
# slot_35: last3Inter_cf_incre_max: maximum cf_incre in the last 3 action, if <= 3 actions, same as slot_28
# slot_36: last3Inter_cf_incre_mean: 
# slot_37: last3Inter_cf_incre_min:
# slot_38: last3Inter_time_w_max:  maximum time_w in the last 3 action, if <= 3 actions, same as slot_17
# slot_39: last3Inter_time_w_mean:
# slot_40: last3Inter_time_w_min: 
# slot_41: last3Inter_seq_w_max:  maximum time_w in the last 3 action, if <= 3 actions, same as slot_14
# slot_42: last3Inter_seq_w_mean:
# slot_43: last3Inter_seq_w_min:
# slot_44: raw_seq_order: last action's seq_order, no depreciation
# slot_45: raw_seq_order_max: 
# slot_46: raw_seq_order_mean:
# slot_47: raw_seq_order_min

# append ts lastly to unblock feature below
#  

## ================= numba restriction, add in notebook =====================
# slot_35: last_interact_local_day_of_week, Mon -> 0; Tues -> 1; ..... ; Sun -> 6
# slot_36: last_interact_local_hour: 
# slot_37: last_interact_day_night, local ts, if 18:00:00 ~ 2:59:59 -> night/2; 3:00:00 ~ 11:59:59 -> morning/1; 12:00:00 ~ 17:59:59 -> afternoon/0
# >
FEATURE_TUPLE_TEMPLATE = (bool(0), np.float64(0.0), np.float64(0.0), np.int64(0), np.int64(0), np.int64(0), \
    np.float64(0.0), np.float64(0.0), np.int32(0), np.float32(0.0), np.float64(0.0), np.int32(0), np.float64(0.0), np.float64(0.0), np.float32(0.0), np.float32(0.0),\
        np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0),\
            np.int32(0), np.int32(0), np.int32(0), np.int32(0), np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0), \
                np.float32(0.0), np.float32(0.0), np.float32(0.0), np.float32(0.0))

FEATURE_NAMES = ["prev_int", "seq_w_total", "time_w_total", "ops_w_total", "session_len", "num_uniuqe_aids", "CF_score", "itemTotalLike", "ref_time", "max_sim_score",\
    "mean_sim_score", "num_interact", "time_span", "action_recency", "seq_w_max", "seq_w_mean", "seq_w_min", "time_w_max", "time_w_mean", "time_w_min", \
        "ops_w_max", "ops_w_mean", "ops_w_min", "num_clicks", "num_carts", "num_orders", "last_action_type", "time_to_now", "cf_incre_max", "cf_incre_mean", \
            "cf_incre_min", "seqW_max_min_gap", "timeW_max_min_gap", "opsW_max_min_gap", "cf_incre_max_min_gap"]

##### Main feature save logics

In [16]:
@nb.jit(nopython=True)
def save_feature_single_session(session, starting_idx, length, start_time, aids, ops, ts, result, full_sim_matrix, item_total_likes, test_ops_weights):
    NOW_TIME = ts[-1] ## ts of latest avaiable action
    PREV_INTERACT_BONUS = 20
    NEARBY_ACTION_BONUS = 1.5
    
    ending_idx = starting_idx + length 
    end_time = ts[ending_idx - 1]
    time_span = end_time - start_time
    
    candidates = aids[starting_idx: ending_idx][::-1]
    candidates_ops = ops[starting_idx: ending_idx][::-1]
    
    ## record all potential aid that might be relevant
    potential_to_recommend = nb.typed.Dict.empty(key_type=nb.types.int64, value_type=nb.types.float64)
    
    ## get unique aid of each session 
    unique_aids = nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)
    for a in candidates:
        unique_aids[a] = 0
    
    ## Sequence weight to all the candidates, from near to far 
    sequence_weight = np.power(2, np.linspace(0.3, 1, len(candidates)))[::-1] - 1
    
    ## Time weight of all candidates, from near to far
    time_weights = []
    time_lapse = end_time - start_time + 1  ## +1 to avoid zero
    for idx in range(starting_idx, ending_idx):
        if end_time - ts[idx] < 2 * 60 * 60:   ## apply nearby action bonus
            time_weight = (1 + 0.5 ** ((end_time - ts[idx])/time_lapse)) * NEARBY_ACTION_BONUS
        else:
            time_weight = 1 + 0.5 ** ((end_time - ts[idx])/time_lapse)
        time_weights.append(time_weight)
    time_weights = time_weights[::-1]
    
    ## feature vector template: [aid: <is_prev_int, seq_w, time_w, associated_action, session_len,.. >]
    features_tuple_arr = nb.typed.List()
    features_tuple_arr.append(FEATURE_TUPLE_TEMPLATE)
    features_idx_map = nb.typed.Dict.empty(key_type=nb.types.int64, value_type=nb.types.int64)

    helper_idx = ending_idx - 1
    ## making inference
    if len(unique_aids) >= 20:  
        for aid, op, seq_w, time_w in zip(candidates, candidates_ops, sequence_weight, time_weights):
            if aid not in potential_to_recommend:
                potential_to_recommend[aid] = 0
            ## caculate scores
            cf_incre = seq_w * time_w * test_ops_weights[op]
            potential_to_recommend[aid] += cf_incre #* PREV_INTERACT_BONUS
            ## append features
            update_feature_vec(aid, features_tuple_arr, features_idx_map, \
                (1, seq_w, time_w, test_ops_weights[op], length, len(unique_aids), potential_to_recommend[aid], \
                    item_total_likes[aid], 100, 1, 1, 1, time_span, end_time-ts[helper_idx], seq_w, seq_w, seq_w, \
                        time_w, time_w, time_w, test_ops_weights[op], test_ops_weights[op], test_ops_weights[op], op==0, op==1, op==2, op, NOW_TIME-ts[helper_idx],\
                            cf_incre, cf_incre, cf_incre, 0, 0, 0, 0))
            helper_idx -= 1
    else:   ## otherwise, fill the rest with similar items.
        for aid, op, seq_w, time_w in zip(candidates, candidates_ops, sequence_weight, time_weights):
            if aid not in potential_to_recommend:
                potential_to_recommend[aid] = 0
            ## get the scores
            cf_incre = seq_w * time_w * test_ops_weights[op] * PREV_INTERACT_BONUS
            potential_to_recommend[aid] += cf_incre
            ## append features
            update_feature_vec(aid, features_tuple_arr, features_idx_map, \
                (1, seq_w, time_w, test_ops_weights[op], length, len(unique_aids), potential_to_recommend[aid], \
                    item_total_likes[aid], 100, 1, 1, 1, time_span, end_time-ts[helper_idx], seq_w, seq_w, seq_w, \
                        time_w, time_w, time_w, test_ops_weights[op], test_ops_weights[op], test_ops_weights[op], op==0, op==1, op==2, op, NOW_TIME - ts[helper_idx],\
                            cf_incre, cf_incre, cf_incre, 0, 0, 0, 0))
            ## adding the similar items, if full_sim_matrix don't have such record, skip. 
            if aid not in full_sim_matrix:
                continue
            for similar_item in full_sim_matrix[aid]:
                ## if sim_item is in candidates, would be included above anyways, skip 
                if similar_item in candidates:
                    continue
                if similar_item not in potential_to_recommend:
                    potential_to_recommend[similar_item] = 0
                
                cf_incre = seq_w * time_w * test_ops_weights[op] * full_sim_matrix[aid][similar_item]
                potential_to_recommend[similar_item] += cf_incre  ## no PREV_INTERACT_BONUS as expected, replaced with sim_matrix scores
                ## append features
                update_feature_vec(similar_item, features_tuple_arr, features_idx_map, \
                    (0, seq_w, time_w, test_ops_weights[op], length, len(unique_aids), potential_to_recommend[similar_item], \
                        item_total_likes[similar_item], 1, full_sim_matrix[aid][similar_item], full_sim_matrix[aid][similar_item], 0, \
                            time_span, end_time-ts[helper_idx], seq_w, seq_w, seq_w, time_w, time_w, time_w, test_ops_weights[op], test_ops_weights[op], test_ops_weights[op], op==0, op==1, op==2, op,\
                                NOW_TIME-ts[helper_idx], cf_incre, cf_incre, cf_incre, 0, 0, 0, 0))
            helper_idx -= 1

    result[session] = np.array(heap_topk_return_list(potential_to_recommend, 60))  ## Take top 100 for validation runs. 
    
    feature_tuples_this_session = []
    for aid in result[session]:
#         features_save[(session, aid)] = features_tuple_arr[features_idx_map[aid]]
        feature_tuples_this_session.append(features_tuple_arr[features_idx_map[aid]])
    
    return feature_tuples_this_session

#### Utils for batch processing the features save

In [22]:
def load_gt_tables(type):
    """ type -> carts / orders """
    gt_labels = pd.read_json("../../allData/validationData/out_7day_test/test_labels.jsonl", lines=True)
    gt_labels['aids'] = gt_labels["labels"].apply(lambda x: x.get(type))
    gt_labels = gt_labels[gt_labels.aids.notnull()]
    gt_labels = gt_labels.drop("labels", axis = 1)
    ## ========= special df to identify the unique session id to look at ================
    valid_gt_sessions = gt_labels.drop("aids", axis = 1) 
    ## ========================================================================
    ## keep go on for gt labels processing
    gt_labels = gt_labels.set_index(['session']).apply(pd.Series.explode).reset_index()
    gt_labels["gt"] = 1
    return valid_gt_sessions, gt_labels

def process_batch_pipeline(rawDf, valid_gt_sessions, gt_labels):
    """ rawDf -> Df with session, aids(100), feature_tuple """
    ## join valid_gt_session with rawDf, now only gt_features in valid sessions(have at least 1 aid to predict) are kept
    gt_features_valid_session = pd.merge(rawDf, valid_gt_sessions, on="session")

    ## Now explode the whole valid_gt_session aids, these session - aid are served as the train/val/test data for the reranker model, 
    ## for orders, 
    ## for carts, a total of 569697 correct guesses(not 100% included in the recall)
    gt_features_valid_session = gt_features_valid_session.set_index(['session']).apply(pd.Series.explode).reset_index()

    ## finally, attach the gt_lables 1/null to the df to return
    final_df = pd.merge(gt_features_valid_session, gt_labels, on=["session", "aids"], how='left')

    # ## open up the feature tuple 
    # for slot_id, f_name in enumerate(FEATURE_NAMES):
    #     final_df[f_name] = final_df["feature_tuple"].apply(lambda x: x[slot_id])

    # new crazy fast method
    features = np.vstack(final_df["feature_tuple"].values)
    temp_df = pd.DataFrame(features)
    del features
    temp_df.columns = [f'{feat_name}' for feat_name in FEATURE_NAMES]
    final_df[temp_df.columns] = temp_df
    del temp_df
    

    final_df = final_df.drop("feature_tuple", axis = 1)

    return final_df


#### Save features as batches

In [24]:
%%time
result_time_decay_clicks = nb.typed.Dict.empty(
    key_type = nb.types.int64,
    value_type = nb.types.int64[:])

features_all_sessions = [] # session, aid, feature tuple
                          ## session, aid, feature tuple
gc.collect()


## Given there are 1671803 sessions in total, we separate them into K batches
K = 100
session_per_batch = len(df_test) // K 
row_idx_cutoffs = [(len(df) - len(df_test)) + (PARALLEL * (session_per_batch//PARALLEL) ) * i for i in range(1, K+3)]   ## batch process every 1024 * 136 rows

feature_batch_id = 0
## load important tables
valid_gt_sessions, gt_labels = load_gt_tables("clicks")
print("finish loading the gt datas")

for row_idx in tqdm(range(len(df) - len(df_test), len(df), PARALLEL)):
    start_row = row_idx
    end_row = min(row_idx + PARALLEL, len(df))
    rows = df.iloc[start_row: end_row][['session', 'start_idx', 'total_action', 'session_start_time']].values
#     save_features_parallel(rows, aids, ops, ts, result_iuf_orders, simMatrices["iuf"], np.array([2.0, 8.0, 6.0]), orders_features_save)  
    ## run things in parallel
    for row_idx in nb.prange(len(rows)):
        session, starting_idx, length, start_time = rows[row_idx]
        features_tuples_this_session = save_feature_single_session(session, starting_idx, length, start_time, aids, ops, ts, result_time_decay_clicks, simMatrices["time_decay"], item_total_likes, np.array([3.0, 6.0, 3.0]))
        features_all_sessions.append(features_tuples_this_session)
    # break
    
    if (start_row in row_idx_cutoffs) or (end_row == len(df)):
        ## save batch result
        rawDf = pd.DataFrame({"session": result_time_decay_clicks.keys(), "aids": result_time_decay_clicks.values(), "feature_tuple": features_all_sessions})
        batch_result = process_batch_pipeline(rawDf, valid_gt_sessions, gt_labels)
        batch_result.to_parquet(f"../../allData/features/click_features_V5/batch_result_{feature_batch_id}.parquet")
        ## clean the memory for next batch
        del batch_result, rawDf, features_all_sessions, result_time_decay_clicks
        gc.collect()
        ## progress update
        print(f"feature_batch_{feature_batch_id} completes saving.")
        feature_batch_id += 1
        ## initiate the struct for new batch again
        result_time_decay_clicks = nb.typed.Dict.empty(
            key_type = nb.types.int64,
            value_type = nb.types.int64[:])
        features_all_sessions = []
        
    #break
#     gc.collect()

  0%|          | 0/1742 [00:00<?, ?it/s]

finish loading the gt datas


  1%|          | 18/1742 [00:24<1:55:08,  4.01s/it]

feature_batch_0 completes saving.


  2%|▏         | 35/1742 [00:46<1:39:53,  3.51s/it]

feature_batch_1 completes saving.


  3%|▎         | 52/1742 [01:10<1:47:59,  3.83s/it]

feature_batch_2 completes saving.


  4%|▍         | 69/1742 [01:33<1:43:57,  3.73s/it]

feature_batch_3 completes saving.


  5%|▍         | 86/1742 [01:57<1:49:50,  3.98s/it]

feature_batch_4 completes saving.


  6%|▌         | 103/1742 [02:18<1:36:27,  3.53s/it]

feature_batch_5 completes saving.


  7%|▋         | 120/1742 [02:41<1:43:11,  3.82s/it]

feature_batch_6 completes saving.


  8%|▊         | 137/1742 [03:03<1:35:05,  3.55s/it]

feature_batch_7 completes saving.


  9%|▉         | 154/1742 [03:25<1:37:05,  3.67s/it]

feature_batch_8 completes saving.


 10%|▉         | 171/1742 [03:47<1:41:42,  3.88s/it]

feature_batch_9 completes saving.


 11%|█         | 188/1742 [04:09<1:30:26,  3.49s/it]

feature_batch_10 completes saving.


 12%|█▏        | 205/1742 [04:30<1:28:02,  3.44s/it]

feature_batch_11 completes saving.


 13%|█▎        | 222/1742 [04:52<1:32:09,  3.64s/it]

feature_batch_12 completes saving.


 14%|█▎        | 239/1742 [05:13<1:27:31,  3.49s/it]

feature_batch_13 completes saving.


 15%|█▍        | 256/1742 [05:34<1:26:32,  3.49s/it]

feature_batch_14 completes saving.


 16%|█▌        | 273/1742 [05:56<1:32:43,  3.79s/it]

feature_batch_15 completes saving.


 17%|█▋        | 290/1742 [06:18<1:25:56,  3.55s/it]

feature_batch_16 completes saving.


 18%|█▊        | 307/1742 [06:39<1:24:39,  3.54s/it]

feature_batch_17 completes saving.


 19%|█▊        | 324/1742 [07:01<1:18:46,  3.33s/it]

feature_batch_18 completes saving.


 20%|█▉        | 341/1742 [07:20<1:13:38,  3.15s/it]

feature_batch_19 completes saving.


 21%|██        | 358/1742 [07:40<1:12:31,  3.14s/it]

feature_batch_20 completes saving.


 22%|██▏       | 375/1742 [08:00<1:13:54,  3.24s/it]

feature_batch_21 completes saving.


 23%|██▎       | 392/1742 [08:20<1:12:39,  3.23s/it]

feature_batch_22 completes saving.


 23%|██▎       | 409/1742 [08:40<1:14:55,  3.37s/it]

feature_batch_23 completes saving.


 24%|██▍       | 426/1742 [09:01<1:15:11,  3.43s/it]

feature_batch_24 completes saving.


 25%|██▌       | 443/1742 [09:21<1:10:47,  3.27s/it]

feature_batch_25 completes saving.


 26%|██▋       | 460/1742 [09:41<1:10:30,  3.30s/it]

feature_batch_26 completes saving.


 27%|██▋       | 477/1742 [10:02<1:09:34,  3.30s/it]

feature_batch_27 completes saving.


 28%|██▊       | 494/1742 [10:21<1:07:29,  3.25s/it]

feature_batch_28 completes saving.


 29%|██▉       | 511/1742 [10:41<1:06:39,  3.25s/it]

feature_batch_29 completes saving.


 30%|███       | 528/1742 [11:01<1:06:52,  3.30s/it]

feature_batch_30 completes saving.


 31%|███▏      | 545/1742 [11:20<1:04:48,  3.25s/it]

feature_batch_31 completes saving.


 32%|███▏      | 562/1742 [11:41<1:05:22,  3.32s/it]

feature_batch_32 completes saving.


 33%|███▎      | 579/1742 [12:01<1:03:57,  3.30s/it]

feature_batch_33 completes saving.


 34%|███▍      | 596/1742 [12:21<1:02:52,  3.29s/it]

feature_batch_34 completes saving.


 35%|███▌      | 613/1742 [12:41<1:01:57,  3.29s/it]

feature_batch_35 completes saving.


 36%|███▌      | 630/1742 [13:02<1:02:31,  3.37s/it]

feature_batch_36 completes saving.


 37%|███▋      | 647/1742 [13:22<1:02:14,  3.41s/it]

feature_batch_37 completes saving.


 38%|███▊      | 664/1742 [13:43<1:02:09,  3.46s/it]

feature_batch_38 completes saving.


 39%|███▉      | 681/1742 [14:04<1:00:13,  3.41s/it]

feature_batch_39 completes saving.


 40%|████      | 698/1742 [14:24<57:47,  3.32s/it]  

feature_batch_40 completes saving.


 41%|████      | 715/1742 [14:44<57:07,  3.34s/it]

feature_batch_41 completes saving.


 42%|████▏     | 732/1742 [15:04<55:30,  3.30s/it]

feature_batch_42 completes saving.


 43%|████▎     | 749/1742 [15:25<59:27,  3.59s/it]

feature_batch_43 completes saving.


 44%|████▍     | 766/1742 [15:47<59:32,  3.66s/it]

feature_batch_44 completes saving.


 45%|████▍     | 783/1742 [16:07<54:36,  3.42s/it]

feature_batch_45 completes saving.


 46%|████▌     | 800/1742 [16:28<56:47,  3.62s/it]

feature_batch_46 completes saving.


 47%|████▋     | 817/1742 [16:50<57:07,  3.71s/it]

feature_batch_47 completes saving.


 48%|████▊     | 834/1742 [17:11<51:14,  3.39s/it]

feature_batch_48 completes saving.


 49%|████▉     | 851/1742 [17:31<48:47,  3.29s/it]

feature_batch_49 completes saving.


 50%|████▉     | 868/1742 [17:51<48:29,  3.33s/it]

feature_batch_50 completes saving.


 51%|█████     | 885/1742 [18:13<51:39,  3.62s/it]

feature_batch_51 completes saving.


 52%|█████▏    | 902/1742 [18:33<47:15,  3.38s/it]

feature_batch_52 completes saving.


 53%|█████▎    | 919/1742 [18:54<46:53,  3.42s/it]

feature_batch_53 completes saving.


 54%|█████▎    | 936/1742 [19:14<45:08,  3.36s/it]

feature_batch_54 completes saving.


 55%|█████▍    | 953/1742 [19:35<45:37,  3.47s/it]

feature_batch_55 completes saving.


 56%|█████▌    | 970/1742 [19:55<42:57,  3.34s/it]

feature_batch_56 completes saving.


 57%|█████▋    | 987/1742 [20:15<42:38,  3.39s/it]

feature_batch_57 completes saving.


 58%|█████▊    | 1004/1742 [20:35<40:35,  3.30s/it]

feature_batch_58 completes saving.


 59%|█████▊    | 1021/1742 [20:55<40:27,  3.37s/it]

feature_batch_59 completes saving.


 60%|█████▉    | 1038/1742 [21:15<40:09,  3.42s/it]

feature_batch_60 completes saving.


 61%|██████    | 1055/1742 [21:36<38:41,  3.38s/it]

feature_batch_61 completes saving.


 62%|██████▏   | 1072/1742 [21:56<37:05,  3.32s/it]

feature_batch_62 completes saving.


 63%|██████▎   | 1089/1742 [22:16<36:02,  3.31s/it]

feature_batch_63 completes saving.


 63%|██████▎   | 1106/1742 [22:35<34:25,  3.25s/it]

feature_batch_64 completes saving.


 64%|██████▍   | 1123/1742 [22:55<34:13,  3.32s/it]

feature_batch_65 completes saving.


 65%|██████▌   | 1140/1742 [23:15<33:30,  3.34s/it]

feature_batch_66 completes saving.


 66%|██████▋   | 1157/1742 [23:35<32:10,  3.30s/it]

feature_batch_67 completes saving.


 67%|██████▋   | 1174/1742 [23:55<31:01,  3.28s/it]

feature_batch_68 completes saving.


 68%|██████▊   | 1191/1742 [24:15<31:13,  3.40s/it]

feature_batch_69 completes saving.


 69%|██████▉   | 1208/1742 [24:35<29:23,  3.30s/it]

feature_batch_70 completes saving.


 70%|███████   | 1225/1742 [24:55<29:30,  3.42s/it]

feature_batch_71 completes saving.


 71%|███████▏  | 1242/1742 [25:14<27:05,  3.25s/it]

feature_batch_72 completes saving.


 72%|███████▏  | 1259/1742 [25:34<25:36,  3.18s/it]

feature_batch_73 completes saving.


 73%|███████▎  | 1276/1742 [25:53<24:57,  3.21s/it]

feature_batch_74 completes saving.


 74%|███████▍  | 1293/1742 [26:12<24:12,  3.23s/it]

feature_batch_75 completes saving.


 75%|███████▌  | 1310/1742 [26:32<22:56,  3.19s/it]

feature_batch_76 completes saving.


 76%|███████▌  | 1327/1742 [26:51<22:49,  3.30s/it]

feature_batch_77 completes saving.


 77%|███████▋  | 1344/1742 [27:11<21:21,  3.22s/it]

feature_batch_78 completes saving.


 78%|███████▊  | 1361/1742 [27:30<20:07,  3.17s/it]

feature_batch_79 completes saving.


 79%|███████▉  | 1378/1742 [27:49<18:57,  3.12s/it]

feature_batch_80 completes saving.


 80%|████████  | 1395/1742 [28:07<18:02,  3.12s/it]

feature_batch_81 completes saving.


 81%|████████  | 1412/1742 [28:27<17:36,  3.20s/it]

feature_batch_82 completes saving.


 82%|████████▏ | 1429/1742 [28:47<17:45,  3.41s/it]

feature_batch_83 completes saving.


 83%|████████▎ | 1446/1742 [29:07<16:23,  3.32s/it]

feature_batch_84 completes saving.


 84%|████████▍ | 1463/1742 [29:27<15:54,  3.42s/it]

feature_batch_85 completes saving.


 85%|████████▍ | 1480/1742 [29:48<15:33,  3.56s/it]

feature_batch_86 completes saving.


 86%|████████▌ | 1497/1742 [30:09<14:13,  3.49s/it]

feature_batch_87 completes saving.


 87%|████████▋ | 1514/1742 [30:30<13:04,  3.44s/it]

feature_batch_88 completes saving.


 88%|████████▊ | 1531/1742 [30:51<12:16,  3.49s/it]

feature_batch_89 completes saving.


 89%|████████▉ | 1548/1742 [31:12<10:51,  3.36s/it]

feature_batch_90 completes saving.


 90%|████████▉ | 1565/1742 [31:32<09:40,  3.28s/it]

feature_batch_91 completes saving.


 91%|█████████ | 1582/1742 [31:53<09:18,  3.49s/it]

feature_batch_92 completes saving.


 92%|█████████▏| 1599/1742 [32:13<07:57,  3.34s/it]

feature_batch_93 completes saving.


 93%|█████████▎| 1616/1742 [32:32<07:00,  3.34s/it]

feature_batch_94 completes saving.


 94%|█████████▎| 1633/1742 [32:52<06:10,  3.40s/it]

feature_batch_95 completes saving.


 95%|█████████▍| 1650/1742 [33:12<05:09,  3.36s/it]

feature_batch_96 completes saving.


 96%|█████████▌| 1667/1742 [33:32<04:05,  3.27s/it]

feature_batch_97 completes saving.


 97%|█████████▋| 1684/1742 [33:50<03:06,  3.21s/it]

feature_batch_98 completes saving.


 98%|█████████▊| 1701/1742 [34:11<02:23,  3.51s/it]

feature_batch_99 completes saving.


 99%|█████████▊| 1718/1742 [34:31<01:22,  3.43s/it]

feature_batch_100 completes saving.


100%|█████████▉| 1735/1742 [34:49<00:22,  3.25s/it]

feature_batch_101 completes saving.


100%|██████████| 1742/1742 [34:57<00:00,  1.20s/it]

feature_batch_102 completes saving.
CPU times: user 31min 59s, sys: 2min 42s, total: 34min 42s
Wall time: 35min 5s





#### Run the feature storage pipeline for carts

In [13]:
%%time
result_iuf_carts = nb.typed.Dict.empty(
    key_type = nb.types.int64,
    value_type = nb.types.int64[:])

features_all_sessions = [] # session, aid, feature tuple
                          ## session, aid, feature tuple
gc.collect()

## Given there are 1783737 sessions in total, we separate them into K batches
K = 12
batch_size = 1783737 // K  ##  -> 148644 dealing with around 150k sessions per batch
row_idx_cutoffs = [(len(df) - len(df_test)) + (1024 * 145) * i for i in range(1, K+3)]   ## batch process every 1024 * 145 rows
# 145 -> 148644 // 1024  -> batch_size // PARALLEL

feature_batch_id = 0
## load important tables
valid_gt_sessions, gt_labels = load_gt_tables("carts")
print("finish loading the gt datas")

for row_idx in tqdm(range(len(df) - len(df_test), len(df), PARALLEL)):
    start_row = row_idx
    end_row = min(row_idx + PARALLEL, len(df))
    rows = df.iloc[start_row: end_row][['session', 'start_idx', 'total_action', 'session_start_time']].values
#     save_features_parallel(rows, aids, ops, ts, result_iuf_orders, simMatrices["iuf"], np.array([2.0, 8.0, 6.0]), orders_features_save)  
    ## run things in parallel
    for row_idx in nb.prange(len(rows)):
        session, starting_idx, length, start_time = rows[row_idx]
        features_tuples_this_session = save_feature_single_session(session, starting_idx, length, start_time, aids, ops, ts, result_iuf_carts, simMatrices["iuf"], item_total_likes, np.array([4.0, 2.0, 5.0]))
        features_all_sessions.append(features_tuples_this_session)
    
    if (start_row in row_idx_cutoffs) or (end_row == len(df)):
        ## save batch result
        rawDf = pd.DataFrame({"session": result_iuf_carts.keys(), "aids": result_iuf_carts.values(), "feature_tuple": features_all_sessions})
        batch_result = process_batch_pipeline(rawDf, valid_gt_sessions, gt_labels)
        batch_result.to_parquet(f"../../allData/features/cart_features_V5/batch_result_{feature_batch_id}.parquet")
        ## clean the memory for next batch
        del batch_result, rawDf, features_all_sessions, result_iuf_carts
        gc.collect()
        ## progress update
        print(f"feature_batch_{feature_batch_id} completes saving.")
        feature_batch_id += 1
        ## initiate the struct for new batch again
        result_iuf_carts = nb.typed.Dict.empty(
            key_type = nb.types.int64,
            value_type = nb.types.int64[:])
        features_all_sessions = []
        
    #break
#     gc.collect()

  0%|          | 0/1742 [00:00<?, ?it/s]

finish loading the gt datas


  update_feature_vec(aid, features_tuple_arr, features_idx_map, \
  update_feature_vec(aid, features_tuple_arr, features_idx_map, \
  update_feature_vec(similar_item, features_tuple_arr, features_idx_map, \
  8%|▊         | 146/1742 [05:33<32:44:28, 73.85s/it]

feature_batch_0 completes saving.


 17%|█▋        | 291/1742 [09:45<18:32:45, 46.01s/it]

feature_batch_1 completes saving.


 25%|██▌       | 436/1742 [15:32<27:41:57, 76.35s/it]

feature_batch_2 completes saving.


 33%|███▎      | 581/1742 [19:05<11:45:59, 36.49s/it]

feature_batch_3 completes saving.


 42%|████▏     | 726/1742 [23:39<15:54:44, 56.38s/it]

feature_batch_4 completes saving.


 50%|█████     | 871/1742 [26:53<7:38:28, 31.58s/it] 

feature_batch_5 completes saving.


 58%|█████▊    | 1016/1742 [29:30<4:13:49, 20.98s/it]

feature_batch_6 completes saving.


 67%|██████▋   | 1161/1742 [32:45<5:15:20, 32.57s/it]

feature_batch_7 completes saving.


 75%|███████▍  | 1306/1742 [35:58<3:42:00, 30.55s/it]

feature_batch_8 completes saving.


 83%|████████▎ | 1451/1742 [39:12<2:16:28, 28.14s/it]

feature_batch_9 completes saving.


 92%|█████████▏| 1596/1742 [43:10<1:42:22, 42.07s/it]

feature_batch_10 completes saving.


100%|█████████▉| 1741/1742 [46:00<00:23, 23.21s/it]  

feature_batch_11 completes saving.


100%|██████████| 1742/1742 [46:01<00:00,  1.59s/it]

feature_batch_12 completes saving.
CPU times: user 19min 49s, sys: 16min 34s, total: 36min 24s
Wall time: 46min 12s





### Do same thing for carts, Draft from earlier, DEPRECATED

In [12]:
%%time
# result_iuf_orders = nb.typed.Dict.empty(
#     key_type = nb.types.int64,
#     value_type = nb.types.int64[:])

result_iuf_carts = nb.typed.Dict.empty(
    key_type = nb.types.int64,
    value_type = nb.types.int64[:])

# result_time_decay_clicks = nb.typed.Dict.empty(
#     key_type = nb.types.int64,
#     value_type = nb.types.int64[:])

features_all_sessions = []
gc.collect()

for row_idx in tqdm(range(len(df) - len(df_test), len(df), PARALLEL)):
    start_row = row_idx
    end_row = min(row_idx + PARALLEL, len(df))
    rows = df.iloc[start_row: end_row][['session', 'start_idx', 'total_action', 'session_start_time']].values
#     save_features_parallel(rows, aids, ops, ts, result_iuf_orders, simMatrices["iuf"], np.array([2.0, 8.0, 6.0]), orders_features_save)  
    ## run things in parallel
    for row_idx in nb.prange(len(rows)):
        session, starting_idx, length, start_time = rows[row_idx]
        features_tuples_this_session = save_feature_single_session(session, starting_idx, length, start_time, aids, ops, ts, result_iuf_carts, simMatrices["iuf"], np.array([4.0, 2.0, 5.0]))
        features_all_sessions.append(features_tuples_this_session)
#     gc.collect()

  update_feature_vec(aid, features_tuple_arr, features_idx_map, (1, seq_w, time_w, test_ops_weights[op], length, len(unique_aids), potential_to_recommend[aid]))
100%|██████████| 1742/1742 [09:11<00:00,  3.16it/s]  

CPU times: user 4min 18s, sys: 1min 35s, total: 5min 53s
Wall time: 9min 11s





In [14]:
%%time
## save the result as df 
features_save = pd.DataFrame({"session": result_iuf_carts.keys(), "aids": result_iuf_carts.values(), "feature_tuple": features_all_sessions})
# features_save = features_save.set_index(['session']).apply(pd.Series.explode).reset_index()
del features_all_sessions
gc.collect()

CPU times: user 9.64 s, sys: 26.8 s, total: 36.5 s
Wall time: 2min 21s


10031

In [15]:
%%time
## A total of 569697 carts action that's predictable 
## get all valid sessions with carts actions
## there are 150179 / 1783737 sessions have orders actions, 301057 / 1783737 have carts, 1737968 / 1783737 have clicks
carts_labels = pd.read_json("../../allData/validationData/out_7day_test/test_labels.jsonl", lines=True)
carts_labels['aids'] = carts_labels["labels"].apply(lambda x: x.get("carts"))
carts_labels = carts_labels[carts_labels.aids.notnull()]
carts_labels = carts_labels.drop("labels", axis = 1)
## ========= special df to identify the unique session id to look at ================
valid_cart_sessions = carts_labels.drop("aids", axis = 1) 
## ========================================================================
## keep go on for cart labels processing
carts_labels = carts_labels.set_index(['session']).apply(pd.Series.explode).reset_index()
carts_labels["gt"] = 1

CPU times: user 17.9 s, sys: 1min 9s, total: 1min 27s
Wall time: 5min 50s


In [20]:
## join with features_save, now only cart_features in valid sessions are kept, and as expected 301057 sessions are the valid sessions to expand
cart_features_valid_session = pd.merge(features_save, valid_cart_sessions, on="session")

In [24]:
## Now explode the whole valid session aids, a total of 27168199 session - aid are served as the traini/val/test data for cart, 
## a total of 569697 correct guesses(not 100% included in the recall)
cart_features_valid_session = cart_features_valid_session.set_index(['session']).apply(pd.Series.explode).reset_index()

In [28]:
## finally 
carts_full_df = pd.merge(cart_features_valid_session, carts_labels, on=["session", "aids"], how='left')

In [33]:
## open up the feature tuple 
carts_full_df["prev_int"] = carts_full_df["feature_tuple"].apply(lambda x: x[0])
carts_full_df["seq_w"] = carts_full_df["feature_tuple"].apply(lambda x: x[1])
carts_full_df["time_w"] = carts_full_df["feature_tuple"].apply(lambda x: x[2])
carts_full_df["ops_total"] = carts_full_df["feature_tuple"].apply(lambda x: x[3])
print("Done with ops_total")
carts_full_df["session_len"] = carts_full_df["feature_tuple"].apply(lambda x: x[4])
carts_full_df["session_unique_aid"] = carts_full_df["feature_tuple"].apply(lambda x: x[5])
carts_full_df["rank_score"] = carts_full_df["feature_tuple"].apply(lambda x: x[6])
carts_full_df = carts_full_df.drop("feature_tuple", axis = 1)

Done with ops_total


In [35]:
carts_full_df.head()

Unnamed: 0,session,aids,gt,prev_int,seq_w,time_w,ops_total,session_len,session_unique_aid,rank_score
0,11098528,11830,,True,0.231144,3.0,4,1,1,27.73733
1,11098528,1732105,,False,0.231144,3.0,4,1,1,2.773733
2,11098528,588923,,False,0.231144,3.0,4,1,1,1.466177
3,11098528,571762,,False,0.231144,3.0,4,1,1,0.75204
4,11098528,884502,,False,0.231144,3.0,4,1,1,0.709451


In [36]:
carts_full_df.to_parquet("../../allData/features/carts_features_100_per_session_V3_opwFix.parquet")

In [None]:
    # final_df["prev_int"] = final_df["feature_tuple"].apply(lambda x: x[0])
    # final_df["seq_w_total"] = final_df["feature_tuple"].apply(lambda x: x[1])
    # final_df["time_w_total"] = final_df["feature_tuple"].apply(lambda x: x[2])
    # final_df["ops_total"] = final_df["feature_tuple"].apply(lambda x: x[3])
    # final_df["session_len"] = final_df["feature_tuple"].apply(lambda x: x[4])
    # final_df["session_unique_aid"] = final_df["feature_tuple"].apply(lambda x: x[5])
    # final_df["cf_score"] = final_df["feature_tuple"].apply(lambda x: x[6])
    # final_df["item_total_like"] = final_df["feature_tuple"].apply(lambda x: x[7])
    # final_df["num_reference_time"] = final_df["feature_tuple"].apply(lambda x: x[8])
    # final_df["max_sim_score"] = final_df["feature_tuple"].apply(lambda x: x[9])
    # final_df["mean_sim_score"] = final_df["feature_tuple"].apply(lambda x: x[10])
    # final_df["num_interact"] = final_df["feature_tuple"].apply(lambda x: x[11])
    # final_df["time_span"] = final_df["feature_tuple"].apply(lambda x: x[12])
    # final_df["action_recency"] = final_df["feature_tuple"].apply(lambda x: x[13])
    # final_df["seq_w_max"] = final_df["feature_tuple"].apply(lambda x: x[14])
    # final_df["seq_w_mean"] = final_df["feature_tuple"].apply(lambda x: x[15])

In [None]:
## Given there are 1783737 sessions in total, we separate them into K batches
K = 6
batch_size = 1783737 // K  ##  -> 297289 ~ 1024 * 290 dealing with around 30k sessions per batch
row_idx_cutoffs = [(len(df) - len(df_test)) + (1024 * 290)* i for i in range(1, 8)]

## Significant change to feature storage, 
using Session Cache instead of dynamic save
### Intended structure
For each session, a structure of following will be created   
session_ts_cache: {  
                   aid1: [ts_1, ts_2, ...],   
                   aid2: [ts, ......],  
                   ..........  
                   }

session_ops_cache: {
                  aid1: [ops_1, ops_2, ...] 
}

session_simScore_cache: {
                 aid1: []
}

session_CFIncre_cache: {
                 aid1: []
}

session_seqW_cache: {
    aid1
}

session_timeW_cache: {

}

session_rawSeqOrder_cache: {
    
}


In [162]:
## test interface
df_test

Unnamed: 0,session,total_action,session_start_time,session_end_time
0,11098528,1,1661119200,1661119200
1,11098529,1,1661119200,1661119200
2,11098530,6,1661119200,1661120532
3,11098531,24,1661119200,1661119746
4,11098532,2,1661119201,1661119996
...,...,...,...,...
1783732,12899774,1,1661723968,1661723968
1783733,12899775,1,1661723970,1661723970
1783734,12899776,1,1661723972,1661723972
1783735,12899777,1,1661723976,1661723976


In [165]:
df[df.session == 11098531]

Unnamed: 0,session,total_action,session_start_time,session_end_time,start_idx,end_time
10584520,11098531,24,1661119200,1661119746,163441177,1661119746


In [230]:
@nb.jit(nopython=True)
def save_feature_single_session_by_caching(session, starting_idx, length, start_time, aids, ops, ts, result, full_sim_matrix, item_total_likes, test_ops_weights):

    NOW_TIME = ts[-1] ## ts of latest avaiable action
    PREV_INTERACT_BONUS = 20
    NEARBY_ACTION_BONUS = 1.5
    
    ending_idx = starting_idx + length 
    end_time = ts[ending_idx - 1]
    time_span = end_time - start_time
    
    candidates = aids[starting_idx: ending_idx][::-1]
    candidates_ops = ops[starting_idx: ending_idx][::-1]
    
    ## record all potential aid that might be relevant
    potential_to_recommend = nb.typed.Dict.empty(key_type=nb.types.int64, value_type=nb.types.float64)
    
    ## get unique aid of each session 
    unique_aids = nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)
    for a in candidates:
        unique_aids[a] = 0
    
    ## Sequence weight to all the candidates, from near to far 
    sequence_weight = np.power(2, np.linspace(0.3, 1, len(candidates)))[::-1] - 1

    raw_sequence = np.arange(1, len(candidates) + 1)
    
    ## Time weight of all candidates, from near to far
    time_weights = []
    time_lapse = end_time - start_time + 1  ## +1 to avoid zero
    for idx in range(starting_idx, ending_idx):
        if end_time - ts[idx] < 2 * 60 * 60:   ## apply nearby action bonus
            time_weight = (1 + 0.5 ** ((end_time - ts[idx])/time_lapse)) * NEARBY_ACTION_BONUS
        else:
            time_weight = 1 + 0.5 ** ((end_time - ts[idx])/time_lapse)
        time_weights.append(time_weight)
    time_weights = time_weights[::-1]
    
    ## feature vector template: [aid: <is_prev_int, seq_w, time_w, associated_action, session_len,.. >]
    features_tuple_arr = nb.typed.List()
    features_tuple_arr.append(FEATURE_TUPLE_TEMPLATE)
    features_idx_map = nb.typed.Dict.empty(key_type=nb.types.int64, value_type=nb.types.int64)

    ## initiate the caches for the features
    ts_cache = nb.typed.Dict.empty(key_type = nb.types.int64, value_type=np.array([np.float64(0.0) for _ in range(0)]))
    ops_cache = nb.typed.Dict.empty(key_type = nb.types.int64, value_type=np.array([np.float64(0.0) for _ in range(0)]))    #value_type = nb.types.float64[:])
    simScore_cache = nb.typed.Dict.empty(key_type = nb.types.int64, value_type=np.array([np.float64(0.0) for _ in range(0)]))
    cfIncre_cache = nb.typed.Dict.empty(key_type = nb.types.int64, value_type=np.array([np.float64(0.0) for _ in range(0)]))
    seqW_cache = nb.typed.Dict.empty(key_type = nb.types.int64, value_type=np.array([np.float64(0.0) for _ in range(0)]))
    timeW_cache = nb.typed.Dict.empty(key_type = nb.types.int64, value_type=np.array([np.float64(0.0) for _ in range(0)]))
    actionW_cache = nb.typed.Dict.empty(key_type = nb.types.int64, value_type=np.array([np.float64(0.0) for _ in range(0)]))
    raw_seqOrder_cache = nb.typed.Dict.empty(key_type = nb.types.int64, value_type=np.array([np.float64(0.0) for _ in range(0)]))

    helper_idx = starting_idx
    ## making inference
    if len(unique_aids) >= 20:  
        for aid, op, seq_w, raw_seq_order, time_w in zip(candidates, candidates_ops, sequence_weight, raw_sequence, time_weights):
            if aid not in potential_to_recommend:
                potential_to_recommend[aid] = 0
                ## init all cache obj
                ts_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                ops_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                simScore_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                cfIncre_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                seqW_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                timeW_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                actionW_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                raw_seqOrder_cache[aid] = np.array([np.float64(0) for _ in range(0)])
            ## caculate scores
            cf_incre = seq_w * time_w * test_ops_weights[op]
            potential_to_recommend[aid] += cf_incre #* PREV_INTERACT_BONUS
            ## append features
            ts_cache[aid] = np.append(ts_cache[aid], ts[helper_idx])
            ops_cache[aid] = np.append(ops_cache[aid], op)
            simScore_cache[aid] = np.append(simScore_cache[aid], 1)
            cfIncre_cache[aid] = np.append(cfIncre_cache[aid], cf_incre)
            seqW_cache[aid] = np.append(seqW_cache[aid], seq_w)
            timeW_cache[aid] = np.append(timeW_cache[aid], time_w)
            actionW_cache[aid] = np.append(actionW_cache[aid], test_ops_weights[op])
            raw_seqOrder_cache[aid] = np.append(raw_seqOrder_cache[aid], raw_seq_order)
            
            
            helper_idx += 1
    else:   ## otherwise, fill the rest with similar items.
        for aid, op, seq_w, raw_seq_order, time_w in zip(candidates, candidates_ops, sequence_weight, raw_sequence, time_weights):
            if aid not in potential_to_recommend:
                potential_to_recommend[aid] = 0
                ## init all cache obj
                ts_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                ops_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                simScore_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                cfIncre_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                seqW_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                timeW_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                actionW_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                raw_seqOrder_cache[aid] = np.array([np.float64(0) for _ in range(0)])
            ## get the scores
            cf_incre = seq_w * time_w * test_ops_weights[op] * PREV_INTERACT_BONUS
            potential_to_recommend[aid] += cf_incre
            ## append features
            ts_cache[aid] = np.append(ts_cache[aid], ts[helper_idx])
            ops_cache[aid] = np.append(ops_cache[aid], op)
            simScore_cache[aid] = np.append(simScore_cache[aid], 1)
            cfIncre_cache[aid] = np.append(cfIncre_cache[aid], cf_incre)
            seqW_cache[aid] = np.append(seqW_cache[aid], seq_w)
            timeW_cache[aid] = np.append(timeW_cache[aid], time_w)
            actionW_cache[aid] = np.append(actionW_cache[aid], test_ops_weights[op])
            raw_seqOrder_cache[aid] = np.append(raw_seqOrder_cache[aid], raw_seq_order)
            ## adding the similar items, if full_sim_matrix don't have such record, skip. 
            if aid not in full_sim_matrix:
                continue
            for similar_item in full_sim_matrix[aid]:
                ## if sim_item is in candidates, would be included above anyways, skip 
                if similar_item in candidates:
                    continue
                if similar_item not in potential_to_recommend:
                    potential_to_recommend[similar_item] = 0
                    ## init all cache obj
                    ts_cache[similar_item] = np.array([np.float64(0) for _ in range(0)])
                    ops_cache[similar_item] = np.array([np.float64(0) for _ in range(0)])
                    simScore_cache[similar_item] = np.array([np.float64(0) for _ in range(0)])
                    cfIncre_cache[similar_item] = np.array([np.float64(0) for _ in range(0)])
                    seqW_cache[similar_item] = np.array([np.float64(0) for _ in range(0)])
                    timeW_cache[similar_item] = np.array([np.float64(0) for _ in range(0)])
                    actionW_cache[similar_item] = np.array([np.float64(0) for _ in range(0)])
                    raw_seqOrder_cache[similar_item] = np.array([np.float64(0) for _ in range(0)])
                
                cf_incre = seq_w * time_w * test_ops_weights[op] * full_sim_matrix[aid][similar_item]
                potential_to_recommend[similar_item] += cf_incre  ## no PREV_INTERACT_BONUS as expected, replaced with sim_matrix scores
                ## append features
                ops_cache[similar_item] = np.append(ops_cache[similar_item], op)
                ts_cache[similar_item] = np.append(ts_cache[similar_item], ts[helper_idx])
                ops_cache[similar_item] = np.append(ops_cache[similar_item], op)
                simScore_cache[similar_item] = np.append(simScore_cache[similar_item], 1)
                cfIncre_cache[similar_item] = np.append(cfIncre_cache[similar_item], cf_incre)
                seqW_cache[similar_item] = np.append(seqW_cache[similar_item], seq_w)
                timeW_cache[similar_item] = np.append(timeW_cache[similar_item], time_w)
                actionW_cache[similar_item] = np.append(actionW_cache[similar_item], test_ops_weights[op])
                raw_seqOrder_cache[similar_item] = np.append(raw_seqOrder_cache[similar_item], raw_seq_order)
                
            helper_idx += 1

    result[session] = np.array(heap_topk_return_list(potential_to_recommend, 100))  ## Take top 100 for validation runs. 
    
#     feature_tuples_this_session = []
#     for aid in result[session]:
# #         features_save[(session, aid)] = features_tuple_arr[features_idx_map[aid]]
#         feature_tuples_this_session.append(features_tuple_arr[features_idx_map[aid]])
    
    return ops_cache, raw_seqOrder_cache, seqW_cache

In [231]:
pseudo_result = nb.typed.Dict.empty(
    key_type = nb.types.int64,
    value_type = nb.types.int64[:])
test_cache, test1_cache, test2_cache = save_feature_single_session_by_caching(11098531, 163441177, 24, 1661119200, aids, ops, ts, pseudo_result , simMatrices["iuf"], item_total_likes, np.array([1, 2, 3]))

In [221]:
test_cache[653835]

array([2., 2., 2., 2., 2., 2., 2., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0.])

In [213]:
test_cache[653835]

array([2., 2., 2., 2., 2., 2., 2., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0.])

In [214]:
test1_cache[653835]

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 11., 12., 13., 14.,
       15., 16., 17., 18., 20., 21., 22., 24.])

In [215]:
test2_cache[653835]

array([1.        , 0.95825035, 0.91737222, 0.87734741, 0.83815811,
       0.79978689, 0.76221665, 0.72543069, 0.68941263, 0.61961642,
       0.58580721, 0.55270376, 0.52029135, 0.48855553, 0.4574822 ,
       0.42705751, 0.39726794, 0.33954136, 0.31157867, 0.2841997 ,
       0.23114441])

In [194]:
for aid in test_cache:
    print(test_cache[aid])
    break

[2. 0. 0.]


In [196]:
np.arange(1, 10)[::-1]

array([9, 8, 7, 6, 5, 4, 3, 2, 1])

In [204]:
np.power(2, np.linspace(0.3, 1, 9))[::-1] - 1

array([1.        , 0.88230446, 0.77153504, 0.66728415, 0.5691682 ,
       0.47682615, 0.38991822, 0.30812463, 0.23114441])

In [111]:
test = nb.typed.Dict.empty(
    key_type = nb.types.int64,
    value_type = nb.types.int64[:])

test[123] = np.array([234, 4343])
test

DictType[int64,array(int64, 1d, A)]<iv=None>({123: [ 234 4343]})

In [185]:
ops_cache = nb.typed.Dict.empty(
    key_type = nb.types.int32, 
    value_type = nb.types.float32[:])
ops_cache[123] = np.array([np.float32(0.0) for _ in range(0)], dtype=np.float32) # np.array(dtype=np.float32)
ops_cache[123]

array([], dtype=float32)

In [186]:
ops_cache[123] = np.array(np.append(ops_cache[123], 2.0), dtype=np.float32)
ops_cache[123] = np.array(np.append(ops_cache[123], 8.0), dtype=np.float32)

In [187]:
ops_cache[123]

array([2., 8.], dtype=float32)

In [161]:
ops_cache[123][-1:]

array([8.], dtype=float32)

In [158]:
np.max(ops_cache[123])

8.0

In [191]:
len(ops_cache[123])

2