In [1]:
import os
import gc
import heapq
import pickle
import numba as nb
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import math

In [2]:
%%time
## Directory holding the meta-data CSVs and core-data NPZ archives; defined once so the
## four loads below cannot drift apart.
DATA_DIR = "../../allData/submission_phase_data/replicate_otto_fast_pipeline_source_data"

## Meta data: train rows first, test rows appended after them (the index is rebuilt).
df = pd.read_csv(f"{DATA_DIR}/train_meta_data.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test_meta_data.csv")
df = pd.concat([df, df_test]).reset_index(drop = True)

## Flat action arrays, concatenated in the same train-then-test order as df.
npz = np.load(f"{DATA_DIR}/train_core_data.npz")
npz_test = np.load(f"{DATA_DIR}/test_core_data.npz")
aids = np.concatenate([npz['aids'], npz_test['aids']])
ts = np.concatenate([npz['ts'], npz_test['ts']])
ops = np.concatenate([npz['ops'], npz_test['ops']])

## start_idx = number of actions belonging to all previous rows, i.e. the offset of the
## row's first action inside aids / ts / ops.
df["start_idx"] = df['total_action'].cumsum().shift(1).fillna(0).astype(int)
## end_time = ts value of the row's last action.
df["end_time"] = ts[df["start_idx"] + df["total_action"] - 1]

CPU times: user 8.47 s, sys: 3.97 s, total: 12.4 s
Wall time: 13 s


In [3]:
## Define constants
## Number of df rows (sessions) handed to getSimScoreBatch per batch.
PARALLEL = 1024
LOOKBACK_WINDOW = 200   ## only the latest LOOKBACK_WINDOW actions of a session are used to train the sim matrix
#TOPN = 20
## Weight per action type, indexed by the value stored in `ops`
## (the feature code counts op 0 as click, 1 as cart, 2 as order).
ACTION_WEIGHTS = np.array([1.0, 6.0, 3.0])

## Phase I: sim matrix

In [4]:
# ==================================
# Methods for counting Item Total Likes
# ==================================
@nb.jit(nopython=True)
def getItemTotalLikesNaive(aids, ops, item_total_likes, action_weights):
    """
    Accumulate, for every item id in `aids`, the sum of the weights of all its actions.

    aids / ops: parallel arrays (item id and action type of each action).
    item_total_likes: typed dict {item id: total weight}, updated in place.
    action_weights: array indexed by action type, e.g. np.array([X, Y, Z]).
    """
    for pos in range(len(aids)):
        aid = aids[pos]
        weight = action_weights[ops[pos]]   ## TODO: For time decay, consider replace with 1, for iuf keep this.
        if aid in item_total_likes:
            item_total_likes[aid] += weight
        else:
            ## first occurrence: the running total starts at this action's weight
            item_total_likes[aid] = weight

# ==================================
# Methods for rank and trim the sim score dict
# ==================================
@nb.jit(nopython = True)
def heap_topk(item_cnt_dict, cap):
    """
    Keep the `cap` highest-valued entries of `item_cnt_dict`.

    Returns a new typed dict {item: score} whose entries are inserted from the highest
    score to the lowest. A min-heap of at most `cap` (score, item) pairs is used for selection.
    """
    ## empty list whose element type (float64, int64) is still inferable by numba
    min_heap = [(np.float64(0), np.int64(0)) for _ in range(0)]
    for candidate, score in item_cnt_dict.items():
        heapq.heappush(min_heap, (score, candidate))   ## ordered by score first, item id second
        if len(min_heap) > cap:
            heapq.heappop(min_heap)   ## evict the current smallest pair

    ## drain the heap in ascending order, then insert back-to-front (best first)
    ascending = [heapq.heappop(min_heap) for _ in range(len(min_heap))]
    top = nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)
    for pos in range(len(ascending) - 1, -1, -1):
        kept_score, kept_item = ascending[pos]
        top[kept_item] = kept_score

    return top
   
@nb.jit(nopython = True)
def trim_simMatrix_topk(fullSimMatrix, k = 50):
    """
    Replace, in place, every row "itemX: {itemY: score, ...}" of fullSimMatrix with the
    dict of its k highest-scoring entries (as returned by heap_topk). k defaults to 50.
    """
    ## only existing keys are reassigned, so the outer dict never changes size while iterating
    for item in fullSimMatrix:
        fullSimMatrix[item] = heap_topk(fullSimMatrix[item], k)

# ==================================
# Methods for score normalization
# ==================================

# @nb.jit(nopython=True)
# def itemTotalLikeNorm(fullSimMatrix, item_total_likes):
#     for aid_1, relations in fullSimMatrix.items():
#         for aid_2, sim_score in relations.items():
#             fullSimMatrix[aid_1][aid_2] = sim_score / (item_total_likes[aid_1] * item_total_likes[aid_2]) ** 0.1  ## TODO: consider 0.1 or other small number
            
@nb.jit(nopython=True)
def maxNormSimMatrix(fullSimMatrix):
    """
    Scale every row of the similarity matrix in place so that its largest score becomes 1.

    fullSimMatrix: nested typed dict {aid_1: {aid_2: sim_score, ...}, ...}; modified in place.

    A row whose largest score is exactly 0 is left untouched: dividing by that maximum
    would raise ZeroDivisionError under the default nopython error model. An empty row
    is a no-op because there is nothing to iterate over.
    """
    for aid_1, relations in fullSimMatrix.items():
        ## first pass: find the row maximum
        max_num = -np.inf
        for _, sim_score in relations.items():
            if sim_score > max_num:
                max_num = sim_score
        ## nothing sensible to scale by: skip the row instead of dividing by zero
        if max_num == 0:
            continue
        ## second pass: only existing keys are rewritten, so the row keeps its size
        for aid_2, sim_score in relations.items():
            fullSimMatrix[aid_1][aid_2] = sim_score / max_num

In [5]:
@nb.jit(nopython=True)
def getSimScoresSingleRow(pairs_this_row, start_time, start_idx, length, aids, ts, ops, item_total_likes, action_weights, mode):
    """
    Get the sim scores of items within single session, can be ran in parallel within each batch. 
    """
    max_idx = start_idx + length
    min_idx = max(max_idx - LOOKBACK_WINDOW, start_idx)  
    for i in range(min_idx, max_idx):
        for j in range(i+1, max_idx):
            if ts[j] - ts[i] > 2 * 60 * 60: continue  #TODO: try 2h only
            if aids[i] == aids[j]: continue
            
            if mode == "cosine":
                w_ij = action_weights[ops[j]] 
                w_ji = action_weights[ops[i]] 
            elif mode == "iuf":  ## penalize users that had lots of actions TODO: consider location weight
                
                loc_weight = 0.5**(abs(i-j))   #math.exp(-0.02 * abs(i-j)) 
                time_gap_weight = 0.5 ** (abs(ts[i]-ts[j]) / (1.5*60*60))  
                w_ij = action_weights[ops[j]] * time_gap_weight * loc_weight / math.log1p(length)
                w_ji = action_weights[ops[i]] * time_gap_weight * loc_weight / math.log1p(length)
            elif mode == "time_decay":
                ## calculate some time weights of each item, more weights are given when ts is later. #TODO: try adding (i-j) location weight, exponential weight, 0.5 ** (abs(i-j + 1)), 
                loc_weight = 0.5**(abs(i-j))   #math.exp(-0.02 * abs(i-j)) 
                #time_i = 1 + 0.1 ** ((1662328791-ts[i])/(1662528791-1659304800)) #1 + 3 * (ts[i] + start_time - 1659304800) / (1662328791 - 1659304800) #  #(1 - 0.8 *(TEST_END_TS - ts[i]) / TIME_SPAN) ** 0.5 # 0.2~1 #   ## time decay weight for item i 
                #time_j = 1 + 0.1 ** ((1662328791-ts[j])/(1662328791-1659304800))  # 1 + 3 * (ts[j] + start_time - 1659304800) / (1662328791 - 1659304800) # #  #(1 - 0.8 *(TEST_END_TS - ts[j]) / TIME_SPAN) ** 0.5   # 
                time_i = 1 + 1/(1 + math.exp(10*( ((1662328791-ts[i])/(1662328791-1659304800)) - 0.6  )))
                time_j = 1 + 1/(1 + math.exp(10*( ((1662328791-ts[j])/(1662328791-1659304800)) - 0.6  )))
                
                time_gap_weight = 0.5 ** (abs(ts[i]-ts[j]) / (1.5*60*60))  
                
                w_ij = action_weights[ops[j]] * loc_weight * time_gap_weight * time_i / math.log1p(length)
                w_ji = action_weights[ops[i]] * loc_weight * time_gap_weight * time_j / math.log1p(length)
            elif mode == "buy2buy":
                if (ops[i] == 0) or (ops[j] == 0):
                    continue
                loc_weight = 0.5**(abs(i-j))   #math.exp(-0.02 * abs(i-j)) 
                time_gap_weight = 0.5 ** (abs(ts[i]-ts[j]) / (1.5*60*60))  
                w_ij = action_weights[ops[j]] * time_gap_weight * loc_weight / math.log1p(length)
                w_ji = action_weights[ops[i]] * time_gap_weight * loc_weight / math.log1p(length)
                
            pairs_this_row[(aids[i], aids[j])] = w_ij / (item_total_likes[aids[i]] * item_total_likes[aids[j]]) ** 0.1
            pairs_this_row[(aids[j], aids[i])] = w_ji / (item_total_likes[aids[i]] * item_total_likes[aids[j]]) ** 0.1

@nb.jit(nopython=True, parallel=True, cache=True)
def getSimScoreBatch(aids, ts, ops, rows, fullSimMatrix, action_weights, item_total_likes, mode="cosine"):
    """
    Score one batch of sessions and add the results to fullSimMatrix in place.

    aids, ts, ops: flat per-action arrays, passed through to getSimScoresSingleRow.
    rows: 2-D array with one session per row; each row is unpacked as
        (ignored, start_idx, length, start_time).
    fullSimMatrix: nested dict {aid1: {aid2: score}}; scores from this batch are added to it.
    action_weights, item_total_likes, mode: passed through to getSimScoresSingleRow.
    """
    nrows = len(rows)
    ## one empty {(aid, aid): float} dict per row; the zero-iteration comprehension only
    ## exists to give the empty dict a key / value type
    pairs_this_batch = [{(0, 0): 0.0 for _ in range(0)} for _ in range(nrows)]
    ## get the sim scores of each row in a separate sub dict in pairs_this_batch
    for row_i in nb.prange(nrows):  ## run each row of the batch in parallel; each iteration writes only its own dict
        _, start_idx, length, start_time = rows[row_i]
        getSimScoresSingleRow(pairs_this_batch[row_i], start_time, start_idx, length, aids, ts, ops, item_total_likes, action_weights, mode)
    ## merge pairs_this_batch into the fullSimMatrix (plain range: the shared matrix is updated sequentially)
    for row_i in range(nrows):
        for (aid1, aid2), score in pairs_this_batch[row_i].items():
            if aid1 not in fullSimMatrix: 
                ## empty inner dict, typed through the same zero-iteration comprehension trick
                fullSimMatrix[aid1] = {0: 0.0 for _ in range(0)}
            if aid2 not in fullSimMatrix[aid1]:
                fullSimMatrix[aid1][aid2] = 0.0
            fullSimMatrix[aid1][aid2] += score


In [6]:
%%time
## Build the {item id: total weighted actions} table over the whole action log
item_total_likes = nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)
getItemTotalLikesNaive(aids, ops, item_total_likes, ACTION_WEIGHTS)

CPU times: user 22.9 s, sys: 890 ms, total: 23.8 s
Wall time: 24 s


In [7]:
%%time
simMatrices = {}   ## one similarity matrix per scoring mode, keyed by the mode name
TRIM_CYCLES = 1000   ## trim the full sim matrix every TRIM_CYCLES batches
SIM_TOPK = 150   ## ALERT: neighbours kept per item by every trim (periodic and final)
MODES_TO_TRAIN = ["iuf"] #, "time_decay"]

for mode in MODES_TO_TRAIN:
    ## the nested dict to store full sim matrix, {itemX: {itemY: score, itemZ: score, ...}}
    fullSimMatrix = nb.typed.Dict.empty(
            key_type = nb.types.int64,
            value_type = nb.typeof(nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)))
    max_idx = len(df)
    ## batch counter; it starts at 1 and is incremented before the check below,
    ## so the first trim happens once it reaches TRIM_CYCLES
    batch_idx = 1
    ## compute the sim matrix for PARALLEL rows per batch
    for idx in tqdm(range(0, max_idx, PARALLEL)):
        rows = df.iloc[idx: min(idx + PARALLEL, max_idx)][['session', 'start_idx', 'total_action', 'session_start_time']].values
        getSimScoreBatch(aids, ts, ops, rows, fullSimMatrix, ACTION_WEIGHTS, item_total_likes, mode=mode)
        batch_idx += 1
        if batch_idx % TRIM_CYCLES == 0:
            print("batch_idx: ", batch_idx)
            trim_simMatrix_topk(fullSimMatrix, SIM_TOPK)
            gc.collect()
#             break

    ## final trim to the top SIM_TOPK neighbours once training is complete
    ## TODO: make this num small enough to reduce time for normalization, consider keeping 100, give more option for selection
    trim_simMatrix_topk(fullSimMatrix, SIM_TOPK)
    ## max norm of each score
    maxNormSimMatrix(fullSimMatrix)

    simMatrices[mode] = fullSimMatrix

    ## drop the loop-local name; the matrix itself stays referenced by simMatrices
    del fullSimMatrix
    gc.collect()

  7%|▋         | 998/14231 [02:46<1:02:59,  3.50it/s]

batch_idx:  1000


 14%|█▍        | 1999/14231 [06:56<51:57:09, 15.29s/it]

batch_idx:  2000


 21%|██        | 2999/14231 [10:23<54:41:33, 17.53s/it]

batch_idx:  3000


 28%|██▊       | 3999/14231 [13:25<54:10:57, 19.06s/it]

batch_idx:  4000


 35%|███▌      | 4999/14231 [16:43<50:31:33, 19.70s/it]

batch_idx:  5000


 42%|████▏     | 5999/14231 [19:44<23:01:48, 10.07s/it]

batch_idx:  6000


 49%|████▉     | 6999/14231 [22:23<40:16:01, 20.04s/it]

batch_idx:  7000


 56%|█████▌    | 8000/14231 [25:04<26:27:37, 15.29s/it]

batch_idx:  8000


 63%|██████▎   | 8999/14231 [27:14<13:35:21,  9.35s/it]

batch_idx:  9000


 70%|███████   | 9999/14231 [29:21<22:15:39, 18.94s/it]

batch_idx:  10000


 77%|███████▋  | 10999/14231 [31:34<9:02:04, 10.06s/it] 

batch_idx:  11000


 84%|████████▍ | 11999/14231 [33:39<6:14:16, 10.06s/it]

batch_idx:  12000


 91%|█████████▏| 12999/14231 [35:45<1:43:47,  5.06s/it]

batch_idx:  13000


 98%|█████████▊| 14001/14231 [37:10<09:36,  2.51s/it]  

batch_idx:  14000


100%|██████████| 14231/14231 [37:17<00:00,  6.36it/s]


CPU times: user 1h 7min 34s, sys: 33min 22s, total: 1h 40min 56s
Wall time: 38min 33s


## Phase II: feature save

In [13]:
@nb.jit(nopython = True)
def heap_topk_return_list(item_cnt_dict, cap):
    """
    Return the keys of the `cap` highest-valued entries of `item_cnt_dict` as a list,
    ordered from the highest score to the lowest. Selection uses a min-heap of at most
    `cap` (score, item) pairs.
    """
    ## empty list whose element type (float64, int64) is still inferable by numba
    min_heap = [(np.float64(0), np.int64(0)) for _ in range(0)]
    for candidate, score in item_cnt_dict.items():
        heapq.heappush(min_heap, (score, candidate))   ## ordered by score first, item id second
        if len(min_heap) > cap:
            heapq.heappop(min_heap)   ## evict the current smallest pair

    ## popping yields ascending scores; keep only the item ids and flip to best-first
    ascending_items = [heapq.heappop(min_heap)[1] for _ in range(len(min_heap))]
    return ascending_items[::-1]

In [14]:
# 3s version
## Column names of the per-candidate feature tuple, in tuple order (49 entries).
## The rows below mirror the layout of the tuples built in save_feature_single_session_by_caching.
FEATURE_NAMES = [
    "prev_int", "seq_w_total", "time_w_total", "action_w_total", "session_len", "num_uniuqe_aids",
    "CF_score", "itemTotalLike", "ref_time", "max_sim_score", "mean_sim_score", "num_interact",
    "time_span", "action_recency", "seq_w_max", "seq_w_mean", "seq_w_min", "time_w_max",
    "time_w_mean", "time_w_min", "ops_w_max", "ops_w_mean", "ops_w_min", "num_clicks",
    "num_carts", "num_orders", "last_action_type", "time_to_now", "cf_incre_max", "cf_incre_mean",
    "cf_incre_min", "seqW_std", "timeW_std", "actionW_std", "cf_incre_std",
    "last3_cfIncre_max", "last3_cfIncre_mean", "last3_cfIncre_min",
    "last3_timeW_max", "last3_timeW_mean", "last3_timeW_min",
    "last3_seqW_max", "last3_seqW_mean", "last3_seqW_min",
    "last1_seq_order_raw", "raw_seq_order_max", "raw_seq_order_mean", "raw_seq_order_min",
    "last_op_ts",
]

## 5s version
# FEATURE_NAMES = ["prev_int", "seq_w_total", "time_w_total", "action_w_total", "session_len", "num_uniuqe_aids", "CF_score", "itemTotalLike", "ref_time", "max_sim_score",\
#     "mean_sim_score", "num_interact", "time_span", "action_recency", "seq_w_max", "seq_w_mean", "seq_w_min", "time_w_max", "time_w_mean", "time_w_min", \
#         "ops_w_max", "ops_w_mean", "ops_w_min", "num_clicks", "num_carts", "num_orders", "last_action_type", "time_to_now", "cf_incre_max", "cf_incre_mean", \
#             "cf_incre_min", "seqW_std", "timeW_std", "actionW_std", "cf_incre_std", "last5_cfIncre_max", "last5_cfIncre_mean", "last5_cfIncre_min", \
#                 "last5_timeW_max", "last5_timeW_mean", "last5_timeW_min", "last5_seqW_max", "last5_seqW_mean", "last5_seqW_min", \
#                     "last1_seq_order_raw", "raw_seq_order_max", "raw_seq_order_mean", "raw_seq_order_min", "last_op_ts"]
                    
                    #"day_of_week", "hour_of_day", "day_noon_night"]

In [15]:
@nb.jit(nopython=True)
def save_feature_single_session_by_caching(session, starting_idx, length, start_time, aids, ops, ts, result, full_sim_matrix, item_total_likes, test_ops_weights):
    """
    Select the candidate items of one session and build one feature tuple per candidate.

    session: key under which the selected candidates are stored in `result`.
    starting_idx, length: offset and number of the session's actions inside aids / ops / ts.
    start_time: session start, used for time_span and the time weights.
    aids, ops, ts: flat per-action arrays (item id, action type, timestamp).
    result: dict {session: array of candidate aids}; result[session] is written here with
        up to 150 candidates ordered by descending score.
    full_sim_matrix: nested dict {aid: {similar aid: score}}.
    item_total_likes: dict {aid: total weight}; must contain every candidate.
    test_ops_weights: array indexed by action type.

    Returns a list with one 49-element feature tuple per aid of result[session], in the
    same order.

    The session is walked from its latest action to its earliest one. Sessions with at
    least 20 unique aids only score their own items; shorter ones multiply their own
    items by PREV_INTERACT_BONUS and additionally score the sim-matrix neighbours of
    every item that are not part of the session themselves.
    """

    NOW_TIME = ts[-1] ## ts of the last element of the whole ts array
    PREV_INTERACT_BONUS = 20
    NEARBY_ACTION_BONUS = 1.5
    
    ending_idx = starting_idx + length 
    end_time = ts[ending_idx - 1]
    time_span = end_time - start_time
    
    ## session actions, latest first
    candidates = aids[starting_idx: ending_idx][::-1]
    candidates_ops = ops[starting_idx: ending_idx][::-1]
    
    ## record all potential aid that might be relevant
    potential_to_recommend = nb.typed.Dict.empty(key_type=nb.types.int64, value_type=nb.types.float64)
    
    ## get unique aid of each session (only the keys matter)
    unique_aids = nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)
    for a in candidates:
        unique_aids[a] = 0
    
    ## Sequence weight to all the candidates, from near to far 
    sequence_weight = np.power(2, np.linspace(0.3, 1, len(candidates)))[::-1] - 1

    ## 1 for the latest action, 2 for the one before it, ...
    raw_sequence = np.arange(1, len(candidates) + 1)
    
    ## Time weight of all candidates, from near to far
    time_weights = []
    time_lapse = end_time - start_time + 1  ## +1 to avoid zero
    for idx in range(starting_idx, ending_idx):
        if end_time - ts[idx] < 2 * 60 * 60:   ## apply nearby action bonus
            time_weight = (1 + 0.5 ** ((end_time - ts[idx])/time_lapse)) * NEARBY_ACTION_BONUS
        else:
            time_weight = 1 + 0.5 ** ((end_time - ts[idx])/time_lapse)
        time_weights.append(time_weight)
    time_weights = time_weights[::-1]


    ## initiate the caches for the features; every cache maps aid -> float64 array with one
    ## entry per contribution, and the empty arrays only fix the value type
    visit_flag = nb.typed.Dict.empty(key_type=nb.types.int64, value_type=nb.types.boolean) ## indicate if an aid be visited
    ts_cache = nb.typed.Dict.empty(key_type = nb.types.int64, value_type=np.array([np.float64(0.0) for _ in range(0)]))
    ops_cache = nb.typed.Dict.empty(key_type = nb.types.int64, value_type=np.array([np.float64(0.0) for _ in range(0)]))    #value_type = nb.types.float64[:])
    simScore_cache = nb.typed.Dict.empty(key_type = nb.types.int64, value_type=np.array([np.float64(0.0) for _ in range(0)]))
    cfIncre_cache = nb.typed.Dict.empty(key_type = nb.types.int64, value_type=np.array([np.float64(0.0) for _ in range(0)]))
    seqW_cache = nb.typed.Dict.empty(key_type = nb.types.int64, value_type=np.array([np.float64(0.0) for _ in range(0)]))
    timeW_cache = nb.typed.Dict.empty(key_type = nb.types.int64, value_type=np.array([np.float64(0.0) for _ in range(0)]))
    actionW_cache = nb.typed.Dict.empty(key_type = nb.types.int64, value_type=np.array([np.float64(0.0) for _ in range(0)]))
    raw_seqOrder_cache = nb.typed.Dict.empty(key_type = nb.types.int64, value_type=np.array([np.float64(0.0) for _ in range(0)]))

    ## index into ts of the action currently visited; it must go down by exactly one per
    ## visited action, because the loops below walk the session backwards
    helper_idx = ending_idx - 1
    ## making inference
    if len(unique_aids) >= 20:  
        for aid, op, seq_w, raw_seq_order, time_w in zip(candidates, candidates_ops, sequence_weight, raw_sequence, time_weights):
            if aid not in potential_to_recommend:
                potential_to_recommend[aid] = 0
                ## init all cache obj
                visit_flag[aid] = 1
                ts_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                ops_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                simScore_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                cfIncre_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                seqW_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                timeW_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                actionW_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                raw_seqOrder_cache[aid] = np.array([np.float64(0) for _ in range(0)])
            ## caculate scores
            cf_incre = seq_w * time_w * test_ops_weights[op]
            potential_to_recommend[aid] += cf_incre #* PREV_INTERACT_BONUS
            ## append features
            ts_cache[aid] = np.append(ts_cache[aid], ts[helper_idx])
            ops_cache[aid] = np.append(ops_cache[aid], op)
            simScore_cache[aid] = np.append(simScore_cache[aid], 1)
            cfIncre_cache[aid] = np.append(cfIncre_cache[aid], cf_incre)
            seqW_cache[aid] = np.append(seqW_cache[aid], seq_w)
            timeW_cache[aid] = np.append(timeW_cache[aid], time_w)
            actionW_cache[aid] = np.append(actionW_cache[aid], test_ops_weights[op])
            raw_seqOrder_cache[aid] = np.append(raw_seqOrder_cache[aid], raw_seq_order)
            
            
            helper_idx -= 1
    else:   ## otherwise, fill the rest with similar items.
        for aid, op, seq_w, raw_seq_order, time_w in zip(candidates, candidates_ops, sequence_weight, raw_sequence, time_weights):
            if aid not in potential_to_recommend:
                potential_to_recommend[aid] = 0
                ## init all cache obj
                visit_flag[aid] = 1
                ts_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                ops_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                simScore_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                cfIncre_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                seqW_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                timeW_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                actionW_cache[aid] = np.array([np.float64(0) for _ in range(0)])
                raw_seqOrder_cache[aid] = np.array([np.float64(0) for _ in range(0)])
            ## get the scores
            cf_incre = seq_w * time_w * test_ops_weights[op] * PREV_INTERACT_BONUS
            potential_to_recommend[aid] += cf_incre
            ## append features
            ts_cache[aid] = np.append(ts_cache[aid], ts[helper_idx])
            ops_cache[aid] = np.append(ops_cache[aid], op)
            simScore_cache[aid] = np.append(simScore_cache[aid], 1)
            cfIncre_cache[aid] = np.append(cfIncre_cache[aid], cf_incre)
            seqW_cache[aid] = np.append(seqW_cache[aid], seq_w)
            timeW_cache[aid] = np.append(timeW_cache[aid], time_w)
            actionW_cache[aid] = np.append(actionW_cache[aid], test_ops_weights[op])
            raw_seqOrder_cache[aid] = np.append(raw_seqOrder_cache[aid], raw_seq_order)
            ## adding the similar items, if full_sim_matrix don't have such record, skip. 
            if aid not in full_sim_matrix:
                ## the action is still consumed: keep helper_idx aligned with the loop
                ## position, otherwise every later ts lookup is shifted by one
                helper_idx -= 1
                continue
            sim_row = full_sim_matrix[aid]
            for similar_item, sim_score in sim_row.items():
                ## if sim_item is in candidates, would be included above anyways, skip 
                ## (unique_aids holds exactly the session's aids, so this is a hash lookup
                ## instead of a scan over the candidates array)
                if similar_item in unique_aids:
                    continue
                if similar_item not in potential_to_recommend:
                    potential_to_recommend[similar_item] = 0
                    ## init all cache obj
                    visit_flag[similar_item] = 0
                    ts_cache[similar_item] = np.array([np.float64(0) for _ in range(0)])
                    ops_cache[similar_item] = np.array([np.float64(0) for _ in range(0)])
                    simScore_cache[similar_item] = np.array([np.float64(0) for _ in range(0)])
                    cfIncre_cache[similar_item] = np.array([np.float64(0) for _ in range(0)])
                    seqW_cache[similar_item] = np.array([np.float64(0) for _ in range(0)])
                    timeW_cache[similar_item] = np.array([np.float64(0) for _ in range(0)])
                    actionW_cache[similar_item] = np.array([np.float64(0) for _ in range(0)])
                    raw_seqOrder_cache[similar_item] = np.array([np.float64(0) for _ in range(0)])
                
                cf_incre = seq_w * time_w * test_ops_weights[op] * sim_score
                potential_to_recommend[similar_item] += cf_incre  ## no PREV_INTERACT_BONUS as expected, replaced with sim_matrix scores
                ## append features
                ts_cache[similar_item] = np.append(ts_cache[similar_item], ts[helper_idx])
                ops_cache[similar_item] = np.append(ops_cache[similar_item], op)
                simScore_cache[similar_item] = np.append(simScore_cache[similar_item], sim_score)
                cfIncre_cache[similar_item] = np.append(cfIncre_cache[similar_item], cf_incre)
                seqW_cache[similar_item] = np.append(seqW_cache[similar_item], seq_w)
                timeW_cache[similar_item] = np.append(timeW_cache[similar_item], time_w)
                actionW_cache[similar_item] = np.append(actionW_cache[similar_item], test_ops_weights[op])
                raw_seqOrder_cache[similar_item] = np.append(raw_seqOrder_cache[similar_item], raw_seq_order)
                
            helper_idx -= 1

    result[session] = np.array(heap_topk_return_list(potential_to_recommend, 150)) ## ALERT ## Take top 100 for validation runs. 
    
    feature_tuples_this_session = []
    for aid in result[session]:
        # action_types_temp, counts = np.unique(ops_cache[aid], return_counts=True)
        num_clicks, num_carts, num_orders = 0, 0, 0
        for op in ops_cache[aid]:
            if op == 0:
                num_clicks += 1
            elif op == 1:
                num_carts += 1
            elif op == 2:
                num_orders += 1

        ## Both tuples follow the order of FEATURE_NAMES and differ only in positions 9-12
        ## (ref_time, max_sim_score, mean_sim_score, num_interact). Index 0 of every cache is
        ## the contribution of the latest action, because the session is walked backwards.
        if visit_flag[aid]:   ## aid is part of the session itself
            feature_tuple_this_aid = (
                visit_flag[aid], np.sum(seqW_cache[aid]), np.sum(timeW_cache[aid]), np.sum(actionW_cache[aid]), length, len(unique_aids),
                potential_to_recommend[aid], item_total_likes[aid], 100, 1, 1, len(raw_seqOrder_cache[aid]),
                time_span, end_time-ts_cache[aid][0], np.max(seqW_cache[aid]), np.mean(seqW_cache[aid]), np.min(seqW_cache[aid]), np.max(timeW_cache[aid]),
                np.mean(timeW_cache[aid]), np.min(timeW_cache[aid]), np.max(actionW_cache[aid]), np.mean(actionW_cache[aid]), np.min(actionW_cache[aid]), num_clicks,
                num_carts, num_orders, ops_cache[aid][0], NOW_TIME-ts_cache[aid][0], np.max(cfIncre_cache[aid]), np.mean(cfIncre_cache[aid]),
                np.min(cfIncre_cache[aid]), np.std(seqW_cache[aid]), np.std(timeW_cache[aid]), np.std(actionW_cache[aid]), np.std(cfIncre_cache[aid]),
                    np.max(cfIncre_cache[aid][: min(3, len(cfIncre_cache[aid]))]), np.mean(cfIncre_cache[aid][: min(3, len(cfIncre_cache[aid]))]), np.min(cfIncre_cache[aid][: min(3, len(cfIncre_cache[aid]))]), 
                    np.max(timeW_cache[aid][: min(3, len(timeW_cache[aid]))]), np.mean(timeW_cache[aid][: min(3, len(timeW_cache[aid]))]), np.min(timeW_cache[aid][: min(3, len(timeW_cache[aid]))]),
                    np.max(seqW_cache[aid][: min(3, len(seqW_cache[aid]))]), np.mean(seqW_cache[aid][: min(3, len(seqW_cache[aid]))]), np.min(seqW_cache[aid][: min(3, len(seqW_cache[aid]))]),
                raw_seqOrder_cache[aid][0], np.max(raw_seqOrder_cache[aid]), np.mean(raw_seqOrder_cache[aid]), np.min(raw_seqOrder_cache[aid]),
                ts_cache[aid][0]
            )
        else:   ## aid was only reached through the sim matrix
            feature_tuple_this_aid = (
                visit_flag[aid], np.sum(seqW_cache[aid]), np.sum(timeW_cache[aid]), np.sum(actionW_cache[aid]), length, len(unique_aids),
                potential_to_recommend[aid], item_total_likes[aid], len(raw_seqOrder_cache[aid]), np.max(simScore_cache[aid]), np.mean(simScore_cache[aid]), 0,
                time_span, end_time-ts_cache[aid][0], np.max(seqW_cache[aid]), np.mean(seqW_cache[aid]), np.min(seqW_cache[aid]), np.max(timeW_cache[aid]),
                np.mean(timeW_cache[aid]), np.min(timeW_cache[aid]), np.max(actionW_cache[aid]), np.mean(actionW_cache[aid]), np.min(actionW_cache[aid]), num_clicks,
                num_carts, num_orders, ops_cache[aid][0], NOW_TIME-ts_cache[aid][0], np.max(cfIncre_cache[aid]), np.mean(cfIncre_cache[aid]),
                np.min(cfIncre_cache[aid]), np.std(seqW_cache[aid]), np.std(timeW_cache[aid]), np.std(actionW_cache[aid]), np.std(cfIncre_cache[aid]),
                    np.max(cfIncre_cache[aid][: min(3, len(cfIncre_cache[aid]))]), np.mean(cfIncre_cache[aid][: min(3, len(cfIncre_cache[aid]))]), np.min(cfIncre_cache[aid][: min(3, len(cfIncre_cache[aid]))]), 
                    np.max(timeW_cache[aid][: min(3, len(timeW_cache[aid]))]), np.mean(timeW_cache[aid][: min(3, len(timeW_cache[aid]))]), np.min(timeW_cache[aid][: min(3, len(timeW_cache[aid]))]),
                    np.max(seqW_cache[aid][: min(3, len(seqW_cache[aid]))]), np.mean(seqW_cache[aid][: min(3, len(seqW_cache[aid]))]), np.min(seqW_cache[aid][: min(3, len(seqW_cache[aid]))]),
                raw_seqOrder_cache[aid][0], np.max(raw_seqOrder_cache[aid]), np.mean(raw_seqOrder_cache[aid]), np.min(raw_seqOrder_cache[aid]),
                ts_cache[aid][0]
            )
            
        feature_tuples_this_session.append(feature_tuple_this_aid)
    
    return feature_tuples_this_session

In [16]:
# def load_gt_tables(type):
#     """ type -> carts / orders """
#     gt_labels = pd.read_json("/kaggle/input/local-validation7days-test-labels/test_labels.jsonl", lines=True)
#     gt_labels['aids'] = gt_labels["labels"].apply(lambda x: x.get(type))
#     gt_labels = gt_labels[gt_labels.aids.notnull()]
#     gt_labels = gt_labels.drop("labels", axis = 1)
#     ## ========= special df to identify the unique session id to look at ================
#     valid_gt_sessions = gt_labels.drop("aids", axis = 1) 
#     ## ========================================================================
#     ## keep go on for gt labels processing
#     gt_labels = gt_labels.set_index(['session']).apply(pd.Series.explode).reset_index()
#     gt_labels["gt"] = 1
#     return valid_gt_sessions, gt_labels

"""
This differs from the validation phase: no ground-truth data is available here, so every candidate row has to be exploded and saved.
"""
def process_batch_pipeline_subVersion(rawDf, feature_names=None):
    """
    Flatten one batch of per-session results into one row per (session, aid).

    rawDf         -> Df with columns session, aids, feature_tuple. Within a row, aids and
                     feature_tuple must be list-likes of equal length (the original note
                     mentions 100 aids per session); every element of feature_tuple is
                     one tuple holding the features of the aid at the same position.
    feature_names -> column names for the unpacked features, one per tuple position.
                     When None, the notebook-level FEATURE_NAMES is used, which is what
                     this function always did before the parameter existed.

    Returns a Df with session, aids and one column per feature name; the packed
    feature_tuple column is dropped.
    Raises ValueError when the tuple width does not match the number of feature names.
    """
    if feature_names is None:
        feature_names = FEATURE_NAMES
    feature_names = [f'{feat_name}' for feat_name in feature_names]

    ## Nothing to explode: np.vstack raises on an empty sequence, so hand back an
    ## empty frame that already has the final column layout.
    if len(rawDf) == 0:
        kept_columns = [col for col in rawDf.columns if col != "feature_tuple"]
        return pd.DataFrame(columns=kept_columns + feature_names)

    ## Directly explode aids and feature_tuple (both columns are exploded in lock-step,
    ## so the i-th aid of a session stays next to its i-th feature tuple)
    final_df = rawDf.set_index(['session']).apply(pd.Series.explode).reset_index()

    ## Stack the per-row tuples into one 2-D array and turn it into named columns
    features = np.vstack(final_df["feature_tuple"].values)
    if features.shape[1] != len(feature_names):
        raise ValueError(
            f"feature tuples have {features.shape[1]} values but {len(feature_names)} feature names were given"
        )
    temp_df = pd.DataFrame(features, columns=feature_names)
    del features   ## release the stacked array as soon as it is wrapped
    ## both frames carry a fresh 0..n-1 index here, so the assignment lines up row by row
    final_df[temp_df.columns] = temp_df
    del temp_df

    final_df = final_df.drop("feature_tuple", axis = 1)

    return final_df

## CARTS — comment this cell out when running ORDERS

In [None]:
# %%time
# result_iuf_carts = nb.typed.Dict.empty(
#     key_type = nb.types.int64,
#     value_type = nb.types.int64[:])

# features_all_sessions = [] # session, aid, feature tuple
#                           ## session, aid, feature tuple
# gc.collect()

# ## Given there are 1671803 sessions in total, we separate them into K batches
# K = 64
# # batch_size = 1671803 // K  ##  -> 139316 dealing with around 140k sessions per batch
# session_per_batch = len(df_test) // K 

# row_idx_cutoffs = [(len(df) - len(df_test)) + (PARALLEL * (session_per_batch//PARALLEL) ) * i for i in range(1, K+3)]   ## batch process every 1024 * 136 rows
# # 145 -> 148644 // 1024  -> batch_size // PARALLEL

# feature_batch_id = 0

# print("feature store starts:")


# for row_idx in tqdm(range(len(df) - len(df_test), len(df), PARALLEL)):
#     start_row = row_idx
#     end_row = min(row_idx + PARALLEL, len(df))
#     rows = df.iloc[start_row: end_row][['session', 'start_idx', 'total_action', 'session_start_time']].values
# #     save_features_parallel(rows, aids, ops, ts, result_iuf_orders, simMatrices["iuf"], np.array([2.0, 8.0, 6.0]), orders_features_save)  
#     ## run things in parallel
#     for row_idx in nb.prange(len(rows)):
#         session, starting_idx, length, start_time = rows[row_idx]
#         features_tuples_this_session = save_feature_single_session_by_caching(session, starting_idx, length, start_time, aids, ops, ts, result_iuf_carts, simMatrices["iuf"], item_total_likes, np.array([4.0, 2.0, 5.0]))
#         features_all_sessions.append(features_tuples_this_session)
    
#     if (start_row in row_idx_cutoffs) or (end_row == len(df)):
#         ## save batch result
#         rawDf = pd.DataFrame({"session": result_iuf_carts.keys(), "aids": result_iuf_carts.values(), "feature_tuple": features_all_sessions})
#         batch_result = process_batch_pipeline_subVersion(rawDf)
#         batch_result.to_parquet(f"/kaggle/working/batch_result_{feature_batch_id}.parquet")
#         ## clean the memory for next batch
#         del batch_result, rawDf, features_all_sessions, result_iuf_carts
#         gc.collect()
#         ## progress update
#         print(f"feature_batch_{feature_batch_id} completes saving.")
#         feature_batch_id += 1
#         ## initiate the struct for new batch again
#         result_iuf_carts = nb.typed.Dict.empty(
#             key_type = nb.types.int64,
#             value_type = nb.types.int64[:])
#         features_all_sessions = []

## ORDERS — comment this cell out when running CARTS

In [17]:
%%time
result_iuf_orders = nb.typed.Dict.empty(
    key_type = nb.types.int64,
    value_type = nb.types.int64[:])

features_all_sessions = [] # session, aid, feature tuple
                          ## session, aid, feature tuple
gc.collect()

## Given there are 1671803 sessions in total, we separate them into K batches
K = 48
# batch_size = 1671803 // K  ##  -> 139316 dealing with around 140k sessions per batch
session_per_batch = len(df_test) // K 

row_idx_cutoffs = [(len(df) - len(df_test)) + (PARALLEL * (session_per_batch//PARALLEL) ) * i for i in range(1, K+3)]   ## batch process every 1024 * 136 rows
# 145 -> 148644 // 1024  -> batch_size // PARALLEL

feature_batch_id = 0

print("feature store starts:")


for row_idx in tqdm(range(len(df) - len(df_test), len(df), PARALLEL)):
    start_row = row_idx
    end_row = min(row_idx + PARALLEL, len(df))
    rows = df.iloc[start_row: end_row][['session', 'start_idx', 'total_action', 'session_start_time']].values
#     save_features_parallel(rows, aids, ops, ts, result_iuf_orders, simMatrices["iuf"], np.array([2.0, 8.0, 6.0]), orders_features_save)  
    ## run things in parallel
    for row_idx in nb.prange(len(rows)):
        session, starting_idx, length, start_time = rows[row_idx]
        features_tuples_this_session = save_feature_single_session_by_caching(session, starting_idx, length, start_time, aids, ops, ts, result_iuf_orders, simMatrices["iuf"], item_total_likes, np.array([2.0, 6.0, 6.0]))
        features_all_sessions.append(features_tuples_this_session)
    
    if (start_row in row_idx_cutoffs) or (end_row == len(df)):
        ## save batch result
        rawDf = pd.DataFrame({"session": result_iuf_orders.keys(), "aids": result_iuf_orders.values(), "feature_tuple": features_all_sessions})
        batch_result = process_batch_pipeline_subVersion(rawDf)
        batch_result.to_parquet(f"../../allData/submission_phase_data/features_kaggle_eval_set/V2/order_features_last7day/batch_result_{feature_batch_id}.parquet")
        ## clean the memory for next batch
        del batch_result, rawDf, features_all_sessions, result_iuf_orders
        gc.collect()
        ## progress update
        print(f"feature_batch_{feature_batch_id} completes saving.")
        feature_batch_id += 1
        ## initiate the struct for new batch again
        result_iuf_orders = nb.typed.Dict.empty(
            key_type = nb.types.int64,
            value_type = nb.types.int64[:])
        features_all_sessions = []

  0%|          | 0/1633 [00:00<?, ?it/s]

feature store starts:


  visit_flag[aid] = 1
  2%|▏         | 35/1633 [03:59<16:10:38, 36.44s/it]

feature_batch_0 completes saving.


  4%|▍         | 69/1633 [07:44<16:05:04, 37.02s/it]

feature_batch_1 completes saving.


  6%|▋         | 103/1633 [11:22<15:10:20, 35.70s/it]

feature_batch_2 completes saving.


  8%|▊         | 137/1633 [15:05<15:26:39, 37.17s/it]

feature_batch_3 completes saving.


 10%|█         | 171/1633 [18:42<14:51:09, 36.57s/it]

feature_batch_4 completes saving.


 13%|█▎        | 205/1633 [22:19<14:45:04, 37.19s/it]

feature_batch_5 completes saving.


 15%|█▍        | 239/1633 [25:57<14:15:49, 36.84s/it]

feature_batch_6 completes saving.


 17%|█▋        | 273/1633 [29:30<13:44:31, 36.38s/it]

feature_batch_7 completes saving.


 19%|█▉        | 307/1633 [33:13<13:46:25, 37.39s/it]

feature_batch_8 completes saving.


 21%|██        | 341/1633 [36:54<13:28:38, 37.55s/it]

feature_batch_9 completes saving.


 23%|██▎       | 375/1633 [40:30<12:50:03, 36.73s/it]

feature_batch_10 completes saving.


 25%|██▌       | 409/1633 [44:16<13:13:21, 38.89s/it]

feature_batch_11 completes saving.


 27%|██▋       | 443/1633 [47:53<11:54:52, 36.04s/it]

feature_batch_12 completes saving.


 29%|██▉       | 477/1633 [51:29<11:50:00, 36.85s/it]

feature_batch_13 completes saving.


 31%|███▏      | 511/1633 [55:02<11:18:35, 36.29s/it]

feature_batch_14 completes saving.


 33%|███▎      | 545/1633 [58:44<11:59:30, 39.68s/it]

feature_batch_15 completes saving.


 35%|███▌      | 579/1633 [1:02:37<11:30:15, 39.29s/it]

feature_batch_16 completes saving.


 38%|███▊      | 613/1633 [1:06:33<11:23:56, 40.23s/it]

feature_batch_17 completes saving.


 40%|███▉      | 647/1633 [1:10:26<10:54:45, 39.84s/it]

feature_batch_18 completes saving.


 42%|████▏     | 681/1633 [1:14:07<9:50:21, 37.21s/it] 

feature_batch_19 completes saving.


 44%|████▍     | 715/1633 [1:17:51<9:38:54, 37.84s/it]

feature_batch_20 completes saving.


 46%|████▌     | 749/1633 [1:21:37<9:31:56, 38.82s/it]

feature_batch_21 completes saving.


 48%|████▊     | 783/1633 [1:25:20<9:03:47, 38.39s/it]

feature_batch_22 completes saving.


 50%|█████     | 817/1633 [1:29:10<8:53:26, 39.22s/it]

feature_batch_23 completes saving.


 52%|█████▏    | 851/1633 [1:33:10<8:45:10, 40.29s/it]

feature_batch_24 completes saving.


 54%|█████▍    | 885/1633 [1:36:59<8:01:38, 38.63s/it]

feature_batch_25 completes saving.


 56%|█████▋    | 919/1633 [1:40:39<7:21:59, 37.14s/it]

feature_batch_26 completes saving.


 58%|█████▊    | 953/1633 [1:44:23<7:10:15, 37.96s/it]

feature_batch_27 completes saving.


 60%|██████    | 987/1633 [1:48:04<6:50:30, 38.13s/it]

feature_batch_28 completes saving.


 63%|██████▎   | 1021/1633 [1:51:50<6:35:28, 38.77s/it]

feature_batch_29 completes saving.


 65%|██████▍   | 1055/1633 [1:55:44<6:21:39, 39.62s/it]

feature_batch_30 completes saving.


 67%|██████▋   | 1089/1633 [1:59:30<5:51:17, 38.75s/it]

feature_batch_31 completes saving.


 69%|██████▉   | 1123/1633 [2:03:13<5:22:17, 37.92s/it]

feature_batch_32 completes saving.


 71%|███████   | 1157/1633 [2:07:01<5:12:22, 39.37s/it]

feature_batch_33 completes saving.


 73%|███████▎  | 1191/1633 [2:10:46<4:45:25, 38.75s/it]

feature_batch_34 completes saving.


 75%|███████▌  | 1225/1633 [2:14:33<4:21:44, 38.49s/it]

feature_batch_35 completes saving.


 77%|███████▋  | 1259/1633 [2:18:17<3:56:35, 37.96s/it]

feature_batch_36 completes saving.


 79%|███████▉  | 1293/1633 [2:21:56<3:33:11, 37.62s/it]

feature_batch_37 completes saving.


 81%|████████▏ | 1327/1633 [2:25:35<3:13:39, 37.97s/it]

feature_batch_38 completes saving.


 83%|████████▎ | 1361/1633 [2:29:19<2:55:21, 38.68s/it]

feature_batch_39 completes saving.


 85%|████████▌ | 1395/1633 [2:32:57<2:27:53, 37.28s/it]

feature_batch_40 completes saving.


 88%|████████▊ | 1429/1633 [2:36:49<2:16:04, 40.02s/it]

feature_batch_41 completes saving.


 90%|████████▉ | 1463/1633 [2:40:43<1:53:25, 40.03s/it]

feature_batch_42 completes saving.


 92%|█████████▏| 1497/1633 [2:44:34<1:30:53, 40.10s/it]

feature_batch_43 completes saving.


 94%|█████████▍| 1531/1633 [2:48:20<1:07:28, 39.69s/it]

feature_batch_44 completes saving.


 96%|█████████▌| 1565/1633 [2:52:04<44:20, 39.12s/it]  

feature_batch_45 completes saving.


 98%|█████████▊| 1599/1633 [2:55:52<22:48, 40.24s/it]

feature_batch_46 completes saving.


100%|██████████| 1633/1633 [2:59:31<00:00,  6.60s/it]

feature_batch_47 completes saving.
CPU times: user 1h 46min 53s, sys: 48min 44s, total: 2h 35min 38s
Wall time: 2h 59min 31s



