## Item Collaborative Filtering Pipeline

In [3]:
import os
import gc
import heapq
import pickle
import numba as nb
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import math


In [4]:
%%time
df = pd.read_csv("../../allData/validationData/train_meta_data.csv")
df_test = pd.read_csv("../../allData/validationData/test_meta_data.csv")
df = pd.concat([df, df_test]).reset_index(drop = True)
npz = np.load("../../allData/validationData/train_core_data.npz")
npz_test = np.load("../../allData/validationData/test_core_data.npz")
aids = np.concatenate([npz['aids'], npz_test['aids']])
ts = np.concatenate([npz['ts'], npz_test['ts']])
ops = np.concatenate([npz['ops'], npz_test['ops']])

df["start_idx"] = df['total_action'].cumsum().shift(1).fillna(0).astype(int)
df["end_time"] = ts[df["start_idx"] + df["total_action"] - 1]

CPU times: user 6.7 s, sys: 2.52 s, total: 9.21 s
Wall time: 9.44 s


In [4]:
df

Unnamed: 0,session,total_action,session_start_time,session_end_time,start_idx,end_time
0,0,147,1659304800,1661103727,0,1661103727
1,1,27,1659304800,1660857067,147,1660857067
2,2,13,1659304800,1660577379,174,1660577379
3,3,226,1659304800,1661109666,187,1661109666
4,4,3,1659304800,1659304900,413,1659304900
...,...,...,...,...,...,...
12368249,12899774,1,1661723968,1661723968,171022132,1661723968
12368250,12899775,1,1661723970,1661723970,171022133,1661723970
12368251,12899776,1,1661723972,1661723972,171022134,1661723972
12368252,12899777,1,1661723976,1661723976,171022135,1661723976


## 1. Training -- Derive ItemCF similarity Matrix

#### CONSTANTS

In [5]:
## Define constants
PARALLEL = 1024
LOOKBACK_WINDOW = 100   ## only fit the latest LOOKBACK_WINDOW to train the sim matrix
#TOPN = 20
ACTION_WEIGHTS = np.array([1.0, 6.0, 3.0])

#### Section A: Utils Functions 
1. Count Item Total likes: The similary score will be normalized by "Item Total Like Scores". In theory, popular items should have less weight in simiarity score.
2. Trimming function: Helpful managing memoery usage. 
3. Method for normalization: Mostly item total like normalization, and max norm(make all sim score between 0 and 1) of the score. 

In [6]:
# ==================================
# Methods for counting Item Total Likes
# ==================================
@nb.jit(nopython=True)
def getItemTotalLikesNaive(aids, ops, item_total_likes, action_weights):
    """
    Stores the total like score of itemXXX in item_total_likes, based on action_weights parameter. np.array([X, Y, Z])
    """
    for idx, item in enumerate(aids):
        if item not in item_total_likes: 
            item_total_likes[item] = 0
        item_total_likes[item] += action_weights[ops[idx]]   ## TODO: For time decay, consider replace with 1, for iuf keep this. 

# ==================================
# Methods for rank and trim the sim score dict
# ==================================
@nb.jit(nopython = True)
def heap_topk(item_cnt_dict, cap):
    """
    get the top cap(k) elements of the cnt dict based on value, using a min-heap structure
    """
    dic = nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)
    q = [(np.float64(0), np.int64(0)) for _ in range(0)]  ## generate empty queue to implement a heap, 
    for item_ref, sim_score in item_cnt_dict.items():   ## read in the dict in heap structure
        heapq.heappush(q, (sim_score, item_ref))   ## push the <sim_score, item_ref_id> pair into min-heap, using sim_score for order
        if len(q) > cap:
            heapq.heappop(q)
            
    res = [heapq.heappop(q) for _ in range(len(q))][::-1]
    for i in range(len(res)):
        dic[res[i][1]] = res[i][0]
    
    return dic
   
@nb.jit(nopython = True)
def trim_simMatrix_topk(fullSimMatrix, k = 50):
    """
    trim top k items of each "itemX: {itemY: score1, ...}" pair in fullSimMatrix based on sim scores. 
    """
    for item, item_cnt_dict in fullSimMatrix.items():
        fullSimMatrix[item] = heap_topk(item_cnt_dict, k)

# ==================================
# Methods for score normalization
# ==================================

# @nb.jit(nopython=True)
# def itemTotalLikeNorm(fullSimMatrix, item_total_likes):
#     for aid_1, relations in fullSimMatrix.items():
#         for aid_2, sim_score in relations.items():
#             fullSimMatrix[aid_1][aid_2] = sim_score / (item_total_likes[aid_1] * item_total_likes[aid_2]) ** 0.1  ## TODO: consider 0.1 or other small number
            
@nb.jit(nopython=True)
def maxNormSimMatrix(fullSimMatrix):
    for aid_1, relations in fullSimMatrix.items():
        max_num = -np.inf
        for _, sim_score in relations.items():
            if sim_score > max_num:
                max_num = sim_score
        ## DEGUG use, delete later
        if max_num == 0:
            print(aid_1)
            print(fullSimMatrix[aid_1])
        for aid_2, sim_score in relations.items():
#             if max_num == 0:
#                 max_num += 0.001
            fullSimMatrix[aid_1][aid_2] = sim_score / max_num

#### Section B: Sim Score Computation functions

In [7]:

@nb.jit(nopython=True)
def getSimScoresSingleRow(pairs_this_row, start_time, start_idx, length, aids, ts, ops, item_total_likes, action_weights, mode):
    """
    Get the sim scores of items within single session, can be ran in parallel within each batch. 
    """
    max_idx = start_idx + length
    min_idx = max(max_idx - LOOKBACK_WINDOW, start_idx)  
    for i in range(min_idx, max_idx):
        for j in range(i+1, max_idx):
            if ts[j] - ts[i] > 2 * 60 * 60: continue  #TODO: try 2h only
            if aids[i] == aids[j]: continue
            
            if mode == "cosine":
                w_ij = action_weights[ops[j]] 
                w_ji = action_weights[ops[i]] 
            elif mode == "iuf":  ## penalize users that had lots of actions TODO: consider location weight
                if ts[max_idx] - start_time > 24 * 60 * 60:
                    special_factor = 0.8
                else:
                    special_factor = 1.0
                
                loc_weight = 0.5**(abs(i-j))   #math.exp(-0.02 * abs(i-j)) 
                time_gap_weight = 0.5 ** (abs(ts[i]-ts[j]) / (1.5*60*60))  
                w_ij = special_factor * action_weights[ops[j]] * time_gap_weight * loc_weight / math.log1p(length)
                w_ji = special_factor * action_weights[ops[i]] * time_gap_weight * loc_weight / math.log1p(length)
            elif mode == "time_decay":
                ## calculate some time weights of each item, more weights are given when ts is later. #TODO: try adding (i-j) location weight, exponential weight, 0.5 ** (abs(i-j + 1)), 
                loc_weight = 0.5**(abs(i-j))   #math.exp(-0.02 * abs(i-j)) 
                #time_i = 1 + 0.1 ** ((1662328791-ts[i])/(1662328791-1659304800)) #1 + 3 * (ts[i] + start_time - 1659304800) / (1662328791 - 1659304800) #  #(1 - 0.8 *(TEST_END_TS - ts[i]) / TIME_SPAN) ** 0.5 # 0.2~1 #   ## time decay weight for item i 
                #time_j = 1 + 0.1 ** ((1662328791-ts[j])/(1662328791-1659304800))  # 1 + 3 * (ts[j] + start_time - 1659304800) / (1662328791 - 1659304800) # #  #(1 - 0.8 *(TEST_END_TS - ts[j]) / TIME_SPAN) ** 0.5   # 
                time_i = 1 + 1/(1 + math.exp(10*( ((1662328791-ts[i])/(1662328791-1659304800)) - 0.6  )))
                time_j = 1 + 1/(1 + math.exp(10*( ((1662328791-ts[j])/(1662328791-1659304800)) - 0.6  )))
                
                time_gap_weight = 0.5 ** (abs(ts[i]-ts[j]) / (1.5*60*60))  
                
                w_ij = action_weights[ops[j]] * loc_weight * time_gap_weight * time_i / math.log1p(length)
                w_ji = action_weights[ops[i]] * loc_weight * time_gap_weight * time_j / math.log1p(length)
                
            pairs_this_row[(aids[i], aids[j])] = w_ij / (item_total_likes[aids[i]] * item_total_likes[aids[j]]) ** 0.1
            pairs_this_row[(aids[j], aids[i])] = w_ji / (item_total_likes[aids[i]] * item_total_likes[aids[j]]) ** 0.1

@nb.jit(nopython=True, parallel=True, cache=True)
def getSimScoreBatch(aids, ts, ops, rows, fullSimMatrix, action_weights, item_total_likes, mode="cosine"):
    nrows = len(rows)
    pairs_this_batch = [{(0, 0): 0.0 for _ in range(0)} for _ in range(nrows)]
    ## get the sim scores of each batch in seperate sub dict in pairs_this_batch
    for row_i in nb.prange(nrows):  ## run each row of the batch in parallel
        _, start_idx, length, start_time = rows[row_i]
        getSimScoresSingleRow(pairs_this_batch[row_i], start_time, start_idx, length, aids, ts, ops, item_total_likes, action_weights, mode)
    ## merge pairs_this_batch into the fullSimMatrix
    for row_i in range(nrows):
        for (aid1, aid2), score in pairs_this_batch[row_i].items():
            if aid1 not in fullSimMatrix: 
                fullSimMatrix[aid1] = {0: 0.0 for _ in range(0)}
            if aid2 not in fullSimMatrix[aid1]:
                fullSimMatrix[aid1][aid2] = 0.0
            fullSimMatrix[aid1][aid2] += score


#### Section C: Train the similarity matrices
1. Derive the total like score first
2. Train 2 similarity matrices, one using iuf(Inverse User Frequence), the other using time_decay method. 

In [8]:
%%time
## get the Total Like matrix
item_total_likes = nb.typed.Dict.empty(
    key_type = nb.types.int64,
    value_type = nb.types.float64)

getItemTotalLikesNaive(aids, ops, item_total_likes, ACTION_WEIGHTS)

CPU times: user 21.6 s, sys: 746 ms, total: 22.4 s
Wall time: 22.9 s


In [9]:
%%time
simMatrices = {}   ## store a few different similarity matrices using different scoring system, for different prediction type
TRIM_CYCLES = 1000   ## trim full sim matrix every XX batches. 
MODES_TO_TRAIN = ["iuf", "time_decay"]

for mode in MODES_TO_TRAIN:
    ## the nested dict to store full sim matrix, {itemX: {itemY: score, itemZ: score, ...}}
    fullSimMatrix = nb.typed.Dict.empty(
            key_type = nb.types.int64,
            value_type = nb.typeof(nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)))
    max_idx = len(df)
    batch_idx = 1  ## compute sim matrix for PARALLEL # of rows per batch, have a total of max_idx/PARALLEL batches.
    for idx in tqdm(range(0, max_idx, PARALLEL)):
        rows = df.iloc[idx: min(idx + PARALLEL, max_idx)][['session', 'start_idx', 'total_action', 'session_start_time']].values
        getSimScoreBatch(aids, ts, ops, rows, fullSimMatrix, ACTION_WEIGHTS, item_total_likes, mode=mode)
        batch_idx += 1
        if batch_idx % TRIM_CYCLES == 0:
            print("batch_idx: ", batch_idx)
            trim_simMatrix_topk(fullSimMatrix, 100)
            gc.collect()
            #break

    
    ## trim top 50 when the training is complete
    trim_simMatrix_topk(fullSimMatrix, 80)   ## TODO: make this num small enough to reduce time for normalization
    ## max norm of each score
    maxNormSimMatrix(fullSimMatrix)
    
    simMatrices[mode] = fullSimMatrix
    
    del fullSimMatrix
    gc.collect()

  8%|▊         | 998/12079 [01:20<16:47, 11.00it/s] 

batch_idx:  1000


 17%|█▋        | 1997/12079 [03:28<16:12, 10.36it/s]   

batch_idx:  2000


 25%|██▍       | 3000/12079 [05:16<17:17:54,  6.86s/it]

batch_idx:  3000


 33%|███▎      | 3999/12079 [07:02<22:53:23, 10.20s/it]

batch_idx:  4000


 41%|████▏     | 4999/12079 [08:45<10:42:08,  5.44s/it]

batch_idx:  5000


 50%|████▉     | 6000/12079 [10:25<9:45:51,  5.78s/it] 

batch_idx:  6000


 58%|█████▊    | 7000/12079 [12:00<11:33:27,  8.19s/it]

batch_idx:  7000


 66%|██████▌   | 7999/12079 [13:33<7:00:14,  6.18s/it] 

batch_idx:  8000


 75%|███████▍  | 8999/12079 [15:03<5:22:14,  6.28s/it]

batch_idx:  9000


 83%|████████▎ | 9999/12079 [16:33<7:15:37, 12.57s/it]

batch_idx:  10000


 91%|█████████ | 10994/12079 [17:46<00:24, 43.66it/s]  

batch_idx:  11000


 99%|█████████▉| 11999/12079 [18:50<15:55, 11.94s/it]

batch_idx:  12000


100%|██████████| 12079/12079 [18:52<00:00, 10.66it/s]
  8%|▊         | 999/12079 [02:22<22:39:07,  7.36s/it]

batch_idx:  1000


 17%|█▋        | 2000/12079 [04:50<19:46:35,  7.06s/it]

batch_idx:  2000


 25%|██▍       | 2998/12079 [07:10<15:04, 10.04it/s]   

batch_idx:  3000


 33%|███▎      | 3999/12079 [09:17<26:40:18, 11.88s/it]

batch_idx:  4000


 41%|████▏     | 4999/12079 [11:29<12:23:45,  6.30s/it]

batch_idx:  5000


 50%|████▉     | 5999/12079 [13:23<10:35:38,  6.27s/it]

batch_idx:  6000


 58%|█████▊    | 6999/12079 [15:06<8:58:54,  6.37s/it] 

batch_idx:  7000


 66%|██████▌   | 8000/12079 [16:50<5:39:29,  4.99s/it]

batch_idx:  8000


 75%|███████▍  | 9000/12079 [18:26<3:44:46,  4.38s/it]

batch_idx:  9000


 83%|████████▎ | 9999/12079 [19:58<7:33:58, 13.10s/it]

batch_idx:  10000


 91%|█████████ | 10995/12079 [20:33<00:49, 22.10it/s]  

batch_idx:  11000


 99%|█████████▉| 11999/12079 [22:22<08:05,  6.07s/it]  

batch_idx:  12000


100%|██████████| 12079/12079 [22:24<00:00,  8.99it/s]


CPU times: user 1h 19min 22s, sys: 35min 24s, total: 1h 54min 47s
Wall time: 42min 52s


In [21]:
## A sanity check
simMatrices["iuf"][1517085]

DictType[int64,float64]<iv=None>({331941: 1.0, 243711: 0.7190634944882741, 1801351: 0.5040513231086448, 371417: 0.4167012125283079, 32249: 0.41373445935315617, 899438: 0.3626232356223332, 303302: 0.3081623040015135, 461689: 0.24140817919093385, 1799312: 0.20439432086051285, 576535: 0.20363825079722686, 516937: 0.17150816792050863, 1500897: 0.1474968554333243, 807502: 0.12350333543572184, 528847: 0.12185781543011626, 734463: 0.12045400414403375, 460942: 0.1139099425003342, 12725: 0.10691090671837272, 215107: 0.09943509171737062, 893213: 0.09210204073235082, 1364441: 0.0843909572280193, 1620477: 0.07891101754442596, 355088: 0.0760826198068775, 1630327: 0.07572153181228192, 804782: 0.07362283905259837, 760663: 0.07337831080008321, 1390627: 0.07327155250282527, 1775482: 0.07042982880108292, 293222: 0.06833266345318563, 307484: 0.06657964815873083, 1432934: 0.06421905471330858, 1190046: 0.06345478375755678, 1371202: 0.0631481939274279, 1350724: 0.06124414306732429, 800979: 0.058474785489027

In [20]:

len(simMatrices["iuf"])

1814440

In [22]:
gc.collect()

11016

## 2. Inference -- Make prediction using the matrices derived from above. 

#### Section D: Utils for inference:
1. Select top items to recommend in re-ranking
2. Compute Real time importance of each action (Not in use currently).

In [23]:
@nb.jit(nopython = True)
def heap_topk_return_list(item_cnt_dict, cap):
    """
    get the top cap(k) elements of the cnt dict based on value, using a min-heap structure, return a list with top "cap" elements with highest score
    """
    q = [(np.float64(0), np.int64(0)) for _ in range(0)]  ## generate empty queue to implement a heap, 
    for item_ref, sim_score in item_cnt_dict.items():   ## read in the dict in heap structure
        heapq.heappush(q, (sim_score, item_ref))   ## push the <sim_score, item_ref_id> pair into min-heap, using sim_score for order
        if len(q) > cap:
            heapq.heappop(q)
            
    res = [heapq.heappop(q)[1] for _ in range(len(q))][::-1]
    
    return res

# ===================================

# ===================================


@nb.jit(nopython=True)
def getRealTimeActionWeight(ts_action, ts_start, ts_end, op, seq, length, ACTION_WEIGHTS, TRAIN_START_TS, TEST_END_TS):
    """
    This function returns the real time importance weight of test set session actions
    input: 
        ts_action: ts of the action takes place
        ts_start: start_ts of this session
        ts_end: end_ts of this session
        op: type of the action
        seq: seq order this action
        length: total # of actions of this session. 
    """
#     overall_time_span = TEST_END_TS - TRAIN_START_TS
#     #session_time_span = ts_end - ts_start
#     time_to_end_of_session = ts_end - ts_action
#     if time_to_end_of_session > 24 * 60 * 60:
#         time_weight = 0.3
#     else:
#         time_weight = max(2**(1 - time_to_end_of_session/3600) - 1, 0.4)
    ## TODO: add time decay 
    action_weight = ACTION_WEIGHTS[op]
    sequence_weight = max(2 ** (seq/length) - 1, 0.5) ## floor at 0.1  #np.power(2, np.linspace(seq_weight_const, 1, length))[::-1] - 1
    res = action_weight * sequence_weight # * time_weight
    
    return res

#### Section E: Main Logic in Making Inferences
1. clicks_inferences: time_decay sim matrix + regular action weights <1, 6, 3>.
2. carts_inferencs: iuf sim matrix + weights <4, 2, 5> (as clicks actions tend to lead to cart action next).
3. orders_inferences: iuf sim matrix + regular action weights <1, 6, 3>.

In [64]:
@nb.jit(nopython=True)
def inference_single_session(session, starting_idx, length, start_time, aids, ops, ts, result, full_sim_matrix, test_ops_weights):
    ending_idx = starting_idx + length
    
    candidates = aids[starting_idx: ending_idx][::-1]
    candidates_ops = ops[starting_idx: ending_idx][::-1]
    
    ## record all potential aid that might be relevant
    potential_to_recommend = nb.typed.Dict.empty(key_type=nb.types.int64, value_type=nb.types.float64)

    ## get unique aid of each session 
    unique_aids = nb.typed.Dict.empty(key_type = nb.types.int64, value_type = nb.types.float64)
    for a in candidates:
        unique_aids[a] = 0
    
    if len(unique_aids) >= 20:   ## if the user has many actions, recommend based on the itemes it had been interacted with only.
        PREV_INTERACT_BONUS = 10
        sequence_weight = np.power(2, np.linspace(0.3, 1, len(candidates)))[::-1] - 1
        for aid, op, w in zip(candidates, candidates_ops, sequence_weight):
            if aid not in potential_to_recommend:
                potential_to_recommend[aid] = 0
            potential_to_recommend[aid] += w * test_ops_weights[op] * PREV_INTERACT_BONUS 
        #result[session] = np.array(heap_topk_return_list(potential_to_recommend, 20)) 
    else:  
        for idx in range(starting_idx, ending_idx):
            candidate = aids[idx]
            if candidate not in potential_to_recommend:
                potential_to_recommend[candidate] = np.inf ## ensure large weights on items had interacted with. 
    
    ## In case some items are duplicates, when potential_to_recommend not yet reach 20, impute with items from sim matrix
    if len(potential_to_recommend) < 80: ## CAUTIOUS: validation purpose only 
        sequence_weight = np.power(2, np.linspace(0.1, 1, len(candidates))) - 1   ## CHANGE_MADE: 0.3 -> 0.1
        for idx in range(starting_idx, ending_idx):
            candidate = aids[idx] 
            time_weight = 1 + 0.1 ** ((1662328791-ts[idx])/(1662328791-1659304800))    ## TODO: consider 
            candidate_realtime_weight = test_ops_weights[ops[idx]] * sequence_weight[idx-starting_idx] * time_weight  
            ## load the potential items to recommend,
            if candidate not in full_sim_matrix: 
                continue
            for similar_item in full_sim_matrix[candidate]:
#                 if similar_item in candidates:    ## skip the item if the it's already been interacted
#                     continue
                if similar_item not in potential_to_recommend:
                    potential_to_recommend[similar_item] = 0
                potential_to_recommend[similar_item] += full_sim_matrix[candidate][similar_item] * candidate_realtime_weight 
    
    result[session] = np.array(heap_topk_return_list(potential_to_recommend, 80))   ## CAUTIOUS: validation purpose only
    
@nb.jit(nopython=True)
def run_inference_parallel(rows, aids, ops, ts, result, full_sim_matrix, test_ops_weights):
    for row_idx in nb.prange(len(rows)):
        session, starting_idx, length, start_time = rows[row_idx]
        inference_single_session(session, starting_idx, length, start_time, aids, ops, ts, result, full_sim_matrix, test_ops_weights)

In [65]:
%%time
result_iuf = nb.typed.Dict.empty(
    key_type = nb.types.int64,
    value_type = nb.types.int64[:])

result_iuf_2 = nb.typed.Dict.empty(
    key_type = nb.types.int64,
    value_type = nb.types.int64[:])

result_time_decay = nb.typed.Dict.empty(
    key_type = nb.types.int64,
    value_type = nb.types.int64[:])

for row_idx in tqdm(range(len(df) - len(df_test), len(df), PARALLEL)):
    start_row = row_idx
    end_row = min(row_idx + PARALLEL, len(df))
    rows = df.iloc[start_row: end_row][['session', 'start_idx', 'total_action', 'session_start_time']].values
    run_inference_parallel(rows, aids, ops, ts, result_iuf, simMatrices["iuf"], ACTION_WEIGHTS)
    run_inference_parallel(rows, aids, ops, ts, result_iuf_2, simMatrices["iuf"], np.array([4.0, 2.0, 5.0]))   ## considebly add the weights for click action in the real time.
    run_inference_parallel(rows, aids, ops, ts, result_time_decay, simMatrices["time_decay"], ACTION_WEIGHTS)

100%|██████████| 1742/1742 [07:28<00:00,  3.89it/s]

CPU times: user 6min 28s, sys: 37.6 s, total: 7min 5s
Wall time: 7min 29s





In [66]:
result_time_decay[11098528]

array([  11830, 1732105,  588923,  571762,  884502,  876129, 1157882,
       1182614,  307904, 1790438,  231487, 1517680, 1633746,   77440,
       1586171,  855613,  735729,  205357,  756588,  523174,  258814,
        636101,  322370, 1689044,  532616, 1317291, 1718231, 1609734,
        409620, 1125638,  603583,  215561, 1052212, 1394029,  822934,
       1853703, 1383529, 1425172,  135833,  500334, 1519088,   19468,
       1349230, 1390152,  542780, 1667019,  490677,  448755, 1169267,
       1428162, 1695994,   42241,  804966, 1677053, 1311701,  672942,
        487136, 1197172,   45494,  697336, 1572401,  432989, 1446918,
        577040,  696526, 1241036, 1123537, 1307712, 1041771,  600258,
       1307461,  293000,  460148, 1032776,   87442,  198496,  972466,
       1421968, 1097576,  823991])

In [67]:
gc.collect()

49529

## 3. Submissions

In [68]:
%%time
subs = []
op_names = ["clicks", "carts", "orders"]

for result, op in zip([result_time_decay, result_iuf_2, result_iuf], op_names):
    sub = pd.DataFrame({"session_type": result.keys(), "labels": result.values()})
    sub.session_type = sub.session_type.astype(str) + f"_{op}"
    sub.labels = sub.labels.apply(lambda x: " ".join(x.astype(str)))
    subs.append(sub)
    
submission = pd.concat(subs).reset_index(drop=True)
#sub.sort_values(by=["session_type"])  ## optional
#submission.to_csv('submission.csv', index = False)
submission

CPU times: user 5min 32s, sys: 14.7 s, total: 5min 47s
Wall time: 5min 56s


Unnamed: 0,session_type,labels
0,11098528_clicks,11830 1732105 588923 571762 884502 876129 1157...
1,11098529_clicks,1105029 1632356 1049489 612829 295362 333991 1...
2,11098530_clicks,409236 264500 1603001 583026 963957 254154 752...
3,11098531_clicks,1728212 1557766 1553691 1449555 1365569 130963...
4,11098532_clicks,876469 7651 1202618 1159379 77906 476681 17040...
...,...,...
5351206,12899774_orders,33035 1539309 819288 771913 31490 95488 218795...
5351207,12899775_orders,1743151 1760714 1163166 1255910 1498443 783827...
5351208,12899776_orders,548599 695829 1737908 773354 1762353 1440959 9...
5351209,12899777_orders,384045 1308634 1688215 395762 703474 1486067 2...


In [70]:
%%time
submission.to_csv('../../allData/validationData/phaseII_80_items_preranking.csv', index = False)

CPU times: user 46.3 s, sys: 2.62 s, total: 48.9 s
Wall time: 50 s


In [69]:
len(submission.iloc[0]["labels"].split(" "))

80

In [40]:
len(df_test)

1783737

In [71]:
gc.collect()

60