In [46]:
from DeepRec.modules.data import SessionDataset, SessionDataLoader
import os
import numpy as np

In [2]:
def loadData(dataDir, trainName='train.csv', testName='test.csv'):
    """Build the train and test ``SessionDataset`` objects from one directory.

    Args:
        dataDir: Directory that contains both data files.
        trainName: File name of the training split inside ``dataDir``.
        testName: File name of the test split inside ``dataDir``.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple.
    """
    train_path = os.path.join(dataDir, trainName)
    train_split = SessionDataset(path=train_path)

    # The test split is constructed with the train split's itemmap
    # (presumably so both splits share one item indexing -- confirm in SessionDataset).
    test_path = os.path.join(dataDir, testName)
    test_split = SessionDataset(path=test_path, itemmap=train_split.itemmap)

    return train_split, test_split

In [3]:
# Directory holding the train/test files that loadData reads by its default names.
DATA_DIR = '/run/user/1024/UBDTR/data'
trainDs, testDs = loadData(DATA_DIR)

In [4]:
# Persist the training itemmap as CSV, without writing the row index.
ITEMMAP_CSV = '~/storage/UBDTR/data/itemmap.csv'
trainDs.itemmap.to_csv(ITEMMAP_CSV, index=False)

In [5]:
import pandas as pd

In [8]:
# Show the column dtypes of the training itemmap table.
trainDs.itemmap.dtypes

item_ID     int64
item_idx    int64
dtype: object

In [10]:
# Number of entries in trainDs.items; compare with len(trainDs.itemmap) in the next cell.
len(trainDs.items)

924756

In [11]:
# Number of rows in the itemmap table; compare with len(trainDs.items) in the previous cell.
len(trainDs.itemmap)

924756

In [28]:
# Bind the dataset's two index arrays to notebook-level names.
# The cells below use click_offsets[session_idx_arr[i]] and
# click_offsets[session_idx_arr[i] + 1] as the click range of session i.
# (The former leading `len(trainDs.session_idx_arr)` was dropped: it was not the
# last expression of the cell, so its value was computed and never displayed.)
click_offsets = trainDs.click_offsets
session_idx_arr = trainDs.session_idx_arr

In [29]:
# Batch slots 0..31: one position per session handled in parallel.
# Keep this equal to the batch_size passed to SessionDataLoader further down.
BATCH_SIZE = 32
iters = np.arange(BATCH_SIZE)

In [34]:
# Resolve the session index behind every batch slot once, then read each
# session's click range from click_offsets: entry i is where the session's
# clicks begin, entry i + 1 is the bound where they end.
batch_sessions = session_idx_arr[iters]
start = click_offsets[batch_sessions]
end = click_offsets[batch_sessions + 1]

In [36]:
# Display the end bound of each session currently in the batch.
end

array([  2,   4,   6,   8,  15,  17,  22,  25,  28,  30,  32,  35,  37,
        42,  46, 117, 136, 173, 189, 198, 202, 212, 217, 219, 223, 227,
       230, 232, 234, 236, 271, 283], dtype=int32)

In [37]:
# Smallest click-range length (end - start) among the sessions in the batch.
minlen = np.min(end - start)

In [43]:
# Read the item_idx column as a plain array, then pick the value found at
# each session's current start position.
item_idx_values = trainDs.df.item_idx.values
idx_target = item_idx_values[start]
idx_target

array([  0,   2,   4,   6,   8,  14,  16,  19,  22,  25,  27,  29,  32,
        34,  39,  43,  87, 105, 104, 104,  85, 143, 152, 157, 143, 159,
       162, 165, 167,  48,  69, 172])

In [44]:
# click indices where a particular session meets second-to-last element
# Derived from click_offsets instead of from `start` itself, so re-running
# this cell yields the same positions rather than shifting them a second time.
start = click_offsets[session_idx_arr[iters]] + (minlen - 1)
start

array([  1,   3,   5,   7,   9,  16,  18,  23,  26,  29,  31,  33,  36,
        38,  43,  47, 118, 137, 174, 190, 199, 203, 213, 218, 220, 224,
       228, 231, 233, 235, 237, 272], dtype=int32)

In [45]:
# Batch positions whose session has at most one click left (end - start <= 1),
# i.e. the sessions that should terminate.
finished = (end - start) <= 1
mask = np.flatnonzero(finished)
mask

array([ 0,  1,  2,  3,  5,  9, 10, 12, 23, 27, 28, 29])

In [47]:
# Wrap the training dataset in a session-parallel loader, 32 sessions per batch
# (the same count as the manual walkthrough with `iters` above).
LOADER_BATCH_SIZE = 32
loader = SessionDataLoader(trainDs, batch_size=LOADER_BATCH_SIZE)

In [56]:
# Collect only the first 10 batches produced by the loader for inspection.
lst = []
for batch_no, x in enumerate(loader, start=1):
    lst.append(x)
    if batch_no == 10:
        break

In [57]:
# Display the collected batches. In the output below each batch is a 3-tuple:
# two tensors followed by either an empty list or an array of batch positions
# (presumably input items, target items and the finished-session mask -- confirm in SessionDataLoader).
lst

[(tensor([  0,   2,   4,   6,   8,  14,  16,  19,  22,  25,  27,  29,  32,  34,
           39,  43,  87, 105, 104, 104,  85, 143, 152, 157, 143, 159, 162, 165,
          167,  48,  69, 172]),
  tensor([  1,   3,   5,   7,   9,  15,  17,  20,  23,  26,  28,  30,  33,  35,
           40,  44,  53, 106, 136, 144,  86, 144, 153, 158, 150, 160, 163, 166,
          168,  49, 169, 173]),
  []),
 (tensor([197, 201, 205, 211,   9, 224,  17,  20,  23, 229, 237,  30, 239,  35,
           40,  44,  53, 106, 136, 144,  86, 144, 153, 241, 150, 160, 163, 240,
          255, 259, 169, 173]),
  tensor([198, 202, 206, 212,   9, 209,  16,  21,  24, 230, 238,  31, 240,  36,
           41,  45,  88, 107, 137, 146, 148, 143, 154, 242,  48, 158, 164, 250,
          256, 260,  53, 190]),
  array([ 0,  1,  2,  3,  5,  9, 10, 12, 23, 27, 28, 29])),
 (tensor([198, 202, 206, 212,   9, 209,  16, 265, 267, 230, 268, 272, 276,  36,
           41,  45,  88, 107, 137, 146, 148, 143, 154, 242,  48, 158, 272, 250,
     