In [1]:
import cudf
import os
import cupy as cp
from vs_knn.vs_knn import CupyVsKnnModel
import gc
import time
from tqdm import tqdm

## Dataset: RSC15 "*Yoochoose*" data
We are going to illustrate the algorithm on the public *RSC15* e-commerce dataset, as it is one of the largest publicly available e-commerce session datasets.

In [2]:
# Relative path of the RSC15 click log; it is read with cudf.read_csv and its
# size on disk is reported with os.path.getsize in the following cells.
dataset_filepath = 'archive/yoochoose-clicks.dat'

In [3]:
# Load the first three columns of the click log into a cuDF DataFrame.
# `names` assigns the column labels in file order; the timestamp is kept as
# an object (string) column, the two identifiers as 32-bit integers.
yoochoose_data = cudf.read_csv(
    dataset_filepath,
    names=['session_id', 'timestamp', 'item_id'],
    usecols=[0, 1, 2],
    dtype={
        'session_id': cp.dtype('int32'),
        'item_id': cp.dtype('int32'),
        'timestamp': cp.dtype('O'),
    },
)


In [4]:
# Size statistics of the loaded click log and of the file on disk.
n_rows = len(yoochoose_data)
n_sessions = len(yoochoose_data['session_id'].unique())
n_items = len(yoochoose_data['item_id'].unique())
filesize = os.path.getsize(dataset_filepath)

# The fragments are joined with a single space, which is what print() does
# when it receives them as separate arguments.
print(" ".join((
    f"the dataset contains {round(n_rows / 10 ** 6)}M rows, ",
    f"with {round(n_sessions / 10 ** 6)}M ",
    f"sessions and {round(n_items / 10 ** 3)}K items",
    f"\nOriginal file size: {round(filesize / 10 ** 6)}Mb",
)))

the dataset contains 33M rows,  with 9M  sessions and 53K items 
Original file size: 1487Mb


## Data Format
Internally, the model keeps various arrays of values per `item_id` and `session_id`. It uses integer values between 1 and *N_items* to represent `item_id` (and between 1 and *N_sessions* to represent `session_id`).
The conversion from the original `item_id` and `session_id` values in the dataset to these integers is handled automatically.
Note that this conversion performs better when the original values are integers rather than strings.
It is therefore recommended to use integers for `item_id` and `session_id` in your dataset.


## Train test split
The dataset covers 183 days. We will use the first 180 days as the training set and the remaining 3 days as the test set.

In [5]:
# Keep the first 10 characters of each timestamp string as a per-row day key.
yoochoose_data['day'] = yoochoose_data['timestamp'].str.slice(0, 10)

In [6]:
# Number of leading calendar days assigned to the training set.
n_train_days = 180

# Distinct day keys, sorted explicitly: the chronological split must not depend
# on the order in which unique() happens to return its values. The keys are
# "YYYY-MM-DD" strings, so lexicographic order is chronological order.
all_days = yoochoose_data['day'].unique().sort_values().reset_index(drop=True)

# Positional slice: the first `n_train_days` days form the training period.
train_days = all_days.iloc[:n_train_days]
print(all_days)

0      2014-04-01
1      2014-04-02
2      2014-04-03
3      2014-04-04
4      2014-04-05
          ...    
178    2014-09-26
179    2014-09-27
180    2014-09-28
181    2014-09-29
182    2014-09-30
Name: day, Length: 183, dtype: object


In [7]:
# Rows whose day falls in the training period; the mask is computed once and
# negated for the test set.
in_train_period = yoochoose_data['day'].isin(train_days)

# Both splits keep the same three columns (the helper 'day' column is dropped).
split_columns = ['session_id', 'timestamp', 'item_id']
train_df = yoochoose_data[in_train_period][split_columns]
test_df = yoochoose_data[~in_train_period][split_columns]

In [8]:
# The full frame is no longer needed: train_df and test_df hold the rows used
# from here on, so drop the name bound to it.
del yoochoose_data
# Run a full garbage-collection pass. gc.collect() returns the number of
# unreachable objects it found, which is what the cell displays as its output.
gc.collect()

22

## Model train
Instantiate the model, then train it on the training set and report the training time.

In [9]:
# Build the model; the hyper-parameters are passed through as-is
# (see CupyVsKnnModel for their exact meaning).
model = CupyVsKnnModel(
    top_k=100,
    max_sessions_per_items=5000,
    max_item_per_session=10,
)

In [10]:
# Time the training call. perf_counter() is a monotonic, high-resolution clock
# meant for measuring elapsed intervals; time.time() follows the system clock
# and can jump if that clock is adjusted while training runs.
start = time.perf_counter()

model.train(train_df)

end = time.perf_counter()
print(f"trained the model in {end - start} seconds")

Device memory footprint for index objects: 71.31 Mb (item_id index)
Device memory footprint for index objects: 334.09 Mb (session_id index)
trained the model in 8.112018585205078 seconds


## Testing the model 
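- Use the last item of each test session as the target and the preceding items as the input (sessions with a single item are skipped)
- Run the model on each test session
- Calculate HR@20: the fraction of evaluated sessions whose target is among the 20 highest-scoring recommended items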

In [12]:
# Optional step (disabled): remove test items that were not seen during training
# items_train = train_df['item_id'].unique()
# test_df = test_df[test_df['item_id'].isin(items_train)]

In [11]:
def get_test_examples(test_set):
    """Group the test rows by session and return one item sequence per session.

    The 'timestamp' column is dropped, the remaining rows are grouped by
    'session_id', and the 'item_id' values of each group are collected into a
    list. The result is moved to pandas and returned as its underlying
    ``.values`` array (one entry per session).

    NOTE(review): rows are not sorted by timestamp here, so the item order
    inside each session is whatever the group-by aggregation yields --
    confirm the input is already in chronological order.
    """
    rows_by_session = test_set.drop('timestamp', axis=1).groupby('session_id')
    items_per_session = rows_by_session.agg({'item_id': 'collect'})['item_id']
    return items_per_session.to_pandas().values

In [12]:
# One entry per test session, each holding the item_ids collected for that session.
test_sessions_array = get_test_examples(test_df)

In [13]:
def session_to_xy(items_in_session):
    """Split a session into (input items, target item).

    The target is the last element and the input is everything before it.
    Sessions with fewer than two items cannot be split and yield
    ``(None, None)``.
    """
    if len(items_in_session) <= 1:
        return None, None

    history = items_in_session[0:-1]
    target = items_in_session[-1]
    return history, target

In [14]:
def test_a_model(model, test_data):
    
    total_hits = 0
    n_treated = 0
    hr20 = 0
    
    pbar = tqdm(test_data)

    for test_session in pbar:
            x, y = session_to_xy(test_session)
            if x is not None:
                items_pred, item_scores = model.predict(x)
                n_treated += 1
                if len(items_pred) > 0:
                    selection = cp.flip(cp.argsort(item_scores)[-20:])
                    items_rec = items_pred[selection]

                    if y in items_rec:
                        total_hits += 1
                        hr20 = total_hits / n_treated
                        pbar.set_postfix({'HR@20': hr20})

    time_per_iter = pbar.format_dict['elapsed'] / pbar.format_dict['n']

    return time_per_iter, hr20

In [15]:
# Evaluate the trained model on the test sessions:
# itertime_rd is the elapsed time per iterated session, hr_rd the HR@20 value.
itertime_rd, hr_rd = test_a_model(model, test_sessions_array)

100%|██████████| 44694/44694 [04:32<00:00, 164.05it/s, HR@20=0.677]
