In [22]:
!python gru4rec.py --train-path /home/changhyun/workspace/dataset/ml-20m/preprocessing_data/train.csv \
                   --dev-path /home/changhyun/workspace/dataset/ml-20m/preprocessing_data/dev.csv \
                   --test-path /home/changhyun/workspace/dataset/ml-20m/preprocessing_data/test.csv \
                   --epoch 3 \
                   --save-weights SAVE_WEIGHTS

2022-08-06 16:28:52.514759: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-06 16:28:52.541587: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-06 16:28:52.541748: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-06 16:28:52.542032: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags



	 - Recall@20 epoch 3: 0.198182
	 - MRR@20    epoch 3: 0.056315



In [26]:
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm

import keras
import keras.backend as K
import tensorflow as tf
from keras.models import Model
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.losses import categorical_crossentropy
from keras.layers import Input, Dense, Dropout, GRU

In [27]:
class SessionDataset:
    """Credit to yhs-968/pyGRU4REC."""
    def __init__(self, data, sep='\t', session_key='SessionId', item_key='ItemId', time_key='Time', n_samples=-1, itemmap=None, time_sort=False):
        """
        Args:
            path: path of the csv file
            sep: separator for the csv
            session_key, item_key, time_key: name of the fields corresponding to the sessions, items, time
            n_samples: the number of samples to use. If -1, use the whole dataset.
            itemmap: mapping between item IDs and item indices
            time_sort: whether to sort the sessions by time or not
        """
        self.df = data
        self.session_key = session_key
        self.item_key = item_key
        self.time_key = time_key
        self.time_sort = time_sort
        self.add_item_indices(itemmap=itemmap)
        self.df.sort_values([session_key, time_key], inplace=True)

        # Sort the df by time, and then by session ID. That is, df is sorted by session ID and
        # clicks within a session are next to each other, where the clicks within a session are time-ordered.

        self.click_offsets = self.get_click_offsets()
        self.session_idx_arr = self.order_session_idx()

    def get_click_offsets(self):
        """
        Return the offsets of the beginning clicks of each session IDs,
        where the offset is calculated against the first click of the first session ID.
        """
        offsets = np.zeros(self.df[self.session_key].nunique() + 1, dtype=np.int32)
        # group & sort the df by session_key and get the offset values
        offsets[1:] = self.df.groupby(self.session_key).size().cumsum()

        return offsets

    def order_session_idx(self):
        """ Order the session indices """
        if self.time_sort:
            # starting time for each sessions, sorted by session IDs
            sessions_start_time = self.df.groupby(self.session_key)[self.time_key].min().values
            # order the session indices by session starting times
            session_idx_arr = np.argsort(sessions_start_time)
        else:
            session_idx_arr = np.arange(self.df[self.session_key].nunique())

        return session_idx_arr

    def add_item_indices(self, itemmap=None):
        """
        Add item index column named "item_idx" to the df
        Args:
            itemmap (pd.DataFrame): mapping between the item Ids and indices
        """
        if itemmap is None:
            item_ids = self.df[self.item_key].unique()  # unique item ids
            item2idx = pd.Series(data=np.arange(len(item_ids)),
                                 index=item_ids)
            itemmap = pd.DataFrame({self.item_key:item_ids,
                                   'item_idx':item2idx[item_ids].values})

        self.itemmap = itemmap
        self.df = pd.merge(self.df, self.itemmap, on=self.item_key, how='inner')

    @property
    def items(self):
        return self.itemmap.ItemId.unique()

In [28]:
class SessionDataLoader:
    """Credit to yhs-968/pyGRU4REC."""
    def __init__(self, dataset, batch_size=50):
        """
        A class for creating session-parallel mini-batches.
        Args:
            dataset (SessionDataset): the session dataset to generate the batches from
            batch_size (int): size of the batch
        """
        self.dataset = dataset
        self.batch_size = batch_size
        self.done_sessions_counter = 0

    def __iter__(self):
        """ Returns the iterator for producing session-parallel training mini-batches.
        Yields:
            input (B,):  Item indices that will be encoded as one-hot vectors later.
            target (B,): a Variable that stores the target item indices
            masks: Numpy array indicating the positions of the sessions to be terminated
        """

        df = self.dataset.df
        session_key='SessionId'
        item_key='ItemId'
        time_key='TimeStamp'
        self.n_items = df[item_key].nunique()+1
        click_offsets = self.dataset.click_offsets
        session_idx_arr = self.dataset.session_idx_arr

        iters = np.arange(self.batch_size)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        mask = [] # indicator for the sessions to be terminated
        finished = False

        while not finished:
            minlen = (end - start).min()
            # Item indices (for embedding) for clicks where the first sessions start
            idx_target = df.item_idx.values[start]
            for i in range(minlen - 1):
                # Build inputs & targets
                idx_input = idx_target
                idx_target = df.item_idx.values[start + i + 1]
                inp = idx_input
                target = idx_target
                yield inp, target, mask

            # click indices where a particular session meets second-to-last element
            start = start + (minlen - 1)
            # see if how many sessions should terminate
            mask = np.arange(len(iters))[(end - start) <= 1]
            self.done_sessions_counter = len(mask)
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]


In [29]:
def get_metrics(model, args, train_generator_map, recall_k=20, mrr_k=20):

    test_dataset = SessionDataset(args.test_data, itemmap=train_generator_map)
    test_generator = SessionDataLoader(test_dataset, batch_size=args.batch_size)

    n = 0
    rec_sum = 0
    mrr_sum = 0

    print("Evaluating model...")
    for feat, label, mask in test_generator:

        gru_layer = model.get_layer(name="GRU")
        hidden_states = gru_layer.states[0].numpy()
        for elt in mask:
            hidden_states[elt, :] = 0
        gru_layer.reset_states(states=hidden_states)

        target_oh = to_categorical(label, num_classes=args.train_n_items)
        input_oh  = to_categorical(feat,  num_classes=args.train_n_items)
        input_oh = np.expand_dims(input_oh, axis=1)

        pred = model.predict(input_oh, batch_size=args.batch_size)

        for row_idx in range(feat.shape[0]):
            pred_row = pred[row_idx]
            label_row = target_oh[row_idx]

            rec_idx =  pred_row.argsort()[-recall_k:][::-1]
            mrr_idx =  pred_row.argsort()[-mrr_k:][::-1]
            tru_idx = label_row.argsort()[-1:][::-1]

            n += 1

            if tru_idx[0] in rec_idx:
                rec_sum += 1

            if tru_idx[0] in mrr_idx:
                mrr_sum += 1/int((np.where(mrr_idx == tru_idx[0])[0]+1))

    recall = rec_sum/n
    mrr = mrr_sum/n
    return (recall, recall_k), (mrr, mrr_k)



In [24]:
from keras.models import load_model

model = load_model('/home/changhyun/workspace/Study/05.Recommandation/GRU4REC_2.h5')

2022-08-06 16:40:59.152610: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-06 16:40:59.180843: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-06 16:40:59.181000: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-06 16:40:59.181495: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [30]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(512, 1, 11618)]         0         
                                                                 
 GRU (GRU)                   [(512, 100),              3516000   
                              (512, 100)]                        
                                                                 
 dropout (Dropout)           (512, 100)                0         
                                                                 
 dense (Dense)               (512, 11618)              1173418   
                                                                 
Total params: 4,689,418
Trainable params: 4,689,418
Non-trainable params: 0
_________________________________________________________________


In [106]:
test_data = pd.read_csv('/home/changhyun/workspace/dataset/ml-20m/preprocessing_data/test.csv', sep='\t')
test_data

Unnamed: 0,SessionId,ItemId,Time
0,42,31,1411667366
1,42,111,1411668051
2,42,260,1411667772
3,42,296,1411667904
4,42,355,1411667339
...,...,...,...
215841,138468,109487,1419255337
215842,138468,111759,1419255347
215843,138468,112556,1419255339
215844,138468,118696,1419255383


In [114]:
test_n_items = len(test_data['ItemId'].unique()) + 1
test_n_items

10371

In [115]:
test_dataset = SessionDataset(test_data, itemmap=test_dataset.itemmap)

In [119]:
test_generator = SessionDataLoader(test_dataset, batch_size=512)

In [120]:
test_generator

<__main__.SessionDataLoader at 0x7faa7f7a7d60>

In [121]:
for feat, label, mask in test_generator:

    gru_layer = model.get_layer(name="GRU")
    hidden_states = gru_layer.states[0].numpy()
    for elt in mask:
        hidden_states[elt, :] = 0
    gru_layer.reset_states(states=hidden_states)

    target_oh = to_categorical(label, num_classes=test_n_items)
    input_oh  = to_categorical(feat,  num_classes=test_n_items)
    input_oh = np.expand_dims(input_oh, axis=1)

    pred = model.predict(input_oh, batch_size=512)







In [112]:
pred.shape

(512, 11618)

In [113]:
pred

array([[1.04488441e-04, 8.76652630e-05, 8.75898841e-05, ...,
        8.55603139e-05, 8.52544690e-05, 8.53279635e-05],
       [9.29118396e-05, 1.17597810e-04, 1.00011741e-04, ...,
        7.44683493e-05, 7.60503026e-05, 7.41452241e-05],
       [9.88077882e-05, 1.12487665e-04, 1.05223131e-04, ...,
        7.27355000e-05, 7.31248438e-05, 7.16478899e-05],
       ...,
       [1.12491238e-04, 1.19848148e-04, 1.06123822e-04, ...,
        7.17230432e-05, 7.23142075e-05, 7.10860986e-05],
       [9.51118709e-05, 9.56382282e-05, 8.92086464e-05, ...,
        7.22236437e-05, 7.43962592e-05, 7.28156301e-05],
       [1.11367350e-04, 1.17734882e-04, 1.04032952e-04, ...,
        7.54325229e-05, 7.68670870e-05, 7.42350603e-05]], dtype=float32)