# User Variables

In [1]:
# Data
OTT_SESSIONS_SAMPLE = 10000
MIN_SESSIONS_SIZE = 20

# Predictive Models
TRAIN_RATIO = 0.8
HIT_RATE_EVAL = 20
MIN_TEST_SESSIONS_SIZE = 10

# STAMP
STAMP_EPOCHS = 20

# NARM
NARM_EPOCHS = 20

# Others
RANDOM_SEED = 0

# Interface NN/LFIT
MIN_ATTENTION = 0.5
HIT_RATE_LEARNING = 5
NB_BROADCASTERS = 6

# LFIT
ALGORITHM = "pride"
HEURISTICS = [] #["max_coverage_dynamic"] #["max_coverage_static"] # ["try_all_atoms"]
MAX_OUTPUT_RULE_PER_TARGET = 10
THREADS = 8



# Constants

In [2]:
# Dataset
OTT_SESSIONS_DATA_PATH = "dataset/tokyo2021_user_tv_sessions_data.zip"
PROGRAM_PROPERTIES_PATH = "dataset/tokyo2021_tv_programs_properties.csv"
OLYMPICS_PROPERTIES_PATH = "dataset/tokyo2021_olympics_properties.csv"

# Output files with rules
PAKDD2023_OUTPUT_FILE_PATH_STAMP = "output_rules_stamp.txt"
PAKDD2023_OUTPUT_FILE_PATH_NARM = "output_rules_narm.txt"

DATASET_CHUNK = OTT_SESSIONS_SAMPLE*10

# Installs

In [3]:
#!pip install tensorflow-estimator==1.15.1
#!pip install tensorboard==2.0.0
#!pip install tensorflow==1.15.0
#!pip install tensorflow-gpu==1.14

In [4]:
#!pip install pylfit

# Imports

In [5]:
import pandas as pd
import numpy as np
import random
import os
import sys
import tensorflow as tf
import time
import warnings

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# STAMP
from algorithms.STAMP.model.STAMP import Seq2SeqAttNN
from algorithms.STAMP.util.batcher.equal_len.batcher_p import batcher

# NARM
from algorithms.NARM.narm import NARM


import pylfit



# Settings

In [6]:
random.seed(RANDOM_SEED)
tf.random.set_random_seed(RANDOM_SEED)
#tf.random.set_seed(RANDOM_SEED)


os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings('ignore')

# Dataset formating

In [7]:
pipeline_start_time = time.time()

## Raw sequence and Attention

In [8]:
ts = time.time()
# Initialise df
df_sessions = pd.read_csv(OTT_SESSIONS_DATA_PATH, compression="zip", usecols=["userid","program name","Program broadcast date (start)"],
                            dtype={"userid":"category", "program name":"category"},
                            parse_dates=["Program broadcast date (start)"],
                            nrows=DATASET_CHUNK)

current_row = 0

# Extract until enough sample sessions
while df_sessions["userid"].nunique() < OTT_SESSIONS_SAMPLE:

  df_tmp = pd.read_csv(OTT_SESSIONS_DATA_PATH, compression="zip", usecols=["userid","program name","Program broadcast date (start)"],
                            dtype={"userid":"category", "program name":"category"},
                            parse_dates=["Program broadcast date (start)"],
                            nrows=DATASET_CHUNK,
                            skiprows=range(1,current_row))

  if len(df_tmp) == 0:
    break

  current_row += len(df_tmp)
  df_sessions = df_sessions.append(df_tmp, ignore_index=True)

  # Clean too short sessions
  df_count = df_sessions.groupby("userid").count().reset_index()
  df_sessions = df_sessions[~df_sessions["userid"].isin(list(df_count[df_count["program name"] <= MIN_SESSIONS_SIZE]["userid"].unique()))]

  print("Extracted",df_sessions["userid"].nunique(),"valid sessions")

# Remove additional sessions
df_sessions = df_sessions[df_sessions["userid"].isin(df_sessions["userid"].unique()[:OTT_SESSIONS_SAMPLE])]

print("Extracted",df_sessions["userid"].nunique(),"sessions of size >",MIN_SESSIONS_SIZE)

# Rename columns to STAMP input style
df_sessions.rename(columns={"userid": "SessionId", "Program broadcast date (start)": "Time", "program name": "ItemId"}, inplace=True)
df_sessions = df_sessions[["SessionId","ItemId","Time"]]
df_sessions

Extracted 2814 valid sessions
Extracted 4640 valid sessions
Extracted 6442 valid sessions
Extracted 8279 valid sessions
Extracted 10095 valid sessions
Extracted 10000 sessions of size > 20


Unnamed: 0,SessionId,ItemId,Time
0,000002,P130_C0,2021-08-01 07:30:00
1,000002,P37_C3,2021-08-01 19:00:00
2,000002,P297_C3,2021-08-01 19:30:00
3,000002,P310_C1,2021-08-02 18:30:00
4,000002,P309_C3,2021-08-02 21:15:00
...,...,...,...
507932,00564c,P375_C3,2021-08-07 16:15:00
507933,00564c,P378_C3,2021-08-07 18:05:00
507934,00564c,P380_C4,2021-08-07 18:30:00
507935,00564c,P378_C3,2021-08-07 19:30:00


In [9]:
sessions_ids = set(df_sessions["SessionId"].unique())
train_sessions = random.sample(sessions_ids, int(TRAIN_RATIO*len(sessions_ids)))

df_train = df_sessions[df_sessions["SessionId"].isin(train_sessions)]
df_test = df_sessions[~df_sessions["SessionId"].isin(train_sessions)]

print("Total:", len(df_sessions["SessionId"].unique()))
print("Train:", len(df_train["SessionId"].unique()))
print("Test:", len(df_test["SessionId"].unique()))

display(df_train)
display(df_test)

dataset_processing_time = (time.time() - ts)

Total: 10000
Train: 8000
Test: 2000


Unnamed: 0,SessionId,ItemId,Time
15,000003,P129_C4,2021-08-01 05:50:00
16,000003,P130_C0,2021-08-01 07:30:00
17,000003,P37_C3,2021-08-01 19:00:00
18,000003,P297_C3,2021-08-01 19:30:00
19,000003,P300_C4,2021-08-01 21:00:00
...,...,...,...
507878,00564a,P381_C0,2021-08-07 05:59:00
507879,00564a,P285_C3,2021-08-07 08:15:00
507880,00564a,P375_C3,2021-08-07 16:15:00
507881,00564a,P378_C3,2021-08-07 18:05:00


Unnamed: 0,SessionId,ItemId,Time
0,000002,P130_C0,2021-08-01 07:30:00
1,000002,P37_C3,2021-08-01 19:00:00
2,000002,P297_C3,2021-08-01 19:30:00
3,000002,P310_C1,2021-08-02 18:30:00
4,000002,P309_C3,2021-08-02 21:15:00
...,...,...,...
507932,00564c,P375_C3,2021-08-07 16:15:00
507933,00564c,P378_C3,2021-08-07 18:05:00
507934,00564c,P380_C4,2021-08-07 18:30:00
507935,00564c,P378_C3,2021-08-07 19:30:00


# Attention model 1: STAMP

In [10]:
ts = time.time()

model = Seq2SeqAttNN(n_epochs=STAMP_EPOCHS, model_save_path="", model_path="", is_save=True)
model.fit(df_train,df_test)

stamp_run_time = (time.time() - ts)
print('Total time: ', stamp_run_time)

stamp
GPU: False
reload the datasets.
rsc15_64
read finish
sort finish
list finish
I am reading
id: 7999
session_id: 00564a
items: [1, 53, 55, 174, 118, 106, 93, 95, 107, 46, 8, 22, 93, 23, 95, 19, 93, 23, 22, 169, 23, 25, 28, 71, 155, 139, 93, 23, 134, 67, 77, 34, 36, 69, 37, 38, 92, 42, 56]
click_items: ['P129_C4', 'P132_C1', 'P134_C1', 'P292_C1', 'P294_C1', 'P299_C1', 'P5_C2', 'P70_C2', 'P302_C2', 'P304_C2', 'P309_C3', 'P41_C4', 'P5_C2', 'P11_C2', 'P70_C2', 'P321_C3', 'P5_C2', 'P11_C2', 'P41_C4', 'P337_C4', 'P11_C2', 'P345_C0', 'P346_C3', 'P44_C1', 'P50_C1', 'P2_C2', 'P5_C2', 'P11_C2', 'P351_C4', 'P33_C2', 'P358_C5', 'P362_C4', 'P367_C0', 'P377_C0', 'P381_C0', 'P285_C3', 'P375_C3', 'P378_C3', 'P121_C0']
out: [53, 55, 174, 118, 106, 93, 95, 107, 46, 8, 22, 93, 23, 95, 19, 93, 23, 22, 169, 23, 25, 28, 71, 155, 139, 93, 23, 134, 67, 77, 34, 36, 69, 37, 38, 92, 42, 56]
in: [1, 53, 55, 174, 118, 106, 93, 95, 107, 46, 8, 22, 93, 23, 95, 19, 93, 23, 22, 169, 23, 25, 28, 71, 155, 139, 93, 2

## Test

In [11]:
def stamp_predict_next(model, session_id, input_item_id):
  '''
  Gives prediction scores for a selected item in a selected session.
  The self.s variable allow to shift the items in the selected session.
  Parameters
  --------
  session_id : int
      Contains the session ID.
  input_item_id : int
      Contains the item ID of the events of the session.
  Returns
  --------
  out : (input, attention, predictions)
      input: list of int
        The input session of the model
      attention: list of int
        The attention layer value of the model
      predictions: pandas.Serie
        Prediction scores given the input_item_id and session_id for the next item.
        Columns: 1 column containing the scores; rows: items. Rows are indexed by the item IDs.'''

  sample = [x for x in model.test_data.samples if x.session_id == session_id]
  #if model.old_session_id != session_id:
  #    model.s = 0
  
  # DBG
  #print(input_item_id)
  model.s = input_item_id

  c_loss = []
  bt = batcher(
      samples=sample,
      class_num=model.n_items,
      random=False
  )

  while bt.has_next():  # batch round.
      batch_data = bt.next_batch()

      tmp_in_data = batch_data['in_idxes']
      tmp_out_data = batch_data['out_idxes']
      tmp_batch_ids = batch_data['batch_ids']
      # for s in range(len(tmp_in_data[0])):
      batch_in = []
      batch_out = []
      batch_last = []
      batch_seq_l = []
      for tmp_in, tmp_out in zip(tmp_in_data, tmp_out_data):
          _in = tmp_in[model.s]
          _out = tmp_out[model.s] - 1
          batch_last.append(_in)
          batch_in.append(tmp_in[:model.s + 1])
          batch_out.append(_out)
          batch_seq_l.append(model.s + 1)
      feed_dict = {
          model.inputs: batch_in,
          model.last_inputs: batch_last,
          model.lab_input: batch_out,
          model.sequence_length: batch_seq_l

      }
      

      preds, loss, alpha = model.sess.run([model.softmax_input, model.loss, model.alph],feed_dict=feed_dict)
      model.test_data.pack_ext_matrix('alpha', alpha, tmp_batch_ids)
      c_loss += list(loss)
      rev_map = {v: k for k, v in model.mappingitem2idx.items()}
      return [rev_map[i] for i in feed_dict[model.inputs][0]], alpha[0][0], pd.DataFrame(data=np.asanyarray(preds.reshape(len(preds[0]), 1)), index=list(model.mappingitem2idx.keys()))[0]

stamp_predict_next(model, session_id=df_test.iloc[0]["SessionId"], input_item_id=0)

(['P130_C0'],
 array([-0.11206327], dtype=float32),
 P129_C4    1.994417
 P130_C0    9.084502
 P37_C3     3.378508
 P297_C3    4.574994
 P300_C4    3.752305
              ...   
 P123_C5   -5.175893
 P51_C3    -0.170526
 P85_C2    -4.190967
 P281_C2   -4.053764
 P366_C1   -3.824155
 Name: 0, Length: 190, dtype: float32)

In [12]:
predictions = []
for index, row in df_test.groupby('SessionId', observed=True)['ItemId'].apply(list).reset_index().iterrows(): #df_test.groupby("SessionId", observed=True).count().reset_index().iterrows():
  session_id = row["SessionId"]
  session_length = len(row["ItemId"])
  # Subsessions
  for i in range(MIN_TEST_SESSIONS_SIZE, session_length-1):
    predictions.append([row["ItemId"][i+1]] + list(stamp_predict_next(model, session_id=session_id, input_item_id=i-1)))

predictions[:2]

[['P53_C4',
  ['P130_C0',
   'P130_C0',
   'P37_C3',
   'P37_C3',
   'P297_C3',
   'P297_C3',
   'P310_C1',
   'P310_C1',
   'P309_C3',
   'P309_C3'],
  array([0.00071505, 0.00071505, 0.00598137, 0.00598137, 0.00819911,
         0.00819911, 0.32977232, 0.32977232, 0.02174826, 0.02174826],
        dtype=float32),
  P129_C4    -9.320343
  P130_C0    -9.042594
  P37_C3      5.511071
  P297_C3    -4.179463
  P300_C4    -3.532456
               ...    
  P123_C5   -11.492845
  P51_C3     -4.498158
  P85_C2    -13.700951
  P281_C2   -15.509074
  P366_C1   -11.568974
  Name: 0, Length: 190, dtype: float32],
 ['P322_C0',
  ['P130_C0',
   'P130_C0',
   'P37_C3',
   'P37_C3',
   'P297_C3',
   'P297_C3',
   'P310_C1',
   'P310_C1',
   'P309_C3',
   'P309_C3',
   'P53_C4'],
  array([-0.00052725, -0.00052725,  0.00307902,  0.00307902,  0.01311039,
          0.01311039,  0.5233297 ,  0.5233297 ,  0.04933532,  0.04933532,
          0.00087341], dtype=float32),
  P129_C4    -9.646071
  P130_C0   -12.3

In [13]:
def predictions_to_dataframe(predictions):
    df_preds = pd.DataFrame(predictions, columns=['Expected', 'Model_input', 'Model_attention', 'Model_prediction'])
    df_preds["Expected"] = [a for a,b,c,d in predictions]
    df_preds["Model_input"] = [list(b) for a,b,c,d in predictions]
    df_preds["Model_attention"] = [list(c) for a,b,c,d in predictions]
    df_preds["Model_prediction"] = [list(d.sort_values(ascending=False).index) for a,b,c,d in predictions]
    df_preds["HIT_"+str(HIT_RATE_LEARNING)] = [float(i in df_preds["Model_prediction"].iloc[idx][:HIT_RATE_LEARNING]) for idx, i in enumerate(df_preds["Expected"].values)]
    df_preds["HIT_"+str(HIT_RATE_EVAL)] = [float(i in df_preds["Model_prediction"].iloc[idx][:HIT_RATE_EVAL]) for idx, i in enumerate(df_preds["Expected"].values)]
    return df_preds

df_preds = predictions_to_dataframe(predictions)
display(df_preds)

Unnamed: 0,Expected,Model_input,Model_attention,Model_prediction,HIT_5,HIT_20
0,P53_C4,"[P130_C0, P130_C0, P37_C3, P37_C3, P297_C3, P2...","[0.00071504666, 0.00071504666, 0.0059813675, 0...","[P309_C3, P311_C2, P312_C3, P3_C3, P41_C4, P42...",0.0,1.0
1,P322_C0,"[P130_C0, P130_C0, P37_C3, P37_C3, P297_C3, P2...","[-0.00052725174, -0.00052725174, 0.0030790204,...","[P53_C4, P320_C2, P37_C3, P313_C1, P317_C2, P1...",0.0,1.0
2,P322_C0,"[P130_C0, P130_C0, P37_C3, P37_C3, P297_C3, P2...","[-0.0016549662, -0.0016549662, 0.0006530368, 0...","[P53_C4, P320_C2, P37_C3, P313_C1, P317_C2, P1...",0.0,1.0
3,P41_C4,"[P130_C0, P130_C0, P37_C3, P37_C3, P297_C3, P2...","[0.021361394, 0.021361394, 0.011495762, 0.0114...","[P322_C0, P321_C3, P41_C4, P70_C2, P312_C3, P4...",1.0,1.0
4,P41_C4,"[P130_C0, P130_C0, P37_C3, P37_C3, P297_C3, P2...","[0.019003412, 0.019003412, -0.0050029727, -0.0...","[P312_C3, P41_C4, P322_C0, P70_C2, P4_C4, P332...",1.0,1.0
...,...,...,...,...,...,...
75511,P375_C3,"[P132_C1, P287_C0, P134_C1, P121_C0, P2_C2, P5...","[0.0014097501, 0.013776485, 0.0019696476, 0.00...","[P92_C2, P367_C0, P368_C3, P91_C4, P100_C1, P9...",0.0,0.0
75512,P378_C3,"[P132_C1, P287_C0, P134_C1, P121_C0, P2_C2, P5...","[0.0023575264, -0.001487946, 0.003095106, 0.02...","[P368_C3, P285_C3, P100_C1, P370_C3, P374_C3, ...",0.0,1.0
75513,P380_C4,"[P132_C1, P287_C0, P134_C1, P121_C0, P2_C2, P5...","[0.0008022963, -0.010590725, -0.0056620254, 0....","[P112_C2, P378_C3, P380_C4, P37_C3, P376_C2, P...",1.0,1.0
75514,P378_C3,"[P132_C1, P287_C0, P134_C1, P121_C0, P2_C2, P5...","[0.0011820245, 0.011323961, -0.0019836011, 0.0...","[P378_C3, P380_C4, P37_C3, P312_C3, P121_C0, P...",1.0,1.0


In [14]:
def hit_rates(df_preds):
    hit_rate_1 = df_preds["HIT_"+str(HIT_RATE_LEARNING)].mean()
    hit_rate_1 = "Hit rate (P@"+str(HIT_RATE_LEARNING)+"): "+str(hit_rate_1)

    hit_rate_2 = df_preds["HIT_"+str(HIT_RATE_EVAL)].mean()
    hit_rate_2 = "Hit rate (P@"+str(HIT_RATE_EVAL)+"): "+str(hit_rate_2)

    return hit_rate_1, hit_rate_2

stamp_hit_rate_1, stamp_hit_rate_2 = hit_rates(df_preds)
print(stamp_hit_rate_1)
print(stamp_hit_rate_2)

Hit rate (P@5): 0.5015360983102919
Hit rate (P@20): 0.8681339053975317


## Interface STAMP/LFIT

In [15]:
def normalize_and_apply_attention(df):
  # Rename columns to LFIT input style
  df = df[["Model_input","Model_attention","Model_prediction"]]
  df.rename(columns={"Model_input": "Features", "Model_attention": "Attention", "Model_prediction": "Targets"}, inplace=True)
  df
  # Normalize attention
  normalized_attention = []
  for index, row in df.iterrows():
    min_val = min(row["Attention"])
    max_val = max(row["Attention"])
    normalized_attention.append([round((i - min_val) / (max_val - min_val + 0.0001),2) for i in row["Attention"]])
  df["Attention"] = normalized_attention

  # Apply attention mask
  masked_features = []
  for index, row in df.iterrows():
    masked_features.append([i if row["Attention"][idx] >= MIN_ATTENTION else "?" for idx, i in enumerate(row["Features"][:-1])]+[row["Features"][-1]])
  df["Features"] = masked_features
  return df

df_stamp = normalize_and_apply_attention(df_preds)
df_stamp

Unnamed: 0,Features,Attention,Targets
0,"[?, ?, ?, ?, ?, ?, P310_C1, P310_C1, ?, P309_C3]","[0.0, 0.0, 0.02, 0.02, 0.02, 0.02, 1.0, 1.0, 0...","[P309_C3, P311_C2, P312_C3, P3_C3, P41_C4, P42..."
1,"[?, ?, ?, ?, ?, ?, P310_C1, P310_C1, ?, ?, P53...","[0.0, 0.0, 0.01, 0.01, 0.03, 0.03, 1.0, 1.0, 0...","[P53_C4, P320_C2, P37_C3, P313_C1, P317_C2, P1..."
2,"[?, ?, ?, ?, ?, ?, P310_C1, P310_C1, ?, ?, ?, ...","[0.0, 0.0, 0.0, 0.0, 0.02, 0.02, 1.0, 1.0, 0.0...","[P53_C4, P320_C2, P37_C3, P313_C1, P317_C2, P1..."
3,"[?, ?, ?, ?, ?, ?, P310_C1, P310_C1, ?, ?, ?, ...","[0.11, 0.11, 0.09, 0.09, 0.14, 0.14, 1.0, 1.0,...","[P322_C0, P321_C3, P41_C4, P70_C2, P312_C3, P4..."
4,"[?, ?, ?, ?, ?, ?, P310_C1, P310_C1, ?, ?, ?, ...","[0.16, 0.16, 0.11, 0.11, 0.17, 0.17, 1.0, 1.0,...","[P312_C3, P41_C4, P322_C0, P70_C2, P4_C4, P332..."
...,...,...,...
75511,"[?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ...","[0.12, 0.2, 0.13, 0.16, 0.14, 0.15, 0.1, 0.14,...","[P92_C2, P367_C0, P368_C3, P91_C4, P100_C1, P9..."
75512,"[?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ...","[0.11, 0.09, 0.11, 0.23, 0.0, 0.11, 0.0, 0.15,...","[P368_C3, P285_C3, P100_C1, P370_C3, P374_C3, ..."
75513,"[?, ?, ?, ?, ?, ?, ?, ?, ?, P310_C1, ?, ?, ?, ...","[0.06, 0.0, 0.03, 0.16, 0.0, 0.04, 0.0, 0.1, 0...","[P112_C2, P378_C3, P380_C4, P37_C3, P376_C2, P..."
75514,"[?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ...","[0.12, 0.16, 0.11, 0.17, 0.0, 0.12, 0.07, 0.14...","[P378_C3, P380_C4, P37_C3, P312_C3, P121_C0, P..."


# Attention model 2: NARM

In [16]:
ts = time.time()

model = NARM(epochs=NARM_EPOCHS,session_key='SessionId', item_key='ItemId')
model.fit(df_train,df_test)

narm_run_time = (time.time() - ts)
print('Total time: ', narm_run_time)

model options {'self': <algorithms.NARM.narm.NARM object at 0x00000214A7F59390>, 'dim_proj': 100, 'hidden_units': 100, 'patience': 5, 'max_epochs': 20, 'dispFreq': 10000, 'lrate': 0.001, 'n_items': 191, 'encoder': 'gru', 'saveto': 'gru_model.npz', 'is_valid': True, 'is_save': False, 'batch_size': 512, 'valid_batch_size': 512, 'use_dropout': True, 'reload_model': None, 'test_size': -1}
Loading data
Building model
Optimization
343451 train examples
38161 valid examples
Best perfomance updated!
Valid Recall@20: 0.6004035533660019    Valid Mrr@20: 0.22054068931903537
Seen 343451 samples


This epoch took 288.3s


Best perfomance updated!
Valid Recall@20: 0.6238830219333875    Valid Mrr@20: 0.23833512437027596
Seen 343451 samples


This epoch took 290.4s


Best perfomance updated!
Valid Recall@20: 0.6343911323078536    Valid Mrr@20: 0.24732774206822988
Seen 343451 samples


This epoch took 288.8s


Best perfomance updated!
Valid Recall@20: 0.6413091900107439    Valid Mrr@20: 0.2524384464377827
Seen 343451 samples


This epoch took 296.6s


Best perfomance updated!
Valid Recall@20: 0.6442441235816672    Valid Mrr@20: 0.2539681761225335
Seen 343451 samples


This epoch took 295.6s


Best perfomance updated!
Valid Recall@20: 0.6490395953984435    Valid Mrr@20: 0.2583560509042313
Seen 343451 samples


This epoch took 298.0s


Best perfomance updated!
Valid Recall@20: 0.6515814575089751    Valid Mrr@20: 0.26184884411081105
Seen 343451 samples


This epoch took 298.2s


Best perfomance updated!
Valid Recall@20: 0.6558004245171772    Valid Mrr@20: 0.26418049711068947
Seen 343451 samples


This epoch took 295.2s


Best perfomance updated!
Valid Recall@20: 0.656272110269647    Valid Mrr@20: 0.26504957180221494
Seen 343451 samples


This epoch took 292.4s


Best perfomance updated!
Valid Recall@20: 0.6587091533240743    Valid Mrr@20: 0.26753147721130405
Seen 343451 samples


This epoch took 292.4s


Valid Recall@20: 0.6593904771887529    Valid Mrr@20: 0.26716419765502963
Seen 343451 samples


This epoch took 292.9s


Best perfomance updated!
Valid Recall@20: 0.6594952962448574    Valid Mrr@20: 0.2680190744045872
Seen 343451 samples


This epoch took 293.5s


Best perfomance updated!
Valid Recall@20: 0.6614606535468148    Valid Mrr@20: 0.26959872986916394
Seen 343451 samples


This epoch took 292.7s


Best perfomance updated!
Valid Recall@20: 0.6628757108042242    Valid Mrr@20: 0.2715802887121086
Seen 343451 samples


This epoch took 293.7s


Epoch  14 Update  10000 Loss  3.7373764505089446
Best perfomance updated!
Valid Recall@20: 0.6622205917035717    Valid Mrr@20: 0.27239153259649423
Seen 343451 samples


This epoch took 292.8s


Best perfomance updated!
Valid Recall@20: 0.6638190823091638    Valid Mrr@20: 0.27354443862756
Seen 343451 samples


This epoch took 293.2s


Valid Recall@20: 0.6605434868059014    Valid Mrr@20: 0.2731259047092249
current validation mrr: 0.2731259047092249      history max mrr:0.27354443862756
Seen 343451 samples


This epoch took 294.1s


Best perfomance updated!
Valid Recall@20: 0.6625874583999372    Valid Mrr@20: 0.27533267719103
Seen 343451 samples


This epoch took 295.2s


Best perfomance updated!
Valid Recall@20: 0.6656534157909908    Valid Mrr@20: 0.2770852056199444
Seen 343451 samples


This epoch took 295.5s


Best perfomance updated!
Valid Recall@20: 0.6648410681061817    Valid Mrr@20: 0.2796862491152207
Seen 343451 samples
Valid Recall@20: nan    Valid Mrr@20: nan
Total time:  5895.339156389236


This epoch took 295.5s


In [17]:
def narm_predict_next(model, session):
    '''
    Gives predicton scores for a selected set of items on how likely they be the next item in the session.
            
    Parameters
    --------
    session_id : int or string
        The session IDs of the event.
    input_item_id : int or string
        The item ID of the event.
    predict_for_item_ids : 1D array
        IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.
        
    Returns
    --------
    out : pandas.Series
        Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs.
    
    '''
    
    #model.session = session_id
    model.session_items = list(session)
    
    x = [model.itemmap[model.session_items].values]
    y = x
    
    x, mask, y = model.prepare_data(x,y)
    preds = model.pred_function(x, mask)
    attention = model.attention(x, mask)[0]

    return session, attention, pd.Series(data=preds[0][1:], index=model.itemmap.index)

narm_predict_next(model, [0,0,0,0])

([0, 0, 0, 0],
 array([0.28536559, 0.12624121, 0.444803  , 0.14359021]),
 P129_C4    2.267247e-02
 P130_C0    1.454065e-01
 P37_C3     6.113324e-03
 P297_C3    5.710168e-03
 P300_C4    5.621128e-03
                ...     
 P123_C5    3.839364e-06
 P51_C3     3.521303e-05
 P85_C2     5.701542e-07
 P281_C2    6.693610e-06
 P366_C1    6.157276e-06
 Length: 190, dtype: float64)

In [18]:
predictions = []
item_ids = set(list(df_train["ItemId"].unique())+list(df_test["ItemId"].unique()))
for index, row in df_test.groupby('SessionId', observed=True)['ItemId'].apply(list).reset_index().iterrows(): #df_test.groupby("SessionId", observed=True).count().reset_index().iterrows():
  session_id = row["SessionId"]
  session_length = len(row["ItemId"])
  # Full sessions
  #predictions.append(predict_next(model, session_id=row["SessionId"], input_item_id=session_length-2))
  # Subsessions
  for i in range(MIN_TEST_SESSIONS_SIZE, session_length-1):
    predictions.append([row["ItemId"][i+1]] + list(narm_predict_next(model,row["ItemId"][:i])))

predictions[:2]

[['P53_C4',
  ['P130_C0',
   'P130_C0',
   'P37_C3',
   'P37_C3',
   'P297_C3',
   'P297_C3',
   'P310_C1',
   'P310_C1',
   'P309_C3',
   'P309_C3'],
  array([6.55554366e-04, 8.07442603e-05, 7.65997020e-04, 1.59771969e-04,
         8.89250180e-04, 4.96197568e-04, 1.22814265e-01, 2.13323245e-01,
         1.91284182e-01, 4.69530793e-01]),
  P129_C4    7.431257e-08
  P130_C0    2.304535e-07
  P37_C3     6.800238e-02
  P297_C3    7.032774e-05
  P300_C4    7.963466e-06
                 ...     
  P123_C5    7.566138e-06
  P51_C3     1.385904e-04
  P85_C2     4.599589e-08
  P281_C2    9.849973e-10
  P366_C1    2.398802e-06
  Length: 190, dtype: float64],
 ['P322_C0',
  ['P130_C0',
   'P130_C0',
   'P37_C3',
   'P37_C3',
   'P297_C3',
   'P297_C3',
   'P310_C1',
   'P310_C1',
   'P309_C3',
   'P309_C3',
   'P53_C4'],
  array([2.13925176e-04, 1.77958319e-05, 2.50623190e-04, 5.14255376e-05,
         4.41959162e-04, 2.25204717e-04, 1.19774671e-02, 1.62177300e-02,
         1.48508998e-02, 2.2589

In [19]:
df_preds = predictions_to_dataframe(predictions)
display(df_preds)

narm_hit_rate_1, narm_hit_rate_2 = hit_rates(df_preds)
print(narm_hit_rate_1)
print(narm_hit_rate_2)

Unnamed: 0,Expected,Model_input,Model_attention,Model_prediction,HIT_5,HIT_20
0,P53_C4,"[P130_C0, P130_C0, P37_C3, P37_C3, P297_C3, P2...","[0.0006555543662302049, 8.074426027787377e-05,...","[P312_C3, P309_C3, P322_C0, P311_C2, P37_C3, P...",0.0,0.0
1,P322_C0,"[P130_C0, P130_C0, P37_C3, P37_C3, P297_C3, P2...","[0.00021392517572983746, 1.7795831857630927e-0...","[P322_C0, P321_C3, P314_C3, P53_C4, P16_C4, P1...",1.0,1.0
2,P322_C0,"[P130_C0, P130_C0, P37_C3, P37_C3, P297_C3, P2...","[3.670417961491957e-05, 6.013868965158371e-06,...","[P321_C3, P322_C0, P16_C4, P53_C4, P314_C3, P7...",1.0,1.0
3,P41_C4,"[P130_C0, P130_C0, P37_C3, P37_C3, P297_C3, P2...","[2.1812011077987513e-05, 2.9913854714236048e-0...","[P322_C0, P312_C3, P41_C4, P70_C2, P321_C3, P3...",1.0,1.0
4,P41_C4,"[P130_C0, P130_C0, P37_C3, P37_C3, P297_C3, P2...","[6.762624585033666e-06, 1.0158062244673557e-06...","[P312_C3, P41_C4, P322_C0, P332_C5, P70_C2, P3...",1.0,1.0
...,...,...,...,...,...,...
75511,P375_C3,"[P132_C1, P287_C0, P134_C1, P121_C0, P2_C2, P5...","[7.905976243099199e-05, 6.32156831755736e-05, ...","[P378_C3, P380_C4, P374_C3, P370_C3, P375_C3, ...",1.0,1.0
75512,P378_C3,"[P132_C1, P287_C0, P134_C1, P121_C0, P2_C2, P5...","[0.0001226453719627761, 5.821319861341715e-05,...","[P378_C3, P380_C4, P374_C3, P37_C3, P370_C3, P...",1.0,1.0
75513,P380_C4,"[P132_C1, P287_C0, P134_C1, P121_C0, P2_C2, P5...","[4.2156511809465923e-05, 3.396810551344202e-05...","[P378_C3, P380_C4, P374_C3, P370_C3, P37_C3, P...",1.0,1.0
75514,P378_C3,"[P132_C1, P287_C0, P134_C1, P121_C0, P2_C2, P5...","[6.192674923548728e-05, 2.7561346665082338e-05...","[P378_C3, P37_C3, P312_C3, P380_C4, P370_C3, P...",1.0,1.0


Hit rate (P@5): 0.3538323004396419
Hit rate (P@20): 0.7065654960538164


## Interface NARM/LFIT

In [20]:
df_narm = normalize_and_apply_attention(df_preds)
df_narm

Unnamed: 0,Features,Attention,Targets
0,"[?, ?, ?, ?, ?, ?, ?, ?, ?, P309_C3]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.26, 0.45, 0.4...","[P312_C3, P309_C3, P322_C0, P311_C2, P37_C3, P..."
1,"[?, ?, ?, ?, ?, ?, ?, ?, ?, ?, P53_C4]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.02, 0.0...","[P322_C0, P321_C3, P314_C3, P53_C4, P16_C4, P1..."
2,"[?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, P53_C4]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.01...","[P321_C3, P322_C0, P16_C4, P53_C4, P314_C3, P7..."
3,"[?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, P322_C0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[P322_C0, P312_C3, P41_C4, P70_C2, P321_C3, P3..."
4,"[?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, P322_C0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[P312_C3, P41_C4, P322_C0, P332_C5, P70_C2, P3..."
...,...,...,...
75511,"[?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[P378_C3, P380_C4, P374_C3, P370_C3, P375_C3, ..."
75512,"[?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[P378_C3, P380_C4, P374_C3, P37_C3, P370_C3, P..."
75513,"[?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[P378_C3, P380_C4, P374_C3, P370_C3, P37_C3, P..."
75514,"[?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[P378_C3, P37_C3, P312_C3, P380_C4, P370_C3, P..."


## Property extraction

In [21]:
# Program properties extraction
df_tv = pd.read_csv(PROGRAM_PROPERTIES_PATH)
df_tv = df_tv[[c for c in df_tv.columns if c not in ["C"+str(i) for i in range(0,8)]]] # Remove broadcaster channel (we get it from item id)
df_tv

Unnamed: 0,program,olympic-related,game live,news,wide,digest,break,special,report,documentary,...,movie,weekday,weekend,6to9,9to12,12to18,18to21,21to24,24to6,live broadcast
0,P0,1,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
1,P1,1,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,P2,1,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,P3,1,0,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4,P4,1,0,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221,P389,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
222,P391,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223,P393,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
224,P396,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Olympics properties extraction
df_olympics = pd.read_csv(OLYMPICS_PROPERTIES_PATH)
df_olympics.fillna("",inplace=True)
df_olympics

Unnamed: 0,program_id,Game,Opn.,qualifier,semifinal,final,J_medal,Cls.,weekday,weekend,24to6,6to9,9to12,12to18,18to21,21to24
0,P12,softball,0,1,0,0,0,0,1,0,0,0,1,0,0,0
1,P38,soccer,0,1,0,0,0,0,1,0,0,0,0,0,1,0
2,P65,soccer,0,1,0,0,0,0,1,0,0,0,0,0,1,0
3,P57,softball,0,1,0,0,0,0,1,0,0,0,0,1,0,0
4,P74,soccer,0,1,0,0,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,P385,bicycle,0,0,1,1,1,0,0,1,0,0,1,1,0,0
155,P388,rhythmic sports gymnastics,0,0,0,1,0,0,0,1,0,0,1,1,0,0
156,P392,rhythmic sports gymnastics,0,0,0,1,0,0,0,1,0,0,1,1,0,0
157,P387,basketball,0,0,0,1,1,0,0,1,0,0,1,1,0,0


In [23]:
tv_properties = dict()
tv_property_list = df_tv.columns[1:]
for idx, row in df_tv.iterrows():
  program = row["program"]
  if program != "?":
    tv_properties[program] = dict()
    for key in tv_property_list:
      tv_properties[program][key] = row[key] > 0.0

#print(program_properties)

print(tv_properties["P0"])

olympics_properties = dict()
olympics_property_list = df_olympics.columns[1:]
for idx, row in df_olympics.iterrows():
  program = row["program_id"]
  if program != "?":
    olympics_properties[program] = dict()
    for key in olympics_property_list:
      if key == "Game":
          if row["Opn."] > 0.0:
            olympics_properties[program][key] = "opening"
          elif row["Cls."] > 0.0:
            olympics_properties[program][key] = "close"
          else:
            olympics_properties[program][key] = row[key]
      else:
        olympics_properties[program][key] = row[key] > 0.0

#print(program_properties)

print(olympics_properties["P12"])
print(olympics_properties["P74"])
print(olympics_properties["P271"])

#events = list(set(itertools.chain.from_iterable(df["Features"].tolist())))
events = list(tv_properties.keys()) + list(olympics_properties.keys())
print("Extracted events:", events[:5], "...")

def event_watched(input, events):
  return ["watched_"+str(i) for i in events], [i in input for i in events]

print(event_watched(["P0_C0","P0_C0","P1_C0","P0_C2","P3_C0","P3_C1","?"], ["P0_C0","P0_C2","P1_C0","P3_C2","P3_C1"]))

def tv_property_watched(input, properties):
  property_variables = []
  property_values = []
  property_list = properties[list(properties.keys())[0]]
  for p in property_list:
    property_variables.append("watched_tv_"+p)
    p_count = len([i for i in input if i[:i.find("_")] in properties and properties[i[:i.find("_")]][p]])
    property_values.append(p_count > 0)
  return property_variables, property_values

#print(tv_property_watched(["P0_C0", "P1_C1", "P2_C2", "?"], tv_properties))

def olympics_property_watched(input, properties):
  property_variables = []
  property_values = []
  property_list = [p for p in properties[list(properties.keys())[0]] if p != "Game"]
  for p in property_list:
    property_variables.append("watched_olympics_"+p.replace(" ","-"))
    p_count = len([i for i in input if i[:i.find("_")] in properties and properties[i[:i.find("_")]][p]])
    property_values.append(p_count > 0)
  return property_variables, property_values

#print(olympics_property_watched(["P12_C0", "P74_C1", "P271_C4", "?"], olympics_properties))

def channel_watched(input, nb_channels):
  property_variables = []
  property_values = []
  channels = ["C"+str(i) for i in range(0,nb_channels)]
  for c in channels:
    property_variables.append("watched_channel_"+c)
    p_count = len([i for i in input if i[i.find("_")+1:] == c])
    property_values.append(p_count > 0)
  return property_variables, property_values

#print(channel_watched(["P12_C0", "P74_C1", "P271_C4", "?"], NB_BROADCASTERS))

def olympics_property_game(input, properties, df_olympics):
  property_variables = []
  property_values = []
  games = list(df_olympics["Game"].unique())
  for g in games:
    if g != "":
      property_variables.append("watched_olympics_"+g.replace(" ","-"))
      p_count = len([i for i in input if i[:i.find("_")] in properties and properties[i[:i.find("_")]]["Game"] == g])
      property_values.append(p_count > 0)
  return property_variables, property_values

#print(olympics_property_game(["P12_C0", "P74_C1", "P271_C4", "?"], olympics_properties, df_olympics))


def last_program(input):
  program_name = input[-1]
  
  return ["last_program"], [program_name]

print(last_program(["?", "P4_C0", "P1_C2", "P10_C0"]))

{'olympic-related': True, 'game live': False, 'news': True, 'wide': False, 'digest': False, 'break': False, 'special': False, 'report': False, 'documentary': False, 'sports': False, 'drama': False, 'variety': False, 'music': False, 'anime': False, 'shopping': False, 'movie': False, 'weekday': True, 'weekend': False, '6to9': False, '9to12': False, '12to18': False, '18to21': False, '21to24': False, '24to6': True, 'live broadcast': False}
{'Game': 'softball', 'Opn.': False, 'qualifier': True, 'semifinal': False, 'final': False, 'J_medal': False, 'Cls.': False, 'weekday': True, 'weekend': False, '24to6': False, '6to9': False, '9to12': True, '12to18': False, '18to21': False, '21to24': False}
{'Game': 'soccer', 'Opn.': False, 'qualifier': True, 'semifinal': False, 'final': False, 'J_medal': False, 'Cls.': False, 'weekday': True, 'weekend': False, '24to6': False, '6to9': False, '9to12': False, '12to18': True, '18to21': False, '21to24': False}
{'Game': 'volleyball', 'Opn.': False, 'qualifier':

In [24]:
def encode_properties(df_model):
  data = []
  features = []
  for index, row in df_model.iterrows():
    s = row["Features"]
    encoding = last_program(s)[1] + \
    olympics_property_game(s, olympics_properties, df_olympics)[1] + \
    olympics_property_watched(s, olympics_properties)[1] + \
    channel_watched(s, NB_BROADCASTERS)[1] + \
    tv_property_watched(s, tv_properties)[1] + \
    event_watched(s, events)[1]
    for t in row["Targets"][:HIT_RATE_LEARNING]:
      data.append((encoding, [t]))

    if index == 0:
      features = last_program(s)[0] + \
        olympics_property_game(s, olympics_properties, df_olympics)[0] + \
        olympics_property_watched(s, olympics_properties)[0] + \
        channel_watched(s, NB_BROADCASTERS)[0] + \
        tv_property_watched(s, tv_properties)[0] + \
        event_watched(s, events)[0]

  targets = ["next"]
  
  print(len(features),"features:",features)
  print(len(targets),"targets:",targets)
  print(len(events), "target values")
  print(data[:1])
  return data, features, targets

#encode_properties(df_stamp[:1])

# LFIT

In [25]:
def learn_dmvlp(df):
    data, features, targets = encode_properties(df)
    dataset = pylfit.preprocessing.discrete_state_transitions_dataset_from_array(data=data, feature_names=features, target_names=targets)
    #dataset.summary()
    print()

    # Initialize a WDMVLP with the dataset variables and set GULA as learning algorithm
    dmvlp = pylfit.models.DMVLP(features=dataset.features, targets=dataset.targets)
    dmvlp.compile(algorithm=ALGORITHM)

    # Fit the DMVLP on the dataset
    dmvlp.fit(dataset=dataset, heuristics=HEURISTICS, verbose=0, threads=THREADS)
    #model.summary()
    return dataset, dmvlp

In [26]:
ts = time.time()

dataset_stamp, dmvlp_stamp = learn_dmvlp(df_stamp)

lfit_stamp_run_time = (time.time() - ts)
print('Total time: ', lfit_stamp_run_time)

463 features: ['last_program', 'watched_olympics_softball', 'watched_olympics_soccer', 'watched_olympics_archery', 'watched_olympics_badminton', 'watched_olympics_table-tennis', 'watched_olympics_weightlifting', 'watched_olympics_judo', 'watched_olympics_volleyball', 'watched_olympics_swimming', 'watched_olympics_skateboard', 'watched_olympics_3on3-basket', 'watched_olympics_7man-rugby', 'watched_olympics_gymnastics', 'watched_olympics_surfing', 'watched_olympics_baseball', 'watched_olympics_basketball', 'watched_olympics_athletics', 'watched_olympics_trampoline', 'watched_olympics_fencing', 'watched_olympics_triathlon', 'watched_olympics_golf', 'watched_olympics_tennis', 'watched_olympics_bicycle', 'watched_olympics_boxing', 'watched_olympics_dive', 'watched_olympics_wrestling', 'watched_olympics_volleyballwrestling', 'watched_olympics_sailing', 'watched_olympics_karate', 'watched_olympics_rhythmic-sports-gymnastics', 'watched_olympics_sports-climbing', 'watched_olympics_artistic-swim

In [27]:
ts = time.time()

dataset_narm, dmvlp_narm = learn_dmvlp(df_narm)

lfit_narm_run_time = (time.time() - ts)
print('Total time: ', lfit_narm_run_time)

463 features: ['last_program', 'watched_olympics_softball', 'watched_olympics_soccer', 'watched_olympics_archery', 'watched_olympics_badminton', 'watched_olympics_table-tennis', 'watched_olympics_weightlifting', 'watched_olympics_judo', 'watched_olympics_volleyball', 'watched_olympics_swimming', 'watched_olympics_skateboard', 'watched_olympics_3on3-basket', 'watched_olympics_7man-rugby', 'watched_olympics_gymnastics', 'watched_olympics_surfing', 'watched_olympics_baseball', 'watched_olympics_basketball', 'watched_olympics_athletics', 'watched_olympics_trampoline', 'watched_olympics_fencing', 'watched_olympics_triathlon', 'watched_olympics_golf', 'watched_olympics_tennis', 'watched_olympics_bicycle', 'watched_olympics_boxing', 'watched_olympics_dive', 'watched_olympics_wrestling', 'watched_olympics_volleyballwrestling', 'watched_olympics_sailing', 'watched_olympics_karate', 'watched_olympics_rhythmic-sports-gymnastics', 'watched_olympics_sports-climbing', 'watched_olympics_artistic-swim

## Postprocessing

In [28]:
def postprocessing(model, dataset):
  print("Computing rules weight")
  weighted_rules = {}
  encoded_data = [(pylfit.algorithms.Algorithm.encode_state(s1, dataset.features), pylfit.algorithms.Algorithm.encode_state(s2, dataset.targets)) for s1,s2 in dataset.data]

  # Compute Weight
  for var in range(len(model.targets)):
      for val in range(len(model.targets[var][1])):
          weighted_rules[(var,val)] = []

  total = len(model.rules)
  for id, r in enumerate(model.rules):
      print("\r"+str(id+1)+"/"+str(total), end="")
      weight = 0
      total_head = 0
      for s1,s2 in encoded_data:
          if s2[r.head_variable] == r.head_value:
            total_head += 1
            if r.matches(s1):
              weight += 1
      #if weight > 0:
      weighted_rules[(var,val)].append((weight,total_head,r))

  rules = [(w,t,r) for key,values in weighted_rules.items() for w,t,r in values]
  print()

  # Rule Cleaning
  selected_rules = {}
  for var_id, (var_label, vals) in enumerate(model.targets):
    selected_rules[var_label] = {}
    for val_id, val_label in enumerate(vals):
      selected_rules[var_label][val_label] = sorted([(w,t,r) for w,t,r in rules if w > 0 and r.head_variable == var_id and r.head_value == val_id],
                                            key=lambda x: x[0], reverse=True)[:MAX_OUTPUT_RULE_PER_TARGET]

  output = ""
  for var in selected_rules:
    for val in selected_rules[var]:
      output +="Best rules of "+val+"\n"
      for w,t,r in selected_rules[var][val]:
        percent = round(w/t*100,2)
        ratio = str(w)+"/"+str(t)
        output += "> "+str(percent)+"% ("+ratio+") "+r.logic_form(model.features,model.targets)+"\n"
      output += "\n"

  return output

# Selected Rules

In [29]:
ts = time.time()

output = postprocessing(dmvlp_stamp, dataset_stamp)

with open(PAKDD2023_OUTPUT_FILE_PATH_STAMP, 'w') as f:
    f.write(output)

print(output)

postprocesing_lfit_stamp_run_time = (time.time() - ts)
print('Total time: ', postprocesing_lfit_stamp_run_time)

Computing rules weight
45848/45848
Best rules of P0_C0
> 23.87% (290/1215) next(P0_C0) :- last_program(P0_C0), watched_olympics_golf(False).
> 23.13% (281/1215) next(P0_C0) :- last_program(P0_C0), watched_olympics_athletics(False).
> 7.9% (96/1215) next(P0_C0) :- last_program(P0_C0), watched_olympics_soccer(True).
> 5.51% (67/1215) next(P0_C0) :- last_program(P0_C0), watched_olympics_table-tennis(True).
> 3.13% (38/1215) next(P0_C0) :- last_program(P1_C1), watched_olympics_soccer(False), watched_olympics_badminton(False), watched_olympics_table-tennis(False), watched_olympics_skateboard(False), watched_channel_C2(True).
> 2.96% (36/1215) next(P0_C0) :- last_program(P1_C1), watched_olympics_badminton(False), watched_olympics_table-tennis(False), watched_olympics_baseball(False), watched_olympics_bicycle(True), watched_channel_C5(False).
> 2.96% (36/1215) next(P0_C0) :- last_program(P1_C1), watched_olympics_soccer(False), watched_olympics_table-tennis(False), watched_olympics_baseball(Fa

In [30]:
ts = time.time()
output = postprocessing(dmvlp_narm, dataset_narm)

with open(PAKDD2023_OUTPUT_FILE_PATH_NARM, 'w') as f:
    f.write(output)

print(output)

postprocesing_lfit_narm_run_time = (time.time() - ts)
print('Total time: ', postprocesing_lfit_narm_run_time)

Computing rules weight
52910/52910
Best rules of P0_C0
> 23.87% (132/553) next(P0_C0) :- watched_olympics_soccer(False), watched_olympics_skateboard(False), watched_olympics_wrestling(False), watched_channel_C1(False), watched_channel_C2(False), watched_channel_C3(False), watched_channel_C4(False), watched_channel_C5(False), watched_tv_news(True), watched_tv_9to12(False), watched_tv_12to18(False), watched_tv_18to21(False), watched_tv_21to24(False).
> 23.33% (129/553) next(P0_C0) :- last_program(P0_C0), watched_olympics_soccer(False), watched_olympics_skateboard(False), watched_olympics_wrestling(False), watched_channel_C1(False), watched_channel_C2(False), watched_channel_C3(False), watched_channel_C4(False), watched_channel_C5(False), watched_tv_digest(False), watched_tv_variety(False), watched_tv_9to12(False).
> 9.95% (55/553) next(P0_C0) :- last_program(P42_C0), watched_olympics_soccer(False), watched_olympics_wrestling(False), watched_channel_C1(False), watched_channel_C2(False), w

# Summary

In [31]:
pipeline_total_time = (time.time() - pipeline_start_time)

print("Number of sesions:", OTT_SESSIONS_SAMPLE)
print("STAMP epochs:", STAMP_EPOCHS)
print("NARM epochs:", NARM_EPOCHS)
print("LFIT threads:", THREADS)

print('Total time: ', round(pipeline_total_time,2))
print()
print('> Dataset preprocess time: ',  round(dataset_processing_time,2))
print()
print('> STAMP/LFIT run time: ',  round(stamp_run_time+lfit_stamp_run_time,2))
print('>> STAMP run time: ',  round(stamp_run_time,2))
print('>> LFIT PRIDE run time: ',  round(lfit_stamp_run_time,2))
print('>> LFIT postprocessing run time: ',  round(postprocesing_lfit_stamp_run_time,2))
print()
print('> NARM/LFIT run time: ',  round(narm_run_time+lfit_narm_run_time,2))
print('>> NARM run time: ',  round(narm_run_time,2))
print('>> LFIT PRIDE run time: ',  round(lfit_narm_run_time,2))
print('>> LFIT postprocessing run time: ',  round(postprocesing_lfit_narm_run_time,2))
print()
print("STAMP hit rates:")
print(stamp_hit_rate_1)
print(stamp_hit_rate_2)
print("NARM hit rates:")
print(narm_hit_rate_1)
print(narm_hit_rate_2)

Number of sesions: 10000
STAMP epochs: 20
NARM epochs: 20
LFIT threads: 8
Total time:  23042.61

> Dataset preprocess time:  1.59

> STAMP/LFIT run time:  2877.48
>> STAMP run time:  1319.05
>> LFIT PRIDE run time:  1558.43
>> LFIT postprocessing run time:  5538.54

> NARM/LFIT run time:  7280.23
>> NARM run time:  5895.34
>> LFIT PRIDE run time:  1384.89
>> LFIT postprocessing run time:  6601.98

STAMP hit rates:
Hit rate (P@5): 0.5015360983102919
Hit rate (P@20): 0.8681339053975317
NARM hit rates:
Hit rate (P@5): 0.3538323004396419
Hit rate (P@20): 0.7065654960538164
