# Practical Work

In [114]:
import pandas as pd

## Create the dataset files for RecBole

### Dataset structure
RecBole knowledge-aware datasets require three files: .inter, .kg and .link.

| File | Description |
|------|-------------|
|.inter|User-Item interaction|
|.kg| head, relation, tail|
|.link|item_id to entity_id|

## Create atomic files

Add RecBole type suffixes (e.g. `:token`) to the column names of the knowledge graph (KG) generated by David.

In [115]:
# Name of the LFM subset and the file-name prefixes derived from it.
dataset = '75percent_subset'
path = 'data/lfm'
# prefix of the intermediate knowledge-graph files
kg_path = f'{path}/intermediate_kg/{dataset}'
# prefix of the users / tracks / listening-events TSV files
prefixed_path = f'{path}/{dataset}'

In [116]:
# Read the tab-separated (head, relation, tail) triples and give the columns RecBole-typed names.
kg_columns = ['head_id:token', 'relation_id:token', 'tail_id:token']
kg = pd.read_csv('%s_kg.txt' % kg_path, sep='\t', names=kg_columns)

In [117]:
kg.head()

Unnamed: 0,head_id:token,relation_id:token,tail_id:token
0,t31469731,has_micro_genre,indie rock
1,t31469731,has_micro_genre,indie pop
2,t31469731,has_micro_genre,rock
3,t31469731,has_micro_genre,alternative rock
4,t31469731,has_micro_genre,synthpop


In [118]:
kg['relation_id:token'].unique()

array(['has_micro_genre', 'has_gender', 'listened_to', 'lives_in',
       'created_by', 'in_album', 'has_genre'], dtype=object)

In [119]:
# Drop the 'listened_to' triples before exporting the knowledge graph
# (presumably because user-item interactions are exported separately as the .inter file).
is_listen_event = kg['relation_id:token'] == 'listened_to'
kg_no_listen_events = kg[~is_listen_event]
kg_no_listen_events.to_csv('data/rb_lfm/rb_lfm.kg', sep='\t', index=False)

In [120]:
# Export the user table as a RecBole .user file.
# skiprows=[0] discards the first line of the TSV (presumably its header row - confirm);
# the RecBole-typed column names below are used instead.
user_columns = ['user_id:token', 'country:token', 'age:token', 'gender:token', 'creation_time:token']
users_file = '%s_users.tsv' % prefixed_path
users = pd.read_csv(users_file, sep='\t', skiprows=[0], names=user_columns)
users.to_csv('data/rb_lfm/rb_lfm.user', sep='\t', index=False)

In [121]:
# Export the track table as a RecBole .item file.
item_columns = ['item_id:token', 'artist:token', 'track:token']
tracks_file = '%s_tracks.tsv' % prefixed_path
items = pd.read_csv(tracks_file, sep='\t', skiprows=[0], names=item_columns)
items.to_csv('data/rb_lfm/rb_lfm.item', sep='\t', index=False)

# .link file: one row per item, mapping the item id to an entity id
# that is the same id prefixed with 't'.
track_ids = items[['item_id:token']].assign(
    **{'entity_id:token': 't' + items['item_id:token'].astype(str)}
)
track_ids.to_csv('data/rb_lfm/rb_lfm.link', sep='\t', index=False)

In [122]:
# Export the listening events as the RecBole .inter (user-item interaction) file.
inter_columns = ['user_id:token', 'item_id:token', 'album_id:token', 'timestamp:token']
events_file = '%s_listening_events.tsv' % prefixed_path
listening_events = pd.read_csv(events_file, sep='\t', skiprows=[0], names=inter_columns)
listening_events.to_csv('data/rb_lfm/rb_lfm.inter', sep='\t', index=False)

In [123]:
# Release the raw data frames; everything needed from them has been written to disk above.
del kg, kg_no_listen_events, items, track_ids, users, listening_events

## Demo run model

In [124]:
#from recbole.quick_start import run_recbole
#run_recbole(model='KGAT', dataset='rb_lfm', config_file_list=['lfm.yaml'])

# Custom pipeline

### Load config and create Dataset

In [1]:
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.knowledge_aware_recommender import KGAT
from recbole.trainer import KGATTrainer
from recbole.utils import init_seed, init_logger

In [2]:

# Custom RecBole pipeline setup: config -> seed -> logger -> dataset -> train/valid/test split.

# configurations initialization
config = Config(model='KGAT', dataset='rb_lfm', config_file_list=['lfm.yaml'])

# init random seed
init_seed(config['seed'], config['reproducibility'])

# logger initialization
init_logger(config)
logger = getLogger()

# write config info into log
logger.info(config)

# dataset creating and filtering
# NOTE: this rebinds the name `dataset`, which the data-preparation cells use for the subset name string.
dataset = create_dataset(config)
logger.info(dataset)

# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)


17 Feb 00:26    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = ./data/rb_lfm
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 100
train_batch_size = 256
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [8, 1, 1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = NDCG@10
valid_metric_bigger = True
eval_batch_size = 256
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator =  
USER_ID_FIELD = 

In [None]:
len(train_data)

### Train model and evaluate

In [None]:
# model loading and initialization
# (the model is built from train_data._dataset, a private attribute of train_data)
model = KGAT(config, train_data._dataset).to(config['device'])
logger.info(model)

# trainer loading and initialization
trainer = KGATTrainer(config, model)

print('Starting to fit model')
# model training
# saved=True - presumably writes the checkpoint that the later "Load best model" cells read from 'saved/'; confirm
best_valid_score, best_valid_result = trainer.fit(train_data, valid_data, saved=True, show_progress=True)

print('Evaluating model')
# model evaluation
# load_best_model=True asks the trainer to evaluate with the best model rather than the final state
test_result = trainer.evaluate(test_data, load_best_model=True)
print(test_result)

  d_inv = np.power(rowsum, -1).flatten()
  indices = torch.LongTensor([final_adj_matrix.row, final_adj_matrix.col])
17 Feb 00:29    INFO  KGAT(
  (user_embedding): Embedding(14993, 48)
  (entity_embedding): Embedding(1184321, 48)
  (relation_embedding): Embedding(8, 48)
  (trans_w): Embedding(8, 2304)
  (aggregator_layers): ModuleList(
    (0): Aggregator(
      (message_dropout): Dropout(p=0.1, inplace=False)
      (W1): Linear(in_features=48, out_features=64, bias=True)
      (W2): Linear(in_features=48, out_features=64, bias=True)
      (activation): LeakyReLU(negative_slope=0.01)
    )
    (1): Aggregator(
      (message_dropout): Dropout(p=0.1, inplace=False)
      (W1): Linear(in_features=64, out_features=32, bias=True)
      (W2): Linear(in_features=64, out_features=32, bias=True)
      (activation): LeakyReLU(negative_slope=0.01)
    )
  )
  (tanh): Tanh()
  (mf_loss): BPRLoss()
  (reg_loss): EmbLoss()
)
Trainable parameters: 57596320


Starting to fit model


[1;35mTrain     0[0m:   0%|          | 0/37128 [00:00<?, ?it/s]

In [None]:
test_result = trainer.evaluate(test_data, load_best_model=True)
print(test_result)

# Load best model

In [None]:
import os
import torch

In [None]:
# os.listdir() returns entries in arbitrary order, so indexing with [-1] does not
# reliably give the newest checkpoint; pick the most recently modified file instead.
latest_model = max(
    ('saved/%s' % name for name in os.listdir('saved')),
    key=os.path.getmtime,
)

In [None]:
latest_model

In [None]:
# Map the checkpoint tensors onto the device the model is moved to, instead of a
# hard-coded 'cuda', so loading also works when config['device'] is the CPU.
state_dict = torch.load(latest_model, map_location=config['device'])
model = KGAT(config, train_data._dataset).to(config['device'])

# the checkpoint is a dict; the model weights are stored under its 'state_dict' key
model.load_state_dict(state_dict['state_dict'])

## Some model investigation

The model contains entity and user embeddings

In [None]:
model.entity_embedding.weight.shape

`_get_ego_embeddings` returns the current user and entity embeddings, concatenated into a single tensor.

In [None]:
ego_embeddings = model._get_ego_embeddings()
ego_embeddings.shape

In [None]:
from recbole.data.interaction import Interaction
input_interactions = Interaction({
    'user_id': torch.tensor([1]),
    'item_id_list': torch.tensor([]),
    'item_length': torch.tensor([])
})
predictions = model.full_sort_predict(input_interactions)

In [None]:
predictions.shape

In [None]:
dataset

## The attention matrix

The attention matrix is an $n \times n$ matrix, where $n = n_u + n_e$, with $n_u$ being the number of users and $n_e$ the number of entities.

The rows of the matrix indicate the head, the columns the tail and the values are the $\pi(h,r,t)$ values stated in the paper as

$$
\pi(h,r,t) = (W_r e_t)^\top \tanh\left(W_r e_h + e_r\right)
$$

So the attention matrix is more or less a graph with all relations and the corresponding attention value for a connection between head and tail.

In [None]:
attention_matrix = model.A_in

In [None]:
attention_matrix

In [None]:
attention_matrix.coalesce()

In [None]:
134531 + 14068

In [None]:
type(dataset.ckg_graph(form="dgl", value_field="relation_id"))

In [None]:
user, items = model()
user

In [None]:
user.shape

In [None]:
model._get_ego_embeddings().shape

In [None]:
user[0][:16]

In [None]:
user[0][16:]

In [None]:
items.shape

In [None]:
attention_matrix = model.A_in.cpu().coalesce()

In [None]:
size = attention_matrix.size()[0]

# Try to convert sparse attention matrix to scipy sparse coo matrix

The attention matrix should reflect a directed, weighted graph with an attention score for each connection in the graph. Following those connections and picking the shortest path between two nodes should result in an explainable path.

In [None]:
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import shortest_path
import numpy as np

In [None]:
# Build the SciPy graph: edge weight = inverse attention, so a higher attention value
# yields a smaller weight.
edge_weights = 1/attention_matrix.values().numpy()
edge_rows = attention_matrix.indices()[0].numpy()
edge_cols = attention_matrix.indices()[1].numpy()
matrix = coo_matrix((edge_weights, (edge_rows, edge_cols)), shape=(size,size))

In [None]:
dataset

In [None]:
dataset.token2id(dataset.uid_field, ['2'])

In [None]:
dataset.id2token(dataset.uid_field, [1])

In [None]:
dataset.fields

In [None]:
# Peek at the first validation batch only: print its user ids and the full-sort scores.
# (The former counter `i` was never read and its increment sat after `break`, so it was dead code.)
model.eval()
for batch_idx, batched_data in enumerate(valid_data):
    interaction, history_index, positive_u, positive_i = batched_data
    print(interaction['user_id'])
    predictions = model.full_sort_predict(interaction).detach().cpu().numpy()
    print(predictions)
    print(len(predictions))
    break

In [None]:
input_interactions = Interaction({
    'user_id': torch.tensor([3243])
})
predictions = model.full_sort_predict(input_interactions).detach().cpu().numpy()


In [None]:
ind = np.argpartition(predictions, -10)[-10:]
ind = ind[np.argsort(predictions[ind])]
ind

In [None]:
ind = ind + dataset.user_num
ind

In [None]:
last_ind = np.argpartition(predictions, 10)[:10]
last_ind = last_ind[np.argsort(predictions[last_ind])]
last_ind

In [None]:
last_ind = last_ind + dataset.user_num
last_ind

In [None]:
dataset.id2token(dataset.uid_field, [1])

In [None]:
shortest_distances, predecessors = shortest_path(matrix, directed=True,  return_predecessors = True, indices=[3243])

In [None]:
shortest_distances

In [None]:
predecessors

Let's check the distances for the top 10 recommendations

In [None]:
# `shortest_path_user` is not defined anywhere; the distances come from the
# shortest_path(...) call above. It was run for a single source node, so row 0 holds
# the distance from that user to every node in the graph.
for idx in ind:
    display(shortest_distances[0][idx])

Intuitively, the last 10 recommendations should not be reachable.

In [None]:
# `shortest_path_user` is not defined anywhere; use row 0 of `shortest_distances`
# (the single source user) to look up the distance to each lowest-ranked item.
for idx in last_ind:
    display(shortest_distances[0][idx])

In [None]:
def get_path(Pr, j):
    """Reconstruct the path that ends at node ``j`` from a predecessor array.

    Args:
        Pr: sequence where ``Pr[k]`` is the predecessor of node ``k``;
            the value -9999 marks a node without predecessor.
        j: index of the target node.

    Returns:
        List of node indices from the start of the chain to ``j`` (inclusive).
        If ``j`` has no predecessor the result is just ``[j]``.
    """
    node = j
    reversed_path = [node]
    while True:
        parent = Pr[node]
        if parent == -9999:
            break
        reversed_path.append(parent)
        node = parent
    reversed_path.reverse()
    return reversed_path

In [None]:
get_path(predecessors[0], ind[9])

# Put it all together and evaluate on test set

In [None]:
import os
import torch
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import shortest_path
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

## Load model

In [None]:
# os.listdir() gives no ordering guarantee, so choose the newest checkpoint by
# modification time rather than taking the last listed entry.
latest_model = max(
    ('saved/%s' % name for name in os.listdir('saved')),
    key=os.path.getmtime,
)
# Load onto the device the model is moved to rather than a hard-coded 'cuda'.
state_dict = torch.load(latest_model, map_location=config['device'])
model = KGAT(config, train_data._dataset).to(config['device'])

model.load_state_dict(state_dict['state_dict'])

## Create graph

In [None]:
from enum import Enum
class DistanceMode(Enum):
    """Selects how an attention value is converted into an edge weight of the shortest-path graph."""
    # edge weight = 1 / attention
    Inverse=1
    # edge weight = exp(-attention**2)
    RBF=2
    # edge weight = 1 - attention
    Linear=3

In [None]:
attention_matrix = model.A_in.cpu().coalesce()
size = attention_matrix.size()[0]

In [None]:
def create_scipy_matrix(distance_mode: 'DistanceMode'):
    """Build a SciPy COO graph whose edge weights are derived from the attention values.

    Reads the module-level ``attention_matrix`` (a coalesced sparse tensor) and ``size``.

    Args:
        distance_mode: how an attention value ``a`` becomes an edge weight:
            ``DistanceMode.Inverse`` -> ``1 / a``,
            ``DistanceMode.RBF`` -> ``exp(-a**2)``,
            ``DistanceMode.Linear`` -> ``1 - a``.

    Returns:
        ``scipy.sparse.coo_matrix`` of shape ``(size, size)`` with the same
        row/column indices as ``attention_matrix``.

    Raises:
        ValueError: if ``distance_mode`` is not one of the supported modes. Previously
            the raw attention values were silently returned as distances in that case.
    """
    attention_values = attention_matrix.values().numpy()
    if distance_mode == DistanceMode.Inverse:
        edge_weights = 1/attention_values
    elif distance_mode == DistanceMode.RBF:
        edge_weights = np.exp(-attention_values**2)
    elif distance_mode == DistanceMode.Linear:
        edge_weights = 1-attention_values
    else:
        raise ValueError('Unsupported distance mode: %r' % (distance_mode,))
    # indices() is a 2 x nnz tensor: first row = head indices, second row = tail indices
    edge_indices = attention_matrix.indices()
    return coo_matrix((edge_weights, (edge_indices[0].numpy(), edge_indices[1].numpy())), shape=(size,size))

In [None]:
graph_matrix = create_scipy_matrix(DistanceMode.RBF)

## Run on test set

In [None]:
k = 10

In [None]:
user_paths = {}

In [None]:
# For every test batch: score all items for the user, take the top-k items and extract
# the shortest path in `graph_matrix` from the user node to each of them.
model.eval()
with torch.no_grad():  # inference only: do not record operations for autograd
    for batch_idx, batched_data in enumerate(tqdm(test_data)):
        interaction, history_index, positive_u, positive_i = batched_data
        predictions = model.full_sort_predict(interaction).detach().cpu().numpy()
        # only the first user id of the batch is used - assumes each evaluation batch
        # contains a single user (TODO confirm for the configured eval_batch_size)
        internal_user_id = interaction['user_id'].cpu().numpy()[0]

        # calculate top k recommended items (sorted ascending by score, best item last)
        recommended_items = np.argpartition(predictions, -k)[-k:]
        recommended_items = recommended_items[np.argsort(predictions[recommended_items])]
        # item ids in the entity list start after the users,
        # so we have to add the number of users to the recommended indices
        recommended_items = recommended_items + dataset.user_num

        # now calculate shortest distances for current user
        shortest_distances, predecessors = shortest_path(graph_matrix, directed=True,  return_predecessors = True, indices=[internal_user_id])

        # calculate the paths for the top k recommended items
        # (row 0 of `predecessors` belongs to the single source node requested above)
        paths = []
        for item_idx in recommended_items:
            path = get_path(predecessors[0], item_idx)
            paths.append(np.asarray(path, dtype=int))
        user_paths[internal_user_id] = paths

## Evaluate... Profit?

In [None]:
dataset.user_num

In [None]:
user_paths

In [None]:
flat_list = [item for sublist in list(user_paths.values()) for item in sublist]

In [None]:
lengths = [len(item) for item in flat_list]

In [None]:
plt.hist(lengths)

In [None]:
np.unique(lengths, return_counts=True)

In [None]:
dataset.item_num+dataset.user_num

In [None]:
# Print the position of every path that contains a node index greater than 64087.
# NOTE(review): 64087 is hard-coded - presumably an upper bound on the expected node
# indices; confirm and derive it from the dataset instead of a literal.
for idx, arr in enumerate(flat_list):
    exceeds_bound = (arr > 64087).any()
    if exceeds_bound:
        print(idx)