In [1]:
from __future__ import absolute_import, division, print_function

import os
import pickle
import gzip

import sys
sys.path.insert(0,'..')

from utils import *
from dataset import RW_based_dataset, KG_based_dataset
from knowledge_graph import RW_based_KG, KG_based_KG
import pandas as pd

In [2]:
class args_class:
    """Plain settings container passed to the dataset and knowledge-graph builders."""

    def __init__(self):
        # Dataset key; BEAUTY_CORE is defined outside this notebook
        # (presumably provided by the `utils` star import).
        self.dataset = BEAUTY_CORE
        # Lower / upper thresholds -- presumably attribute-frequency bounds
        # read by the dataset builder; TODO confirm against RW_based_dataset.
        self.att_th_lower, self.att_th_upper = 0, 3000
        # Core-user threshold and user cap -- presumably used to pick
        # `dataset.core_user_list`; TODO confirm.
        self.user_core_th, self.user_top_k = 6, 6000
        
# Single settings instance shared by the cells below.
args = args_class()
        
# Create the dataset's data directory if it does not exist yet.
if not os.path.isdir(DATA_DIR[args.dataset]):
    os.makedirs(DATA_DIR[args.dataset])

Load the review data into the dataset class

In [None]:
# Build the review-based dataset object from the files under
# <dataset data dir>/review_data/.
print('Load', args.dataset, 'dataset from file...')
dataset = RW_based_dataset(args, DATA_DIR[args.dataset] + '/review_data/')

Generate the train and test labels for the filtered (core) users

In [None]:
def labels_filter(core_user_list, dataset, mode='train'):
    """Collect the products reviewed by each core user in one review split.

    Reads ``<DATA_DIR[dataset]>/review_data/review_<mode>.txt.gz``, a gzip
    text file with one tab-separated record per line whose first two fields
    are an integer user index and an integer product index.

    Args:
        core_user_list: collection of user indices to keep; reviews written
            by any other user are ignored.
        dataset: key into ``DATA_DIR`` selecting the dataset directory.
        mode: split name used in the file name (e.g. 'train' or 'test').

    Returns:
        dict mapping user index -> list of product indices in file order.
        Users without any review in the split are absent from the dict.
    """
    review_file = '{}/{}/review_{}.txt.gz'.format(DATA_DIR[dataset], 'review_data', mode)
    user_products = {}  # {uid: [pid,...], ...}

    print('len(core_user_list) = ', len(core_user_list))

    # Hash-based membership: testing against the raw list made the scan
    # O(number of review lines * number of core users).
    core_users = set(core_user_list)

    count = 0
    with gzip.open(review_file, 'r') as f:
        for line in f:
            line = line.decode('utf-8').strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            arr = line.split('\t')
            user_idx = int(arr[0])
            product_idx = int(arr[1])

            if user_idx in core_users:
                user_products.setdefault(user_idx, []).append(product_idx)
                count += 1

    # Guard the average: no matching user used to raise ZeroDivisionError.
    avg_products = count / len(user_products) if user_products else 0.0
    print(mode + ', avg user product = ', avg_products)

    return user_products

In [None]:
# Restrict the train and test review splits to the dataset's core users.
# NOTE(review): the log message mentions 'knowledge graph', but this cell
# only builds the label dictionaries.
print('generate filter label', args.dataset, 'knowledge graph from dataset...')
core_user_list = dataset.core_user_list
trn_label = labels_filter(core_user_list, args.dataset, 'train')
tst_label = labels_filter(core_user_list, args.dataset, 'test')

Build the knowledge graph (KG) from the review dataset

In [None]:
# Construct the review-based knowledge graph from the dataset object.
print('build', args.dataset, 'knowledge graph from dataset...')
kg = RW_based_KG(args, dataset)

Save the dataset, the train/test labels, and the knowledge graph

In [None]:
# Persist everything built above through the save_* helpers so it can be
# reloaded later (part 2 reloads the labels and the KG with load_*).
print(args.dataset, ' save dataset, trn tst label, kg')
save_dataset(args.dataset, dataset)
save_labels(args.dataset, trn_label, mode='train')
save_labels(args.dataset, tst_label, mode='test')
save_kg(args.dataset, kg)

Part 2: answer coverage rate check

In [14]:
# Reload the embeddings, knowledge graph and labels through the load_*
# helpers. This rebinds `kg`, `trn_label` and `tst_label`, which the
# functions defined below read as notebook globals.
embeds = load_embed(args.dataset)
kg = load_kg(args.dataset)
trn_label = load_labels(args.dataset, 'train')
tst_label = load_labels(args.dataset, 'test')

Load embedding: ../data/Amazon_Beauty_Core/transe_embed.pkl
label_file =  ../data/Amazon_Beauty_Core/train_label.pkl
label_file =  ../data/Amazon_Beauty_Core/test_label.pkl


In [15]:
from math import log

def evaluate(topk_matches, test_user_products):
    """Compute and print ranking metrics for predicted recommendations.

    Args:
        topk_matches: mapping uid -> sequence of predicted product ids in
            ascending order (best prediction last). The sequence is reversed
            before scoring, so its last element is treated as rank 1.
        test_user_products: mapping uid -> collection of ground-truth
            product ids for that user.

    Returns:
        Tuple ``(avg_precision, avg_recall, avg_ndcg, avg_hit, invalid_users,
        cum_k)``. The four averages are percentages over the users that have
        an entry in ``topk_matches``; ``invalid_users`` lists the test users
        without an entry; ``cum_k`` counts users whose prediction list is
        empty. The averages are 0.0 when no user could be scored.
    """
    cum_k = 0
    invalid_users = []
    # Compute metrics
    precisions, recalls, ndcgs, hits = [], [], [], []
    for uid, rel_items in test_user_products.items():

        if uid not in topk_matches:
            print('uid not in topk_matches = ',uid)
            invalid_users.append(uid)
            continue
        pred_list = topk_matches[uid][::-1]

        if len(pred_list) == 0:
            cum_k += 1

        # Nothing predicted, or nothing to find: every metric is 0 for this
        # user. An empty ground-truth list used to make idcg 0 and raise
        # ZeroDivisionError.
        if len(pred_list) == 0 or len(rel_items) == 0:
            ndcgs.append(0)
            recalls.append(0)
            precisions.append(0)
            hits.append(0)
            continue

        # Hash-based membership instead of a linear list scan per prediction.
        rel_set = set(rel_items)

        dcg = 0.0
        hit_num = 0.0
        for i in range(len(pred_list)):
            if pred_list[i] in rel_set:
                dcg += 1. / (log(i + 2) / log(2))
                hit_num += 1
        # Ideal DCG: all relevant items placed at the top of the list.
        idcg = 0.0
        for i in range(min(len(rel_items), len(pred_list))):
            idcg += 1. / (log(i + 2) / log(2))
        ndcg = dcg / idcg

        recall = hit_num / len(rel_items)

        precision = hit_num / len(pred_list)

        hit = 1.0 if hit_num > 0.0 else 0.0

        ndcgs.append(ndcg)
        recalls.append(recall)
        precisions.append(precision)
        hits.append(hit)

    def _mean_pct(values):
        # np.mean([]) is NaN (with a RuntimeWarning); report 0.0 instead.
        return np.mean(values) * 100 if values else 0.0

    avg_precision = _mean_pct(precisions)
    avg_recall = _mean_pct(recalls)
    avg_ndcg = _mean_pct(ndcgs)
    avg_hit = _mean_pct(hits)
    print('NDCG={:.3f} |  Recall={:.3f} | HR={:.3f} | Precision={:.3f} | Invalid users={}'.format(
            avg_ndcg, avg_recall, avg_hit, avg_precision, len(invalid_users)))
    print('cum_k == 0 ',  cum_k)
    return avg_precision, avg_recall, avg_ndcg, avg_hit, invalid_users, cum_k

In [16]:
max_acts = 51


def get_actions(path, user):
    """List the actions available from the current node.

    Args:
        path: ``(node_type, node_id)`` pair describing the current node.
        user: user index whose embedding (``embeds[USER][user]``) drives the
            pruning when there are too many candidates.

    Returns:
        A list of ``(relation, node_id)`` tuples. The first entry is always
        the self-loop on the current node; the remaining entries are the
        outgoing edges sorted by ``(relation, node_id)``. When more than
        ``max_acts`` edges exist, only the ``max_acts`` highest-scoring ones
        are kept.
    """
    node_type, node_id = path
    self_loop = (SELF_LOOP, node_id)

    # Flatten the neighbourhood {relation: [node_id, ...]} into edge tuples.
    neighbours = kg(node_type, node_id)
    candidates = []
    for rel in neighbours:
        # Same lookup the original performed here: an unknown relation for
        # this node type still raises KeyError at this point.
        _ = KG_RELATION[node_type][rel]
        candidates.extend((rel, target_id) for target_id in neighbours[rel])

    # No outgoing edge: the self-loop is the only possible action.
    if not candidates:
        return [self_loop]

    # Too many edges: keep the ones whose target embedding scores highest
    # against the user embedding translated by the relevant relation(s).
    if len(candidates) > max_acts:
        user_vec = embeds[USER][user]
        scores = []
        for rel, target_id in candidates:
            target_type = KG_RELATION[node_type][rel]
            if target_type == USER:
                query = user_vec
            elif target_type == PRODUCT:
                query = user_vec + embeds[PURCHASE][0]
            elif target_type == WORD:
                query = user_vec + embeds[MENTION][0]
            else:
                query = user_vec + embeds[PURCHASE][0] + embeds[rel][0]
            scores.append(np.matmul(query, embeds[target_type][target_id]))

        best = np.argsort(scores)[-max_acts:]  # indices of the largest scores
        candidates = [candidates[i] for i in best]

    ordered = sorted(candidates, key=lambda act: (act[0], act[1]))
    return [self_loop] + ordered

In [17]:
def ans_cover_rate(step):
    """Measure how many test answers are reachable within ``step`` hops.

    For every user in the global ``trn_label`` the graph is expanded level by
    level from the user's node, following the actions returned by
    ``get_actions``. Each node is visited at most once per user. The products
    reached that are not among the user's training items are then scored
    against the global ``tst_label`` with ``evaluate``, which prints the
    metrics.

    Args:
        step: number of expansion rounds (hops) to perform from the user node.

    Returns:
        None. Users for whom no product node was reached get no entry and are
        reported by ``evaluate`` as invalid users.
    """
    answer_cover = {}
    for user, trn_item_list in trn_label.items():
        # Hash-based membership for the final filter; scanning the list once
        # per reached product is quadratic for large neighbourhoods.
        train_items = set(trn_item_list)

        frontier = [[USER, user]]
        visited = {USER: {user: 1}}  # {node_type: {node_id: 1}}
        for _hop in range(step):
            next_frontier = []
            for node in frontier:
                curr_node_type = node[0]
                for relation, next_node_id in get_actions(node, user):
                    if relation == SELF_LOOP:
                        next_node_type = curr_node_type
                    else:
                        next_node_type = KG_RELATION[curr_node_type][relation]
                    seen_of_type = visited.setdefault(next_node_type, {})
                    if next_node_id not in seen_of_type:
                        seen_of_type[next_node_id] = 1
                        next_frontier.append([next_node_type, next_node_id])
            frontier = next_frontier

        # NOTE(review): the literal 'product' presumably equals the PRODUCT
        # constant used by get_actions -- confirm and switch to the constant.
        if 'product' in visited:
            answer_cover[user] = [it for it in visited['product'] if it not in train_items]

    evaluate(answer_cover, tst_label)

In [18]:
# Coverage with a single expansion round from each user node.
ans_cover_rate(1)

NDCG=0.000 |  Recall=0.000 | HR=0.000 | Precision=0.000 | Invalid users=0
cum_k == 0  8300


In [19]:
# Coverage with two expansion rounds from each user node.
ans_cover_rate(2)

NDCG=6.008 |  Recall=16.626 | HR=39.120 | Precision=1.542 | Invalid users=0
cum_k == 0  33


In [20]:
# Coverage with three expansion rounds from each user node.
ans_cover_rate(3)

NDCG=10.703 |  Recall=67.463 | HR=92.012 | Precision=0.236 | Invalid users=0
cum_k == 0  0
