In [1]:
from __future__ import absolute_import, division, print_function

import os
import pickle
import gzip
import pandas as pd

import sys
sys.path.insert(0,'..')

from utils import *
from dataset import RW_based_dataset, KG_based_dataset
from knowledge_graph import RW_based_KG, KG_based_KG
import pandas as pd

In [2]:
class args_class:
    """Plain container for the preprocessing settings used in this notebook.

    Every setting can be overridden by keyword; ``args_class()`` with no
    arguments reproduces the notebook's original configuration.

    Attributes:
        dataset: dataset key; it is used to index ``DATA_DIR``.
        att_th_lower: presumably the lower attribute-frequency threshold
            consumed by the dataset class -- TODO confirm.
        att_th_upper: presumably the upper attribute-frequency threshold
            consumed by the dataset class -- TODO confirm.
        user_core_th: presumably the minimum-interaction ("core") threshold
            for users -- TODO confirm.
        user_top_k: presumably the number of users kept after filtering --
            TODO confirm.
    """

    def __init__(self, dataset=None, att_th_lower=0, att_th_upper=3000,
                 user_core_th=6, user_top_k=6000):
        # MOVIE_CORE is looked up at call time (not at definition time), so the
        # class can be defined before the constant is in scope.
        self.dataset = MOVIE_CORE if dataset is None else dataset
        self.att_th_lower = att_th_lower
        self.att_th_upper = att_th_upper
        self.user_core_th = user_core_th
        self.user_top_k = user_top_k

args = args_class()

# Make sure the dataset's data directory exists. exist_ok=True replaces the
# separate isdir() check, which left a window between the check and the create.
os.makedirs(DATA_DIR[args.dataset], exist_ok=True)

Load the review data into the dataset class

In [3]:
# Instantiate KG_based_dataset for the selected dataset, pointing it at that
# dataset's data directory.
print('Load', args.dataset, 'dataset from file...')
dataset = KG_based_dataset(args, DATA_DIR[args.dataset])

Load MovieLens-1M_core dataset from file...
n_user =  6036
n_item =  2445
n_attribute =  182011
self.entity_list =  {'user': {'vocab_size': 6036}, 'product': {'vocab_size': 8481}, 'attribute': {'vocab_size': 188047}}


Generate the train and test labels for the filtered (core) users

In [4]:
def kg_labels_filter(core_user_list, dataset, mode='train'):
    """Collect the liked items of every core user from one data split.

    Reads ``{DATA_DIR[dataset]}/{mode}_pd.csv`` and keeps the rows whose
    ``like`` value is non-zero and whose user is in ``core_user_list``.
    Also prints how many distinct users and items were kept.

    Args:
        core_user_list: collection of user ids to keep.
        dataset: dataset key used to index ``DATA_DIR``.
        mode: split name, used as the CSV file prefix (e.g. 'train', 'test').

    Returns:
        dict mapping user id -> list of item ids. Every item id is offset by
        ``n_user`` (the largest user id in ``ratings_final.npy`` plus one);
        presumably this places items after users in a shared entity index
        space -- confirm against the dataset class.
    """
    data_dir = DATA_DIR[dataset]
    rating_file = data_dir + '/ratings_final'
    rating_np = np.load(rating_file + '.npy')
    n_user = rating_np[:, 0].max() + 1

    data = pd.read_csv(f'{data_dir}/{mode}_pd.csv', index_col=None)
    # The first CSV column is dropped before selecting the three named columns.
    data = data.drop(data.columns[0], axis=1)
    data = data[['user', 'item', 'like']].values

    # Hash-based membership instead of a linear scan for every row; fall back
    # to the original container if its elements are not hashable.
    try:
        core_users = set(core_user_list)
    except TypeError:
        core_users = core_user_list

    user_products = {}  # {uid: [pid,...], ...}
    seen_items = set()
    for user_idx, item_idx, like in data:
        if like == 0 or user_idx not in core_users:
            continue
        product_idx = item_idx + n_user
        user_products.setdefault(user_idx, []).append(product_idx)
        seen_items.add(product_idx)

    # Every kept user has an entry in user_products, so its size is the user count.
    print('seen.user, seen.items = ', len(user_products), len(seen_items))
    return user_products

In [5]:
# Build the per-user label dicts for the train and test splits, restricted to
# the users listed in dataset.core_user_list.
print('generate filter label', args.dataset, 'knowledge graph from dataset...')
core_user_list = dataset.core_user_list
trn_label = kg_labels_filter(core_user_list, args.dataset, 'train')
tst_label = kg_labels_filter(core_user_list, args.dataset, 'test')

generate filter label MovieLens-1M_core knowledge graph from dataset...
seen.user, seen.items =  6000 2192
seen.user, seen.items =  5851 2013


In [6]:
# Construct the knowledge-graph object from the settings and the loaded dataset.
print('build', args.dataset, 'knowledge graph from dataset...')
kg = KG_based_KG(args, dataset)

build MovieLens-1M_core knowledge graph from dataset...
Remove duplicates...
relation =  {'purchase': [['user', 0, 'purchase', 7455], ['product', 7455, 'purchase', 0], ['user', 0, 'purchase', 7727], ['product', 7727, 'purchase', 0], ['user', 0, 'purchase', 6706], ['product', 6706, 'purchase', 0], ['user', 0, 'purchase', 7732], ['product', 7732, 'purchase', 0], ['user', 0, 'purchase', 6201], ['product', 6201, 'purchase', 0], ['user', 0, 'purchase', 6975], ['product', 6975, 'purchase', 0], ['user', 0, 'purchase', 7261], ['product', 7261, 'purchase', 0], ['user', 0, 'purchase', 6377], ['product', 6377, 'purchase', 0], ['user', 0, 'purchase', 7660], ['product', 7660, 'purchase', 0], ['user', 0, 'purchase', 6127], ['product', 6127, 'purchase', 0], ['user', 0, 'purchase', 7796], ['product', 7796, 'purchase', 0], ['user', 0, 'purchase', 7419], ['product', 7419, 'purchase', 0], ['user', 0, 'purchase', 6781], ['product', 6781, 'purchase', 0], ['user', 0, 'purchase', 6660], ['product', 6660, 'pu

Build the knowledge graph (KG) from the review dataset class

Save the dataset, the train/test labels, and the knowledge graph

In [7]:
# Persist the dataset object, both label dicts and the knowledge graph through
# the save_* helpers (these helpers are not defined in this notebook).
print(args.dataset, ' save dataset, trn tst label, kg')
save_dataset(args.dataset, dataset)
save_labels(args.dataset, trn_label, mode='train')
save_labels(args.dataset, tst_label, mode='test')
save_kg(args.dataset, kg)

MovieLens-1M_core  save dataset, trn tst label, kg


Part 2: answer coverage rate check

In [8]:
# Load the embeddings and register an all-zero entry for the self-loop relation.
# NOTE(review): 50 is presumably the dimension of the loaded embedding vectors,
# and the (vector, 0.0) tuple presumably mirrors the layout of the other
# relation entries (get_actions reads element [0] as the vector) -- confirm.
embeds = load_embed(args.dataset)
embeds[SELF_LOOP] = (np.zeros(50), 0.0)

Load embedding: ../data/MovieLens-1M_Core/transe_embed.pkl


In [9]:
from math import log

def evaluate(topk_matches, test_user_products):
    """Compute ranking metrics for predicted recommendations.

    Args:
        topk_matches: dict mapping user id -> list of predicted product ids in
            ascending order (the list is reversed before scoring, so its last
            element is treated as rank 1).
        test_user_products: dict mapping user id -> collection of ground-truth
            product ids.

    Returns:
        Tuple ``(avg_precision, avg_recall, avg_ndcg, avg_hit, invalid_users,
        cum_k)``. The four averages are percentages taken over the test users
        that have an entry in ``topk_matches``; ``invalid_users`` lists the
        test users without such an entry; ``cum_k`` counts the users whose
        prediction list is empty. The metrics are also printed.
    """
    cum_k = 0
    invalid_users = []
    # Compute metrics
    precisions, recalls, ndcgs, hits = [], [], [], []
    for uid, rel_set in test_user_products.items():
        if uid not in topk_matches:
            print('uid not in topk_matches = ',uid)
            invalid_users.append(uid)
            continue

        pred_list = topk_matches[uid][::-1]
        n_pred = len(pred_list)
        n_rel = len(rel_set)

        if n_pred == 0:
            cum_k += 1
        # With no predictions or no ground truth every metric is 0. Handling
        # the empty ground-truth case here avoids dividing by zero below.
        if n_pred == 0 or n_rel == 0:
            ndcgs.append(0)
            recalls.append(0)
            precisions.append(0)
            hits.append(0)
            continue

        # Set lookup keeps the membership test O(1) per prediction.
        relevant = set(rel_set)
        dcg = 0.0
        hit_num = 0.0
        for rank, pid in enumerate(pred_list):
            if pid in relevant:
                dcg += 1. / (log(rank + 2) / log(2))
                hit_num += 1

        # Ideal DCG: as many hits as possible packed at the top of the list.
        idcg = 0.0
        for rank in range(min(n_rel, n_pred)):
            idcg += 1. / (log(rank + 2) / log(2))

        ndcgs.append(dcg / idcg)
        recalls.append(hit_num / n_rel)
        precisions.append(hit_num / n_pred)
        hits.append(1.0 if hit_num > 0.0 else 0.0)

    avg_precision = np.mean(precisions) * 100
    avg_recall = np.mean(recalls) * 100
    avg_ndcg = np.mean(ndcgs) * 100
    avg_hit = np.mean(hits) * 100
    print('NDCG={:.3f} |  Recall={:.3f} | HR={:.3f} | Precision={:.3f} | Invalid users={}'.format(
            avg_ndcg, avg_recall, avg_hit, avg_precision, len(invalid_users)))
    print('cum_k == 0 ',  cum_k)
    return avg_precision, avg_recall, avg_ndcg, avg_hit, invalid_users, cum_k

In [10]:
max_acts = 51  # upper bound on the non-self-loop actions returned per node

def get_actions(path, user):
    """Compute the actions available from the current node.

    Args:
        path: ``(node_type, node_id)`` pair describing the current node.
        user: id of the user the walk started from; it is only used to score
            candidates when there are more than ``max_acts`` of them.

    Returns:
        List of ``(relation, next_node_id)`` tuples. The first entry is always
        the self-loop on the current node; it is followed by at most
        ``max_acts`` candidates sorted by ``(relation, next_node_id)``.
    """
    curr_node_type, curr_node_id = path
    actions = [(SELF_LOOP, curr_node_id)]  # self-loop must be included.

    # kg(...) is indexed by relation and yields the neighbour ids reached
    # through that relation; flatten it into (relation, node_id) pairs.
    relations_nodes = kg(curr_node_type, curr_node_id)
    candidate_acts = [
        (r, n_id) for r in relations_nodes for n_id in relations_nodes[r]
    ]

    # If candidate action set is empty, only return self-loop action.
    if len(candidate_acts) == 0:
        return actions

    # If number of available actions is at most max_acts, return them all.
    if len(candidate_acts) <= max_acts:
        actions.extend(sorted(candidate_acts, key=lambda x: (x[0], x[1])))
        return actions

    # If there are too many actions, do some deterministic trimming here:
    # score every candidate against a user-derived source embedding and keep
    # the max_acts highest-scoring ones.
    user_embed = embeds[USER][user]
    scores = []
    for r, next_node_id in candidate_acts:
        next_node_type = dataset.et_idx2ty[next_node_id]
        if next_node_type == USER:
            src_embed = user_embed
        elif next_node_type == PRODUCT:
            src_embed = user_embed + embeds[PURCHASE][0]
        elif next_node_type == WORD:
            src_embed = user_embed + embeds[MENTION][0]
        else:
            src_embed = user_embed + embeds[PURCHASE][0] + embeds[r][0]
        score = np.matmul(src_embed, embeds[next_node_type][next_node_id])
        scores.append(score)

    candidate_idxs = np.argsort(scores)[-max_acts:]  # choose actions with larger scores
    candidate_acts = sorted([candidate_acts[i] for i in candidate_idxs], key=lambda x: (x[0], x[1]))
    actions.extend(candidate_acts)
    return actions

In [11]:
def ans_cover_rate(step):
    """Report how well the nodes reachable within ``step`` hops cover the test items.

    For every user in ``trn_label`` the graph is expanded level by level from
    the user node for ``step`` rounds using ``get_actions``. The reached
    product ids that are not among the user's training items are then scored
    against ``tst_label`` with ``evaluate``, which prints the metrics.

    Args:
        step: number of expansion rounds (hops) starting from the user node.
    """
    answer_cover = {}
    for user, trn_item_list in trn_label.items():
        frontier = [[USER, user]]
        # visited[node_type] is an insertion-ordered dict of the node ids
        # reached so far; the order is kept because evaluate is rank-sensitive.
        visited = {USER: {user: 1}}
        for _hop in range(step):
            next_frontier = []
            for node in frontier:
                curr_node_type, _curr_node_id = node
                for relation, next_node_id in get_actions(node, user):
                    if relation == SELF_LOOP:
                        next_node_type = curr_node_type
                    else:
                        next_node_type = dataset.et_idx2ty[next_node_id]
                    seen_ids = visited.setdefault(next_node_type, {})
                    if next_node_id not in seen_ids:
                        seen_ids[next_node_id] = 1
                        next_frontier.append([next_node_type, next_node_id])
            frontier = next_frontier

        # Set lookup avoids a linear scan of the training list per product.
        # .get() yields an empty result when no product was reached at all
        # (e.g. step == 0) instead of raising KeyError.
        trn_items = set(trn_item_list)
        answer_cover[user] = [it for it in visited.get('product', {}) if it not in trn_items]

    evaluate(answer_cover, tst_label)

In [12]:
# Coverage of the test items by products reachable within 1 hop of each training user.
ans_cover_rate(1)

NDCG=0.000 |  Recall=0.000 | HR=0.000 | Precision=0.000 | Invalid users=0
cum_k == 0  5851


In [13]:
# Coverage of the test items by products reachable within 2 hops of each training user.
ans_cover_rate(2)

NDCG=0.000 |  Recall=0.000 | HR=0.000 | Precision=0.000 | Invalid users=0
cum_k == 0  5851


In [14]:
# Coverage of the test items by products reachable within 3 hops of each training user.
ans_cover_rate(3)

NDCG=21.241 |  Recall=96.186 | HR=99.419 | Precision=0.809 | Invalid users=0
cum_k == 0  0
