# 生成 pred.json

In [4]:
import pandas as pd
import json
import argparse
import os

# 直接指定参数值
dataset = 'dunnhumby'
fold_id = 0


if __name__ == '__main__':
    # Build the personal-top-frequency prediction file: for every customer
    # that appears in the future split, rank the items of that customer's
    # history by how often they occur (most frequent first).
    # Function-scope import keeps this cell self-contained.
    from collections import Counter

    data_history = pd.read_csv(f'dataset/{dataset}_history.csv')
    data_future = pd.read_csv(f'dataset/{dataset}_future.csv')

    # Index the history once instead of re-filtering the whole frame for
    # every customer, which was quadratic in the size of the data.
    history_by_user = {
        customer: items.tolist()
        for customer, items in data_history.groupby('CUSTOMER_ID')['MATERIAL_NUMBER']
    }

    pred_dict = dict()
    for user, _ in data_future.groupby('CUSTOMER_ID'):
        # Customers without any history get an empty prediction list.
        history_items = history_by_user.get(user, [])
        # most_common() sorts by count, descending, and keeps first-seen
        # order among ties -- the same ordering as a stable reverse sort.
        pred_dict[user] = [item for item, _ in Counter(history_items).most_common()]

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs('p_top_pred/', exist_ok=True)
    with open(f'p_top_pred/{dataset}_pred{fold_id}.json', 'w') as f:
        json.dump(pred_dict, f)

# 生成 keyset.json

In [5]:
import pandas as pd
import json
import random
import argparse
import os


if __name__ == '__main__':
    # Build the keyset file: a random train/val/test split of the customers
    # found in the future split, plus the size of the item id space.

    dataset = 'dunnhumby'
    fold_id = 0

    data_future = pd.read_csv(f'dataset/{dataset}_future.csv')
    data_history = pd.read_csv(f'dataset/{dataset}_history.csv')
    data = pd.concat([data_history, data_future])

    # Unique customers, shuffled, then stringified.
    user = list(set(data_future['CUSTOMER_ID']))
    user_num = len(user)
    random.shuffle(user)
    user = list(map(str, user))

    # The first 4/5 of the customers are split 90/10 into train and
    # validation; the remaining 1/5 is the test set.
    train_end = int(user_num*4/5*0.9)
    val_end = int(user_num*4/5)
    train_user = user[:train_end]
    val_user = user[train_end:val_end]
    test_user = user[val_end:]

    # Largest item id over both splits, plus one; .tolist() keeps the
    # result a plain Python int.
    item_num = max(data['MATERIAL_NUMBER'].tolist()) + 1
    keyset_dict = {
        'item_num': item_num,
        'train': train_user,
        'val': val_user,
        'test': test_user,
    }

    print(keyset_dict)
    if not os.path.exists('keyset/'):
        os.makedirs('keyset/')
    keyset_file = f'keyset/{dataset}_keyset_{fold_id}.json'
    with open(keyset_file, 'w') as f:
        json.dump(keyset_dict, f)

{'item_num': 3003, 'train': ['582504', '395215', '381277', '793623', '37488', '405859', '513671', '316095', '72537', '946336', '952156', '186463', '620371', '411952', '445065', '221751', '130188', '800736', '446860', '934716', '960345', '951153', '955284', '118154', '700523', '749933', '602449', '25455', '712091', '209287', '64677', '597681', '397656', '872182', '669592', '577874', '986832', '451105', '624665', '538485', '906717', '547632', '367922', '282218', '34478', '438943', '862051', '339540', '843776', '351965', '405259', '254380', '483191', '706412', '792913', '96036', '205291', '584121', '674439', '814821', '444576', '720254', '160287', '340481', '519243', '244888', '443460', '595480', '140765', '541717', '716745', '285264', '363037', '758870', '163060', '669061', '343545', '714089', '232742', '413462', '84970', '269404', '815743', '751007', '90336', '44984', '996003', '404884', '321407', '659891', '641034', '572115', '156921', '735194', '933148', '729226', '482578', '306173', 

# 評估

In [6]:
import pandas as pd
import json
import os

import numpy as np
import math
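# Note: the ranking metrics below take the ground truth as a list of item ids and pred_rank_list as item ids sorted best-first; only get_precision_recall_Fscore works on 0/1 vectors.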

def label2vec(label_list, input_size):
    """Turn a list of item indices into a multi-hot vector.

    Args:
        label_list: iterable of indices to switch on.
        input_size: length of the returned vector (the number of items).

    Returns:
        A float numpy array of length ``input_size`` holding 1 at every
        index listed in ``label_list`` and 0 everywhere else.
    """
    multi_hot = np.zeros(input_size)
    for item_index in label_list:
        multi_hot[item_index] = 1
    return multi_hot

def get_repeat_explore(repeat_list, pred_rank_list, k):
    """Split the top-k predictions into a repeat share and an explore share.

    Args:
        repeat_list: collection of items counted as "repeat" items.
        pred_rank_list: predicted items, best first.
        k: number of leading predictions to inspect; also the denominator.

    Returns:
        ``(repeat_ratio, explore_ratio)`` where ``repeat_ratio`` is the
        number of top-k predictions found in ``repeat_list`` divided by
        ``k`` and ``explore_ratio`` is ``1 - repeat_ratio``.

    Raises:
        ZeroDivisionError: if ``k`` is 0.
    """
    # zip() against range(k) stops after at most k predictions.
    top_k = (pred for _, pred in zip(range(k), pred_rank_list))
    repeat_hits = float(sum(1 for pred in top_k if pred in repeat_list))
    repeat_share = repeat_hits / k
    return repeat_share, 1 - repeat_share

def get_DCG(truth_list, pred_rank_list, k):
    """Discounted cumulative gain of the top-k predictions, binary relevance.

    Args:
        truth_list: collection of relevant items.
        pred_rank_list: predicted items, best first.
        k: number of leading predictions to score.

    Returns:
        The sum of ``1 / log2(position + 2)`` over every 0-based position
        below ``k`` whose prediction is in ``truth_list`` (0 if none is).
    """
    gain = 0
    for position, candidate in enumerate(pred_rank_list):
        if position >= k:
            break
        if candidate in truth_list:
            gain += 1 / math.log2(position + 2)
    return gain

def get_NDCG(truth_list, pred_rank_list, k):
    """Normalised DCG of the top-k predictions, binary relevance.

    Args:
        truth_list: collection of relevant items.
        pred_rank_list: predicted items, best first.
        k: number of leading predictions to score.

    Returns:
        ``get_DCG(...)`` divided by the ideal DCG, or 0.0 when
        ``truth_list`` is empty (the ideal DCG would be 0 and the ratio
        undefined).
    """
    num_item = len(truth_list)
    if num_item == 0:
        # Nothing to retrieve: avoid dividing by a zero ideal DCG.
        return 0.0
    dcg = get_DCG(truth_list, pred_rank_list, k)
    # NOTE(review): the ideal DCG sums over every truth item and is not cut
    # off at k, so the score cannot reach 1 when len(truth_list) > k. Kept
    # as is so existing results stay comparable -- confirm this is intended.
    idcg = sum(1 / math.log2(i + 2) for i in range(num_item))
    return dcg / idcg

def get_HT(truth_list, pred_rank_list, k):
    """Hit indicator for the top-k predictions.

    Args:
        truth_list: collection of relevant items.
        pred_rank_list: predicted items, best first.
        k: number of leading predictions to inspect.

    Returns:
        1 if any of the first ``k`` predictions is in ``truth_list``,
        otherwise 0.
    """
    # zip() against range(k) stops after at most k predictions.
    top_k = (pred for _, pred in zip(range(k), pred_rank_list))
    return int(any(pred in truth_list for pred in top_k))

def get_Recall(truth_list, pred_rank_list, k):
    """Recall of the top-k predictions.

    Args:
        truth_list: collection of relevant items.
        pred_rank_list: predicted items, best first.
        k: number of leading predictions to inspect.

    Returns:
        The number of the first ``k`` predictions found in ``truth_list``
        divided by ``len(truth_list)``, or 0.0 when ``truth_list`` is empty.
    """
    truth_num = len(truth_list)
    if truth_num == 0:
        # No relevant items: report 0 instead of dividing by zero, the same
        # convention get_precision_recall_Fscore uses for an empty truth.
        return 0.0
    correct = 0.0
    for position, pred in enumerate(pred_rank_list):
        if position >= k:
            break
        if pred in truth_list:
            correct += 1
    return correct / truth_num

def get_precision(truth_list, pred_rank_list, k):
    """Precision of the top-k predictions.

    Args:
        truth_list: collection of relevant items.
        pred_rank_list: predicted items, best first (must support slicing).
        k: number of leading predictions to inspect; also the denominator.

    Returns:
        The number of items in ``pred_rank_list[:k]`` that are in
        ``truth_list`` divided by ``k``, or 0 when ``k`` is not positive.
    """
    top_k = pred_rank_list[:k]
    hits = sum(1 for candidate in top_k if candidate in truth_list)
    if k > 0:
        return hits / k
    return 0

def get_f1_score(recall, precision):
    """Harmonic mean of recall and precision.

    Args:
        recall: recall value.
        precision: precision value.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or 0 when the
        sum of the two inputs is not positive.
    """
    total = recall + precision
    if not total > 0:
        return 0
    return 2 * (precision * recall) / total

def get_precision_recall_Fscore(groundtruth, pred):
    """Precision, recall and F-score of a 0/1 prediction vector.

    Args:
        groundtruth: indexable sequence; entries equal to 1 mark relevant
            positions.
        pred: indexable sequence at least as long as ``groundtruth``;
            entries equal to 1 mark predicted positions.

    Returns:
        ``(precision, recall, F, correct)`` where ``correct`` counts the
        positions that are 1 in both inputs. ``precision`` is 0 when
        nothing is predicted, ``recall`` is 0 when nothing is relevant, and
        ``F`` is 0 in either of those cases or when precision + recall is
        not positive.
    """
    truth = 0
    positive = 0
    correct = 0
    for idx, actual in enumerate(groundtruth):
        predicted = pred[idx] == 1
        if actual == 1:
            truth += 1
            if predicted:
                correct += 1
        if predicted:
            positive += 1

    precision = correct/positive if positive != 0 else 0
    recall = correct/truth if truth != 0 else 0

    # F is only defined when both denominators above were non-zero.
    both_defined = positive != 0 and truth != 0
    if both_defined and precision + recall > 0:
        F = 2*precision*recall/(precision+recall)
    else:
        F = 0
    return precision, recall, F, correct

def get_repeat_eval(pred_folder, dataset, size, fold_list, file):
    """Evaluate ranked predictions at one basket size, split by repeat/explore.

    Reads ``dataset/{dataset}_history.csv`` and
    ``jsondata/{dataset}_future.json`` once, then for every fold in
    ``fold_list`` reads ``keyset/{dataset}_keyset_{fold}.json`` and
    ``{pred_folder}/{dataset}_pred{fold}.json`` and scores every user listed
    under the keyset's ``'test'`` key.

    Args:
        pred_folder: directory holding the per-fold prediction JSON files.
        dataset: dataset name used to build all file paths.
        size: cut-off k (basket size) applied to every metric.
        fold_list: iterable of fold identifiers to evaluate.
        file: writable text handle; per-fold recall and the summary lines
            are written to it as well as printed.

    Returns:
        Mean over folds of the per-fold mean recall@size.

    Notes:
        A metric with no contributing user in a fold (for example explore
        recall when no test user has unseen items) averages to NaN, as
        ``np.mean`` of an empty list does.
    """
    history_file = f'dataset/{dataset}_history.csv'
    truth_file = f'jsondata/{dataset}_future.json'
    with open(truth_file, 'r') as f:
        data_truth = json.load(f)
    data_history = pd.read_csv(history_file)

    # Index the history once; filtering the full frame for every test user
    # made the evaluation quadratic in the size of the data.
    history_items = {
        customer: set(items)
        for customer, items in data_history.groupby('CUSTOMER_ID')['MATERIAL_NUMBER']
    }

    metric_names = (
        'ndcg', 'recall', 'hit',
        'repeat_ratio', 'explore_ratio',
        'recall_repeat', 'recall_explore',
        'hit_repeat', 'hit_explore',
        'precision', 'f1',
    )
    fold_means = {name: [] for name in metric_names}

    for ind in fold_list:
        keyset_file = f'keyset/{dataset}_keyset_{ind}.json'
        pred_file = f'{pred_folder}/{dataset}_pred{ind}.json'
        with open(keyset_file, 'r') as f:
            keyset = json.load(f)
        with open(pred_file, 'r') as f:
            data_pred = json.load(f)

        # Per-user scores for this fold.
        per_user = {name: [] for name in metric_names}

        for user in keyset['test']:
            pred = data_pred[user]
            # Second element of the user's entry is taken as the ground-truth
            # item list -- presumably the target basket; confirm against the
            # code that writes the future JSON.
            truth = data_truth[user][1]

            # Items the user has in the history; empty if the user has none.
            repeat_items = history_items.get(int(user), set())
            truth_set = set(truth)
            truth_repeat = list(truth_set & repeat_items)   # may be empty
            truth_explore = list(truth_set - repeat_items)  # may be empty

            u_recall = get_Recall(truth, pred, size)
            u_precision = get_precision(truth, pred, size)
            per_user['ndcg'].append(get_NDCG(truth, pred, size))
            per_user['recall'].append(u_recall)
            per_user['hit'].append(get_HT(truth, pred, size))
            per_user['precision'].append(u_precision)
            per_user['f1'].append(get_f1_score(u_recall, u_precision))

            # Share of the top-k drawn from the user's history, whether or
            # not those items are in the ground truth.
            u_repeat_ratio, u_explore_ratio = get_repeat_explore(repeat_items, pred, size)
            per_user['repeat_ratio'].append(u_repeat_ratio)
            per_user['explore_ratio'].append(u_explore_ratio)

            # Repeat/explore recall and hit only count users that actually
            # have ground-truth items of that kind.
            if len(truth_repeat) > 0:
                per_user['recall_repeat'].append(get_Recall(truth_repeat, pred, size))
                per_user['hit_repeat'].append(get_HT(truth_repeat, pred, size))

            if len(truth_explore) > 0:
                per_user['recall_explore'].append(get_Recall(truth_explore, pred, size))
                per_user['hit_explore'].append(get_HT(truth_explore, pred, size))

        for name in metric_names:
            fold_means[name].append(np.mean(per_user[name]))

        fold_recall = fold_means['recall'][-1]
        print(ind, fold_recall)
        file.write(str(ind)+' '+str(fold_recall)+'\n')

    overall = {name: np.mean(values) for name, values in fold_means.items()}

    # Each summary line goes to stdout and to the results file in the same
    # form, so the file also records precision/F1 and keeps a separator
    # between every label and its numbers.
    summary = (
        ('recall, ndcg, hit:', ('recall', 'ndcg', 'hit')),
        ('precision, f1:', ('precision', 'f1')),
        ('repeat-explore ratio:', ('repeat_ratio', 'explore_ratio')),
        ('repeat-explore recall', ('recall_repeat', 'recall_explore')),
        ('repeat-explore hit:', ('hit_repeat', 'hit_explore')),
    )
    lines = ['basket size: ' + str(size)]
    for label, names in summary:
        lines.append(' '.join([label] + [str(overall[name]) for name in names]))
    for line in lines:
        print(line)
        file.write(line + '\n')

    return overall['recall']

if __name__ == '__main__':
    # Evaluate the prediction files at several basket sizes and record the
    # results in eval_results.txt.

    pred_folder = 'p_top_pred'
    fold_list = [0]

    eval_file = 'eval_results.txt'
    # The context manager flushes and closes the results file even if an
    # evaluation raises; the handle used to be left open.
    with open(eval_file, 'w') as f:
        for dataset in ['dunnhumby']:
            f.write('############'+dataset+'########### \n')
            for size in (5, 10, 30, 50, 65):
                get_repeat_eval(pred_folder, dataset, size, fold_list, f)

0 0.201257584713824
basket size: 5
recall, ndcg, hit: 0.201257584713824 0.2372063610188451 0.6465315666406859
precision, f1: 0.26695245518316446 0.1906763050292422
repeat-explore ratio: 0.9999220576773189 7.794232268121589e-05
repeat-explore recall 0.34994590464127046 0.0
repeat-explore hit: 0.7452830188679245 0.0
0 0.28574471673012125
basket size: 10
recall, ndcg, hit: 0.28574471673012125 0.2872314938048419 0.7186282151208107
precision, f1: 0.20066250974279032 0.19663961981162836
repeat-explore ratio: 0.999298519095869 0.0007014809041309431
repeat-explore recall 0.49280921762660956 0.0
repeat-explore hit: 0.8283917340521114 0.0
0 0.425907011923478
basket size: 30
recall, ndcg, hit: 0.425907011923478 0.35047220814249014 0.8047544816835541
precision, f1: 0.10771628994544039 0.15212146305271665
repeat-explore ratio: 0.9722525331254872 0.027747466874512863
repeat-explore recall 0.7442295480518485 0.0
repeat-explore hit: 0.9276729559748428 0.0
0 0.49008431399020885
basket size: 50
recall, 