In [9]:
import numpy as np
import math
from scipy import stats

# Metric labels that read_metrics() searches for in log lines; the order here
# is also the order of the lists it returns.
metrics = ['NDCG@10', 'NDCG@5', 'Mean Reciprocal Rank', 'Recall@10', 'Recall@5']


def read_metrics(path, torv):
    """Collect one series of scores per metric from a text log.

    Every line of the file is stripped and checked for a marker of the form
    ``"<torv>: <metric>"`` for each entry in ``metrics``. For the first marker
    found in a line, the text following ``"<marker>: "`` is parsed as a float
    and appended to that metric's series. Lines matching no marker are ignored.

    Args:
        path: Path of the log file to read.
        torv: Prefix placed before the metric label when building markers
            (callers in this file pass 'Validation' or 'Test').

    Returns:
        A tuple of lists of floats, one list per entry of ``metrics`` and in
        the same order (NDCG@10, NDCG@5, MRR, Recall@10, Recall@5).

    Raises:
        IndexError: a line contains a marker that is not followed by ``": "``.
        ValueError: the text after the marker is not a valid float.
    """
    with open(path, 'r') as log_file:
        raw_lines = log_file.readlines()

    # e.g. "Validation: NDCG@10"
    markers = [torv + ': ' + label for label in metrics]
    series = [[] for _ in markers]

    for raw in raw_lines:
        text = raw.strip()
        for marker, values in zip(markers, series):
            if marker in text:
                values.append(float(text.split(marker + ': ')[1].strip()))
                # Only the first matching marker is used for a given line.
                break
    return tuple(series)

def find_max_epoch(data):
    """Locate the largest value in ``data``.

    Args:
        data: Non-empty list of comparable values (one per epoch).

    Returns:
        ``(index, value)`` where ``value`` is the maximum of ``data`` and
        ``index`` is the position of its first occurrence.

    Raises:
        ValueError: ``data`` is empty.
    """
    best = max(data)
    return data.index(best), best

def find_top_three_epochs(lst):
    """Return the indices of the (up to) three largest values in ``lst``.

    Indices are ordered from largest value to smallest; equal values keep
    their original relative order. Fewer than three indices are returned when
    ``lst`` has fewer than three elements.
    """
    order = list(range(len(lst)))
    # Stable descending sort of positions by the value stored at each one.
    order.sort(key=lambda pos: lst[pos], reverse=True)
    return order[:3]



# metrics_name = ['ndcg10', 'ndcg5', 'mrr', 'r10', 'r5']

def mean(data):
    """Return the arithmetic mean of ``data``.

    Raises:
        ZeroDivisionError: ``data`` is empty.
    """
    total = sum(data)
    count = len(data)
    return total / count


def variance(data):
    """Return the sample variance of ``data`` (divides by ``len(data) - 1``).

    The squared deviations from ``mean(data)`` are accumulated one element at
    a time and the total is divided by ``n - 1``. With plain Python numbers a
    single-element input therefore divides by zero.
    """
    centre = mean(data)
    squared_dev_total = 0
    for value in data:
        squared_dev_total += (value - centre) ** 2
    degrees_of_freedom = len(data) - 1
    return squared_dev_total / degrees_of_freedom


# Module-level significance level.
# NOTE(review): t_test() declares its own `alpha=0.05` parameter and no visible
# code reads this global -- confirm whether it is still needed.
alpha = 0.05

# x_data = exp2_ndcg_5_list
# y_data = exp0_ndcg_5_list

def t_test(x_data, y_data, alpha=0.05):
    """One-sided Welch's t-test of H1: mean(x_data) > mean(y_data).

    The two samples are treated as independent with possibly unequal
    variances. The statistic is ``(mean_x - mean_y) / sqrt(s1/m + s2/n)``
    with unbiased sample variances ``s1``, ``s2``, and the degrees of freedom
    come from the Welch-Satterthwaite approximation.

    The sample means, the reject decision at level ``alpha`` and the p-value
    are printed; the statistic and p-value are returned.

    Args:
        x_data: Sequence of observations expected to have the larger mean.
        y_data: Sequence of observations to compare against.
        alpha: Significance level of the upper-tail test.

    Returns:
        ``(t, pvalue)``: the Welch t statistic and its upper-tail p-value.

    Raises:
        ValueError: either sample has fewer than two observations, in which
            case the sample variance (ddof=1) is undefined.
    """
    m = len(x_data)
    n = len(y_data)
    if m < 2 or n < 2:
        raise ValueError(
            't_test needs at least two observations per sample, got %d and %d' % (m, n)
        )

    x_mean = np.mean(x_data)
    x_var = np.var(x_data, ddof=1)
    y_mean = np.mean(y_data)
    y_var = np.var(y_data, ddof=1)

    # Squared standard error of the difference of means.
    se_sq = x_var / m + y_var / n
    t = (x_mean - y_mean) / math.sqrt(se_sq)

    # Welch-Satterthwaite effective degrees of freedom.
    dof = (se_sq ** 2) / (
        (x_var ** 2) / (m * m * (m - 1)) + (y_var ** 2) / (n * n * (n - 1))
    )

    critical = stats.t.isf(alpha, dof)
    # Use the survival function directly: `1 - cdf(t)` rounds to exactly 0.0
    # once cdf(t) is within machine epsilon of 1, hiding very small p-values.
    pvalue = stats.t.sf(t, dof)

    print('--- mean: ', x_mean, y_mean)
    print('--- Reject: ', t > critical)
    print('--- p-value:', pvalue)

    return t, pvalue


def get_test_res_list(path):
    """Return the test metrics at the epoch with the best validation MRR.

    The log at ``path`` is parsed twice with ``read_metrics``: once for the
    'Validation' series and once for the 'Test' series. The epoch is chosen as
    the index of the maximum validation Mean Reciprocal Rank, and that index
    is then used to pick one value from each test series. The chosen epoch and
    its validation MRR are printed.

    Returns:
        Tuple ``(ndcg10, ndcg5, mrr, recall10, recall5)`` of test scores at
        the selected epoch.
    """
    validation_series = read_metrics(path, 'Validation')
    # Position 2 of the read_metrics() result is the MRR series.
    best_epoch, best_val_mrr = find_max_epoch(validation_series[2])
    print('best epoch:', best_epoch, 'best val mrr:', best_val_mrr)

    ndcg10_test, ndcg5_test, mrr_test, r10_test, r5_test = read_metrics(path, 'Test')
    test_series = (ndcg10_test, ndcg5_test, mrr_test, r10_test, r5_test)
    return tuple(scores[best_epoch] for scores in test_series)

def get_res(exp0_path, exp2_path):
    """Compare two groups of runs metric by metric with ``t_test``.

    For every log path in each group, ``get_test_res_list`` supplies the five
    test scores of that run. The scores are grouped per metric, and for each
    entry of ``metrics`` a blank separator is printed and ``t_test`` is called
    with the ``exp2_path`` scores as ``x_data`` and the ``exp0_path`` scores
    as ``y_data``. Nothing is returned; all results are printed.

    Args:
        exp0_path: Iterable of log paths for the first group of runs.
        exp2_path: Iterable of log paths for the second group of runs.
    """
    def _scores_by_metric(paths):
        # One list per metric, in the order returned by get_test_res_list().
        columns = [[], [], [], [], []]
        for log_path in paths:
            picked = get_test_res_list(log_path)
            for slot in range(5):
                columns[slot].append(picked[slot])
        return columns

    exp0_res = _scores_by_metric(exp0_path)
    exp2_res = _scores_by_metric(exp2_path)

    for idx in range(len(metrics)):
        print("\n")
        t_test(x_data=exp2_res[idx], y_data=exp0_res[idx])

In [None]:
# # V100 initial DGCF VS. DCGLive, commercial
# exp0_path = ['./interaction_prediction_ks_full_dgcf.attention.adj.txt', './interaction_prediction_ks_full_dgcf_3.attention.adj.txt', './interaction_prediction_ks_full_dgcf_2.attention.adj.txt']
# exp2_path = ['./final/interaction_prediction_ks_full_exp_final_linear_09_2.attention.adj.txt', 
#             './final/interaction_prediction_ks_full_exp_final_linear_09.attention.adj.txt',
#             './final/interaction_prediction_ks_full_exp_final_linear_09_3.attention.adj.txt']
# get_res(exp0_path=exp0_path, exp2_path=exp2_path)

In [None]:
# V100 initial DGCF VS. DCGLive, KuaiLive
# exp0_path: three run logs whose file names contain "dgcf"
# (presumably the DGCF baseline runs -- confirm).
exp0_path = ['./KuaiLive/interaction_prediction_kuailive5_dgcf.attention.adj.txt', 
             './KuaiLive/interaction_prediction_kuailive5_dgcf_2.attention.adj.txt',
             './KuaiLive/interaction_prediction_kuailive5_dgcf_3.attention.adj.txt'
             ]
# exp2_path: three run logs stored under ./KuaiLive/final/
# (presumably the proposed model's runs -- confirm).
exp2_path = ['./KuaiLive/final/interaction_prediction_kuailive5_3_lr_30.attention.adj.txt',
             './KuaiLive/final/interaction_prediction_kuailive5_2_lr_30.attention.adj.txt',
             './KuaiLive/final/interaction_prediction_kuailive5_lr_30.attention.adj.txt']

# get_res() passes the exp2 scores as x_data and the exp0 scores as y_data to
# t_test(), so each printed p-value is for "exp2 mean > exp0 mean" on that metric.
get_res(exp0_path=exp0_path, exp2_path=exp2_path)

best epoch: 7 best val mrr: 0.10310192996114798
best epoch: 4 best val mrr: 0.102389764603501
best epoch: 27 best val mrr: 0.10075579929122003
best epoch: 23 best val mrr: 0.11413522716986989
best epoch: 47 best val mrr: 0.09868431933802518
best epoch: 33 best val mrr: 0.10814223717938079


--- mean:  0.1336315907107688 0.11180459457449038
--- Reject:  True
--- p-value: 0.018835292764909695


--- mean:  0.09996098559505884 0.08155970408924887
--- Reject:  True
--- p-value: 0.027506838628094887


--- mean:  0.11708248121694621 0.10809855678334983
--- Reject:  False
--- p-value: 0.1119366996588177


--- mean:  0.26140976908954566 0.22101983295839475
--- Reject:  True
--- p-value: 0.044035240303800016


--- mean:  0.15607788379489565 0.12614744136736225
--- Reject:  True
--- p-value: 0.015645148114482477
