# A Large Language Model-based tool to facilitate data harmonization: summarize results

In [52]:
#****************************************
# MIT License
# Copyright (c) 2025 Zexu Li, Jinying Chen
#  
# author(s): Zexu Li, Jinying Chen, Boston University Chobanian & Avedisian School of Medicine
# date: 2025-7-7
# ver: 1.0
# 
# This code was written to support data analysis for the Data Harmonization Using Natural Language 
# Processing (NLP harmonization) project and the 2025 paper published in PLOS One.
# The code is for research use only, and is provided as is.
# 

In [53]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [54]:
import os
import re
import pandas as pd
import numpy as np
import scipy.stats as stats
import pickle

In [55]:
datadir = "[path to input data]"

In [56]:
def mean_diff_ci(col1, col2, confidence=0.95):
    """
    Compute the mean paired difference (col1 - col2) and its t-based confidence interval.

    Parameters:
    - col1, col2: array-like paired observations; both must have the same shape
    - confidence: float, two-sided confidence level (default 0.95)

    Returns:
    - mean_diff: float, mean of the element-wise differences col1 - col2
    - ci: tuple (lower, upper), confidence interval for the mean difference

    Raises:
    - ValueError: if the two inputs do not have the same shape
    """
    first, second = np.asarray(col1), np.asarray(col2)

    if first.shape != second.shape:
        raise ValueError("Input arrays must have the same shape.")

    paired_diff = first - second
    center = np.mean(paired_diff)

    # Half-width = two-sided t critical value (n - 1 degrees of freedom)
    # multiplied by the standard error of the mean difference.
    half_width = stats.t.ppf((1 + confidence) / 2, len(paired_diff) - 1) * stats.sem(paired_diff)

    return center, (center - half_width, center + half_width)

In [57]:
def ci_of_one_sample(data, confidence = 0.95):
    """
    Print the mean of one sample and its t-based confidence interval.

    Parameters:
    - data: sequence of numbers (anything accepted by np.mean, stats.sem and len)
    - confidence: float, two-sided confidence level (default 0.95)

    Returns:
    - None; the mean and the interval are printed to stdout only.
    """
    # Sample statistics
    mean = np.mean(data)
    sem = stats.sem(data)  # Standard error of the mean

    df = len(data) - 1  # degrees of freedom
    t_crit = stats.t.ppf((1 + confidence) / 2, df)
    margin = t_crit * sem

    ci_lower = mean - margin
    ci_upper = mean + margin

    print(f"Mean: {mean:.6f}")
    # Label the interval with the level actually used instead of a fixed "95%";
    # the default call still prints "95% CI: ..." exactly as before.
    print(f"{confidence * 100:g}% CI: ({ci_lower:.6f}, {ci_upper:.6f})")

In [58]:
def extract_performance_scores(file_path, modelname, trial):
    """
    Parse one output log file into a long-format table of evaluation scores.

    The file is scanned line by line. A line starting with "Best Grid:" supplies
    five scores (HR30, HR20, HR10, HR5, MRR) stored under the `modelname` column;
    a line starting with "New E5," supplies the same five scores stored under the
    'e5' column and closes the current block. Blocks are labelled, in order of
    appearance, "opt_HR", "opt_MRR" and "opt_uniq_HR".

    Parameters:
    - file_path: path of the text file to parse
    - modelname: name of the column that receives the "Best Grid:" scores
    - trial: value stored in the 'trial' column of every returned row

    Returns:
    - pandas DataFrame with columns [modelname, 'e5', 'optimize_opt', 'metric',
      'trial', 'file_name'], five rows per block.

    Raises:
    - ValueError: if a "Best Grid:" / "New E5," line does not contain the expected
      scores, if the file has more blocks than known optimization options, or if
      the numbers of model scores and E5 scores differ.
    """
    optimize_opts = ["opt_HR", "opt_MRR", "opt_uniq_HR"]
    metric_names = ["HR30", "HR20", "HR10", "HR5", "MRR"]
    # Compiled once; captures the four hit rates followed by the MRR.
    score_pattern = re.compile(
        r"unique_HR:([0-9\.]+),([0-9\.]+),([0-9\.]+),([0-9\.]+), MRR:([0-9\.]+)"
    )

    def parse_scores(line, line_no):
        # Return the five scores of one result line as floats (HR30, HR20, HR10, HR5, MRR).
        found = score_pattern.search(line)
        if found is None:
            raise ValueError(
                f"{file_path}, line {line_no}: expected 'unique_HR:...' and 'MRR:...' "
                f"scores, got: {line.strip()!r}"
            )
        return [float(value) for value in found.groups()]

    eval_relts = {modelname: [], 'e5': [], 'optimize_opt': [], 'metric': []}

    block_idx = 0  # index into optimize_opts; advanced after each "New E5," line
    with open(file_path, 'r') as file:
        for line_no, line in enumerate(file, start=1):
            if line.startswith("Best Grid:"):
                if block_idx >= len(optimize_opts):
                    raise ValueError(
                        f"{file_path}, line {line_no}: more than "
                        f"{len(optimize_opts)} result blocks found"
                    )
                eval_relts[modelname].extend(parse_scores(line, line_no))
                eval_relts['metric'].extend(metric_names)
                eval_relts['optimize_opt'].extend([optimize_opts[block_idx]] * len(metric_names))
            elif line.startswith("New E5,"):
                eval_relts['e5'].extend(parse_scores(line, line_no))
                block_idx += 1

    # Every model block needs a matching E5 block, otherwise the columns cannot be aligned.
    n_model = len(eval_relts[modelname])
    n_e5 = len(eval_relts['e5'])
    if n_model != n_e5:
        raise ValueError(
            f"{file_path}: found {n_model} model scores but {n_e5} E5 scores; "
            "'Best Grid:' and 'New E5,' lines must come in pairs"
        )

    eval_relts['trial'] = [trial] * n_model
    eval_relts['file_name'] = [file_path] * n_model

    return pd.DataFrame(eval_relts)

In [59]:
def extract_scores_from_folder(model_output_path, model_name):
    """
    Parse every 'output*.txt' file in a folder and stack the results into one table.

    Parameters:
    - model_output_path: directory containing the output files
    - model_name: column name passed on to extract_performance_scores

    Returns:
    - pandas DataFrame: the per-file tables concatenated row-wise with a fresh index.

    Raises:
    - ValueError: if the folder contains no file named 'output*.txt'.
    """
    # NOTE: names are sorted lexicographically ("output10.txt" precedes "output2.txt"),
    # so the trial number is a running counter and not the number in the file name;
    # the mapping is printed below for reference.
    file_names = sorted(
        name for name in os.listdir(model_output_path)
        if name.endswith('.txt') and name.startswith('output')
    )
    if not file_names:
        raise ValueError(f"No 'output*.txt' files found in {model_output_path}")

    df_ls = []
    for trial, file_name in enumerate(file_names, start=1):
        file_path = os.path.join(model_output_path, file_name)
        print(f"trial No mapping: {trial} -> {file_name}")
        df_ls.append(extract_performance_scores(file_path, model_name, trial))

    return pd.concat(df_ls, axis=0, ignore_index=True)

In [60]:
def extract_scores(combined_df, optimize_opt = "opt_uniq_HR", models = ["RF", "e5"]):
    """
    Reshape the long-format score table into one row per trial for a single
    optimization option.

    Parameters:
    - combined_df: DataFrame with columns 'trial', 'file_name', 'optimize_opt',
      'metric' and one score column per entry of `models`
    - optimize_opt: value of the 'optimize_opt' column to keep
    - models: list of score columns to spread out (read only, never modified here)

    Returns:
    - DataFrame with 'trial', 'file_name' and one '<model>_<metric>' column for
      every model/metric combination.
    """
    # Keep only the rows of the requested optimization option.
    selected = combined_df[combined_df["optimize_opt"] == optimize_opt]

    # One row per trial; columns become (model, metric) pairs.
    per_trial = selected.pivot(index='trial', columns='metric', values=models)

    # Flatten the two-level column labels into '<model>_<metric>' names.
    per_trial.columns = [f'{val}_{col}' for val, col in per_trial.columns]
    per_trial = per_trial.reset_index()

    # Attach the file name recorded for each trial.
    trial_files = combined_df[['trial', 'file_name']].drop_duplicates()
    return pd.merge(trial_files, per_trial, on='trial')

## Table 4 

In [61]:
model_output_file = datadir + "/E5_RFonHR_comparison_50trials_res.csv"

df1 = pd.read_csv(model_output_file)

#df1

df = df1.drop(columns=['E5_File', 'File_name'])

#df

In [62]:
summary = pd.DataFrame({
    'mean': df.mean(),
    'std': df.std(ddof=1)  # ddof=1 for sample standard deviation
})

summary

Unnamed: 0,mean,std
E5_Unique_HR_30,0.910857,0.028507
E5_Unique_HR_20,0.904571,0.027899
E5_Unique_HR_10,0.871143,0.034452
E5_Unique_HR_5,0.803143,0.040436
E5_MRR,0.658849,0.044175
top_30_HR_whole,0.986,0.01241
top_20_whole,0.974857,0.017211
top_10_whole,0.929429,0.026673
top_5_whole,0.872,0.038176
MRR_whole,0.744448,0.036333


In [63]:
# Paired t-test
t_stat, p_value = stats.ttest_rel(df["top_30_HR_whole"], df["E5_Unique_HR_30"])
print(f"HR30: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(df["top_20_whole"], df["E5_Unique_HR_20"])
print(f"HR20: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(df["top_10_whole"], df["E5_Unique_HR_10"])
print(f"HR10: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(df["top_5_whole"], df["E5_Unique_HR_5"])
print(f"HR5: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(df["MRR_whole"], df["E5_MRR"])
print(f"MRR: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")

HR30: t-statistic: 17.8974, p-value: 0.0000
HR20: t-statistic: 17.7762, p-value: 0.0000
HR10: t-statistic: 11.9482, p-value: 0.0000
HR5: t-statistic: 13.6974, p-value: 0.0000
MRR: t-statistic: 18.2350, p-value: 0.0000


In [64]:
mean_diff_ci(df["top_30_HR_whole"], df["E5_Unique_HR_30"], confidence=0.95)
mean_diff_ci(df["top_20_whole"], df["E5_Unique_HR_20"], confidence=0.95)
mean_diff_ci(df["top_10_whole"], df["E5_Unique_HR_10"], confidence=0.95)
mean_diff_ci(df["top_5_whole"], df["E5_Unique_HR_5"], confidence=0.95)
mean_diff_ci(df["MRR_whole"], df["E5_MRR"], confidence=0.95)

(0.07514285715999997, (0.06670559497995944, 0.08358011934004049))

(0.07028571419999997, (0.062340013789207435, 0.07823141461079251))

(0.05828571439999999, (0.048482621756140806, 0.06808880704385917))

(0.0688571428, (0.05875492648065321, 0.0789593591193468))

(0.08559866773999998, (0.07616534944608044, 0.09503198603391952))

### Results from the original 50 MT trials (sensitivity analysis for Table 4, not reported in paper)

In [65]:
''' The Random Forest model may produce slightly different results each time, even when using the same set of hyperparameters, 
due to its inherent randomness.

Table 4 reports the results from retraining the Random Forest model using the optimal hyperparameters obtained from the original 50 trials. 
These results differed slightly from the original outputs (only in the last decimal place), but did not affect the conclusions.
'''

' The Random Forest model may produce slightly different results each time, even when using the same set of hyperparameters, \ndue to its inherent randomness.\n\nTable 4 reports the results from retraining the Random Forest model using the optimal hyperparameters obtained from the original 50 trials. \nThese results differed slightly from the original outputs (only in the last decimal place), but did not affect the conclusions.\n'

In [66]:
# extract e5 and RF results from original trials
model_output_path = datadir + "/02232025res_v2/"
model_name = "RF"

combined_df = extract_scores_from_folder(model_output_path, model_name)

combined_df.tail()

trial No mapping: 1 -> output1.txt
trial No mapping: 2 -> output10.txt
trial No mapping: 3 -> output11.txt
trial No mapping: 4 -> output12.txt
trial No mapping: 5 -> output13.txt
trial No mapping: 6 -> output14.txt
trial No mapping: 7 -> output15.txt
trial No mapping: 8 -> output16.txt
trial No mapping: 9 -> output17.txt
trial No mapping: 10 -> output18.txt
trial No mapping: 11 -> output19.txt
trial No mapping: 12 -> output2.txt
trial No mapping: 13 -> output20.txt
trial No mapping: 14 -> output21.txt
trial No mapping: 15 -> output22.txt
trial No mapping: 16 -> output23.txt
trial No mapping: 17 -> output24.txt
trial No mapping: 18 -> output25.txt
trial No mapping: 19 -> output26.txt
trial No mapping: 20 -> output27.txt
trial No mapping: 21 -> output28.txt
trial No mapping: 22 -> output29.txt
trial No mapping: 23 -> output3.txt
trial No mapping: 24 -> output30.txt
trial No mapping: 25 -> output31.txt
trial No mapping: 26 -> output32.txt
trial No mapping: 27 -> output33.txt
trial No mapp

Unnamed: 0,RF,e5,optimize_opt,metric,trial,file_name
745,1.0,0.957143,opt_uniq_HR,HR30,50,/restricted/projectnb/fhs-std-chen/jchen/code_...
746,1.0,0.942857,opt_uniq_HR,HR20,50,/restricted/projectnb/fhs-std-chen/jchen/code_...
747,0.942857,0.9,opt_uniq_HR,HR10,50,/restricted/projectnb/fhs-std-chen/jchen/code_...
748,0.9,0.842857,opt_uniq_HR,HR5,50,/restricted/projectnb/fhs-std-chen/jchen/code_...
749,0.730895,0.689515,opt_uniq_HR,MRR,50,/restricted/projectnb/fhs-std-chen/jchen/code_...


In [67]:
df1 = extract_scores(combined_df, "opt_uniq_HR", models = ["RF", "e5"])
df = df1.drop(columns=['file_name'])
summary = pd.DataFrame({
    'mean': df.mean(),
    'std': df.std(ddof=1)  # ddof=1 for sample standard deviation
})

summary

Unnamed: 0,mean,std
trial,25.5,14.57738
RF_HR10,0.927714,0.027291
RF_HR20,0.973714,0.013648
RF_HR30,0.986857,0.012527
RF_HR5,0.873143,0.036122
RF_MRR,0.744401,0.037811
e5_HR10,0.871143,0.034452
e5_HR20,0.904571,0.027899
e5_HR30,0.910857,0.028507
e5_HR5,0.803143,0.040436


In [68]:
# Paired t-test
t_stat, p_value = stats.ttest_rel(df["RF_HR30"], df["e5_HR30"])
print(f"HR30: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(df["RF_HR20"], df["e5_HR20"])
print(f"HR20: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(df["RF_HR10"], df["e5_HR10"])
print(f"HR10: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(df["RF_HR5"], df["e5_HR5"])
print(f"HR5: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(df["RF_MRR"], df["e5_MRR"])
print(f"MRR: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")

HR30: t-statistic: 17.2555, p-value: 0.0000
HR20: t-statistic: 17.6267, p-value: 0.0000
HR10: t-statistic: 10.9587, p-value: 0.0000
HR5: t-statistic: 14.1813, p-value: 0.0000
MRR: t-statistic: 17.6433, p-value: 0.0000


In [69]:
mean_diff_ci(df["RF_HR30"], df["e5_HR30"], confidence=0.95)
mean_diff_ci(df["RF_HR20"], df["e5_HR20"], confidence=0.95)
mean_diff_ci(df["RF_HR10"], df["e5_HR10"], confidence=0.95)
mean_diff_ci(df["RF_HR5"], df["e5_HR5"], confidence=0.95)
mean_diff_ci(df["RF_MRR"], df["e5_MRR"], confidence=0.95)

(0.07600000000000003, (0.0671490622858366, 0.08485093771416345))

(0.06914285714285716, (0.06126006367052508, 0.07702565061518923))

(0.056571428571428585, (0.04619748493912138, 0.06694537220373578))

(0.06999999999999998, (0.06008058355530563, 0.07991941644469433))

(0.0855513852785291, (0.0758070600104507, 0.09529571054660749))

### Appendix S2 Text: Table C (sensitivity analysis for Table 4, using MRR to tune hyperparameters)

In [70]:
# extract e5 and RF results from original trials
model_output_path = datadir + "/02232025res_v2/"
model_name = "RF"

combined_df = extract_scores_from_folder(model_output_path, model_name)

combined_df.tail()

trial No mapping: 1 -> output1.txt
trial No mapping: 2 -> output10.txt
trial No mapping: 3 -> output11.txt
trial No mapping: 4 -> output12.txt
trial No mapping: 5 -> output13.txt
trial No mapping: 6 -> output14.txt
trial No mapping: 7 -> output15.txt
trial No mapping: 8 -> output16.txt
trial No mapping: 9 -> output17.txt
trial No mapping: 10 -> output18.txt
trial No mapping: 11 -> output19.txt
trial No mapping: 12 -> output2.txt
trial No mapping: 13 -> output20.txt
trial No mapping: 14 -> output21.txt
trial No mapping: 15 -> output22.txt
trial No mapping: 16 -> output23.txt
trial No mapping: 17 -> output24.txt
trial No mapping: 18 -> output25.txt
trial No mapping: 19 -> output26.txt
trial No mapping: 20 -> output27.txt
trial No mapping: 21 -> output28.txt
trial No mapping: 22 -> output29.txt
trial No mapping: 23 -> output3.txt
trial No mapping: 24 -> output30.txt
trial No mapping: 25 -> output31.txt
trial No mapping: 26 -> output32.txt
trial No mapping: 27 -> output33.txt
trial No mapp

Unnamed: 0,RF,e5,optimize_opt,metric,trial,file_name
745,1.0,0.957143,opt_uniq_HR,HR30,50,/restricted/projectnb/fhs-std-chen/jchen/code_...
746,1.0,0.942857,opt_uniq_HR,HR20,50,/restricted/projectnb/fhs-std-chen/jchen/code_...
747,0.942857,0.9,opt_uniq_HR,HR10,50,/restricted/projectnb/fhs-std-chen/jchen/code_...
748,0.9,0.842857,opt_uniq_HR,HR5,50,/restricted/projectnb/fhs-std-chen/jchen/code_...
749,0.730895,0.689515,opt_uniq_HR,MRR,50,/restricted/projectnb/fhs-std-chen/jchen/code_...


In [71]:
df1 = extract_scores(combined_df, "opt_MRR", models = ["RF", "e5"])
df = df1.drop(columns=['file_name'])
summary = pd.DataFrame({
    'mean': df.mean(),
    'std': df.std(ddof=1)  # ddof=1 for sample standard deviation
})

summary

Unnamed: 0,mean,std
trial,25.5,14.57738
RF_HR10,0.928,0.029569
RF_HR20,0.976,0.01566
RF_HR30,0.987143,0.011987
RF_HR5,0.873143,0.036237
RF_MRR,0.742546,0.037461
e5_HR10,0.871143,0.034452
e5_HR20,0.904571,0.027899
e5_HR30,0.910857,0.028507
e5_HR5,0.803143,0.040436


In [72]:
# Paired t-test
t_stat, p_value = stats.ttest_rel(df["RF_HR30"], df["e5_HR30"])
print(f"HR30: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(df["RF_HR20"], df["e5_HR20"])
print(f"HR20: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(df["RF_HR10"], df["e5_HR10"])
print(f"HR10: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(df["RF_HR5"], df["e5_HR5"])
print(f"HR5: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(df["RF_MRR"], df["e5_MRR"])
print(f"MRR: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")


HR30: t-statistic: 17.8519, p-value: 0.0000
HR20: t-statistic: 17.9546, p-value: 0.0000
HR10: t-statistic: 11.8800, p-value: 0.0000
HR5: t-statistic: 14.8015, p-value: 0.0000
MRR: t-statistic: 18.6155, p-value: 0.0000


In [73]:
mean_diff_ci(df["RF_HR30"], df["e5_HR30"], confidence=0.95)
mean_diff_ci(df["RF_HR20"], df["e5_HR20"], confidence=0.95)
mean_diff_ci(df["RF_HR10"], df["e5_HR10"], confidence=0.95)
mean_diff_ci(df["RF_HR5"], df["e5_HR5"], confidence=0.95)
mean_diff_ci(df["RF_MRR"], df["e5_MRR"], confidence=0.95)

(0.0762857142857143, (0.06769828299886906, 0.08487314557255955))

(0.07142857142857145, (0.0634339116095023, 0.0794232312476406))

(0.05685714285714287, (0.047239387140565664, 0.06647489857372008))

(0.06999999999999998, (0.060496242781853796, 0.07950375721814616))

(0.08369658593699501, (0.07466139261675697, 0.09273177925723305))

## Table 6

In [74]:
# Load the pickled test results of the full model.
# os.path.join keeps the path valid whether or not `datadir` ends with a separator
# (plain concatenation silently dropped the "/" between directory and file name).
file_path = os.path.join(datadir, 'test_res_full2_022425.pkl')
# NOTE(review): pickle.load can execute arbitrary code; only open files produced by this project.
with open(file_path, 'rb') as file:
    test_res_full2 = pickle.load(file)
# Each dictionary key becomes one row (presumably one trial -- confirm against the producer script).
test_res_full_df = pd.DataFrame.from_dict(test_res_full2, orient='index')
test_res_full_df.columns = ['top_30_HR_whole','top_20_whole','top_10_whole','top_5_whole','MRR_whole']

In [75]:
df = test_res_full_df
summary = pd.DataFrame({
    'mean': df.mean(),
    'std': df.std(ddof=1)  # ddof=1 for sample standard deviation
})

summary

Unnamed: 0,mean,std
top_30_HR_whole,0.986,0.01241
top_20_whole,0.974857,0.017211
top_10_whole,0.929429,0.026673
top_5_whole,0.872,0.038176
MRR_whole,0.744448,0.036333


In [76]:
df.apply(ci_of_one_sample)

Mean: 0.986000
95% CI: (0.982473, 0.989527)
Mean: 0.974857
95% CI: (0.969966, 0.979748)
Mean: 0.929429
95% CI: (0.921848, 0.937009)
Mean: 0.872000
95% CI: (0.861151, 0.882849)
Mean: 0.744448
95% CI: (0.734122, 0.754774)


top_30_HR_whole    None
top_20_whole       None
top_10_whole       None
top_5_whole        None
MRR_whole          None
dtype: object

### full model vs. removing Fuzzy features

In [77]:
# Load the pickled test results of the model trained without the fuzzy features.
# os.path.join keeps the path valid whether or not `datadir` ends with a separator
# (plain concatenation silently dropped the "/" between directory and file name).
file_path = os.path.join(datadir, 'test_res_drop_fuzz2_022425.pkl')
# NOTE(review): pickle.load can execute arbitrary code; only open files produced by this project.
with open(file_path, 'rb') as file:
    test_res_drop_fuzz2 = pickle.load(file)
# Each dictionary key becomes one row (presumably one trial -- confirm against the producer script).
test_res_drop_fuzz_df = pd.DataFrame.from_dict(test_res_drop_fuzz2, orient='index')
test_res_drop_fuzz_df.columns = ['top_30_HR_drop_fuzz','top_20_drop_fuzz','top_10_drop_fuzz','top_5_drop_fuzz','MRR_drop_fuzz']


In [78]:
df = test_res_drop_fuzz_df
summary = pd.DataFrame({
    'mean': df.mean(),
    'std': df.std(ddof=1)  # ddof=1 for sample standard deviation
})

summary

Unnamed: 0,mean,std
top_30_HR_drop_fuzz,0.984857,0.014832
top_20_drop_fuzz,0.974286,0.015542
top_10_drop_fuzz,0.919429,0.026883
top_5_drop_fuzz,0.874286,0.036393
MRR_drop_fuzz,0.740758,0.039445


In [79]:
# Paired t-test
t_stat, p_value = stats.ttest_rel(test_res_full_df['top_30_HR_whole'], test_res_drop_fuzz_df['top_30_HR_drop_fuzz'])
print(f"HR30: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(test_res_full_df['top_20_whole'], test_res_drop_fuzz_df['top_20_drop_fuzz'])
print(f"HR20: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(test_res_full_df['top_10_whole'], test_res_drop_fuzz_df['top_10_drop_fuzz'])
print(f"HR10: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(test_res_full_df['top_5_whole'], test_res_drop_fuzz_df['top_5_drop_fuzz'])
print(f"HR5: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(test_res_full_df['MRR_whole'], test_res_drop_fuzz_df['MRR_drop_fuzz'])
print(f"MRR: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")

HR30: t-statistic: 0.7268, p-value: 0.4708
HR20: t-statistic: 0.2860, p-value: 0.7761
HR10: t-statistic: 3.4223, p-value: 0.0013
HR5: t-statistic: -0.7739, p-value: 0.4427
MRR: t-statistic: 1.1922, p-value: 0.2389


In [80]:
print(mean_diff_ci(test_res_full_df['top_30_HR_whole'], test_res_drop_fuzz_df['top_30_HR_drop_fuzz'], confidence=0.95))
print(mean_diff_ci(test_res_full_df['top_20_whole'], test_res_drop_fuzz_df['top_20_drop_fuzz'], confidence=0.95))
print(mean_diff_ci(test_res_full_df['top_10_whole'], test_res_drop_fuzz_df['top_10_drop_fuzz'], confidence=0.95))
print(mean_diff_ci(test_res_full_df['top_5_whole'], test_res_drop_fuzz_df['top_5_drop_fuzz'], confidence=0.95))
print(mean_diff_ci(test_res_full_df['MRR_whole'], test_res_drop_fuzz_df['MRR_drop_fuzz'], confidence=0.95))

(0.001142857142857152, (-0.002016910421279172, 0.004302624706993476))
(0.0005714285714285694, (-0.0034435348504647492, 0.004586391993321888))
(0.010000000000000002, (0.004128012518692328, 0.015871987481307675))
(-0.0022857142857142863, (-0.008220950460859399, 0.0036495218894308256))
(0.0036897228767850954, (-0.0025298079658527687, 0.009909253719422959))


### full model vs. removing other features

In [81]:
# Load the pickled test results of the model trained without the "other" (derived) features.
# os.path.join keeps the path valid whether or not `datadir` ends with a separator
# (plain concatenation silently dropped the "/" between directory and file name).
file_path = os.path.join(datadir, 'test_res_drop_derive2_022425.pkl')
# NOTE(review): pickle.load can execute arbitrary code; only open files produced by this project.
with open(file_path, 'rb') as file:
    test_res_drop_derive2 = pickle.load(file)
# Each dictionary key becomes one row (presumably one trial -- confirm against the producer script).
test_res_drop_derive_df = pd.DataFrame.from_dict(test_res_drop_derive2, orient='index')
test_res_drop_derive_df.columns = ['top_30_HR_drop_derive','top_20_drop_derive','top_10_drop_derive','top_5_drop_derive','MRR_drop_derive']

In [82]:
df = test_res_drop_derive_df
summary = pd.DataFrame({
    'mean': df.mean(),
    'std': df.std(ddof=1)  # ddof=1 for sample standard deviation
})

summary

Unnamed: 0,mean,std
top_30_HR_drop_derive,0.985143,0.014419
top_20_drop_derive,0.967429,0.018713
top_10_drop_derive,0.920857,0.024525
top_5_drop_derive,0.867429,0.038293
MRR_drop_derive,0.738643,0.042256


In [83]:
# Paired t-test
t_stat, p_value = stats.ttest_rel(test_res_full_df['top_30_HR_whole'], test_res_drop_derive_df['top_30_HR_drop_derive'])
print(f"HR30: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(test_res_full_df['top_20_whole'], test_res_drop_derive_df['top_20_drop_derive'])
print(f"HR20: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(test_res_full_df['top_10_whole'], test_res_drop_derive_df['top_10_drop_derive'])
print(f"HR10: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(test_res_full_df['top_5_whole'], test_res_drop_derive_df['top_5_drop_derive'])
print(f"HR5: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(test_res_full_df['MRR_whole'], test_res_drop_derive_df['MRR_drop_derive'])
print(f"MRR: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")

HR30: t-statistic: 0.5033, p-value: 0.6170
HR20: t-statistic: 3.7753, p-value: 0.0004
HR10: t-statistic: 3.9686, p-value: 0.0002
HR5: t-statistic: 2.0300, p-value: 0.0478
MRR: t-statistic: 2.7129, p-value: 0.0092


In [84]:
print(mean_diff_ci(test_res_full_df['top_30_HR_whole'], test_res_drop_derive_df['top_30_HR_drop_derive'], confidence=0.95))
print(mean_diff_ci(test_res_full_df['top_20_whole'], test_res_drop_derive_df['top_20_drop_derive'], confidence=0.95))
print(mean_diff_ci(test_res_full_df['top_10_whole'], test_res_drop_derive_df['top_10_drop_derive'], confidence=0.95))
print(mean_diff_ci(test_res_full_df['top_5_whole'], test_res_drop_derive_df['top_5_drop_derive'], confidence=0.95))
print(mean_diff_ci(test_res_full_df['MRR_whole'], test_res_drop_derive_df['MRR_drop_derive'], confidence=0.95))

(0.0008571428571428585, (-0.0025653107651489484, 0.004279596479434666))
(0.0074285714285714285, (0.0034743931323089963, 0.01138274972483386))
(0.008571428571428574, (0.0042311540258631675, 0.01291170311699398))
(0.004571428571428571, (4.599170812829168e-05, 0.00909686543472885))
(0.005804638562640383, (0.0015048794750431582, 0.010104397650237608))


### full model vs. removing LLM features

In [85]:
# Load the pickled test results of the model trained without the LLM features.
# os.path.join keeps the path valid whether or not `datadir` ends with a separator
# (plain concatenation silently dropped the "/" between directory and file name).
file_path = os.path.join(datadir, 'test_res_drop_label_022425.pkl')
# NOTE(review): pickle.load can execute arbitrary code; only open files produced by this project.
with open(file_path, 'rb') as file:
    test_res_drop_label = pickle.load(file)
# Each dictionary key becomes one row (presumably one trial -- confirm against the producer script).
test_res_drop_label_df = pd.DataFrame.from_dict(test_res_drop_label, orient='index')
test_res_drop_label_df.columns = ['top_30_HR_drop_LLM','top_20_drop_LLM','top_10_drop_LLM','top_5_drop_LLM','MRR_drop_LLM']
# Quick look at the column means (displayed as the cell output).
test_res_drop_label_df.mean()

top_30_HR_drop_LLM    0.907714
top_20_drop_LLM       0.870286
top_10_drop_LLM       0.810571
top_5_drop_LLM        0.715143
MRR_drop_LLM          0.572752
dtype: float64

In [86]:
df = test_res_drop_label_df
summary = pd.DataFrame({
    'mean': df.mean(),
    'std': df.std(ddof=1)  # ddof=1 for sample standard deviation
})

summary

Unnamed: 0,mean,std
top_30_HR_drop_LLM,0.907714,0.033684
top_20_drop_LLM,0.870286,0.039556
top_10_drop_LLM,0.810571,0.042693
top_5_drop_LLM,0.715143,0.057391
MRR_drop_LLM,0.572752,0.05386


In [87]:
# Paired t-test
t_stat, p_value = stats.ttest_rel(test_res_full_df['top_30_HR_whole'], test_res_drop_label_df['top_30_HR_drop_LLM'])
print(f"HR30: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(test_res_full_df['top_20_whole'], test_res_drop_label_df['top_20_drop_LLM'])
print(f"HR20: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(test_res_full_df['top_10_whole'], test_res_drop_label_df['top_10_drop_LLM'])
print(f"HR10: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(test_res_full_df['top_5_whole'], test_res_drop_label_df['top_5_drop_LLM'])
print(f"HR5: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
t_stat, p_value = stats.ttest_rel(test_res_full_df['MRR_whole'], test_res_drop_label_df['MRR_drop_LLM'])
print(f"MRR: t-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")


HR30: t-statistic: 17.2772, p-value: 0.0000
HR20: t-statistic: 19.1793, p-value: 0.0000
HR10: t-statistic: 18.9379, p-value: 0.0000
HR5: t-statistic: 18.2486, p-value: 0.0000
MRR: t-statistic: 27.8167, p-value: 0.0000


In [88]:
print(mean_diff_ci(test_res_full_df['top_30_HR_whole'], test_res_drop_label_df['top_30_HR_drop_LLM'], confidence=0.95))
print(mean_diff_ci(test_res_full_df['top_20_whole'], test_res_drop_label_df['top_20_drop_LLM'], confidence=0.95))
print(mean_diff_ci(test_res_full_df['top_10_whole'], test_res_drop_label_df['top_10_drop_LLM'], confidence=0.95))
print(mean_diff_ci(test_res_full_df['top_5_whole'], test_res_drop_label_df['top_5_drop_LLM'], confidence=0.95))
print(mean_diff_ci(test_res_full_df['MRR_whole'], test_res_drop_label_df['MRR_drop_LLM'], confidence=0.95))

(0.0782857142857143, (0.06917999975140357, 0.08739142882002504))
(0.1045714285714286, (0.09361461446546378, 0.11552824267739344))
(0.11885714285714286, (0.10624471597576628, 0.13146956973851942))
(0.15685714285714286, (0.13958365249648233, 0.1741306332178034))
(0.1716958303194102, (0.15929191596498288, 0.1840997446738375))
