In [None]:
import pandas as pd
from scipy import stats

def perform_statistical_tests(result_df):
    # Get unique datasets and methods
    datasets = result_df['Dataset'].unique()
    methods = result_df['Method'].unique()
    
    # Remove 'BaLu' from methods list since we're comparing against it
    comparison_methods = [method for method in methods if method != 'BaLu']
    
    # Initialize result DataFrames
    pehe_pvalues = pd.DataFrame(index=datasets, columns=comparison_methods)
    mae_pvalues = pd.DataFrame(index=datasets, columns=comparison_methods)
    
    # Perform tests for each dataset and method combination
    for dataset in datasets:
        # Get BaLu results for this dataset
        balu_data = result_df[(result_df['Dataset'] == dataset) & 
                               (result_df['Method'] == 'BaLu')]
        
        if len(balu_data) == 0:
            print(f"Warning: No BaLu data found for dataset {dataset}")
            continue
            
        balu_pehe = balu_data['PEHE'].values
        balu_mae = balu_data['MAE'].values
        
        for method in comparison_methods:
            method_data = result_df[(result_df['Dataset'] == dataset) & 
                                   (result_df['Method'] == method)]
            
            if len(method_data) == 0:
                print(f"Warning: No {method} data found for dataset {dataset}")
                continue
                
            method_pehe = method_data['PEHE'].values
            method_mae = method_data['MAE'].values
            
            # Perform Wilcoxon rank-sum test (Mann-Whitney U test)
            # This is a non-parametric test that doesn't assume normal distribution
            
            # Test for PEHE
            if len(balu_pehe) > 0 and len(method_pehe) > 0:
                _, pehe_pval = stats.mannwhitneyu(balu_pehe, method_pehe, 
                                                alternative='two-sided')
                pehe_pvalues.loc[dataset, method] = pehe_pval
            
            # Test for MAE
            if len(balu_mae) > 0 and len(method_mae) > 0:
                _, mae_pval = stats.mannwhitneyu(balu_mae, method_mae, 
                                               alternative='two-sided')
                mae_pvalues.loc[dataset, method] = mae_pval
    
    return pehe_pvalues, mae_pvalues

def add_significance_stars(pvalue_df):
    """Add significance stars to p-values in scientific notation"""
    starred_df = pvalue_df.copy()
    
    for col in starred_df.columns:
        for idx in starred_df.index:
            pval = pd.to_numeric(starred_df.loc[idx, col], errors='coerce')
            if pd.notna(pval):
                if pval < 0.001:
                    starred_df.loc[idx, col] = f"{pval:.2e}***"
                elif pval < 0.01:
                    starred_df.loc[idx, col] = f"{pval:.2e}**"
                elif pval < 0.05:
                    starred_df.loc[idx, col] = f"{pval:.2e}*"
                else:
                    starred_df.loc[idx, col] = f"{pval:.2e}"
    
    return starred_df


def format_pvalue_table(pvalue_df, metric_name):
    formatted_df = pvalue_df.copy()
    
    # Convert to numeric and format in scientific notation
    for col in formatted_df.columns:
        formatted_df[col] = pd.to_numeric(formatted_df[col], errors='coerce')
        # Format in scientific notation with 2 decimal places
        formatted_df[col] = formatted_df[col].apply(
            lambda x: f"{x:.2e}" if pd.notna(x) else "NaN"
        )
    
    return formatted_df

In [29]:
import pandas as pd
import os
import json
import numpy as np


def fileter_method(must_item: list, method):
        method_name = method
        for item in must_item:
            if item not in method:
                return False, None
            if "L=" in item or 'Balu_plus' in item:
                item = item 
            else:
                item = "_"+item
            method_name = method_name.replace(item, "")
        return True, method_name

def collect_data_p_balu(p: str, datasets=[], result_dir='results', train_test_flag="test"):
    if not datasets:
        datasets = [d for d in os.listdir(result_dir) if os.path.isdir(os.path.join(result_dir, d))]
    
    for dataset in datasets:
        dataset_dir = os.path.join(result_dir, dataset)
        method_dirs = [d for d in os.listdir(dataset_dir) if os.path.isdir(os.path.join(dataset_dir, d))]
        break
    
    method_dirs = [e for e in method_dirs if 'Balu' in e]

    data_rows = []
    for method in method_dirs:
        flag, method_name = fileter_method(['Balu_plus_imp=', 'L=64-64_K=64', "gconv=GCN_rconv=GCN", "reldrop=0.0_beta=0.0001_gamma=0.0001_eta=0.0001"], method)
        if not flag:
            continue
        if "-64" in method_name:
            continue
        if "BaLu" not in method_name:
            continue 
        
        # Collect results for each dataset
        for dataset in datasets:
            dataset_dir = os.path.join(result_dir, dataset)
            
            if not os.path.exists(dataset_dir):
                print(f"{dataset_dir} not exits!")
                continue
                
            method_dir = os.path.join(dataset_dir, method)
            if not os.path.exists(method_dir):
                print(f"{method_dir} not exits!")
                continue

            imputation_dir = method_dir
            results_files = [f for f in os.listdir(imputation_dir) 
                        if f.startswith(f"p={p}_") and f.endswith(f"_{train_test_flag}_results.json")]
            # results_files=results_files[:len(results_files)*1.5//2]

            for file in results_files:
                file_path = os.path.join(method_dir, file) # imputation, file)
                try:
                    row = {
                        'Method': method,
                        # 'imputation': imputation
                    }
                    row['Dataset'] = dataset
                    with open(file_path, 'r') as f:
                        results = json.load(f)
                        if 'effect_pehe' in results:
                            row[f"PEHE"] = results['effect_pehe']
                            # effect_pehe_values.append(results['effect_pehe'])
                        if 'effect_mae' in results:
                            row['MAE'] = results['effect_mae']
                            # effect_mae_values.append(results['effect_mae'])
                except (json.JSONDecodeError, FileNotFoundError) as e:
                    print(f"Warning: Error reading file {file_path}: {e}")
                data_rows.append(row)
    
    # Create DataFrame
    df = pd.DataFrame(data_rows)
    
    # Sort the DataFrame by method and imputation
    if not df.empty:
        df = df.sort_values(by=['Method']) #, 'imputation'])
    return df


def collect_data_p_others(p: str, datasets=[], result_dir='results', train_test_flag="test"):
    if not datasets:
        # If no datasets are specified, get all available datasets from the directory
        datasets = [d for d in os.listdir(result_dir) if os.path.isdir(os.path.join(result_dir, d))]
    
    # Initialize an empty list to store data rows
    data_rows = []
    
    # First pass: collect all methods and imputations across all datasets
    all_method_imputation_pairs = []
    print(datasets)
    for dataset in datasets:
        dataset_dir = os.path.join(result_dir, dataset)
        if not os.path.exists(dataset_dir):
            print(f"Warning: Dataset directory {dataset_dir} not found. Skipping.")
            continue
            
        # Get all method directories for this dataset
        method_dirs = [d for d in os.listdir(dataset_dir) if os.path.isdir(os.path.join(dataset_dir, d))]
        
        method_dirs = [e for e in method_dirs if 'Balu' not in e]

        for method in method_dirs:
            if method == 'rml':
                continue
            method_dir = os.path.join(dataset_dir, method)
            # print(method_dir)
            # print(os.listdir(method_dir))
            for imputation in os.listdir(method_dir):
                
                if p == '0.0' and imputation != 'full':                 # if the missing percentage is 0.0, no evaluation need for other imputation methods
                    continue
                if p != '0.0' and imputation == 'full':                 # if has missing data, there is no evaluation for complete data (full)
                    continue
                
                
                if imputation not in ['no', 'full']:
                    continue
                
                method_imputation_pair = (method, imputation)
                if method_imputation_pair not in all_method_imputation_pairs:
                    all_method_imputation_pairs.append(method_imputation_pair)
    
    # Second pass: collect results for each method/imputation pair across all datasets
    for method, imputation in all_method_imputation_pairs:
        for dataset in datasets:
            
            dataset_dir = os.path.join(result_dir, dataset)
            if not os.path.exists(dataset_dir):
                print(f"{method_dir} not exits!")
                continue
                
            method_dir = os.path.join(dataset_dir, method)
            if not os.path.exists(method_dir):
                print(f"{method_dir} not exits!")
                continue
            
            imputation_dir = os.path.join(method_dir, imputation)
            
            results_files = [f for f in os.listdir(imputation_dir) 
                        if f.startswith(f"p={p}") and f.endswith(f"{train_test_flag}_results.json")]
            
            
            # Collect metrics from all matching files
            for file in results_files:
                file_path = os.path.join(method_dir, imputation, file)
                try:
                    row = {
                        'Method': method,
                        # 'imputation': imputation
                    }
                    row['Dataset'] = dataset
                    filter_th = 40
                    with open(file_path, 'r') as f:
                        results = json.load(f)
                        # if results['effect_pehe'] == np.nan or results['effect_pehe'] > filter_th: continue
                        if 'Syn_M=' in dataset and results['effect_pehe'] > 25: continue 
                        
                        # if results['effect_mae'] == np.nan or results['effect_mae'] > filter_th: continue
                        if 'effect_pehe' in results:
                            row[f"PEHE"] = results['effect_pehe']
                        if 'effect_mae' in results:
                            row['MAE'] = results['effect_mae']
                except (json.JSONDecodeError, FileNotFoundError) as e:
                    print(f"Warning: Error reading file {file_path}: {e}")
            
                data_rows.append(row)
    
    # Create DataFrame
    df = pd.DataFrame(data_rows)
    
    # Sort the DataFrame by method and imputation
    if not df.empty:
        df = df.sort_values(by=['Method'])
    
    return df


# For Complete Datasets: $p_{miss}=0.0$

In [30]:

# from util import *
import pandas as pd

datasets = ['Syn_M=None_SimRel=1_Rel=4', 'Youtube_M=20_SimRel=1_Rel=4', 'BlogCatalog1_M=20_SimRel=0_Rel=1', 'Flickr1_M=20_SimRel=0_Rel=1']     # , 'BlogCatalog1_M=20_SimRel=1_Rel=4', 'Flickr1_M=20_SimRel=1_Rel=4'
datasets = [e+"_MCAR" for e in datasets] 
alian_names = ['Instagram', "Youtube", "BlogCatalog", "Flickr"]
dataset_map = {datasets[i]: alian_names[i] for i in range(len(datasets))}

p = '0.0'

balu_dir = 'results_balu_tuning'
df_balu = collect_data_p_balu(result_dir=balu_dir, p=p, datasets=datasets, train_test_flag='test')
# df_balu = df_balu.loc[df_balu['Method'] == 'Balu_V5']
df_balu['Method'] = df_balu['Method'].apply(lambda x: "BaLu(-edge)" if "IGMC" in str(x) else "BaLu")


other_dir = 'results_Q1_MAR_MCAR'
df_others = collect_data_p_others(result_dir=other_dir,p=p, datasets=datasets, train_test_flag='test')
df_others = df_others.loc[(df_others['Method'] != 'dml')]
df_others['Method'] = df_others['Method'].apply(lambda x: "GCN-HSIC" if "GCN" in str(x) else "SAGE-HSIC" if "GraphSAGE" in str(x) else x)
df_others['Method'] = df_others['Method'].apply(lambda x: "X-Learner" if str(x)=='xl' else "CausalForest" if str(x)=='cf' else "T-Learner" if 'tl' == str(x) else "R-Learner" if 'rl' == str(x) else x)

####################################################### Merge Dataframes, Oder rows #######################################################
result_df = pd.concat([df_balu, df_others], ignore_index=True)
result_df['Dataset'] = result_df['Dataset'].apply(lambda x: dataset_map[str(x)])
Methods_order = ['BaLu', 'BaLu(-edge)', 'GCN-HSIC', 'SAGE-HSIC', 'NetDeconf', 'SPNet', 'CausalForest', 'R-Learner', 'T-Learner', 'X-Learner']
result_df['Method'] = pd.Categorical(result_df['Method'], categories=Methods_order, ordered=True)
result_df_ordered = result_df.sort_values('Method')
result_df = result_df_ordered.reset_index(drop=True)

###################################################################  significance test #######################################################################
result_df = result_df.dropna()
result_df = result_df.loc[(result_df['Method']!='BaLu(-edge)')]
pehe_pvalues, mae_pvalues = perform_statistical_tests(result_df)

# Format and display results
pehe_formatted = format_pvalue_table(pehe_pvalues, "PEHE")
mae_formatted = format_pvalue_table(mae_pvalues, "MAE")

print(pehe_formatted.to_latex(index=False))
print(mae_formatted.to_latex(index=False))

# Optional: Save results to CSV files
# pehe_pvalues.to_csv('pehe_pvalues.csv')
# mae_pvalues.to_csv('mae_pvalues.csv')

print("\nP-value tables saved as 'pehe_pvalues.csv' and 'mae_pvalues.csv'")

# Optional: Create a combined summary with significance indicators

# Create tables with significance stars
pehe_with_stars = add_significance_stars(pehe_pvalues)
mae_with_stars = add_significance_stars(mae_pvalues)

print("\nPEHE P-values with significance indicators:")
display(pehe_with_stars)

print("\nMAE P-values with significance indicators:")
display(mae_with_stars)

['Syn_M=None_SimRel=1_Rel=4_MCAR', 'Youtube_M=20_SimRel=1_Rel=4_MCAR', 'BlogCatalog1_M=20_SimRel=0_Rel=1_MCAR', 'Flickr1_M=20_SimRel=0_Rel=1_MCAR']
\begin{tabular}{llllllll}
\toprule
GCN-HSIC & SAGE-HSIC & NetDeconf & SPNet & CausalForest & R-Learner & T-Learner & X-Learner \\
\midrule
5.76e-06 & 6.61e-08 & 3.71e-04 & 5.29e-02 & 6.70e-08 & 1.17e-05 & 6.70e-08 & 6.70e-08 \\
3.14e-02 & 1.59e-04 & 5.98e-01 & 4.57e-01 & 8.57e-02 & 9.86e-02 & 8.32e-03 & 2.38e-02 \\
6.61e-08 & 6.61e-08 & 1.51e-07 & 6.62e-08 & 6.70e-08 & 1.17e-05 & 6.70e-08 & 6.70e-08 \\
6.61e-08 & 6.61e-08 & 6.64e-08 & 6.61e-08 & 6.70e-08 & 1.17e-05 & 6.70e-08 & 6.70e-08 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllllll}
\toprule
GCN-HSIC & SAGE-HSIC & NetDeconf & SPNet & CausalForest & R-Learner & T-Learner & X-Learner \\
\midrule
2.16e-07 & 4.10e-05 & 3.85e-07 & 9.57e-06 & 6.70e-08 & 1.17e-05 & 6.70e-08 & 6.70e-08 \\
1.57e-05 & 1.57e-05 & 3.43e-06 & 2.56e-05 & 6.70e-08 & 1.17e-05 & 6.70e-08 & 6.70e-08 \\
6.61e-08 & 3.

Unnamed: 0,GCN-HSIC,SAGE-HSIC,NetDeconf,SPNet,CausalForest,R-Learner,T-Learner,X-Learner
Flickr,5.76e-06***,6.61e-08***,3.71e-04***,5.29e-02,6.70e-08***,1.17e-05***,6.70e-08***,6.70e-08***
BlogCatalog,3.14e-02*,1.59e-04***,5.98e-01,4.57e-01,8.57e-02,9.86e-02,8.32e-03**,2.38e-02*
Instagram,6.61e-08***,6.61e-08***,1.51e-07***,6.62e-08***,6.70e-08***,1.17e-05***,6.70e-08***,6.70e-08***
Youtube,6.61e-08***,6.61e-08***,6.64e-08***,6.61e-08***,6.70e-08***,1.17e-05***,6.70e-08***,6.70e-08***



MAE P-values with significance indicators:


Unnamed: 0,GCN-HSIC,SAGE-HSIC,NetDeconf,SPNet,CausalForest,R-Learner,T-Learner,X-Learner
Flickr,2.16e-07***,4.10e-05***,3.85e-07***,9.57e-06***,6.70e-08***,1.17e-05***,6.70e-08***,6.70e-08***
BlogCatalog,1.57e-05***,1.57e-05***,3.43e-06***,2.56e-05***,6.70e-08***,1.17e-05***,6.70e-08***,6.70e-08***
Instagram,6.61e-08***,3.84e-07***,1.51e-07***,6.61e-08***,6.70e-08***,1.17e-05***,6.70e-08***,6.70e-08***
Youtube,6.61e-08***,1.20e-07***,6.66e-08***,3.70e-04***,6.70e-08***,1.17e-05***,6.70e-08***,6.70e-08***


# For Incomplete Datasets: $p_{miss}=0.1$

In [31]:
import pandas as pd
import os
import json
import numpy as np


def fileter_method(must_item: list, method):
        method_name = method
        for item in must_item:
            if item not in method:
                return False, None
            if "L=" in item or 'Balu_plus' in item:
                item = item 
            else:
                item = "_"+item
            method_name = method_name.replace(item, "")
        return True, method_name

def collect_data_p_balu(p: str, datasets=[], result_dir='results', train_test_flag="test"):
    if not datasets:
        datasets = [d for d in os.listdir(result_dir) if os.path.isdir(os.path.join(result_dir, d))]
    
    for dataset in datasets:
        dataset_dir = os.path.join(result_dir, dataset)
        method_dirs = [d for d in os.listdir(dataset_dir) if os.path.isdir(os.path.join(dataset_dir, d))]
        break
    
    method_dirs = [e for e in method_dirs if 'Balu' in e]

    data_rows = []
    for method in method_dirs:
        flag, method_name = fileter_method(['Balu_plus_imp=', 'L=64-64_K=64', "gconv=GCN_rconv=GCN", "reldrop=0.0_beta=0.0001_gamma=0.0001_eta=0.0001"], method)
        if not flag:
            continue
        if "-64" in method_name:
            continue
        if "BaLu" not in method_name:
            continue 
        
        # Collect results for each dataset
        for dataset in datasets:
            dataset_dir = os.path.join(result_dir, dataset)
            
            if not os.path.exists(dataset_dir):
                print(f"{dataset_dir} not exits!")
                continue
                
            method_dir = os.path.join(dataset_dir, method)
            if not os.path.exists(method_dir):
                print(f"{method_dir} not exits!")
                continue

            imputation_dir = method_dir
            results_files = [f for f in os.listdir(imputation_dir) 
                        if f.startswith(f"p={p}_") and f.endswith(f"_{train_test_flag}_results.json")]
            # results_files=results_files[:len(results_files)*1.5//2]

            for file in results_files:
                file_path = os.path.join(method_dir, file) # imputation, file)
                try:
                    row = {
                        'Method': method,
                        # 'imputation': imputation
                    }
                    row['Dataset'] = dataset
                    with open(file_path, 'r') as f:
                        results = json.load(f)
                        if 'effect_pehe' in results:
                            row[f"PEHE"] = results['effect_pehe']
                            # effect_pehe_values.append(results['effect_pehe'])
                        if 'effect_mae' in results:
                            row['MAE'] = results['effect_mae']
                            # effect_mae_values.append(results['effect_mae'])
                except (json.JSONDecodeError, FileNotFoundError) as e:
                    print(f"Warning: Error reading file {file_path}: {e}")
                data_rows.append(row)
    
    # Create DataFrame
    df = pd.DataFrame(data_rows)
    
    # Sort the DataFrame by method and imputation
    if not df.empty:
        df = df.sort_values(by=['Method']) #, 'imputation'])
    return df


def collect_data_p_others(p: str, datasets=[], result_dir='results', train_test_flag="test", best_performance_dict=None):
    if not datasets:
        # If no datasets are specified, get all available datasets from the directory
        datasets = [d for d in os.listdir(result_dir) if os.path.isdir(os.path.join(result_dir, d))]
    
    # Initialize an empty list to store data rows
    data_rows = []
    
    # First pass: collect all methods and imputations across all datasets
    all_method_imputation_pairs = []
    print(datasets)
    for dataset in datasets:
        dataset_dir = os.path.join(result_dir, dataset)
        if not os.path.exists(dataset_dir):
            print(f"Warning: Dataset directory {dataset_dir} not found. Skipping.")
            continue
            
        # Get all method directories for this dataset
        method_dirs = [d for d in os.listdir(dataset_dir) if os.path.isdir(os.path.join(dataset_dir, d))]
        
        method_dirs = [e for e in method_dirs if 'Balu' not in e]
        
        for method in method_dirs:
            if method == 'rml':
                continue
            method_dir = os.path.join(dataset_dir, method)
            
            for imputation in os.listdir(method_dir):
                
                if p == '0.0' and imputation != 'full':                 # if the missing percentage is 0.0, no evaluation need for other imputation methods
                    continue
                if p != '0.0' and imputation == 'full':                 # if has missing data, there is no evaluation for complete data (full)
                    continue
                
                if best_performance_dict[dataset][method] != imputation:
                    continue

                
                method_imputation_pair = (method, imputation)
                if method_imputation_pair not in all_method_imputation_pairs:
                    all_method_imputation_pairs.append(method_imputation_pair)
    
    # Second pass: collect results for each method/imputation pair across all datasets
    for method, imputation in all_method_imputation_pairs:
        for dataset in datasets:
            
            dataset_dir = os.path.join(result_dir, dataset)
            if not os.path.exists(dataset_dir):
                print(f"{method_dir} not exits!")
                continue
                
            method_dir = os.path.join(dataset_dir, method)
            if not os.path.exists(method_dir):
                print(f"{method_dir} not exits!")
                continue
            
            imputation_dir = os.path.join(method_dir, imputation)
            
            results_files = [f for f in os.listdir(imputation_dir) 
                        if f.startswith(f"p={p}") and f.endswith(f"{train_test_flag}_results.json")]
            
            
            # Collect metrics from all matching files
            for file in results_files:
                file_path = os.path.join(method_dir, imputation, file)
                try:
                    row = {
                        'Method': method,
                        # 'imputation': imputation
                    }
                    row['Dataset'] = dataset
                    filter_th = 40
                    with open(file_path, 'r') as f:
                        results = json.load(f)
                        # if results['effect_pehe'] == np.nan or results['effect_pehe'] > filter_th: continue
                        if 'Syn_M=' in dataset and results['effect_pehe'] > 25: continue 
                        
                        # if results['effect_mae'] == np.nan or results['effect_mae'] > filter_th: continue
                        if 'effect_pehe' in results:
                            row[f"PEHE"] = results['effect_pehe']
                        if 'effect_mae' in results:
                            row['MAE'] = results['effect_mae']
                except (json.JSONDecodeError, FileNotFoundError) as e:
                    print(f"Warning: Error reading file {file_path}: {e}")
            
                data_rows.append(row)
    
    # Create DataFrame
    df = pd.DataFrame(data_rows)
    
    # Sort the DataFrame by method and imputation
    if not df.empty:
        df = df.sort_values(by=['Method'])
    
    return df


In [32]:

# from util import *
import pandas as pd

datasets = ['Syn_M=None_SimRel=1_Rel=4', 'Youtube_M=20_SimRel=1_Rel=4', 'BlogCatalog1_M=20_SimRel=0_Rel=1', 'Flickr1_M=20_SimRel=0_Rel=1']     # , 'BlogCatalog1_M=20_SimRel=1_Rel=4', 'Flickr1_M=20_SimRel=1_Rel=4'
datasets = [e+"_MCAR" for e in datasets] 
alian_names = ['Instagram', "Youtube", "BlogCatalog", "Flickr"]
dataset_map = {datasets[i]: alian_names[i] for i in range(len(datasets))}

p = '0.1'

balu_dir = 'results_balu_tuning'
df_balu = collect_data_p_balu(result_dir=balu_dir, p=p, datasets=datasets, train_test_flag='test')
# df_balu = df_balu.loc[df_balu['Method'] == 'Balu_V5']
df_balu['Method'] = df_balu['Method'].apply(lambda x: "BaLu(-edge)" if "IGMC" in str(x) else "BaLu")


other_dir = 'results_Q1_MAR_MCAR'
best_performance_dict ={'Syn_M=None_SimRel=1_Rel=4_MCAR':{'SPNet':'gain', 'GCN_no_drop=0.1_HSIC':'gain', 'dml':'gain', 'cf':'gain', 'tl':'gain', 'GraphSAGE_no_drop=0.1_HSIC':'grape_rel=0.0', 'NetDeconf':'gain', 'rl':'gain', 'xl':'gain'},
 'Youtube_M=20_SimRel=1_Rel=4_MCAR':{'SPNet':'gain', 'GCN_no_drop=0.1_HSIC':'gain', 'dml':'gain', 'cf':'gain', 'tl':'gain', 'GraphSAGE_no_drop=0.1_HSIC':'gain', 'NetDeconf':'grape_rel=0.0', 'rl':'gain', 'xl':'gain'},
 'BlogCatalog1_M=20_SimRel=0_Rel=1_MCAR':{'SPNet':'mice', 'GCN_no_drop=0.1_HSIC':'mice', 'dml':'gain', 'cf':'grape_rel=0.0', 'tl':'mean', 'GraphSAGE_no_drop=0.1_HSIC':'grape_real', 'NetDeconf':'grape_rel=0.0', 'rl':'missforest', 'xl':'mean'},
 'Flickr1_M=20_SimRel=0_Rel=1_MCAR':{'SPNet':'grape_rel=0.0', 'GCN_no_drop=0.1_HSIC':'grape_real', 'dml':'gain', 'cf':'grape_rel=0.0', 'tl':'mice', 'GraphSAGE_no_drop=0.1_HSIC':'missforest', 'NetDeconf':'grape_rel=0.0', 'rl':'mice', 'xl':'knn'}}

df_others = collect_data_p_others(result_dir=other_dir,p=p, datasets=datasets, train_test_flag='test', best_performance_dict=best_performance_dict)
df_others = df_others.loc[(df_others['Method'] != 'dml')]
df_others['Method'] = df_others['Method'].apply(lambda x: "GCN-HSIC" if "GCN" in str(x) else "SAGE-HSIC" if "GraphSAGE" in str(x) else x)
df_others['Method'] = df_others['Method'].apply(lambda x: "X-Learner" if str(x)=='xl' else "CausalForest" if str(x)=='cf' else "T-Learner" if 'tl' == str(x) else "R-Learner" if 'rl' == str(x) else x)

####################################################### Merge Dataframes, Oder rows #######################################################
result_df = pd.concat([df_balu, df_others], ignore_index=True)
result_df['Dataset'] = result_df['Dataset'].apply(lambda x: dataset_map[str(x)])
Methods_order = ['BaLu', 'BaLu(-edge)', 'GCN-HSIC', 'SAGE-HSIC', 'NetDeconf', 'SPNet', 'CausalForest', 'R-Learner', 'T-Learner', 'X-Learner']
result_df['Method'] = pd.Categorical(result_df['Method'], categories=Methods_order, ordered=True)
result_df_ordered = result_df.sort_values('Method')
result_df = result_df_ordered.reset_index(drop=True)

###################################################################  significance test #######################################################################
result_df = result_df.dropna()
result_df = result_df.loc[(result_df['Method']!='BaLu(-edge)')]
pehe_pvalues, mae_pvalues = perform_statistical_tests(result_df)

# Format and display results
pehe_formatted = format_pvalue_table(pehe_pvalues, "PEHE")
mae_formatted = format_pvalue_table(mae_pvalues, "MAE")

print(pehe_formatted.to_latex(index=False))
print(mae_formatted.to_latex(index=False))

# Optional: Save results to CSV files
# pehe_pvalues.to_csv('pehe_pvalues.csv')
# mae_pvalues.to_csv('mae_pvalues.csv')

print("\nP-value tables saved as 'pehe_pvalues.csv' and 'mae_pvalues.csv'")

# Optional: Create a combined summary with significance indicators

# Create tables with significance stars
pehe_with_stars = add_significance_stars(pehe_pvalues)
mae_with_stars = add_significance_stars(mae_pvalues)

print("\nPEHE P-values with significance indicators:")
display(pehe_with_stars)

print("\nMAE P-values with significance indicators:")
display(mae_with_stars)

['Syn_M=None_SimRel=1_Rel=4_MCAR', 'Youtube_M=20_SimRel=1_Rel=4_MCAR', 'BlogCatalog1_M=20_SimRel=0_Rel=1_MCAR', 'Flickr1_M=20_SimRel=0_Rel=1_MCAR']
\begin{tabular}{llllllll}
\toprule
GCN-HSIC & SAGE-HSIC & NetDeconf & SPNet & CausalForest & R-Learner & T-Learner & X-Learner \\
\midrule
1.18e-04 & 3.63e-10 & 2.25e-02 & 1.26e-02 & 2.49e-07 & 1.05e-07 & 4.41e-10 & 7.22e-10 \\
2.28e-01 & 4.18e-05 & 4.19e-01 & 7.30e-01 & 1.46e-01 & 1.11e-01 & 7.79e-03 & 1.82e-02 \\
2.72e-11 & 5.60e-12 & 3.72e-10 & 6.52e-11 & 3.01e-09 & 3.01e-09 & 2.72e-11 & 4.60e-11 \\
2.72e-11 & 5.60e-12 & 3.72e-10 & 2.71e-11 & 3.01e-09 & 3.01e-09 & 2.72e-11 & 2.93e-11 \\
\bottomrule
\end{tabular}

\begin{tabular}{llllllll}
\toprule
GCN-HSIC & SAGE-HSIC & NetDeconf & SPNet & CausalForest & R-Learner & T-Learner & X-Learner \\
\midrule
4.96e-07 & 7.19e-09 & 1.61e-02 & 3.21e-05 & 3.01e-09 & 3.01e-09 & 2.72e-11 & 2.72e-11 \\
7.35e-06 & 1.08e-08 & 4.84e-04 & 1.77e-06 & 3.01e-09 & 3.01e-09 & 2.72e-11 & 2.72e-11 \\
5.04e-08 & 6.

Unnamed: 0,GCN-HSIC,SAGE-HSIC,NetDeconf,SPNet,CausalForest,R-Learner,T-Learner,X-Learner
Flickr,1.18e-04***,3.63e-10***,2.25e-02*,1.26e-02*,2.49e-07***,1.05e-07***,4.41e-10***,7.22e-10***
BlogCatalog,2.28e-01,4.18e-05***,4.19e-01,7.30e-01,1.46e-01,1.11e-01,7.79e-03**,1.82e-02*
Instagram,2.72e-11***,5.60e-12***,3.72e-10***,6.52e-11***,3.01e-09***,3.01e-09***,2.72e-11***,4.60e-11***
Youtube,2.72e-11***,5.60e-12***,3.72e-10***,2.71e-11***,3.01e-09***,3.01e-09***,2.72e-11***,2.93e-11***



MAE P-values with significance indicators:


Unnamed: 0,GCN-HSIC,SAGE-HSIC,NetDeconf,SPNet,CausalForest,R-Learner,T-Learner,X-Learner
Flickr,4.96e-07***,7.19e-09***,1.61e-02*,3.21e-05***,3.01e-09***,3.01e-09***,2.72e-11***,2.72e-11***
BlogCatalog,7.35e-06***,1.08e-08***,4.84e-04***,1.77e-06***,3.01e-09***,3.01e-09***,2.72e-11***,2.72e-11***
Instagram,5.04e-08***,6.49e-09***,9.62e-04***,5.03e-08***,3.01e-09***,3.01e-09***,2.72e-11***,2.72e-11***
Youtube,4.03e-05***,2.94e-08***,2.26e-06***,7.77e-08***,3.01e-09***,3.01e-09***,2.72e-11***,2.72e-11***
