In [7]:
import pandas as pd
import numpy as np
from scipy import stats

def extract_paired_data():
    """Return the hard-coded score table used for the paired t-tests.

    Returns:
        dict: ``{generation method: {model name: [eight scores]}}``.
        Every generation method lists the same six models, one of which is
        'Proposed XAI model'. ``perform_comprehensive_ttest`` in this cell
        labels positions 0-3 of each list as Test Accuracy / F1-Score /
        Precision / Recall and positions 4-7 as the External counterparts.
    """
    return {
        'Smote': {
            'GAMs':               [0.531, 0.511, 0.515, 0.513, 0.364, 0.472, 0.789, 0.364],
            'LDA':                [0.652, 0.523, 0.511, 0.652, 0.340, 0.445, 0.817, 0.340],
            'Naive Bayes':        [0.656, 0.582, 0.582, 0.656, 0.316, 0.430, 0.769, 0.316],
            'Decision Tree':      [0.663, 0.592, 0.536, 0.663, 0.329, 0.431, 0.790, 0.329],
            'TabNet':             [0.890, 0.855, 0.890, 0.851, 0.702, 0.744, 0.793, 0.702],
            'Proposed XAI model': [0.955, 0.940, 0.954, 0.927, 0.749, 0.771, 0.798, 0.749],
        },
        'Adasyn': {
            'GAMs':               [0.536, 0.533, 0.540, 0.536, 0.373, 0.480, 0.803, 0.373],
            'LDA':                [0.644, 0.520, 0.508, 0.644, 0.328, 0.446, 0.827, 0.328],
            'Naive Bayes':        [0.658, 0.587, 0.597, 0.658, 0.444, 0.556, 0.792, 0.444],
            'Decision Tree':      [0.658, 0.575, 0.521, 0.658, 0.336, 0.436, 0.800, 0.336],
            'TabNet':             [0.858, 0.817, 0.824, 0.810, 0.694, 0.734, 0.781, 0.694],
            'Proposed XAI model': [0.899, 0.855, 0.902, 0.823, 0.738, 0.760, 0.785, 0.738],
        },
        'CTGAN': {
            'LDA':                [0.683, 0.620, 0.648, 0.683, 0.541, 0.626, 0.784, 0.541],
            'Naive Bayes':        [0.693, 0.678, 0.682, 0.693, 0.571, 0.656, 0.786, 0.571],
            'GAMs':               [0.722, 0.685, 0.705, 0.722, 0.807, 0.799, 0.791, 0.807],
            'Decision Tree':      [0.803, 0.813, 0.830, 0.803, 0.668, 0.721, 0.789, 0.668],
            'TabNet':             [0.856, 0.760, 0.755, 0.767, 0.476, 0.575, 0.778, 0.476],
            'Proposed XAI model': [0.869, 0.867, 0.866, 0.869, 0.405, 0.513, 0.793, 0.405],
        },
        'CopulaGANSynthesizer': {
            'Naive Bayes':        [0.679, 0.623, 0.627, 0.679, 0.662, 0.717, 0.786, 0.662],
            'LDA':                [0.686, 0.594, 0.617, 0.686, 0.532, 0.624, 0.778, 0.532],
            'GAMs':               [0.712, 0.660, 0.710, 0.714, 0.795, 0.790, 0.847, 0.795],
            'Decision Tree':      [0.778, 0.789, 0.806, 0.778, 0.778, 0.786, 0.796, 0.778],
            'TabNet':             [0.793, 0.678, 0.699, 0.663, 0.620, 0.692, 0.794, 0.620],
            'Proposed XAI model': [0.851, 0.847, 0.845, 0.851, 0.477, 0.582, 0.803, 0.477],
        },
        'Nbsynthetic': {
            'GAMs':               [0.789, 0.772, 0.778, 0.789, 0.672, 0.730, 0.806, 0.672],
            'LDA':                [0.826, 0.801, 0.800, 0.826, 0.506, 0.600, 0.786, 0.506],
            'Decision Tree':      [0.909, 0.906, 0.918, 0.909, 0.753, 0.773, 0.798, 0.753],
            'Naive Bayes':        [0.940, 0.940, 0.942, 0.940, 0.730, 0.757, 0.787, 0.730],
            'TabNet':             [0.970, 0.962, 0.978, 0.947, 0.840, 0.813, 0.792, 0.840],
            'Proposed XAI model': [0.980, 0.979, 0.979, 0.980, 0.852, 0.814, 0.781, 0.852],
        },
    }

def _format_p_value(p_value):
    """Render a p-value as the significance label shown in the results table.

    Returns 'p<0.001', 'p<0.01' or 'p<0.05' for values below those
    thresholds, 'p=<value>' with three decimals otherwise, and 'N/A' when
    the p-value is NaN.
    """
    # A NaN p-value (e.g. when every paired difference is zero) fails all
    # '<' comparisons and would otherwise be printed as the string 'p=nan'.
    if np.isnan(p_value):
        return 'N/A'
    for threshold, label in ((0.001, 'p<0.001'), (0.01, 'p<0.01'), (0.05, 'p<0.05')):
        if p_value < threshold:
            return label
    return f'p={p_value:.3f}'


def perform_comprehensive_ttest(generation_data=None):
    """Paired t-tests of each baseline model against 'Proposed XAI model'.

    For every model other than 'Proposed XAI model' and for each of the
    eight score positions, the scores of the two models are paired across
    generation methods and compared with ``scipy.stats.ttest_rel``.

    Args:
        generation_data: optional mapping
            ``{generation method: {model name: [eight scores]}}``.
            Defaults to the table returned by ``extract_paired_data()``.

    Returns:
        pandas.DataFrame with one row per baseline model (sorted by name),
        a 'Comparison models' column, and one column per score position
        ('Test Accuracy' ... 'External Recall') holding a p-value label.
        A cell is 'N/A' when fewer than two pairs are available or when the
        test yields a NaN p-value.
    """
    if generation_data is None:
        generation_data = extract_paired_data()

    proposed = 'Proposed XAI model'
    base_metrics = ['Accuracy', 'F1-Score', 'Precision', 'Recall']
    # Each score list holds the four metrics twice: positions 0-3 are
    # labelled 'Test', positions 4-7 'External'.
    column_names = [
        f'{split} {metric}'
        for split in ('Test', 'External')
        for metric in base_metrics
    ]

    baseline_models = sorted(
        {name for models in generation_data.values() for name in models} - {proposed}
    )

    comparison_results = []
    for model in baseline_models:
        row_data = {'Comparison models': f'{model} vs. {proposed}'}

        # Only generation methods that report both models contribute a pair.
        paired = [
            (models[proposed], models[model])
            for models in generation_data.values()
            if model in models and proposed in models
        ]

        for metric_idx, column_name in enumerate(column_names):
            if len(paired) >= 2:
                proposed_scores = [ours[metric_idx] for ours, _ in paired]
                other_scores = [theirs[metric_idx] for _, theirs in paired]
                _, p_value = stats.ttest_rel(proposed_scores, other_scores)
                row_data[column_name] = _format_p_value(p_value)
            else:
                row_data[column_name] = 'N/A'

        comparison_results.append(row_data)

    return pd.DataFrame(comparison_results)

# Build the results table and print it
comparison_df = perform_comprehensive_ttest()
print(comparison_df.to_string(index=False))

                   Comparison models Test Accuracy Test F1-Score Test Precision Test Recall External Accuracy External F1-Score External Precision External Recall
Decision Tree vs. Proposed XAI model        p<0.05       p=0.061        p=0.098      p<0.05           p=0.669           p=0.652            p=0.662         p=0.669
         GAMs vs. Proposed XAI model        p<0.05        p<0.01         p<0.05      p<0.05           p=0.815           p=0.794            p=0.186         p=0.815
          LDA vs. Proposed XAI model        p<0.01        p<0.01         p<0.01     p<0.001           p=0.179           p=0.201            p=0.608         p=0.179
  Naive Bayes vs. Proposed XAI model        p<0.05        p<0.05         p<0.05      p<0.05           p=0.462           p=0.531            p=0.309         p=0.462
       TabNet vs. Proposed XAI model        p<0.05        p<0.05         p<0.05     p=0.055           p=0.580           p=0.431            p=0.365         p=0.580


In [8]:
import pandas as pd
import numpy as np
from scipy import stats

def extract_paired_data():
    """Return the hard-coded score table used for the paired t-tests.

    Returns:
        dict: ``{generation method: {model name: [eight scores]}}``.
        Every generation method lists the same six models, one of which is
        'Proposed XAI model'. ``perform_comprehensive_ttest`` in this cell
        reads only positions 0-3 of each list, labelled Accuracy, F1-Score,
        Precision and Recall.
    """
    return {
        'Smote': {
            'GAMs':               [0.531, 0.511, 0.515, 0.513, 0.364, 0.472, 0.789, 0.364],
            'LDA':                [0.652, 0.523, 0.511, 0.652, 0.340, 0.445, 0.817, 0.340],
            'Naive Bayes':        [0.656, 0.582, 0.582, 0.656, 0.316, 0.430, 0.769, 0.316],
            'Decision Tree':      [0.663, 0.592, 0.536, 0.663, 0.329, 0.431, 0.790, 0.329],
            'TabNet':             [0.890, 0.855, 0.890, 0.851, 0.702, 0.744, 0.793, 0.702],
            'Proposed XAI model': [0.955, 0.940, 0.954, 0.927, 0.749, 0.771, 0.798, 0.749],
        },
        'Adasyn': {
            'GAMs':               [0.536, 0.533, 0.540, 0.536, 0.373, 0.480, 0.803, 0.373],
            'LDA':                [0.644, 0.520, 0.508, 0.644, 0.328, 0.446, 0.827, 0.328],
            'Naive Bayes':        [0.658, 0.587, 0.597, 0.658, 0.444, 0.556, 0.792, 0.444],
            'Decision Tree':      [0.658, 0.575, 0.521, 0.658, 0.336, 0.436, 0.800, 0.336],
            'TabNet':             [0.858, 0.817, 0.824, 0.810, 0.694, 0.734, 0.781, 0.694],
            'Proposed XAI model': [0.899, 0.855, 0.902, 0.823, 0.738, 0.760, 0.785, 0.738],
        },
        'CTGAN': {
            'LDA':                [0.683, 0.620, 0.648, 0.683, 0.541, 0.626, 0.784, 0.541],
            'Naive Bayes':        [0.693, 0.678, 0.682, 0.693, 0.571, 0.656, 0.786, 0.571],
            'GAMs':               [0.722, 0.685, 0.705, 0.722, 0.807, 0.799, 0.791, 0.807],
            'Decision Tree':      [0.803, 0.813, 0.830, 0.803, 0.668, 0.721, 0.789, 0.668],
            'TabNet':             [0.856, 0.760, 0.755, 0.767, 0.476, 0.575, 0.778, 0.476],
            'Proposed XAI model': [0.869, 0.867, 0.866, 0.869, 0.405, 0.513, 0.793, 0.405],
        },
        'CopulaGANSynthesizer': {
            'Naive Bayes':        [0.679, 0.623, 0.627, 0.679, 0.662, 0.717, 0.786, 0.662],
            'LDA':                [0.686, 0.594, 0.617, 0.686, 0.532, 0.624, 0.778, 0.532],
            'GAMs':               [0.712, 0.660, 0.710, 0.714, 0.795, 0.790, 0.847, 0.795],
            'Decision Tree':      [0.778, 0.789, 0.806, 0.778, 0.778, 0.786, 0.796, 0.778],
            'TabNet':             [0.793, 0.678, 0.699, 0.663, 0.620, 0.692, 0.794, 0.620],
            'Proposed XAI model': [0.851, 0.847, 0.845, 0.851, 0.477, 0.582, 0.803, 0.477],
        },
        'Nbsynthetic': {
            'GAMs':               [0.789, 0.772, 0.778, 0.789, 0.672, 0.730, 0.806, 0.672],
            'LDA':                [0.826, 0.801, 0.800, 0.826, 0.506, 0.600, 0.786, 0.506],
            'Decision Tree':      [0.909, 0.906, 0.918, 0.909, 0.753, 0.773, 0.798, 0.753],
            'Naive Bayes':        [0.940, 0.940, 0.942, 0.940, 0.730, 0.757, 0.787, 0.730],
            'TabNet':             [0.970, 0.962, 0.978, 0.947, 0.840, 0.813, 0.792, 0.840],
            'Proposed XAI model': [0.980, 0.979, 0.979, 0.980, 0.852, 0.814, 0.781, 0.852],
        },
    }

def _format_p_value(p_value):
    """Render a p-value as the significance label shown in the results table.

    Returns 'p<0.001', 'p<0.01' or 'p<0.05' for values below those
    thresholds, 'p=<value>' with three decimals otherwise, and 'N/A' when
    the p-value is NaN.
    """
    # A NaN p-value (e.g. when every paired difference is zero) fails all
    # '<' comparisons and would otherwise be printed as the string 'p=nan'.
    if np.isnan(p_value):
        return 'N/A'
    for threshold, label in ((0.001, 'p<0.001'), (0.01, 'p<0.01'), (0.05, 'p<0.05')):
        if p_value < threshold:
            return label
    return f'p={p_value:.3f}'


def perform_comprehensive_ttest(generation_data=None):
    """Paired t-tests of each baseline model against 'Proposed XAI model'.

    For every model other than 'Proposed XAI model' and for each of the
    first four score positions, the scores of the two models are paired
    across generation methods and compared with ``scipy.stats.ttest_rel``.

    Note: this redefines the function of the same name from the previous
    notebook cell, which reports eight 'Test'/'External' columns.

    Args:
        generation_data: optional mapping
            ``{generation method: {model name: [scores]}}`` where each score
            list has at least four entries. Defaults to the table returned
            by ``extract_paired_data()``.

    Returns:
        pandas.DataFrame with one row per baseline model (sorted by name),
        a 'Comparison models' column, and the columns 'Accuracy',
        'F1-Score', 'Precision' and 'Recall' holding a p-value label.
        A cell is 'N/A' when fewer than two pairs are available or when the
        test yields a NaN p-value.
    """
    if generation_data is None:
        generation_data = extract_paired_data()

    proposed = 'Proposed XAI model'
    # Column i of the result is computed from position i of each score list.
    metrics = ['Accuracy', 'F1-Score', 'Precision', 'Recall']

    baseline_models = sorted(
        {name for models in generation_data.values() for name in models} - {proposed}
    )

    comparison_results = []
    for model in baseline_models:
        row_data = {'Comparison models': f'{model} vs. {proposed}'}

        # Only generation methods that report both models contribute a pair.
        paired = [
            (models[proposed], models[model])
            for models in generation_data.values()
            if model in models and proposed in models
        ]

        for metric_idx, metric in enumerate(metrics):
            if len(paired) >= 2:
                proposed_scores = [ours[metric_idx] for ours, _ in paired]
                other_scores = [theirs[metric_idx] for _, theirs in paired]
                _, p_value = stats.ttest_rel(proposed_scores, other_scores)
                row_data[metric] = _format_p_value(p_value)
            else:
                row_data[metric] = 'N/A'

        comparison_results.append(row_data)

    return pd.DataFrame(comparison_results)

# Build the results table and print it
comparison_df = perform_comprehensive_ttest()
print(comparison_df.to_string(index=False))

                   Comparison models Accuracy F1-Score Precision  Recall
Decision Tree vs. Proposed XAI model   p<0.05  p=0.061   p=0.098  p<0.05
         GAMs vs. Proposed XAI model   p<0.05   p<0.01    p<0.05  p<0.05
          LDA vs. Proposed XAI model   p<0.01   p<0.01    p<0.01 p<0.001
  Naive Bayes vs. Proposed XAI model   p<0.05   p<0.05    p<0.05  p<0.05
       TabNet vs. Proposed XAI model   p<0.05   p<0.05    p<0.05 p=0.055
