In [199]:
import json
import pandas as pd
from pathlib import Path
import decimal

# Load derivation data to get indices
derivation_path = Path('../data/modified_data/coref/derivation_100.json')
with open(derivation_path) as f:
    derivation_data = json.load(f)

# Extract indices from derivation data
derivation_indices = [item['index'] for item in derivation_data]
print(derivation_indices)
# Create mapping between modification types and their filenames
modification_mapping = {
    'derivation': 'derivation'
}

# Base path for results
base_path = Path('../pretrained_language_models/coreference_resolution/tmp')

# Create index mapping for each model and modification
index_mappings = {}
for model in ['bert-base-cased', 'gpt2', 't5-base']:
    index_mappings[model] = {}
    model_path = base_path / f'{model}_results'
    
    for mod_type, filename in modification_mapping.items():
        csv_path = model_path / f'{filename}_predictions.csv'
        if csv_path.exists():
            df = pd.read_csv(csv_path)
            # Replace indices with derivation indices
            df['index'] = derivation_indices[:len(df)]
            # Create new file with replaced indices
            output_path = model_path / f'{filename}_predictions.csv'
            df.to_csv(output_path, index=False)


[1203, 1292, 1393, 85, 185, 837, 783, 830, 356, 518, 315, 1492, 849, 232, 734, 671, 1486, 1478, 894, 601, 312, 1154, 211, 1225, 438, 817, 1451, 117, 89, 189, 37, 828, 965, 405, 794, 1386, 1339, 989, 1300, 1104, 1400, 27, 55, 534, 1381, 724, 547, 884, 285, 130, 124, 747, 1449, 1012, 114, 1219, 1056, 207, 274, 679, 845, 1433, 2, 553, 131, 1353, 1084, 684, 121, 436, 1232, 821, 1448, 120, 1141, 1191, 1312, 159, 77, 155, 87, 963, 376, 796, 198, 1187, 1308, 887, 902, 602, 964, 659, 871, 800, 1223, 772, 323, 1056, 1066, 1012]


In [369]:
import json
import os
import pandas as pd
from pathlib import Path
from scipy import stats

# Base path for results
base_path = Path('../pretrained_language_models/dialogue_contradiction_detection/tmp')

# Function to load and convert predictions to binary
def load_predictions(filepath):
    with open(filepath) as f:
        preds = json.load(f)
    for pred in preds.items():
        if pred[1] == "contradictory":
            preds[pred[0]] = 1
        elif pred[1] == "not contradictory":
            preds[pred[0]] = 0
    return preds

# Load original model predictions
model_orig_preds = {}
model_names = ['bert-base-cased', 'gpt2', 't5-base']

for model in model_names:
    filepath = base_path / f'{model}_results/{model}_predictions.json'
    model_orig_preds[model] = load_predictions(filepath)

# Load predictions for each modification
modifications = []
for model in model_names:
    mod_path = base_path / f'{model}_results'
    if mod_path.exists():
        # Get all CSV files containing predictions
        modifications.extend([f.stem for f in mod_path.glob('*_predictions.csv')])
modifications = list(set(modifications))  # Remove duplicates
modifications = [mod.replace('_predictions', '') for mod in modifications]

# Load negation types from GPT4 results
gpt4_negation_df = pd.read_csv('../eval/results/dialogue/gpt4o-0shot-negation_100.csv')
negation_types = gpt4_negation_df['type'].tolist()

# Sanity check negation types
valid_types = {'absolute', 'double', 'lexical', 'approximate', 'verbal'}
for neg_type in negation_types:
    if neg_type not in valid_types:
        print(f"WARNING: Invalid negation type found: {neg_type}")

# Create results list to store accuracy and statistical test results
results_rows = []
negation_results_rows = []

for mod in modifications:
    for model in model_names:
        # Get original predictions
        orig_preds = model_orig_preds[model]
        
        # Get modified predictions from CSV file
        mod_filepath = base_path / f'{model}_results/{mod}_predictions.csv'
        if mod_filepath.exists():
            mod_df = pd.read_csv(mod_filepath)
            # Calculate accuracies
            orig_correct = 0
            mod_correct = 0
            total = 0
            orig_list = []
            mod_list = []
            
            # Track results by negation type if this is negation mod
            if mod == 'negation':
                results_by_type = {
                    'absolute': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []},
                    'double': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []},
                    'lexical': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []},
                    'approximate': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []},
                    'verbal': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []}
                }
            
            # Load labels from eval/results/dialogue CSV
            eval_filepath = Path(f'../data/modified_data/dialogue/{mod}_100.json')
            if eval_filepath.exists():
                label_df = json.load(open(eval_filepath))
            if len(mod_df) != len(label_df):
                print(mod, model, len(mod_df), len(label_df))
                
            for idx, row in mod_df.iterrows():
                # Get ground truth labels from JSON in order
                orig_label = label_df[idx][1]['original_label'] if 'original_label' in label_df[idx][1] else label_df[idx][1]['label']
                mod_label = label_df[idx][1]['modified_label'] if 'modified_label' in label_df[idx][1] else label_df[idx][1]['label']
                
                # Get predictions from CSV
                orig_pred = row['original'] if 'original' in row else row['original_pred']
                mod_pred = row['modified'] if 'modified' in row else row['modified_pred']
                
                # Compare predictions with ground truth
                orig_correct_bool = orig_pred == orig_label
                mod_correct_bool = mod_pred == mod_label
                
                if orig_correct_bool:
                    orig_correct += 1
                    if mod == 'negation':
                        if idx >= len(negation_types):
                            print(f"WARNING: Index {idx} exceeds negation types list length")
                            continue
                        results_by_type[negation_types[idx]]['orig_correct'] += 1
                if mod_correct_bool:
                    mod_correct += 1
                    if mod == 'negation':
                        if idx >= len(negation_types):
                            continue
                        results_by_type[negation_types[idx]]['mod_correct'] += 1
                        
                if mod == 'negation':
                    if idx >= len(negation_types):
                        continue
                    results_by_type[negation_types[idx]]['total'] += 1
                    results_by_type[negation_types[idx]]['orig_binary'].append(1 if orig_correct_bool else 0)
                    results_by_type[negation_types[idx]]['mod_binary'].append(1 if mod_correct_bool else 0)
                    
                orig_list.append(orig_pred)
                mod_list.append(mod_pred)
                total += 1
                
            orig_acc = orig_correct / total if total > 0 else 0
            mod_acc = mod_correct / total if total > 0 else 0
            pct_diff = ((mod_acc - orig_acc) / orig_acc) * 100 if orig_acc > 0 else 0
            
            orig_list_binary = [1 if pred == label else 0 for pred, label in zip(orig_list, [label_df[i][1]['original_label'] if 'original_label' in label_df[i][1] else label_df[i][1]['label'] for i in range(len(orig_list))])]
            mod_list_binary = [1 if pred == label else 0 for pred, label in zip(mod_list, [label_df[i][1]['modified_label'] if 'modified_label' in label_df[i][1] else label_df[i][1]['label'] for i in range(len(mod_list))])]

            # Perform paired t-test on the raw predictions (0/1)
            try:
                _, p_value_mw = stats.mannwhitneyu(orig_list_binary, mod_list_binary)
                _, p_value_w = stats.wilcoxon(orig_list_binary, mod_list_binary)
                p_value = min(p_value_mw, p_value_w)  # Use the more conservative p-value
            except ValueError:
                # If all elements are identical, set p-value to 1.0 since there is no difference
                p_value = 1.0
                
            # Add significance level
            if p_value < 0.01:
                significance = '**'
            elif p_value < 0.05:
                significance = '*'
            elif p_value < 0.1:
                significance = '.'
            else:
                significance = 'ns'
            
            row = {
                'model': model,
                'modification': mod,
                'original_acc': decimal.Decimal(orig_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'modified_acc': decimal.Decimal(mod_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'percentage_diff': decimal.Decimal(pct_diff).quantize(decimal.Decimal('0.1'), rounding=decimal.ROUND_HALF_UP),
                'p_value': decimal.Decimal(p_value).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'significance': significance,
                'significant': 'Yes' if p_value < 0.05 else 'No'
            }
            results_rows.append(row)
            
            # Add rows for each negation type if this is negation mod
            if mod == 'negation':
                for neg_type, results in results_by_type.items():
                    type_orig_acc = results['orig_correct'] / results['total'] if results['total'] > 0 else 0
                    type_mod_acc = results['mod_correct'] / results['total'] if results['total'] > 0 else 0
                    type_pct_diff = ((type_mod_acc - type_orig_acc) / type_orig_acc) * 100 if type_orig_acc > 0 else 0

                    try:
                        _, p_value_mw = stats.mannwhitneyu(results['orig_binary'], results['mod_binary'])
                        _, p_value_w = stats.wilcoxon(results['orig_binary'], results['mod_binary'])
                        p_value = min(p_value_mw, p_value_w)  # Use the more conservative p-value
                    except ValueError:
                        # If all elements are identical, set p-value to 1.0 since there is no difference
                        p_value = 1.0
                        
                    # Add significance level
                    if p_value < 0.01:
                        significance = '**'
                    elif p_value < 0.05:
                        significance = '*'
                    elif p_value < 0.1:
                        significance = '.'
                    else:
                        significance = 'ns'

                    type_row = {
                        'model': model,
                        'modification': neg_type,
                        'original_acc': decimal.Decimal(type_orig_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'modified_acc': decimal.Decimal(type_mod_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'pct_diff': decimal.Decimal(type_pct_diff).quantize(decimal.Decimal('0.1'), rounding=decimal.ROUND_HALF_UP),
                        'wilcoxon_pvalue': decimal.Decimal(p_value_w).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'mannwhitney_pvalue': decimal.Decimal(p_value_mw).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'pvalue': decimal.Decimal(p_value).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'significance': significance,
                        'significant': p_value < 0.05
                    }
                    negation_results_rows.append(type_row)

# Convert to dataframes
results_df = pd.DataFrame(results_rows)
negation_results_df = pd.DataFrame(negation_results_rows)

# Save results
results_df.to_csv(base_path / 'dialogue_plm_results.csv', index=False)
negation_results_df.to_csv(base_path / 'dialogue_plm_negation_results.csv', index=False)

# Display results
print("Results by modification and model:")
print(results_df)


Results by modification and model:
              model              modification original_acc modified_acc  \
0   bert-base-cased             temporal_bias        0.870        0.870   
1              gpt2             temporal_bias        0.850        0.830   
2           t5-base             temporal_bias        0.810        0.760   
3   bert-base-cased         active_to_passive        0.870        0.900   
4              gpt2         active_to_passive        0.840        0.870   
5           t5-base         active_to_passive        0.850        0.820   
6   bert-base-cased                derivation        0.882        0.882   
7              gpt2                derivation        0.828        0.849   
8           t5-base                derivation        0.849        0.839   
9   bert-base-cased                  negation        0.890        0.670   
10             gpt2                  negation        0.880        0.620   
11          t5-base                  negation        0.900       

In [370]:
# Also analyze results from eval/results/dialogue/
eval_base_path = Path('../eval/results/dialogue')
eval_results_rows = []
eval_negation_results_rows = []

for mod in os.listdir(eval_base_path):
    if not mod.endswith('_100.csv'):
        continue
    model = mod.split('-0shot-')[0]
    if model == 'mixtral':
        continue
    mod = mod.split('-0shot-')[1].replace('_100.csv', '')
    # Load predictions from CSV
    eval_filepath = eval_base_path / f'{model}-0shot-{mod}_100.csv'
    if not eval_filepath.exists():
        continue
    df = pd.read_csv(eval_filepath)
    
    compare_file = Path(f'../data/modified_data/dialogue/{mod}_100.json')
    if not compare_file.exists():
        continue
    compare_df = json.load(open(compare_file))
    if len(df) != len(compare_df):
        print(f"Warning: Length mismatch for {mod} {model}")
    
    # Calculate accuracies
    orig_correct = sum(df['original_pred'] == df['original_label'])
    mod_correct = sum(df['modified_pred'] == df['modified_label'])
    total = len(df)
        
    orig_acc = orig_correct / total if total > 0 else 0
    mod_acc = mod_correct / total if total > 0 else 0
    pct_diff = ((mod_acc - orig_acc) / orig_acc) * 100 if orig_acc > 0 else 0
    
    # Convert predictions to binary (0/1) based on correctness for t-test
    orig_binary = (df['original_pred'] == df['original_label']).astype(int)
    mod_binary = (df['modified_pred'] == df['modified_label']).astype(int)
    
    # Perform paired t-test on binary correctness values
    try:
        _, p_value_mw = stats.mannwhitneyu(orig_binary, mod_binary)
        _, p_value_wilc = stats.wilcoxon(orig_binary, mod_binary)
        p_value = min(p_value_mw, p_value_wilc)  # Use most conservative p-value
    except ValueError:
        # If all elements are identical, set p-value to 1.0 since there is no difference
        p_value = 1.0
        
    # Add significance level
    if p_value < 0.01:
        significance = '**'
    elif p_value < 0.05:
        significance = '*'
    elif p_value < 0.1:
        significance = '.'
    else:
        significance = 'ns'
    
    row = {
        'model': model,
        'modification': mod,
        'original_acc': decimal.Decimal(orig_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
        'modified_acc': decimal.Decimal(mod_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
        'percentage_diff': decimal.Decimal(pct_diff).quantize(decimal.Decimal('0.1'), rounding=decimal.ROUND_HALF_UP),
        'p_value': decimal.Decimal(p_value).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
        'significance': significance,
        'significant': p_value < 0.05
    }
    eval_results_rows.append(row)
    
    # Add rows for each negation type if this is negation mod
    if mod == 'negation':
        # Verify negation types match expected values
        expected_types = {'absolute', 'double', 'lexical', 'approximate', 'verbal'}
        actual_types = set(df['type'].unique())
        if actual_types != expected_types:
            print(f"Warning: Unexpected negation types for {model}")
            print(f"Expected: {expected_types}")
            print(f"Found: {actual_types}")
            
        for neg_type in df['type'].unique():
            type_df = df[df['type'] == neg_type]
            
            type_orig_correct = sum(type_df['original_pred'] == type_df['original_label'])
            type_mod_correct = sum(type_df['modified_pred'] == type_df['modified_label'])
            type_total = len(type_df)
            
            type_orig_acc = type_orig_correct / type_total if type_total > 0 else 0
            type_mod_acc = type_mod_correct / type_total if type_total > 0 else 0
            type_pct_diff = ((type_mod_acc - type_orig_acc) / type_orig_acc) * 100 if type_orig_acc > 0 else 0
            
            # Statistical tests for this negation type
            type_orig_binary = (type_df['original_pred'] == type_df['original_label']).astype(int)
            type_mod_binary = (type_df['modified_pred'] == type_df['modified_label']).astype(int)
            
            try:
                _, type_p_value_mw = stats.mannwhitneyu(type_orig_binary, type_mod_binary)
                _, type_p_value_wilc = stats.wilcoxon(type_orig_binary, type_mod_binary)
                type_p_value = min(type_p_value_mw, type_p_value_wilc)
            except ValueError:
                type_p_value = 1.0
                
            # Add significance level
            if type_p_value < 0.01:
                significance = '**'
            elif type_p_value < 0.05:
                significance = '*'
            elif type_p_value < 0.1:
                significance = '.'
            else:
                significance = 'ns'
                
            type_row = {
                'model': model,
                'modification': neg_type,
                'original_acc': decimal.Decimal(type_orig_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'modified_acc': decimal.Decimal(type_mod_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'pct_diff': decimal.Decimal(type_pct_diff).quantize(decimal.Decimal('0.1'), rounding=decimal.ROUND_HALF_UP),
                'wilcoxon_pvalue': decimal.Decimal(type_p_value_wilc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'mannwhitney_pvalue': decimal.Decimal(type_p_value_mw).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'pvalue': decimal.Decimal(type_p_value).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'significance': significance,
                'significant': type_p_value < 0.05
            }
            eval_negation_results_rows.append(type_row)

# Convert to dataframes
eval_results_df = pd.DataFrame(eval_results_rows)
eval_negation_results_df = pd.DataFrame(eval_negation_results_rows)

# Save results
eval_results_df.to_csv(base_path / 'dialogue_llm_results.csv', index=False)
eval_negation_results_df.to_csv(base_path / 'dialogue_llm_negation_results.csv', index=False)

# Display results
print("\nEvaluation Results by modification and model:")
# print(eval_results_df)
print("\nNegation Results by type and model:")
# print(eval_negation_results_df)



Evaluation Results by modification and model:

Negation Results by type and model:


In [371]:
# Load LLM and PLM results
base_path = Path('../pretrained_language_models/dialogue_contradiction_detection/tmp')
llm_results = pd.read_csv(base_path / 'dialogue_llm_results.csv')
plm_results = pd.read_csv(base_path / 'dialogue_plm_results.csv')

# Load LLM and PLM negation results
llm_neg_results = pd.read_csv(base_path / 'dialogue_llm_negation_results.csv')
plm_neg_results = pd.read_csv(base_path / 'dialogue_plm_negation_results.csv')

# Combine the regular results
combined_results = pd.concat([llm_results, plm_results], ignore_index=True)

# Combine the negation results
combined_neg_results = pd.concat([llm_neg_results, plm_neg_results], ignore_index=True)

# Sort by model and modification
combined_results = combined_results.sort_values(['model', 'modification'])
combined_neg_results = combined_neg_results.sort_values(['model', 'modification'])

# Save combined results
combined_results.to_csv(base_path / 'dialogue_combined_results.csv', index=False)
combined_neg_results.to_csv(base_path / 'dialogue_combined_negation_results.csv', index=False)

print("\nCombined Results:")
print(combined_results)
print("\nCombined Negation Results:")
print(combined_neg_results)



Combined Results:
              model         modification  original_acc  modified_acc  \
54  bert-base-cased    active_to_passive         0.870         0.900   
90  bert-base-cased       capitalization         0.927         0.927   
99  bert-base-cased               casual         0.900         0.820   
87  bert-base-cased        compound_word         0.890         0.850   
81  bert-base-cased  concept_replacement         0.890         0.860   
..              ...                  ...           ...           ...   
62          t5-base             negation         0.900         0.680   
74          t5-base          punctuation         0.860         0.860   
65          t5-base            sentiment         0.840         0.810   
53          t5-base        temporal_bias         0.810         0.760   
86          t5-base            typo_bias         0.840         0.820   

    percentage_diff  p_value significance significant  
54              3.4    0.317           ns          No  
90  

In [372]:
import numpy as np
base_path = Path('../pretrained_language_models/dialogue_contradiction_detection/tmp')
modification_order =["B: Tem", "B: Geo", "B: Len", "O: Spell","O: Cap","O: Punc",
"M: Deri",
"M: Com",
"Sx: Voice",
"Sx: Gra",
"Sx: Conj",
"Sm: Con",
"P: Neg",
"P: Disc",
"P: Senti",
"G: Cas",
"G: Dial",]
# Read the combined results
df = pd.read_csv(base_path / 'dialogue_combined_results.csv')

# Create mapping from modification names to standardized names
mod_mapping = {
    'temporal_bias': 'B: Tem',
    'geographical_bias': 'B: Geo', 
    'length_bias': 'B: Len',
    'typo_bias': 'O: Spell',
    'capitalization': 'O: Cap',
    'punctuation': 'O: Punc',
    'derivation': 'M: Deri',
    'compound_word': 'M: Com',
    'active_to_passive': 'Sx: Voice',
    'grammatical_role': 'Sx: Gra',
    'coordinating_conjunction': 'Sx: Conj',
    'concept_replacement': 'Sm: Con',
    'negation': 'P: Neg',
    'discourse': 'P: Disc',
    'sentiment': 'P: Senti',
    'casual': 'G: Cas',
    'dialectal': 'G: Dial'
}

# Map the modification names
df['modification'] = df['modification'].map(mod_mapping)

# Define model order
model_order = ['bert-base-cased', 'gpt2', 't5-base', 'gpt4o', 'claude-3-5-sonnet', 'llama']

# Pivot the data to get modifications as rows and models as columns
pivot_df = df.pivot(index='modification', columns='model', values='percentage_diff')
p_values = df.pivot(index='modification', columns='model', values='p_value')

# Reorder rows and columns
pivot_df = pivot_df.reindex(modification_order, axis=0)
pivot_df = pivot_df.reindex(model_order, axis=1)
p_values = p_values.reindex(modification_order, axis=0)
p_values = p_values.reindex(model_order, axis=1)

# Function to generate color based on value
def get_color(val, p_val):
    if np.isnan(val):
        return ''
    elif val > 0:
        # Green gradient for positive values
        intensity = min(abs(val)/10, 1)  # Scale to max 10% change
        val_str = f'+{val:.1f}'
        if p_val < 0.01:
            val_str = f'\\textbf{{{val_str}}}**'
        elif p_val < 0.05:
            val_str = f'\\textbf{{{val_str}}}*'
        elif p_val < 0.1:
            val_str = f'\\textbf{{{val_str}}}'
        return f'\\cellcolor{{green!{int(intensity*30)}}} {val_str}'
    else:
        # Red gradient for negative values
        intensity = min(abs(val)/10, 1)  # Scale to max 10% change
        val_str = f'{val:.1f}'
        if p_val < 0.01:
            val_str = f'\\textbf{{{val_str}}}**'
        elif p_val < 0.05:
            val_str = f'\\textbf{{{val_str}}}*'
        elif p_val < 0.1:
            val_str = f'\\textbf{{{val_str}}}'
        return f'\\cellcolor{{red!{int(intensity*30)}}} {val_str}'

# Generate LaTeX table
latex_table = '\\begin{table}[h]\n\\centering\n\\begin{tabular}{l' + 'r'*len(pivot_df.columns) + '}\n'
latex_table += '\\hline\n'
latex_table += 'Modification & ' + ' & '.join([f'\\textbf{{{col}}}' for col in pivot_df.columns]) + ' \\\\\n'
latex_table += '\\hline\n'

prev_category = None
for idx, row in pivot_df.iterrows():
    current_category = idx[0]  # Get first character of modification name
    if prev_category is not None and current_category != prev_category:
        latex_table += '\\hline\n'
    prev_category = current_category
    
    latex_table += f'\\textbf{{{idx}}} & '
    latex_table += ' & '.join([get_color(val, p_values.loc[idx, col]) for col, val in row.items()]) + ' \\\\\n'

latex_table += '\\hline\n'
latex_table += '\\end{tabular}\n'
latex_table += '\\caption{Percentage Change in Micro F1 Score by Model and Modification Type}\n'
latex_table += '\\label{tab:ner_results}\n'
latex_table += '\\end{table}'

# Save to file
with open(base_path / 'dialogue_results_table.tex', 'w') as f:
    f.write(latex_table)

print("LaTeX table saved to .tex")


LaTeX table saved to .tex


In [373]:
# Load results from CSV
base_path = Path('../pretrained_language_models/dialogue_contradiction_detection/tmp')

results_df = pd.read_csv(base_path / 'dialogue_combined_results.csv')

# Create lists of unique modifications and models
modification_order = list(mod_mapping.keys())
negation_order = ['verbal', 'lexical', 'double', 'approximate', 'absolute']
model_order = ['bert-base-cased', 'gpt2', 't5-base', 'gpt4o', 'claude-3-5-sonnet', 'llama']

# Create empty DataFrame with multi-level columns
columns = pd.MultiIndex.from_product([model_order, ['original', 'modified', 'diff']])
results_df_pivot = pd.DataFrame(index=modification_order, columns=columns)



# Fill DataFrame
for mod in modification_order:
    for model in model_order:
        row = results_df[(results_df['modification'] == mod) & (results_df['model'] == model)]
        if not row.empty:
            results_df_pivot.loc[mod, (model, 'original')] = row['original_acc'].values[0]
            results_df_pivot.loc[mod, (model, 'modified')] = row['modified_acc'].values[0]
            results_df_pivot.loc[mod, (model, 'diff')] = row['percentage_diff'].values[0]
            if mod == 'temporal_bias' and model == 'bert':
                print(row)

# Save to CSV
results_df_pivot.to_csv(base_path / 'dialogue_results_df.csv')

# Save negation results to separate CSV
# Load negation results from CSV
negation_results_df = pd.read_csv(base_path / 'dialogue_combined_negation_results.csv')

# Create empty DataFrame with multi-level columns for negation results
negation_df_pivot = pd.DataFrame(index=['negation'], columns=columns)

# Fill DataFrame with negation results
for mod in negation_order:  
    for model in model_order:
        row = negation_results_df[(negation_results_df['modification'] == mod) & (negation_results_df['model'] == model)]
        if not row.empty:
            negation_df_pivot.loc[mod, (model, 'original')] = row['original_acc'].values[0]
            negation_df_pivot.loc[mod, (model, 'modified')] = row['modified_acc'].values[0] 
            negation_df_pivot.loc[mod, (model, 'diff')] = row['pct_diff'].values[0]

# Save negation results to CSV
negation_df_pivot.to_csv(base_path / 'dialogue_negation_results_df.csv')


print("Results saved to dialogue_results_df.csv and dialogue_negation_results_df.csv")


Results saved to dialogue_results_df.csv and dialogue_negation_results_df.csv


In [374]:
import json
import os
import pandas as pd
from pathlib import Path
from scipy import stats

# Base path for results
base_path = Path('../pretrained_language_models/coreference_resolution/tmp')

# Function to load and convert predictions to binary
def load_predictions(filepath):
    with open(filepath) as f:
        preds = json.load(f)
    return preds

# Load original model predictions
model_orig_preds = {}
model_names = ['bert-base-cased', 'gpt2', 't5-base']

for model in model_names:
    filepath = base_path / f'{model}_results/{model}_predictions.json'
    model_orig_preds[model] = load_predictions(filepath)

# Load predictions for each modification
modifications = []
for model in model_names:
    mod_path = base_path / f'{model}_results'
    if mod_path.exists():
        # Get all CSV files containing predictions
        modifications.extend([f.stem for f in mod_path.glob('*_predictions.csv')])
modifications = list(set(modifications))  # Remove duplicates
modifications = [mod.replace('_predictions', '') for mod in modifications]

# Load negation types from GPT4 results
gpt4_negation_df = pd.read_csv('../eval/results/coref/gpt4o-0shot-negation_100.csv')
negation_types = gpt4_negation_df['type'].tolist()

# Sanity check negation types
valid_types = {'absolute', 'double', 'lexical', 'approximate', 'verbal'}
for neg_type in negation_types:
    if neg_type not in valid_types:
        print(f"WARNING: Invalid negation type found: {neg_type}")

# Create results list to store accuracy and statistical test results
results_rows = []
negation_results_rows = []

for mod in modifications:
    for model in model_names:
        # Get original predictions
        orig_preds = model_orig_preds[model]
        
        # Get modified predictions from CSV file
        mod_filepath = base_path / f'{model}_results/{mod}_predictions.csv'
        if mod_filepath.exists():
            mod_df = pd.read_csv(mod_filepath)
            # Calculate accuracies
            orig_correct = 0
            mod_correct = 0
            total = 0
            orig_list = []
            mod_list = []
            
            # Track results by negation type if this is negation mod
            if mod == 'negation':
                results_by_type = {
                    'absolute': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []},
                    'double': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []},
                    'lexical': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []},
                    'approximate': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []},
                    'verbal': {'orig_correct': 0, 'mod_correct': 0, 'total': 0, 'orig_binary': [], 'mod_binary': []}
                }
            
            # Load labels from eval/results/dialogue CSV
            eval_filepath = Path(f'../data/modified_data/coref/{mod}_100.json')
            if eval_filepath.exists():
                label_df = json.load(open(eval_filepath))
            if len(mod_df) != len(label_df):
                print(mod, model)
                
            for idx, row in mod_df.iterrows():
                # Get ground truth labels from JSON in order
                orig_label = label_df[idx]['original_label'] if 'original_label' in label_df[idx] else label_df[idx]['label']
                mod_label = label_df[idx]['modified_label'] if 'modified_label' in label_df[idx] else label_df[idx]['label']
                
                # Get predictions from CSV
                orig_pred = row['original'] if 'original' in row else row['original_pred']
                mod_pred = row['modified'] if 'modified' in row else row['modified_pred']
                
                # Compare predictions with ground truth
                orig_correct_bool = orig_pred == orig_label
                mod_correct_bool = mod_pred == mod_label
                
                if orig_correct_bool:
                    orig_correct += 1
                    if mod == 'negation':
                        if idx >= len(negation_types):
                            print(f"WARNING: Index {idx} exceeds negation types list length")
                            continue
                        results_by_type[negation_types[idx]]['orig_correct'] += 1
                if mod_correct_bool:
                    mod_correct += 1
                    if mod == 'negation':
                        if idx >= len(negation_types):
                            continue
                        results_by_type[negation_types[idx]]['mod_correct'] += 1
                        
                if mod == 'negation':
                    if idx >= len(negation_types):
                        continue
                    results_by_type[negation_types[idx]]['total'] += 1
                    results_by_type[negation_types[idx]]['orig_binary'].append(1 if orig_correct_bool else 0)
                    results_by_type[negation_types[idx]]['mod_binary'].append(1 if mod_correct_bool else 0)
                    
                orig_list.append(orig_pred)
                mod_list.append(mod_pred)
                total += 1
                
            orig_acc = orig_correct / total if total > 0 else 0
            mod_acc = mod_correct / total if total > 0 else 0
            pct_diff = ((mod_acc - orig_acc) / orig_acc) * 100 if orig_acc > 0 else 0
            
            orig_list_binary = [1 if pred == label else 0 for pred, label in zip(orig_list, [label_df[i]['original_label'] if 'original_label' in label_df[i] else label_df[i]['label'] for i in range(len(orig_list))])]
            mod_list_binary = [1 if pred == label else 0 for pred, label in zip(mod_list, [label_df[i]['modified_label'] if 'modified_label' in label_df[i] else label_df[i]['label'] for i in range(len(mod_list))])]

            # Perform paired t-test on the raw predictions (0/1)
            try:
                _, p_value_mw = stats.mannwhitneyu(orig_list_binary, mod_list_binary)
                _, p_value_w = stats.wilcoxon(orig_list_binary, mod_list_binary)
                p_value = min(p_value_mw, p_value_w)  # Use the more conservative p-value
            except ValueError:
                # If all elements are identical, set p-value to 1.0 since there is no difference
                p_value = 1.0
                
            # Add significance level
            if p_value < 0.01:
                significance = '**'
            elif p_value < 0.05:
                significance = '*'
            elif p_value < 0.1:
                significance = '.'
            else:
                significance = 'ns'
            
            row = {
                'model': model,
                'modification': mod,
                'original_acc': decimal.Decimal(orig_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'modified_acc': decimal.Decimal(mod_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'percentage_diff': decimal.Decimal(pct_diff).quantize(decimal.Decimal('0.1'), rounding=decimal.ROUND_HALF_UP),
                'p_value': decimal.Decimal(p_value).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'significance': significance,
                'significant': 'Yes' if p_value < 0.05 else 'No'
            }
            results_rows.append(row)
            
            # Add rows for each negation type if this is negation mod
            if mod == 'negation':
                for neg_type, results in results_by_type.items():
                    type_orig_acc = results['orig_correct'] / results['total'] if results['total'] > 0 else 0
                    type_mod_acc = results['mod_correct'] / results['total'] if results['total'] > 0 else 0
                    type_pct_diff = ((type_mod_acc - type_orig_acc) / type_orig_acc) * 100 if type_orig_acc > 0 else 0

                    try:
                        _, p_value_mw = stats.mannwhitneyu(results['orig_binary'], results['mod_binary'])
                        _, p_value_w = stats.wilcoxon(results['orig_binary'], results['mod_binary'])
                        p_value = min(p_value_mw, p_value_w)  # Use the more conservative p-value
                    except ValueError:
                        # If all elements are identical, set p-value to 1.0 since there is no difference
                        p_value = 1.0
                        
                    # Add significance level
                    if p_value < 0.01:
                        significance = '**'
                    elif p_value < 0.05:
                        significance = '*'
                    elif p_value < 0.1:
                        significance = '.'
                    else:
                        significance = 'ns'

                    type_row = {
                        'model': model,
                        'modification': neg_type,
                        'original_acc': decimal.Decimal(type_orig_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'modified_acc': decimal.Decimal(type_mod_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'pct_diff': decimal.Decimal(type_pct_diff).quantize(decimal.Decimal('0.1'), rounding=decimal.ROUND_HALF_UP),
                        'wilcoxon_pvalue': decimal.Decimal(p_value_w).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'mannwhitney_pvalue': decimal.Decimal(p_value_mw).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'pvalue': decimal.Decimal(p_value).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                        'significance': significance,
                        'significant': p_value < 0.05
                    }
                    negation_results_rows.append(type_row)

# Convert to dataframes
results_df = pd.DataFrame(results_rows)
negation_results_df = pd.DataFrame(negation_results_rows)

# Save results
results_df.to_csv(base_path / 'coreference_plm_results.csv', index=False)
negation_results_df.to_csv(base_path / 'coreference_plm_negation_results.csv', index=False)

# Display results
# print("Results by modification and model:")
# print(results_df)


  temp = _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis)
  temp = _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis)
  temp = _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis)


In [375]:
# Also analyze results from eval/results/dialogue/
eval_base_path = Path('../eval/results/coref')
eval_results_rows = []
eval_negation_results_rows = []

for mod in os.listdir(eval_base_path):
    if not mod.endswith('_100.csv'):
        continue
    model = mod.split('-0shot-')[0]
    if model == 'mixtral':
        continue
    mod = mod.split('-0shot-')[1].replace('_100.csv', '')
    # Load predictions from CSV
    eval_filepath = eval_base_path / f'{model}-0shot-{mod}_100.csv'
    if not eval_filepath.exists():
        continue
    df = pd.read_csv(eval_filepath)
    
    compare_file = Path(f'../data/modified_data/coref/{mod}_100.json')
    if not compare_file.exists():
        continue
    compare_df = json.load(open(compare_file))
    if len(df) != len(compare_df):
        print(f"Warning: Length mismatch for {mod} {model}")
    
    # Calculate accuracies
    orig_correct = sum(df['original_pred'] == df['original_label'])
    mod_correct = sum(df['modified_pred'] == df['modified_label'])
    total = len(df)
        
    orig_acc = orig_correct / total if total > 0 else 0
    mod_acc = mod_correct / total if total > 0 else 0
    pct_diff = ((mod_acc - orig_acc) / orig_acc) * 100 if orig_acc > 0 else 0
    
    # Convert predictions to binary (0/1) based on correctness for t-test
    orig_binary = (df['original_pred'] == df['original_label']).astype(int)
    mod_binary = (df['modified_pred'] == df['modified_label']).astype(int)
    
    # Perform paired t-test on binary correctness values
    try:
        _, p_value_mw = stats.mannwhitneyu(orig_binary, mod_binary)
        _, p_value_wilc = stats.wilcoxon(orig_binary, mod_binary)
        p_value = min(p_value_mw, p_value_wilc)  # Use most conservative p-value
    except ValueError:
        # If all elements are identical, set p-value to 1.0 since there is no difference
        p_value = 1.0
        
    # Add significance level
    if p_value < 0.01:
        significance = '**'
    elif p_value < 0.05:
        significance = '*'
    elif p_value < 0.1:
        significance = '.'
    else:
        significance = 'ns'
    
    row = {
        'model': model,
        'modification': mod,
        'original_acc': decimal.Decimal(orig_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
        'modified_acc': decimal.Decimal(mod_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
        'percentage_diff': decimal.Decimal(pct_diff).quantize(decimal.Decimal('0.1'), rounding=decimal.ROUND_HALF_UP),
        'p_value': decimal.Decimal(p_value).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
        'significance': significance,
        'significant': p_value < 0.05
    }
    eval_results_rows.append(row)
    
    # Add rows for each negation type if this is negation mod
    if mod == 'negation':
        # Verify negation types match expected values
        expected_types = {'absolute', 'double', 'lexical', 'approximate', 'verbal'}
        actual_types = set(df['type'].unique())
        if actual_types != expected_types:
            print(f"Warning: Unexpected negation types for {model}")
            print(f"Expected: {expected_types}")
            print(f"Found: {actual_types}")
            
        for neg_type in df['type'].unique():
            type_df = df[df['type'] == neg_type]
            
            type_orig_correct = sum(type_df['original_pred'] == type_df['original_label'])
            type_mod_correct = sum(type_df['modified_pred'] == type_df['modified_label'])
            type_total = len(type_df)
            
            type_orig_acc = type_orig_correct / type_total if type_total > 0 else 0
            type_mod_acc = type_mod_correct / type_total if type_total > 0 else 0
            type_pct_diff = ((type_mod_acc - type_orig_acc) / type_orig_acc) * 100 if type_orig_acc > 0 else 0
            
            # Statistical tests for this negation type
            type_orig_binary = (type_df['original_pred'] == type_df['original_label']).astype(int)
            type_mod_binary = (type_df['modified_pred'] == type_df['modified_label']).astype(int)
            
            try:
                _, type_p_value_mw = stats.mannwhitneyu(type_orig_binary, type_mod_binary)
                _, type_p_value_wilc = stats.wilcoxon(type_orig_binary, type_mod_binary)
                type_p_value = min(type_p_value_mw, type_p_value_wilc)
            except ValueError:
                type_p_value = 1.0
                
            # Add significance level
            if type_p_value < 0.01:
                significance = '**'
            elif type_p_value < 0.05:
                significance = '*'
            elif type_p_value < 0.1:
                significance = '.'
            else:
                significance = 'ns'
                
            type_row = {
                'model': model,
                'modification': neg_type,
                'original_acc': decimal.Decimal(type_orig_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'modified_acc': decimal.Decimal(type_mod_acc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'pct_diff': decimal.Decimal(type_pct_diff).quantize(decimal.Decimal('0.1'), rounding=decimal.ROUND_HALF_UP),
                'wilcoxon_pvalue': decimal.Decimal(type_p_value_wilc).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'mannwhitney_pvalue': decimal.Decimal(type_p_value_mw).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'pvalue': decimal.Decimal(type_p_value).quantize(decimal.Decimal('0.001'), rounding=decimal.ROUND_HALF_UP),
                'significance': significance,
                'significant': type_p_value < 0.05
            }
            eval_negation_results_rows.append(type_row)

# Convert to dataframes
eval_results_df = pd.DataFrame(eval_results_rows)
eval_negation_results_df = pd.DataFrame(eval_negation_results_rows)

# Save results
eval_results_df.to_csv(base_path / 'coreference_llm_results.csv', index=False)
eval_negation_results_df.to_csv(base_path / 'coreference_llm_negation_results.csv', index=False)

# Display results
print("\nEvaluation Results by modification and model:")
# print(eval_results_df)
print("\nNegation Results by type and model:")
# print(eval_negation_results_df)


  temp = _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis)
  temp = _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis)



Evaluation Results by modification and model:

Negation Results by type and model:


  temp = _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis)


In [376]:
# Load LLM and PLM results
llm_results = pd.read_csv(base_path / 'coreference_llm_results.csv')
plm_results = pd.read_csv(base_path / 'coreference_plm_results.csv')

# Load LLM and PLM negation results
llm_neg_results = pd.read_csv(base_path / 'coreference_llm_negation_results.csv')
plm_neg_results = pd.read_csv(base_path / 'coreference_plm_negation_results.csv')

# Combine the regular results
combined_results = pd.concat([llm_results, plm_results], ignore_index=True)

# Combine the negation results
combined_neg_results = pd.concat([llm_neg_results, plm_neg_results], ignore_index=True)

# Sort by model and modification
combined_results = combined_results.sort_values(['model', 'modification'])
combined_neg_results = combined_neg_results.sort_values(['model', 'modification'])

# Save combined results
combined_results.to_csv(base_path / 'coreference_combined_results.csv', index=False)
combined_neg_results.to_csv(base_path / 'coreference_combined_negation_results.csv', index=False)

print("\nCombined Results:")
print(combined_results)
print("\nCombined Negation Results:")
print(combined_neg_results)



Combined Results:
              model         modification  original_acc  modified_acc  \
54  bert-base-cased    active_to_passive         0.495         0.474   
90  bert-base-cased       capitalization         0.535         0.485   
99  bert-base-cased               casual         0.550         0.430   
87  bert-base-cased        compound_word         0.604         0.604   
81  bert-base-cased  concept_replacement         0.440         0.420   
..              ...                  ...           ...           ...   
62          t5-base             negation         0.592         0.571   
74          t5-base          punctuation         0.586         0.596   
65          t5-base            sentiment         0.640         0.640   
53          t5-base        temporal_bias         0.600         0.580   
86          t5-base            typo_bias         0.633         0.643   

    percentage_diff  p_value significance significant  
54             -4.3    0.732           ns          No  
90  

In [377]:
import numpy as np
base_path = Path('../pretrained_language_models/coreference_resolution/tmp')
modification_order =["B: Tem", "B: Geo", "B: Len", "O: Spell","O: Cap","O: Punc",
"M: Deri",
"M: Com",
"Sx: Voice",
"Sx: Gra",
"Sx: Conj",
"Sm: Con",
"P: Neg",
"P: Disc",
"P: Senti",
"G: Cas",
"G: Dial",]
# Read the combined results
df = pd.read_csv(base_path / 'coreference_combined_results.csv')

# Create mapping from modification names to standardized names
mod_mapping = {
    'temporal_bias': 'B: Tem',
    'geographical_bias': 'B: Geo', 
    'length_bias': 'B: Len',
    'typo_bias': 'O: Spell',
    'capitalization': 'O: Cap',
    'punctuation': 'O: Punc',
    'derivation': 'M: Deri',
    'compound_word': 'M: Com',
    'active_to_passive': 'Sx: Voice',
    'grammatical_role': 'Sx: Gra',
    'coordinating_conjunction': 'Sx: Conj',
    'concept_replacement': 'Sm: Con',
    'negation': 'P: Neg',
    'discourse': 'P: Disc',
    'sentiment': 'P: Senti',
    'casual': 'G: Cas',
    'dialectal': 'G: Dial'
}

# Map the modification names
df['modification'] = df['modification'].map(mod_mapping)

# Define model order
model_order = ['bert-base-cased', 'gpt2', 't5-base', 'gpt4o', 'claude-3-5-sonnet', 'llama']

# Pivot the data to get modifications as rows and models as columns
pivot_df = df.pivot(index='modification', columns='model', values='percentage_diff')
p_values = df.pivot(index='modification', columns='model', values='p_value')

# Reorder rows and columns
pivot_df = pivot_df.reindex(modification_order, axis=0)
pivot_df = pivot_df.reindex(model_order, axis=1)
p_values = p_values.reindex(modification_order, axis=0)
p_values = p_values.reindex(model_order, axis=1)

# Function to generate color based on value
def get_color(val, p_val):
    if np.isnan(val):
        return ''
    elif val > 0:
        # Green gradient for positive values
        intensity = min(abs(val)/10, 1)  # Scale to max 10% change
        val_str = f'+{val:.1f}'
        if p_val < 0.01:
            val_str = f'\\textbf{{{val_str}}}**'
        elif p_val < 0.05:
            val_str = f'\\textbf{{{val_str}}}*'
        elif p_val < 0.1:
            val_str = f'\\textbf{{{val_str}}}'
        return f'\\cellcolor{{green!{int(intensity*30)}}} {val_str}'
    else:
        # Red gradient for negative values
        intensity = min(abs(val)/10, 1)  # Scale to max 10% change
        val_str = f'{val:.1f}'
        if p_val < 0.01:
            val_str = f'\\textbf{{{val_str}}}**'
        elif p_val < 0.05:
            val_str = f'\\textbf{{{val_str}}}*'
        elif p_val < 0.1:
            val_str = f'\\textbf{{{val_str}}}'
        return f'\\cellcolor{{red!{int(intensity*30)}}} {val_str}'

# Generate LaTeX table
latex_table = '\\begin{table}[h]\n\\centering\n\\begin{tabular}{l' + 'r'*len(pivot_df.columns) + '}\n'
latex_table += '\\hline\n'
latex_table += 'Modification & ' + ' & '.join([f'\\textbf{{{col}}}' for col in pivot_df.columns]) + ' \\\\\n'
latex_table += '\\hline\n'

prev_category = None
for idx, row in pivot_df.iterrows():
    current_category = idx[0]  # Get first character of modification name
    if prev_category is not None and current_category != prev_category:
        latex_table += '\\hline\n'
    prev_category = current_category
    
    latex_table += f'\\textbf{{{idx}}} & '
    latex_table += ' & '.join([get_color(val, p_values.loc[idx, col]) for col, val in row.items()]) + ' \\\\\n'

latex_table += '\\hline\n'
latex_table += '\\end{tabular}\n'
latex_table += '\\caption{Percentage Change in Micro F1 Score by Model and Modification Type}\n'
latex_table += '\\label{tab:ner_results}\n'
latex_table += '\\end{table}'

# Save to file
with open(base_path / 'coreference_results_table.tex', 'w') as f:
    f.write(latex_table)

print("LaTeX table saved to .tex")


LaTeX table saved to .tex


In [378]:
# Load results from CSV
base_path = Path('../pretrained_language_models/coreference_resolution/tmp')

results_df = pd.read_csv(base_path / 'coreference_combined_results.csv')

# Create lists of unique modifications and models
modification_order = list(mod_mapping.keys())
model_order = ['bert-base-cased', 'gpt2', 't5-base', 'gpt4o', 'claude-3-5-sonnet', 'llama']
negation_order = ['verbal', 'lexical', 'double', 'approximate', 'absolute']
# Create empty DataFrame with multi-level columns
columns = pd.MultiIndex.from_product([model_order, ['original', 'modified', 'diff']])
results_df_pivot = pd.DataFrame(index=modification_order, columns=columns)

# Fill DataFrame
for mod in modification_order:
    for model in model_order:
        row = results_df[(results_df['modification'] == mod) & (results_df['model'] == model)]
        if not row.empty:
            results_df_pivot.loc[mod, (model, 'original')] = row['original_acc'].values[0]
            results_df_pivot.loc[mod, (model, 'modified')] = row['modified_acc'].values[0]
            results_df_pivot.loc[mod, (model, 'diff')] = row['percentage_diff'].values[0]
            if mod == 'temporal_bias' and model == 'bert':
                print(row)

# Save to CSV
results_df_pivot.to_csv(base_path / 'coreference_results_df.csv')

# Save negation results to separate CSV
# Load negation results from CSV
negation_results_df = pd.read_csv(base_path / 'coreference_combined_negation_results.csv')

# Create empty DataFrame with multi-level columns for negation results
negation_df_pivot = pd.DataFrame(index=['negation'], columns=columns)

# Fill DataFrame with negation results
for mod in negation_order:
    for model in model_order:
        row = negation_results_df[(negation_results_df['modification'] == mod) & (negation_results_df['model'] == model)]
        if not row.empty:   
            negation_df_pivot.loc[mod, (model, 'original')] = row['original_acc'].values[0]
            negation_df_pivot.loc[mod, (model, 'modified')] = row['modified_acc'].values[0] 
            negation_df_pivot.loc[mod, (model, 'diff')] = row['pct_diff'].values[0]

# Save negation results to CSV
negation_df_pivot.to_csv(base_path / 'coreference_negation_results_df.csv')


print("Results saved to coreference_results_df.csv and coreference_negation_results_df.csv")


Results saved to coreference_results_df.csv and coreference_negation_results_df.csv


In [382]:
import numpy as np

# Read the combined results
base_path = Path('dialogue_contradiction_detection/tmp')
df = pd.read_csv(base_path / 'dialogue_combined_negation_results.csv')

# Create mapping from modification names to standardized names
negation_order= ['Verbal', 'Lexical', 'Double', 'Approximate', 'Absolute']
mod_mapping = {
    'verbal': 'Verbal',
    'lexical': 'Lexical',
    'double': 'Double',
    'approximate': 'Approximate',
    'absolute': 'Absolute',
}

# Define model order
model_order = ['bert-base-cased', 'gpt2', 't5-base', 'gpt4o', 'claude-3-5-sonnet', 'llama']

# Map the modification names
df['modification'] = df['modification'].map(mod_mapping)

# Pivot the data to get modifications as rows and models as columns
pivot_df = df.pivot(index='modification', columns='model', values='pct_diff')
p_values = df.pivot(index='modification', columns='model', values='pvalue')
significance = df.pivot(index='modification', columns='model', values='significance')

# Reorder rows according to modification_order
pivot_df = pivot_df.reindex(negation_order)
p_values = p_values.reindex(negation_order)
significance = significance.reindex(negation_order)

print(pivot_df)

# Function to generate color based on value
def get_color(val, sig):
    if np.isnan(val):
        return ''
    elif val > 0:
        # Green gradient for positive values
        intensity = min(abs(val)/10, 1)  # Scale to max 10% change
        val_str = f'+{val:.1f}'
        if sig == '.':  # Just bold for '.'
            val_str = f'\\textbf{{{val_str}}}'
        elif sig == '*':  # One asterisk
            val_str = f'\\textbf{{{val_str}}}*'
        elif sig == '**':  # Two asterisks
            val_str = f'\\textbf{{{val_str}}}**'
        return f'\\cellcolor{{green!{int(intensity*30)}}} {val_str}'
    else:
        # Red gradient for negative values
        intensity = min(abs(val)/10, 1)  # Scale to max 10% change
        val_str = f'{val:.1f}'
        if sig == '.':  # Just bold for '.'
            val_str = f'\\textbf{{{val_str}}}'
        elif sig == '*':  # One asterisk
            val_str = f'\\textbf{{{val_str}}}*'
        elif sig == '**':  # Two asterisks
            val_str = f'\\textbf{{{val_str}}}**'
        return f'\\cellcolor{{red!{int(intensity*30)}}} {val_str}'

# Generate LaTeX table
latex_table = '\\begin{table}[h]\n\\centering\n\\begin{tabular}{l' + 'r'*len(model_order) + '}\n'
latex_table += '\\hline\n'
latex_table += 'Modification & ' + ' & '.join([f'\\textbf{{{col}}}' for col in model_order]) + ' \\\\\n'
latex_table += '\\hline\n'

prev_category = None
for idx, row in pivot_df.iterrows():
    current_category = idx[0]  # Get first character of modification name
    if prev_category is not None and current_category != prev_category:
        latex_table += '\\hline\n'
    prev_category = current_category
    
    latex_table += f'\\textbf{{{idx}}} & '
    latex_table += ' & '.join([get_color(row[col], significance.loc[idx, col]) for col in model_order]) + ' \\\\\n'

latex_table += '\\hline\n'
latex_table += '\\end{tabular}\n'
latex_table += '\\caption{Percentage Change in Micro F1 Score by Model and Modification Type}\n'
latex_table += '\\label{tab:ner_results}\n'
latex_table += '\\end{table}'

# Save to file
with open('dialogue_negation_type_results_table.tex', 'w') as f:
    f.write(latex_table)

print("LaTeX table saved to ner_results_table.tex")


model         bert-base-cased  claude-3-5-sonnet  gpt2  gpt4o  llama  t5-base
modification                                                                 
Verbal                  -26.7               -6.2 -21.4    5.9  -27.8     -6.7
Lexical                 -30.0              -32.1 -34.5  -37.0  -35.7    -30.0
Double                   14.3              -36.4 -30.0  -22.2  -50.0    -22.2
Approximate             -26.9              -23.1 -25.0  -16.7  -20.0    -20.8
Absolute                -27.3              -30.8 -36.4  -33.3  -38.5    -41.7
LaTeX table saved to ner_results_table.tex


In [383]:
import numpy as np

# Read the combined results
base_path = Path('coreference_resolution/tmp')
df = pd.read_csv(base_path / 'coreference_combined_negation_results.csv')

# Create mapping from modification names to standardized names
negation_order= ['Verbal', 'Lexical', 'Double', 'Approximate', 'Absolute']
mod_mapping = {
    'verbal': 'Verbal',
    'lexical': 'Lexical',
    'double': 'Double',
    'approximate': 'Approximate',
    'absolute': 'Absolute',
}

# Define model order
model_order = ['bert-base-cased', 'gpt2', 't5-base', 'gpt4o', 'claude-3-5-sonnet', 'llama']

# Map the modification names
df['modification'] = df['modification'].map(mod_mapping)

# Pivot the data to get modifications as rows and models as columns
pivot_df = df.pivot(index='modification', columns='model', values='pct_diff')
p_values = df.pivot(index='modification', columns='model', values='pvalue')
significance = df.pivot(index='modification', columns='model', values='significance')

# Reorder rows according to modification_order
pivot_df = pivot_df.reindex(negation_order)
p_values = p_values.reindex(negation_order)
significance = significance.reindex(negation_order)

print(pivot_df)

# Function to generate color based on value
def get_color(val, sig):
    if np.isnan(val):
        return ''
    elif val > 0:
        # Green gradient for positive values
        intensity = min(abs(val)/10, 1)  # Scale to max 10% change
        val_str = f'+{val:.1f}'
        if sig == '.':  # Just bold for '.'
            val_str = f'\\textbf{{{val_str}}}'
        elif sig == '*':  # One asterisk
            val_str = f'\\textbf{{{val_str}}}*'
        elif sig == '**':  # Two asterisks
            val_str = f'\\textbf{{{val_str}}}**'
        return f'\\cellcolor{{green!{int(intensity*30)}}} {val_str}'
    else:
        # Red gradient for negative values
        intensity = min(abs(val)/10, 1)  # Scale to max 10% change
        val_str = f'{val:.1f}'
        if sig == '.':  # Just bold for '.'
            val_str = f'\\textbf{{{val_str}}}'
        elif sig == '*':  # One asterisk
            val_str = f'\\textbf{{{val_str}}}*'
        elif sig == '**':  # Two asterisks
            val_str = f'\\textbf{{{val_str}}}**'
        return f'\\cellcolor{{red!{int(intensity*30)}}} {val_str}'

# Generate LaTeX table
latex_table = '\\begin{table}[h]\n\\centering\n\\begin{tabular}{l' + 'r'*len(model_order) + '}\n'
latex_table += '\\hline\n'
latex_table += 'Modification & ' + ' & '.join([f'\\textbf{{{col}}}' for col in model_order]) + ' \\\\\n'
latex_table += '\\hline\n'

prev_category = None
for idx, row in pivot_df.iterrows():
    current_category = idx[0]  # Get first character of modification name
    if prev_category is not None and current_category != prev_category:
        latex_table += '\\hline\n'
    prev_category = current_category
    
    latex_table += f'\\textbf{{{idx}}} & '
    latex_table += ' & '.join([get_color(row[col], significance.loc[idx, col]) for col in model_order]) + ' \\\\\n'

latex_table += '\\hline\n'
latex_table += '\\end{tabular}\n'
latex_table += '\\caption{Percentage Change in Micro F1 Score by Model and Modification Type}\n'
latex_table += '\\label{tab:ner_results}\n'
latex_table += '\\end{table}'

# Save to file
with open(base_path / 'coreference_negation_type_results_table.tex', 'w') as f:
    f.write(latex_table)

print("LaTeX table saved to coreference_negation_type_results_table.tex")

model         bert-base-cased  claude-3-5-sonnet  gpt2  gpt4o  llama  t5-base
modification                                                                 
Verbal                    9.1              -29.4   0.0  -33.3  -35.3    -20.0
Lexical                  -8.3               -9.1 -15.4   -5.0  -14.3    -13.3
Double                    0.0                0.0  14.3   -9.1    0.0     28.6
Approximate             -13.3               14.3 -12.5  -16.7   -4.0     -6.2
Absolute                -16.7                0.0  40.0   14.3   40.0     40.0
LaTeX table saved to coreference_negation_type_results_table.tex
