In [61]:
import pandas as pd
import numpy as np
import os
from scipy import stats

# Read all modification files
# Every "<mod>_modif.txt" file in the BERT results directory defines one
# modification name; the GPT2 and T5 files are then looked up by that name.
PLM_RESULTS_DIR = '../../eval/PLM_results/sentiment_analysis'
LLM_RESULTS_DIR = '../../eval/results/sa'

modifications = [f.split('_modif.txt')[0] for f in os.listdir(f'{PLM_RESULTS_DIR}/BERT') if f.endswith('_modif.txt')]

bert_predictions = {}
gpt2_predictions = {}
t5_predictions = {}
model_predictions = {}

# Results sub-directory name -> dict that receives that model's predictions.
# Keys of each dict are "<mod>_modif" and "<mod>_ori".
plm_predictions = {
    'BERT': bert_predictions,
    'GPT2': gpt2_predictions,
    'T5': t5_predictions,
}


def _read_prediction_file(path):
    """Return the first column of a header-less prediction file as a list."""
    return pd.read_csv(path, header=None)[0].tolist()


# Load BERT, GPT2 and T5 predictions through one code path so that all three
# get the same modified/original length check.
for plm_dir, predictions in plm_predictions.items():
    for mod in modifications:
        for variant in ('modif', 'ori'):
            predictions[f'{mod}_{variant}'] = _read_prediction_file(
                f'{PLM_RESULTS_DIR}/{plm_dir}/{mod}_{variant}.txt'
            )
        # Report (but do not abort on) files whose two variants disagree in
        # length; the model name is printed so the offending file is identifiable.
        if len(predictions[f'{mod}_modif']) != len(predictions[f'{mod}_ori']):
            print(mod, plm_dir)

# Read the original labels and model predictions
# The gpt4o result file doubles as the source of texts and gold labels for
# every model.
labels = {}
model_names = ['gpt4o', 'claude-3-5-sonnet', 'llama']
for mod in modifications:
    labels[mod] = pd.read_csv(f'{LLM_RESULTS_DIR}/gpt4o-0shot-{mod}_100.csv')
    for model in model_names:
        key = f'{model}_{mod}'
        model_predictions[key] = pd.read_csv(f'{LLM_RESULTS_DIR}/{model}-0shot-{mod}_100.csv')
        if len(model_predictions[key]) != len(labels[mod]):
            print(mod, model)

# Create comparison files for BERT, GPT2, T5
# Create directories if they don't exist (once, not once per modification).
for plm_dir in plm_predictions:
    os.makedirs(f'tmp/{plm_dir.lower()}', exist_ok=True)

for mod in modifications:
    for plm_dir, predictions in plm_predictions.items():
        # Side-by-side view of texts, gold labels and this model's predictions.
        comparison = pd.DataFrame({
            'original_text': labels[mod]['original_text'],
            'modified_text': labels[mod]['text'],
            'original_label': labels[mod]['original_label'],
            'original_pred': predictions[f'{mod}_ori'],
            'modified_label': labels[mod]['modified_label'],
            'modified_pred': predictions[f'{mod}_modif'],
        })
        comparison.to_csv(f'tmp/{plm_dir.lower()}/{mod}_comparison.csv', index=False)
# Calculate results for all models
# `results` collects one dict per (model, modification); `negation_results`
# collects one dict per (model, negation type).
results, negation_results = [], []

def get_significance_level(pvalue):
    """Translate a p-value into the marker used in the result tables.

    Returns "**" for p < 0.01 (very significant), "*" for p < 0.05
    (significant), "." for p < 0.1 (weakly significant) and "ns" (not
    significant) for everything else, including values that compare false
    against every cutoff.
    """
    # Cutoffs are checked from strictest to loosest; the first match wins.
    cutoffs = ((0.01, "**"), (0.05, "*"), (0.1, "."))
    for upper_bound, marker in cutoffs:
        if pvalue < upper_bound:
            return marker
    return "ns"

# Process every model (BERT, GPT2, T5 and the LLMs) through one shared code
# path so that accuracy, percentage change and significance are computed
# identically for all of them.
#
# NOTE(review): the two correctness vectors are paired (same examples before
# and after modification). Mann-Whitney U treats them as independent samples,
# and taking the minimum of two p-values inflates the false-positive rate.
# The statistics are kept as they were so existing numbers do not change, but
# a paired test for binary outcomes (e.g. McNemar) may be more appropriate.


def _correct_flags(predictions, gold_labels):
    """Return a 0/1 array marking which predictions equal their gold label."""
    return np.array([1 if pred == gold else 0 for pred, gold in zip(predictions, gold_labels)])


def _paired_accuracy_stats(orig_correct, mod_correct):
    """Compare two 0/1 correctness arrays over the same examples.

    Args:
        orig_correct: 0/1 array for predictions on the original texts.
        mod_correct: 0/1 array for predictions on the modified texts.

    Returns:
        dict with the rounded accuracies, the relative change in percent
        (NaN when the original accuracy is 0, because the relative change is
        undefined there), both test p-values, their minimum, the significance
        marker and a boolean flag for p < 0.05.
    """
    orig_acc = np.round(orig_correct.mean(), decimals=3)
    mod_acc = np.round(mod_correct.mean(), decimals=3)
    if orig_acc == 0:
        # Guard against division by zero, which would otherwise put inf/NaN
        # (plus a RuntimeWarning) into the tables for small subsets.
        pct_diff = np.nan
    else:
        pct_diff = np.round(((mod_acc - orig_acc) / orig_acc) * 100, decimals=1)

    if np.array_equal(orig_correct, mod_correct):
        # Identical vectors: there is nothing to test, so report p = 1
        # instead of calling the tests on all-zero differences.
        wilcoxon_pvalue = 1.0
        mannwhitney_pvalue = 1.0
    else:
        _, wilcoxon_pvalue = stats.wilcoxon(orig_correct, mod_correct)
        _, mannwhitney_pvalue = stats.mannwhitneyu(orig_correct, mod_correct, alternative='two-sided')
    pvalue = min(wilcoxon_pvalue, mannwhitney_pvalue)

    return {
        'original_acc': orig_acc,
        'modified_acc': mod_acc,
        'pct_diff': pct_diff,
        'wilcoxon_pvalue': wilcoxon_pvalue,
        'mannwhitney_pvalue': mannwhitney_pvalue,
        'pvalue': pvalue,
        'significance': get_significance_level(pvalue),
        # Same criterion for every model; it is derived from the same
        # minimum p-value as the significance marker.
        'significant': pvalue < 0.05,
    }


def _record_results(model, mod, original_preds, modified_preds, neg_types=None):
    """Append the overall row for (model, mod) and, if given, per-type rows.

    Args:
        model: model name stored in the 'model' column.
        mod: modification name; selects the gold labels in `labels`.
        original_preds: predictions on the original texts.
        modified_preds: predictions on the modified texts.
        neg_types: optional list with one negation type per example; when
            provided, one extra row per distinct type is appended to
            `negation_results`.
    """
    orig_correct = _correct_flags(original_preds, labels[mod]['original_label'])
    mod_correct = _correct_flags(modified_preds, labels[mod]['modified_label'])

    results.append({
        'model': model,
        'modification': mod,
        **_paired_accuracy_stats(orig_correct, mod_correct),
    })

    if neg_types is None:
        return

    # dict.fromkeys keeps first-appearance order, so the row order of the
    # negation CSV is reproducible across runs (set() order is not).
    for neg_type in dict.fromkeys(neg_types):
        type_indices = [i for i, t in enumerate(neg_types) if t == neg_type]
        negation_results.append({
            'model': model,
            'modification': f'{neg_type}',
            **_paired_accuracy_stats(orig_correct[type_indices], mod_correct[type_indices]),
        })


# Process BERT, GPT2 and T5
for model, predictions in (('bert', bert_predictions), ('gpt2', gpt2_predictions), ('t5', t5_predictions)):
    for mod in modifications:
        # The prediction text files carry no type column, so the negation
        # types are taken from the gpt4o data.
        neg_types = model_predictions['gpt4o_negation']['type'].tolist() if mod == 'negation' else None
        _record_results(model, mod, predictions[f'{mod}_ori'], predictions[f'{mod}_modif'], neg_types)

# Process other models
for model in model_names:
    for mod in modifications:
        model_data = model_predictions[f'{model}_{mod}']
        neg_types = model_data['type'].tolist() if mod == 'negation' else None
        _record_results(model, mod, model_data['original_pred'], model_data['modified_pred'], neg_types)


results_df = pd.DataFrame(results)
negation_results_df = pd.DataFrame(negation_results)

results_df.to_csv('sentiment_analysis_results.csv', index=False)
negation_results_df.to_csv('sentiment_analysis_negation_results.csv', index=False)


In [62]:
import numpy as np
# Raw modification name -> short "<category>: <name>" label used in the table.
# The insertion order of this dict is also the row order of the table.
mod_mapping = {
    'temporal_bias': 'B: Tem',
    'geographical_bias': 'B: Geo',
    'length_bias': 'B: Len',
    'typo_bias': 'O: Spell',
    'capitalization': 'O: Cap',
    'punctuation': 'O: Punc',
    'derivation': 'M: Deri',
    'compound_word': 'M: Com',
    'active_to_passive': 'Sx: Voice',
    'grammatical_role': 'Sx: Gra',
    'coordinating_conjunction': 'Sx: Conj',
    'concept_replacement': 'Sm: Con',
    'negation': 'P: Neg',
    'discourse': 'P: Disc',
    'sentiment': 'P: Senti',
    'casual': 'G: Cas',
    'dialectal': 'G: Dial',
}
modification_order = list(mod_mapping.values())

# Column order of the table.
model_order = ['bert', 'gpt2', 't5', 'gpt4o', 'claude-3-5-sonnet', 'llama']

# Read the combined results and switch to the display labels.
df = pd.read_csv('sentiment_analysis_results.csv')
df['modification'] = df['modification'].map(mod_mapping)

# Modifications as rows, models as columns, both in the fixed display order;
# combinations missing from the CSV end up as NaN.
pivot_df = (
    df.pivot(index='modification', columns='model', values='pct_diff')
    .reindex(index=modification_order, columns=model_order)
)
significance_df = (
    df.pivot(index='modification', columns='model', values='significance')
    .reindex(index=modification_order, columns=model_order)
)

# Function to generate color based on value
def get_color(val, significance):
    """Format one table cell as a LaTeX ``\\cellcolor`` entry.

    Args:
        val: percentage change; NaN yields an empty cell.
        significance: marker such as '**', '*', '.' or 'ns'.

    Returns:
        '' for NaN, otherwise ``\\cellcolor{<color>!<0-30>} <value>`` where
        positive values are green with an explicit '+' sign and all other
        values are red. The value is bold for '**', '*' and '.', and the
        asterisks are appended after the bold text.
    """
    if np.isnan(val):
        return ''

    is_gain = val > 0
    shade = 'green' if is_gain else 'red'
    sign = '+' if is_gain else ''
    val_str = f'{sign}{val:.1f}'

    # Marker -> text appended after the bold value ('.' is bold only).
    for level, suffix in (('**', '**'), ('*', '*'), ('.', '')):
        if significance == level:
            val_str = f'\\textbf{{{val_str}}}{suffix}'
            break

    # Colour strength saturates at a 10% change and maps onto 0..30.
    strength = int(min(abs(val)/10, 1)*30)
    return f'\\cellcolor{{{shade}!{strength}}} {val_str}'

# Generate LaTeX table
# The table is assembled as a list of lines and joined once at the end.
# NOTE(review): the caption says "Micro F1" while the CSV columns are named
# accuracy, and the label 'tab:ner_results' is shared with the negation
# table; both are left unchanged because the paper may reference them.
output_path = 'sa_results_table.tex'

table_lines = [
    '\\begin{table}[h]',
    '\\centering',
    '\\begin{tabular}{l' + 'r'*len(pivot_df.columns) + '}',
    '\\hline',
    'Modification & ' + ' & '.join([f'\\textbf{{{col}}}' for col in pivot_df.columns]) + ' \\\\',
    '\\hline',
]

prev_category = None
for idx, row in pivot_df.iterrows():
    # The category is the prefix before ':' (e.g. 'Sx' in 'Sx: Voice').
    # Comparing only the first character would merge 'Sx' and 'Sm' into one
    # group and drop the rule between them.
    current_category = idx.split(':')[0]
    if prev_category is not None and current_category != prev_category:
        table_lines.append('\\hline')
    prev_category = current_category

    cells = ' & '.join([get_color(val, significance_df.loc[idx, col]) for col, val in row.items()])
    table_lines.append(f'\\textbf{{{idx}}} & {cells} \\\\')

table_lines += [
    '\\hline',
    '\\end{tabular}',
    '\\caption{Percentage Change in Micro F1 Score by Model and Modification Type}',
    '\\label{tab:ner_results}',
    '\\end{table}',
]
latex_table = '\n'.join(table_lines)

# Save to file
with open(output_path, 'w') as f:
    f.write(latex_table)

# Report the path that was actually written.
print(f"LaTeX table saved to {output_path}")


LaTeX table saved to ner_results_table.tex


In [63]:
import csv

In [64]:
# Load results from CSV
results_df = pd.read_csv('sentiment_analysis_results.csv')
negation_df = pd.read_csv('sentiment_analysis_negation_results.csv')

# Row and column orders of the exported tables.
# The modification names are spelled out here instead of being read from the
# notebook-global `mod_mapping`: another cell rebinds that name to the
# negation mapping, which would silently empty this table on a re-run.
modification_order = [
    'temporal_bias', 'geographical_bias', 'length_bias',
    'typo_bias', 'capitalization', 'punctuation',
    'derivation', 'compound_word',
    'active_to_passive', 'grammatical_role', 'coordinating_conjunction',
    'concept_replacement',
    'negation', 'discourse', 'sentiment',
    'casual', 'dialectal',
]
negation_order = ['verbal', 'lexical', 'double', 'approximate', 'absolute']
model_order = ['bert', 'gpt2', 't5', 'gpt4o', 'claude-3-5-sonnet', 'llama']

# Two-level columns: (model, original | modified | diff).
columns = pd.MultiIndex.from_product([model_order, ['original', 'modified', 'diff']])


def _build_wide_table(source_df, row_order):
    """Reshape long-format results into one row per modification.

    Args:
        source_df: frame with 'modification', 'model', 'original_acc',
            'modified_acc' and 'pct_diff' columns.
        row_order: modification names to use as the index, in order.

    Returns:
        DataFrame indexed by `row_order` with `columns` as columns; cells
        without a matching source row stay empty (NaN).
    """
    wide = pd.DataFrame(index=row_order, columns=columns)
    for mod in row_order:
        for model in model_order:
            row = source_df[(source_df['modification'] == mod) & (source_df['model'] == model)]
            if not row.empty:
                wide.loc[mod, (model, 'original')] = row['original_acc'].values[0]
                wide.loc[mod, (model, 'modified')] = row['modified_acc'].values[0]
                wide.loc[mod, (model, 'diff')] = row['pct_diff'].values[0]
    return wide


# Fill DataFrames for the overall and the per-negation-type results
results_df_pivot = _build_wide_table(results_df, modification_order)
negation_df_pivot = _build_wide_table(negation_df, negation_order)

# Save to CSV
results_df_pivot.to_csv('sentiment_analysis_results_df.csv')
negation_df_pivot.to_csv('negation_results_df.csv')

print("Results saved to sentiment_analysis_results_df.csv and negation_results_df.csv")


  model   modification  original_acc  modified_acc  pct_diff  wilcoxon_pvalue  \
1  bert  temporal_bias          0.92          0.89      -3.3         0.179712   

   mannwhitney_pvalue    pvalue significance  significant  
1            0.471985  0.179712           ns        False  
Results saved to sentiment_analysis_results_df.csv and negation_results_df.csv


In [68]:
import numpy as np

# Read the per-negation-type results
df = pd.read_csv('sentiment_analysis_negation_results.csv')

# Display labels in row order; the raw names in the CSV are their lower-case
# forms, so the mapping is derived from the labels.
negation_order = ['Verbal', 'Lexical', 'Double', 'Approximate', 'Absolute']
mod_mapping = {label.lower(): label for label in negation_order}

# Define model order
model_order = ['bert', 'gpt2', 't5', 'gpt4o', 'claude-3-5-sonnet', 'llama']

# Map the modification names
df['modification'] = df['modification'].map(mod_mapping)

# One pivot per value column (negation types as rows, models as columns),
# each with its rows put into the display order.
pivot_df, p_values, significance = (
    df.pivot(index='modification', columns='model', values=value_column).reindex(negation_order)
    for value_column in ('pct_diff', 'pvalue', 'significance')
)

print(pivot_df)

# Function to generate color based on value
def get_color(val, sig):
    """Render one percentage-change value as a coloured LaTeX table cell.

    Args:
        val: percentage change; NaN produces an empty string.
        sig: significance marker ('.', '*', '**' or anything else).

    Returns:
        ``\\cellcolor{green!k} +x.x`` for positive values and
        ``\\cellcolor{red!k} x.x`` otherwise, with k between 0 and 30. The
        number is wrapped in ``\\textbf`` for '.', '*' and '**', followed by
        the asterisks where applicable.
    """
    if np.isnan(val):
        return ''

    if val > 0:
        color, val_str = 'green', f'+{val:.1f}'
    else:
        color, val_str = 'red', f'{val:.1f}'

    # Text appended after the bold number; None means "leave the number plain".
    if sig == '.':
        stars = ''
    elif sig == '*':
        stars = '*'
    elif sig == '**':
        stars = '**'
    else:
        stars = None
    if stars is not None:
        val_str = f'\\textbf{{{val_str}}}{stars}'

    intensity = min(abs(val)/10, 1)  # Scale to max 10% change
    return f'\\cellcolor{{{color}!{int(intensity*30)}}} {val_str}'

# Generate LaTeX table
# The table is assembled as a list of lines and joined once at the end.
# NOTE(review): caption and label are the same as in the main results table
# ('tab:ner_results'); including both tables in one document would define
# the label twice. Left unchanged because the paper may reference it.
output_path = 'sentiment_analysis_negation_type_results_table.tex'

table_lines = [
    '\\begin{table}[h]',
    '\\centering',
    '\\begin{tabular}{l' + 'r'*len(model_order) + '}',
    '\\hline',
    'Modification & ' + ' & '.join([f'\\textbf{{{col}}}' for col in model_order]) + ' \\\\',
    '\\hline',
]

# Negation types carry no category prefix, so no rules are drawn between
# rows. The first-character grouping used for the main table would put a
# rule between every pair of rows except Approximate/Absolute.
for idx, row in pivot_df.iterrows():
    cells = ' & '.join([get_color(row[col], significance.loc[idx, col]) for col in model_order])
    table_lines.append(f'\\textbf{{{idx}}} & {cells} \\\\')

table_lines += [
    '\\hline',
    '\\end{tabular}',
    '\\caption{Percentage Change in Micro F1 Score by Model and Modification Type}',
    '\\label{tab:ner_results}',
    '\\end{table}',
]
latex_table = '\n'.join(table_lines)

# Save to file
with open(output_path, 'w') as f:
    f.write(latex_table)

print(f"LaTeX table saved to {output_path}")


model         bert  claude-3-5-sonnet  gpt2  gpt4o  llama    t5
modification                                                   
Verbal       -17.7              -11.8 -23.6  -22.2   -5.5 -33.3
Lexical       -6.7                0.0   0.0    7.2    0.0 -20.0
Double       -25.0              -36.8 -11.1  -22.2  -10.0 -30.0
Approximate   -8.7              -18.2 -13.6  -10.0  -18.2 -25.0
Absolute     -33.3                0.0 -20.0   -8.4  -14.3 -20.0
LaTeX table saved to sentiment_analysis_negation_type_results_table.tex
