In [142]:
import json
import os

def _collect_entities(tokens, labels):
    """Group BIO-tagged tokens into a list of ``{'text', 'value'}`` entities.

    Args:
        tokens: Sequence of token strings.
        labels: Sequence of BIO labels (``B-X``, ``I-X`` or ``O``) aligned
            with ``tokens``.

    Returns:
        A list of dicts, one per entity, where ``text`` is the space-joined
        tokens of the entity and ``value`` is the upper-cased entity type.
    """
    entities = []
    entity_type = ''
    entity_tokens = []

    for token, label in zip(tokens, labels):
        if label.startswith('B-'):
            # A new entity begins: emit the one that was open, if any.
            if entity_type:
                entities.append({'text': ' '.join(entity_tokens), 'value': entity_type.upper()})
            entity_type = label[2:]
            entity_tokens = [token]
        elif label.startswith('I-'):
            # NOTE(review): an I- tag whose type differs from the open entity
            # is skipped without closing that entity, so a later matching I-
            # tag still extends it. Kept as-is so the number of entities
            # always equals the number of B- tags.
            if entity_type == label[2:]:
                entity_tokens.append(token)
        elif label == 'O':
            if entity_type:
                entities.append({'text': ' '.join(entity_tokens), 'value': entity_type.upper()})
                entity_type = ''
                entity_tokens = []

    # Emit an entity that runs up to the end of the sentence.
    if entity_type:
        entities.append({'text': ' '.join(entity_tokens), 'value': entity_type.upper()})

    return entities


def parse_ner_predictions(input_file):
    """Parse token-level NER predictions into entity-level records.

    Args:
        input_file: Path to a JSON file holding a list of items, each with the
            keys ``text``, ``tokenized_text``, ``gold_label`` and
            ``prediction``.

    Returns:
        A list with one dict per input item containing the original ``text``
        plus the ``gold`` and ``prediction`` entity lists.

    Raises:
        ValueError: If the tokens and either label sequence of an item differ
            in length.
    """
    # JSON is read as UTF-8 explicitly rather than with the platform default.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    parsed_data = []

    for item in data:
        text = item['text']
        tokens = item['tokenized_text']
        gold_labels = item['gold_label']
        pred_labels = item['prediction']

        # Sanity check that lengths match
        if len(tokens) != len(gold_labels) or len(tokens) != len(pred_labels):
            raise ValueError(f"Mismatched lengths in {input_file}: tokens={len(tokens)}, gold={len(gold_labels)}, pred={len(pred_labels)}")

        parsed_data.append({
            'text': text,
            'gold': _collect_entities(tokens, gold_labels),
            'prediction': _collect_entities(tokens, pred_labels),
        })

    return parsed_data

def process_ner_files(model_name):
    """Parse every NER prediction file of one model and write the results.

    Reads each ``*.json`` file in ``NER_<model_name>``, converts it with
    ``parse_ner_predictions`` and writes the parsed list under the same
    filename to ``parsed_NER_<model_name>``.

    Args:
        model_name: Suffix of the input/output directory names.

    Raises:
        ValueError: If, for any item, the number of parsed gold or predicted
            entities differs from the number of ``B-`` tags in the input.
    """
    input_dir = f'NER_{model_name}'
    output_dir = f'parsed_NER_{model_name}'

    # exist_ok avoids the separate existence check before creating the folder.
    os.makedirs(output_dir, exist_ok=True)

    def count_b_tags_by_class(items, field):
        # Tally B- tags per entity class over the raw label sequences.
        tally = {}
        for item in items:
            for label in item[field]:
                if label.startswith('B-'):
                    entity_class = label[2:]
                    tally[entity_class] = tally.get(entity_class, 0) + 1
        return tally

    def count_entities_by_class(items, field):
        # Tally parsed entities per class using their 'value' field.
        tally = {}
        for item in items:
            for entity in item[field]:
                entity_class = entity['value']
                tally[entity_class] = tally.get(entity_class, 0) + 1
        return tally

    for filename in os.listdir(input_dir):
        if not filename.endswith('.json'):
            continue

        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename)

        parsed_data = parse_ner_predictions(input_path)

        # Sanity check number of entities matches between input and parsed
        with open(input_path, 'r', encoding='utf-8') as f:
            input_data = json.load(f)

        for i, (input_item, parsed_item) in enumerate(zip(input_data, parsed_data)):
            gold_count = sum(1 for label in input_item['gold_label'] if label.startswith('B-'))
            pred_count = sum(1 for label in input_item['prediction'] if label.startswith('B-'))
            if len(parsed_item['gold']) != gold_count:
                raise ValueError(f"Mismatch in gold entities for item {i} in {filename}: {len(parsed_item['gold'])} vs {gold_count}")
            if len(parsed_item['prediction']) != pred_count:
                raise ValueError(f"Mismatch in predicted entities for item {i} in {filename}: {len(parsed_item['prediction'])} vs {pred_count}")

        # Per-class report, printed for this one reference file only.
        if filename == 'bert_active_to_passive_ori.json':
            report = [
                ("Input gold entity counts by class:",
                 count_b_tags_by_class(input_data, 'gold_label')),
                ("\nInput predicted entity counts by class:",
                 count_b_tags_by_class(input_data, 'prediction')),
                ("\nParsed gold entity counts by class:",
                 count_entities_by_class(parsed_data, 'gold')),
                ("\nParsed predicted entity counts by class:",
                 count_entities_by_class(parsed_data, 'prediction')),
            ]
            for title, tally in report:
                print(title)
                for entity_class, count in tally.items():
                    print(f"{entity_class}: {count}")

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(parsed_data, f, indent=2)


In [None]:
# Parse the prediction files for BERT only; the GPT2 and T5 calls below are
# commented out and therefore do not run.
process_ner_files('BERT')
# process_ner_files('GPT2')
# process_ner_files('T5')


In [95]:
import pandas as pd
import glob
import ast
import difflib
from tqdm import tqdm

In [195]:
def _entity_tuples(raw_entities):
    """Normalise raw entity records into ``(text, label, label)`` tuples.

    Each record may be a dict or the string repr of a dict. Supported shapes
    are ``{'text': ..., 'value': ...}``, ``{'text': ..., 'class': ...}`` and a
    plain ``{text: label}`` mapping (only string labels are kept).
    """
    tuples = []
    for entity in raw_entities:
        if isinstance(entity, str):
            # literal_eval only accepts Python literals, never arbitrary code.
            entity = ast.literal_eval(entity)
        if entity.get('text') is not None and entity.get('value') is not None:
            tuples.append((entity['text'], entity['value'], entity['value']))
        elif entity.get('text') is not None and entity.get('class') is not None:
            tuples.append((entity['text'], entity['class'], entity['class']))
        else:
            # Handle dictionary format entities
            for key, value in entity.items():
                if isinstance(value, str):
                    tuples.append((key, value, value))
    return tuples


def get_example_f1_and_counts(example):
    """Compute the entity-level F1 and per-class counts of one example.

    Args:
        example: Dict with the keys ``gold`` and ``prediction``, each a list
            of entity records (see ``_entity_tuples`` for accepted shapes).

    Returns:
        A tuple ``(f1, class_counts)`` where ``class_counts`` maps each entity
        class to ``(tp, fp, fn)``. When both entity lists are empty the result
        is ``(0.0, {})``.
    """
    gold_entities = example['gold']
    pred_entities_raw = example['prediction']

    # Handle empty case
    if not gold_entities and not pred_entities_raw:
        return 0.0, {}

    true_entities = _entity_tuples(gold_entities)
    pred_entities = _entity_tuples(pred_entities_raw)

    # Calculate per-class counts
    class_counts = {}
    # Use classes from both sides so classes that are only predicted, or only
    # present in the gold data, are still reported.
    classes = set(e[2] for e in true_entities) | set(e[2] for e in pred_entities)

    for cls in classes:
        true_cls = [e for e in true_entities if e[2] == cls]
        unmatched_pred = [e for e in pred_entities if e[2] == cls]

        # One-to-one matching: every prediction can satisfy at most one gold
        # entity, so duplicated gold entities are not all credited to a single
        # prediction (which would make fp negative and F1 exceed 1).
        tp = 0
        for entity in true_cls:
            if entity in unmatched_pred:
                unmatched_pred.remove(entity)
                tp += 1

        fp = len(unmatched_pred)  # predictions left without a gold match
        fn = len(true_cls) - tp   # gold entities that were not predicted

        class_counts[cls] = (tp, fp, fn)

    # Calculate overall F1 for the example
    total_tp = sum(counts[0] for counts in class_counts.values())
    total_fp = sum(counts[1] for counts in class_counts.values())
    total_fn = sum(counts[2] for counts in class_counts.values())

    # Handle edge case where no true positives
    if total_tp == 0:
        return 0.0, class_counts

    # With at least one true positive both denominators are positive.
    precision = total_tp / (total_tp + total_fp)
    recall = total_tp / (total_tp + total_fn)

    f1 = 2 * (precision * recall) / (precision + recall)
    return f1, class_counts

def get_f1_scores_and_counts(data):
    """Score every example and accumulate per-class counts over the dataset.

    Args:
        data: List of examples accepted by ``get_example_f1_and_counts``.

    Returns:
        A tuple ``(f1_scores, class_counts)``: the per-example F1 values in
        input order, and a dict mapping each class to a ``[tp, fp, fn]`` list
        summed over all examples. Falsy input yields ``([], {})``.
    """
    if not data:
        return [], {}

    f1_scores = []
    class_counts = {}

    for example in data:
        score, per_class = get_example_f1_and_counts(example)
        f1_scores.append(score)

        # Fold this example's counts into the running totals.
        for cls, (tp, fp, fn) in per_class.items():
            totals = class_counts.setdefault(cls, [0, 0, 0])
            totals[0] += tp
            totals[1] += fp
            totals[2] += fn

    return f1_scores, class_counts

def _f1_from_counts(tp, fp, fn):
    """Return the F1 score for one (tp, fp, fn) triple, 0.0 when undefined."""
    if tp == 0:
        return 0.0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)


def calculate_micro_f1(counts):
    """Compute F1 from a single count triple or from per-class counts.

    Args:
        counts: Either one ``(tp, fp, fn)`` triple (tuple or list), or a dict
            mapping class name to such a triple.

    Returns:
        For a triple, the F1 score as a float. For a dict, a dict with
        ``'micro_f1'`` (F1 over the summed counts), ``'support'`` (total of
        ``tp + fn`` over all classes) and one ``{'f1', 'support'}`` entry per
        class.
    """
    # Lists are accepted as well because the aggregated per-class counts are
    # stored as [tp, fp, fn] lists rather than tuples.
    if isinstance(counts, (tuple, list)):
        tp, fp, fn = counts
        return _f1_from_counts(tp, fp, fn)

    # Calculate micro F1 across all classes
    total_tp = sum(triple[0] for triple in counts.values())
    total_fp = sum(triple[1] for triple in counts.values())
    total_fn = sum(triple[2] for triple in counts.values())

    class_f1s = {'micro_f1': _f1_from_counts(total_tp, total_fp, total_fn)}
    total_support = 0

    # Per-class metrics are always reported, including when no class has a
    # true positive, so that support reflects the real number of gold entities.
    for cls, (tp, fp, fn) in counts.items():
        support = tp + fn  # Support is true positives + false negatives
        total_support += support
        class_f1s[cls] = {'f1': _f1_from_counts(tp, fp, fn), 'support': support}

    class_f1s['support'] = total_support
    return class_f1s


In [None]:
import json
import numpy as np
from scipy import stats
import pandas as pd
import os
from pathlib import Path
import ast
def load_json_file(filepath):
    """Open ``filepath`` for reading and return its decoded JSON content."""
    with open(filepath, 'r') as handle:
        payload = json.load(handle)
    return payload

def _significance_label(p_value):
    """Map a p-value to the significance marker stored in the result tables."""
    if p_value < 0.01:
        return "**"
    if p_value < 0.05:
        return "*"
    if p_value < 0.1:
        return "."
    return "ns"


def _pct_change(modified, original):
    """Return the relative change in percent, or 0 when the baseline is not positive."""
    return ((modified - original) / original) * 100 if original > 0 else 0


# Get list of all modifications from filenames
modifications = set()
models = ['GPT2', 'BERT', 'T5']
for model in models:
    model_dir = f'parsed_NER_{model}'
    for filename in os.listdir(model_dir):
        if filename.startswith(f'{model.lower()}_') and filename.endswith('_ori.json'):
            mod = filename.replace(f'{model.lower()}_', '').replace('_ori.json', '')
            modifications.add(mod)
modifications = list(modifications)

# Load original and modified files for each model
ori_files = {model: {} for model in models}
modif_files = {model: {} for model in models}

for model in models:
    model_dir = f'parsed_NER_{model}'
    for modification in modifications:
        # Load original files
        ori_filepath = f'{model_dir}/{model.lower()}_{modification}_ori.json'
        ori_files[model][modification] = load_json_file(ori_filepath)

        # Load modified files
        modif_filepath = f'{model_dir}/{model.lower()}_{modification}_modif.json'
        modif_files[model][modification] = load_json_file(modif_filepath)


# Calculate and store F1 scores for each model
results = []
negation_type_results = []  # For storing negation type breakdown

for model in models:
    for modification in modifications:
        # Modifications without a reference data file are skipped entirely.
        compare_file = Path(f'../../data/modified_data/ner/{modification}_100.json')
        if not compare_file.exists():
            continue
        # load_json_file closes the file handle, unlike json.load(open(...)).
        compare_df = load_json_file(compare_file)
        if len(compare_df) != len(ori_files[model][modification]):
            print('mismatch',modification, model)
        ori_f1_scores, ori_counts = get_f1_scores_and_counts(ori_files[model][modification])
        modif_f1_scores, modif_counts = get_f1_scores_and_counts(modif_files[model][modification])
        # Calculate mean F1 scores
        ori_mean_f1 = np.mean(ori_f1_scores)
        modif_mean_f1 = np.mean(modif_f1_scores)
        # Calculate micro F1 scores
        ori_micro_f1 = calculate_micro_f1(ori_counts)
        modif_micro_f1 = calculate_micro_f1(modif_counts)
        # Percentage change relative to the original scores. A baseline of 0
        # yields 0 (same convention as the negation subtypes below) instead of
        # dividing by zero.
        mean_f1_pct_change = _pct_change(modif_mean_f1, ori_mean_f1)
        micro_f1_pct_change = _pct_change(modif_micro_f1['micro_f1'], ori_micro_f1['micro_f1'])

        # Compare the per-example F1 scores with a Wilcoxon signed-rank test
        # and a Mann-Whitney U test; the smaller p-value is reported.
        # NOTE(review): the minimum of two tests is not a corrected p-value,
        # and Mann-Whitney treats the two samples as independent - confirm
        # this is the intended procedure.
        _, p_wilcoxon = stats.wilcoxon(ori_f1_scores, modif_f1_scores)
        _, p_mannwhitney = stats.mannwhitneyu(ori_f1_scores, modif_f1_scores)
        p_value = min(p_wilcoxon, p_mannwhitney)

        # Determine significance level
        significance = _significance_label(p_value)

        results.append({
            'model': model,
            'modification': modification,
            'original_mean_f1': ori_mean_f1,
            'modified_mean_f1': modif_mean_f1,
            'mean_f1_pct_change': mean_f1_pct_change,
            'original_micro_f1': ori_micro_f1,
            'modified_micro_f1': modif_micro_f1,
            'micro_f1_pct_change': micro_f1_pct_change,
            'p_value': p_value,
            'significance': significance
        })

        # Additional analysis for negation types
        if modification == 'negation':
            # Load negation type information
            negation_file = Path('../../data/modified_data/ner/negation_100.json')
            negation_data = load_json_file(negation_file)

            # Group examples by negation type; examples are matched to their
            # subtype by position in the negation data file.
            type_results = {}
            for idx, (ori_f1, mod_f1) in enumerate(zip(ori_f1_scores, modif_f1_scores)):
                neg_type = negation_data[idx].get('subtype', 'unknown')
                if neg_type not in type_results:
                    type_results[neg_type] = {'ori_f1s': [], 'mod_f1s': []}
                type_results[neg_type]['ori_f1s'].append(ori_f1)
                type_results[neg_type]['mod_f1s'].append(mod_f1)

            # Calculate metrics for each negation type
            for neg_type, scores in type_results.items():
                ori_mean = np.mean(scores['ori_f1s'])
                mod_mean = np.mean(scores['mod_f1s'])
                pct_change = _pct_change(mod_mean, ori_mean)

                # Statistical tests
                if len(scores['ori_f1s']) > 1:  # Only if we have enough samples
                    _, p_wilcoxon = stats.wilcoxon(scores['ori_f1s'], scores['mod_f1s'])
                    _, p_mannwhitney = stats.mannwhitneyu(scores['ori_f1s'], scores['mod_f1s'])
                    p_value = min(p_wilcoxon, p_mannwhitney)
                else:
                    p_value = 1.0

                # Determine significance level for negation types
                significance = _significance_label(p_value)

                negation_type_results.append({
                    'model': model,
                    'negation_type': neg_type,
                    'original_mean_f1': ori_mean,
                    'modified_mean_f1': mod_mean,
                    'mean_f1_pct_change': pct_change,
                    'sample_size': len(scores['ori_f1s']),
                    'p_value': p_value,
                    'significance': significance
                })

# Create DataFrame and save to CSV
df = pd.DataFrame(results)
df.to_csv('ner_modification_results_plm.csv', index=False)

# Save negation type results
if negation_type_results:
    df_negation = pd.DataFrame(negation_type_results)
    df_negation.to_csv('ner_negation_type_results_plm.csv', index=False)


In [210]:
import json
import numpy as np
from scipy import stats
import pandas as pd
import os


In [202]:
def get_example_f1_and_counts_list(gold, pred):
    """Score one example given its gold and predicted entity collections.

    Args:
        gold: Gold entities as a list of dicts, a ``{text: label}`` dict, or
            the string repr of either.
        pred: Predicted entities in any of the same shapes.

    Returns:
        A tuple ``(f1, (tp, fp, fn))`` where an entity counts as a true
        positive only if both its ``text`` and ``value`` equal those of a
        not-yet-matched gold entity.
    """
    def _as_entity_dicts(entries):
        # Bring the accepted shapes to a list of {'text', 'value'} dicts.
        if isinstance(entries, dict):
            return [{'text': k, 'value': v} for k, v in entries.items()]
        if not (isinstance(entries, list) and len(entries) > 0 and isinstance(entries[0], dict)):
            return entries
        normalized = []
        for entry in entries:
            if 'text' in entry:
                normalized.append(entry)
            else:
                # A {text: label} mapping expands to one entity per key.
                normalized.extend({'text': k, 'value': v} for k, v in entry.items())
        return normalized

    # String inputs hold the repr of the actual structure.
    if isinstance(gold, str):
        gold = ast.literal_eval(gold)
    if isinstance(pred, str):
        pred = ast.literal_eval(pred)

    gold = _as_entity_dicts(gold)
    pred = _as_entity_dicts(pred)

    # Greedy one-to-one matching: each prediction claims the first gold entity
    # that is still free and identical in text and value.
    gold_free = [True] * len(gold)
    tp = 0
    for candidate in pred:
        for pos, reference in enumerate(gold):
            if not gold_free[pos]:
                continue
            if candidate['text'] == reference['text'] and candidate['value'] == reference['value']:
                gold_free[pos] = False
                tp += 1
                break

    fp = len(pred) - tp  # predictions without a gold counterpart
    fn = len(gold) - tp  # gold entities left unmatched

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return f1, (tp, fp, fn)

def calculate_micro_f1_list(counts):
    """Return the micro-averaged F1 over a collection of (tp, fp, fn) triples.

    The three components are summed over all entries first; precision, recall
    and F1 are then computed from the totals. The result is 0 when precision
    and recall are both 0.
    """
    # One pass per component, in the order tp, fp, fn.
    tp, fp, fn = (sum(entry[k] for entry in counts) for k in range(3))

    predicted = tp + fp
    actual = tp + fn
    precision = tp / predicted if predicted > 0 else 0
    recall = tp / actual if actual > 0 else 0

    if (precision + recall) > 0:
        return 2 * (precision * recall) / (precision + recall)
    return 0


In [None]:
# Read and analyze modification results from ner directory
import glob
import os
import pandas as pd
from tqdm import tqdm
# Evaluate every LLM result CSV in the NER results directory: compute
# per-example F1 on the original and modified inputs, test the difference
# for significance and collect one summary row per file.
ner_results_dir = '../../eval/results/ner/'
ner_results_files = glob.glob(os.path.join(ner_results_dir, '*.csv'))

print("\nAnalyzing results from ner directory:")
print("-" * 50)

# Create list to store results
results_data = []
negation_results_data = []

for results_file in ner_results_files:
    # Extract model and modification from filename
    filename = os.path.basename(results_file)
    print(filename)
    if 'DP' in filename or 'ner' in filename:
        continue
    # The model is the first and the modification the third '-'-separated
    # field of the filename.
    model = filename.split('-')[0]
    # if model == 'gpt4o':
    #     if 'new' not in filename:
    #         continue
    #     modification = filename.split('-')[2].replace('_100_new.csv', '')
    # else:
    modification = filename.split('-')[2].replace('_100_new.csv', '')
    print(modification)

    print(f"\nResults from {filename}:")
    print("=" * 50)

    # Read the CSV file
    df = pd.read_csv(results_file)
    compare_file = Path(f'../../data/modified_data/ner/{modification}_100.json')
    if not compare_file.exists():
        continue
    # Use a context manager so the reference file is closed after loading.
    with open(compare_file) as compare_handle:
        compare_df = json.load(compare_handle)
    if len(compare_df) != len(df):
        print('mismatch',modification, model)
    # Get original and modified labels/predictions
    ori_labels = df['original_label'].values
    ori_preds = df['original_pred'].values
    mod_labels = df['modified_label'].values
    mod_preds = df['modified_pred'].values
    # Calculate per-example F1 scores using helper functions
    ori_f1_scores = []
    modif_f1_scores = []
    print('original')
    for l, p in tqdm(zip(ori_labels, ori_preds)):
        f1, _ = get_example_f1_and_counts_list(l, p)
        ori_f1_scores.append(f1)
    print('modified')
    for l, p in tqdm(zip(mod_labels, mod_preds)):
        f1, _ = get_example_f1_and_counts_list(l, p)
        modif_f1_scores.append(f1)
    print('done calculating f1 scores')
    # Calculate mean F1 scores
    ori_mean_f1 = np.mean(ori_f1_scores)
    modif_mean_f1 = np.mean(modif_f1_scores)
    # Calculate percentage difference
    mean_pct_diff = ((modif_mean_f1 - ori_mean_f1) / ori_mean_f1) * 100
    # Compare the per-example scores with a Mann-Whitney U test and a
    # Wilcoxon signed-rank test; the smaller p-value is reported.
    # NOTE(review): the minimum of two tests is not a corrected p-value -
    # confirm this is the intended procedure.
    _, p_value_mw = stats.mannwhitneyu(ori_f1_scores, modif_f1_scores, alternative='two-sided')
    _, p_value_w = stats.wilcoxon(ori_f1_scores, modif_f1_scores)
    p_value = min(p_value_mw, p_value_w)

    # Determine significance level
    if p_value < 0.01:
        significance = "**"
    elif p_value < 0.05:
        significance = "*"
    elif p_value < 0.1:
        significance = "."
    else:
        significance = "ns"

    print(f"\n{model} - {modification.upper()} Modification:")
    print(f"Original Mean F1: {ori_mean_f1:.3f}")
    print(f"Modified Mean F1: {modif_mean_f1:.3f}")
    print(f"Mean F1 Percentage Change: {mean_pct_diff:.1f}%")
    print(f"P-value: {p_value:.4f}")
    print(f"Significance: {significance}")

    # For negation, get subtype results
    if modification == 'negation':
        for subtype in ['verbal', 'lexical', 'double', 'approximate', 'absolute']:
            subtype_df = df[df['type'] == subtype]
            if len(subtype_df) == 0:
                continue

            # Calculate F1 scores for subtype
            ori_subtype_f1 = []
            mod_subtype_f1 = []

            for l, p in zip(subtype_df['original_label'], subtype_df['original_pred']):
                f1, _ = get_example_f1_and_counts_list(l, p)
                ori_subtype_f1.append(f1)

            for l, p in zip(subtype_df['modified_label'], subtype_df['modified_pred']):
                f1, _ = get_example_f1_and_counts_list(l, p)
                mod_subtype_f1.append(f1)

            # Calculate stats
            ori_mean = np.mean(ori_subtype_f1)
            mod_mean = np.mean(mod_subtype_f1)
            pct_diff = ((mod_mean - ori_mean) / ori_mean) * 100

            # Statistical tests
            _, p_mw = stats.mannwhitneyu(ori_subtype_f1, mod_subtype_f1, alternative='two-sided')
            _, p_w = stats.wilcoxon(ori_subtype_f1, mod_subtype_f1)
            p_val = min(p_mw, p_w)

            # Determine significance; unlike the per-file ladder above, the
            # subtype ladder has an extra '***' level below 0.001.
            if p_val < 0.001:
                sig = '***'
            elif p_val < 0.01:
                sig = '**'
            elif p_val < 0.05:
                sig = '*'
            elif p_val < 0.1:
                sig = '.'
            else:
                sig = 'ns'

            # Store subtype results
            # results_data.append({
            #     'model': model,
            #     'modification': f'negation_{subtype}',
            #     'original_mean_f1': ori_mean,
            #     'modified_mean_f1': mod_mean,
            #     'mean_f1_pct_change': pct_diff,
            #     'p_value': p_val,
            #     'significance': sig
            # })

            # Also store in negation results
            negation_results_data.append({
                'model': model,
                'negation_type': subtype,
                'original_mean_f1': ori_mean,
                'modified_mean_f1': mod_mean,
                'mean_f1_pct_change': pct_diff,
                'sample_size': len(subtype_df),
                'p_value': p_val,
                'significance': sig
            })

    # Store main results
    results_data.append({
        'model': model,
        'modification': modification,
        'original_mean_f1': ori_mean_f1,
        'modified_mean_f1': modif_mean_f1,
        'mean_f1_pct_change': mean_pct_diff,
        'p_value': p_value,
        'significance': significance
    })

# Create DataFrame and save to CSV
results_df = pd.DataFrame(results_data)
results_df.to_csv('ner_modification_results_llm.csv', index=False)
print("\nResults saved to ner_modification_results_llm.csv")

# Save negation results
negation_results_df = pd.DataFrame(negation_results_data)
negation_results_df.to_csv('ner_negation_type_results_llm.csv', index=False)
print("\nNegation results saved to ner_negation_type_results_llm.csv")


In [None]:
# Merge the LLM and PLM result tables into single CSV files.
# Read both per-modification result files
llm_results = pd.read_csv('ner_modification_results_llm.csv')
plm_results = pd.read_csv('ner_modification_results_plm.csv')

# Stack the rows of both tables; ignore_index renumbers the combined rows
combined_results = pd.concat([llm_results, plm_results], ignore_index=True)

# Save combined results
combined_results.to_csv('ner_modification_results_combined.csv', index=False)
print("\nCombined results saved to ner_modification_results_combined.csv")

# Repeat the same merge for the negation-subtype breakdown
negation_results_llm = pd.read_csv('ner_negation_type_results_llm.csv')
negation_results_plm = pd.read_csv('ner_negation_type_results_plm.csv')

combined_negation_results = pd.concat([negation_results_llm, negation_results_plm], ignore_index=True)

combined_negation_results.to_csv('ner_negation_type_results_combined.csv', index=False)
print("\nCombined negation results saved to ner_negation_type_results_combined.csv")


In [246]:
# Display labels of the modifications in the order used for table rows.
modification_order = [
    "B: Tem",
    "B: Geo",
    "B: Len",
    "O: Spell",
    "O: Cap",
    "O: Punc",
    "M: Deri",
    "M: Com",
    "Sx: Voice",
    "Sx: Gra",
    "Sx: Conj",
    "Sm: Con",
    "P: Neg",
    "P: Disc",
    "P: Senti",
    "G: Cas",
    "G: Dial",
]

In [None]:
import numpy as np

# Load the merged result table written by the combining step
df = pd.read_csv('ner_modification_results_combined.csv')

# Raw modification identifiers -> short display labels used as table rows
mod_mapping = dict(
    temporal_bias='B: Tem',
    geographical_bias='B: Geo',
    length_bias='B: Len',
    typo_bias='O: Spell',
    capitalization='O: Cap',
    punctuation='O: Punc',
    derivation='M: Deri',
    compound_word='M: Com',
    active_to_passive='Sx: Voice',
    grammatical_role='Sx: Gra',
    coordinating_conjunction='Sx: Conj',
    concept_replacement='Sm: Con',
    negation='P: Neg',
    discourse='P: Disc',
    sentiment='P: Senti',
    casual='G: Cas',
    dialectal='G: Dial',
)

# Column order of the models in the final table
model_order = ['BERT', 'GPT2', 'T5', 'gpt4o', 'claude', 'llama']

# Replace the raw identifiers with their display labels
df['modification'] = df['modification'].map(mod_mapping)

# One modification-by-model table per reported quantity, each with its rows
# arranged according to modification_order
pivot_df, p_values, significance = (
    df.pivot(index='modification', columns='model', values=column).reindex(modification_order)
    for column in ('mean_f1_pct_change', 'p_value', 'significance')
)

print(pivot_df)

# Function to generate color based on value
def get_color(val, sig):
    """Render one numeric table cell as a shaded LaTeX string.

    Args:
        val: Numeric cell value (a percentage change). NaN produces an
            empty cell.
        sig: Significance marker. '.', '*' and '**' make the number bold
            (asterisks are appended after the bold text); any other value
            leaves the number plain.

    Returns:
        '' when val is NaN; otherwise '\\cellcolor{<colour>!<shade>} <number>'
        where the colour is green for val > 0 and red otherwise (zero
        included), and the number is printed with one decimal place.
    """
    if np.isnan(val):
        return ''

    is_gain = val > 0
    # Shade grows linearly with |val| and saturates at a 10-point change,
    # giving a colour mix value between 0 and 30.
    shade = int(min(abs(val) / 10, 1) * 30)
    # Only strictly positive values get an explicit '+' sign.
    number = f'+{val:.1f}' if is_gain else f'{val:.1f}'

    # Checked in the same order as the markers are listed; first match wins.
    for marker, stars in (('.', ''), ('*', '*'), ('**', '**')):
        if sig == marker:
            number = f'\\textbf{{{number}}}{stars}'
            break

    colour = 'green' if is_gain else 'red'
    return f'\\cellcolor{{{colour}!{shade}}} {number}'

# Generate the LaTeX table: one row per modification, one column per model.
# Lines are collected in a list and joined once at the end.
table_lines = [
    '\\begin{table}[h]',
    '\\centering',
    '\\begin{tabular}{l' + 'r' * len(model_order) + '}',
    '\\hline',
    'Modification & ' + ' & '.join([f'\\textbf{{{col}}}' for col in model_order]) + ' \\\\',
    '\\hline',
]

prev_category = None
for idx, row in pivot_df.iterrows():
    # The category tag is everything before the colon ("B", "Sx", "Sm", ...).
    # Using only the first character would treat "Sx" and "Sm" as one group
    # and drop the rule that separates them.
    current_category = idx.split(':', 1)[0]
    if prev_category is not None and current_category != prev_category:
        table_lines.append('\\hline')
    prev_category = current_category

    # One coloured cell per model, in the column order of the header.
    cells = [get_color(row[col], significance.loc[idx, col]) for col in model_order]
    table_lines.append(f'\\textbf{{{idx}}} & ' + ' & '.join(cells) + ' \\\\')

table_lines += [
    '\\hline',
    '\\end{tabular}',
    '\\caption{Percentage Change in Micro F1 Score by Model and Modification Type}',
    '\\label{tab:ner_results}',
    '\\end{table}',
]
latex_table = '\n'.join(table_lines)

# Save to file
with open('ner_results_table.tex', 'w', encoding='utf-8') as f:
    f.write(latex_table)

print("LaTeX table saved to ner_results_table.tex")


In [None]:
# Load the combined long-format results for modifications and negation types.
results_df = pd.read_csv('ner_modification_results_combined.csv')
negation_results_df = pd.read_csv('ner_negation_type_results_combined.csv')

# Row and column orderings for the wide tables.
# NOTE(review): this rebinds the notebook-level `modification_order` to the
# keys of whichever `mod_mapping` is currently defined; it presumably expects
# the modification mapping (raw names), not the negation one - confirm the
# cell execution order.
modification_order = list(mod_mapping.keys())
model_order = ['BERT', 'GPT2', 'T5', 'gpt4o', 'claude', 'llama']
negation_order = ['verbal', 'lexical', 'double', 'approximate', 'absolute']

# Empty wide frames with two-level columns: (model, original/modified/diff).
columns = pd.MultiIndex.from_product([model_order, ['original', 'modified', 'diff']])
results_df_pivot = pd.DataFrame(index=modification_order, columns=columns)

negation_columns = pd.MultiIndex.from_product([model_order, ['original', 'modified', 'diff']])
negation_results_df_pivot = pd.DataFrame(index=negation_order, columns=negation_columns)

# Second-level column name -> source column holding that metric.
metric_sources = (
    ('original', 'original_mean_f1'),
    ('modified', 'modified_mean_f1'),
    ('diff', 'mean_f1_pct_change'),
)

# Fill both wide frames with the same procedure. Each job names the long
# source frame, the column identifying its rows, the row labels, and the
# wide frame to fill.
fill_jobs = (
    (results_df, 'modification', modification_order, results_df_pivot),
    (negation_results_df, 'negation_type', negation_order, negation_results_df_pivot),
)
for source_df, key_column, row_labels, target_df in fill_jobs:
    for mod in row_labels:
        for model in model_order:
            row = source_df[(source_df[key_column] == mod) & (source_df['model'] == model)]
            if row.empty:
                # No result for this combination: leave the cells as NaN.
                continue
            # If several rows match, the first one is used.
            for field, metric in metric_sources:
                target_df.loc[mod, (model, field)] = row[metric].values[0]


# Save to CSV
results_df_pivot.to_csv('ner_results_df.csv')
negation_results_df_pivot.to_csv('ner_negation_results_df.csv')

print("Results saved to ner_results_df.csv")
print("Negation results saved to ner_negation_results_df.csv")


In [None]:
import numpy as np

# Load the combined per-negation-type results (long format).
df = pd.read_csv('ner_negation_type_results_combined.csv')

# Display labels in row order, and the raw (lower-case) -> display mapping
# derived from them.
negation_order = ['Verbal', 'Lexical', 'Double', 'Approximate', 'Absolute']
mod_mapping = {label.lower(): label for label in negation_order}

# Column order of the models in the final table.
model_order = ['BERT', 'GPT2', 'T5', 'gpt4o', 'claude', 'llama']

# Replace raw negation-type names with display labels (unmapped names become NaN).
df['negation_type'] = df['negation_type'].map(mod_mapping)


def _wide_by_model(metric):
    # Rows = negation type (sorted per negation_order), columns = model.
    wide = df.pivot(index='negation_type', columns='model', values=metric)
    return wide.reindex(negation_order)


pivot_df = _wide_by_model('mean_f1_pct_change')
p_values = _wide_by_model('p_value')
significance = _wide_by_model('significance')

print(pivot_df)

# Function to generate color based on value
def get_color(val, sig):
    """Build the LaTeX source for a single coloured table cell.

    Args:
        val: Numeric cell value (a percentage change); NaN gives an empty
            cell.
        sig: Significance marker. '.' bolds the number, '*' and '**' bold
            it and append the asterisks; anything else leaves it plain.

    Returns:
        '' for NaN, else '\\cellcolor{<colour>!<shade>} <number>' with green
        for val > 0 and red for everything else (zero included).
    """
    if np.isnan(val):
        return ''

    # Strictly positive values are green and carry an explicit '+' sign.
    if val > 0:
        colour, number = 'green', f'+{val:.1f}'
    else:
        colour, number = 'red', f'{val:.1f}'

    # Translate the marker into the suffix that follows the bold number;
    # None means "not significant, do not bold".
    if sig == '.':
        stars = ''
    elif sig == '*':
        stars = '*'
    elif sig == '**':
        stars = '**'
    else:
        stars = None
    if stars is not None:
        number = '\\textbf{' + number + '}' + stars

    # Intensity is |val| / 10 capped at 1, scaled to a 0-30 colour mix value.
    strength = min(abs(val) / 10, 1)
    return '\\cellcolor{' + colour + '!' + str(int(strength * 30)) + '} ' + number

# Generate the LaTeX table: one row per negation type, one column per model.
# Lines are collected in a list and joined once at the end.
output_path = 'ner_negation_type_results_table.tex'

table_lines = [
    '\\begin{table}[h]',
    '\\centering',
    '\\begin{tabular}{l' + 'r' * len(model_order) + '}',
    '\\hline',
    'Negation Type & ' + ' & '.join([f'\\textbf{{{col}}}' for col in model_order]) + ' \\\\',
    '\\hline',
]

# Negation types carry no category prefix, so no rules are drawn between
# rows. Grouping by first character would put a rule after every row except
# between "Approximate" and "Absolute".
for idx, row in pivot_df.iterrows():
    # One coloured cell per model, in the column order of the header.
    cells = [get_color(row[col], significance.loc[idx, col]) for col in model_order]
    table_lines.append(f'\\textbf{{{idx}}} & ' + ' & '.join(cells) + ' \\\\')

table_lines += [
    '\\hline',
    '\\end{tabular}',
    '\\caption{Percentage Change in Micro F1 Score by Model and Negation Type}',
    # Distinct label so this table can share a document with the
    # modification table without a duplicate-label clash.
    '\\label{tab:ner_negation_results}',
    '\\end{table}',
]
latex_table = '\n'.join(table_lines)

# Save to file
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(latex_table)

# Report the path that was actually written.
print(f"LaTeX table saved to {output_path}")
