In [3]:
from datasets import load_dataset
import dspy
import openai
import os
import re
import pandas as pd
import ast
import json
from dotenv import load_dotenv
import numpy as np

In [4]:
load_dotenv()

True

In [5]:

openai.api_key = os.getenv('OPENAI_API_KEY')
openai.organization = os.getenv('OPENAI_ORGANIZATION')


In [6]:
lm = dspy.LM('together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo', temperature=0, max_tokens=1024)
dspy.configure(lm=lm)

In [41]:
ds = pd.read_json('../preprocessing/train_dev_test_data/ner/fewnerd_sample_test.json', encoding_errors='replace')
ds = ds.to_dict('records')


In [42]:
ds[0]

{'id': 1,
 'text': 'In the early 1930s the band moved to the Grill Room of the Taft Hotel in New York ; the band was renamed ``George Hall and His Hotel Taft Orchestra``.',
 'label': [{'Grill Room': 'BUILDING'},
  {'Taft Hotel': 'BUILDING'},
  {'New York': 'LOCATION'},
  {'George Hall and His Hotel Taft Orchestra': 'ORGANIZATION'}],
 'dataset': 'few_nerd',
 'entity': ['Grill Room',
  'Taft Hotel',
  'New York',
  'George Hall and His Hotel Taft Orchestra']}

In [43]:
def remove_space(text):
    """Clean up spacing and formatting in dialogue text."""
    lines = text.split('\n')
    cleaned_lines = []
    
    for line in lines:
        # Remove multiple spaces
        cleaned = ' '.join(line.split())
        
        # Fix spacing around punctuation
        cleaned = re.sub(r'\s+([.,!?:;])', r'\1', cleaned)
        cleaned = re.sub(r'([.,!?:;])\s+', r'\1 ', cleaned)
        
        # Fix contractions
        cleaned = re.sub(r'\s*\'\s*s\b', "'s", cleaned)
        cleaned = re.sub(r'\s*n\s*\'\s*t\b', "n't", cleaned)
        cleaned = re.sub(r'\s*\'\s*ve\b', "'ve", cleaned)
        cleaned = re.sub(r'\s*\'\s*re\b', "'re", cleaned)
        cleaned = re.sub(r'\s*\'\s*ll\b', "'ll", cleaned)
        cleaned = re.sub(r'\s*\'\s*d\b', "'d", cleaned)
        cleaned = re.sub(r'\s*\'\s*m\b', "'m", cleaned)
        
        # Fix spaces around parentheses
        cleaned = re.sub(r'\(\s+', '(', cleaned)
        cleaned = re.sub(r'\s+\)', ')', cleaned)
        
        # Remove leading/trailing whitespace
        cleaned = cleaned.strip()
        
        cleaned_lines.append(cleaned)
        
    return '\n'.join(cleaned_lines)


In [44]:
examples = [
    dspy.Example({ 
                  "text" : r["text"], 
                  "label": str(r['label'])
                }
                  ).with_inputs("text")
    
    for r in ds
    ]


In [45]:
example = examples[0]
for k, v in example.items():
    print(f"\n{k.upper()}:\n")
    print(v)



TEXT:

In the early 1930s the band moved to the Grill Room of the Taft Hotel in New York ; the band was renamed ``George Hall and His Hotel Taft Orchestra``.

LABEL:

[{'Grill Room': 'BUILDING'}, {'Taft Hotel': 'BUILDING'}, {'New York': 'LOCATION'}, {'George Hall and His Hotel Taft Orchestra': 'ORGANIZATION'}]


In [46]:
def calculate_f1_ent(gold_entities, predicted_entities):
    """
    Calculates the F1 score given the true labels and predicted labels.
    """
    # print("Input types:")
    # print(f"gold_entities type: {type(gold_entities)}")
    # print(f"predicted_entities type: {type(predicted_entities)}")
    
    if predicted_entities is None:
        return 0.0, 0.0, 0.0

    true_entities = {}
    pred_entities = {}
    
    # Convert to empty list if NaN
    def handle_nan(entities):
        # print(f"Handling NaN for: {type(entities)}")
        # If it's already a list, return it as is
        if isinstance(entities, list):
            return entities
        # Handle pandas/numpy types
        if isinstance(entities, (pd.Series, np.ndarray)):
            nan_check = pd.isna(entities)
            if isinstance(nan_check, (pd.Series, np.ndarray)):
                if nan_check.any():
                    return "[]"
            elif nan_check:
                return "[]"
        # Handle single values
        elif pd.isna(entities):
            return "[]"
        return entities

    # Handle NaN cases
    gold_entities = handle_nan(gold_entities)
    predicted_entities = handle_nan(predicted_entities)
            
    # Parse strings if needed
    if isinstance(gold_entities, str):
        gold_entities = ast.literal_eval(gold_entities)
    if isinstance(predicted_entities, str):
        predicted_entities = ast.literal_eval(predicted_entities)

    # Process gold entities
    # print(gold_entities)
    for entity in gold_entities:
        # print(entity)
        if isinstance(entity, str):
            entity = ast.literal_eval(entity)
        if entity.get('text') is not None:
            true_entities[entity['text']] = entity['value']
        else:
            for key, value in entity.items():
                true_entities[key] = value
    
    # Process predicted entities
    for entity in predicted_entities:
        if isinstance(entity, str):
            entity = ast.literal_eval(entity)
        if entity.get('text') is not None:  
            pred_entities[entity['text']] = entity['value']
        else:
            for key, value in entity.items():
                pred_entities[key] = value

    # Calculate metrics
    true_positives = sum(1 for text in true_entities if text in pred_entities and true_entities[text] == pred_entities[text])
    false_positives = sum(1 for text in pred_entities if text not in true_entities)
    false_negatives = sum(1 for text in true_entities if text not in pred_entities)

    if true_positives == 0:
        return 0.0, 0.0, 0.0

    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1_score = 2 * (precision * recall) / (precision + recall)

    return precision, recall, f1_score

In [47]:
def extract_prediction(pred):
    matches = re.findall(r"\[\{.*\}\]", pred)
    # print(matches)
    parsed_answer = matches[-1] if matches else ""
    if parsed_answer == "":
        return {}
    parsed_answer = ast.literal_eval(parsed_answer)
    return parsed_answer


In [48]:
def eval_metric(true, prediction, trace=None):
    pred = prediction.label
    
    matches = re.findall(r"\[\{.*\}\]", pred)
    # print(matches)
    parsed_answer = matches[-1] if matches else ""
    if parsed_answer == "":
        return 0.0
    parsed_answer = ast.literal_eval(parsed_answer)
    # print(type(parsed_answer))
    gold_entities = ast.literal_eval(true.label)
    # print(parsed_answer)
    precision, recall, f1_score = calculate_f1_ent(gold_entities=gold_entities, predicted_entities= parsed_answer)
    return f1_score

In [49]:
from dspy.evaluate import Evaluate

evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=10)


In [50]:
class Ent(dspy.Signature):
    """Extract named entities from the text. Possible entity type: ART, BUILDING, EVENT, LOCATION, ORGANIZATION, OTHER, PERSON, PRODUCT"""
    text = dspy.InputField()
    label = dspy.OutputField(desc='The list of named entities in the text: [{"text": the text span, "value": the entity label},].', prefix = 'Entities:')

In [51]:
class SimpleEnt(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict(Ent)

    def forward(self, text):

        return self.prog(text = text)


In [52]:
simple_ent = SimpleEnt()

In [53]:
pred = simple_ent(text=example.text)
print("\nTEXT:\n")
print(example.text)

print("\nANSWER:\n")
print(example.label)
print("\nPREDICTION:\n")
print(pred)



TEXT:

In the early 1930s the band moved to the Grill Room of the Taft Hotel in New York ; the band was renamed ``George Hall and His Hotel Taft Orchestra``.

ANSWER:

[{'Grill Room': 'BUILDING'}, {'Taft Hotel': 'BUILDING'}, {'New York': 'LOCATION'}, {'George Hall and His Hotel Taft Orchestra': 'ORGANIZATION'}]

PREDICTION:

Prediction(
    label='[{"text": "George Hall", "value": "PERSON"}, {"text": "Grill Room of the Taft Hotel", "value": "LOCATION"}, {"text": "New York", "value": "LOCATION"}, {"text": "Hotel Taft Orchestra", "value": "ORGANIZATION"}, {"text": "George Hall and His Hotel Taft Orchestra", "value": "ORGANIZATION"}, {"text": "Taft Hotel", "value": "LOCATION"}]'
)


In [54]:
eval_metric(example, pred)

0.5

In [55]:
evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=10, return_outputs= True, return_all_scores=True)
results = evaluate(simple_ent)
items = []
for sample in results[1]:

    item = {}
    sentence = sample[0]['text']
    label = sample[0]['label']
    if sample[1] == {}:
        pred = {}
    else:
        pred = sample[1]['label']
    item['text'] = sentence
    item['label'] = label
    item['pred'] = pred
    items.append(item)
df_result = pd.DataFrame(data = items)
df_result.to_csv('results/ner/llama-0shot-ner.csv')

Average Metric: 0.5 / 1  (50.0):   0%|          | 0/251 [00:00<?, ?it/s]

Average Metric: 154.06580151874277 / 251  (61.4): 100%|██████████| 251/251 [01:05<00:00,  3.83it/s]


Unnamed: 0,text,example_label,pred_label,eval_metric
0,In the early 1930s the band moved to the Grill Room of the Taft Hotel in New York ; the band was renamed ``George Hall...,"[{'Grill Room': 'BUILDING'}, {'Taft Hotel': 'BUILDING'}, {'New York': 'LOCATION'}, {'George Hall and His Hotel Taft Orchestra': 'ORGANIZATION'}]","[{""text"": ""George Hall"", ""value"": ""PERSON""}, {""text"": ""Grill Room of the Taft Hotel"", ""value"": ""LOCATION""}, {""text"": ""New York"", ""value"": ""LOCATION""}, {""text"": ""Hotel Taft Orchestra"", ""value"": ""ORGANIZATION""},...",✔️ [0.500]
1,"The final season of minor league play Elkin Memorial Park saw season attendance of 16,322, an average of 299 per contest.",[{'Elkin Memorial Park': 'LOCATION'}],"[{""text"": ""Elkin Memorial Park"", ""value"": ""LOCATION""}]",✔️ [1.000]
2,"They finished the season 14\u201319, 9\u20139 in C-USA play to finish in seventh place.",[{'C-USA play': 'EVENT'}],"[{""text"": ""C-USA"", ""value"": ""ORGANIZATION""}]",
3,"The B-52 pilot, Major Larry G.Messinger, later recalled,","[{'B-52': 'PRODUCT'}, {'Larry G.Messinger': 'PERSON'}]","[{""text"": ""B-52"", ""value"": ""PRODUCT""}, {""text"": ""Major Larry G.Messinger"", ""value"": ""PERSON""}]",✔️ [0.500]
4,The Austro-Hungarian Navy built and operated two classes of protected cruisers.,[{'Austro-Hungarian Navy': 'ORGANIZATION'}],"[{""text"": ""Austro-Hungarian Navy"", ""value"": ""ORGANIZATION""}]",✔️ [1.000]
5,Elin Hilderbrand is an American writer mostly of romance novels.,"[{'Elin Hilderbrand': 'PERSON'}, {'American': 'LOCATION'}]","[{""text"": ""Elin Hilderbrand"", ""value"": ""PERSON""}, {""text"": ""American"", ""value"": ""LOCATION""}]",✔️ [1.000]
6,"A prototype was fitted in the mid-'60s in a one-off DB5 extended 4'' after the doors and driven by Marek personally, and a normally 6-cylinder...","[{""DB5 extended 4''"": 'PRODUCT'}, {'Marek': 'PERSON'}, {'Aston Martin DB7': 'PRODUCT'}, {'V8 unit': 'PRODUCT'}]","[{""text"": ""DB5"", ""value"": ""PRODUCT""}, {""text"": ""Marek"", ""value"": ""PERSON""}, {""text"": ""Aston Martin DB7"", ""value"": ""PRODUCT""}]",✔️ [0.571]
7,"He has caught the attention of major publications and media outlets such as CNN, The Huffington Post, Hypebeast, The Guardian, Juxtapoz Magazine, Daily Mail, NRDC,...","[{'CNN': 'ORGANIZATION'}, {'The Huffington Post': 'ORGANIZATION'}, {'Hypebeast': 'ORGANIZATION'}, {'The Guardian': 'ORGANIZATION'}, {'Juxtapoz Magazine': 'ORGANIZATION'}, {'Daily Mail, NRDC': 'ORGANIZATION'}, {'Aljazeera': 'ORGANIZATION'}, {'Hi-Fructose Magazine': 'ORGANIZATION'}]","[{""text"": ""CNN"", ""value"": ""ORGANIZATION""}, {""text"": ""The Huffington Post"", ""value"": ""ORGANIZATION""}, {""text"": ""Hypebeast"", ""value"": ""ORGANIZATION""}, {""text"": ""The Guardian"", ""value"": ""ORGANIZATION""}, {""text"": ""Juxtapoz Magazine"", ""value"": ""ORGANIZATION""}, {""text"":...",✔️ [0.824]
8,The Cnidaria are a group of animals found exclusively in aquatic and mostly marine environments.,[{'Cnidaria': 'OTHER'}],"[{""text"": ""Cnidaria"", ""value"": ""OTHER""}]",✔️ [1.000]
9,"The Ninth suffered a serious defeat at the Battle of Camulodunum under Quintus Petillius Cerialis in the rebellion of Boudica (61), when most of the...","[{'Camulodunum': 'EVENT'}, {'Quintus Petillius Cerialis': 'PERSON'}, {'Boudica': 'EVENT'}, {'Camulodunum': 'LOCATION'}, {'Colchester': 'LOCATION'}]","[{""text"": ""Ninth"", ""value"": ""ORGANIZATION""}, {""text"": ""Battle of Camulodunum"", ""value"": ""EVENT""}, {""text"": ""Quintus Petillius Cerialis"", ""value"": ""PERSON""}, {""text"": ""Boudica"", ""value"": ""PERSON""}, {""text"": ""Camulodunum"", ""value"": ""LOCATION""}, {""text"":...",✔️ [0.750]


In [56]:
def evaluate_modified_set(ds, program):
    examples = [
    dspy.Example({ 
                  "text" : remove_space(r["modified_text"]), 
                  "label": str(r['modified_label']),
                  "original_text": remove_space(r['original_text']),
                  "original_label": str(r['original_label']),
                  "index": r['index'],
                  "type": r['subtype'] if 'subtype' in r else None
                #   "original_label": str(r['original_label'])
                }
                  ).with_inputs("text")
    
    for r in ds
    ]
    evaluate = Evaluate(devset= examples, metric=eval_metric, num_threads=6, display_progress=True, display_table=1, return_outputs= True, return_all_scores=True, provide_traceback=True)
    results = evaluate(program)
    return results

In [57]:
class Ent(dspy.Signature):
    """Extract named entities from the text. Possible entity type: ART, BUILDING, EVENT, LOCATION, ORGANIZATION, OTHER, PERSON, PRODUCT"""
    text = dspy.InputField()
    label = dspy.OutputField(desc='The list of named entities in the text: [{"text": the text span, "value": the entity label},].', prefix = "Entities:")

class SimpleEnt(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.Predict(Ent)

    def forward(self, text):

        return self.prog(text = text)
simple_ent = SimpleEnt()


In [58]:
import glob
import difflib


In [59]:
original_pred_ds = pd.read_csv('results/ner/llama-0shot-ner.csv', index_col=False)
original_pred_ds['text'] = original_pred_ds['text'].apply(lambda x: remove_space(x.encode('utf-8').decode('unicode-escape')))  # Replace 'your_function' with the actual function

# Get specific json files we want to process
json_files = glob.glob('../data/modified_data/ner/*_100.json')

for json_file in json_files:
    # if 'negation' not in json_file:
    #     continue
    # Load the json file
    print(json_file)
    with open(json_file, 'r') as f:
        data = json.load(f)
    
    results = evaluate_modified_set(data,simple_ent)

    # Convert results to dataframe
    items = []
    for sample in results[1]:
        item = {}
        sentence = sample[0]['text']
        label = sample[0]['label'] 
        # print(sample)
        if sample[1] == {}:
            pred = "[]"
        else:
            pred = sample[1]['label']
        item['text'] = sentence
        # print(label)
        # item['modified_label'] = [{entity['text']: entity['value']} for entity in label]
        item['modified_label'] = label
        pred = extract_prediction(pred)
        item['modified_pred'] = pred
        item['modified_pred'] = [{entity['text']: entity['value']} for entity in pred]
        original_text = sample[0]['original_text'].encode('utf-8').decode('unicode-escape')
        item['original_text'] = original_text
        index = sample[0]['index']
        # Find the best match for the original_text in the original_pred_ds['text'] using difflib
        matches = original_pred_ds['pred'].iloc[index] if index < len(original_pred_ds) else []
        # print(matches)
        item['original_label'] = sample[0]['original_label']
        item['original_pred'] = matches if matches else '[]'
        # Check if original_label is NaN and assign modified_label if it is
        if pd.isna(item['original_label']):
            item['original_label'] = item['modified_label']
        item['type'] = sample[0]['type']
        items.append(item)
    
    df_result = pd.DataFrame(data=items)
    
    # Save results with filename based on input json
    output_filename = f"results/ner/llama-0shot-{json_file.split('/')[-1].replace('.json', '')}_new.csv"
    df_result.to_csv(output_filename)


../preprocessing/data_after_phase2/rahmad/casual_100.json


Average Metric: 54.91143431731665 / 92  (59.7): 100%|██████████| 92/92 [00:00<00:00, 2542.07it/s] 


Unnamed: 0,text,example_label,original_text,original_label,index,type,pred_label,eval_metric
0,He produced Kim Fowley and BMX Bandits' ``Hidden Agenda At the Thirteenth Not`` for Receiver Records.,"[{'text': 'Kim Fowley', 'value': 'PERSON'}, {'text': 'BMX Bandits', 'value': 'ORGANIZATION'}, {'text': 'Receiver Records', 'value': 'ORGANIZATION'}, {'text': 'Hidden Agenda At the Thirteenth Not', 'value': 'ART'}]",He went on to produce Kim Fowley and the BMX Bandits (band) Receiver Records' album ``Hidden Agenda At the Thirteenth Not``.,"[{'Kim Fowley': 'PERSON'}, {'BMX Bandits': 'ORGANIZATION'}, {'Receiver Records': 'ORGANIZATION'}, {'Hidden Agenda At the Thirteenth Not': 'ART'}]",25,,"[{""text"": ""Kim Fowley"", ""value"": ""PERSON""}, {""text"": ""BMX Bandits"", ""value"": ""ORGANIZATION""}, {""text"": ""Receiver Records"", ""value"": ""ORGANIZATION""}]",✔️ [0.857]


../preprocessing/data_after_phase2/rahmad/discourse_100.json


Average Metric: 44.59163059163058 / 72  (61.9): 100%|██████████| 72/72 [00:00<00:00, 2821.86it/s] 


Unnamed: 0,text,example_label,original_text,original_label,index,type,pred_label,eval_metric
0,"Santa is actually innocent of the crime, which was instead masterminded by scheming relative Cousin Mel, who is mentioned briefly in the song, made into...","[{'text': 'Santa', 'value': 'PERSON'}, {'text': 'Cousin Mel', 'value': 'PERSON'}]","Moreover, Santa is actually innocent of the crime, which was instead masterminded by scheming relative Cousin Mel, who is mentioned briefly in the song but...","[{'Santa': 'PERSON'}, {'Cousin Mel': 'PERSON'}]",43,delete,"[{""text"": ""Santa"", ""value"": ""PERSON""}, {""text"": ""Cousin Mel"", ""value"": ""PERSON""}]",✔️ [1.000]


../preprocessing/data_after_phase2/rahmad/compound_word_100.json


Average Metric: 53.39575163398692 / 86  (62.1): 100%|██████████| 86/86 [00:00<00:00, 2779.95it/s] 


Unnamed: 0,text,example_label,original_text,original_label,index,type,pred_label,eval_metric
0,Most of the town is actually a high-security gated community called Orchid Island Golf and Beach Club.,"[{'text': 'Orchid Island Golf and Beach Club', 'value': 'LOCATION'}]",Most of the town is actually a gated community called Orchid Island Golf and Beach Club.,[{'Orchid Island Golf and Beach Club': 'LOCATION'}],47,,"[{""text"": ""Orchid Island Golf and Beach Club"", ""value"": ""LOCATION""}]",✔️ [1.000]


../preprocessing/data_after_phase2/rahmad/temporal_bias_100.json


Average Metric: 55.99611238434767 / 91  (61.5): 100%|██████████| 91/91 [00:00<00:00, 2999.44it/s] 


Unnamed: 0,text,example_label,original_text,original_label,index,type,pred_label,eval_metric
0,"Moreover, Santa is actually innocent of the crime, which was instead masterminded by scheming relative Cousin Mel, who is mentioned briefly in the song but...","[{'text': 'Santa', 'value': 'PERSON'}, {'text': 'Cousin Mel', 'value': 'PERSON'}]","Moreover, Santa is actually innocent of the crime, which was instead masterminded by scheming relative Cousin Mel, who is mentioned briefly in the song but...","[{'Santa': 'PERSON'}, {'Cousin Mel': 'PERSON'}]",43,,"[{""text"": ""Santa"", ""value"": ""PERSON""}, {""text"": ""Cousin Mel"", ""value"": ""PERSON""}]",✔️ [1.000]


../preprocessing/data_after_phase2/rahmad/coordinating_conjunction_100.json


Average Metric: 46.700649350649336 / 61  (76.6): 100%|██████████| 61/61 [00:00<00:00, 2696.02it/s]


Unnamed: 0,text,example_label,original_text,original_label,index,type,pred_label,eval_metric
0,"Internal conflicts, especially between Covaci and Baniciu, were escalating and making headlines in the media.","[{'text': 'Baniciu', 'value': 'PERSON'}, {'text': 'Covaci', 'value': 'PERSON'}]","Internal conflicts, especially between Covaci and Baniciu, were making headlines in the media.","[{'text': 'Baniciu', 'value': 'PERSON'}, {'text': 'Covaci', 'value': 'PERSON'}]",57,,"[{""text"": ""Covaci"", ""value"": ""PERSON""}, {""text"": ""Baniciu"", ""value"": ""PERSON""}]",✔️ [1.000]


../preprocessing/data_after_phase2/rahmad/capitalization_100.json


Average Metric: 58.761624649859925 / 100  (58.8): 100%|██████████| 100/100 [00:00<00:00, 3063.55it/s]


Unnamed: 0,text,example_label,original_text,original_label,index,type,pred_label,eval_metric
0,"The B-52 PILOT, Major Larry G.Messinger, later recalled,","[{'text': 'B-52', 'value': 'PRODUCT'}, {'text': 'Larry G.Messinger', 'value': 'PERSON'}]","The B-52 pilot, Major Larry G.Messinger, later recalled,","[{'B-52': 'PRODUCT'}, {'Larry G.Messinger': 'PERSON'}]",3,,"[{""text"": ""B-52"", ""value"": ""PRODUCT""}, {""text"": ""Major Larry G.Messinger"", ""value"": ""PERSON""}]",✔️ [0.500]


../preprocessing/data_after_phase2/rahmad/dialectal_100.json


Average Metric: 61.20051353874881 / 99  (61.8): 100%|██████████| 99/99 [00:00<00:00, 3242.31it/s] 


Unnamed: 0,text,example_label,original_text,original_label,index,type,pred_label,eval_metric
0,"The kampong people also not happy lor, then from 1846 to 1848, they more and more against Sturdza.","[{'text': 'Sturdza', 'value': 'PERSON'}]","The peasantry was also aggrieved, and between 1846 and 1848 opposition to Sturdza intensified.",[{'Sturdza': 'PERSON'}],41,singaporean_english,"[{""text"": ""Sturdza"", ""value"": ""PERSON""}]",✔️ [1.000]


../preprocessing/data_after_phase2/rahmad/sentiment_100.json


Average Metric: 77.45829725829728 / 123  (63.0): 100%|██████████| 123/123 [00:00<00:00, 264.27it/s]


Unnamed: 0,text,example_label,original_text,original_label,index,type,pred_label,eval_metric
0,"Loyalists reluctantly recruited from Queens County, New York by Lieutenant Colonel Richard Hewlett for the 3rd battalion DeLancey's Brigade had barely established a fortified position...","[{'text': 'Queens County', 'value': 'LOCATION'}, {'text': 'New York', 'value': 'LOCATION'}, {'text': 'Richard Hewlett', 'value': 'PERSON'}, {'text': ""DeLancey's Brigade"", 'value': 'ORGANIZATION'}, {'text': 'Long Island', 'value': 'LOCATION'},...","Loyalists recruited from Queens County, New York by Lieutenant Colonel Richard Hewlett for the 3rd battalion DeLancey's Brigade had established a fortified position in early...","[{'Queens County': 'LOCATION'}, {'New York': 'LOCATION'}, {'Richard Hewlett': 'PERSON'}, {""DeLancey's Brigade"": 'ORGANIZATION'}, {'Long Island': 'LOCATION'}, {'Setauket': 'LOCATION'}, {'Long Island Sound': 'LOCATION'}, {'Fairfield': 'LOCATION'}]",65,negative,"[{""text"": ""Queens County"", ""value"": ""LOCATION""}, {""text"": ""New York"", ""value"": ""LOCATION""}, {""text"": ""Richard Hewlett"", ""value"": ""PERSON""}, {""text"": ""DeLancey's Brigade"", ""value"": ""ORGANIZATION""}, {""text"": ""Long Island"", ""value"": ""LOCATION""},...",✔️ [1.000]


../preprocessing/data_after_phase2/rahmad/grammatical_role_100.json


Average Metric: 56.80534759358287 / 83  (68.4): 100%|██████████| 83/83 [00:00<00:00, 2509.30it/s] 


Unnamed: 0,text,example_label,original_text,original_label,index,type,pred_label,eval_metric
0,"Her brother ran unsuccessfully for New York from the United States House of Representatives upstate, in 1992 and 1994.","[{'text': 'New York', 'value': 'LOCATION'}, {'text': 'United States House of Representatives', 'value': 'ORGANIZATION'}]","Her brother ran unsuccessfully for the United States House of Representatives from upstate New York, in 1992 and 1994.","[{'text': 'New York', 'value': 'LOCATION'}, {'text': 'United States House of Representatives', 'value': 'ORGANIZATION'}]",63,,"[{""text"": ""New York"", ""value"": ""LOCATION""}, {""text"": ""United States House of Representatives"", ""value"": ""ORGANIZATION""}]",✔️ [1.000]


../preprocessing/data_after_phase2/rahmad/length_bias_100.json


Average Metric: 54.911434317316655 / 92  (59.7): 100%|██████████| 92/92 [00:00<00:00, 2888.16it/s]


Unnamed: 0,text,example_label,original_text,original_label,index,type,pred_label,eval_metric
0,He produced Kim Fowley and BMX Bandits' ``Hidden Agenda At the Thirteenth Not`` for Receiver Records.,"[{'text': 'Kim Fowley', 'value': 'PERSON'}, {'text': 'BMX Bandits', 'value': 'ORGANIZATION'}, {'text': 'Receiver Records', 'value': 'ORGANIZATION'}, {'text': 'Hidden Agenda At the Thirteenth Not', 'value': 'ART'}]",He went on to produce Kim Fowley and the BMX Bandits (band) Receiver Records' album ``Hidden Agenda At the Thirteenth Not``.,"[{'Kim Fowley': 'PERSON'}, {'BMX Bandits': 'ORGANIZATION'}, {'Receiver Records': 'ORGANIZATION'}, {'Hidden Agenda At the Thirteenth Not': 'ART'}]",25,shorter,"[{""text"": ""Kim Fowley"", ""value"": ""PERSON""}, {""text"": ""BMX Bandits"", ""value"": ""ORGANIZATION""}, {""text"": ""Receiver Records"", ""value"": ""ORGANIZATION""}]",✔️ [0.857]


../preprocessing/data_after_phase2/rahmad/concept_replacement_100.json


Average Metric: 50.61690009337067 / 85  (59.5): 100%|██████████| 85/85 [00:00<00:00, 3182.81it/s] 


Unnamed: 0,text,example_label,original_text,original_label,index,type,pred_label,eval_metric
0,"It is the brainchild of Golaem, a France-based software company (born in Rennes in 2009).","[{'text': 'Golaem', 'value': 'ORGANIZATION'}, {'text': 'France', 'value': 'LOCATION'}, {'text': 'Rennes', 'value': 'LOCATION'}]","It is developed by Golaem, a France -based software company (created in Rennes in 2009).","[{'Golaem': 'ORGANIZATION'}, {'France': 'LOCATION'}, {'Rennes': 'LOCATION'}]",28,idiom,"[{""text"": ""Golaem"", ""value"": ""ORGANIZATION""}, {""text"": ""France"", ""value"": ""LOCATION""}, {""text"": ""Rennes"", ""value"": ""LOCATION""}]",✔️ [1.000]


../preprocessing/data_after_phase2/rahmad/typo_bias_100.json


Average Metric: 59.62302436125964 / 100  (59.6): 100%|██████████| 100/100 [00:00<00:00, 1410.18it/s]


Unnamed: 0,text,example_label,original_text,original_label,index,type,pred_label,eval_metric
0,"A German teacher for much of her life, MacKeith also advocated for peace during the Aldermaston Marches in the 1950s and demonstrations against the Vietnam...","[{'text': 'German', 'value': 'LOCATION'}, {'text': 'MacKeith', 'value': 'PERSON'}, {'text': 'Aldermaston Marches', 'value': 'EVENT'}, {'text': 'Vietnam War', 'value': 'EVENT'}, {'text': 'Grosvenor Squar', 'value': 'LOCATION'}]","A German teacher for much of her life, MacKeith also advocated for peace during the Aldermaston Marches in the 1950s and demonstrations against the Vietnam...","[{'German': 'LOCATION'}, {'MacKeith': 'PERSON'}, {'Aldermaston Marches': 'EVENT'}, {'Vietnam War': 'EVENT'}, {'Grosvenor Square': 'LOCATION'}]",74,,"[{""text"": ""Aldermaston Marches"", ""value"": ""EVENT""}, {""text"": ""Vietnam War"", ""value"": ""EVENT""}, {""text"": ""Grosvenor Squar"", ""value"": ""LOCATION""}, {""text"": ""MacKeith"", ""value"": ""PERSON""}]",✔️ [0.889]


../preprocessing/data_after_phase2/rahmad/geographical_bias_100.json


Average Metric: 75.67702210070631 / 102  (74.2): 100%|██████████| 102/102 [00:00<00:00, 3032.54it/s]


Unnamed: 0,text,example_label,original_text,original_label,index,type,pred_label,eval_metric
0,The Nauruan fishing community built and operated two types of advanced fishing vessels.,"[{'text': 'Nauruan fishing community', 'value': 'ORGANIZATION'}]",The Austro-Hungarian Navy built and operated two classes of protected cruisers.,"[{'text': 'Austro-Hungarian Navy', 'value': 'ORGANIZATION'}]",4,,"[{""text"": ""Nauruan"", ""value"": ""LOCATION""}]",


../preprocessing/data_after_phase2/rahmad/punctuation_100.json


Average Metric: 53.40021645021644 / 100  (53.4): 100%|██████████| 100/100 [00:00<00:00, 3170.37it/s]


Unnamed: 0,text,example_label,original_text,original_label,index,type,pred_label,eval_metric
0,"Conway is the hub of operations for Norfolk Southern in the Greater Pittsburgh area, featuring a hump yard and a crew change point for virtually...","[{'text': 'Conway', 'value': 'LOCATION'}, {'text': 'Norfolk Southern', 'value': 'LOCATION'}, {'text': 'Greater Pittsburgh', 'value': 'LOCATION'}, {'text': 'Pittsburgh', 'value': 'LOCATION'}, {'text': 'FortWayne Line', 'value': 'LOCATION'}]","Conway is the hub of operations for Norfolk Southern in the Greater Pittsburgh area, featuring a hump yard and a crew change point for virtually...","[{'Conway': 'LOCATION'}, {'Norfolk Southern': 'LOCATION'}, {'Greater Pittsburgh': 'LOCATION'}, {'Pittsburgh': 'LOCATION'}, {'Fort Wayne Line': 'LOCATION'}]",35,,"[{""text"": ""Conway"", ""value"": ""LOCATION""}, {""text"": ""Norfolk Southern"", ""value"": ""ORGANIZATION""}, {""text"": ""Greater Pittsburgh"", ""value"": ""LOCATION""}, {""text"": ""Pittsburgh"", ""value"": ""LOCATION""}, {""text"": ""FortWayne Line"", ""value"": ""LOCATION""}]",✔️ [1.000]


../preprocessing/data_after_phase2/rahmad/derivation_100.json


Average Metric: 43.966170104405386 / 69  (63.7): 100%|██████████| 69/69 [00:00<00:00, 1098.27it/s]


Unnamed: 0,text,example_label,original_text,original_label,index,type,pred_label,eval_metric
0,The government announced a national funeral and a day of national mourning.,[],The government announced a state funeral and a day of national mourning.,[],46,,"[{""text"": ""government"", ""value"": ""ORGANIZATION""}, {""text"": ""national funeral"", ""value"": ""EVENT""}, {""text"": ""day of national mourning"", ""value"": ""EVENT""}]",


../preprocessing/data_after_phase2/rahmad/active_to_passive_100.json


Average Metric: 50.45845004668533 / 81  (62.3): 100%|██████████| 81/81 [00:00<00:00, 329.93it/s] 


Unnamed: 0,text,example_label,original_text,original_label,index,type,pred_label,eval_metric
0,"Genre classics are focused on by Back to Basics, with older movies and underground cult films being shown, and with occasional stabs at mainstream and...","[{'text': 'Back to Basics', 'value': 'ORGANIZATION'}]","Back to Basics focuses on genre classics, showing older movies and underground cult films, with occasional stabs at mainstream and newer genre pieces.",[{'Back to Basics': 'ORGANIZATION'}],56,,"[{""text"": ""Back to Basics"", ""value"": ""ORGANIZATION""}]",✔️ [1.000]


../preprocessing/data_after_phase2/rahmad/negation_100.json


Average Metric: 63.374306739012596 / 110  (57.6): 100%|██████████| 110/110 [00:00<00:00, 1793.37it/s]


Unnamed: 0,text,example_label,original_text,original_label,index,type,pred_label,eval_metric
0,"It is developed by no company, neither Golaem nor any other (created in Rennes in 2009).","[{'text': 'Golaem', 'value': 'ORGANIZATION'}, {'text': 'Rennes', 'value': 'LOCATION'}]","It is developed by Golaem, a France -based software company (created in Rennes in 2009).","[{'Golaem': 'ORGANIZATION'}, {'France': 'LOCATION'}, {'Rennes': 'LOCATION'}]",28,absolute,"[{""text"": ""Golaem"", ""value"": ""ORGANIZATION""}, {""text"": ""Rennes"", ""value"": ""LOCATION""}]",✔️ [1.000]


In [113]:
from scipy import stats

In [115]:
result_files = glob.glob('results/ner/llama-0shot-*_100.csv')

aggregated_results = []

def convert_string_to_entities(entity_str):
    """Convert string representation of entities to proper format"""
    if isinstance(entity_str, str):
        try:
            # Convert string to list of dicts
            entities = ast.literal_eval(entity_str)
            # Handle nested lists by flattening
            if isinstance(entities, list):
                # Handle double nested lists
                if len(entities) > 0 and isinstance(entities[0], list):
                    entities = entities[0]
                # Handle list of dicts with text/value format
                if len(entities) > 0 and isinstance(entities[0], dict):
                    # Handle format with text/value keys
                    if 'text' in entities[0]:
                        return entities
                    # Handle format with single key-value pair
                    if len(entities[0]) == 1:
                        converted = []
                        for e in entities:
                            for text, value in e.items():
                                converted.append({'text': text, 'value': value})
                        return converted
                    # Handle format with multiple key-value pairs
                    converted = []
                    for e in entities:
                        for text, value in e.items():
                            if isinstance(value, str):
                                converted.append({'text': text, 'value': value})
                    return converted
            return entities
        except:
            return []
    return entity_str

for file in result_files:
    # Extract modification type from filename
    mod_type = file.split('-')[-1].replace('.csv','')
    print(mod_type)
    # Read results file
    df = pd.read_csv(file)

    # Collect all predictions and labels for micro F1 calculation
    all_original_labels = []
    all_original_preds = []
    all_modified_labels = []
    all_modified_preds = []

    for idx, row in df.iterrows():
        # Convert string representations to proper format
        original_label = convert_string_to_entities(row['original_label'])
        original_pred = convert_string_to_entities(row['original_pred'])
        modified_label = convert_string_to_entities(row['modified_label'])
        modified_pred = convert_string_to_entities(row['modified_pred'])

        # Append to combined lists
        all_original_labels.extend(original_label)
        all_original_preds.extend(original_pred)
        all_modified_labels.extend(modified_label)
        all_modified_preds.extend(modified_pred)

    # Calculate micro F1 scores using calculate_f1_ent
    original_precision, original_recall, original_f1 = calculate_f1_ent(all_original_labels, all_original_preds)
    modified_precision, modified_recall, modified_f1 = calculate_f1_ent(all_modified_labels, all_modified_preds)
    
    # Calculate the difference between original and modified F1 scores
    difference = -round(original_f1 - modified_f1, 2)
    
    # Calculate percentage difference with respect to original F1
    pct_difference = -round((original_f1 - modified_f1) / original_f1 * 100, 2) if original_f1 != 0 else 0
    
    # Perform t-test between original and modified predictions
    t_stat, p_value = stats.ttest_ind(
        (df['original_pred'] == df['original_label']).astype(float),
        (df['modified_pred'] == df['modified_label']).astype(float)
    )
    
    aggregated_results.append({
        'task': 'named_entity_recognition',
        'modification': mod_type,
        'original_res': round(original_f1 , 2),  # Convert to percentage
        'modified_res': round(modified_f1 , 2),  # Convert to percentage
        'difference': difference,
        'pct_difference': pct_difference,
        'p_value': p_value,
        'original_precision': round(original_precision , 2),  # Convert to percentage
        'original_recall': round(original_recall , 2),  # Convert to percentage
        'modified_precision': round(modified_precision , 2),  # Convert to percentage
        'modified_recall': round(modified_recall , 2)  # Convert to percentage
    })

# Create final results dataframe
results_df = pd.DataFrame(aggregated_results)

# Sort the results based on modification_name
modification_name = ['temporal_bias_100', 'geographical_bias_100','length_bias_100', 'typo_bias_100', 'capitalization_100', 'punctuation_100', 'derivation_100', 'compound_word_100','active_to_passive_100','grammatical_role_100', 'coordinating_conjunction_100', 'concept_replacement_100','negation_100','discourse_100','sentiment_100','casual_100', 'dialectal_100']
results_df['modification'] = pd.Categorical(results_df['modification'], categories=modification_name, ordered=True)
results_df = results_df.sort_values(by='modification')

# Calculate averages across all modifications
avg_original = results_df['original_res'].mean()
avg_modified = results_df['modified_res'].mean()
avg_difference = avg_original - avg_modified
avg_pct_difference = results_df['pct_difference'].mean()
avg_orig_precision = results_df['original_precision'].mean()
avg_orig_recall = results_df['original_recall'].mean()
avg_mod_precision = results_df['modified_precision'].mean()
avg_mod_recall = results_df['modified_recall'].mean()

# Add averages as a new row
results_df.loc[len(results_df)] = {
    'task': 'named_entity_recognition',
    'modification': 'average',
    'original_res': round(avg_original, 2),
    'modified_res': round(avg_modified, 2),
    'difference': -round(avg_difference, 2),
    'pct_difference': round(avg_pct_difference, 2),
    'p_value': None,
    'original_precision': round(avg_orig_precision, 2),
    'original_recall': round(avg_orig_recall, 2),
    'modified_precision': round(avg_mod_precision, 2),
    'modified_recall': round(avg_mod_recall, 2)
}

print("\n")
results_df.to_csv('results/ner/llama-DP.csv')

# Apply styling to highlight rows where original_res > modified_res and significant p-values
def highlight_drops_and_significance(row):
    colors = [''] * len(row)
    if row['original_res'] > row['modified_res']:
        colors = ['background-color: red'] * len(row)
        # If p-value < 0.05, add bold text
        if 'p_value' in row and row['p_value'] is not None and row['p_value'] < 0.05:
            colors = ['background-color: red; font-weight: bold'] * len(row)
    return colors

results_df.round(2).style.apply(highlight_drops_and_significance, axis=1)


concept_replacement_100
negation_100
temporal_bias_100
grammatical_role_100
discourse_100
coordinating_conjunction_100
geographical_bias_100
punctuation_100
length_bias_100
capitalization_100
active_to_passive_100
compound_word_100
dialectal_100
typo_bias_100
casual_100
derivation_100
sentiment_100




  results_df.loc[len(results_df)] = {


Unnamed: 0,task,modification,original_res,modified_res,difference,pct_difference,p_value,original_precision,original_recall,modified_precision,modified_recall
2,named_entity_recognition,temporal_bias_100,0.76,0.73,-0.03,-3.53,1.0,0.73,0.79,0.71,0.75
6,named_entity_recognition,geographical_bias_100,0.71,0.74,0.03,4.43,0.0,0.65,0.78,0.74,0.74
8,named_entity_recognition,length_bias_100,0.75,0.69,-0.07,-8.9,1.0,0.7,0.81,0.66,0.71
13,named_entity_recognition,typo_bias_100,0.72,0.74,0.02,2.79,1.0,0.7,0.74,0.72,0.77
9,named_entity_recognition,capitalization_100,0.78,0.67,-0.1,-13.34,1.0,0.77,0.78,0.71,0.64
7,named_entity_recognition,punctuation_100,0.75,0.69,-0.06,-7.39,1.0,0.72,0.78,0.68,0.7
15,named_entity_recognition,derivation_100,0.75,0.75,-0.0,-0.44,1.0,0.73,0.79,0.71,0.79
11,named_entity_recognition,compound_word_100,0.76,0.74,-0.02,-3.15,0.79,0.74,0.79,0.75,0.73
10,named_entity_recognition,active_to_passive_100,0.76,0.68,-0.08,-9.96,1.0,0.73,0.79,0.72,0.66
3,named_entity_recognition,grammatical_role_100,0.78,0.75,-0.03,-3.82,0.01,0.76,0.81,0.76,0.75
