# Overview

This notebook computes exact entity F1, Recall, and Precision scores against the Human-Labeled entities, which we consider to be ground truth.

# Env Setup

In [None]:
import pandas as pd
import ast
import csv
import math
from google.colab import drive

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/MyDrive/'6.8611 Research Project'/'Colab Notebooks'

/content/drive/MyDrive/6.8611 Research Project/Colab Notebooks


# Data Preprocessing

In [None]:
# load the datasets into dataframes
def load_tsv_dataset(file_path):
  """
  Loads a tsv dataset. Renames thne columns to 'token' and 'label'.
  Note that renaming the columns will overwrite the first row of the dataframe
  """
  df = pd.read_csv(file_path, delimiter='\t', header=None, engine='python')
  df.columns = ['token', 'label']
  return df

def load_csv_dataset(file_path):
  """
  Loads a csv dataset. Renames thne columns to 'token' and 'label'.
  Note that renaming the columns will overwrite the first row of the dataframe
  """
  df = pd.read_csv(file_path, delimiter=',', header=None, engine='python')
  df.columns = ['token', 'label']
  return df

In [None]:
# adjust dataset_name, num_shots, file_name_end
dataset_name = "NCBI-disease"
num_shots = "few_shot"
file_name_end = "-devel.csv"

all_tokens = load_tsv_dataset("llm-annotations/datasets/"+dataset_name+"/devel.tsv")['token'].tolist()
all_labels = load_tsv_dataset("llm-annotations/datasets/"+dataset_name+"/devel.tsv")['label'].tolist()
llm_devel_tokens = load_csv_dataset("devel_gpt_generated_datasets/"+num_shots+"/"+dataset_name+file_name_end)['token'].tolist()
llm_devel_labels = load_csv_dataset("devel_gpt_generated_datasets/"+num_shots+"/"+dataset_name+file_name_end)['label'].tolist()

In [None]:
# load_<file_type>_dataset's dataframe reads null as nan, so we convert nan back to null
for i in range(len(all_tokens)):
    token = all_tokens[i]
    if isinstance(token,float) and str(token)=="nan":
        all_tokens[i] = "null"

for i in range(len(llm_devel_tokens)):
    token = llm_devel_tokens[i]
    if isinstance(token,float) and str(token)=="nan":
        llm_devel_tokens[i] = "null"

for i in range(len(llm_devel_labels)):
  if isinstance(llm_devel_labels[i], float):
    # no label next to tokens "Example", "Text: " for JNLPBA one_shot
    llm_devel_labels[i] = "O"
  else:
    llm_devel_labels[i] = llm_devel_labels[i].split("-")[0]

In [None]:
print("Dataset: "+dataset_name)
print("# shots: "+num_shots)

Dataset: NCBI-disease
# shots: few_shot


# Extract Entities
An entity is a sequence of tokens starting with a token labeled as “B” and ends at the last consecutive token labeled as “I” (the token after this token is either labeled as “O” or the last token labeled “I” is the last token in the dataset). For intrinsic evaluation, we filtered for all the unique entities in the human-annotated and LLM-generated datasets such that later on if we find an entity in LLM-generated dataset that's not in human-annotated dataset, we can skip over that entity and know that it can't be correct due to a named entity being correct only if it is an exact match of the corresponding entity in the original human-annotated file.

In [None]:
all_orig_entities = set()
i = 0

while i<len(all_tokens):
    current_entity = ""
    if (all_labels[i]=="B"):
        current_entity+=all_tokens[i]
        i+=1
        while (all_labels[i] == "I"):
            current_entity+=" " + all_tokens[i]
            i+=1
        all_orig_entities.add(current_entity)
        current_entity = ""
    else:
        i+=1

In [None]:
# Get all unique entities in LLM-generated dataset
all_llm_entities = set()
i = 0
while i<len(llm_devel_tokens):
    current_entity = ""
    if (llm_devel_labels[i]=="B"):
        entity_first_token_index = i
        current_entity+=llm_devel_tokens[i]
        i+=1
        while (llm_devel_labels[i] == "I"):
            current_entity+=" " + llm_devel_tokens[i]
            i+=1
        all_llm_entities.add(current_entity)
        current_entity = ""
    else:
        i+=1

In [None]:
array_orig_entities = []
i = 0
while i<len(all_tokens):
    current_entity = ""
    if (all_labels[i]=="B"):
        current_entity+=all_tokens[i]
        entity_start_i = i
        i+=1
        while (all_labels[i] == "I"):
            current_entity+=" " + all_tokens[i]
            i+=1
        before_context = []
        context_start_i = entity_start_i
        while (context_start_i>0 and len(before_context)<5):
            before_context_token = all_tokens[context_start_i-1]
            if (context_start_i-1!=0):
                before_context_token = " "+before_context_token
            elif (context_start_i-1!=len(all_tokens)-1):
                before_context_token += " "
            before_context.append(before_context_token)
            context_start_i -=1
        before_context = before_context[::-1]

        after_context_i = i
        after_context = []
        while (after_context_i<len(all_tokens) and len(after_context)<5):
            after_context_token = all_tokens[after_context_i]
            if (after_context_i!=0):
                after_context_token = " "+after_context_token
            elif (after_context_i!=len(all_tokens)-1):
                after_context_token += " "
            after_context.append(after_context_token)
            after_context_i+=1
        array_orig_entities.append((current_entity, before_context, after_context,entity_start_i))

        current_entity = ""
    else:
        i+=1

In [None]:
array_llm_entities = []
i = 0
while i<len(llm_devel_tokens):
    current_entity = ""
    if (llm_devel_labels[i]=="B"):
        entity_first_token_index = i
        current_entity+=llm_devel_tokens[i]
        entity_start_i = i
        i+=1
        while (llm_devel_labels[i] == "I"):
            current_entity+=" " + llm_devel_tokens[i]
            i+=1
        if (current_entity in all_orig_entities):
            before_context = []
            while (entity_start_i>=0 and len(before_context)<5):
                before_context.append(llm_devel_tokens[entity_start_i-1])
                entity_start_i -=1
            before_context = before_context[::-1]

            after_context_i = i
            after_context = []
            while (after_context_i<len(llm_devel_tokens) and len(after_context)<5):
                after_context.append(llm_devel_tokens[after_context_i])
                after_context_i+=1
            array_llm_entities.append((current_entity, before_context, after_context))

        current_entity = ""
    else:
        i+=1

# Compute Label Accuracy

Keep track of which entity in array of entities from the LLM-generated dataset that we're at with an index counter variable llm_entity_check_index.
Iterate over each entity in an array of entities from the human-annotated dataset.
  - If the entity from human-annotated dataset is in set of all unique entities in LLM-generated dataset, compare that entity's before_context (array containing up to 5 tokens that were in front of this entity in human-annotated dataset) and after_context (array containing up to 5 tokens that were in front of this entity in human-annotated dataset) with the before_context and after_context of the entity in LLM-generated dataset[llm_entity_check_index:] if the entity in LLM-generated dataset has the same value. The two entities' are considered to be at the same position with sufficiently similar context if at least 3 of the 5 tokens in before_context and 3 of the 5 tokens in after_context have the same value and are in the same relative order. This estimates that there are 1-2 hallucinated/merged/different punctuation tokens in the LLM-generated dataset for every 5 tokens preceding and following a named entity.
  - After matching an entity in human-annotated dataset with a corresponding entity in LLM-generated dataset, we increment llm_entity_check_index to prevent future entities in the human-annotated dataset with the same context from automatically being matched to an entity in LLM-generated dataset that already corresponds to

In [None]:
llm_entity_check_index = 0
print(len(all_llm_entities))
num_correct = 0
for entity_context in array_orig_entities:
    entity, before_context, after_context, entity_start_i = entity_context
    if entity in all_llm_entities:
        for i in range(llm_entity_check_index,len(array_llm_entities)):
            llm_entity, llm_before_context,llm_after_context = array_llm_entities[i]
            if (llm_entity == entity):
                llm_before_context_string = ' '.join(llm_before_context)
                llm_after_context_string = ' '.join(llm_after_context)
                num_before_context_match = 0
                num_after_context_match = 0

                last_before_context_i = 0
                for context_token in before_context:
                    token_i = llm_before_context_string[last_before_context_i:].find(context_token)
                    if (token_i!=-1):
                        num_before_context_match+=1
                        last_before_context_i = token_i

                last_after_context_i = 0
                for context_token in after_context:
                    token_i = llm_after_context_string[last_after_context_i:].find(context_token)
                    if (token_i!=-1):
                        num_after_context_match+=1
                        last_after_context_i = token_i
                if (num_before_context_match>=3) and (num_after_context_match>=3):
                  num_correct+=1
                  llm_entity_check_index=i+1
                  break
print("# tokens in original devel dataset: "+str(len(array_orig_entities)))
print("# tokens in LLM-generated dataset: "+str(len(array_llm_entities)))
print("correct: "+str(num_correct))
print("precision: "+str(num_correct/len(array_orig_entities)))

596
# tokens in original devel dataset: 787
# tokens in LLM-generated dataset: 395
correct: 256
precision: 0.3252858958068615
