# Overview
Archived code for computing missing and hallucinated entities within GPT's labels.

# Env Setup

In [None]:
import pandas as pd
import ast
import csv
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project's notebook folder on the mounted drive
# (presumably the relative data paths used in later cells are resolved from here — verify)
cd drive/MyDrive/'6.8611 Research Project'/'Colab Notebooks'

/content/drive/MyDrive/6.8611 Research Project/Colab Notebooks


# Data Preprocessing

In [None]:
# load the datasets into dataframes
def load_tsv_dataset(file_path):
  """
  Loads a headerless, tab-separated dataset into a dataframe with the columns
  'token' and 'label'.

  The file is read with header=None, so its first line is kept as a data row;
  only the default integer column labels are replaced.

  Quote characters are read literally (csv.QUOTE_NONE). With the default
  quoting, a token that is a bare double quote opens a quoted field and
  silently swallows the following rows.

  pandas' default missing-value handling still applies, so a token such as
  'null' comes back as NaN.

  file_path: path of the .tsv file to read
  Returns: pandas dataframe with one row per line of the file
  Raises: ValueError if the parsed data does not have exactly two columns
  """
  df = pd.read_csv(
      file_path,
      delimiter='\t',
      header=None,
      engine='python',
      quoting=csv.QUOTE_NONE,  # tokens are raw text, never quoted fields
  )
  df.columns = ['token', 'label']
  return df

def load_csv_dataset(file_path):
  """
  Reads a headerless, comma-separated file into a two-column dataframe.

  Because the file is parsed with header=None, its first line is kept as a
  data row; the two columns are then labelled 'token' and 'label'.

  file_path: path of the .csv file to read
  Returns: pandas dataframe with the columns 'token' and 'label'
  """
  column_names = ['token', 'label']
  frame = pd.read_csv(file_path, sep=',', header=None, engine='python')
  frame.columns = column_names
  return frame

def split_by_sentence(list_of_strings):
  """
  Groups a flat list of tokens into sentences.

  A sentence is closed after every string token that ends with '.'. The
  tokens of a sentence are converted with str() and joined by single spaces.
  Tokens left over after the last '.'-terminated token are returned as a
  final sentence instead of being dropped.

  list_of_strings: iterable of tokens (non-string tokens are allowed and
    never close a sentence)
  Returns: list of sentence strings; also prints how many there are
  """
  sentences = []
  current_sentence = []

  for word in list_of_strings:
    current_sentence.append(word)
    if isinstance(word, str) and word.endswith('.'):
      sentences.append(' '.join(map(str, current_sentence)))
      current_sentence = []

  # Flush whatever follows the last sentence-ending token so no input is lost
  if current_sentence:
    sentences.append(' '.join(map(str, current_sentence)))

  print("\nNumber of sentences: ", len(sentences))
  return sentences

def get_filtered_entities(df, target_label):
  """
  Counts the tokens that carry a given label, after discarding noise tokens.

  df (pandas dataframe): has two columns 'token' and 'label'
  target_label: the label value to select, e.g. 'B', 'I', or 'O'

  Filtering drops: non-string tokens (blank cells are read by pandas as NaN),
  empty strings, single letters, tokens made only of digits, and tokens made
  only of punctuation.

  Returns a collections.Counter mapping each remaining token to its frequency
  among the rows labelled 'target_label'.
  """
  # Imported locally so the function does not depend on names that the
  # notebook's import cell does not provide.
  import re
  import string
  from collections import Counter

  filtered_df = df[df['label'] == target_label]
  target_entities = filtered_df['token'].tolist() # list (with repeats) of the tokens with the target label

  # regex for filtering out nonsense strings
  punctuation = re.escape(string.punctuation)
  pattern = re.compile(rf'^(?![a-zA-Z]?$)(?!\d+$)(?!^[{punctuation}]+$).+')
  # NaN (float) tokens would make pattern.match raise TypeError, so skip non-strings
  target_entities = [
      ent for ent in target_entities
      if isinstance(ent, str) and pattern.match(ent)
  ]
  return Counter(target_entities)

In [None]:
# adjust dataset_name, num_shots, file_name_end
# NOTE(review): 'BC4CDR-D' below is probably a typo for 'BC5CDR-D' — confirm against the dataset directory names
dataset_name = "NCBI" # POSSIBILITIES: ['NCBI', 'JNLPBA', 'BC5CDR-C', 'BC4CDR-D', 'BC2GM']
num_shots = "few_shot" # POSSIBILITIES: ['zero_shot', 'one_shot', 'few_shot']
file_name_end = "-devel.csv"

# Read each file once and take both columns from the same dataframe,
# instead of parsing every file twice.
original_devel_df = load_tsv_dataset("llm-annotations/datasets/"+dataset_name+"/devel.tsv")
all_tokens = original_devel_df['token'].tolist()
all_labels = original_devel_df['label'].tolist()

llm_devel_df = load_csv_dataset("devel_gpt_generated_datasets/"+num_shots+"/"+dataset_name+file_name_end)
llm_devel_tokens = llm_devel_df['token'].tolist()
llm_devel_labels = llm_devel_df['label'].tolist()

In [None]:
# The dataset loaders return the literal token "null" as a float NaN,
# so every NaN token is turned back into the string "null" (in place).
for token_list in (all_tokens, llm_devel_tokens):
    for position, value in enumerate(token_list):
        is_nan = isinstance(value, float) and str(value) == "nan"
        if is_nan:
            token_list[position] = "null"

In [None]:
# Summary of the current configuration and of the two token lists
print(f"Dataset: {dataset_name}")
print(f"# shots: {num_shots}")
print(f"# tokens in original devel dataset: {len(all_tokens)}")
print(f"# tokens in LLM-generated dataset: {len(llm_devel_tokens)}")

Dataset: NCBI-disease
# shots: few_shot
# tokens in original devel dataset: 23959
# tokens in LLM-generated dataset: 23917


# Extract Entities
An entity is a sequence of tokens that starts with a token labeled “B” and extends through the last consecutive token labeled “I” that follows it (that is, the next token has any label other than “I”, or the dataset ends). For intrinsic evaluation, we collected all the unique entities in the original devel dataset and in the LLM-generated dataset.

In [None]:
# add unique entities in original devel dataset to a set
# An entity is a "B" token followed by every consecutive "I" token,
# with the tokens joined by single spaces.

entities_original = set()
i = 0
num_original_tokens = len(all_tokens)
while i < num_original_tokens:
    if all_labels[i] != "B":
        # "O" tokens, stray "I" tokens without a preceding "B", and any other label are skipped
        i += 1
        continue
    entity_tokens = [all_tokens[i]]
    i += 1
    # The bounds check matters when the dataset ends in the middle of an entity;
    # without it the label lookup runs past the end of the list.
    while i < num_original_tokens and all_labels[i] == "I":
        entity_tokens.append(all_tokens[i])
        i += 1
    entities_original.add(" ".join(entity_tokens))

print("# unique entities in original dataset: "+str(len(entities_original)))


# unique entities in original dataset: 363


In [None]:
# add unique entities in LLM-generated dataset to a set
# An entity is a "B" token followed by every consecutive "I" token,
# with the tokens joined by single spaces.
entities_llm = set()
i = 0
num_llm_tokens = len(llm_devel_tokens)
while i < num_llm_tokens:
    if llm_devel_labels[i] != "B":
        # "O" tokens, stray "I" tokens without a preceding "B", and any other label are skipped
        i += 1
        continue
    entity_tokens = [llm_devel_tokens[i]]
    i += 1
    # The bounds check matters when the dataset ends in the middle of an entity;
    # without it the label lookup runs past the end of the list.
    while i < num_llm_tokens and llm_devel_labels[i] == "I":
        entity_tokens.append(llm_devel_tokens[i])
        i += 1
    entities_llm.add(" ".join(entity_tokens))

print("# unique entities in LLM-generated dataset: "+str(len(entities_llm)))

# unique entities in LLM-generated dataset: 596


# Compute Label Accuracy

In [None]:
# Entities found by the LLM that do not appear in the original devel dataset
hallucinations = [ent for ent in entities_llm if ent not in entities_original]
num_hallucinations = len(hallucinations)
# Every other LLM entity also appears in the original devel dataset
num_correct = len(entities_llm) - num_hallucinations

print("""Correctly labeled entities are entities that were labeled as <TYPE> entities
in both the original devel dataset and LLM-generated datasets.""")
print(f"# correctly labeled entities: {num_correct}")
print("\n")
print("<TYPE> in the following print statements is a placeholder for entity type (i.e. chemical, disease, gene, protein)")
print("\n")
print("""Hallucinations are entities that were labeled as <TYPE> entities in the LLM-generated dataset,
but they weren't labeled as <TYPE> entities in the original devel dataset.""")
print(f"# hallucinations: {num_hallucinations}")

Correctly labeled entities are entities that were labeled as <TYPE> entities
in both the original devel dataset and LLM-generated datasets.
# correctly labeled entities: 159


<TYPE> in the following print statements is a placeholder for entity type (i.e. chemical, disease, gene, protein)


Hallucinations are entities that were labeled as <TYPE> entities in the LLM-generated dataset,
but they weren't labeled as <TYPE> entities in the original devel dataset.
# hallucinations: 437


In [None]:
# Entities of the original devel dataset that the LLM did not produce
false_neg = [ent for ent in entities_original if ent not in entities_llm]
num_false_neg = len(false_neg)

print("""False negatives are entities that were labeled as <TYPE> entities in the original devel dataset,
but not labeled as <TYPE> entities in the LLM-generated dataset.""")
print(f"# false negatives: {num_false_neg}")

False negatives are entities that were labeled as <TYPE> entities in the original devel dataset,
but not labeled as <TYPE> entities in the LLM-generated dataset.
# false negatives: 204
