# Overview
We measured GPT-3.5's F1, precision, and recall on approximate entity matches. We define an 'approximate match' as follows: any GPT-labeled entity that shares at least one token with a human-labeled entity is an approximate match. Intuitively, if the human-labeled dataset identified 'Ebola virus disease' as an entity and GPT-3.5 identified only 'Ebola' as the entity, then that would count as an approximate entity match.

# Env Setup

In [None]:
!pip install transformers seqeval[gpu]

Collecting seqeval[gpu]
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=25211bab75f6c04aa34ab8578f9d35f3d9276dbeb0e4ccb040181b2273e9e727
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import csv
import json
import pandas as pd
import numpy as np
import re
import string
from collections import Counter
import os
from google.colab import drive
import time
import ast
import random
from collections import defaultdict
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

In [None]:
# Mount Google Drive at /content/drive so the project folder can be reached from this runtime.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
cd drive/MyDrive/'6.8611 Research Project'/'Colab Notebooks'

/content/drive/.shortcut-targets-by-id/1vdEcgdXIfpnlORVlPsJtHUmKXSAqr69R/6.8611 Research Project/Colab Notebooks


In [None]:
ls

 BC5CDR-D_devel_1.csv                Intrinsic_exact_match.ipynb
 BC5CDR-D_devel_2.csv                Intrinsic_one.ipynb
 Data-cleaning.ipynb                 [0m[01;34mllm-annotations[0m/
 [01;34mdevel_gpt_generated_datasets[0m/      ' NER with BERT.ipynb'
 Fine-Tuning-Few-Shot.ipynb          openai-test.ipynb
 Fine-Tuning-Human-Annotated.ipynb   retry_prompts.gsheet
 Fine-Tuning-One-Shot.ipynb          RW-Fine-Tuning-Human-Annotated.ipynb
 Fine-Tuning-Zero-Shot.ipynb         [01;34msft_training_data[0m/
 GPT-Finetuning.ipynb                TEST_LABEL_BUGS.ipynb
 Intrinsic_approx_match.ipynb        tokens_labels.csv
 [01;34mintrinsic_data[0m/                     Untitled
 intrinsic_eval_entity.ipynb         zero-shot-bc5cdr-chem.pynb
 intrinsic_eval.ipynb               'zero_shot[FASTER].ipynb'
'Intrinsic Eval Precision.ipynb'     zero-shot.pynb


# Data Loading and Preprocessing

In [None]:
# SET PARAMS
# dataset: corpus name as it appears in the CSV file names under intrinsic_data/
#          (the evaluation loop at the end of the notebook uses 'NCBI', 'BC2GM',
#          'JNLPBA', 'BC5CDR-disease' and 'BC5CDR-chem').
# prompt_type: 'Z' (zero shot), 'O' (one shot) or 'F' (few shot).

dataset = 'NCBI'
prompt_type = 'Z'

In [None]:
# Path of the CSV for the selected dataset / prompt type, e.g. intrinsic_data/NCBI-ZS.csv
file_path = f'intrinsic_data/{dataset}-{prompt_type}S.csv'

In [None]:
# Load the selected CSV for ad-hoc inspection. The functions below expect 'label' and
# 'label_gpt' columns in these files; get_partial_report() re-reads the file itself.
data = pd.read_csv(file_path)

# Intrinsic Labels Approximate Match Evaluation

In [None]:
def relabel(df):
  """Collapse the 'label' and 'label_gpt' columns to bare B / I / O tags.

  A value that is already 'B', 'I' or 'O' is kept. Any other string is reduced to
  its first character when that character is B, I or O (e.g. 'B-Disease' -> 'B').
  Everything else -- other strings, empty strings, NaN or other non-string cells --
  becomes 'O'. Both columns are treated identically.

  Args:
    df: DataFrame with a 'label' column (presumably the human annotation) and a
      'label_gpt' column (the GPT annotation).

  Returns:
    The same DataFrame object; the two columns are rewritten in place.
  """
  valid_tags = {'B', 'I', 'O'}

  def normalize(tag):
    # Missing cells come back from pandas as float NaN, so check the type before
    # slicing; tag[:1] (rather than tag[0]) also tolerates an empty string.
    if isinstance(tag, str) and tag[:1] in valid_tags:
      return tag[0]
    return 'O'

  for column in ('label', 'label_gpt'):
    # Assigning a plain list is positional, so the frame's index never matters.
    df[column] = [normalize(tag) for tag in df[column]]
  return df


In [None]:
def partial_fix(df):
  """Credit GPT with every 'label' entity that it approximately matched.

  Entities are read from the 'label' column: a 'B' opens an entity, the 'I' rows
  that follow extend it, and any other tag (or another 'B') closes it. When
  'label_gpt' has a 'B' or 'I' on at least one row of an entity, the 'label_gpt'
  values on exactly that entity's rows are overwritten with the 'label' values,
  so the entity is later scored as an exact match.

  Args:
    df: DataFrame with B/I/O tags in the 'label' and 'label_gpt' columns.

  Returns:
    The same DataFrame object; 'label_gpt' is rewritten in place.
  """
  gold = df['label'].to_list()
  predicted = df['label_gpt'].to_list()

  # Inclusive (start, end) row positions of every entity in 'label'.
  entities = []
  start_idx = None
  for idx, label in enumerate(gold):
    if label == 'B':
      if start_idx is not None:
        entities.append((start_idx, idx - 1))
      start_idx = idx
    elif label != 'I' and start_idx is not None:
      entities.append((start_idx, idx - 1))
      start_idx = None
  if start_idx is not None:
    entities.append((start_idx, len(gold) - 1))

  for start, end in entities:
    span = slice(start, end + 1)
    if any(tag in ('B', 'I') for tag in predicted[span]):
      # Approximate match. Only the entity's own rows are copied: writing one row
      # further would erase a GPT prediction just after the entity and would make
      # an adjacent following entity count as matched automatically.
      predicted[span] = gold[span]

  # Positional write-back, so the result does not depend on the frame's index.
  df['label_gpt'] = predicted
  return df

In [None]:
types = {'Z': 'Zero Shot', 'O': 'One Shot', 'F': 'Few Shot'}

def get_partial_report(dataset, prompt_type):
  """Print the approximate-match seqeval report for one dataset / prompt type.

  Reads intrinsic_data/<dataset>-<prompt_type>S.csv, normalizes its tags with
  relabel(), applies partial_fix(), and prints seqeval's classification report
  between two separator lines.

  Args:
    dataset: dataset name as used in the CSV file name.
    prompt_type: a key of `types` ('Z', 'O' or 'F').

  Returns:
    None; the report is written to stdout.
  """
  rule = '-----------------------------------------'
  frame = pd.read_csv(f'intrinsic_data/{dataset}-{prompt_type}S.csv')
  frame = partial_fix(relabel(frame))

  # The whole file is passed to seqeval as one single tag sequence.
  gold_sequences = [frame['label'].to_list()]
  gpt_sequences = [frame['label_gpt'].to_list()]

  print(rule)
  print(f'{dataset} {types[prompt_type]}: ')
  print()
  print(classification_report(gold_sequences, gpt_sequences))
  print(rule)
  print()

In [None]:
# Print the approximate-match report for every dataset / prompt-type combination.
# NOTE: the loop variables rebind the module-level `dataset` and `prompt_type`
# that were set in the params cell earlier in the notebook.
datasets = ['NCBI', 'BC2GM', 'JNLPBA', 'BC5CDR-disease', 'BC5CDR-chem']
prompt_types = ['Z', 'O', 'F']
for dataset in datasets:
  for prompt_type in prompt_types:
    get_partial_report(dataset, prompt_type)

-----------------------------------------
NCBI Zero Shot: 

              precision    recall  f1-score   support

           _       0.37      0.76      0.49       787

   micro avg       0.37      0.76      0.49       787
   macro avg       0.37      0.76      0.49       787
weighted avg       0.37      0.76      0.49       787

-----------------------------------------

-----------------------------------------
NCBI One Shot: 

              precision    recall  f1-score   support

           _       0.37      0.89      0.52       787

   micro avg       0.37      0.89      0.52       787
   macro avg       0.37      0.89      0.52       787
weighted avg       0.37      0.89      0.52       787

-----------------------------------------

-----------------------------------------
NCBI Few Shot: 

              precision    recall  f1-score   support

           _       0.41      0.65      0.50       787

   micro avg       0.41      0.65      0.50       787
   macro avg       0.41   