# Notebook Overview
This notebook performs the following actions:
1. Load the devel split of a BLURB BioNER dataset and preprocess the data before passing it to the OpenAI API
2. Generate GPT-3.5 zero, one, and few-shot labels for these datasets. Use multithreading to parallelize the API calls
3. Provide space for post-processing of the labeled dataset.



# Env Setup

In [None]:
pip install openai

Collecting openai
  Downloading openai-1.3.6-py3-none-any.whl (220 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m220.9/220.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.25.2-py3-none-any.whl (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.2-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: h11, httpcore, httpx, openai
[31mERROR: pip's dependency resolver does not

In [None]:
pip install nest_asyncio



In [None]:
import csv
import json
import pandas as pd
import numpy as np
import re
import string
from collections import Counter
import openai
import os
from google.colab import drive
import time
import ast

In [None]:
# Mount Google Drive at /content/drive (Colab only) so the project files can be read and written.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/MyDrive/'6.8611 Research Project'/'Colab Notebooks'

/content/drive/MyDrive/6.8611 Research Project/Colab Notebooks


In [None]:
ls

 BC5CDR-D_devel_1.csv            [0m[01;34mllm-annotations[0m/                       tokens_labels.csv
 BC5CDR-D_devel_2.csv           ' NER with BERT.ipynb'                  zero-shot-bc5cdr-chem.pynb
 Data-cleaning.ipynb             openai-test.ipynb                     'zero_shot[FASTER].ipynb'
 [01;34mdevel_gpt_generated_datasets[0m/   retry_prompts.gsheet                   zero-shot.pynb
 intrinsic_eval.ipynb            RW-Fine-Tuning-Human-Annotated.ipynb


# Set Up OpenAI Client

In [None]:
# Placeholder: fill in your key locally before running, and never save/commit the notebook with a real key in it.
os.environ['OPENAI_API_KEY'] = "" #NEED TO SET

In [None]:
# Read the key back from the environment. The placeholder cell assigns an empty
# string, so a plain `is None` check would never fire; treat an empty value the
# same as a missing one and fail early with a clear message.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("Please set the OPENAI_API_KEY environment variable.")

In [None]:
# Shared OpenAI client used by every chat-completion request made in this notebook.
client = openai.OpenAI(api_key=api_key)

# Data Preprocessing Helpers

In [None]:
# load the datasets into dataframes

def load_tsv_dataset(file_path):
  """
  Read a tab-separated file into a dataframe with columns 'token' and 'label'.

  The file is parsed with header=None, so its first line is treated as data
  rather than as a header; the two columns are then named 'token' and 'label'.
  The first rows are printed as a quick sanity check.

  file_path (str): path to the .tsv file
  Returns the loaded pandas dataframe.
  """
  frame = pd.read_csv(file_path, sep='\t', header=None, engine='python')
  frame.columns = ['token', 'label']
  print(frame.head())
  return frame


In [None]:
# load the datasets into dataframes

def load_csv_dataset(file_path):
  """
  Read a comma-separated file into a dataframe with columns 'token' and 'label'.

  The file is parsed with header=None, so its first line is treated as data
  rather than as a header (a header line in the file therefore shows up as the
  first data row); the two columns are then named 'token' and 'label'.
  The first rows are printed as a quick sanity check.

  file_path (str): path to the .csv file
  Returns the loaded pandas dataframe.
  """
  frame = pd.read_csv(file_path, header=None, engine='python')
  frame.columns = ['token', 'label']
  print(frame.head())
  return frame


In [None]:
def split_by_sentence(list_of_strings):
  """
  Group a flat list of tokens into sentence strings.

  A sentence is closed after any str token that ends with '.', which means a
  token such as '3.' also ends a sentence. Tokens are joined with single
  spaces and non-string tokens are converted with str(). Whatever follows the
  last sentence-ending token is always appended as a final element; that
  element is the empty string when nothing is left over.
  """
  sentences = []
  pending = []

  for token in list_of_strings:
    pending.append(str(token))
    ends_sentence = type(token) is str and token.endswith('.')
    if ends_sentence:
      sentences.append(' '.join(pending))
      pending = []

  # Always flush the remainder, even when it is empty.
  sentences.append(' '.join(pending))
  return sentences

In [None]:
def get_filtered_entities(df, target_label):
  """
  Count the meaningful tokens that carry a given label.

  df (pandas dataframe): has two columns 'token' and 'label'
  target_label: 'B', 'I', or 'O'

  Filtering involves: removing blanks, and filtering out entities that consist
  only of punctuation, numbers, or single letters. Tokens that are not strings
  (for example NaN values produced when pandas parses an empty or 'null' cell)
  are skipped instead of raising a TypeError in the regex match.

  Return a Counter with the frequency of all filtered entities with label
  'target_label'.
  """
  # list (with repeats) of all the tokens carrying the target label
  target_entities = df.loc[df['label'] == target_label, 'token'].tolist()

  # regex for filtering out nonsense strings: rejects empty strings and single
  # letters, all-digit strings, and strings made only of punctuation
  punctuation = re.escape(string.punctuation)
  pattern = re.compile(rf'^(?![a-zA-Z]?$)(?!\d+$)(?!^[{punctuation}]+$).+')
  kept_entities = [
      ent for ent in target_entities
      if isinstance(ent, str) and pattern.match(ent)
  ]
  return Counter(kept_entities)

In [None]:
def load_dfs(dataset):
  """
  Load the gold devel split and the three GPT-labeled versions of a dataset.

  dataset (str): dataset name used to build the file paths
  Reads the zero-, one- and few-shot CSVs from devel_gpt_generated_datasets/
  and the devel TSV from llm-annotations/datasets/, prints the row count of
  each, and returns (devel_df, zero_shot_df, one_shot_df, few_shot_df).
  """
  print('\nFor ', dataset, ": \n")

  generated_root = 'devel_gpt_generated_datasets'
  shot_frames = {}
  for shot in ('zero_shot', 'one_shot', 'few_shot'):
    shot_frames[shot] = load_csv_dataset(f'{generated_root}/{shot}/{dataset}-devel.csv')
  devel_df = load_tsv_dataset(f'llm-annotations/datasets/{dataset}/devel.tsv')

  zero_shot_df = shot_frames['zero_shot']
  one_shot_df = shot_frames['one_shot']
  few_shot_df = shot_frames['few_shot']
  print('Zero Shot Length: ', len(zero_shot_df), '\nOne Shot Length: ', len(one_shot_df), '\nFew Shot Length: ', len(few_shot_df), '\nTrue Length: ', len(devel_df))

  return devel_df, zero_shot_df, one_shot_df, few_shot_df

# Pre-Process Data

In [None]:
# Name of the dataset to label; it is interpolated into every input and output path below.
dataset = "" #NEED TO SET. All possible values are: ['NCBI', 'JNLPBA', 'BC5CDR-chem', 'BC5CDR-disease', 'BC2GM']

##CHANGE BASED ON WHICH SHOT YOU'RE RUNNING
# CSV file that the labeling cells below write the generated token/label rows to.
file_path = f'devel_gpt_generated_datasets/few_shot/{dataset}-devel.csv'

In [None]:
# devel split: its tokens are chunked below and sent to the API for labeling
devel = f'llm-annotations/datasets/{dataset}/devel.tsv'
devel_df = load_tsv_dataset(devel)

# test split: its chunks and gold labels are used as the worked examples in the one/few-shot prompts
test = f'llm-annotations/datasets/{dataset}/test.tsv'
test_df = load_tsv_dataset(test)

          token label
0           Our     O
1          data     O
2       suggest     O
3          that     O
4  lipoxygenase     B
            token label
0          Number     O
1              of     O
2  glucocorticoid     B
3       receptors     I
4              in     O


In [None]:
def get_chunks(all_tokens, CHUNK_SIZE = 300):
  """
  Group tokens into whitespace-joined text chunks made of whole sentences.

  all_tokens (list): tokens in document order; float NaN entries are rendered
    as the string "null". The input list itself is not modified.
  CHUNK_SIZE (int): a chunk is closed as soon as the summed character length
    of its sentences reaches this value (joining spaces are not counted).

  Returns a list of chunk strings. Empty sentences and an empty trailing chunk
  are dropped, so every returned chunk contains at least one token.
  """
  # Work on a copy so the caller's list is left untouched.
  cleaned_tokens = [
      "null" if isinstance(token, float) and str(token) == "nan" else token
      for token in all_tokens
  ]

  sentences = split_by_sentence(cleaned_tokens)

  SENTENCE_CHUNKS = []

  curr_chunk, curr_chunk_len = [], 0
  for sent in sentences:
    if not sent:
      # split_by_sentence emits an empty final element when the text ends on a sentence boundary
      continue
    curr_chunk.append(sent)
    curr_chunk_len += len(sent)
    if curr_chunk_len >= CHUNK_SIZE:
      SENTENCE_CHUNKS.append(' '.join(curr_chunk))
      curr_chunk = []
      curr_chunk_len = 0

  # Flush the remainder only when there is one, to avoid an empty chunk (and an empty API prompt).
  if curr_chunk:
    SENTENCE_CHUNKS.append(' '.join(curr_chunk))
  return SENTENCE_CHUNKS


# Character length at which a chunk is closed. It is passed explicitly so that
# changing it here actually changes the chunking.
CHUNK_SIZE = 300

all_tokens = devel_df['token'].tolist()
SENTENCE_CHUNKS = get_chunks(all_tokens, CHUNK_SIZE)

print('NUM CHUNKS', len(SENTENCE_CHUNKS))

all_test_tokens = test_df['token'].tolist()
TEST_SENTENCE_CHUNKS = get_chunks(all_test_tokens, CHUNK_SIZE)

print('NUM TEST CHUNKS', len(TEST_SENTENCE_CHUNKS))

NUM CHUNKS 1628
NUM TEST CHUNKS 1623


In [None]:
def build_chunk_labels(df, chunks):
  """
  Pair every chunk with the gold labels of the tokens it contains.

  df (pandas dataframe): has columns 'token' and 'label', in document order
  chunks (list of str): whitespace-joined chunks covering df's tokens in order

  Returns one dict per chunk, in chunk order, mapping token + '_' + (index of
  the token inside the chunk) to its label. Float NaN tokens are written as
  "null", matching how get_chunks renders them in the chunk text.
  """
  tokens = df['token'].tolist()
  labels = df['label'].tolist()

  chunk_labels = []
  start = 0
  # A single pass over the chunks: each chunk consumes as many rows as it has
  # whitespace-separated tokens.
  for chunk in chunks:
    num_tokens = len(chunk.split())
    keys = []
    for offset, token in enumerate(tokens[start:start+num_tokens]):
      if isinstance(token, float) and str(token) == "nan":
        token = "null"
      keys.append(str(token) + '_' + str(offset))
    chunk_labels.append(dict(zip(keys, labels[start:start+num_tokens])))
    start += num_tokens

  if start != len(tokens):
    # Surface misalignment instead of silently looping over the chunks again.
    print(f'WARNING: chunks cover {start} tokens but the dataframe has {len(tokens)} rows; labels may be misaligned.')
  return chunk_labels


n = len(test_df)

TEST_SENTENCE_LABELS = build_chunk_labels(test_df, TEST_SENTENCE_CHUNKS)

# Zero, One, or Few Shot

## Zero Shot

In [None]:
text = SENTENCE_CHUNKS[0] #text is the chunk that is annotated by the single test API call in the "Test Whichever Shot" section

# Zero-shot prompt: task instruction only, no labeled examples.
# remember to change <entity type> based on dataset in "focusing on identifying <entity type> entities" of INSTRUCTION
INSTRUCTION = f"""Given a biomedical text, perform Named Entity Recognition analysis on this text, focusing on identifying entities that are a protein, DNA, RNA, cell line, or cell type entity. Individual tokens are separated by white space and must be given exactly one label: ‘B’,  ‘I’, or ‘O’, according to the BIO labeling schema. Output a JSON object with key equal to token concatenated with '_'+(index of token) and value equal to label, where (index of token) starts from 0 for the first token, 1 for the second token, and so on.
Text: """

# The chunk to label goes on the final line of the prompt.
prompt = f"""{INSTRUCTION}\n{text}"""
print(prompt)

## (OLD & BUGGY - 11/30) Find Optimal Test Sentence

In [None]:
def label_ratio_of_chunks(list_chunks_labels):
  """
  Finds the ratio of labels "B", "I", "O" in each chunk and in total

  @param list_chunks_labels (List<Dict<Str, 'B'|'I'|'O'>>) List of chunks stored as dictionaries w/ token keys and label values
  @returns Dictionary w/ keys of 'B', 'I', 'O' and values of their appear ratio across all chunks
           & a list of each chunk w/ values of dictionaries storing each chunks' label ratio
  @raises KeyError if a chunk contains a label other than 'B', 'I' or 'O'

  An empty chunk gets all-zero ratios, and so does the overall result when
  there are no labels at all (instead of raising ZeroDivisionError).
  """
  label_types = ['B', 'I', 'O']

  #To find as we go through each chunk
  total_labels = 0
  total_label_counts = {label_type: 0 for label_type in label_types}
  list_chunk_label_ratios = []

  for chunk in list_chunks_labels:
    chunk_labels = list(chunk.values())
    curr_chunk_label_counts = {label_type: 0 for label_type in label_types}

    #Sum individual chunks' label counts & Add them to overall count
    for label in chunk_labels:
      curr_chunk_label_counts[label] += 1
      total_label_counts[label] += 1

    #Find individual chunks' label ratios (max(1, ...) guards empty chunks)
    chunk_size = max(1, len(chunk_labels))
    list_chunk_label_ratios.append(
        {label_type: curr_chunk_label_counts[label_type] / chunk_size for label_type in label_types}
    )

    total_labels += len(chunk_labels)

  #Same guard as for single chunks, so empty input does not divide by zero
  total_size = max(1, total_labels)
  total_label_ratios = {
      label_type: total_label_counts[label_type] / total_size for label_type in label_types
  }

  return total_label_ratios, list_chunk_label_ratios

In [None]:
def most_representative_chunk(list_chunks_labels):
  """
  Finds the chunk with the label ratio closest to the label ratio of all chunks combined

  Closeness is the Euclidean distance between the chunk's (B, I, O) ratio
  vector and the overall ratio vector; on ties the later chunk wins.

  @param list_chunks_labels (List<Dict<Str, 'B'|'I'|'O'>>) List of chunks stored as dictionaries w/ token keys and label values
  @returns int index of chunk in list_chunks_labels with the label ratio closest to all chunks combined
           (-1 if there are no chunks)
  """

  total_r, list_chunk_rs = label_ratio_of_chunks(list_chunks_labels)
  true_r = np.array(list(total_r.values()))
  print(true_r)

  min_dist = 1000000
  best_chunk = -1

  for i, chunk_r in enumerate(list_chunk_rs):
    test_r = np.array(list(chunk_r.values()))

    squared_dist = np.sum((true_r - test_r)**2, axis=0)
    dist = np.sqrt(squared_dist)

    if dist <= min_dist:
      print(min_dist, dist, i)
      min_dist = dist
      best_chunk = i

  # Return the tracked best index, not the loop variable (which is always the last chunk).
  return best_chunk

In [None]:
def most_entities_chunk(list_chunks_labels):
  """
  Finds the chunk with the highest ratio of entities

  The entity ratio of a chunk is its share of 'B' labels; on ties the earliest
  chunk wins.

  @param list_chunks_labels (List<Dict<Str, 'B'|'I'|'O'>>) List of chunks stored as dictionaries w/ token keys and label values
  @returns int index of chunk in list_chunks_labels with the highest ratio of entities
           (-1 if there are no chunks)
  """

  total_r, list_chunk_rs = label_ratio_of_chunks(list_chunks_labels)

  max_B = -1
  best_chunk = -1

  for i, chunk_r in enumerate(list_chunk_rs):
    chunk_Bs = chunk_r['B']

    if chunk_Bs > max_B:
      print(max_B, chunk_Bs, i)
      max_B = chunk_Bs
      best_chunk = i

  # Return the tracked best index, not the loop variable (which is always the last chunk).
  return best_chunk

In [None]:
def longest_entities_chunks(list_chunks_labels):
  """
  Finds the chunk with the highest ratio of 'I' overall and chunk w/ the highest ratio of 'I' relative to its ratio of 'B'

  Chunks without any 'B' label are skipped.

  @param list_chunks_labels (List<Dict<Str, 'B'|'I'|'O'>>) List of chunks stored as dictionaries w/ token keys and label values
  @returns tuple (I_chunk, IB_chunk): index of the chunk with the highest 'I' ratio and
           index of the chunk with the highest 'I'/'B' ratio (-1 for either if no chunk qualifies)
  """

  total_r, list_chunk_rs = label_ratio_of_chunks(list_chunks_labels)

  max_I = -1
  max_IB = -1
  I_chunk = -1
  IB_chunk = -1

  for i, chunk_r in enumerate(list_chunk_rs):
    if chunk_r['B'] == 0:
      continue

    chunk_Is = chunk_r['I']
    chunk_IBs = chunk_Is / chunk_r['B']

    # Compare against the best 'I' ratio so far (not the best 'I'/'B' ratio).
    if chunk_Is > max_I:
      # Print from the argument rather than a notebook global so any list can be passed in.
      print(max_I, chunk_Is, i, list_chunks_labels[i])
      max_I = chunk_Is
      I_chunk = i

    if chunk_IBs > max_IB:
      print(max_IB, chunk_IBs, i, list_chunks_labels[i])
      max_IB = chunk_IBs
      IB_chunk = i

  return I_chunk, IB_chunk

In [None]:
# Candidate example chunks picked by three heuristics. longest_entities_chunks
# returns a pair of indices, so the printed list holds one tuple followed by two ints.
possible_test_sentences = [longest_entities_chunks(TEST_SENTENCE_LABELS),
                           most_entities_chunk(TEST_SENTENCE_LABELS),
                           most_representative_chunk(TEST_SENTENCE_LABELS)]
print(possible_test_sentences)

-1 0.045454545454545456 0 {'Number_0': 'O', 'of_1': 'O', 'glucocorticoid_2': 'B', 'receptors_3': 'I', 'in_4': 'O', 'lymphocytes_5': 'O', 'and_6': 'O', 'their_7': 'O', 'sensitivity_8': 'O', 'to_9': 'O', 'hormone_10': 'O', 'action_11': 'O', '._12': 'O', 'The_13': 'O', 'study_14': 'O', 'demonstrated_15': 'O', 'a_16': 'O', 'decreased_17': 'O', 'level_18': 'O', 'of_19': 'O', 'glucocorticoid_20': 'B', 'receptors_21': 'I', '(_22': 'O', 'GR_23': 'B', ')_24': 'O', 'in_25': 'O', 'peripheral_26': 'O', 'blood_27': 'O', 'lymphocytes_28': 'O', 'from_29': 'O', 'hypercholesterolemic_30': 'O', 'subjects_31': 'O', ',_32': 'O', 'and_33': 'O', 'an_34': 'O', 'elevated_35': 'O', 'level_36': 'O', 'in_37': 'O', 'patients_38': 'O', 'with_39': 'O', 'acute_40': 'O', 'myocardial_41': 'O', 'infarction_42': 'O', '._43': 'O'}
-1 0.6666666666666667 0 {'Number_0': 'O', 'of_1': 'O', 'glucocorticoid_2': 'B', 'receptors_3': 'I', 'in_4': 'O', 'lymphocytes_5': 'O', 'and_6': 'O', 'their_7': 'O', 'sensitivity_8': 'O', 'to_9'

(0, 1616)

### **NOTE: Play around until you find three test sentences you think are strong, but different.**

In [None]:
# Indices into TEST_SENTENCE_CHUNKS / TEST_SENTENCE_LABELS of the chunks used as labeled examples in the prompts.
example_indices = [50, 10, 20] #one shot will just take the 0th index of this

In [None]:
i = 0 # index of the chunk to inspect; change it to look at another candidate

print(TEST_SENTENCE_CHUNKS[i])
for x,y in TEST_SENTENCE_LABELS[i].items():
  # Keys are token + '_' + index: strip only the trailing index so tokens that contain '_' stay whole.
  print(x.rsplit('_', 1)[0],y)

Furthermore , supernatant from Dx - treated CTL contained a nondialyzable factor which inhibited DNA synthesis and cell growth of CTL clones induced by IL 2 . Blocking of IL 2 synthesis and IL 2 receptor formation have been proposed as one of the major mechanisms of glucocorticoid - induced immunosuppression .

Furthermore O
, O
supernatant O
from O
Dx O
- O
treated O
CTL O
contained O
a O
nondialyzable O
factor O
which O
inhibited O
DNA O
synthesis O
and O
cell O
growth O
of O
CTL O
clones O
induced O
by O
IL B
2 I
. O
Blocking O
of O
IL B
2 I
synthesis O
and O
IL B
2 I
receptor I
formation O
have O
been O
proposed O
as O
one O
of O
the O
major O
mechanisms O
of O
glucocorticoid O
- O
induced O
immunosuppression O
. O


## (NEW & IMPROVED - 12/2) Find Optimal Test Sentence

My empirical experience w/ few-shot JNLPBA is that the top 3 most representative chunks achieve the best results. When using the other options at all (chunks w/ the most entities or the longest entities), GPT became much more liberal w/ its labeling. However, JNLPBA is one of the harder datasets because it has more than one entity type and GPT has to look out for any of them, so definitely still experiment.

In [None]:
def label_ratio_of_chunks(list_chunks_labels):
  """
  Finds the ratio of labels "B", "I", "O" in each chunk and in total

  @param list_chunks_labels (List<Dict<Str, 'B'|'I'|'O'>>) List of chunks stored as dictionaries w/ token keys and label values
  @returns Dictionary w/ keys of 'B', 'I', 'O' and values of their appear ratio across all chunks
           & a list of each chunk w/ values of dictionaries storing each chunks' label ratio
  @raises KeyError if a chunk contains a label other than 'B', 'I' or 'O'

  An empty chunk gets all-zero ratios, and so does the overall result when
  there are no labels at all (instead of raising ZeroDivisionError).
  """
  label_types = ['B', 'I', 'O']

  #To find as we go through each chunk
  total_labels = 0
  total_label_counts = {label_type: 0 for label_type in label_types}
  list_chunk_label_ratios = []

  for chunk in list_chunks_labels:
    chunk_labels = list(chunk.values())
    curr_chunk_label_counts = {label_type: 0 for label_type in label_types}

    #Sum individual chunks' label counts & Add them to overall count
    for label in chunk_labels:
      curr_chunk_label_counts[label] += 1
      total_label_counts[label] += 1

    #Find individual chunks' label ratios (max(1, ...) guards empty chunks)
    chunk_size = max(1, len(chunk_labels))
    list_chunk_label_ratios.append(
        {label_type: curr_chunk_label_counts[label_type] / chunk_size for label_type in label_types}
    )

    total_labels += len(chunk_labels)

  #Same guard as for single chunks, so empty input does not divide by zero
  total_size = max(1, total_labels)
  total_label_ratios = {
      label_type: total_label_counts[label_type] / total_size for label_type in label_types
  }

  return total_label_ratios, list_chunk_label_ratios

In [None]:
def most_representative_chunk(list_chunks_labels):
  """
  Pick the chunk whose label mix is closest to the label mix of all chunks together.

  Closeness is the Euclidean distance between a chunk's ratio vector and the
  overall ratio vector. Each new best is printed as it is found, and so is any
  chunk that misses the current best but is within the previous best distance.
  On ties the later chunk wins.

  @param list_chunks_labels (List<Dict<Str, 'B'|'I'|'O'>>) List of chunks stored as dictionaries w/ token keys and label values
  @returns int index of the closest chunk in list_chunks_labels (-1 if there are no chunks)
  """
  print('FINDING MOST REPRESENTATIVE CHUNK')

  overall_ratios, per_chunk_ratios = label_ratio_of_chunks(list_chunks_labels)
  target = np.array(list(overall_ratios.values()))

  best_dist = 1000000
  previous_best = 1000000
  best_idx = -1

  for idx, ratios in enumerate(per_chunk_ratios):
    candidate = np.array(list(ratios.values()))
    diff = target - candidate
    dist = np.sqrt(np.sum(diff**2, axis=0))

    if not (dist <= best_dist):
      # Not a new best: only report it when it is within the previous best distance.
      if dist <= previous_best:
        print('OLD MIN BEATEN:', (dist, idx, previous_best))
      continue

    previous_best = best_dist
    print(best_dist, dist, idx)
    best_dist = dist
    best_idx = idx

  return best_idx

In [None]:
def most_entities_chunk(list_chunks_labels):
  """
  Pick the chunk with the largest share of 'B' labels.

  Each new maximum is printed as it is found; on ties the earliest chunk wins.

  @param list_chunks_labels (List<Dict<Str, 'B'|'I'|'O'>>) List of chunks stored as dictionaries w/ token keys and label values
  @returns int index of the chunk in list_chunks_labels with the highest 'B' ratio (-1 if there are no chunks)
  """
  print('FINDING MOST ENTITIES')

  _, per_chunk_ratios = label_ratio_of_chunks(list_chunks_labels)

  top_ratio, top_idx = -1, -1
  for idx, ratios in enumerate(per_chunk_ratios):
    b_ratio = ratios['B']
    if not (b_ratio > top_ratio):
      continue
    print(top_ratio, b_ratio, idx)
    top_ratio, top_idx = b_ratio, idx

  return top_idx

In [None]:
def longest_entities_chunks(list_chunks_labels):
  """
  Finds the chunk with the highest ratio of 'I' overall and chunk w/ the highest ratio of 'I' relative to its ratio of 'B'

  Chunks without any 'B' label are skipped.

  @param list_chunks_labels (List<Dict<Str, 'B'|'I'|'O'>>) List of chunks stored as dictionaries w/ token keys and label values
  @returns tuple (I_chunk, IB_chunk): index of the chunk with the highest 'I' ratio and
           index of the chunk with the highest 'I'/'B' ratio (-1 for either if no chunk qualifies)
  """
  print('FINDING LONGEST ENTITY')

  total_r, list_chunk_rs = label_ratio_of_chunks(list_chunks_labels)

  max_I = -1
  max_IB = -1
  I_chunk = -1
  IB_chunk = -1

  for i, chunk_r in enumerate(list_chunk_rs):
    if chunk_r['B'] == 0:
      continue

    chunk_Is = chunk_r['I']
    chunk_IBs = chunk_Is / chunk_r['B']

    # Print from the argument rather than a notebook global, so the function
    # reports the right chunk for whatever list it is given.
    if chunk_Is > max_I:
      print('NEW LONGEST ENTITY OVERALL:', max_I, chunk_Is, i, list_chunks_labels[i])
      max_I = chunk_Is
      I_chunk = i

    if chunk_IBs > max_IB:
      print('NEW LONGEST ENTITY/PER ENTITY:', max_IB, chunk_IBs, i, list_chunks_labels[i])
      max_IB = chunk_IBs
      IB_chunk = i

  return I_chunk, IB_chunk

In [None]:
# Candidate example chunks picked by three heuristics. longest_entities_chunks
# returns a pair of indices, so the printed list holds one tuple followed by two ints.
possible_test_sentences = [longest_entities_chunks(TEST_SENTENCE_LABELS),
                           most_entities_chunk(TEST_SENTENCE_LABELS),
                           most_representative_chunk(TEST_SENTENCE_LABELS)]
print(possible_test_sentences)

FINDING LONGEST ENTITY
NEW LONGEST ENTITY OVERALL: -1 0.045454545454545456 0 {'Number_0': 'O', 'of_1': 'O', 'glucocorticoid_2': 'B', 'receptors_3': 'I', 'in_4': 'O', 'lymphocytes_5': 'O', 'and_6': 'O', 'their_7': 'O', 'sensitivity_8': 'O', 'to_9': 'O', 'hormone_10': 'O', 'action_11': 'O', '._12': 'O', 'The_13': 'O', 'study_14': 'O', 'demonstrated_15': 'O', 'a_16': 'O', 'decreased_17': 'O', 'level_18': 'O', 'of_19': 'O', 'glucocorticoid_20': 'B', 'receptors_21': 'I', '(_22': 'O', 'GR_23': 'B', ')_24': 'O', 'in_25': 'O', 'peripheral_26': 'O', 'blood_27': 'O', 'lymphocytes_28': 'O', 'from_29': 'O', 'hypercholesterolemic_30': 'O', 'subjects_31': 'O', ',_32': 'O', 'and_33': 'O', 'an_34': 'O', 'elevated_35': 'O', 'level_36': 'O', 'in_37': 'O', 'patients_38': 'O', 'with_39': 'O', 'acute_40': 'O', 'myocardial_41': 'O', 'infarction_42': 'O', '._43': 'O'}
NEW LONGEST ENTITY/PER ENTITY: -1 0.6666666666666667 0 {'Number_0': 'O', 'of_1': 'O', 'glucocorticoid_2': 'B', 'receptors_3': 'I', 'in_4': 'O'

### **NOTE: Play around until you find three test sentences you think are strong, but different.**

In [None]:
# Indices into TEST_SENTENCE_CHUNKS / TEST_SENTENCE_LABELS of the chunks used as labeled examples in the prompts.
example_indices = [576, 524, 610] #one shot will just take the 0th index of this

In [None]:
i = 0 # index of the chunk to inspect; change it to look at another candidate

print(TEST_SENTENCE_CHUNKS[i])
for x,y in TEST_SENTENCE_LABELS[i].items():
  # Keys are token + '_' + index: strip only the trailing index so tokens that contain '_' stay whole.
  print(x.rsplit('_', 1)[0],y)

Number of glucocorticoid receptors in lymphocytes and their sensitivity to hormone action . The study demonstrated a decreased level of glucocorticoid receptors ( GR ) in peripheral blood lymphocytes from hypercholesterolemic subjects , and an elevated level in patients with acute myocardial infarction .
Number O
of O
glucocorticoid B
receptors I
in O
lymphocytes O
and O
their O
sensitivity O
to O
hormone O
action O
. O
The O
study O
demonstrated O
a O
decreased O
level O
of O
glucocorticoid B
receptors I
( O
GR B
) O
in O
peripheral O
blood O
lymphocytes O
from O
hypercholesterolemic O
subjects O
, O
and O
an O
elevated O
level O
in O
patients O
with O
acute O
myocardial O
infarction O
. O


## One Shot

In [None]:
text = SENTENCE_CHUNKS[0] #text is the chunk that is annotated by the single test API call in the "Test Whichever Shot" section

# One-shot prompt: task instruction plus one labeled example taken from the test split (example_indices[0]).
# remember to change <entity type> based on dataset in "focusing on identifying <entity type> entities" of INSTRUCTION
INSTRUCTION = f"""Given a biomedical text, perform Named Entity Recognition analysis on this text, and identify ENTITIES THAT ARE ONE OF THE ENTITIES IN THE SET ('protein', 'DNA', 'RNA', 'cell line', 'cell type'). Individual tokens are separated by white space and must be given exactly one label: ‘B’,  ‘I’, or ‘O’, according to the BIO labeling schema. Output a JSON object with key equal to token concatenated with '_'+(index of token) and value equal to label, where (index of token) starts from 0 for the first token, 1 for the second token, and so on. Here is a labeled example to help with this task:

Example Text:
{TEST_SENTENCE_CHUNKS[example_indices[0]]}

Example Response:
{TEST_SENTENCE_LABELS[example_indices[0]]}

Now label the following text: """

# The chunk to label goes on the final line of the prompt.
prompt = f"""{INSTRUCTION}\n{text}"""
print(prompt)

## Few Shot

In [None]:
text = SENTENCE_CHUNKS[0] #text is the chunk that is annotated by the single test API call in the "Test Whichever Shot" section

# Few-shot prompt: task instruction plus three labeled examples taken from the test split (example_indices[0..2]).
#SET TO ENTITY TYPE OF YOUR DATASET
# remember to change <entity type> based on dataset in "focusing on identifying <entity type> entities" of INSTRUCTION
INSTRUCTION = f"""Given a biomedical text, perform Named Entity Recognition analysis on this text, focusing on identifying ONLY GENE ENTITIES. Individual tokens are separated by white space and must be given exactly one label: ‘B’,  ‘I’, or ‘O’, according to the BIO labeling schema. Output a JSON object with key equal to token concatenated with '_'+(index of token) and value equal to label, where (index of token) starts from 0 for the first token, 1 for the second token, and so on. Here is are a few examples of succuessful response labels for biomedical texts:

Example Text 1:
{TEST_SENTENCE_CHUNKS[example_indices[0]]}

Example Response 1:
{TEST_SENTENCE_LABELS[example_indices[0]]}

Example Text 2:
{TEST_SENTENCE_CHUNKS[example_indices[1]]}

Example Response 2:
{TEST_SENTENCE_LABELS[example_indices[1]]}

Example Text 3:
{TEST_SENTENCE_CHUNKS[example_indices[2]]}

Example Response 3:
{TEST_SENTENCE_LABELS[example_indices[2]]}

Now, label the following text: """

# The chunk to label goes on the final line of the prompt.
prompt = f"""{INSTRUCTION}\n{text}"""
print(prompt)

Given a biomedical text, perform Named Entity Recognition analysis on this text, focusing on identifying ONLY GENE ENTITIES. Individual tokens are separated by white space and must be given exactly one label: ‘B’,  ‘I’, or ‘O’, according to the BIO labeling schema. Output a JSON object with key equal to token concatenated with '_'+(index of token) and value equal to label, where (index of token) starts from 0 for the first token, 1 for the second token, and so on. Here is are a few examples of succuessful response labels for biomedical texts:

Example Text 1:
DNA elements recognizing NF - Y and Sp1 regulate the human multidrug - resistance gene promoter . Radioimmunoassay of plasma gonadotropins ; problems of specificity Patients randomized into the active treatment groups A30 ( n = 49 ) and A60 ( n = 48 ) received topical treatment with 3 . 0 % diclofenac in 2 .

Example Response 1:
{'DNA_0': 'O', 'elements_1': 'O', 'recognizing_2': 'O', 'NF_3': 'B', '-_4': 'I', 'Y_5': 'I', 'and_6': '

# Test Whichever Shot

In [None]:
# Single timed request for the first chunk, using whichever `prompt` was built last above.
start = time.time()

response = client.chat.completions.create(
  model="gpt-3.5-turbo-1106",
  response_format={ "type": "json_object" },
  messages=[
    {"role": "system", "content": "You are an expert named entity recognition of genes in the biomedical domain."}, #SET TO ENTITY TYPE OF YOUR DATASET
    {"role": "user", "content": prompt}
  ],
  temperature=0
)

print(f'{time.time() - start} seconds taken ')

11.798084259033203 seconds taken 


In [None]:
response_content = response.choices[0].message.content
print(response_content)

# The request asks for response_format json_object, so parse the reply as JSON.
# (ast.literal_eval only understands Python literals and fails on JSON values
# such as true/false/null.)
token_labels = json.loads(response_content)
print(token_labels)

# Note Lengths aren't guaranteed to be equal.
# There are some issues with API output (see last text cell at bottom of this file)
print(len(token_labels))
# print(len(prompt.split('\n')[2].split()))

{
  "Joys_0": "O",
  "and_1": "O",
  "F_2": "O",
  "._3": "O",
  "2_4": "O",
  "cases_5": "O",
  "of_6": "O",
  "type_7": "O",
  "II_8": "O",
  "tyrosinosis_9": "O",
  "(_10": "O",
  "Richner_11": "B",
  "-_12": "I",
  "Hanhart_13": "I",
  "syndrome_14": "I",
  ")_15": "O",
  "Following_16": "O",
  "conditioning_17": "O",
  ",_18": "O",
  "a_19": "O",
  "single_20": "O",
  "coat_21": "O",
  "of_22": "O",
  "adhesive_23": "O",
  "was_24": "O",
  "applied_25": "O",
  "and_26": "O",
  "light_27": "O",
  "-_28": "O",
  "cured_29": "O",
  "._30": "O",
  "The_31": "O",
  "East_32": "O",
  "African_33": "O",
  "dik_34": "O",
  "-_35": "O",
  "dik_36": "O",
  "antelope_37": "O",
  "represents_38": "O",
  "a_39": "O",
  "miniature_40": "O",
  "model_41": "O",
  "ruminant_42": "O",
  "for_43": "O",
  "comparative_44": "O",
  "studies_45": "O",
  "._46": "O",
  "Evaluation_47": "O",
  "of_48": "O",
  "automatic_49": "O",
  "blood_50": "O",
  "cell_51": "O",
  "counters_52": "O",
  "._53": "O"
}
{

In [None]:
# Start the output CSV (mode='w' overwrites any previous run) with a header row
# and the labels of the first chunk.
with open(file_path, mode='w', newline='') as file:
  writer = csv.writer(file)
  writer.writerow(['token', 'label'])
  for token, label in token_labels.items():
    # Keys are token + '_' + index: strip only the trailing index so tokens
    # that themselves contain '_' are written in full.
    token = token.rsplit("_", 1)[0]
    writer.writerow([token, label])

# Multithreading for Batch GPT Requests

In [None]:
import concurrent.futures

MAX_RETRIES = 3

def call_api(prompt):
    """
    Ask the chat model to label one prompt, retrying up to MAX_RETRIES times.

    prompt (str): full prompt whose final line is the chunk of text to label.

    Returns the parsed JSON reply (expected to be a dict mapping
    token + '_' + index to a label). If every attempt fails, returns a dict
    with one key per whitespace-separated token of the chunk, in the same
    token + '_' + index format, each mapped to the empty string, so the row
    count written to the CSV stays aligned with the input tokens.
    """
    for _ in range(MAX_RETRIES):
        try:
            start = time.time()
            response = client.chat.completions.create(
                model="gpt-3.5-turbo-1106",
                response_format={ "type": "json_object" },
                temperature=0,
                messages=[
                    {"role": "system", "content": "You are an expert at annotating Named Entity Recognition datasets in the biomedical domain."},
                    {"role": "user", "content": prompt}
                ]
            )
            print('API call took ' + str(time.time()-start)+ ' seconds.')

            response_content = response.choices[0].message.content
            # The reply is requested as a JSON object, so parse it as JSON.
            token_labels = json.loads(response_content)

            return token_labels

        # Catch the rate-limit class directly: not every OpenAIError carries a
        # `.code` attribute, so inspecting it inside the handler could raise.
        except openai.RateLimitError:
            print("Rate limit exceeded. Waiting for 60 seconds.")
            time.sleep(60)
        except openai.OpenAIError as e:
            print(f"An OpenAI-specific error occurred: {str(e)}")
        except Exception as e:
            # Includes malformed JSON in the reply (json.JSONDecodeError).
            print(f"An unexpected error occurred: {str(e)}")

    # All attempts failed. The chunk is the last line of the prompt for the
    # zero-, one- and few-shot templates alike; indexed keys keep repeated
    # tokens from collapsing into a single dict entry.
    chunk_text = prompt.split('\n')[-1]
    return {f"{token}_{idx}": '' for idx, token in enumerate(chunk_text.split())}


In [None]:
# Build one prompt per remaining chunk. Chunk 0 is skipped here because it is
# the chunk used for the single test request above.
prompts = []

for chunk in SENTENCE_CHUNKS[1:]:
  prompt = f"""{INSTRUCTION}\n{chunk}"""
  prompts.append(prompt)
# Sanity check: print the first batch prompt (inspect prompts[-1] to confirm the last prompt ends with the final tokens of the file)
print(prompts[0])

Given a biomedical text, perform Named Entity Recognition analysis on this text, focusing on identifying ONLY GENE ENTITIES. Individual tokens are separated by white space and must be given exactly one label: ‘B’,  ‘I’, or ‘O’, according to the BIO labeling schema. Output a JSON object with key equal to token concatenated with '_'+(index of token) and value equal to label, where (index of token) starts from 0 for the first token, 1 for the second token, and so on. Here is are a few examples of succuessful response labels for biomedical texts:

Example Text 1:
DNA elements recognizing NF - Y and Sp1 regulate the human multidrug - resistance gene promoter . Radioimmunoassay of plasma gonadotropins ; problems of specificity Patients randomized into the active treatment groups A30 ( n = 49 ) and A60 ( n = 48 ) received topical treatment with 3 . 0 % diclofenac in 2 .

Example Response 1:
{'DNA_0': 'O', 'elements_1': 'O', 'recognizing_2': 'O', 'NF_3': 'B', '-_4': 'I', 'Y_5': 'I', 'and_6': '

In [None]:
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Submit every prompt up front; the worker threads run the requests in parallel.
    future_to_prompt = {executor.submit(call_api, prompt): prompt for prompt in prompts}
    total_tokens = 0

    # Iterate in submission order (dicts preserve insertion order), not in
    # completion order, so rows are appended to the CSV in the same order as
    # the chunks appear in the text.
    for i, future in enumerate(future_to_prompt):
        prompt = future_to_prompt[future]
        try:
          next_tokens_labels = future.result()
        except Exception as e:
          # One failed chunk must not abort the whole run; fall back to blank labels below.
          print(f"Chunk {i} failed: {str(e)}")
          next_tokens_labels = None

        if not isinstance(next_tokens_labels, dict):
          # The chunk is the last line of the prompt for every prompt template.
          # Indexed keys keep repeated tokens from collapsing into one entry.
          chunk_text = prompt.split('\n')[-1]
          next_tokens_labels = {f"{token}_{idx}": '' for idx, token in enumerate(chunk_text.split())}

        total_tokens += len(next_tokens_labels)
        # Append after every chunk so progress survives an interruption.
        with open(file_path, mode='a', newline='') as file:
          writer = csv.writer(file)
          for token, label in next_tokens_labels.items():
            # Strip only the trailing '_index' so tokens containing '_' are written in full.
            token = token.rsplit("_", 1)[0]
            writer.writerow([token, label])

        if (i)%10 == 0:
          print('------------------------------------------------------------')
          print('Chunks written: ', i)
          print('Tokens used: ', total_tokens)
          print('------------------------------------------------------------')

API call took 11.4059579372406 seconds.
API call took 11.503546953201294 seconds.
------------------------------------------------------------
Chunks written:  0
Tokens used:  70
------------------------------------------------------------
API call took 11.602883338928223 seconds.
API call took 13.299752712249756 seconds.
API call took 13.977705717086792 seconds.
API call took 16.073824167251587 seconds.
API call took 16.143929719924927 seconds.
API call took 16.515360355377197 seconds.
API call took 19.610058784484863 seconds.
API call took 10.447763442993164 seconds.
API call took 21.92393469810486 seconds.
------------------------------------------------------------
Chunks written:  10
Tokens used:  780
------------------------------------------------------------
API call took 12.26800274848938 seconds.
API call took 12.543110609054565 seconds.
API call took 17.46278738975525 seconds.
API call took 15.172587633132935 seconds.
API call took 19.65182065963745 seconds.
API call took 13

Might have to do postprocessing for these issues:
1. " . . " in text but API only outputs 1 label for "." token <br>
  Ex: Text is "the . . apple" but output is \{'the_0': 'O', '._1': 'O', 'apple_2': 'O' \}. The expected output is \{'the_0': 'O', '._1': 'O', '._2': 'O', 'apple_3': 'O' \}
2. Token key in JSON output isn't formatted as token+"_"+index of token <br>
  <u>2.1</u>: Punctuation mark tokens (".", ",") are outputted as token+index of token (no _) <br>
    Ex: ".20" instead of "._20" where original token was "." and index was "20". CSV will have token ".20" instead of "." <br>
  <u>2.2</u>: Different symbol after original token: ".\)"+index when original token in text was "." CSV will have token ".\)" instead of "."
3. Additional token(s) as keys in JSON output from API that weren't in the original text <br>
  Ex: " - - " in text but API outputs 3 labels for " - " token

# IMPORTANT: Don't do post-processing as you get results from API before writing to CSV because API calls take much longer to run. Sometimes, code gets stuck if post-processing doesn't handle unexpected tokens properly

In [None]:
# Reload everything written to the output CSV so far, for inspection and post-processing.
so_far = pd.read_csv(file_path)

In [None]:
# Display the generated token/label rows.
so_far

Unnamed: 0,token,label
0,Joys,O
1,and,O
2,F,O
3,.,O
4,2,O
...,...,...
70832,avian,B
70833,integrins,I
70834,was,O
70835,obtained,O


In [None]:
# Display the gold devel rows for comparison with the generated ones.
devel_df