# Env Setup

In [None]:
import csv
import json
import pandas as pd
import numpy as np
import re
import string
from collections import Counter
import os
from google.colab import drive
import time
import ast

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/MyDrive/'6.8611 Research Project'/'Colab Notebooks'

/content/drive/MyDrive/6.8611 Research Project/Colab Notebooks


In [None]:
ls

 BC5CDR-D_devel_1.csv            [0m[01;34mllm-annotations[0m/        zero-shot-bc5cdr-chem.pynb
 BC5CDR-D_devel_2.csv           ' NER with BERT.ipynb'  'zero_shot[FASTER].ipynb'
 Data-cleaning.ipynb             openai-test.ipynb       zero-shot.pynb
 [01;34mdevel_gpt_generated_datasets[0m/   retry_prompts.gsheet
 intrinsic_eval.ipynb            tokens_labels.csv


# Data Preprocessing

In [None]:
# load the datasets into dataframes

def load_tsv_dataset(file_path):
  """
  Read a headerless, tab-separated file into a two-column dataframe.

  The file is parsed with header=None, so its first line is kept as an
  ordinary data row. The two resulting columns are then named 'token' and
  'label' and the dataframe is returned.
  """
  frame = pd.read_csv(file_path, sep='\t', header=None, engine='python')
  frame.columns = ['token', 'label']
  return frame


In [None]:
# load the datasets into dataframes

def load_csv_dataset(file_path):
  """
  Read a headerless, comma-separated file into a two-column dataframe.

  The file is parsed with header=None, so its first line is kept as an
  ordinary data row. The two resulting columns are then named 'token' and
  'label' and the dataframe is returned.
  """
  frame = pd.read_csv(file_path, header=None, engine='python')
  frame.columns = ['token', 'label']
  return frame


In [None]:
def split_by_sentence(list_of_strings):
  """
  Group a flat list of tokens into space-joined sentence strings.

  A sentence ends at any string token whose text ends with '.'. Non-string
  tokens (e.g. a float NaN) never end a sentence; they are kept and rendered
  with str().

  Tokens that follow the last sentence-ending token are emitted as one final
  sentence rather than being dropped, so no input token is lost.

  Prints the number of sentences found and returns the list of sentences.
  """
  sentences = []
  current_sentence = []

  for word in list_of_strings:
    current_sentence.append(str(word))
    if isinstance(word, str) and word.endswith('.'):
      sentences.append(' '.join(current_sentence))
      current_sentence = []

  # Flush an unterminated tail (input that does not end on a '.' token).
  if current_sentence:
    sentences.append(' '.join(current_sentence))

  print("\nNumber of sentences: ", len(sentences))
  return sentences

In [None]:
def get_filtered_entities(df, target_label):
  """
  Count the meaningful tokens that carry a given label.

  df (pandas dataframe): has two columns 'token' and 'label'
  target_label: the label value to select, e.g. 'B', 'I', or 'O'

  Tokens are dropped when they are blank or missing (empty string, NaN or
  any other non-string value), or when they consist only of punctuation,
  only of digits, or of a single ASCII letter.

  Returns a collections.Counter mapping each remaining token to the number
  of rows in which it appears with label 'target_label'.
  """
  labelled_rows = df[df['label'] == target_label]
  target_entities = labelled_rows['token'].tolist()  # a list, duplicates kept for counting

  # regex for filtering out nonsense strings
  punctuation = re.escape(string.punctuation)
  pattern = re.compile(rf'^(?![a-zA-Z]?$)(?!\d+$)(?!^[{punctuation}]+$).+')

  # Missing tokens come back from pandas as float NaN; pattern.match would
  # raise TypeError on them, so only string tokens are tested.
  return Counter(
      ent for ent in target_entities
      if isinstance(ent, str) and pattern.match(ent)
  )

# Intrinsic Eval


In [None]:
def load_dfs(dataset):
  """
  Load the devel split of `dataset` together with its two GPT-generated
  counterparts (zero-shot and one-shot).

  Prints the dataset name and the row count of each dataframe.
  Returns the tuple (devel_df, zero_shot_df, one_shot_df).
  """
  print('\nFor ', dataset, ": \n")

  # NOTE(review): the zero-shot file name uses '_devel' while the one-shot
  # file name uses '-devel' -- confirm this matches the files on disk.
  zero_shot_df = load_csv_dataset(f'devel_gpt_generated_datasets/zero_shot/{dataset}_devel.csv')
  one_shot_df = load_csv_dataset(f'devel_gpt_generated_datasets/one_shot/{dataset}-devel.csv')
  devel_df = load_tsv_dataset(f'llm-annotations/datasets/{dataset}/devel.tsv')

  print('Zero Shot Length: ', len(zero_shot_df), '\nOne Shot Length: ', len(one_shot_df), '\nTrue Length: ', len(devel_df))
  return devel_df, zero_shot_df, one_shot_df

In [None]:
# Scratch cell: load a single one-shot test file directly by path.
one_shot = 'devel_gpt_generated_datasets/one_shot/BC5CDR-disease-devel-2.csv_bella_test'
one_shot_df = load_csv_dataset(one_shot)

NameError: ignored

In [None]:
def get_chunks(all_tokens, CHUNK_SIZE = 300):
  """
  Split a token list into chunks made of whole sentences.

  all_tokens (list): tokens in document order. NaN entries are replaced
    IN PLACE with the string "null", so the caller's list is modified.
  CHUNK_SIZE (int): a chunk is closed as soon as the summed string length of
    its sentences reaches this many characters.

  Sentences come from split_by_sentence. Sentences left over after the last
  full chunk form one final, shorter chunk; no chunk is emitted when there is
  nothing left over, so the result never contains an empty chunk.

  Returns (chunks, sentence_chunk_len, word_chunk_len), three parallel lists
  with one entry per chunk:
    chunks: the chunk text (its sentences joined by a space)
    sentence_chunk_len: number of sentences in the chunk
    word_chunk_len: summed character length of the chunk's sentences
      (despite the name this is a character count, not a word count)
  """
  nans = 0
  for idx, token in enumerate(all_tokens):
    if isinstance(token, float) and str(token) == "nan":
      nans += 1
      all_tokens[idx] = "null"
  print('Number of nans: ', nans)

  sentences = split_by_sentence(all_tokens)

  SENTENCE_CHUNKS = []
  sentence_chunk_len = []
  word_chunk_len = []

  curr_chunk, curr_chunk_len = [], 0
  for sent in sentences:
    curr_chunk.append(sent)
    curr_chunk_len += len(sent)
    if curr_chunk_len >= CHUNK_SIZE:
      word_chunk_len.append(curr_chunk_len)
      sentence_chunk_len.append(len(curr_chunk))
      SENTENCE_CHUNKS.append(' '.join(curr_chunk))
      curr_chunk, curr_chunk_len = [], 0

  # Close the last, partially filled chunk. Its lengths are recorded too so
  # the three returned lists stay aligned, and nothing is appended when the
  # final sentence already closed a chunk (no empty trailing chunk).
  if curr_chunk:
    word_chunk_len.append(curr_chunk_len)
    sentence_chunk_len.append(len(curr_chunk))
    SENTENCE_CHUNKS.append(' '.join(curr_chunk))

  print('Number of Sentences in First 10 Chunks: ', sentence_chunk_len[:10])
  print('Number of Chunks: ', len(SENTENCE_CHUNKS))
  return SENTENCE_CHUNKS, sentence_chunk_len, word_chunk_len

In [None]:
def data_label_dict(dataset_df, name):
  """
  Chunk a token/label dataframe and build one label dictionary per chunk.

  dataset_df (pandas dataframe): has two columns 'token' and 'label'
  name (str): only used in the progress message that is printed

  The tokens are chunked with get_chunks (default chunk size). Each chunk is
  then matched to the next len(chunk.split()) dataframe rows, and those rows
  become a dict mapping '<token>_<position in chunk>' to the row's label.

  The chunks are walked exactly once, so exactly one dict is produced per
  chunk. If the chunks cover fewer tokens than the dataframe has rows, the
  remaining rows are left unlabelled instead of the chunk list being walked
  again (which previously appended spurious dicts and could loop forever
  when the chunks contained no tokens at all).

  Returns (all_tokens, sentence_chunks, sentence_labels, sentence_chunk_len,
  word_chunk_len). all_tokens is the list handed to get_chunks, which
  replaces NaN tokens in it with "null"; the dict keys are built from the
  dataframe itself, where a NaN token is rendered by str() as 'nan'.
  """
  print('\nFor', name, ':')
  all_tokens = dataset_df['token'].tolist()
  sentence_chunks, sentence_chunk_len, word_chunk_len = get_chunks(all_tokens)

  # Fresh copies of the columns: get_chunks has modified all_tokens in place.
  row_tokens = dataset_df['token'].tolist()
  row_labels = dataset_df['label'].tolist()

  sentence_labels = []
  start = 0
  for chunk in sentence_chunks:
    stop = start + len(chunk.split())
    keys = [str(token) + '_' + str(pos) for pos, token in enumerate(row_tokens[start:stop])]
    sentence_labels.append(dict(zip(keys, row_labels[start:stop])))
    start = stop

  return all_tokens, sentence_chunks, sentence_labels, sentence_chunk_len, word_chunk_len


In [None]:
def chunk_length_analysis(length_type, devel_chunk_lengths, zero_chunk_lengths, one_chunk_lengths):
  """
  Report the first chunk at which the generated datasets diverge from devel.

  length_type (str): label used in the printed message, e.g. 'sentences'
  devel_chunk_lengths (list): per-chunk lengths of the devel dataset
  zero_chunk_lengths (list): per-chunk lengths of the zero-shot dataset
  one_chunk_lengths (list): per-chunk lengths of the one-shot dataset

  For each generated dataset, prints the index of the first chunk whose
  length differs from the devel chunk at the same index. A generated list
  that ends before a difference is found counts as diverging at the first
  index it lacks. Nothing is printed for a dataset that matches devel on
  every devel chunk. Returns None.
  """
  # Datasets that still agree with devel, in the order they are reported.
  still_matching = {'ZERO': zero_chunk_lengths, 'ONE': one_chunk_lengths}

  for i, devel_length in enumerate(devel_chunk_lengths):
    if not still_matching:
      break  # both datasets have already diverged
    for shot in list(still_matching):
      lengths = still_matching[shot]
      if i >= len(lengths) or lengths[i] != devel_length:
        del still_matching[shot]
        print('Number of ', length_type, ' in devel and ' + shot + ' shot chunks are identical until chunk number: ', i)

In [None]:
# For every dataset: load the devel split and both GPT-generated versions,
# chunk each of them, and report where their per-chunk lengths first differ.
# All names assigned here are notebook globals and keep the values from the
# last dataset in the list once the loop finishes.
for dataset in ['NCBI-disease', 'JNLPBA', 'BC5CDR-chem', 'BC5CDR-disease']:
  devel, zero_shot, one_shot = load_dfs(dataset)

  # (disabled) dump of the filtered entity counts per label:
  # for df, name in [[devel, 'devel'], [zero_shot, 'zero_shot'], [one_shot, 'one_shot']]:
  #   print(name, 'has ', get_filtered_entities(df, 'B'), ' entities labeled B, ', get_filtered_entities(df, 'I'), ' entities labeled I, and', get_filtered_entities(df, 'O'), ' entities labeled O')


  # *_chunk_s_lengths: sentences per chunk; *_chunk_w_lengths: the second
  # length list returned by get_chunks (summed sentence string lengths).
  devel_tokens, devel_chunks, devel_labels, devel_chunk_s_lengths, devel_chunk_w_lengths = data_label_dict(devel, dataset + '_human')
  zero_tokens, zero_chunks, zero_labels, zero_chunk_s_lengths, zero_chunk_w_lengths = data_label_dict(zero_shot, dataset + '_zeroshot')
  one_tokens, one_chunks, one_labels, one_chunk_s_lengths, one_chunk_w_lengths = data_label_dict(one_shot, dataset + '_oneshot')

  chunk_length_analysis('sentences', devel_chunk_s_lengths, zero_chunk_s_lengths, one_chunk_s_lengths)
  chunk_length_analysis('words', devel_chunk_w_lengths, zero_chunk_w_lengths, one_chunk_w_lengths)





For  NCBI-disease : 

Zero Shot Length:  23880 
One Shot Length:  23924 
True Length:  23965

For NCBI-disease_human :
Number of nans:  9

Number of sentences:  1027
Number of Sentences in First 10 Chunks:  [4, 3, 5, 3, 3, 4, 3, 3, 4, 3]
Number of Chunks:  348

For NCBI-disease_zeroshot :
Number of nans:  10

Number of sentences:  933
Number of Sentences in First 10 Chunks:  [4, 3, 4, 3, 3, 3, 2, 3, 4, 3]
Number of Chunks:  350

For NCBI-disease_oneshot :
Number of nans:  9

Number of sentences:  963
Number of Sentences in First 10 Chunks:  [4, 3, 4, 3, 3, 4, 3, 3, 4, 3]
Number of Chunks:  349
Number of  sentences  in devel and ZERO shot chunks are identical until chunk number:  2
Number of  sentences  in devel and ONE shot chunks are identical until chunk number:  2
Number of  words  in devel and ZERO shot chunks are identical until chunk number:  2
Number of  words  in devel and ONE shot chunks are identical until chunk number:  2

For  JNLPBA : 

Zero Shot Length:  116906 
One Shot

## NOTE: Play around until you find a test sentence chunk that you think is a strong example, and use it as the labeled example.

In [None]:
# Index of the candidate chunk to inspect as the labeled one-shot example.
i = 50

print(TEST_SENTENCE_CHUNKS[i])
print()
for x,y in TEST_SENTENCE_LABELS[i].items():
  # Keys look like '<token>_<index>'; cut at the LAST underscore only, so a
  # token that itself contains '_' is printed in full.
  print(x.rsplit('_', 1)[0],y)

These abnormal properties of GR ( reduced numbers of GR ) were preserved in the transformed cells from the patients . Octamer transcription factors 1 and 2 each bind to two different functional elements in the immunoglobulin heavy - chain promoter . Immunoglobulin heavy - chain genes contain two conserved sequence elements 5 ' to the site of transcription initiation : the octamer ATGCAAAT and the heptamer CTCATGA .

These O
abnormal O
properties O
of O
GR B
( O
reduced O
numbers O
of O
GR B
) O
were O
preserved O
in O
the O
transformed O
cells O
from O
the O
patients O
. O
Octamer O
transcription O
factors O
1 O
and O
2 O
each O
bind O
to O
two O
different O
functional B
elements I
in O
the O
immunoglobulin B
heavy I
- I
chain I
promoter I
. O
Immunoglobulin B
heavy I
- I
chain I
genes I
contain O
two O
conserved B
sequence I
elements I
5 O
' O
to O
the O
site O
of O
transcription O
initiation O
: O
the O
octamer O
ATGCAAAT O
and O
the O
heptamer O
CTCATGA O
. O


In [None]:
# Chunk `i` and its label mapping become the worked example in the prompt.
example_text, example_response = TEST_SENTENCE_CHUNKS[i], TEST_SENTENCE_LABELS[i]

In [None]:
text = SENTENCE_CHUNKS[0] # the chunk that the API call in the next cell annotates

# Remember to change the entity set named in INSTRUCTION to match the dataset.
# Prompt layout: the instruction paragraph, the labeled example (text, then
# its label mapping), and finally the text to label, which is appended below
# as the last line of the prompt.
# NOTE(review): example_response is interpolated with its Python repr, which
# for a dict uses single quotes, while the instruction asks for JSON -- confirm
# this is intended.
INSTRUCTION = f"""Given a biomedical text, perform Named Entity Recognition analysis on this text, and identify ENTITIES THAT ARE ONE OF THE ENTITIES IN THE SET ('protein', 'DNA', 'RNA', 'cell line', 'cell type'). Individual tokens are separated by white space and must be given exactly one label: ‘B’,  ‘I’, or ‘O’, according to the BIO labeling schema. Output a JSON object with key equal to token concatenated with '_'+(index of token) and value equal to label, where (index of token) starts from 0 for the first token, 1 for the second token, and so on. Here is a labeled example to help with this task:

Example Text:
{example_text}

Example Response:
{example_response}

Now label the following text: """

# Full prompt = instruction + newline + the chunk to annotate.
prompt = f"""{INSTRUCTION}\n{text}"""
print(prompt)

Given a biomedical text, perform Named Entity Recognition analysis on this text, and identify ENTITIES THAT ARE ONE OF THE ENTITIES IN THE SET ('protein', 'DNA', 'RNA', 'cell line', 'cell type'). Individual tokens are separated by white space and must be given exactly one label: ‘B’,  ‘I’, or ‘O’, according to the BIO labeling schema. Output a JSON object with key equal to token concatenated with '_'+(index of token) and value equal to label, where (index of token) starts from 0 for the first token, 1 for the second token, and so on. Here is a labeled example to help with this task:

Example Text:
These abnormal properties of GR ( reduced numbers of GR ) were preserved in the transformed cells from the patients . Octamer transcription factors 1 and 2 each bind to two different functional elements in the immunoglobulin heavy - chain promoter . Immunoglobulin heavy - chain genes contain two conserved sequence elements 5 ' to the site of transcription initiation : the octamer ATGCAAAT and

In [None]:
# Send the single prompt built above and time the round trip.
# NOTE(review): `client` is not created in the cells shown here; presumably
# an OpenAI client object -- confirm it is defined before this cell runs.
start = time.time()

response = client.chat.completions.create(
  model="gpt-3.5-turbo-1106",
  # request a JSON object reply
  response_format={ "type": "json_object" },
  messages=[
    {"role": "system", "content": "You are an expert at annotating Named Entity Recognition datasets in the biomedical domain."},
    {"role": "user", "content": prompt}
  ],
  temperature=0
)

# Wall-clock time of the request, in seconds.
print(f'{time.time() - start} seconds taken ')

12.42869257926941 seconds taken 


In [None]:
response_content = response.choices[0].message.content
print(response_content)

# The reply is a JSON document, so parse it with the JSON parser:
# ast.literal_eval reads Python literals and rejects valid JSON such as
# null / true / false.
token_labels = json.loads(response_content)
print(token_labels)

# Note Lengths aren't guaranteed to be equal.
# There are some issues with API output (see last text cell at bottom of this file)
print(len(token_labels))
# print(len(prompt.split('\n')[9].split()))

{
  "Our_0": "O",
  "data_1": "O",
  "suggest_2": "O",
  "that_3": "O",
  "lipoxygenase_4": "B",
  "metabolites_5": "I",
  "activate_6": "O",
  "ROI_7": "B",
  "formation_8": "I",
  "which_9": "O",
  "then_10": "O",
  "induce_11": "O",
  "IL_12": "B",
  "-_13": "I",
  "2_14": "I",
  "expression_15": "O",
  "via_16": "O",
  "NF_17": "B",
  "-_18": "I",
  "kappa_19": "I",
  "B_20": "I",
  "activation_21": "O",
  "._22": "O",
  "Human_23": "O",
  "immunodeficiency_24": "O",
  "virus_25": "O",
  "type_26": "O",
  "2_27": "O",
  "(_28": "O",
  "HIV_29": "B",
  "-_30": "I",
  "2_31": "I",
  ")_32": "O",
  ",_33": "O",
  "like_34": "O",
  "HIV_35": "B",
  "-_36": "I",
  "1_37": "I",
  ",_38": "O",
  "causes_39": "O",
  "AIDS_40": "O",
  "and_41": "O",
  "is_42": "O",
  "associated_43": "O",
  "with_44": "O",
  "AIDS_45": "O",
  "cases_46": "O",
  "primarily_47": "O",
  "in_48": "O",
  "West_49": "O",
  "Africa_50": "O",
  "._51": "O",
  "This_52": "O",
  "is_53": "O",
  "the_54": "O",
  "firs

In [None]:
# Start the output CSV: header row, then one row per labeled token of the
# first chunk. Opening with mode='w' overwrites any existing file.
with open(file_path, mode='w', newline='') as file:
  writer = csv.writer(file)
  writer.writerow(['token', 'label'])
  for token_label in token_labels.items():
    token, label = token_label
    # Keys look like '<token>_<index>'; cut at the LAST underscore only, so a
    # token that itself contains '_' is written in full.
    token = token.rsplit("_", 1)[0]
    writer.writerow([token, label])

# Multithreading for Batch GPT Requests

In [None]:
import concurrent.futures

MAX_RETRIES = 3

def call_api(prompt):
    """
    Ask the chat model to label one prompt, retrying on failure.

    prompt (str): the full prompt; the text to label is its last line.

    Makes up to MAX_RETRIES attempts. Each attempt sends the prompt, prints
    how long the call took, and parses the reply as JSON. A rate-limit error
    waits 60 seconds before the next attempt; any other error (including a
    reply that is not valid JSON) is printed and the next attempt starts
    immediately.

    Returns the parsed reply (expected to be a dict mapping
    '<token>_<index>' to a label). If every attempt fails, returns a dict
    with one '<token>_<index>' key per whitespace-separated token of the
    prompt's last line, each mapped to '' -- so the text still gets one
    (unlabeled) entry per token, including repeated tokens.
    """
    for _ in range(MAX_RETRIES):
        try:
            start = time.time()
            response = client.chat.completions.create(
                model="gpt-3.5-turbo-1106",
                response_format={ "type": "json_object" },
                temperature=0,
                messages=[
                    {"role": "system", "content": "You are an expert at annotating Named Entity Recognition datasets in the biomedical domain."},
                    {"role": "user", "content": prompt}
                ]
            )
            print('API call took ' + str(time.time()-start)+ ' seconds.')

            # The reply is JSON, not a Python literal.
            return json.loads(response.choices[0].message.content)

        except openai.OpenAIError as e:
            # Not every OpenAIError carries a `code` attribute.
            if getattr(e, 'code', None) == 'rate_limit_exceeded':
                print("Rate limit exceeded. Waiting for 60 seconds.")
                time.sleep(60)
            else:
                print(f"An OpenAI-specific error occurred: {str(e)}")
        except Exception as e:
            print(f"An unexpected error occurred: {str(e)}")

    # Every attempt failed: emit blank labels for the text to label. The
    # prompt is built as instruction + '\n' + text, so the text is the last
    # line (a fixed line index would point into the instruction/example).
    text_to_label = prompt.rsplit('\n', 1)[-1]
    return {f'{token}_{index}': '' for index, token in enumerate(text_to_label.split())}


In [None]:
# One prompt per remaining chunk; chunk 0 is skipped here (it was used for
# `text` in the single-call cell above).
prompts = []

for chunk in SENTENCE_CHUNKS[1:]:
  # NOTE: this rebinds the notebook-level `prompt` on every iteration.
  prompt = f"""{INSTRUCTION}\n{chunk}"""
  prompts.append(prompt)
# Sanity check: show the first prompt. (Print prompts[-1] instead to check
# that the last prompt ends with the tokens at the end of the file.)
print(prompts[0])

Given a biomedical text, perform Named Entity Recognition analysis on this text, and identify ENTITIES THAT ARE ONE OF THE ENTITIES IN THE SET ('protein', 'DNA', 'RNA', 'cell line', 'cell type'). Individual tokens are separated by white space and must be given exactly one label: ‘B’,  ‘I’, or ‘O’, according to the BIO labeling schema. Output a JSON object with key equal to token concatenated with '_'+(index of token) and value equal to label, where (index of token) starts from 0 for the first token, 1 for the second token, and so on. Here is a labeled example to help with this task:

Example Text:
These abnormal properties of GR ( reduced numbers of GR ) were preserved in the transformed cells from the patients . Octamer transcription factors 1 and 2 each bind to two different functional elements in the immunoglobulin heavy - chain promoter . Immunoglobulin heavy - chain genes contain two conserved sequence elements 5 ' to the site of transcription initiation : the octamer ATGCAAAT and

In [None]:
# Label all prompts with up to 10 concurrent API calls and append the
# results to the CSV at file_path.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Submit every prompt up front; the dict keeps the submission order.
    future_to_prompt = {executor.submit(call_api, prompt): prompt for prompt in prompts}
    total_tokens = 0

    # Consume results in SUBMISSION order (not completion order) so that rows
    # are appended to the CSV in the same order as the source text.
    for i, future in enumerate(future_to_prompt):
        prompt = future_to_prompt[future]
        next_tokens_labels = future.result()
        if not isinstance(next_tokens_labels, dict):
            # Unusable reply: write blank labels for the text to label, which
            # is the last line of the prompt. Indexed keys keep one entry per
            # token even when a token repeats.
            text_to_label = prompt.rsplit('\n', 1)[-1]
            next_tokens_labels = {f'{token}_{index}': '' for index, token in enumerate(text_to_label.split())}

        # Counts label entries written, not tokens billed by the API.
        total_tokens += len(next_tokens_labels)
        # Append after every chunk so finished work survives an interruption.
        with open(file_path, mode='a', newline='') as file:
            writer = csv.writer(file)
            for token, label in next_tokens_labels.items():
                # Keys look like '<token>_<index>'; cut at the LAST underscore
                # only, so a token that itself contains '_' is written in full.
                writer.writerow([token.rsplit("_", 1)[0], label])

        if (i)%10 == 0:
            print('------------------------------------------------------------')
            print('Chunks written: ', i)
            print('Tokens used: ', total_tokens)
            print('------------------------------------------------------------')

API call took 7.202467203140259 seconds.
API call took 9.814164400100708 seconds.
API call took 10.390791654586792 seconds.
------------------------------------------------------------
Chunks written:  0
Tokens used:  65
------------------------------------------------------------
API call took 11.705240964889526 seconds.
API call took 12.195459365844727 seconds.
API call took 13.113972902297974 seconds.
API call took 13.163286685943604 seconds.
API call took 13.37048625946045 seconds.
API call took 14.688476324081421 seconds.
API call took 14.797148704528809 seconds.
API call took 7.748706340789795 seconds.
API call took 10.469637632369995 seconds.
API call took 8.943517446517944 seconds.
API call took 8.491566181182861 seconds.
API call took 11.378488302230835 seconds.
API call took 8.330048322677612 seconds.
API call took 8.609620571136475 seconds.
API call took 16.317009210586548 seconds.
------------------------------------------------------------
Chunks written:  10
Tokens used: 

Might have to do postprocessing for these issues:
1. " . . " in text but API only outputs 1 label for "." token <br>
  Ex: Text is "the . . apple" but output is \{'the_0': 'O', '._1': 'O', 'apple_2': 'O' \}. The expected output is \{'the_0': 'O', '._1': 'O', '._2': 'O', 'apple_3': 'O' \}
2. Token key in JSON output isn't formatted as token+"_"+index of token <br>
  <u>2.1</u>: Punctuation mark tokens (".", ",") are outputted as token+index of token (no _) <br>
    Ex: ".20" instead of "._20" where original token was "." and index was "20". CSV will have token ".20" instead of "." <br>
  <u>2.2</u>: Different symbol after original token: ".\)"+index when original token in text was "." CSV will have token ".\)" instead of "."
3. Additional token(s) appear as keys in the JSON output from the API that weren't in the original text <br>
  Ex: " - - " in text but API outputs 3 labels for " - " token

# IMPORTANT: Don't post-process results as they arrive from the API. Write the raw output to CSV first and post-process afterwards: API calls take much longer to run, and the code sometimes gets stuck if post-processing doesn't handle unexpected tokens properly.

# Post-Processing: Do what you gotta do