# Overview

This is the old version of a notebook for generating zero-shot GPT labels of BioNER datasets. We experimented with both multiprocessing, and multithreading for speeding up API requests, along with different Python concurrency libraries.

# Env Setup

In [None]:
pip install openai

Collecting openai
  Downloading openai-1.3.3-py3-none-any.whl (220 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m220.3/220.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.25.1-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.2-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: h11, httpcore, httpx, openai
[31mERROR: pip's dependency resolver does not currently

In [None]:
pip install nest_asyncio



In [None]:
import csv
import json
import pandas as pd
import numpy as np
import re
import string
from collections import Counter
import openai
import os
from google.colab import drive
import time
import ast

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/MyDrive/'6.8611 Research Project'/'Colab Notebooks'

/content/drive/.shortcut-targets-by-id/1vdEcgdXIfpnlORVlPsJtHUmKXSAqr69R/6.8611 Research Project/Colab Notebooks


In [None]:
ls

Data-cleaning.ipynb  openai-test.ipynb  tokens_labels.csv
[0m[01;34mllm-annotations[0m/     retry_prompts.csv  zero-shot.pynb


# Set Up OpenAI Client

In [None]:
api_key = os.getenv("OPENAI_API_KEY")
if api_key is None:
    raise ValueError("Please set the OPENAI_API_KEY environment variable.")

In [None]:
client = openai.OpenAI(api_key=api_key)

# Data Preprocessing

In [None]:
# load the datasets into dataframes

def load_tsv_dataset(file_path):
  """
  Loads a tsv dataset. Renames thne columns to 'token' and 'label'.
  Note that renaming the columns will overwrite the first row of the dataframe
  """
  df = pd.read_csv(file_path, delimiter='\t', header=None, engine='python')
  df.columns = ['token', 'label']
  print(df.head())
  return df


In [None]:
def split_by_sentence(list_of_strings):
  sentences = []
  current_sentence = []

  for word in list_of_strings:
      current_sentence.append(word)
      if type(word) is str and word.endswith('.'):
          sentence_str = ' '.join(map(str, current_sentence))
          sentences.append(sentence_str)
          current_sentence = []

  return sentences

In [None]:
def get_filtered_entities(df, target_label):
  """
  df (pandas dataframe): has two columns 'token' and 'label'
  target_label: 'B', 'I', or 'O' (see description above for what these signify)

  Filtering involves: removing blanks, and filtering out entities that consist
  only of punctuation, numbers, or single letters.

  Return a frequency of all filtered entities with label 'target_label'.
  """
  filtered_df = df[df['label'] == target_label]
  target_entities = filtered_df['token'].tolist() # a set of all the entities with the target label

  # regex for filtering out nonsense strings
  punctuation = re.escape(string.punctuation)
  pattern = re.compile(rf'^(?![a-zA-Z]?$)(?!\d+$)(?!^[{punctuation}]+$).+')
  target_entities = [ent for ent in target_entities if pattern.match(ent)]
  return Counter(target_entities)

# BC5CDR-Disease

In [None]:
BC5CDR_d_devel = 'llm-annotations/datasets/BC5CDR-disease/devel.tsv'
BC5CDR_d_devel_df = load_tsv_dataset(BC5CDR_d_devel)

# BC2GM_train = 'llm-annotations/datasets/BC2GM/train.tsv'
# BC2GM_train_df = load_tsv_dataset(BC2GM_train)

# BC2GM_test = 'llm-annotations/datasets/BC2GM/test.tsv'
# BC2GM_test_df = load_tsv_dataset(BC2GM_test)

# BC2GM_train_dev = 'llm-annotations/datasets/BC2GM/train_dev.tsv'
# BC2GM_train_dev_df = load_tsv_dataset(BC2GM_train_dev)

           token label
0             22     O
1              -     O
2  oxacalcitriol     O
3     suppresses     O
4      secondary     B


In [None]:
len(BC5CDR_d_devel_df)


117391

In [None]:
all_tokens = BC5CDR_d_devel_df['token'].tolist()
sentences = split_by_sentence(all_tokens)

CHUNK_SIZE = 300 # string length of the chunk
BC5CDR_D_SENTENCE_CHUNKS = []

curr_chunk, curr_chunk_len = [], 0
for sent in sentences:
  curr_chunk.append(sent)
  curr_chunk_len += len(sent)
  if curr_chunk_len >= CHUNK_SIZE:
    BC5CDR_D_SENTENCE_CHUNKS.append(' '.join(curr_chunk))
    curr_chunk = []
    curr_chunk_len = 0

print('NUM CHUNKS', len(BC5CDR_D_SENTENCE_CHUNKS))
for chunk in BC5CDR_D_SENTENCE_CHUNKS:
  print(len(chunk))


NUM CHUNKS 1749
382
354
372
343
341
328
482
350
354
325
655
313
520
420
387
438
307
334
347
383
329
467
490
395
416
402
359
318
454
319
385
331
306
358
428
478
383
331
461
433
371
438
463
415
581
463
615
379
311
312
310
358
347
331
462
368
312
550
378
323
399
317
314
366
375
334
416
463
404
339
423
476
354
349
394
385
324
332
570
367
381
541
350
333
320
304
343
396
468
427
327
307
520
434
439
315
320
359
477
395
365
376
308
301
380
355
400
344
370
368
366
431
368
449
372
367
472
411
312
352
312
416
352
324
479
365
386
401
306
457
489
332
350
327
351
520
359
420
391
371
358
401
441
318
435
383
309
404
415
432
506
416
379
689
357
385
354
321
309
383
316
307
356
301
409
330
323
471
429
370
339
385
449
377
451
343
320
404
346
381
325
419
314
417
344
540
401
398
320
428
437
448
307
488
398
305
419
338
366
313
340
441
381
312
343
413
325
382
311
380
322
349
383
397
430
357
317
572
475
318
333
498
352
358
382
398
389
333
420
317
349
396
314
478
383
309
382
357
490
383
378
382
499
306
317
478


In [None]:
BC5CDR_D_SENTENCE_CHUNKS[0]

'22 - oxacalcitriol suppresses secondary hyperparathyroidism without inducing low bone turnover in dogs with renal failure . BACKGROUND : Calcitriol therapy suppresses serum levels of parathyroid hormone ( PTH ) in patients with renal failure but has several drawbacks , including hypercalcemia and / or marked suppression of bone turnover , which may lead to adynamic bone disease .'

In [None]:
# text = """Such studied acquired with low energy or medium energy collimation and a window centered on the 159 keV 123I photopeak contain appreciable septal breakthrough signals originating from Compton scatter of high energy photons primarily from 124I . Dissolution of the Pt - 30 % Ir microelectrode tip was observed by scanning electron microscopy at charge densities as low as 200 microC / cm2 X ph ( 1 A / cm2 ) , whereas erosion of activated iridium microelectrodes occurred only at the highest charge and current densities ( 3200 microC / cm2 X ph , 16 A / cm2 ) ."""
text = BC5CDR_D_SENTENCE_CHUNKS[0]
INSTRUCTION = f"""Given a biomedical text, perform Named Entity Recognition analysis on this text, focusing on identifying disease entities. Individual tokens are separated by white space and must be given exactly one label: ‘B’,  ‘I’, or ‘O’, according to the BIO labeling schema. The result should only contain a Python list of lists, where each inner list is [token, the token's label]. Output each token and label in the order the tokens appear. The text is below:"""
prompt = f"""{INSTRUCTION}\n{text}"""
#prompt = f"""Perform token-by-token Named Entity Recognition on the following biomedical text, focusing on identifying genes entities. Each token should be classified as: 'B' for the beginning of a gene, 'I' for the inside or continuation of a gene, or 'O' for tokens that do not belong to a gene entity. Please only provide the results in JSON format with the keys 'token' and 'label'. The biomedical text to label is as follows:
#
#{text}
#"""

print(prompt)


Given a biomedical text, perform Named Entity Recognition analysis on this text, focusing on identifying disease entities. Individual tokens are separated by white space and must be given exactly one label: ‘B’,  ‘I’, or ‘O’, according to the BIO labeling schema. The result should only contain a Python list of lists, where each inner list is [token, the token's label]. Output each token and label in the order the tokens appear. The text is below:
22 - oxacalcitriol suppresses secondary hyperparathyroidism without inducing low bone turnover in dogs with renal failure . BACKGROUND : Calcitriol therapy suppresses serum levels of parathyroid hormone ( PTH ) in patients with renal failure but has several drawbacks , including hypercalcemia and / or marked suppression of bone turnover , which may lead to adynamic bone disease .


In [None]:
start = time.time()

response = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "You are an expert at annotating Named Entity Recognition datasets in the biomedical domain."},
    {"role": "user", "content": prompt}
  ],
  temperature=0
)

print(f'{time.time() - start} seconds taken ')

8.142457008361816 seconds taken 


In [None]:
response_content = response.choices[0].message.content
print(response_content)

token_labels = ast.literal_eval(response_content)
print(token_labels)
print(len(token_labels))

[['22', 'O'], ['-', 'O'], ['oxacalcitriol', 'O'], ['suppresses', 'O'], ['secondary', 'O'], ['hyperparathyroidism', 'B'], ['without', 'O'], ['inducing', 'O'], ['low', 'O'], ['bone', 'O'], ['turnover', 'O'], ['in', 'O'], ['dogs', 'O'], ['with', 'O'], ['renal', 'B'], ['failure', 'I'], ['.', 'O'], ['BACKGROUND', 'O'], [':', 'O'], ['Calcitriol', 'O'], ['therapy', 'O'], ['suppresses', 'O'], ['serum', 'O'], ['levels', 'O'], ['of', 'O'], ['parathyroid', 'B'], ['hormone', 'I'], ['(', 'O'], ['PTH', 'O'], [')', 'O'], ['in', 'O'], ['patients', 'O'], ['with', 'O'], ['renal', 'B'], ['failure', 'I'], ['but', 'O'], ['has', 'O'], ['several', 'O'], ['drawbacks', 'O'], [',', 'O'], ['including', 'O'], ['hypercalcemia', 'O'], ['and', 'O'], ['/', 'O'], ['or', 'O'], ['marked', 'O'], ['suppression', 'O'], ['of', 'O'], ['bone', 'O'], ['turnover', 'O'], [',', 'O'], ['which', 'O'], ['may', 'O'], ['lead', 'O'], ['to', 'O'], ['adynamic', 'O'], ['bone', 'O'], ['disease', 'O'], ['.', 'O']]
[['22', 'O'], ['-', 'O'], 

In [None]:
# with open('BC5CDR-D_devel.csv', mode='w', newline='') as file:
#     writer = csv.writer(file)
#     writer.writerow(['token', 'label'])  # Writing the header
#     for token_label in token_labels:
#       writer.writerow(token_label)

In [None]:
# with open('BC5CDR-D_devel.csv', mode='a', newline='') as file:
#     writer = csv.writer(file)
#     # writer.writerow(['token', 'label'])  # Writing the header
#     for token_label in token_labels:
#       writer.writerow(token_label)

# Multithreading for Batch GPT Requests

In [None]:
import concurrent.futures

MAX_RETRIES = 3

def call_api(prompt):
    for _ in range(MAX_RETRIES):
      try:
          response = client.chat.completions.create(
              model="gpt-3.5-turbo",
              messages=[
                  {"role": "system", "content": "You are an expert at annotating Named Entity Recognition datasets in the biomedical domain."},
                  {"role": "user", "content": prompt}
              ]
          )

          response_content = response.choices[0].message.content
          token_labels = ast.literal_eval(response_content)

          return token_labels
      except Exception as e:
        print(type(e), str(e))

    return [[token, ''] for token in prompt.split('\n')[1].split()]


In [None]:
prompts = []

for chunk in BC5CDR_D_SENTENCE_CHUNKS:
  prompt = f"""{INSTRUCTION}\n{chunk}"""
  prompts.append(prompt)
print(prompts[2:3])

["Given a biomedical text, perform Named Entity Recognition analysis on this text, focusing on identifying disease entities. Individual tokens are separated by white space and must be given exactly one label: ‘B’,  ‘I’, or ‘O’, according to the BIO labeling schema. The result should only contain a Python list of lists, where each inner list is [token, the token's label]. Output each token and label in the order the tokens appear. The text is below:\nThe animals received supplemental phosphate to enhance PTH secretion . Fourteen weeks after the start of phosphate supplementation , half of the Nx and Sham dogs received doses of OCT ( three times per week ) ; the other half were given vehicle for 60 weeks . Thereafter , the treatment modalities for a subset of animals were crossed over for an additional eight months ."]


In [None]:
prompts[0].split("\n")[1]

'22 - oxacalcitriol suppresses secondary hyperparathyroidism without inducing low bone turnover in dogs with renal failure . BACKGROUND : Calcitriol therapy suppresses serum levels of parathyroid hormone ( PTH ) in patients with renal failure but has several drawbacks , including hypercalcemia and / or marked suppression of bone turnover , which may lead to adynamic bone disease .'

In [None]:
unsuccessful_chunks = []
dat_shit = []

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Submit tasks to the executor
    future_to_prompt = {executor.submit(call_api, prompt): prompt for prompt in prompts[1:]}

    # Process results as they become available
    for future in future_to_prompt:
        prompt = future_to_prompt[future]
        next_tokens_labels = future.result()
        if not isinstance(next_tokens_labels, list):
          next_tokens_labels = [[token, ''] for token in prompt.split('\n')[1].split()]
        with open('BC5CDR-D_devel.csv', mode='a', newline='') as file:
          writer = csv.writer(file)
          # writer.writerow(['token', 'label'])  # Writing the header
          for token_label in next_tokens_labels:
            writer.writerow(token_label)

<class 'SyntaxError'> '[' was never closed (<unknown>, line 1)
<class 'SyntaxError'> unterminated string literal (detected at line 1) (<unknown>, line 1)
<class 'SyntaxError'> '[' was never closed (<unknown>, line 1)
<class 'SyntaxError'> '[' was never closed (<unknown>, line 1)
<class 'SyntaxError'> invalid syntax (<unknown>, line 1)
<class 'SyntaxError'> closing parenthesis ']' does not match opening parenthesis '(' (<unknown>, line 1)
<class 'SyntaxError'> invalid syntax. Perhaps you forgot a comma? (<unknown>, line 1)
<class 'SyntaxError'> unterminated string literal (detected at line 1) (<unknown>, line 1)
<class 'SyntaxError'> unterminated string literal (detected at line 1) (<unknown>, line 1)
<class 'SyntaxError'> '[' was never closed (<unknown>, line 1)
<class 'SyntaxError'> '[' was never closed (<unknown>, line 1)
<class 'SyntaxError'> leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal integers (<unknown>, line 1)
<class 'SyntaxError'> '['

In [None]:
next_tokens_labels

[['The', 'O'],
 ['animals', 'O'],
 ['received', 'O'],
 ['supplemental', 'O'],
 ['phosphate', 'O'],
 ['to', 'O'],
 ['enhance', 'O'],
 ['PTH', 'B'],
 ['secretion', 'O'],
 ['.', 'O'],
 ['Fourteen', 'O'],
 ['weeks', 'O'],
 ['after', 'O'],
 ['the', 'O'],
 ['start', 'O'],
 [Ellipsis],
 ['an', 'O'],
 ['additional', 'O'],
 ['eight', 'O'],
 ['months', 'O'],
 ['.', 'O']]

In [None]:
retry_prompts = []

df = pd.read_csv('retry_prompts.csv')
for _, row in df.iterrows():
  prompt = row['prompt']
  retry_prompts.append(prompt)
print(retry_prompts[0])

Given a biomedical text, perform Named Entity Recognition analysis on this text, focusing on identifying gene entities. Individual tokens are separated by white space and must be given exactly one label: ‘B’,  ‘I’, or ‘O’, according to the BIO labeling schema. The result should only contain a Python list of lists, where each inner list is [token, the token's label]. Output each token and label in the order the tokens appear. The text is below:
Such studied acquired with low energy or medium energy collimation and a window centered on the 159 keV 123I photopeak contain appreciable septal breakthrough signals originating from Compton scatter of high energy photons primarily from 124I . Dissolution of the Pt - 30 % Ir microelectrode tip was observed by scanning electron microscopy at charge densities as low as 200 microC / cm2 X ph ( 1 A / cm2 ) , whereas erosion of activated iridium microelectrodes occurred only at the highest charge and current densities ( 3200 microC / cm2 X ph , 16 A 

In [None]:
success_tokens, success_labels = [], []
unsuccessful_chunks = []

with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
    # Submit tasks to the executor
    future_to_prompt = {executor.submit(call_api, prompt): prompt for prompt in retry_prompts}

    # Process results as they become available
    for future in concurrent.futures.as_completed(future_to_prompt):
        prompt = future_to_prompt[future]
        try:
            response = future.result()
            if response:
              for t, l in response:
                success_tokens.append(t)
                success_labels.append(l)
              print(len(response), response)
            else:
              unsuccessful_chunks.append(prompt)
        except Exception as exc:
            pass


In [None]:
p1 = p2 = 0

while p1 < len(success_tokens) or p2 < len(success_labels):
  t = success_tokens[p1]
  l = success_labels[p2] if p2 < len(success_labels) else None
  print(t, l)
  p1 += 1
  p2 += 1

print(len(success_tokens))
print(len(success_labels))


0
0


In [None]:
from google.colab import files

results_df = pd.DataFrame({'token': success_tokens, 'label': success_labels})
csv_filename = 'tokens_labels.csv'
results_df.to_csv(csv_filename, index=False)
files.download(csv_filename)


retry_prompts_df = pd.DataFrame({'prompt': unsuccessful_chunks})
retry_csv_filename = 'retry_prompts.csv'
retry_prompts_df.to_csv(retry_csv_filename, index=False)
files.download(retry_csv_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Batch GPT API Requests

In [None]:
import asyncio
import nest_asyncio
import aiohttp
import pandas as pd
from aiohttp import ClientSession, ClientTimeout
import concurrent.futures

In [None]:
INSTRUCTION = "Perform Named Entity Recognition on each token of the following biomedical text, focusing on genes entities. Each token should be labeled as: 'B' for the beginning of a gene, 'I' for the inside or continuation of a gene, 'O' for tokens that do not belong to a gene entity. Provide the result as a list of label strings. The biomedical text to label is as follows:"

# Async function to make a single API request
async def fetch(session, url, payload, headers):
    timeout = ClientTimeout(total=10)  # Adjust the timeout as needed
    try:
        async with session.post(url, json=payload, headers=headers, timeout=timeout) as response:
            if response.status == 200:
                return await response.json()
            else:
                # Handle non-200 responses
                return {"error": f"HTTP status {response.status}"}
    except Exception as e:
        # Handle other exceptions like timeouts
        return {"error": str(e)}

# Function to process a chunk of data
async def process_chunk(semaphore, session, chunk, url, headers):
    async with semaphore:
        # Concatenate tokens to form text
        text = ' '.join(chunk)
        prompt = f"""{INSTRUCTION}\n{text}"""
        payload = {
            "model": "gpt-3.5-turbo",
            "top_p": 10e-9,
            "temperature": 0,
            "messages": [
            {"role": "system", "content": "You are an expert at annotating Named Entity Recognition datasets in the biomedical domain."},
            {"role": "user", "content": prompt}]
        }
        response = await fetch(session, url, payload, headers)
        return response

In [None]:
# Main function to process the entire dataset
async def process_dataset(df, chunk_size):
    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Authorization": "Bearer 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",
        "Content-Type": "application/json"
    }

    # Semaphore for rate limiting
    semaphore = asyncio.Semaphore(100)  # Adjust as per your rate limit

    async with ClientSession() as session:
        tasks = []
        for i in range(0, len(df), chunk_size):
            chunk = df[i:i+chunk_size]
            task = asyncio.ensure_future(process_chunk(semaphore, session, chunk, url, headers))
            tasks.append(task)

            # if (i // chunk_size) % 10 == 9:  # Adjust the condition as needed
            #     print(f"Processed {i // chunk_size + 1} chunks so far")

        responses = await asyncio.gather(*tasks)
        return responses


In [None]:
chunk_size=500

# Instead of asyncio.run, do the following:
loop = asyncio.get_event_loop()
if loop.is_running():
    # In environments like Jupyter, you need to use nest_asyncio
    nest_asyncio.apply()
    task = loop.create_task(process_dataset([str(token) for token in BC2GM_devel_df['token'].tolist()], chunk_size))
    responses = await task  # Await the task in the same cell
else:
    responses = loop.run_until_complete(process_dataset(BC2GM_devel_df['token'], chunk_size))


error 
error 
error 
error 
error 
error 
error 
error 
error 
error 


In [None]:
responses

[{'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error': ''},
 {'error