In [1]:
import os
import getpass
import json
import openai
from collections import defaultdict
import numpy as np
from rich.console import Console
from rich.table import Table

In [2]:
from datasets import load_dataset

In [4]:
import pandas as pd

In [30]:
import csv
import re

def _read_lines(path):
    """Return all lines of the text file at ``path`` (line terminators kept).

    The file is opened in a ``with`` block so the handle is closed as soon as
    the lines have been read, instead of being left open for the lifetime of
    the kernel.
    """
    with open(path, 'r') as handle:
        return handle.readlines()


# train_text.txt - one sequence per line; words separated by whitespace
f_train = _read_lines('covidnews-ner/dataset/train/train_text.txt')

# train_label_dist.txt - one sequence per line; labels separated by whitespace
f_labels = _read_lines('covidnews-ner/dataset/train/train_label_dist.txt')

# types.txt - one entity type per line, lists all entities
f_types = _read_lines('covidnews-ner/dataset/types.txt')

In [22]:
# Show the first raw sentence and its label sequence, one after the other.
print(f_train[0], f_labels[0], sep="\n")

Jakarta ( ANTARA ) - as many as 419 confirmed patients positive COVID-19 in COVID-19 Wisma athlete Kemayoran , Central Jakarta to date , still undergoing maintenance or hospitalization .

B-location O O O O O O O B-person I-person I-person O B-disease O B-disease O O O O O B-location O O O O O O O O O



We first want to transform the raw data into a DataFrame containing:
- the complete sentences
- the complete label sequences

Possibly also include:
- the tokenized sentences
- the tokenized labels

for use in preprocessing/cleaning.

In [26]:
# Pair every raw sentence line with its raw label line, one row per sequence.
# Both columns hold the lines exactly as read from disk.
data_train = pd.DataFrame(dict(sentences=f_train, labels=f_labels))

In [67]:
# Load the OpenAI API key from the first line of a local text file and expose
# it through the environment. The file is read inside a `with` block so the
# handle is closed, and the line is stripped because readlines() keeps the
# trailing newline, which must not become part of the key.
with open('../../../../Desktop/openai_apikey.txt', 'r') as key_file:
    os.environ['OPENAI_API_KEY'] = key_file.readlines()[0].strip()

In [68]:
# Inspect the entity types as read from types.txt (raw lines, one per type).
print(f_types)

['person\n', 'location\n', 'organization\n', 'time\n', 'disease\n', 'virus\n', 'product\n', 'animal\n', 'symptom\n', 'bacterium\n']


In [69]:
# Resolve the API key: take it from the environment when the variable is set,
# otherwise ask for it interactively without echoing the input.
OPENAI_API_KEY = (
    os.environ["OPENAI_API_KEY"]
    if "OPENAI_API_KEY" in os.environ
    else getpass.getpass("Please enter your OpenAI API key: ")
)

In [70]:
# Hand the resolved key to the openai client module for all subsequent calls.
openai.api_key = OPENAI_API_KEY

In [73]:
# Default request settings forwarded to every chat completion call.
openai_query_params = dict(model="gpt-3.5-turbo", temperature=0, max_tokens=1024)


def ask_openai(prompt: str, openai_query_params=openai_query_params) -> str:
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the ``content`` field of one ``user`` message.
        openai_query_params: Keyword arguments forwarded unchanged to
            ``openai.ChatCompletion.create`` (defaults to the module-level
            settings captured when this function was defined).

    Returns:
        The ``content`` of the message in the first returned choice.
    """
    user_turn = {"role": "user", "content": prompt}
    completion = openai.ChatCompletion.create(
        messages=[user_turn],
        **openai_query_params,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]

In [74]:
# Smoke test: one round trip to confirm the key and request settings work.
print(ask_openai("Hello, are you here?"))

Yes, I am here. How may I assist you?


In [80]:
# Example input: the first training sentence with leading/trailing newline
# characters removed (readlines() keeps the line terminator).
sentence = f_train[0].strip('\n')

In [84]:
# First prompt variant for BIO annotation of `sentence`.
# - `sentence` is interpolated when this cell runs, so re-run the cell after
#   changing `sentence`.
# - Requests a JSON reply with the keys SENTENCE, ANNOTATION, TOKENS_SENTENCE
#   and TOKENS_LABELS, and asks the model to make the two counts match.
# NOTE(review): the entity list is hard-coded here and mirrors the contents of
# types.txt printed earlier; keep the two in sync.
# NOTE(review): the TOKENS_SENTENCE line of the prompt has an unclosed "(".
ex_prompt = f"""
In the sentence which I will state below between the tags <START SENTENCE> and <END SENTENCE>,
annotate the following entities in BIO format,
where any punctuation that is enclosed by whitespace on both sides is annotated with "O":
- person
- location
- organization
- time
- disease
- virus
- product
- animal
- symptom
- bacterium.
Format the output in json with the following keys:
- SENTENCE for the sentence I state below,
where the formatting including all whitespace should be kept as is, i.e., not corrected
- ANNOTATION for the generated annotation,
where entity labels should look like "B-location" and the annotation consists simply of a single string 
where all entity labels are joined, separated by a single whitespace.
Note that there have to be at least as many labels as there are tokens in the sentence,
- TOKENS_SENTENCE the number of tokens (words and punctuation in the sentence that is annotated,
- TOKENS_LABELS the number of tokens in the generated annotation saved in ANNOTATION.
 <START SENTENCE>{sentence}<END SENTENCE> Ensure that TOKENS_SENTENCE equals TOKENS_LABELS.
 """

In [85]:
# Send the current `ex_prompt` to the model and print the raw reply text.
print(ask_openai(ex_prompt))

{
    "SENTENCE": "Jakarta ( ANTARA ) - as many as 419 confirmed patients positive COVID-19 in COVID-19 Wisma athlete Kemayoran , Central Jakarta to date , still undergoing maintenance or hospitalization .",
    "ANNOTATION": "B-location O O O O O O O O B-disease I-disease B-location I-location I-location O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O

In [94]:
# Second prompt variant; rebinds `ex_prompt`.
# Compared with the earlier prompt it drops the punctuation rule and the
# TOKENS_SENTENCE / TOKENS_LABELS keys, and instead states up front that there
# should be one label per word. The JSON reply is asked to contain only
# SENTENCE and ANNOTATION.
# `sentence` is interpolated when this cell runs.
ex_prompt = f"""
For the sentence which I will state below between the tags <START SENTENCE> and <END SENTENCE>,
annotate the following entities in BIO format:
- person
- location
- organization
- time
- disease
- virus
- product
- animal
- symptom
- bacterium.
Remember that there should be as many entity labels as there are words in the sentence.
Format the output in json with the following keys:
- SENTENCE for the sentence I state below,
- ANNOTATION for the generated annotation,
where entity labels should look like "B-location" and the annotation consists simply of a single string 
where all entity labels are joined, separated by a single whitespace.
 Sentence below: <START SENTENCE>{sentence}<END SENTENCE>
 """

In [95]:
# Send the current `ex_prompt` to the model and print the raw reply text.
print(ask_openai(ex_prompt))

{
    "SENTENCE": "Jakarta ( ANTARA ) - as many as 419 confirmed patients positive COVID-19 in COVID-19 Wisma athlete Kemayoran , Central Jakarta to date , still undergoing maintenance or hospitalization .",
    "ANNOTATION": "B-location O O O O O O B-disease I-disease B-location I-location O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O 

In [96]:
# Display the exact input sentence for comparison with the model's reply.
sentence

'Jakarta ( ANTARA ) - as many as 419 confirmed patients positive COVID-19 in COVID-19 Wisma athlete Kemayoran , Central Jakarta to date , still undergoing maintenance or hospitalization .'