# Fine-Tuning NER Model for Extracting Medical Conditions and Medicines
#### This code fine-tunes a model to extract medical conditions and medicines from patient reports.
#### It utilizes OpenAI's GPT-3.5-turbo model and the Chat API for interaction.

# Setup

In [None]:
# Install the OpenAI Python package, upgrading it if an older version is already present.
# The leading "!" is notebook syntax that hands the rest of the line to the shell.
!pip install --upgrade openai

Collecting openai
  Downloading openai-1.28.0-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.1/320.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, openai
Successfully installed h11-0.14.0 httpcore-1.0.5 ht

In [None]:
#Importing the required libraries
import csv
import json
import openai
import os
import pandas as pd
from pprint import pprint

# Create the OpenAI client that the API calls in this notebook go through.
# The API key is read from the OPENAI_API_KEY environment variable; when that
# variable is not set, the placeholder string " " is passed instead, so either
# export the variable or replace the placeholder with your own key.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", " "))

# Data Loading and Preprocessing

In [None]:
# Load the annotated examples from the medical data file.
# The encoding is pinned so non-ASCII characters in the reports are decoded
# the same way on every platform.
with open('medical_data.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Comma-separated label strings, keyed by example ID
medical_conditions_dict = {}
medicines_dict = {}
# Report text keyed by example ID, collected in the same pass so the CSV
# writer does not have to rescan the whole example list for every row
content_by_id = {}

# Walk the examples once and collect the id, content, medical conditions and medicines
for example in data['examples']:
    example_id = example['id']
    # Dicts are used as ordered sets: keys drop duplicates but keep first-seen
    # order, so the label strings are identical from run to run (a plain set
    # of strings may iterate in a different order in every Python process).
    medical_conditions = {}
    medicines = {}
    for annotation in example['annotations']:
        # Strip surrounding whitespace so "diarrhea" and "diarrhea " collapse
        # into one label, and drop blank values so no empty item ends up in
        # the joined string (e.g. a leading ", ").
        value = annotation['value'].strip()
        if not value:
            continue
        tag_name = annotation['tag_name']
        if tag_name == 'MedicalCondition':
            medical_conditions[value] = None
        elif tag_name == 'Medicine':
            medicines[value] = None
    # Add medical conditions and medicines to respective dictionaries
    medical_conditions_dict[example_id] = ', '.join(medical_conditions)
    medicines_dict[example_id] = ', '.join(medicines)
    # If an ID occurs more than once, keep the content of its first occurrence
    content_by_id.setdefault(example_id, example['content'])

# Write the extracted data to a CSV file. The context manager closes the file
# even when a row fails to serialise; newline='' is what the csv module expects.
with open('medical_data_updated.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['ID', 'Content', 'Medical Conditions', 'Medicines'])
    for example_id, content in content_by_id.items():
        csv_writer.writerow([
            example_id,
            content,
            medical_conditions_dict[example_id],
            medicines_dict[example_id],
        ])

# Load the CSV into a Pandas DataFrame for further processing.
# keep_default_na=False keeps empty label cells as empty strings; by default
# pandas would turn them into NaN, which is rendered as the text "nan" when a
# row is later formatted into a training example.
medical_df = pd.read_csv(
    'medical_data_updated.csv',
    encoding='utf-8',
    keep_default_na=False,
)
medical_df.head()

Unnamed: 0,ID,Content,Medical Conditions,Medicines
0,18c2f619-f102-452f-ab81-d26f7e283ffe,While bismuth compounds (Pepto-Bismol) decreas...,", diarrhea, flatulence, constipation, diarrhea","Racecadotril, aluminomagnesium silicate, kaope..."
1,487c93e3-0d45-4088-a378-cf3a01c8953d,"Diarrhea, also spelled diarrhoea, is the condi...","Diarrhea, dehydration, decrease in responsiven...",
2,d5056874-895a-4a7f-9e0f-828d414d65d9,Antiretroviral therapy (ART) is recommended fo...,,"Antiretroviral therapy, ART"
3,20c792c7-0c4b-42d0-8127-0e04113db384,The following drugs are considered as DMARDs: ...,"rheumatoid arthritis, abdominal pain, DMARDs.,...","sulfasalazine, leflunomide, tocilizumab, Leflu..."
4,f5359e0d-4d4a-4707-95a3-4c627fc4a83b,"The goals of treatment are to reduce pain, dec...",,"steroids, Biological DMARDs, hydroxychloroquin..."


# Defining Conversation Structure
#### First, the conversation structure is defined, including system messages, user messages, and assistant responses.


In [None]:
#Define the system message for the conversation
system_message = "You are a helpful medical assistant. You are to extract the medical conditions and medicines from each patient report or medical content."

#Define the function to create the user message for each conversation example
def create_user_message(row):
    """Return the user prompt for one example.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with 'ID' and
            'Content' entries.

    Returns:
        The text "ID: <id>", a blank line, then "Content: <content>".
    """
    return f"""ID: {row['ID']}\n\nContent: {row['Content']}"""

def _label_text(value):
    """Return a label cell as text, mapping a missing value (None/NaN) to ''."""
    # Empty CSV cells come back from pandas as NaN by default; formatting NaN
    # directly would put the literal word "nan" into the training target.
    return "" if pd.isna(value) else str(value)

#Define the function to prepare the conversation example
def prepare_example_conversation(row):
    """Build one chat-formatted fine-tuning example from a data row.

    Args:
        row: Mapping-like object with 'ID', 'Content', 'Medical Conditions'
            and 'Medicines' entries. Missing label values (None or NaN) are
            written as an empty list of labels.

    Returns:
        A dict {"messages": [...]} holding the system message, the user
        prompt and the expected assistant answer, in that order.
    """
    conditions = _label_text(row['Medical Conditions'])
    medicines = _label_text(row['Medicines'])

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": create_user_message(row)},
        {
            "role": "assistant",
            "content": f"Medical conditions: {conditions}\n\nMedicines: {medicines}",
        },
    ]

    return {"messages": messages}

# Pretty-print the conversation built from the first DataFrame row as a
# sanity check of the prompt (user message) and target (assistant message)
# that the model will be trained on.
pprint(prepare_example_conversation(medical_df.iloc[0]))

{'messages': [{'content': 'You are a helpful medical assistant. You are to '
                          'extract the medical conditions and medicines from '
                          'each patient report or medical content.',
               'role': 'system'},
              {'content': 'ID: 18c2f619-f102-452f-ab81-d26f7e283ffe\n'
                          '\n'
                          'Content: While bismuth compounds (Pepto-Bismol) '
                          'decreased the number of bowel movements in those '
                          "with travelers' diarrhea, they do not decrease the "
                          'length of illness.[91] Anti-motility agents like '
                          'loperamide are also effective at reducing the '
                          'number of stools but not the duration of '
                          'disease.[8] These agents should be used only if '
                          'bloody diarrhea is not present.[92]\n'
                          '\n'
       

# Preparing Training Data

In [None]:
# Placeholder for the training examples; it is replaced by the real list below
training_data = []

# Training rows: labels 0 through 20 (a `.loc` slice includes both endpoints)
training_df = medical_df.loc[0:20]

# Convert every training row into one chat-formatted example
training_data = (
    training_df
    .apply(prepare_example_conversation, axis=1)
    .tolist()
)

# Preparing Validation Data to Detect Overfitting

In [None]:
# Use the rows that follow the training rows for validation.
# `.loc` slices by label and includes BOTH endpoints, so the training slice
# `.loc[0:20]` already contains row 20. Validation therefore starts at row 21:
# a row present in both sets would make the validation loss look better than
# it really is.
validation_df = medical_df.loc[21:31]
validation_data = validation_df.apply(prepare_example_conversation, axis=1).tolist()

# Writing Data to JSONL Files


In [None]:
#Define a function to write data into JSONL file
def write_jsonl(data_list: list, filename: str) -> None:
    """Write each item of `data_list` to `filename` as one JSON document per line.

    The file is created or overwritten; every line ends with a newline.
    """
    with open(filename, "w") as out:
        out.writelines(json.dumps(record) + "\n" for record in data_list)

In [None]:
# Persist both datasets as `.jsonl` files: one example conversation per line.

# Training examples
training_file_name = "tmp_medical_finetune_training.jsonl"
write_jsonl(data_list=training_data, filename=training_file_name)

# Validation examples
validation_file_name = "tmp_medical_finetune_validation.jsonl"
write_jsonl(data_list=validation_data, filename=validation_file_name)

# Uploading the JSONL Files to the OpenAI Files Endpoint for Use by the Fine-Tuning Job

In [None]:
# Upload both JSONL files to the OpenAI `Files` endpoint with purpose
# "fine-tune", keeping the returned file IDs for the fine-tuning job.

with open(training_file_name, "rb") as training_fd:
    training_response = client.files.create(file=training_fd, purpose="fine-tune")
    training_file_id = training_response.id

with open(validation_file_name, "rb") as validation_fd:
    validation_response = client.files.create(file=validation_fd, purpose="fine-tune")
    validation_file_id = validation_response.id

print(f"Training file ID: {training_file_id}")
print(f"Validation file ID: {validation_file_id}")

Training file ID: file-gR1Qs8ztiBf0I4CAoNlYbl5M
Validation file ID: file-eR0GJtLvj4QqntL4WpGUGfle


# Fine-Tuning the Model
### Creating Fine-Tuning Job using Training and Validation Files

In [None]:
# Start a fine-tuning job on the uploaded files. The base model is
# "gpt-3.5-turbo" and the suffix "medical-ner" is passed along with the request.
response = client.fine_tuning.jobs.create(
    model="gpt-3.5-turbo",
    suffix="medical-ner",
    training_file=training_file_id,
    validation_file=validation_file_id,
)

# Keep the job ID: the later cells use it to poll the job
job_id = response.id

print("Job ID:", job_id)
print("Status:", response.status)

Job ID: ftjob-nwjZL96lPsrcA3F50tR3Fyi5
Status: validating_files


### Checking Job Status

In [None]:
# Fetch the current state of the fine-tuning job and report it
response = client.fine_tuning.jobs.retrieve(job_id)

print(f"Job ID: {response.id}")
print(f"Status: {response.status}")
print(f"Trained Tokens: {response.trained_tokens}")

Job ID: ftjob-nwjZL96lPsrcA3F50tR3Fyi5
Status: running
Trained Tokens: None


### Tracking Fine-Tuning Progress

In [None]:
# Fetch the events recorded for the fine-tuning job
response = client.fine_tuning.jobs.list_events(job_id)

# Flip the returned order in place before printing
# (presumably the API lists the newest event first — TODO confirm)
events = response.data
events.reverse()

# Print one message per event
for event in events:
    message = event.message
    print(message)

Step 69/84: training loss=0.94, validation loss=0.90
Step 70/84: training loss=0.49, validation loss=0.80
Step 71/84: training loss=0.01, validation loss=0.56
Step 72/84: training loss=0.00, validation loss=0.43
Step 73/84: training loss=0.25, validation loss=1.13
Step 74/84: training loss=0.49, validation loss=1.10
Step 75/84: training loss=0.89, validation loss=0.42
Step 76/84: training loss=0.77, validation loss=0.94
Step 77/84: training loss=1.10, validation loss=1.29
Step 78/84: training loss=0.00, validation loss=0.24
Step 79/84: training loss=0.10, validation loss=0.91
Step 80/84: training loss=0.00, validation loss=0.89
Step 81/84: training loss=0.16, validation loss=0.84
Step 82/84: training loss=0.56, validation loss=0.56
Step 83/84: training loss=0.00, validation loss=0.43
Step 84/84: training loss=0.36, validation loss=1.10, full validation loss=0.80
Checkpoint created at step 42 with Snapshot ID: ft:gpt-3.5-turbo-0125:personal:medical-ner:9NTMGfbR:ckpt-step-42
Checkpoint c

### Getting Fine-Tuned Model ID from the Job


In [None]:
# Re-read the job and pick up the ID of the model it produced
response = client.fine_tuning.jobs.retrieve(job_id)
fine_tuned_model_id = response.fine_tuned_model

# The job reports no model ID yet: stop here rather than continue without a model
if fine_tuned_model_id is not None:
    print("Fine-tuned model ID:", fine_tuned_model_id)
else:
    raise RuntimeError("Fine-tuned model ID not found. Your job has likely not been completed yet.")

Fine-tuned model ID: ft:gpt-3.5-turbo-0125:personal:medical-ner:9NTMHSlu


# Defining a Chat Function to Interact with the Fine-Tuned Model

In [None]:
def chat_with_model(content):
    """Send one medical text to the fine-tuned model and return its answer.

    Args:
        content: Text of the medical report; it is sent as "Content: <content>".

    Returns:
        The message content of the first choice in the Chat API response.
    """
    # NOTE(review): the training prompts start with an "ID: ..." line before
    # "Content: ...", while this prompt has no ID line — confirm the
    # difference is intended.
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": f"Content: {content}"},
    ]

    # Query the fine-tuned model with temperature 0 and at most 500 output tokens
    completion = client.chat.completions.create(
        model=fine_tuned_model_id,
        messages=conversation,
        temperature=0,
        max_tokens=500,
    )

    return completion.choices[0].message.content

# Prompting the AI with a Medical Report to Extract the Medical Conditions and Medicines Mentioned:

In [None]:
# Free-text report mentioning several conditions and medicines.
# The trailing backslashes continue the string literal on the next line, so the
# whole report is a single line of text; each sentence keeps its trailing space.
content1 = "The patient was prescribed Aspirin for their heart condition. The doctor recommended Ibuprofen to alleviate the patient's headache. \
The patient is suffering from diabetes, and they need to take Metformin regularly. After the surgery, the patient experienced some post-operative complications, including infection. \
The patient is currently on a regimen of Lisinopril to manage their high blood pressure. The antibiotic course for treating the bacterial infection should be completed as prescribed. \
The patient's insulin dosage needs to be adjusted to better control their blood sugar levels. The physician suspects that the patient may have pneumonia and has ordered a chest X-ray. \
The patient's cholesterol levels are high, and they have been advised to take Atorvastatin. The allergy to penicillin was noted in the patient's medical history."
# Ask the fine-tuned model for the entities and show its answer
response1 = chat_with_model(content1)
print("Extracted Medical Conditions and Medicines:\n")
print(response1)

Extracted Medical Conditions and Medicines:

Medical conditions: heart condition, headache, diabetes, post-operative complications, infection, high blood pressure, bacterial infection, blood sugar levels, pneumonia, cholesterol levels, allergy

Medicines: Aspirin, Ibuprofen, Metformin, Lisinopril, Atorvastatin, penicillin


# Prompting the AI with an Example that Mentions One Major Medical Condition and Different Medicines Used to Treat It and Its Symptoms:

In [None]:
# Report centred on one condition (arthritis), its symptoms and the medicines used to treat it.
# The trailing backslashes continue the string literal on the next line, so the
# whole report is a single line of text.
content2 = "The patient has been diagnosed with arthritis, a chronic autoimmune disorder that primarily affects the joints. \
Arthritis causes inflammation, pain, and swelling in the affected joints, leading to stiffness and decreased mobility. \
To manage the symptoms and slow the progression of the disease, the patient's treatment plan includes a combination of medications. \
They are prescribed methotrexate, a disease-modifying antirheumatic drug (DMARD), which helps to suppress the immune system's abnormal response and reduce inflammation in the joints. \
In addition to methotrexate, the patient also takes nonsteroidal anti-inflammatory drugs (NSAIDs) such as ibuprofen or naproxen to alleviate pain and inflammation. \
Corticosteroids like prednisone may be used in the short term to provide quick relief during flare-ups. \
Furthermore, the patient is advised to take biologic DMARDs like adalimumab or etanercept, which target specific components of the immune system involved in the inflammatory process. \
Along with medication, the patient is encouraged to engage in regular exercise, physical therapy, and lifestyle modifications to improve joint function and overall quality of life."
# Ask the fine-tuned model for the entities and show its answer
response2 = chat_with_model(content2)
print("Extracted Medical Conditions and Medicines:\n")
print(response2)

Extracted Medical Conditions and Medicines:

Medical conditions: arthritis, inflammation, pain, swelling, stiffness, decreased mobility, flare-ups

Medicines: methotrexate, ibuprofen, naproxen, prednisone, adalimumab, etanercept


# Prompting the AI with an Example that Mentions One Medicine and the Different Medical Conditions It Is Used to Treat:

In [None]:
# Prompting the AI with an Example that Mentions One Medicine and the Different Medical Conditions It Is Used to Treat:

In [None]:
# Report centred on one medicine (ibuprofen) and the conditions it is used to treat.
# The trailing backslashes continue the string literal on the next line, so the
# whole report is a single line of text.
content3 = "Ibuprofen, a nonsteroidal anti-inflammatory drug (NSAID), is commonly prescribed to alleviate pain, reduce inflammation, and lower fever. \
It is a versatile medication used to treat various conditions across different age groups. \
In adults, ibuprofen is often recommended for managing mild to moderate pain associated with headaches, toothaches, menstrual cramps, and musculoskeletal injuries such as sprains or strains. \
Additionally, it is effective in relieving pain and swelling caused by conditions like arthritis, including osteoarthritis and rheumatoid arthritis. \
In children, ibuprofen is frequently used to reduce fever and relieve discomfort due to common illnesses such as colds, flu, or sore throat. \
It can also help alleviate pain from teething, ear infections, or minor injuries. \
Furthermore, ibuprofen is sometimes prescribed to alleviate symptoms of inflammatory conditions like tendonitis or bursitis. \
Overall, ibuprofen is a widely available and well-tolerated medication that provides relief for a range of medical issues."
# Ask the fine-tuned model for the entities and show its answer
response3 = chat_with_model(content3)
print("Extracted Medical Conditions and Medicines:\n")
print(response3)

Extracted Medical Conditions and Medicines:

Medical conditions: tendonitis, bursitis, colds, flu, teething, ear infections, menstrual cramps, musculoskeletal injuries, toothaches, osteoarthritis, rheumatoid arthritis

Medicines: Ibuprofen


# Prompting the AI with Formatted Medical Reports to Extract the Medical Conditions and Medicines Mentioned:

In [None]:
# Formatted report with one field per line.
# Each line ends with an explicit "\n". A bare backslash at the end of a line
# inside a string literal removes the line break altogether, which would glue
# the fields together ("Patient Name: Sarah JohnsonAge: 45Gender: ...").
content4 = (
    "Patient Name: Sarah Johnson\n"
    "Age: 45\n"
    "Gender: Female\n"
    "Medical Conditions: Hypertension, Type 2 Diabetes\n"
    "Prescribed Medicines:\n"
    "- Lisinopril (10 mg daily) for hypertension management\n"
    "- Metformin (500 mg twice daily) for type 2 diabetes control\n"
    "- Aspirin (81 mg daily) for cardiovascular health"
)
# Ask the fine-tuned model for the entities and show its answer
response4 = chat_with_model(content4)
print("Extracted Medical Conditions and Medicines:\n")
print(response4)

Extracted Medical Conditions and Medicines:

Medical conditions: Hypertension, Type 2 Diabetes

Medicines: Lisinopril, Metformin, Aspirin


In [None]:
# Formatted report with one field per line.
# Each line ends with an explicit "\n". A bare backslash at the end of a line
# inside a string literal removes the line break altogether, which would glue
# the fields together ("Patient Name: John SmithAge: 60Gender: ...").
content5 = (
    "Patient Name: John Smith\n"
    "Age: 60\n"
    "Gender: Male\n"
    "Medical Conditions: Osteoarthritis, High Cholesterol\n"
    "Prescribed Medicines:\n"
    "- Ibuprofen (200 mg three times daily) for pain relief due to osteoarthritis\n"
    "- Atorvastatin (20 mg daily) for lowering cholesterol levels\n"
    "- Glucosamine-Chondroitin supplement (1500 mg daily) for joint health"
)
# Ask the fine-tuned model for the entities and show its answer
response5 = chat_with_model(content5)
print("Extracted Medical Conditions and Medicines:\n")
print(response5)

Extracted Medical Conditions and Medicines:

Medical conditions: Osteoarthritis, High Cholesterol

Medicines: Ibuprofen, Atorvastatin, Glucosamine-Chondroitin supplement
