### Environment Setup

In [71]:
# Install required libraries
# !pip install openai pandas scikit-learn

# Import necessary libraries
import os
import csv
import pandas as pd
from openai import OpenAI
from IPython.display import display, HTML
from sklearn.metrics import precision_score, recall_score, f1_score
from concurrent.futures import ThreadPoolExecutor, as_completed

# Initialize OpenAI client
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("API key not found. Please set OPENAI_API_KEY as an environment variable.")

client = OpenAI(api_key=api_key)
MODEL = 'o1-preview'

### Synthetic Data Generation

#### Function: `generate_data`

The `generate_data` function is responsible for creating 100 rows of detailed patient data in CSV format. The data includes realistic clinical scenarios with intentional errors for validation purposes. Each row contains:


In [72]:
def generate_data():
    messages = [
        {
           "role": "user",
           "content": """
                You are a helpful assistant designed to generate data. You will be given a format for the data to generate and some examples of the data.
                
                When generating Patient IDs, use the format 'P' followed by a three-digit number (e.g., P006, P941, P319).
                
                Intentionally make some mistakes in the data generation and document them in the appropriate columns ('Is Valid' and 'Issue') if the row of data is invalid.
                
                The types of mistakes to include are:
                - **Allergy Contradictions**: Prescribing a medication that the patient is allergic to (e.g., prescribing Penicillin to a patient allergic to Penicillin)
                - **Medical History and Medication Mismatch**: A patient with a medical condition not receiving appropriate medication (e.g., a diabetic patient not prescribed any diabetes medication)
                - **Lab Results and Diagnosis Mismatch**: Lab results that do not support the diagnosis (e.g., normal glucose levels but diagnosed with Diabetes Type 2)
                - **Clinical Judgment Errors**: Treatment plans that don't align with standard of care or miss critical interventions
                - **Documentation Gaps**: Missing essential elements in SOAP notes or incomplete clinical reasoning
                - **Risk Assessment Failures**: Failure to address high-risk clinical scenarios appropriately
                - **Care Coordination Issues**: Missing necessary referrals or follow-up plans
                - **Other Plausible Mistakes**: Any realistic errors in clinical documentation or decision-making

                Return 100 rows of data strictly in the format of a valid CSV, with comprehensive clinical documentation that allows for assessment of medical decision-making and treatment appropriateness.
                
                Generate Synthetic Medical Records Dataset with the following columns:
                   - Patient ID: A randomly generated patient id
                   - Date of Birth: Date of birth of the patient
                   - Gender: M/F
                   - Medical History: Past diagnoses
                   - Current Medications: Medication the patient is taking
                   - Allergies: Identified allergies
                   - Lab Results (Glucose mg/dL)
                   - Diagnoses: Current diagnosis
                   - Treatment Plan: Current treatment plan
                   - Encounter Notes: Comprehensive SOAP format clinical documentation
                   - Is Valid: Whether or not the current row of data is valid (True/False)
                   - Issue: If the row of data is not valid, what the issue is
                
                Patient ID,Date of Birth,Gender,Medical History,Current Medications,Allergies,Lab Results (Glucose mg/dL),Diagnoses,Treatment Plan,Encounter Notes,Is Valid,Issue
                P001,1980-05-14,F,"Hypertension; Type 2 Diabetes; Chronic Depression; Obesity (BMI 33)","Lisinopril 20mg daily; Metformin 1000mg BID; Sertraline 100mg daily",None,210,"Hypertension (Uncontrolled); Type 2 Diabetes (Uncontrolled); Depression (Stable)","Continue current medications; Diabetes education; Follow up 3 months",{"subjective":{"chief_complaint":"Routine follow-up for blood pressure and diabetes","history_present_illness":"Reports increasing thirst and fatigue, medication compliance affected by cost","review_of_systems":{"constitutional":"Fatigue present","cardiovascular":"No chest pain","endocrine":"Polyuria, polydipsia"},"medication_adherence":"Missing diabetes medications due to cost","lifestyle":{"diet":"Poor control","exercise":"Limited by fatigue","sleep":"Adequate"}},"objective":{"vital_signs":{"bp":"148/92","hr":"88","rr":"16","temp":"37.0","weight":"92 kg"},"physical_exam":{"general":"Well-appearing but fatigued","cardiovascular":"Regular rhythm","respiratory":"Clear","neurological":"Decreased foot sensation"},"labs":{"glucose":"210","hba1c":"8.8","basic_metabolic":"Within normal limits","lipids":"LDL 145"}},"assessment":{"diabetes":"Poor control, affected by medication adherence","hypertension":"Above goal, requiring adjustment","depression":"Currently stable on Sertraline"},"plan":{"medications":"No changes made","follow_up":"3 months","education":"Basic diabetes education provided"}},False,"Critical care gaps: 1) No medication adjustment despite uncontrolled HTN and DM; 2) No addressing of medication cost barriers; 3) No referral to diabetes education program; 4) Inadequate follow-up interval for poor control"
                P002,1970-03-22,M,"COPD (FEV1 45%); Coronary Artery Disease; Osteoarthritis","Tiotropium inhaler daily; Albuterol inhaler PRN; Aspirin 81mg daily; Atorvastatin 40mg daily","Penicillin; NSAIDs",122,"COPD Exacerbation; Acute Bronchitis","1. Prednisone 40mg daily x5 days; 2. Amoxicillin 875mg BID x7 days; 3. Follow up 1 week",{"subjective":{"chief_complaint":"Worsening shortness of breath x3 days","history_present_illness":"Increased cough, green sputum, using rescue inhaler more frequently","respiratory_symptoms":{"dyspnea":"Worsened","cough":"Productive, green","wheeze":"Present"},"functional_status":"Unable to perform ADLs due to SOB"},"objective":{"vital_signs":{"bp":"135/82","hr":"102","rr":"24","temp":"38.1","o2_sat":"92%"},"physical_exam":{"respiratory":"Diffuse wheezing, prolonged expiration","cardiac":"Tachycardic, regular","extremities":"No edema"},"peak_flow":"150 L/min (50% of baseline)"},"assessment":{"primary":"Moderate COPD exacerbation with likely bacterial infection","risk_factors":"Recent similar episodes, poor baseline function"},"plan":{"medications":"Prescribed antibiotics despite penicillin allergy","follow_up":"1 week","instructions":"Return if worsening"}},False,"Medication safety error: Prescribed Amoxicillin despite documented Penicillin allergy; requires alternative antibiotic choice"
                P003,2020-08-15,F,"Premature birth (32 weeks); Global Developmental Delay","Iron drops; Multivitamin","Milk protein",92,"Global Developmental Delay; Failure to Thrive","Continue current monitoring",{"subjective":{"chief_complaint":"18-month well-child check","interval_history":"Parents concerned about delayed milestones","developmental_concerns":{"motor":"Not walking independently","language":"No words","social":"Limited engagement"},"feeding":"Difficulties with solid foods"},"objective":{"vital_signs":{"weight":"8.2 kg (<3rd percentile)","height":"74 cm (5th percentile)","hc":"44 cm (10th percentile)"},"developmental_assessment":{"gross_motor":"12-month level","fine_motor":"12-month level","language":"9-month level","social":"10-month level"},"physical_exam":{"general":"Alert but not interactive","neurological":"Normal tone","growth":"Significant delay"}},"assessment":{"developmental":"Significant global delays","growth":"Failure to thrive","feeding":"Ongoing difficulties"},"plan":{"referrals":"None ordered","follow_up":"Routine well-child"}},False,"Critical care gaps: 1) No early intervention referral despite significant delays; 2) No specialty evaluation for FTT; 3) No therapy services initiated; 4) Follow-up inadequate for severity"
                P004,1965-09-30,M,"Bipolar I Disorder; Hypertension; Type 2 Diabetes","Lithium 900mg daily; Metformin 1000mg BID; Lisinopril 20mg daily",None,355,"Bipolar Disorder (Mixed Episode); Diabetic Ketoacidosis","Continue current medications; Outpatient follow-up",{"subjective":{"chief_complaint":"Increasing agitation and elevated mood","psychiatric_symptoms":{"mood":"Elevated with irritability","sleep":"2 hours nightly","behavior":"Aggressive","thought_content":"Grandiose delusions"},"medical_symptoms":{"polyuria":"Severe","polydipsia":"Present","vision":"Blurred"}},"objective":{"vital_signs":{"bp":"158/95","hr":"122","rr":"24","temp":"37.8","glucose":"355"},"mental_status":{"affect":"Labile","speech":"Pressured","thought_process":"Flight of ideas","insight":"Poor"},"labs":{"sodium":"148","bicarbonate":"18","anion_gap":"16","ketones":"Large"}},"assessment":{"psychiatric":"Acute mixed episode with agitation","medical":"Diabetic ketoacidosis requiring urgent intervention"},"plan":{"disposition":"Scheduled for outpatient follow-up","medications":"No changes","monitoring":"None arranged"}},False,"Severe patient safety issue: Diabetic ketoacidosis with psychiatric decompensation requires immediate hospitalization; current outpatient plan places patient at immediate risk"
            """
        }
    ]

    response = client.chat.completions.create(
        model=MODEL,
        messages=messages
    )

    return response.choices[0].message.content.replace('```csv', '').replace('```', '')

### Generating 300 Rows of Synthetic Data

To create a larger dataset, the `generate_data` function is executed three times, each time generating 100 rows of detailed patient data. The outputs are then combined into a single dataset containing 300 rows.
- The data is saved as `medicalData.csv`.

In [75]:
# Generate data three times using the existing dataGeneration function
generated_data = []
data = generate_data()
generated_data.extend(data.strip().split('\n'))

# Append the generated data to the medicalData.csv file
with open('./data/medicalData.csv', 'a', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    for row in generated_data:
        csvwriter.writerow(row.split(','))

print("Synthetic data generation and appending completed.")

Synthetic data generation and appending completed.


### Data Validation
We are using GPT-4o for cost and effiency, as the task is narrow enough for this model

In [76]:
def validate_data(input_data):
    messages = [
        {
            "role": "user",
            "content": f"""
You are a helpful assistant designed to validate the quality of medical datasets. You will be given a single row of medical data, and your task is to determine whether the data is valid.

- Carefully analyze the data for any inconsistencies, contradictions, missing values, or implausible information.
- Consider the logical relationships between different fields (e.g., treatments should be appropriate for the diagnoses, medications should not conflict with allergies, lab results should be consistent with diagnoses, etc.).
- Use your general medical knowledge to assess the validity of the data.
- Focus solely on the information provided without making assumptions beyond the given data.

**Return only a JSON object** with the following two properties:

- `"is_valid"`: a boolean (`true` or `false`) indicating whether the data is valid.
- `"issue"`: if `"is_valid"` is `false`, provide a brief explanation of the issue; if `"is_valid"` is `true`, set `"issue"` to `null`.

Both JSON properties must always be present.

Do not include any additional text or explanations outside the JSON object.

MEDICAL DATA:
{input_data}
            """
        }
    ]

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages
    )

    response_content = response.choices[0].message.content.replace('```json', '').replace('```', '').strip()
    
    try:
        if isinstance(response_content, dict):
            response_dict = response_content
        else:
            response_dict = json.loads(response_content)
        return response_dict
    except json.JSONDecodeError as e:
        print(f"Failed to decode JSON response: {response_content}")
        raise e

In [81]:
# Read the CSV file and exclude the last two columns
input_data = []
with open('./data/medicalData.csv', 'r') as file:
    reader = csv.reader(file)
    headers = next(reader)
    for row in reader:
        input_data.append(row[:-2])  # Exclude "Is Valid" and "Issue" columns

# Initialize lists to store true labels
true_is_valid = []
true_issues = []

# Extract true labels from the CSV file
with open('./data/medicalData.csv', 'r') as file:
    reader = csv.reader(file)
    headers = next(reader)
    for row in reader:
        true_is_valid.append(row[-2] == 'True')
        true_issues.append(row[-1])

# Function to validate a single row of data
def validate_row(row):
    input_str = ','.join(row)
    result_json = validate_data(input_str)
    return result_json

# Validate data rows and collect results
pred_is_valid = [False] * len(input_data)
pred_issues = [''] * len(input_data)

with ThreadPoolExecutor() as executor:
    futures = {executor.submit(validate_row, row): i for i, row in enumerate(input_data)}
    
    for future in as_completed(futures):
        i = futures[future]  # Get the index of the current row
        result_json = future.result()
        pred_is_valid[i] = result_json['is_valid']
        pred_issues[i] = result_json['issue']

#### Compare the results against the source of truth and determine the system's accuracy

In [82]:
# Convert predicted and true 'is_valid' labels to boolean if they aren't already
pred_is_valid_bool = [bool(val) if isinstance(val, bool) else val == 'True' for val in pred_is_valid]
true_is_valid_bool = [bool(val) if isinstance(val, bool) else val == 'True' for val in true_is_valid]

# Calculate precision, recall, and f1 score for the 'is_valid' prediction
precision = precision_score(true_is_valid_bool, pred_is_valid_bool, pos_label=True)
recall = recall_score(true_is_valid_bool, pred_is_valid_bool, pos_label=True)
f1 = f1_score(true_is_valid_bool, pred_is_valid_bool, pos_label=True)

# Initialize issue_matches_full with False
issue_matches_full = [False] * len(true_is_valid)

In [83]:
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1: {f1:.2f}")

Precision: 0.84
Recall: 0.86
F1: 0.85


### Issue Identification
We will now determine the model's ability to accurately classify the issue in the data


In [84]:
def validate_issue(model_generated_answer, correct_answer):
    messages = [
        {
            "role": "user",
            "content": f"""
You are a medical expert assistant designed to validate the quality of an LLM-generated answer.

The model was asked to review a medical dataset row to determine if the data is valid. If the data is not valid, it should provide a justification explaining why.

Your task:

    •	Compare the model-generated justification with the correct reason provided.
    •	Determine if they address the same underlying medical issue or concern, even if phrased differently.
    •	Focus on the intent, medical concepts, and implications rather than exact wording.

Instructions:

    •	If the justifications have the same intent or address the same medical issue, return True.
    •	If they address different issues or concerns, return False.
    •	Only respond with a single word: True or False.

Examples:

    1.	Example 1:
    •	Model Generated Response: “The patient is allergic to penicillin”
    •	Correct Response: “The patient was prescribed penicillin despite being allergic”
    •	Answer: True
    2.	Example 2:
    •	Model Generated Response: “The date of birth of the patient is incorrect”
    •	Correct Response: “The patient was prescribed penicillin despite being allergic”
    •	Answer: False


Model Generated Response: {model_generated_answer}
Correct Response:  {correct_answer}
            """
        }
    ]

    response = client.chat.completions.create(
        model=MODEL,
        messages=messages
    )

    result = response.choices[0].message.content

    return result

In [85]:
# Validate issues for rows where both true and predicted 'is_valid' are False
validation_results = []

with ThreadPoolExecutor() as executor:
    futures = {
        executor.submit(validate_issue, pred_issues[i], true_issues[i]): i
        for i in range(len(pred_is_valid_bool))
        if not pred_is_valid_bool[i] and not true_is_valid_bool[i]
    }
    
    for future in as_completed(futures):
        i = futures[future]  # Get the original index
        issue_match = future.result()
        issue_matches_full[i] = (issue_match == 'True')
        validation_results.append({
            "index": i,
            "predicted_issue": pred_issues[i],
            "true_issue": true_issues[i],
            "issue_match": issue_matches_full[i]
        })
    
    # Calculate issue accuracy
    issue_accuracy = sum([i['issue_match'] for i in validation_results]) / len(validation_results)
    
    # Store the results in the dictionary
    model_results = {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "issue_accuracy": issue_accuracy
    }

# Create a DataFrame to store the results
df_results = pd.DataFrame([model_results])

# Create a DataFrame to store the validation results for each row
df_validation_results = pd.DataFrame(validation_results)

#### Display the subset of rows that we correctly identified contained an issue. 
For each row, we'll show the predicted vs. true issue and whether or not there is a match

In [86]:
def display_formatted_dataframe(df):
    def format_text(text):
        return text.replace('\n', '<br>')

    df_formatted = df.copy()
    df_formatted['predicted_issue'] = df_formatted['predicted_issue'].apply(format_text)
    df_formatted['true_issue'] = df_formatted['true_issue'].apply(format_text)
    
    display(HTML(df_formatted.to_html(escape=False, justify='left')))
    
display_formatted_dataframe(pd.DataFrame(validation_results))

Unnamed: 0,index,predicted_issue,true_issue,issue_match
0,30,Patient is prescribed Metformin but has no diagnosis.,Medication prescribed without diagnosis,True
1,5,The age (55) does not match the date of birth (1978-12-05).,Low glucose level not properly addressed,False
2,28,"Patient is allergic to penicillin but is prescribed amoxicillin, which is contraindicated.",Prescribed Amoxicillin despite Penicillin allergy,True
3,38,Treatment field contains diagnosis instead of a valid treatment plan.,No medication prescribed for COPD,False
4,11,Patient is allergic to penicillin but is prescribed amoxicillin.,Prescribed Amoxicillin despite Penicillin allergy,True
5,59,"Patient is allergic to penicillin but was prescribed amoxicillin, which is contraindicated.",Prescribed Amoxicillin despite Penicillin allergy,True
6,3,Patient is allergic to Amoxicillin but is prescribed Amoxicillin.,Prescribed Amoxicillin despite Penicillin allergy,True
7,18,"Diagnosis of hyperlipidemia is inconsistent with normal lab result of 95 mg/dL, and no treatment plan is provided.",No medication prescribed for hyperlipidemia,True
8,14,No treatment plan is provided for a patient diagnosed with hypertension.,No medication prescribed for hypertension,True
9,49,Blood pressure of 90 mm Hg is inconsistent with diagnosis of hypertension and prescription of Lisinopril.,Diagnosis does not match prescribed medication,True


In [87]:
# Display the DataFrame
print(df_results)

   precision    recall        f1  issue_accuracy
0    0.84153  0.860335  0.850829            0.75



### Conclusion
We can see from the results here that we're able to generate a high precision/recall for issue identification as well as decent accuracy for pinpointing the exact issue in the data.
