In [14]:
from openai import OpenAI
import json
import pandas as pd
import os

# JSON-schema description of the single function the chat model is forced to
# "call". The model's function-call arguments (parsed with json.loads by
# extract_criteria_with_chatgpt_structured) follow this shape. No property is
# marked as required, so parameters a trial does not mention may be omitted.
function_schema = {
    "name": "extract_clinical_criteria",
    "description": "Extract clinical trial inclusion/exclusion criteria based on the given parameters.",
    "parameters": {
        "type": "object",
        "properties": {
            # --- Demographics and general characteristics ---
            "demographics_and_general_characteristics": {
                "type": "object",
                "properties": {
                    "months_of_life_expectancy": {"type": "integer", "description": "Required months of life expectancy"},
                    "follow_up_agreement": {"type": "string", "enum": ["Required","Not required"], "description": "Whether the participant agrees to follow-up"},
                    "no_concurrent_participation_in_other_clinical_trials": {"type": "string", "enum": ["Required","Not required"], "description": "Whether concurrent participation in other clinical trials is allowed"}
                }
            },
            # --- Disease characteristics ---
            "disease_characteristics": {
                "type": "object",
                "properties": {
                    "confirmed_breast_cancer": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subjects are required to have breast cancer. Do not output allowed here"},
                    "confirmed_metastases_breast_cancer": {"type": "string", "enum": ["Required", "Allowed", "Not allowed", "Not required"], "description": "Whether subjects can have metastases breast cancer"},
                    "confirmed_locally_recurrent_breast_cancer": {"type": "string", "enum": ["Required", "Allowed", "Not allowed", "Not required"], "description": "Whether subjects can have locally recurrent breast cancer"},
                    "disease_progression_after_previous_treatment": {"type": "string", "enum": ["Required", "Allowed", "Not allowed", "Not required"], "description": "Whether cancer progressed after previous treatment is observed"},
                    "her2_status": {"type": "string", "enum": ["Positive only", "Negative only", "Both allowed"], "description": "which HER2 status is allowed for participant"},
                    "er_pr_status": {"type": "string", "enum": ["Positive only", "Negative only", "Both allowed"], "description": "which ER/PR status is allowed for participants"},
                    "measurable_lesion": {"type": "string", "enum": ["Required", "Allowed", "Not allowed", "Not required"], "description": "Whether there exists measurable disease/tumor tissue/lesion"}
                }
            },
            # --- Health and organ function: flags first, then numeric bounds ---
            "health_and_organ_function": {
                "type": "object",
                "properties": {
                    "sufficient_renal_function": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subject is required to have sufficient renal/kidney function"},
                    "sufficient_bone_marrow": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subject is required to have sufficient bone marrow function"},
                    "sufficient_liver_function": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subject is required to have sufficient liver function"},
                    "sufficient_heart_cardiovascular_function": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subject is required to have sufficient heart/cardiovascular function"},
                    "sufficient_lung_function": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether lung function is sufficient"},
                    "active_renal_disease": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether subjects having renal/kidney disease is allowed"},
                    "active_liver_disease": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether subjects having liver disease is allowed"},
                    "active_heart_disease": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether subjects having heart disease is allowed"},
                    "active_lung_disease": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether subjects having lung disease is allowed"},
                    "liver_function_ast_alt_upper_bound_uln": {"type": "number", "description": "subjects upper bound for aspartate amino transferase(AST)/alanine amino transferase(ALT) levels to be included (* ULN)"},
                    "liver_function_bilirubin_upper_bound_uln": {"type": "number", "description": "subjects upper bound for bilirubin levels to be included (* ULN)"},
                    "creatinine_upper_bound_uln": {"type": "number", "description": "subjects upper bound for creatinine levels (* ULN)"},
                    "creatinine_clearance_lower_bound": {"type": "number", "description": "subjects lower bound for creatinine clearance (ml/min)"},
                    "lvef_lower_bound_percent": {"type": "number", "description": "Lower bound for LVEF percentage"},
                    "anc_level_lower_bound_10_9_l": {"type": "number", "description": "Lower bound for ANC level (* 10^9/L)"},
                    # NOTE(review): hemoglobin is usually reported as a concentration
                    # (g/dL or g/L), not a cell count -- confirm the intended unit.
                    "hgb_level_lower_bound_10_9_l": {"type": "number", "description": "Lower bound for hemoglobin level (* 10^9/L)"},
                    "plt_level_lower_bound_10_9_l": {"type": "number", "description": "Lower bound for platelet level (* 10^9/L)"},
                    "granulocytes": {"type": "number", "description": "Granulocyte levels (* 10^9/L)"},
                    "karnofsky_performance_lower_bound": {"type": "integer", "description": "Lower bound for Karnofsky performance scale"},
                    "ecog_performance_status_upper_bound": {"type": "integer", "description": "Upper bound for ECOG / WHO performance status"},
                    "untreated_brain_metastases": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether untreated brain metastases are allowed"},
                    "untreated_central_nervous_system_metastases": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether untreated central nervous system metastases are allowed"},
                    "pregnancy_nursing": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether pregnancy or nursing is excluded"},
                    "peripheral_neuropathy_grade_upper_bound": {"type": "integer", "description": "Upper bound for peripheral neuropathy grade"},
                    "positive_hiv_status": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether positive HIV status is excluded"}
                }
            },
            # --- Free text that does not fit the structured parameters ---
            "text_relating_to_medical_history": {
                "type": "object",
                "properties": {
                    "inclusion_criteria": {"type": "string", "description": "Relevant text relating to medical history"}
                }
            },
            "unused_text": {
                "type": "object",
                "properties": {
                    "inclusion_criteria": {"type": "string", "description": "Unused text relating to inclusion criteria"},
                    # The prompt asks for unused text to be labelled as inclusion OR
                    # exclusion criteria, so the exclusion side needs its own slot.
                    "exclusion_criteria": {"type": "string", "description": "Unused text relating to exclusion criteria"}
                }
            }
        }
    }
}

def extract_criteria_with_chatgpt_structured(trial_text, prompt_template, model="gpt-4o"):
    """Ask an OpenAI chat model to extract structured criteria for one trial.

    The model is forced to answer through the ``extract_clinical_criteria``
    function described by ``function_schema``; the JSON arguments of that
    function call are parsed and returned.

    Args:
        trial_text (str): Free-text eligibility criteria of a single trial.
        prompt_template (str): Instructions placed before the trial text.
        model (str): Chat model name passed to the API.

    Returns:
        dict | None: The parsed function-call arguments. None when the input
        is blank or not a string, when the response carries no function
        call, or when the request / JSON parsing raises.
    """
    # Guard before any network work: a blank CSV cell typically reaches this
    # function as NaN (a float), which would otherwise be formatted into the
    # prompt as the literal text "nan" and sent to the API.
    if not isinstance(trial_text, str) or not trial_text.strip():
        print("Skipping empty or non-text trial criteria")
        return None

    client = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY")
    )

    prompt = prompt_template + f"\n\n{trial_text}"

    try:
        # NOTE(review): `functions` / `function_call` are the legacy
        # function-calling parameters; newer API versions prefer
        # `tools` / `tool_choice` -- confirm against the installed SDK.
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant specialized in clinical trial criteria extraction."},
                {"role": "user", "content": prompt}
            ],
            functions=[function_schema],
            function_call={"name": "extract_clinical_criteria"},
            temperature=0
        )

        function_call = response.choices[0].message.function_call

        if function_call:
            return json.loads(function_call.arguments)  # Return as a Python dictionary

        print("No function call in response")
        return None

    except Exception as e:
        # Best-effort per trial: report the failure and let the caller move on.
        print(f"Error processing trial text: {e}")
        return None

def process_clinical_trials_from_csv(input_csv, output_json, prompt_template):
    """
    Process clinical trials from a CSV file and save extracted criteria to JSON.

    Each value of the ``criteria`` column is sent to
    ``extract_criteria_with_chatgpt_structured``. Rows whose criteria cell is
    blank or not text are skipped without calling the extractor. Rows for
    which the extractor returns None are left out of the output; an empty
    extraction (``{}``) is kept, since it is a result rather than a failure.

    Args:
        input_csv (str): Path to input CSV file (must have a "criteria" column)
        output_json (str): Path for output JSON file
        prompt_template (str): Template for GPT prompt

    Returns:
        list[dict] | None: One {"Trial Text", "Extracted Criteria"} record per
        successfully processed row, or None if reading, processing or
        writing raised.
    """
    try:
        df = pd.read_csv(input_csv)
        criteria_column = df["criteria"]
        total_rows = len(criteria_column)
        results = []

        for position, trial_text in enumerate(criteria_column, start=1):
            print(f"Processing row {position}/{total_rows}...")

            # Blank cells are read as NaN (a float). Forwarding them would
            # spend an API call on the text "nan" and could put a NaN into
            # the output file, which is not valid strict JSON.
            if not isinstance(trial_text, str) or not trial_text.strip():
                print(f"Skipping row {position}: no criteria text")
                continue

            structured_output = extract_criteria_with_chatgpt_structured(trial_text, prompt_template)

            # None signals failure; an empty dict is still a valid extraction.
            if structured_output is not None:
                results.append({
                    "Trial Text": trial_text,
                    "Extracted Criteria": structured_output
                })

        with open(output_json, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=4)

        print(f"Processing complete! Results saved to {output_json}")
        return results

    except Exception as e:
        # Top-level boundary for the batch run: report and signal failure.
        print(f"Error processing CSV file: {e}")
        return None

# Script entry point: run the extraction over one CSV and write one JSON file.
if __name__ == "__main__":
    # Input CSV (read via its "criteria" column) and output JSON path,
    # both relative to the current working directory.
    input_csv = "criteria_30.csv" 
    output_json = "updated_extracted_criteria_30.json" 
    
    # Instructions prepended to every trial's criteria text before it is sent
    # to the model. This is runtime text: editing it changes model behaviour.
    prompt_template = """
        Extract the inclusion/exclusion criteria from the provided clinical trial text based on the provided parameters. 
        For each parameter, output only the key values explicitly stated in the bracket. 
        If a parameter is not mentioned, leave it blank. Pay close attention when distinguishing "Allowed" and "Required".
        Carefully handle statements involving negations (e.g., “not excluded,” “absence of,” “no contraindications”) to avoid misclassifying double negatives. 
        Cross-reference inclusion and exclusion criteria to ensure consistency. 
        Note that for "upper bound", you should list n-1 if "<n" is mentioned in the original text. For instance, if the original text says "ecog performance <2", then you should record 1 in "ecog_performance_status_upper_bound". 
        Avoid assuming a value based on a negation unless it is explicitly supported by the text. 
        Pay close attention to sentences that list multiple criteria in a single statement and ensure that all relevant parameters are extracted. 
        Do not include any descriptive words or explanations after each parameter value. 
        Any text relating to requirements on subjects’ medical history and cannot be classified in the structured parameters should be listed under “Text relating to Medical History”.
        Any unused or unmatched text should be listed separately under “Unused Text”, labeling as “Inclusion Criteria” or “Exclusion Criteria”. 
    """
    
    # `results` is the list of extracted records, or None if the run failed.
    results = process_clinical_trials_from_csv(input_csv, output_json, prompt_template)

Processing row 1/24...
Processing row 2/24...
Processing row 3/24...
Processing row 4/24...
Processing row 5/24...
Processing row 6/24...
Processing row 7/24...
Processing row 8/24...
Processing row 9/24...
Processing row 10/24...
Processing row 11/24...
Processing row 12/24...
Processing row 13/24...
Processing row 14/24...
Processing row 15/24...
Processing row 16/24...
Processing row 17/24...
Processing row 18/24...
Processing row 19/24...
Processing row 20/24...
Processing row 21/24...
Processing row 22/24...
Processing row 23/24...
Processing row 24/24...
Processing complete! Results saved to updated_extracted_criteria_30.json


In [None]:
        Parameters:
        Demographics and General Characteristics
        Months of Life Expectancy: [1 – 99]
        Follow-up Agreement: [Required, Not required]
        Signed Consent: [Required, Not required]
        Protocol Compliance: [Required, Not required]
        No concurrent participation in other clinical trials: [Required, Not required]
 
        Disease Characteristics
        Confirmed breast cancer / mammary carcinoma: [Required, Not required]
        Confirmed metastatic breast cancer: [Required, Allowed, Not allowed]
        Confirmed locally recurrent breast cancer: [Required, Allowed, Not allowed]
        Progress after previous treatment: [Required, Allowed, Not allowed]
        HER-2 Status: [Positive only, Negative only, Both allowed]
        ER/PR Status: [Positive only, Negative only, Both allowed]
        
        Health and Organ Function
        Sufficient renal function: [Required, Not required]
        Sufficient bone marrow: [Required, Not required]
        Sufficient liver function: [Required, Not required]
        Sufficient heart/cardiovascular function: [Required, Not required]
        Sufficient lung function: [Required, Not required]
        renal disease: [Allowed, Not allowed]
        liver disease: [Allowed, Not allowed]
        heart/cardiovascular disease: [Allowed, Not allowed]
        lung disease: [Allowed, Not allowed]
        AST/ALT upper bound (* ULN): [1 - 100]
        Bilirubin upper bound (* ULN): [0 - 100]
        LVEF lower bound (%): [0-100]
        creatinine clearance lower bound: [0 - 100]
        creatinine upper bound (*ULN): [0 - 100]
        ANC Level lower bound (*10^9/L): [0 - 10000]
        Hgb Level lower bound (*10^9/L): [0 – 100]
        plt Level lower bound (*10^9/L): [0 – 1000]
        granulocytes: [1-10]
        Karnofsky performance lower bound: [0 – 100]
        ECOG / WHO performance status upper bound: [0 – 5]
        Active brain Metastases: [Allowed, Not allowed]
        Active central nervous system metastases: [Allowed, Not allowed]
        Pregnancy/Nursing: [Allowed, Not allowed]
        Peripheral Neuropathy Grade upper bound: [1 – 4]
        Positive HIV status: [Allowed, Not allowed]
        Accessible Measurable Lesion: [Required, Not required]
        Swallow Ability: [Required, Not required]

    Text relating to Medical History
    [including, but not limited to, historical treatments, investigational drug use, historical allergies, current treatment, current disease]

        Unused Text

                    "protocol_compliance": {"type": "string", "enum": ["Required"], "description": "Whether subjects are expected to comply with the research protocol"},
