In [9]:
from openai import OpenAI
import json
import pandas as pd
import os

# JSON-schema description of the function the model is forced to call.
# Every property is optional (no "required" lists), so a parameter that the
# trial text does not mention can simply be omitted by the model.
# Descriptions are part of what the model sees, so they are worded to agree
# with the enum values and units of each field.
function_schema = {
    "name": "extract_clinical_criteria",
    "description": "Extract clinical trial inclusion/exclusion criteria based on the given parameters.",
    "parameters": {
        "type": "object",
        "properties": {
            "demographics_and_general_characteristics": {
                "type": "object",
                "properties": {
                    "months_of_life_expectancy": {"type": "integer", "description": "Required months of life expectancy"},
                    "no_participation_in_other_clinical_trials": {"type": "string", "enum": ["Required","Not required"], "description": "Whether subjects are required to NOT be participating (currently or recently) in other clinical trials"}
                }
            },
            "disease_characteristics": {
                "type": "object",
                "properties": {
                    "confirmed_breast_cancer": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subjects are required to have breast cancer. Do not output allowed here"},
                    "confirmed_metastases_breast_cancer": {"type": "string", "enum": ["Required", "Allowed", "Not allowed", "Not required"], "description": "Whether subjects can have metastases breast cancer"},
                    "confirmed_locally_recurrent_breast_cancer": {"type": "string", "enum": ["Required", "Allowed", "Not allowed", "Not required"], "description": "Whether subjects can have locally recurrent breast cancer"},
                    "disease_progression_after_previous_treatment": {"type": "string", "enum": ["Required", "Allowed", "Not allowed", "Not required"], "description": "Whether cancer progressed after previous treatment is observed"},
                    "her2_status": {"type": "string", "enum": ["Positive only", "Negative only", "Both allowed"], "description": "which HER2 status is allowed for participant"},
                    "er_pr_status": {"type": "string", "enum": ["Positive only", "Negative only", "Both allowed"], "description": "which ER/PR status is allowed for participants"},
                    "measurable_lesion": {"type": "string", "enum": ["Required", "Allowed", "Not allowed", "Not required"], "description": "Whether there exists measurable disease/tumor tissue/lesion"}
                }
            },
            "health_and_organ_function": {
                "type": "object",
                "properties": {
                    "sufficient_renal_function": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subject is required to have sufficient renal/kidney function"},
                    "sufficient_bone_marrow": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subject is required to have sufficient bone marrow function"},
                    "sufficient_liver_function": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subject is required to have sufficient liver function"},
                    "sufficient_heart_cardiovascular_function": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subject is required to have sufficient heart/cardiovascular function"},
                    "sufficient_lung_function": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subject is required to have sufficient lung function"},
                    "liver_function_ast_alt_upper_bound_uln": {"type": "number", "description": "subjects upper bound for aspartate amino transferase(AST)/alanine amino transferase(ALT) levels to be included (* ULN)"},
                    "liver_function_bilirubin_upper_bound_uln": {"type": "number", "description": "subjects upper bound for bilirubin levels to be included (* ULN)"},
                    "creatinine_upper_bound_uln": {"type": "number", "description": "subjects upper bound for creatinine levels (* ULN)"},
                    "creatinine_clearance_lower_bound": {"type": "number", "description": "subjects lower bound for creatinine clearance (ml/min)"},
                    "lvef_lower_bound_percent": {"type": "number", "description": "Lower bound for LVEF percentage"},
                    "anc_level_lower_bound_10_9_l": {"type": "number", "description": "Lower bound for ANC level (* 10^9/L)"},
                    # The key keeps its historical "_10_9_l" suffix so downstream consumers
                    # are unaffected, but hemoglobin is a concentration, not a cell count.
                    "hgb_level_lower_bound_10_9_l": {"type": "number", "description": "Lower bound for hemoglobin level (g/dL)"},
                    "plt_level_lower_bound_10_9_l": {"type": "number", "description": "Lower bound for platelet level (* 10^9/L)"},
                    "granulocytes": {"type": "number", "description": "Granulocyte levels (* 10^9/L)"},
                    "karnofsky_performance_lower_bound": {"type": "integer", "description": "Lower bound for Karnofsky performance scale"},
                    "ecog_performance_status_upper_bound": {"type": "integer", "description": "Upper bound for ECOG / WHO performance status"},
                    "untreated_central_nervous_system_metastases": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether untreated central nervous system metastases are allowed"},
                    "pregnancy_nursing": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether pregnant or nursing subjects are allowed to participate"},
                    "peripheral_neuropathy_grade_upper_bound": {"type": "integer", "description": "Upper bound for peripheral neuropathy grade"},
                    "positive_hiv_status": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether subjects with positive HIV status are allowed to participate"}
                }
            },
            "text_relating_to_medical_history": {
                "type": "object",
                "properties": {
                    "inclusion_criteria": {"type": "string", "description": "Relevant text relating to medical history"}
                }
            },
            # NOTE(review): only an inclusion_criteria field exists here although the
            # prompt in this cell also asks for exclusion text; consider adding one.
            "unused_text": {
                "type": "object",
                "properties": {
                    "inclusion_criteria": {"type": "string", "description": "Unused text relating to inclusion criteria"}
                }
            }
        }
    }
}

def extract_criteria_with_chatgpt_structured(trial_text, prompt_template, model="gpt-4o"):
    """
    Ask the chat model to fill in `function_schema` for one trial's criteria text.

    Args:
        trial_text (str): Raw inclusion/exclusion criteria text of a single trial.
        prompt_template (str): Instructions placed before the trial text in the user message.
        model (str): Chat model name passed to the API.

    Returns:
        dict | None: The function-call arguments parsed from JSON, or None when the
        trial text is empty/non-text, the response has no function call, or the
        request/parsing raised (the error is printed, not re-raised).
    """
    # Empty CSV cells arrive from pandas as float NaN; formatting that into the
    # prompt would send the literal text "nan" and pay for a meaningless call.
    if not isinstance(trial_text, str) or not trial_text.strip():
        print("Skipping trial text: empty or non-text value")
        return None

    client = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY")
    )

    prompt = prompt_template + f"\n\n{trial_text}"

    try:
        # NOTE(review): the API reference marks `functions` / `function_call` as
        # deprecated in favour of `tools` / `tool_choice`; kept as-is here so the
        # response shape read below does not change.
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant specialized in clinical trial criteria extraction."},
                {"role": "user", "content": prompt}
            ],
            functions=[function_schema],
            # Force the model to answer through the schema rather than free text.
            function_call={"name": function_schema["name"]},
            temperature=0
        )

        function_call = response.choices[0].message.function_call

        if function_call:
            return json.loads(function_call.arguments)  # Return as a Python dictionary

        print("No function call in response")
        return None

    except Exception as e:
        # Best effort per row: report and let the caller move on to the next trial.
        print(f"Error processing trial text: {e}")
        return None

def process_clinical_trials_from_csv(input_csv, output_json, prompt_template):
    """
    Run criteria extraction over every row of a CSV and dump the results as JSON.

    The "criteria" cell of each row is handed to
    extract_criteria_with_chatgpt_structured; rows for which that call returns a
    falsy value are left out of the output.

    Args:
        input_csv (str): Path of the CSV to read; rows are read from its "criteria" column.
        output_json (str): Path the JSON list of results is written to.
        prompt_template (str): Prompt text forwarded unchanged to the extractor.

    Returns:
        list | None: One {"Trial Text", "Extracted Criteria"} dict per kept row,
        or None when any exception was raised (the error is printed).
    """
    try:
        frame = pd.read_csv(input_csv)
        collected = []

        for row_label, record in frame.iterrows():
            criteria_text = record["criteria"]
            print(f"Processing row {row_label + 1}/{len(frame)}...")

            extracted = extract_criteria_with_chatgpt_structured(criteria_text, prompt_template)
            if not extracted:
                # Nothing usable came back for this row; skip it.
                continue

            collected.append({
                "Trial Text": criteria_text,
                "Extracted Criteria": extracted,
            })

        with open(output_json, "w") as out_file:
            json.dump(collected, out_file, indent=4)

        print(f"Processing complete! Results saved to {output_json}")
        return collected

    except Exception as err:
        print(f"Error processing CSV file: {err}")
        return None

if __name__ == "__main__":
    # First extraction pass over the trial CSV (read from its "criteria" column).
    input_csv = "criteria_30_new.csv" 
    output_json = "extracted_criteria_30_new.json" 
    
    # Every label and key quoted in the prompt must be one that function_schema
    # accepts: "sufficient_bone_marrow" is the schema key, "confirmed_breast_cancer"
    # only allows Required / Not required, and
    # "untreated_central_nervous_system_metastases" only allows Allowed / Not allowed.
    prompt_template = """
        Extract the inclusion/exclusion criteria from the provided clinical trial text based on the provided parameters. 
        For each parameter, output only the key values explicitly stated in the bracket. 
        If a parameter is not mentioned, leave it blank. Pay close attention when distinguishing "Allowed" and "Required".
        Carefully handle statements involving negations (e.g., “not excluded,” “absence of,” “no contraindications”) to avoid misclassifying double negatives. 
        Cross-reference inclusion and exclusion criteria to ensure consistency. 
        Note that for "upper bound", you should list n-1 if "<n" is mentioned in the original text. For instance, if the original text says "ecog performance <2", then you should record 1 in "ecog_performance_status_upper_bound". 
        Avoid assuming a value based on a negation unless it is explicitly supported by the text. 
        Pay close attention to sentences that list multiple criteria in a single statement and ensure that all relevant parameters are extracted. 
        Do not include any descriptive words or explanations after each parameter value. 
        Any text relating to requirements on subjects’ medical history and cannot be classified in the structured parameters should be listed under “Text relating to Medical History”.
        Any unused or unmatched text should be listed separately under “Unused Text”, labeling as “Inclusion Criteria” or “Exclusion Criteria”. 

        Following are some examples for accurately classifying parameters. Please understand them carefully and process parameters accordingly.
        For "sufficient organ function" criteria, adhere to the following interpretations:
        - Note that the lab requirements can be used both for "sufficient_X_function" decisions and specific lab parameters. Do not miss either of the extracted parameters when encountering the lab requirements. For example, for criteria stating that "ANC > 1.5", this could both label "anc_level_lower_bound_10_9_l" as 1.5 and "sufficient_bone_marrow" as "Required".
        - **Sufficient renal function**: If the text discusses **creatinine clearance (e.g., > 60 mL/min)** and **absence of severe kidney disease**, or the subject with renal diseases are excluded, then label as "Required".
        - **Sufficient bone marrow function**: If the text discusses **ANC (e.g., ≥ 1.5 × 10⁹/L), platelets (e.g., ≥ 100 × 10⁹/L), and hemoglobin (e.g., ≥ 10 g/dL)**, or the subject with bone marrow related diseases are excluded, then label as "Required".
        - **Sufficient liver function**: If the text discusses **ALT/AST (≤ 2.5× ULN, or ≤ 5× ULN if liver metastases), bilirubin (≤ 1.5× ULN)** or other liver indicators, or the subject with liver diseases are excluded, then label as "Required".
        - **Sufficient cardiovascular function**: If the text discusses **LVEF ≥ 50%** and **absence of NYHA Class III/IV heart failure**, or subject with related cardiovascular disease are excluded, label as "Required".
        - **Sufficient lung function**: If the text discusses **FEV1 ≥ 50% predicted** or the **absence of severe pulmonary disease** or other lung lab indicators, or subject with related lung disease are excluded, label as "Required".


        - **ecog_performance_status_upper_bound**: the upper bound looks for the highest performance status that is acceptable. If the performance status requires "<2", then the upper bound is 1. If requires "<=2", then the upper bound is 2.

        - For "confirmed_breast_cancer", if breast cancer is required and other types of cancer also mentioned, label as "Not required"; if only requires breast cancer, label as "Required".
        - For "confirmed_metastases_breast_cancer", if no other stages mentioned, which means all patients should have metastatic breast cancer, label as "Required". If other types of breast cancer (stage III) is also allowed, label as "Allowed". Note that stage 4 is equivalent to MBC.
        - For "confirmed_locally_recurrent_breast_cancer", if mentioned and other types of breast cancer also mentioned, label as "Allowed".

        - **untreated_central_nervous_system_metastases**: if the criteria excludes subject with brain/central nervous system metastases, label as "Not allowed". If the criteria allows subject with such metastases, leave it blank or label as "Allowed".

        - **No_Participation_in_other_clinical_trials**: if the text mentions that the subjects are included if not participated in other clinical trials/investigational drugs within X months / currently, or excluded subjects who are participating or participated in other clinical trials within X months, then it should be labeled as "Required".


    """
    
    results = process_clinical_trials_from_csv(input_csv, output_json, prompt_template)

Processing row 1/24...
Processing row 2/24...
Processing row 3/24...
Processing row 4/24...
Processing row 5/24...
Processing row 6/24...
Processing row 7/24...
Processing row 8/24...
Processing row 9/24...
Processing row 10/24...
Processing row 11/24...
Processing row 12/24...
Processing row 13/24...
Processing row 14/24...
Processing row 15/24...
Processing row 16/24...
Processing row 17/24...
Processing row 18/24...
Processing row 19/24...
Processing row 20/24...
Processing row 21/24...
Processing row 22/24...
Processing row 23/24...
Processing row 24/24...
Processing complete! Results saved to extracted_criteria_30_new.json


In [None]:
from openai import OpenAI
import json
import pandas as pd
import os

# JSON-schema description of the function the model is forced to call.
# This variant carries only the categorical criteria (no lab-value bounds).
# Every property is optional (no "required" lists), so a parameter that the
# trial text does not mention can simply be omitted by the model.
# Descriptions are part of what the model sees, so they are worded to agree
# with the enum values of each field.
function_schema = {
    "name": "extract_clinical_criteria",
    "description": "Extract clinical trial inclusion/exclusion criteria based on the given parameters.",
    "parameters": {
        "type": "object",
        "properties": {
            "demographics_and_general_characteristics": {
                "type": "object",
                "properties": {
                    "months_of_life_expectancy": {"type": "integer", "description": "Required months of life expectancy"},
                    "no_participation_in_other_clinical_trials": {"type": "string", "enum": ["Required","Not required"], "description": "Whether subjects are required to NOT be participating (currently or recently) in other clinical trials"}
                }
            },
            "disease_characteristics": {
                "type": "object",
                "properties": {
                    "confirmed_breast_cancer": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subjects are required to have breast cancer. Do not output allowed here"},
                    "confirmed_metastases_breast_cancer": {"type": "string", "enum": ["Required", "Allowed", "Not allowed", "Not required"], "description": "Whether subjects can have metastases breast cancer"},
                    "confirmed_locally_recurrent_breast_cancer": {"type": "string", "enum": ["Required", "Allowed", "Not allowed", "Not required"], "description": "Whether subjects can have locally recurrent breast cancer"},
                    "disease_progression_after_previous_treatment": {"type": "string", "enum": ["Required", "Allowed", "Not allowed", "Not required"], "description": "Whether cancer progressed after previous treatment is observed"},
                    "her2_status": {"type": "string", "enum": ["Positive only", "Negative only", "Both allowed"], "description": "which HER2 status is allowed for participant"},
                    "er_pr_status": {"type": "string", "enum": ["Positive only", "Negative only", "Both allowed"], "description": "which ER/PR status is allowed for participants"},
                    "measurable_lesion": {"type": "string", "enum": ["Required", "Allowed", "Not allowed", "Not required"], "description": "Whether there exists measurable disease/tumor tissue/lesion"}
                }
            },
            "health_and_organ_function": {
                "type": "object",
                "properties": {
                    "sufficient_renal_function": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subject is required to have sufficient renal/kidney function"},
                    "sufficient_bone_marrow": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subject is required to have sufficient bone marrow function"},
                    "sufficient_liver_function": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subject is required to have sufficient liver function"},
                    "sufficient_heart_cardiovascular_function": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subject is required to have sufficient heart/cardiovascular function"},
                    "sufficient_lung_function": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subject is required to have sufficient lung function"},
                    "untreated_central_nervous_system_metastases": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether untreated central nervous system metastases are allowed"},
                    "pregnancy_nursing": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether pregnant or nursing subjects are allowed to participate"},
                    "peripheral_neuropathy_grade_upper_bound": {"type": "integer", "description": "Upper bound for peripheral neuropathy grade"},
                    "positive_hiv_status": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether subjects with positive HIV status are allowed to participate"}
                }
            },
            "text_relating_to_medical_history": {
                "type": "object",
                "properties": {
                    "inclusion_criteria": {"type": "string", "description": "Relevant text relating to medical history"}
                }
            },
            # NOTE(review): only an inclusion_criteria field exists here although the
            # prompt in this cell also asks for exclusion text; consider adding one.
            "unused_text": {
                "type": "object",
                "properties": {
                    "inclusion_criteria": {"type": "string", "description": "Unused text relating to inclusion criteria"}
                }
            }
        }
    }
}

def extract_criteria_with_chatgpt_structured(trial_text, prompt_template, model="gpt-4o"):
    """
    Ask the chat model to fill in `function_schema` for one trial's criteria text.

    Args:
        trial_text (str): Raw inclusion/exclusion criteria text of a single trial.
        prompt_template (str): Instructions placed before the trial text in the user message.
        model (str): Chat model name passed to the API.

    Returns:
        dict | None: The function-call arguments parsed from JSON, or None when the
        trial text is empty/non-text, the response has no function call, or the
        request/parsing raised (the error is printed, not re-raised).
    """
    # Empty CSV cells arrive from pandas as float NaN; formatting that into the
    # prompt would send the literal text "nan" and pay for a meaningless call.
    if not isinstance(trial_text, str) or not trial_text.strip():
        print("Skipping trial text: empty or non-text value")
        return None

    client = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY")
    )

    prompt = prompt_template + f"\n\n{trial_text}"

    try:
        # NOTE(review): the API reference marks `functions` / `function_call` as
        # deprecated in favour of `tools` / `tool_choice`; kept as-is here so the
        # response shape read below does not change.
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant specialized in clinical trial criteria extraction."},
                {"role": "user", "content": prompt}
            ],
            functions=[function_schema],
            # Force the model to answer through the schema rather than free text.
            function_call={"name": function_schema["name"]},
            temperature=0
        )

        function_call = response.choices[0].message.function_call

        if function_call:
            return json.loads(function_call.arguments)  # Return as a Python dictionary

        print("No function call in response")
        return None

    except Exception as e:
        # Best effort per row: report and let the caller move on to the next trial.
        print(f"Error processing trial text: {e}")
        return None

def process_clinical_trials_from_csv(input_csv, output_json, prompt_template):
    """
    Run criteria extraction over every row of a CSV and dump the results as JSON.

    The "criteria" cell of each row is handed to
    extract_criteria_with_chatgpt_structured; rows for which that call returns a
    falsy value are left out of the output.

    Args:
        input_csv (str): Path of the CSV to read; rows are read from its "criteria" column.
        output_json (str): Path the JSON list of results is written to.
        prompt_template (str): Prompt text forwarded unchanged to the extractor.

    Returns:
        list | None: One {"Trial Text", "Extracted Criteria"} dict per kept row,
        or None when any exception was raised (the error is printed).
    """
    try:
        frame = pd.read_csv(input_csv)
        collected = []

        for row_label, record in frame.iterrows():
            criteria_text = record["criteria"]
            print(f"Processing row {row_label + 1}/{len(frame)}...")

            extracted = extract_criteria_with_chatgpt_structured(criteria_text, prompt_template)
            if not extracted:
                # Nothing usable came back for this row; skip it.
                continue

            collected.append({
                "Trial Text": criteria_text,
                "Extracted Criteria": extracted,
            })

        with open(output_json, "w") as out_file:
            json.dump(collected, out_file, indent=4)

        print(f"Processing complete! Results saved to {output_json}")
        return collected

    except Exception as err:
        print(f"Error processing CSV file: {err}")
        return None

if __name__ == "__main__":
    # Categorical-criteria extraction pass over the trial CSV (read from its
    # "criteria" column).
    input_csv = "criteria_30_new.csv" 
    output_json = "extracted_criteria_30_new.json" 
    
    # Every label and key quoted in the prompt must be one that function_schema
    # accepts: "sufficient_bone_marrow" is the schema key, "confirmed_breast_cancer"
    # only allows Required / Not required, and
    # "untreated_central_nervous_system_metastases" only allows Allowed / Not allowed.
    # NOTE(review): the prompt still mentions lab-value parameters (ANC, ECOG) that
    # this cell's schema does not define — confirm whether that is intended.
    prompt_template = """
        Extract the inclusion/exclusion criteria from the provided clinical trial text based on the provided parameters. 
        For each parameter, output only the key values explicitly stated in the bracket. 
        If a parameter is not mentioned, leave it blank. Pay close attention when distinguishing "Allowed" and "Required".
        Carefully handle statements involving negations (e.g., “not excluded,” “absence of,” “no contraindications”) to avoid misclassifying double negatives. 
        Cross-reference inclusion and exclusion criteria to ensure consistency. 
        Note that for "upper bound", you should list n-1 if "<n" is mentioned in the original text. For instance, if the original text says "ecog performance <2", then you should record 1 in "ecog_performance_status_upper_bound". 
        Avoid assuming a value based on a negation unless it is explicitly supported by the text. 
        Pay close attention to sentences that list multiple criteria in a single statement and ensure that all relevant parameters are extracted. 
        Do not include any descriptive words or explanations after each parameter value. 
        Any text relating to requirements on subjects’ medical history and cannot be classified in the structured parameters should be listed under “Text relating to Medical History”.
        Any unused or unmatched text should be listed separately under “Unused Text”, labeling as “Inclusion Criteria” or “Exclusion Criteria”. 

        Following are some examples for accurately classifying parameters. Please understand them carefully and process parameters accordingly.
        For "sufficient organ function" criteria, adhere to the following interpretations:
        - Note that the lab requirements can be used both for "sufficient_X_function" decisions and specific lab parameters. Do not miss either of the extracted parameters when encountering the lab requirements. For example, for criteria stating that "ANC > 1.5", this could both label "anc_level_lower_bound_10_9_l" as 1.5 and "sufficient_bone_marrow" as "Required".
        - **Sufficient renal function**: If the text discusses **creatinine clearance (e.g., > 60 mL/min)** and **absence of severe kidney disease**, or the subject with renal diseases are excluded, then label as "Required".
        - **Sufficient bone marrow function**: If the text discusses **ANC (e.g., ≥ 1.5 × 10⁹/L), platelets (e.g., ≥ 100 × 10⁹/L), and hemoglobin (e.g., ≥ 10 g/dL)**, or the subject with bone marrow related diseases are excluded, then label as "Required".
        - **Sufficient liver function**: If the text discusses **ALT/AST (≤ 2.5× ULN, or ≤ 5× ULN if liver metastases), bilirubin (≤ 1.5× ULN)** or other liver indicators, or the subject with liver diseases are excluded, then label as "Required".
        - **Sufficient cardiovascular function**: If the text discusses **LVEF ≥ 50%** and **absence of NYHA Class III/IV heart failure**, or subject with related cardiovascular disease are excluded, label as "Required".
        - **Sufficient lung function**: If the text discusses **FEV1 ≥ 50% predicted** or the **absence of severe pulmonary disease** or other lung lab indicators, or subject with related lung disease are excluded, label as "Required".
       
        - For "confirmed_breast_cancer", if breast cancer is required and other types of cancer also mentioned, label as "Not required"; if only requires breast cancer, label as "Required".
        - For "confirmed_metastases_breast_cancer", if no other stages mentioned, which means all patients should have metastatic breast cancer, label as "Required". If other types of breast cancer (stage III) is also allowed, label as "Allowed". Note that stage 4 is equivalent to MBC.
        - For "confirmed_locally_recurrent_breast_cancer", if mentioned and other types of breast cancer also mentioned, label as "Allowed".

        - **untreated_central_nervous_system_metastases**: if the criteria excludes subject with brain/central nervous system metastases, label as "Not allowed". If the criteria allows subject with such metastases, leave it blank or label as "Allowed".

        - **No_Participation_in_other_clinical_trials**: if the text mentions that the subjects are included if not participated in other clinical trials/investigational drugs within X months / currently, or excluded subjects who are participating or participated in other clinical trials within X months, then it should be labeled as "Required".


    """
    
    results = process_clinical_trials_from_csv(input_csv, output_json, prompt_template)

Processing row 1/24...
Processing row 2/24...
Processing row 3/24...
Processing row 4/24...
Processing row 5/24...
Processing row 6/24...
Processing row 7/24...
Processing row 8/24...
Processing row 9/24...
Processing row 10/24...
Processing row 11/24...
Processing row 12/24...
Processing row 13/24...
Processing row 14/24...
Processing row 15/24...
Processing row 16/24...
Processing row 17/24...
Processing row 18/24...
Processing row 19/24...
Processing row 20/24...
Processing row 21/24...
Processing row 22/24...
Processing row 23/24...
Processing row 24/24...
Processing complete! Results saved to extracted_criteria_30_new.json


In [13]:
from openai import OpenAI
import json
import pandas as pd
import os

# JSON-schema description of the function the model is forced to call.
# This variant carries only the numeric lab / performance-status bounds.
# Every property is optional (no "required" lists), so a bound that the trial
# text does not mention can simply be omitted by the model.
function_schema = {
    "name": "extract_clinical_criteria",
    "description": "Extract clinical trial inclusion/exclusion criteria based on the given parameters.",
    "parameters": {
        "type": "object",
        "properties": {
            "health_and_organ_function": {
                "type": "object",
                "properties": {
                    "liver_function_ast_alt_upper_bound_uln": {"type": "number", "description": "subjects upper bound for aspartate amino transferase(AST)/alanine amino transferase(ALT) levels to be included (* ULN)"},
                    "liver_function_bilirubin_upper_bound_uln": {"type": "number", "description": "subjects upper bound for bilirubin levels to be included (* ULN)"},
                    "creatinine_upper_bound_uln": {"type": "number", "description": "subjects upper bound for creatinine levels (* ULN)"},
                    "creatinine_clearance_lower_bound": {"type": "number", "description": "subjects lower bound for creatinine clearance (ml/min)"},
                    "lvef_lower_bound_percent": {"type": "number", "description": "Lower bound for LVEF percentage"},
                    "anc_level_lower_bound_10_9_l": {"type": "number", "description": "Lower bound for ANC level (* 10^9/L)"},
                    # The key keeps its historical "_10_9_l" suffix so downstream consumers
                    # are unaffected, but hemoglobin is a concentration, not a cell count.
                    "hgb_level_lower_bound_10_9_l": {"type": "number", "description": "Lower bound for hemoglobin level (g/dL)"},
                    "plt_level_lower_bound_10_9_l": {"type": "number", "description": "Lower bound for platelet level (* 10^9/L)"},
                    "granulocytes": {"type": "number", "description": "Granulocyte levels (* 10^9/L)"},
                    "karnofsky_performance_lower_bound": {"type": "integer", "description": "Lower bound for Karnofsky performance scale"},
                    "ecog_performance_status_upper_bound": {"type": "integer", "description": "Upper bound for ECOG / WHO performance status"}
                }
            }
        }
    }
}

def extract_criteria_with_chatgpt_structured(trial_text, prompt_template, model="gpt-4o"):
    """
    Ask the chat model to fill in `function_schema` for one trial's criteria text.

    Args:
        trial_text (str): Raw inclusion/exclusion criteria text of a single trial.
        prompt_template (str): Instructions placed before the trial text in the user message.
        model (str): Chat model name passed to the API.

    Returns:
        dict | None: The function-call arguments parsed from JSON, or None when the
        trial text is empty/non-text, the response has no function call, or the
        request/parsing raised (the error is printed, not re-raised).
    """
    # Empty CSV cells arrive from pandas as float NaN; formatting that into the
    # prompt would send the literal text "nan" and pay for a meaningless call.
    if not isinstance(trial_text, str) or not trial_text.strip():
        print("Skipping trial text: empty or non-text value")
        return None

    client = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY")
    )

    prompt = prompt_template + f"\n\n{trial_text}"

    try:
        # NOTE(review): the API reference marks `functions` / `function_call` as
        # deprecated in favour of `tools` / `tool_choice`; kept as-is here so the
        # response shape read below does not change.
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant specialized in clinical trial criteria extraction."},
                {"role": "user", "content": prompt}
            ],
            functions=[function_schema],
            # Force the model to answer through the schema rather than free text.
            function_call={"name": function_schema["name"]},
            temperature=0
        )

        function_call = response.choices[0].message.function_call

        if function_call:
            return json.loads(function_call.arguments)  # Return as a Python dictionary

        print("No function call in response")
        return None

    except Exception as e:
        # Best effort per row: report and let the caller move on to the next trial.
        print(f"Error processing trial text: {e}")
        return None

def process_clinical_trials_from_csv(input_csv, output_json, prompt_template):
    """
    Run criteria extraction over every row of a CSV and dump the results as JSON.

    The "criteria" cell of each row is handed to
    extract_criteria_with_chatgpt_structured; rows for which that call returns a
    falsy value are left out of the output.

    Args:
        input_csv (str): Path of the CSV to read; rows are read from its "criteria" column.
        output_json (str): Path the JSON list of results is written to.
        prompt_template (str): Prompt text forwarded unchanged to the extractor.

    Returns:
        list | None: One {"Trial Text", "Extracted Criteria"} dict per kept row,
        or None when any exception was raised (the error is printed).
    """
    try:
        frame = pd.read_csv(input_csv)
        collected = []

        for row_label, record in frame.iterrows():
            criteria_text = record["criteria"]
            print(f"Processing row {row_label + 1}/{len(frame)}...")

            extracted = extract_criteria_with_chatgpt_structured(criteria_text, prompt_template)
            if not extracted:
                # Nothing usable came back for this row; skip it.
                continue

            collected.append({
                "Trial Text": criteria_text,
                "Extracted Criteria": extracted,
            })

        with open(output_json, "w") as out_file:
            json.dump(collected, out_file, indent=4)

        print(f"Processing complete! Results saved to {output_json}")
        return collected

    except Exception as err:
        print(f"Error processing CSV file: {err}")
        return None

if __name__ == "__main__":
    # Lab-parameter extraction pass: reads the same CSV as the earlier cells but
    # writes to a separate "_2" file so the two result sets can be merged later.
    input_csv = "criteria_30_new.csv" 
    output_json = "extracted_criteria_30_new_2.json" 
    
    # Short prompt covering only numeric bounds; the ECOG rule spells out how a
    # strict "<n" requirement maps to an inclusive upper bound of n-1.
    prompt_template = """
        Extract the lab parameters discussed in each inclusion / exclusion criteria. Note that for upper bounds and lower bounds of lab parameters, there could have different requirements for different occasions. I want the highest / lowest as upper bound / lower bound.
        
        **ecog_performance_status_upper_bound**: the upper bound looks for the highest performance status that is acceptable. If the performance status requires "<2", then the upper bound is 1. If requires "<=2", then the upper bound is 2.

    """
    
    # Returns the list of per-trial results, or None if processing failed.
    results = process_clinical_trials_from_csv(input_csv, output_json, prompt_template)

Processing row 1/24...
Processing row 2/24...
Processing row 3/24...
Processing row 4/24...
Processing row 5/24...
Processing row 6/24...
Processing row 7/24...
Processing row 8/24...
Processing row 9/24...
Processing row 10/24...
Processing row 11/24...
Processing row 12/24...
Processing row 13/24...
Processing row 14/24...
Processing row 15/24...
Processing row 16/24...
Processing row 17/24...
Processing row 18/24...
Processing row 19/24...
Processing row 20/24...
Processing row 21/24...
Processing row 22/24...
Processing row 23/24...
Processing row 24/24...
Processing complete! Results saved to extracted_criteria_30_new_2.json


In [18]:
# merge data
import json

def merge_json_files(file1, file2, output_file):
    """
    Merge two extraction result files, matching entries on their "Trial Text".

    Entries from `file1` form the base. For an entry of `file2` with the same
    trial text, each top-level key of its "Extracted Criteria" is merged in:
    when both sides hold a dict the dicts are combined (values from `file2`
    win on conflicts); otherwise the value from `file2` replaces the old one.
    Entries of `file2` with an unseen trial text are appended as-is, and
    entries without a "Trial Text" are ignored.

    Args:
        file1 (str): Path to the base JSON file (a list of entry dicts).
        file2 (str): Path to the JSON file whose criteria are merged into the base.
        output_file (str): Path the merged list is written to.

    Returns:
        None. Errors are printed rather than raised, and no output is written
        when reading or merging fails.
    """
    try:
        with open(file1, 'r', encoding='utf-8') as f1:
            data1 = json.load(f1)

        with open(file2, 'r', encoding='utf-8') as f2:
            data2 = json.load(f2)

        merged_data_dict = {}

        for entry in data1:
            trial_text = entry.get("Trial Text")
            if trial_text:
                merged_data_dict[trial_text] = entry

        for entry in data2:
            trial_text = entry.get("Trial Text")
            if not trial_text:
                continue

            if trial_text not in merged_data_dict:
                merged_data_dict[trial_text] = entry
                continue

            base_entry = merged_data_dict[trial_text]
            base_criteria = base_entry.get("Extracted Criteria")
            if not isinstance(base_criteria, dict):
                # Base entry had no usable criteria; start from an empty mapping
                # instead of failing the whole merge with a KeyError.
                base_criteria = {}
                base_entry["Extracted Criteria"] = base_criteria

            incoming = entry.get("Extracted Criteria")
            if not isinstance(incoming, dict):
                continue

            for key, value in incoming.items():
                current = base_criteria.get(key)
                if isinstance(current, dict) and isinstance(value, dict):
                    current.update(value)
                else:
                    # Either side is not a dict (or the key is new): the value
                    # from file2 replaces whatever was there.
                    base_criteria[key] = value

        merged_data = list(merged_data_dict.values())

        with open(output_file, 'w', encoding='utf-8') as f_out:
            json.dump(merged_data, f_out, indent=4)

        print(f"Merging complete! Merged data saved to {output_file}")

    except Exception as e:
        print(f"Error merging JSON files: {e}")

if __name__ == "__main__":
    # file1 is the output of the full/categorical extraction cells, file2 the
    # output of the lab-parameter cell; entries are matched on trial text.
    file1 = "extracted_criteria_30_new.json"
    file2 = "extracted_criteria_30_new_2.json"
    output_file = "extracted_criteria_30_new_merged.json"
    
    # Prints a success or error message; returns nothing.
    merge_json_files(file1, file2, output_file)

Merging complete! Merged data saved to extracted_criteria_30_new_merged.json
