In [7]:
from openai import OpenAI
import json
import pandas as pd
import os

def _enum_field(description, choices=("Yes", "No")):
    """Return a string property restricted to *choices* (Yes/No by default)."""
    return {"type": "string", "enum": list(choices), "description": description}


def _typed_field(json_type, description):
    """Return a property of the given JSON type with no enum restriction."""
    return {"type": json_type, "description": description}


def _section(fields):
    """Wrap a mapping of property definitions in a JSON-schema object node."""
    return {"type": "object", "properties": fields}


# Function-calling schema handed to the chat completion request. It groups the
# extracted inclusion/exclusion parameters into five sections.
function_schema = {
    "name": "extract_clinical_criteria",
    "description": "Extract clinical trial inclusion/exclusion criteria based on the given parameters.",
    "parameters": _section({
        "demographics_and_general_characteristics": _section({
            "months_of_life_expectancy": _typed_field("integer", "Required months of life expectancy"),
            "follow_up_agreement": _enum_field("Whether the participant agrees to follow-up"),
            "signed_consent": _enum_field("Whether signed consent is required"),
            "protocol_compliance": _enum_field("Whether subjects are expected to comply with the research protocol"),
            "no_concurrent_participation_in_other_clinical_trials": _enum_field("Whether concurrent participation in other clinical trials is prohibited"),
        }),
        "disease_characteristics": _section({
            "confirmed_breast_cancer": _enum_field("Whether breast cancer is confirmed"),
            "confirmed_metastases_breast_cancer": _enum_field("Whether metastases in breast cancer are confirmed"),
            "confirmed_locally_recurrent_breast_cancer": _enum_field("Whether locally recurrent breast cancer is confirmed"),
            "progress_after_previous_treatment": _enum_field("Whether progress after previous treatment is observed"),
            "her2_status": _enum_field("HER2 status of the participant", ("Positive", "Negative")),
            "er_pr_status": _enum_field("ER/PR status of the participant", ("Positive", "Negative")),
        }),
        "health_and_organ_function": _section({
            # Qualitative organ-function requirements
            "sufficient_renal_function": _enum_field("Whether renal function is sufficient"),
            "sufficient_bone_marrow": _enum_field("Whether bone marrow function is sufficient"),
            "sufficient_liver_function": _enum_field("Whether liver function is sufficient"),
            "sufficient_heart_cardiovascular_function": _enum_field("Whether heart/cardiovascular function is sufficient"),
            "sufficient_lung_function": _enum_field("Whether lung function is sufficient"),
            # Numeric laboratory thresholds
            "ast_alt_upper_bound_uln": _typed_field("number", "Upper bound for AST/ALT levels (* ULN)"),
            "bilirubin_upper_bound_uln": _typed_field("number", "Upper bound for bilirubin levels (* ULN)"),
            "creatinine_upper_bound_uln": _typed_field("number", "Upper bound for creatinine levels (* ULN)"),
            "creatinine_clearance_lower_bound": _typed_field("number", "Lower bound for creatinine clearance (ml/min)"),
            "lvef_lower_bound_percent": _typed_field("number", "Lower bound for LVEF percentage"),
            "anc_level_lower_bound_10_9_l": _typed_field("number", "Lower bound for ANC level (* 10^9/L)"),
            "hgb_level_lower_bound_10_9_l": _typed_field("number", "Lower bound for hemoglobin level (* 10^9/L)"),
            "plt_level_lower_bound_10_9_l": _typed_field("number", "Lower bound for platelet level (* 10^9/L)"),
            "granulocytes": _typed_field("number", "Granulocyte levels (* 10^9/L)"),
            # Performance status and remaining eligibility flags
            "karnofsky_performance_lower_bound": _typed_field("integer", "Lower bound for Karnofsky performance scale"),
            "ecog_performance_status_upper_bound": _typed_field("integer", "Upper bound for ECOG / WHO performance status"),
            "no_active_brain_metastases": _enum_field("Whether brain metastases are absent"),
            "no_active_central_nervous_system_metastases": _enum_field("Whether central nervous system metastases are absent"),
            "no_pregnancy_nursing": _enum_field("Whether pregnancy or nursing is excluded"),
            "peripheral_neuropathy_grade_upper_bound": _typed_field("integer", "Upper bound for peripheral neuropathy grade"),
            "no_positive_hiv_status": _enum_field("Whether positive HIV status is excluded"),
            "measurable_lesion": _enum_field("Whether there exists measurable disease/tumor tissue/lesion"),
            "swallow_ability": _enum_field("Whether the participant has the ability to swallow"),
        }),
        "text_relating_to_medical_history": _section({
            "inclusion_criteria": _typed_field("string", "Relevant text relating to medical history"),
            "exclusion_criteria": _typed_field("string", "Relevant text relating to medical history"),
        }),
        "unused_text": _section({
            "inclusion_criteria": _typed_field("string", "Unused text relating to inclusion criteria"),
            "exclusion_criteria": _typed_field("string", "Unused text relating to exclusion criteria"),
        }),
    }),
}

def extract_criteria_with_chatgpt_structured(trial_text, prompt_template, model="gpt-4o"):
    """
    Ask the chat model to extract structured criteria from one trial's text.

    The prompt template and the trial text are joined with a blank line and
    sent as the user message; the model is forced to answer through the
    "extract_clinical_criteria" function described by the module-level
    function_schema.

    Args:
        trial_text: Eligibility-criteria text of a single trial
        prompt_template (str): Instruction text placed before the trial text
        model (str): Chat model name passed to the API

    Returns:
        dict | None: The parsed function-call arguments, or None when the
        response carries no function call or the request/parsing fails.
    """
    # The API key is read from the environment on every call.
    api_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    user_message = prompt_template + f"\n\n{trial_text}"
    conversation = [
        {"role": "system", "content": "You are a helpful assistant specialized in clinical trial criteria extraction."},
        {"role": "user", "content": user_message},
    ]

    try:
        # NOTE(review): `functions` / `function_call` are listed as deprecated in
        # OpenAI's API reference in favour of `tools` / `tool_choice` - confirm
        # before upgrading the SDK.
        completion = api_client.chat.completions.create(
            model=model,
            messages=conversation,
            functions=[function_schema],
            function_call={"name": "extract_clinical_criteria"},
            temperature=0.2,
        )

        requested_call = completion.choices[0].message.function_call
        if not requested_call:
            print("No function call in response")
            return None

        # The arguments arrive as a JSON string; hand back a Python dictionary.
        return json.loads(requested_call.arguments)

    except Exception as error:
        # Any API or JSON-decoding failure is reported and turned into None.
        print(f"Error processing trial text: {error}")
        return None

def process_clinical_trials_from_csv(input_csv, output_json, prompt_template):
    """
    Process clinical trials from a CSV file and save extracted criteria to JSON.

    Each row's "criteria" text is passed to
    extract_criteria_with_chatgpt_structured. Rows whose text is missing or
    blank are skipped without an API call, and rows for which nothing was
    extracted are reported and left out of the output.

    Args:
        input_csv (str): Path to input CSV file (read from its "criteria" column)
        output_json (str): Path for output JSON file
        prompt_template (str): Template for GPT prompt

    Returns:
        list[dict] | None: One {"Trial Text", "Extracted Criteria"} entry per
        row with a non-empty extraction, or None if reading, processing or
        writing raised an exception.
    """
    try:
        df = pd.read_csv(input_csv)
        total_rows = len(df)
        results = []

        for index, row in df.iterrows():
            trial_text = row["criteria"]

            # Empty CSV cells are read as NaN; formatted into the prompt they
            # would reach the model as the literal text "nan".
            if pd.isna(trial_text) or not str(trial_text).strip():
                print(f"Skipping row {index + 1}/{total_rows}: no criteria text")
                continue

            print(f"Processing row {index + 1}/{total_rows}...")

            structured_output = extract_criteria_with_chatgpt_structured(trial_text, prompt_template)

            if structured_output:
                results.append({
                    "Trial Text": trial_text,
                    "Extracted Criteria": structured_output
                })
            else:
                # Make dropped rows visible so the output can be reconciled
                # with the input afterwards.
                print(f"No criteria extracted for row {index + 1}/{total_rows}; row omitted from output")

        with open(output_json, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=4)

        print(f"Processing complete! Results saved to {output_json}")
        return results

    except Exception as e:
        # Top-level boundary of the batch run: report the failure and signal it with None.
        print(f"Error processing CSV file: {e}")
        return None

if __name__ == "__main__":
    # Instruction text placed ahead of every trial's criteria. The bracketed
    # values list the answers the model is asked to choose from.
    prompt_template = """
        Extract the inclusion/exclusion criteria from the provided clinical trial text based on the following parameters. For each parameter, output only the key values explicitly stated in the bracket. If a parameter is implicitly mentioned, make predictions based on your best judgement while ensuring alignment with the context. If a parameter is not mentioned or cannot be reasonably inferred, leave it blank. Carefully handle statements involving negations (e.g., “not excluded,” “absence of,” “no contraindications”) to avoid misclassifying double negatives. Cross-reference inclusion and exclusion criteria to ensure consistency. For example:If “no pregnancy or nursing” appears in exclusion criteria, classify it as “no_pregnancy_nursing = Yes” under the appropriate parameter. Avoid assuming a value based on a negation unless it is explicitly supported by the text. Pay close attention to sentences that list multiple criteria in a single statement and ensure that all relevant parameters are extracted. Do not include any descriptive words or explanations after each parameter value. Any text relating to requirements on subjects’ medical history and cannot be classified in the structured parameters should be listed under “Text relating to Medical History”. Any unused or unmatched text should be listed separately under “Unused Text”, labeling as “Inclusion Criteria” or “Exclusion Criteria”. 
 
        Parameters:
        Demographics and General Characteristics
        Months of Life Expectancy: [1 – 99]
        Follow-up Agreement: [Yes, No]
        Signed Consent: [Yes, No]
        Protocol Compliance: [Yes, No]
        No concurrent participation in other clinical trials: [Yes, No]
 
        Disease Characteristics
        Confirmed breast cancer / mammary carcinoma: [Yes, No]
        Confirmed metastases breast cancer: [Yes, No]
        Confirmed locally recurrent breast cancer: [Yes, No]
        Progress after previous treatment: [Yes, No]
        HER-2 Status: [Positive, Negative]
        ER/PR Status: [Positive, Negative]
        
        Health and Organ Function
        Sufficient renal function: [Yes, No]
        Sufficient bone marrow: [Yes, No]
        Sufficient liver function: [Yes, No]
        Sufficient heart/cardiovascular function: [Yes, No]
        Sufficient lung function: [Yes, No]
        AST/ALT upper bound (* ULN): [1 - 100]
        Bilirubin upper bound (* ULN): [0 - 100]
        LVEF lower bound (%): [0-100]
        creatinine clearance lower bound: [0 - 100]
        creatinine upper bound (*ULN): [0 - 100]
        ANC Level lower bound (*10^9/L): [0 - 10000]
        Hgb Level lower bound (*10^9/L): [0 – 100]
        plt Level lower bound (*10^9/L): [0 – 1000]
        granulocytes: [1-10]
        Karnofsky performance lower bound: [0 – 100]
        ECOG / WHO performance status upper bound: [0 – 5]
        No active brain Metastases: [Yes, No]
        No active central nervous system metastases: [Yes, No]
        No Pregnancy/Nursing: [Yes, No]
        peripheral Neuropathy Grade upper bound: [1 – 4]
        No positive HIV status: [Yes, No]
        Accessible Measurable Lesion: [Yes, No]
        Swallow Ability: [Yes, No]

    Text relating to Medical History
    [include but not limited to historical treatments, investigational drug use, historical allergies, current treatment, current disease]

        Unused Text
        
    """

    # Source CSV and the JSON file that receives the extraction results.
    input_csv = "criteria_30.csv"
    output_json = "extracted_criteria_30.json"

    results = process_clinical_trials_from_csv(
        input_csv=input_csv,
        output_json=output_json,
        prompt_template=prompt_template,
    )

Processing row 1/24...
Processing row 2/24...
Processing row 3/24...
Processing row 4/24...
Processing row 5/24...
Processing row 6/24...
Processing row 7/24...
Processing row 8/24...
Processing row 9/24...
Processing row 10/24...
Processing row 11/24...
Processing row 12/24...
Processing row 13/24...
Processing row 14/24...
Processing row 15/24...
Processing row 16/24...
Processing row 17/24...
Processing row 18/24...
Processing row 19/24...
Processing row 20/24...
Processing row 21/24...
Processing row 22/24...
Processing row 23/24...
Processing row 24/24...
Processing complete! Results saved to extracted_criteria_30.json


In [4]:
# Data Processing
import json
import pandas as pd


# Read back the extraction results saved as JSON.
file_path = "extracted_criteria_30.json"
with open(file_path, "r") as handle:
    data = json.load(handle)

# Flatten the nested records into one column per field, joining the
# nesting levels with "_"
df = pd.json_normalize(data, sep="_")

# Show the first rows to inspect the resulting structure
preview = df.head()
print(preview)

                                          Trial Text  \
0  Inclusion Criteria:\n\nPatients with a history...   
1  SELECTION OF PATIENTS (MOST IMPORTANT CRITERIA...   
2  Inclusion Criteria:\n\nHistologically confirme...   
3  Inclusion Criteria:\n\nStep 1 subjects only: m...   
4  Inclusion Criteria:\n\nPatients diagnosed with...   

  Extracted Criteria_demographics_and_general_characteristics_minimum_age  \
0                                                 18                        
1                                                 18                        
2                                                NaN                        
3                                                 18                        
4                                                 18                        

  Extracted Criteria_demographics_and_general_characteristics_signed_consent  \
0                                                Yes                           
1                                       

In [5]:
# List the column labels of df to see which flattened fields are present.
print(df.columns)

Index(['Trial Text',
       'Extracted Criteria_demographics_and_general_characteristics_minimum_age',
       'Extracted Criteria_demographics_and_general_characteristics_signed_consent',
       'Extracted Criteria_demographics_and_general_characteristics_protocol_compliance',
       'Extracted Criteria_disease_characteristics_confirmed_breast_cancer',
       'Extracted Criteria_disease_characteristics_confirmed_metastases_breast_cancer',
       'Extracted Criteria_disease_characteristics_confirmed_locally_recurrent_breast_cancer',
       'Extracted Criteria_disease_characteristics_her2_status',
       'Extracted Criteria_health_and_organ_function_no_pregnancy_nursing',
       'Extracted Criteria_health_and_organ_function_ecog_performance_status_upper_bound',
       'Extracted Criteria_text_relating_to_medical_history_inclusion_criteria',
       'Extracted Criteria_text_relating_to_medical_history_exclusion_criteria',
       'Extracted Criteria_unused_text_exclusion_criteria',
       '

In [13]:
# Summary statistics for the extracted minimum-age column.
age_column = 'Extracted Criteria_demographics_and_general_characteristics_minimum_age'
ages = df[age_column]
age_summary = ages.describe()
print(age_summary)

count     26
unique     3
top       18
freq      23
Name: Extracted Criteria_demographics_and_general_characteristics_minimum_age, dtype: int64


In [12]:
# Frequency of each extracted HER2 status value.
her2_column = 'Extracted Criteria_disease_characteristics_her2_status'
her2_values = df[her2_column]
her2_status = her2_values.value_counts()
print(her2_status)

Extracted Criteria_disease_characteristics_her2_status
Negative       14
Positive        4
Inferred        1
Unspecified     1
Name: count, dtype: int64


In [3]:
## extract "inclusion criteria"

from openai import OpenAI
import json
import pandas as pd
import os

# Function-calling schema for the inclusion-criteria extraction, passed to the
# chat completion request as `functions=[function_schema]`.
#
# Enum vocabularies used below (spelled identically everywhere so that the
# extracted values group cleanly when counted):
#   "Required" / "Not required"  - something the participant must have or do
#   "Allowed" / "Not allowed"    - a condition the participant may or may not have
#   "Yes" / "No", "Positive" / "Negative" - disease facts and receptor status
function_schema = {
    "name": "extract_clinical_criteria",
    "description": "Extract clinical trial inclusion criteria based on the given parameters.",
    "parameters": {
        "type": "object",
        "properties": {
            "demographics_and_general_characteristics": {
                "type": "object",
                "properties": {
                    "months_of_life_expectancy": {"type": "integer", "description": "Required months of life expectancy"},
                    "follow_up_agreement": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether the participant agrees to follow-up"},
                    "signed_consent": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether signed consent is required"},
                    "protocol_compliance": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether subjects are expected to comply with the research protocol"},
                    "no_concurrent_participation_in_other_clinical_trials": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether concurrent participation in other clinical trials is prohibited"}
                }
            },
            "disease_characteristics": {
                "type": "object",
                "properties": {
                    "confirmed_breast_cancer": {"type": "string", "enum": ["Yes", "No"], "description": "Whether breast cancer is confirmed"},
                    "confirmed_metastases_breast_cancer": {"type": "string", "enum": ["Yes", "No"], "description": "Whether metastases in breast cancer are confirmed"},
                    "confirmed_locally_recurrent_breast_cancer": {"type": "string", "enum": ["Yes", "No"], "description": "Whether locally recurrent breast cancer is confirmed"},
                    "progress_after_previous_treatment": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether progress after previous treatment is observed"},
                    "her2_status": {"type": "string", "enum": ["Positive", "Negative"], "description": "HER2 status of the participant"},
                    "er_pr_status": {"type": "string", "enum": ["Positive", "Negative"], "description": "ER/PR status of the participant"}
                }
            },
            "health_and_organ_function": {
                "type": "object",
                "properties": {
                    "sufficient_renal_function": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether renal function is sufficient"},
                    "sufficient_bone_marrow": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether bone marrow function is sufficient"},
                    "sufficient_liver_function": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether liver function is sufficient"},
                    "sufficient_heart_cardiovascular_function": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether heart/cardiovascular function is sufficient"},
                    "sufficient_lung_function": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether lung function is sufficient"},
                    "ast_alt_upper_bound_uln": {"type": "number", "description": "Upper bound for AST/ALT levels (* ULN)"},
                    "bilirubin_upper_bound_uln": {"type": "number", "description": "Upper bound for bilirubin levels (* ULN)"},
                    "creatinine_upper_bound_uln": {"type": "number", "description": "Upper bound for creatinine levels (* ULN)"},
                    "creatinine_clearance_lower_bound": {"type": "number", "description": "Lower bound for creatinine clearance (ml/min)"},
                    "lvef_lower_bound_percent": {"type": "number", "description": "Lower bound for LVEF percentage"},
                    "anc_level_lower_bound_10_9_l": {"type": "number", "description": "Lower bound for ANC level (* 10^9/L)"},
                    "hgb_level_lower_bound_10_9_l": {"type": "number", "description": "Lower bound for hemoglobin level (* 10^9/L)"},
                    "plt_level_lower_bound_10_9_l": {"type": "number", "description": "Lower bound for platelet level (* 10^9/L)"},
                    "granulocytes": {"type": "number", "description": "Granulocyte levels (* 10^9/L)"},
                    "karnofsky_performance_lower_bound": {"type": "integer", "description": "Lower bound for Karnofsky performance scale"},
                    "ecog_performance_status_upper_bound": {"type": "integer", "description": "Upper bound for ECOG / WHO performance status"},
                    # The four keys below are phrased positively (no "no_" prefix), so their
                    # descriptions ask whether the condition is allowed rather than absent/excluded.
                    "active_brain_metastases": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether participants with active brain metastases are allowed"},
                    "active_central_nervous_system_metastases": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether participants with active central nervous system metastases are allowed"},
                    "pregnancy_nursing": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether pregnant or nursing participants are allowed"},
                    "peripheral_neuropathy_grade_upper_bound": {"type": "integer", "description": "Upper bound for peripheral neuropathy grade"},
                    "positive_hiv_status": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether participants with positive HIV status are allowed"},
                    "measurable_lesion": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether there exists measurable disease/tumor tissue/lesion"},
                    "swallow_ability": {"type": "string", "enum": ["Required", "Not required"], "description": "Whether the participant has the ability to swallow"}
                }
            },
            "text_relating_to_medical_history": {
                "type": "object",
                "properties": {
                    "inclusion_criteria": {"type": "string", "description": "Relevant text relating to medical history"}
                }
            },
            "unused_text": {
                "type": "object",
                "properties": {
                    "inclusion_criteria": {"type": "string", "description": "Unused text relating to inclusion criteria"}
                }
            }
        }
    }
}

def extract_criteria_with_chatgpt_structured(trial_text, prompt_template, model="gpt-4o"):
    """
    Ask the chat model to extract structured criteria from one trial's text.

    The prompt template and the trial text are joined with a blank line and
    sent as the user message; the model is forced to answer through the
    "extract_clinical_criteria" function described by the module-level
    function_schema.

    Args:
        trial_text: Eligibility-criteria text of a single trial
        prompt_template (str): Instruction text placed before the trial text
        model (str): Chat model name passed to the API

    Returns:
        dict | None: The parsed function-call arguments, or None when the
        response carries no function call or the request/parsing fails.
    """
    # The API key is read from the environment on every call.
    api_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    user_message = prompt_template + f"\n\n{trial_text}"
    conversation = [
        {"role": "system", "content": "You are a helpful assistant specialized in clinical trial criteria extraction."},
        {"role": "user", "content": user_message},
    ]

    try:
        # NOTE(review): `functions` / `function_call` are listed as deprecated in
        # OpenAI's API reference in favour of `tools` / `tool_choice` - confirm
        # before upgrading the SDK.
        completion = api_client.chat.completions.create(
            model=model,
            messages=conversation,
            functions=[function_schema],
            function_call={"name": "extract_clinical_criteria"},
            temperature=0.2,
        )

        requested_call = completion.choices[0].message.function_call
        if not requested_call:
            print("No function call in response")
            return None

        # The arguments arrive as a JSON string; hand back a Python dictionary.
        return json.loads(requested_call.arguments)

    except Exception as error:
        # Any API or JSON-decoding failure is reported and turned into None.
        print(f"Error processing trial text: {error}")
        return None

def process_clinical_trials_from_csv(input_csv, output_json, prompt_template):
    """
    Process clinical trials from a CSV file and save extracted criteria to JSON.

    Each row's "inclusion_criteria" text is passed to
    extract_criteria_with_chatgpt_structured. Rows whose text is missing or
    blank are skipped without an API call, and rows for which nothing was
    extracted are reported and left out of the output.

    Args:
        input_csv (str): Path to input CSV file (read from its "inclusion_criteria" column)
        output_json (str): Path for output JSON file
        prompt_template (str): Template for GPT prompt

    Returns:
        list[dict] | None: One {"Trial Text", "Extracted Criteria"} entry per
        row with a non-empty extraction, or None if reading, processing or
        writing raised an exception.
    """
    try:
        df = pd.read_csv(input_csv)
        total_rows = len(df)
        results = []

        for index, row in df.iterrows():
            trial_text = row["inclusion_criteria"]

            # Empty CSV cells are read as NaN; formatted into the prompt they
            # would reach the model as the literal text "nan".
            if pd.isna(trial_text) or not str(trial_text).strip():
                print(f"Skipping row {index + 1}/{total_rows}: no inclusion criteria text")
                continue

            print(f"Processing row {index + 1}/{total_rows} of inclusion criteria...")

            structured_output = extract_criteria_with_chatgpt_structured(trial_text, prompt_template)

            if structured_output:
                results.append({
                    "Trial Text": trial_text,
                    "Extracted Criteria": structured_output
                })
            else:
                # Make dropped rows visible so the output can be reconciled
                # with the input afterwards.
                print(f"No criteria extracted for row {index + 1}/{total_rows}; row omitted from output")

        with open(output_json, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=4)

        print(f"Processing complete! Results saved to {output_json}")
        return results

    except Exception as e:
        # Top-level boundary of the batch run: report the failure and signal it with None.
        print(f"Error processing CSV file: {e}")
        return None

if __name__ == "__main__":
    # Source CSV and the JSON file that receives the inclusion-criteria results.
    input_csv = "criteria_30.csv" 
    output_json = "extracted_inclusion_criteria_30.json" 
    
    # Instruction text placed ahead of every trial's inclusion criteria.
    # The prompt tells the model to answer only with the bracketed values, so
    # each bracket and label mirrors the enum vocabulary of function_schema
    # (Required / Not required, Allowed / Not allowed, Yes / No,
    # Positive / Negative) instead of contradicting it.
    prompt_template = """
        Extract the inclusion criteria from the provided clinical trial text based on the following parameters. For each parameter, output only the key values explicitly stated in the bracket. If a parameter is implicitly mentioned, make predictions based on your best judgement while ensuring alignment with the context. If a parameter is not mentioned or cannot be reasonably inferred, leave it blank. Carefully handle statements involving negations (e.g., “not excluded,” “absence of,” “no contraindications”) to avoid misclassifying double negatives. Avoid assuming a value based on a negation unless it is explicitly supported by the text. Pay close attention to sentences that list multiple criteria in a single statement and ensure that all relevant parameters are extracted. Do not include any descriptive words or explanations after each parameter value. Any text relating to requirements on subjects’ medical history and cannot be classified in the structured parameters should be listed under “Text relating to Medical History”, labeling as “Inclusion Criteria”. Any unused or unmatched text should be listed separately under “Unused Text”, labeling as “Inclusion Criteria”. 
 
        Parameters:
        Demographics and General Characteristics
        Months of Life Expectancy: [1 – 99]
        Follow-up Agreement: [Required, Not required]
        Signed Consent: [Required, Not required]
        Protocol Compliance: [Required, Not required]
        No concurrent participation in other clinical trials: [Required, Not required]
 
        Disease Characteristics
        Confirmed breast cancer / mammary carcinoma: [Yes, No]
        Confirmed metastases breast cancer: [Yes, No]
        Confirmed locally recurrent breast cancer: [Yes, No]
        Progress after previous treatment: [Allowed, Not allowed]
        HER-2 Status: [Positive, Negative]
        ER/PR Status: [Positive, Negative]
        
        Health and Organ Function
        Sufficient renal function: [Required, Not required]
        Sufficient bone marrow: [Required, Not required]
        Sufficient liver function: [Required, Not required]
        Sufficient heart/cardiovascular function: [Required, Not required]
        Sufficient lung function: [Required, Not required]
        AST/ALT upper bound (* ULN): [1 - 100]
        Bilirubin upper bound (* ULN): [0 - 100]
        LVEF lower bound (%): [0-100]
        creatinine clearance lower bound: [0 - 100]
        creatinine upper bound (*ULN): [0 - 100]
        ANC Level lower bound (*10^9/L): [0 - 10000]
        Hgb Level lower bound (*10^9/L): [0 – 100]
        plt Level lower bound (*10^9/L): [0 – 1000]
        granulocytes: [1-10]
        Karnofsky performance lower bound: [0 – 100]
        ECOG / WHO performance status upper bound: [0 – 5]
        Active brain Metastases: [Allowed, Not allowed]
        Active central nervous system metastases: [Allowed, Not allowed]
        Pregnancy/Nursing: [Allowed, Not allowed]
        peripheral Neuropathy Grade upper bound: [1 – 4]
        Positive HIV status: [Allowed, Not allowed]
        Accessible Measurable Lesion: [Required, Not required]
        Swallow Ability: [Required, Not required]

        Text relating to Medical History
        [include but not limited to historical treatments, investigational drug use, historical allergies, current treatment, current disease]

        Unused Text
        
    """
    
    results = process_clinical_trials_from_csv(input_csv, output_json, prompt_template)

Processing row 1/24 of inclusion criteria...
Processing row 2/24 of inclusion criteria...
Processing row 3/24 of inclusion criteria...
Processing row 4/24 of inclusion criteria...
Processing row 5/24 of inclusion criteria...
Processing row 6/24 of inclusion criteria...
Processing row 7/24 of inclusion criteria...
Processing row 8/24 of inclusion criteria...
Processing row 9/24 of inclusion criteria...
Processing row 10/24 of inclusion criteria...
Processing row 11/24 of inclusion criteria...
Processing row 12/24 of inclusion criteria...
Processing row 13/24 of inclusion criteria...
Processing row 14/24 of inclusion criteria...
Processing row 15/24 of inclusion criteria...
Processing row 16/24 of inclusion criteria...
Processing row 17/24 of inclusion criteria...
Processing row 18/24 of inclusion criteria...
Processing row 19/24 of inclusion criteria...
Processing row 20/24 of inclusion criteria...
Processing row 21/24 of inclusion criteria...
Processing row 22/24 of inclusion criteria.

In [4]:
## extract "exclusion criteria"

from openai import OpenAI
import json
import pandas as pd
import os

# Function-calling schema handed to the chat completion request for the
# exclusion-criteria pass. Every property is optional, so the model may return
# any subset of them (including none at all).
#
# NOTE: the free-text fields below are deliberately still keyed
# "inclusion_criteria" even though this cell processes exclusion text: the
# merge step reads the exclusion results back through that key. Only the
# human-readable description is specific to exclusion criteria.
function_schema = {
    "name": "extract_clinical_criteria",
    "description": "Extract clinical trial exclusion criteria based on the given parameters.",
    "parameters": {
        "type": "object",
        "properties": {
            # Administrative / consent-related requirements.
            "demographics_and_general_characteristics": {
                "type": "object",
                "properties": {
                    "months_of_life_expectancy": {"type": "integer", "description": "Required months of life expectancy"},
                    "follow_up_agreement": {"type": "string", "enum": ["Yes", "No"], "description": "Whether the participant agrees to follow-up"},
                    "signed_consent": {"type": "string", "enum": ["Yes", "No"], "description": "Whether signed consent is required"},
                    "protocol_compliance": {"type": "string", "enum": ["Yes", "No"], "description": "Whether subjects are expected to comply with the research protocol"},
                    "no_concurrent_participation_in_other_clinical_trials": {"type": "string", "enum": ["Yes", "No"], "description": "Whether concurrent participation in other clinical trials is prohibited"}
                }
            },
            # Diagnosis and receptor status.
            "disease_characteristics": {
                "type": "object",
                "properties": {
                    "confirmed_breast_cancer": {"type": "string", "enum": ["Yes", "No"], "description": "Whether breast cancer is confirmed"},
                    "confirmed_metastases_breast_cancer": {"type": "string", "enum": ["Yes", "No"], "description": "Whether metastases in breast cancer are confirmed"},
                    "confirmed_locally_recurrent_breast_cancer": {"type": "string", "enum": ["Yes", "No"], "description": "Whether locally recurrent breast cancer is confirmed"},
                    "progress_after_previous_treatment": {"type": "string", "enum": ["Yes", "No"], "description": "Whether progress after previous treatment is observed"},
                    "her2_status": {"type": "string", "enum": ["Positive", "Negative"], "description": "HER2 status of the participant"},
                    "er_pr_status": {"type": "string", "enum": ["Positive", "Negative"], "description": "ER/PR status of the participant"}
                }
            },
            # Organ-function flags, laboratory thresholds and performance status.
            "health_and_organ_function": {
                "type": "object",
                "properties": {
                    "sufficient_renal_function": {"type": "string", "enum": ["Yes", "No"], "description": "Whether renal function is sufficient"},
                    "sufficient_bone_marrow": {"type": "string", "enum": ["Yes", "No"], "description": "Whether bone marrow function is sufficient"},
                    "sufficient_liver_function": {"type": "string", "enum": ["Yes", "No"], "description": "Whether liver function is sufficient"},
                    "sufficient_heart_cardiovascular_function": {"type": "string", "enum": ["Yes", "No"], "description": "Whether heart/cardiovascular function is sufficient"},
                    "sufficient_lung_function": {"type": "string", "enum": ["Yes", "No"], "description": "Whether lung function is sufficient"},
                    "ast_alt_upper_bound_uln": {"type": "number", "description": "Upper bound for AST/ALT levels (* ULN)"},
                    "bilirubin_upper_bound_uln": {"type": "number", "description": "Upper bound for bilirubin levels (* ULN)"},
                    "creatinine_upper_bound_uln": {"type": "number", "description": "Upper bound for creatinine levels (* ULN)"},
                    "creatinine_clearance_lower_bound": {"type": "number", "description": "Lower bound for creatinine clearance (ml/min)"},
                    "lvef_lower_bound_percent": {"type": "number", "description": "Lower bound for LVEF percentage"},
                    "anc_level_lower_bound_10_9_l": {"type": "number", "description": "Lower bound for ANC level (* 10^9/L)"},
                    "hgb_level_lower_bound_10_9_l": {"type": "number", "description": "Lower bound for hemoglobin level (* 10^9/L)"},
                    "plt_level_lower_bound_10_9_l": {"type": "number", "description": "Lower bound for platelet level (* 10^9/L)"},
                    "granulocytes": {"type": "number", "description": "Granulocyte levels (* 10^9/L)"},
                    "karnofsky_performance_lower_bound": {"type": "integer", "description": "Lower bound for Karnofsky performance scale"},
                    "ecog_performance_status_upper_bound": {"type": "integer", "description": "Upper bound for ECOG / WHO performance status"},
                    "no_active_brain_metastases": {"type": "string", "enum": ["Yes", "No"], "description": "Whether brain metastases are absent"},
                    "no_active_central_nervous_system_metastases": {"type": "string", "enum": ["Yes", "No"], "description": "Whether central nervous system metastases are absent"},
                    "pregnancy_nursing": {"type": "string", "enum": ["Allowed", "Not allowed"], "description": "Whether pregnancy or nursing is excluded"},
                    "peripheral_neuropathy_grade_upper_bound": {"type": "integer", "description": "Upper bound for peripheral neuropathy grade"},
                    "no_positive_hiv_status": {"type": "string", "enum": ["Yes", "No"], "description": "Whether positive HIV status is excluded"},
                    "measurable_lesion": {"type": "string", "enum": ["Yes", "No"], "description": "Whether there exists measurable disease/tumor tissue/lesion"},
                    "swallow_ability": {"type": "string", "enum": ["Yes", "No"], "description": "Whether the participant has the ability to swallow"}
                }
            },
            # Free text about medical history that fits no structured field.
            "text_relating_to_medical_history": {
                "type": "object",
                "properties": {
                    "inclusion_criteria": {"type": "string", "description": "Relevant text relating to medical history"}
                }
            },
            # Leftover text that matched nothing above. The description was
            # copied from the inclusion pass and wrongly said "inclusion
            # criteria"; it now matches the exclusion prompt.
            "unused_text": {
                "type": "object",
                "properties": {
                    "inclusion_criteria": {"type": "string", "description": "Unused text relating to exclusion criteria"}
                }
            }
        }
    }
}

def extract_criteria_with_chatgpt_structured(trial_text, prompt_template, model="gpt-4o"):
    """
    Ask the chat model to fill in `function_schema` for one block of trial text.

    Args:
        trial_text: Criteria text for a single trial; appended to the prompt.
        prompt_template (str): Instruction text placed before the trial text.
        model (str): Name of the chat model to call.

    Returns:
        dict | None: The decoded function-call arguments, or None when the
        response carries no function call or when the request / decoding
        raises (the error is printed, not re-raised).
    """
    # The API key is read from the environment on every call.
    api_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    user_prompt = prompt_template + f"\n\n{trial_text}"
    conversation = [
        {"role": "system", "content": "You are a helpful assistant specialized in clinical trial criteria extraction."},
        {"role": "user", "content": user_prompt},
    ]

    try:
        # Force the model to answer through the extraction function.
        completion = api_client.chat.completions.create(
            model=model,
            messages=conversation,
            functions=[function_schema],
            function_call={"name": "extract_clinical_criteria"},
            temperature=0.2,
        )

        call = completion.choices[0].message.function_call
        if not call:
            print("No function call in response")
            return None

        # The arguments arrive as a JSON string; hand back a Python dict.
        return json.loads(call.arguments)

    except Exception as err:
        print(f"Error processing trial text: {err}")
        return None

def process_clinical_trials_from_csv(input_csv, output_json, prompt_template):
    """
    Process clinical trials from a CSV file and save extracted criteria to JSON.

    Each row's "exclusion_criteria" cell is sent through
    `extract_criteria_with_chatgpt_structured`; every row for which the
    extraction returned a dict is written to `output_json` as
    {"Trial Text": ..., "Extracted Criteria": ...}.

    Args:
        input_csv (str): Path to input CSV file
        output_json (str): Path for output JSON file
        prompt_template (str): Template for GPT prompt

    Returns:
        list | None: The list of result entries that was written, or None if
        reading the CSV, processing, or writing the JSON raised (the error is
        printed, not re-raised).
    """
    try:
        df = pd.read_csv(input_csv)
        results = []
        total_rows = len(df)

        # Count rows by position rather than by index label so the progress
        # message stays correct whatever index the frame carries.
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            trial_text = row["exclusion_criteria"]
            print(f"Processing row {position}/{total_rows} of exclusion criteria...")

            structured_output = extract_criteria_with_chatgpt_structured(trial_text, prompt_template)

            # Only None signals a failed extraction. An empty dict is a valid
            # answer ("nothing matched") and must be kept: dropping it would
            # shorten this file relative to the inclusion pass and break the
            # positional pairing done by the merge step.
            # NOTE: rows whose extraction failed are still skipped, so the
            # output can be shorter than the CSV when errors were printed.
            if structured_output is not None:
                results.append({
                    "Trial Text": trial_text,
                    "Extracted Criteria": structured_output
                })

        with open(output_json, "w") as f:
            json.dump(results, f, indent=4)

        print(f"Processing complete! Results saved to {output_json}")
        return results

    except Exception as e:
        print(f"Error processing CSV file: {e}")
        return None

# Driver for the exclusion-criteria pass: sets the input/output paths, defines
# the extraction prompt, and runs the CSV processing.
if __name__ == "__main__":
    # CSV to read; process_clinical_trials_from_csv reads its
    # "exclusion_criteria" column.
    input_csv = "criteria_30.csv" 
    # JSON file the extracted results are written to.
    output_json = "extracted_exclusion_criteria_30.json" 
    
    # Instruction text sent ahead of each trial's criteria text. It lists the
    # parameters to extract with their allowed values, and tells the model how
    # to treat negations in exclusion wording.
    prompt_template = """
        Extract the exclusion criteria from the provided clinical trial text based on the following parameters. For each parameter, output only the key values explicitly stated in the bracket. If a parameter is implicitly mentioned, make predictions based on your best judgement while ensuring alignment with the context. If a parameter is not mentioned or cannot be reasonably inferred, leave it blank. Since you are processing exclusion criteria, carefully handle statements involving negations (e.g., "insufficient", “not excluded,” “absence of,” “no contraindications”) to avoid misclassifying double negatives. For instance, "insufficient renal functions" in exclusion criteria will lead to "Yes" in "Sufficient renal functions" parameter. Avoid assuming a value based on a negation unless it is explicitly supported by the text. Pay close attention to sentences that list multiple criteria in a single statement and ensure that all relevant parameters are extracted. Do not include any descriptive words or explanations after each parameter value. Any text relating to requirements on subjects’ medical history and cannot be classified in the structured parameters should be listed under “Text relating to Medical History”, labeling as “Exclusion Criteria”. Any unused or unmatched text should be listed separately under “Unused Text”, labeling as “Exclusion Criteria”. 
 
        Parameters:
        Demographics and General Characteristics
        Months of Life Expectancy: [1 – 99]
        Follow-up Agreement: [Yes, No]
        Signed Consent: [Yes, No]
        Protocol Compliance: [Yes, No]
        No concurrent participation in other clinical trials: [Yes, No]
 
        Disease Characteristics
        Confirmed breast cancer / mammary carcinoma: [Yes, No]
        Confirmed metastases breast cancer: [Yes, No]
        Confirmed locally recurrent breast cancer: [Yes, No]
        Progress after previous treatment: [Yes, No]
        HER-2 Status: [Positive, Negative]
        ER/PR Status: [Positive, Negative]
        
        Health and Organ Function
        Sufficient renal function: [Yes, No]
        Sufficient bone marrow: [Yes, No]
        Sufficient liver function: [Yes, No]
        Sufficient heart/cardiovascular function: [Yes, No]
        Sufficient lung function: [Yes, No]
        AST/ALT upper bound (* ULN): [1 - 100]
        Bilirubin upper bound (* ULN): [0 - 100]
        LVEF lower bound (%): [0-100]
        creatinine clearance lower bound: [0 - 100]
        creatinine upper bound (*ULN): [0 - 100]
        ANC Level lower bound (*10^9/L): [0 - 10000]
        Hgb Level lower bound (*10^9/L): [0 – 100]
        plt Level lower bound (*10^9/L): [0 – 1000]
        granulocytes: [1-10]
        Karnofsky performance lower bound: [0 – 100]
        ECOG / WHO performance status upper bound: [0 – 5]
        No active brain Metastases: [Yes, No]
        No active central nervous system metastases: [Yes, No]
        Pregnancy/Nursing: [Allowed, Not allowed]
        peripheral Neuropathy Grade upper bound: [1 – 4]
        No positive HIV status: [Yes, No]
        Accessible Measurable Lesion: [Yes, No]
        Swallow Ability: [Yes, No]

        Text relating to Medical History
        [include but not limited to historical treatments, investigational drug use, historical allergies, current treatment, current disease]

        Unused Text
        
    """
    
    # Run the extraction; `results` is the list of written entries, or None if
    # processing failed.
    results = process_clinical_trials_from_csv(input_csv, output_json, prompt_template)

Processing row 1/24 of exclusion criteria...
Processing row 2/24 of exclusion criteria...
Processing row 3/24 of exclusion criteria...
Processing row 4/24 of exclusion criteria...
Processing row 5/24 of exclusion criteria...
Processing row 6/24 of exclusion criteria...
Processing row 7/24 of exclusion criteria...
Processing row 8/24 of exclusion criteria...
Processing row 9/24 of exclusion criteria...
Processing row 10/24 of exclusion criteria...
Processing row 11/24 of exclusion criteria...
Processing row 12/24 of exclusion criteria...
Processing row 13/24 of exclusion criteria...
Processing row 14/24 of exclusion criteria...
Processing row 15/24 of exclusion criteria...
Processing row 16/24 of exclusion criteria...
Processing row 17/24 of exclusion criteria...
Processing row 18/24 of exclusion criteria...
Processing row 19/24 of exclusion criteria...
Processing row 20/24 of exclusion criteria...
Processing row 21/24 of exclusion criteria...
Processing row 22/24 of exclusion criteria.

In [6]:
import json

# Where the two per-pass extraction outputs live, and where the combined
# result will be written (adjust to your layout).
inclusion_json_path = "extracted_inclusion_criteria_30.json"
exclusion_json_path = "extracted_exclusion_criteria_30.json"
merged_json_path = "merged_criteria.json"

# Read both extraction outputs into memory.
with open(inclusion_json_path, "r") as inclusion_file:
    inclusion_data = json.load(inclusion_file)

with open(exclusion_json_path, "r") as exclusion_file:
    exclusion_data = json.load(exclusion_file)

# Entries are paired by position later on, so the two lists must line up.
inclusion_count, exclusion_count = len(inclusion_data), len(exclusion_data)
if not inclusion_count == exclusion_count:
    raise ValueError("The two JSON files must have the same number of entries.")

# Function to merge inclusion and exclusion results into a single structure
def merge_criteria(inclusion, exclusion):
    """
    Merge the extracted inclusion and exclusion criteria of one trial.

    Categories are emitted in a deterministic order: every inclusion category
    in its original order, followed by categories that appear only in the
    exclusion result. (Iterating over a set union, as before, made the key
    order depend on string hash randomisation, so the saved JSON differed
    from run to run.)

    Args:
        inclusion (dict): Category -> {parameter: value} from the inclusion pass.
        exclusion (dict): Category -> {parameter: value} from the exclusion pass.

    Returns:
        dict: Category -> merged parameters. When both passes set the same
        parameter, the exclusion value wins. The two free-text categories
        ("text_relating_to_medical_history", "unused_text") instead keep both
        texts side by side under "inclusion_criteria" / "exclusion_criteria".
    """
    merged = {}

    # dict.fromkeys de-duplicates while preserving first-seen order.
    for category in dict.fromkeys([*inclusion, *exclusion]):
        combined = {}
        # Inclusion first, then exclusion, so exclusion overrides on clashes.
        combined.update(inclusion.get(category, {}))
        combined.update(exclusion.get(category, {}))
        merged[category] = combined

    # Free-text categories: a plain update would let the exclusion text
    # overwrite the inclusion text, because both passes store it under the
    # same "inclusion_criteria" key. Keep the two texts separately instead.
    for text_category in ("text_relating_to_medical_history", "unused_text"):
        if text_category in merged:
            merged[text_category] = {
                "inclusion_criteria": inclusion.get(text_category, {}).get("inclusion_criteria", ""),
                # The exclusion pass also writes its text under "inclusion_criteria".
                "exclusion_criteria": exclusion.get(text_category, {}).get("inclusion_criteria", "")
            }

    return merged

# Build one combined record per trial by pairing the inclusion and exclusion
# entries that sit at the same position in their lists.
merged_data = []
for i, inclusion_entry in enumerate(inclusion_data):
    exclusion_entry = exclusion_data[i]

    inclusion_text = inclusion_entry.get('Trial Text', '')
    exclusion_text = exclusion_entry.get('Trial Text', '')
    combined_text = f"Inclusion Criteria:\n\n{inclusion_text}\n\nExclusion Criteria:\n\n{exclusion_text}"

    merged_data.append({
        # Both source texts, each under its own heading.
        "Trial Text": combined_text.strip(),
        "Extracted Criteria": merge_criteria(
            inclusion_entry.get("Extracted Criteria", {}),
            exclusion_entry.get("Extracted Criteria", {}),
        ),
    })

# Write the combined records out as JSON.
with open(merged_json_path, "w") as merged_file:
    json.dump(merged_data, merged_file, indent=4)

print(f"Merged JSON file saved to {merged_json_path}")

Merged JSON file saved to merged_criteria.json
