In [None]:
import os
from time import sleep

In [None]:
from ollama_interface import chat_with_model # local ollama wrapper
from openai import OpenAI
from anthropic import Anthropic

def interact_with_agent(model_name, system_message, user_prompt, image_paths=None, stream=False, options = None):
    """Send one system + user prompt pair to a model and return its text reply.

    Routing is decided by ``model_name``:

    * ``"gpt-4o"`` is sent through the OpenAI client.
    * ``"claude-sonnet-4-20250514"`` is sent through the Anthropic client.
    * Anything else is forwarded to ``chat_with_model``.

    Args:
        model_name: Model identifier; also selects the backend (see above).
        system_message: Text used as the system prompt.
        user_prompt: Text used as the single user message.
        image_paths: Accepted for interface compatibility but currently
            unused (the pass-through to ``chat_with_model`` is disabled).
        stream: Forwarded to ``chat_with_model`` only.
        options: Forwarded to ``chat_with_model`` only.

    Returns:
        The reply text. The OpenAI and Anthropic replies are stripped of
        surrounding whitespace; the ``chat_with_model`` reply is returned as is.
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt}
    ]

    if model_name == "gpt-4o":
        # Credentials come from the environment rather than from source code.
        # The "" fallback keeps the previous behaviour when the variable is unset.
        client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages
        )

        return response.choices[0].message.content.strip()

    if model_name == "claude-sonnet-4-20250514":
        client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY", ""))

        # The system prompt is passed via the dedicated ``system`` argument here,
        # so only the user turn goes into ``messages``.
        response = client.messages.create(
            model=model_name,
            system=system_message,
            messages=[
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=8192
        )

        return response.content[0].text.strip()

    response = chat_with_model(
        model_name=model_name,
        messages=messages,
        # image_paths=image_paths or [],
        stream=stream,
        options=options,
        # NOTE(review): 30*60*60 == 108000. If chat_with_model interprets this
        # as seconds it is 30 hours; confirm whether 30 minutes was intended.
        timeout=30*60*60
    )
    # NOTE(review): assumes a non-streaming, dict-like reply; behaviour with
    # stream=True depends on chat_with_model and is not visible here.
    return response["message"]["content"]

In [None]:
def load_file_to_string(file_path, encoding="utf-8"):
    """Load the content of a text file into a string.

    The file is decoded with an explicit encoding so that non-ASCII content
    is read identically regardless of the platform's default locale encoding.

    Args:
        file_path: The path to the file.
        encoding: Text encoding used to decode the file. Defaults to UTF-8.

    Returns:
        The content of the file as a string.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

In [None]:
def fill_prompt_template(template_text, values_dict):
    """Replace ``{key}`` placeholders in a template with values from a dict.

    All placeholders are substituted in a single pass over the template, so
    text that is inserted for one key is never re-scanned for other keys. This
    matters when an inserted value (for example a publication) happens to
    contain a literal such as ``{schema}``.

    Args:
        template_text: Template containing ``{key}`` placeholders.
        values_dict: Mapping from placeholder name to the replacement string.

    Returns:
        The template with every known placeholder replaced. Placeholders whose
        name is not in ``values_dict`` are left untouched.
    """
    import re

    if not values_dict:
        return template_text

    # Keys are rendered the same way the placeholder text is built ("{" + key + "}").
    replacements = {f"{key}": value for key, value in values_dict.items()}

    # Longest names first so that a name never shadows a longer alternative.
    names = sorted(replacements, key=len, reverse=True)
    pattern = re.compile(r"\{(" + "|".join(re.escape(name) for name in names) + r")\}")

    # A callable replacement avoids backslash/group-reference interpretation
    # of the inserted text.
    return pattern.sub(lambda match: replacements[match.group(1)], template_text)

In [None]:
def agent_call(agent):
    """Run the agent described by ``agent`` and return the model's reply.

    ``agent`` is a dict providing the keys ``"model"``, ``"system_message"``
    and ``"prompt"``; other keys are ignored here.
    """
    model = agent["model"]
    system_message = agent["system_message"]
    prompt = agent["prompt"]
    return interact_with_agent(
        model_name=model,
        system_message=system_message,
        user_prompt=prompt,
    )

In [None]:
# Read the publication once; `pub` is the module-level text that master_func
# inserts into the prompts. `pub_string` is kept as an alias of the same text.
pub_path = "publication.md"
pub = pub_string = load_file_to_string(pub_path)

In [3]:
def _paring_prompts(schema_paring, lit_paring):
    """Return the (system, user) prompt templates for Agent 1 (the paring step).

    Exactly one of ``schema_paring`` / ``lit_paring`` must be truthy; any other
    combination raises ``ValueError`` because no prompt is defined for it.
    """
    if schema_paring and not lit_paring:
        system_prompt = """
            You are an expert knowledge extraction analyst.
            
            You are given:
             - A set of schema triples in the format (subject, predicate, object), which define a knowledge schema
             - A scientific publication or report in markdown format
             
            Your task is to review the schema and determine which triples are realistically populatable based on the kind of information found in the publication — not to extract the actual data values.
            
            Only include triples for which:
             - The publication clearly discusses or covers the subject matter described in the triple.
             - There is a strong indication that data could be extracted to populate the triple, even if not extracted now
            
            Do not include:
             - Triples that refer to topics or entities not mentioned at all in the publication
             - Triples for which the necessary information is missing or unlikely to be derivable from the content
            
            Output only the subset of relevant schema triples — the ones that appear potentially populatable — in CSV format
            """

        user_prompt = """
            Task: Read the publication and filter the schema triples. Output only those triples for which there is evidence that the information is likely present or derivable from the publication.
            
            Do not extract or fill in the data. Only identify which schema triples are realistically populatable based on content coverage.
            
            Output format:
            A CSV list of supported triples in this format:
            "subject,predicate,object"
            
            Schema: \"\"\"
            {schema}
            \"\"\"
            
            Publication: \"\"\"
            {publication}
            \"\"\"
            """

    elif lit_paring and not schema_paring:
        system_prompt = """
            You are an expert knowledge extraction analyst.
            
            You are given:
             - A set of schema triples in the format (subject, predicate, object), which define a knowledge schema
             - A scientific publication or report in markdown format
             
            Your task is to review the schema and extract pieces of the publication that can realistically populatable the given schema triples.
            
            Only include the publication data extract for which:
             - The publication clearly discusses or covers the subject matter described in the triple.
             - There is a strong indication that data could be extracted to populate the triple, even if not extracted now
            
            Do not:
             - Change or rephrase anything in the publication but keep the wording as text as it is.
             - include data extracts for which the necessary information is missing or unlikely to be derivable from the content
            
            Output only the subset publication extract — in markdown format
            """

        user_prompt = """
            Task: Read the publication and the schema triples. Output only those publication extracts for which there is evidence that the information is likely present or derivable and can populate the triples.
            
            Do not change or rephrase the publication wording but give exact same wording. Only identify the sub-extracts of the publication that can realistically populate triples based on content coverage.
            
            Output format:
            A markdown of the exact extracts of the publication:
            
            Schema: \"\"\"
            {schema}
            \"\"\"
            
            Publication: \"\"\"
            {publication}
            \"\"\"
            """

    else:
        raise ValueError(
            "Exactly one of schema_paring and lit_paring must be True "
            f"(got schema_paring={schema_paring!r}, lit_paring={lit_paring!r})."
        )

    return system_prompt, user_prompt


def _extraction_prompts(example_provision):
    """Return the (system, user) prompt templates for Agent 2 (triple extraction).

    When ``example_provision`` is truthy the prompts include worked examples of
    populated triples; otherwise the plain variants are returned.
    """
    if not example_provision:
        system_prompt = """
            You are an expert in precise information extraction.
            
            You are given:
             - A list of schema triples in the form: (subject, predicate, object) — each defines a relation pattern that is expected to be supported by the publication
             - A publication in markdown format
            
            Your task is to:
             - Carefully read the publication
             - Extract all factual data triples from the publication that match the patterns defined in the schema
             - Only extract triples that are explicitly stated or clearly inferable from the publication
             - Return only the final data triples extracted
            
            Guidelines:
             - Only include triples that adhere to the schema exactly
             - Do not include unsupported, hypothetical, or guessed information
             - If multiple valid triples fit the same schema (e.g., multiple values), include them all
             - Do not include input schema triples but only include the extracted data triples
             - Output must be a CSV-style list of complete data triples: "subject,predicate,object"
            """

        user_prompt = """
            Task: From the publication, extract all factual data triples that match the patterns defined in the schema.
            
            Only extract triples if they are explicitly stated or clearly inferable. Return one CSV-formatted triple per line.
            
            Format:
            "subject,predicate,object"
            
            Schema: \"\"\"
            {schema}
            \"\"\"
            
            Publication: \"\"\"
            {publication}
            \"\"\"
            """

    else:
        system_prompt = """
            You are an expert in precise information extraction.
            
            You are given:
             - A list of schema triples in the form: (subject, predicate, object) — each defines a relation pattern that is expected to be supported by the publication
             - A publication in markdown format
            
            Your task is to:
             - Carefully read the publication
             - Analyze the provided examples for populating triples with data
             - Extract all factual data triples from the publication that match the patterns defined in the schema using the examples as a guide
             - Only extract triples that are explicitly stated or clearly inferable from the publication
             - Return only the final data triples extracted
            
            Guidelines:
             - Only include triples that adhere to the schema exactly
             - Do not include unsupported, hypothetical, or guessed information
             - If multiple valid triples fit the same schema (e.g., multiple values), include them all
             - Do not include input schema triples but only include the extracted data triples
             - Output must be a CSV-style list of complete data triples: "subject,predicate,object"
            """

        user_prompt = """
            Task: From the publication, extract all factual data triples that match the patterns defined in the schema.
            
            Only extract triples if they are explicitly stated or clearly inferable. Return one CSV-formatted triple per line.

            Below are examples of what a triple in the schema could become when populated with data.

            Examples:
            1. Material,hasProperty,Property -> Polyethylene,hasProperty,Density: 0.95 g/cm³
            2. Experiment,hasLocation,Location -> Experiment001,hasLocation,MIT Laboratory 3B
            3. DifferentialScanningCalorimetry,hasCalorimetryCharacteristic,CoolingRate -> DSC_Exp12,hasCalorimetryCharacteristic,CoolingRate: 10 °C/min
            4. Rheometry,hasRheometryCharacteristic,CapillarySize -> RheoTest22,hasRheometryCharacteristic,CapillarySize: 0.5 mm
            5. TransmissionElectronMicroscopy,hasMicrscopyCharacteristic,Magnification -> TEM_SampleA,hasMicrscopyCharacteristic,Magnification: 50000x
            6. Collection,hasDOI,xsd:anyURI -> NanocompositeStudy2023,hasDOI,https://doi.org/10.1016/j.polymer.2023.125
            7. Reference,hasAuthor,Author -> Ref1234,hasAuthor,Jane Smith
            8. Material,hasComponent,Material -> EpoxyResinBlend,hasComponent,CarbonNanotubes
            9. Computation,hasSoftwareConfiguration,SoftwareConfiguration -> MDRun45,hasSoftwareConfiguration,LAMMPS v3.2
            10. Data,hasSamplePreparation,PhysicalProcess -> DataSet_AlFoam,hasSamplePreparation,HeatTreatment950C
            11. CharacterizationMethod,hasResultData,ResultData -> SEM_Char2022,hasResultData,SEM_Results_Char22.csv
            12. FiberTensileStrength,isMeasuredUnderCondition,Conditions -> FiberX_2021,isMeasuredUnderCondition,Temperature: 25 °C
            13. TensileModulus,isMeasuredUnderCondition,Conditions -> PLA_Sample7,isMeasuredUnderCondition,StrainRate: 0.01 /s
            14. Material,hasProperty,SpecificSurfaceArea -> SiO2_Powder_A,hasProperty,SSA_SiO2_A
            15. SpecificSurfaceArea,hasQuantity,Value -> SSA_SiO2_A,hasQuantity,200 m²/g
            16. Additive,hasQuantity,Quantity -> TiO2_Nanoparticles,hasQuantity,3 wt%

            Format:
            "subject,predicate,object"
            
            Schema: \"\"\"
            {schema}
            \"\"\"
            
            Publication: \"\"\"
            {publication}
            \"\"\"
            """

    return system_prompt, user_prompt


def _run_agent(name, model_name, system_message, template, publication, schema):
    """Fill ``template`` with the publication/schema text and call the agent."""
    prompt = fill_prompt_template(template, {"publication": publication, "schema": schema})
    return agent_call({
        "name": name,
        "model": model_name,
        "system_message": system_message,
        "prompt": prompt,
    })


def _write_text(path, text):
    """Write ``text`` to ``path`` as UTF-8, creating the parent directory if needed."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'w', encoding="utf-8") as f:
        f.write(text)


def master_func(model_name, schema_complete, schema_paring, lit_paring, example_provision):
    """Run the two-agent extraction pipeline for one experiment configuration.

    Agent 1 pares either the schema (``schema_paring``) or the publication
    (``lit_paring``). Agent 2 then extracts data triples, using Agent 1's
    output in place of the schema or the publication respectively, while the
    other input stays unchanged.

    Args:
        model_name: Model used for both agents; also part of every output name.
        schema_complete: If truthy, the single schema file ``triples.csv`` is
            used. Otherwise every ``*.csv`` file in the ``csv`` directory is
            processed as a separate schema module.
        schema_paring: Agent 1 filters the schema triples.
        lit_paring: Agent 1 extracts the relevant publication passages.
        example_provision: Agent 2's prompt includes worked examples.

    Outputs (all names start with
    ``{model_name}-{schema_complete}-{schema_paring}-{lit_paring}-{example_provision}``):
        * complete schema: ``<prefix>-schema-lit-extraction.txt`` and
          ``<prefix>-knowledge-extraction.csv`` in the working directory.
        * schema modules: one ``<module>.txt`` per module inside the directory
          ``<prefix>-schema-lit-extraction`` and one ``<module>.csv`` per module
          inside the directory ``<prefix>-knowledge-extraction``.

    Raises:
        ValueError: If ``schema_paring`` and ``lit_paring`` are not mutually
            exclusive (exactly one must be truthy).
    """
    # Resolve the prompts first: this also validates the flag combination
    # before any file is read or any model is called.
    agent1_sys, agent1_usr = _paring_prompts(schema_paring, lit_paring)
    agent2_sys, agent2_usr = _extraction_prompts(example_provision)

    run_tag = f"{model_name}-{str(schema_complete)}-{str(schema_paring)}-{str(lit_paring)}-{str(example_provision)}"

    def stage2_inputs(pared_text, original_schema):
        """Return (publication, schema) for Agent 2 given Agent 1's output."""
        if schema_paring:
            # Pared schema replaces the schema; the publication stays the same.
            return pub, pared_text
        # Pared publication replaces the publication; the schema stays the same.
        return pared_text, original_schema

    if schema_complete:
        schema = load_file_to_string("triples.csv")

        response = _run_agent(
            "schema_extraction_agent", model_name, agent1_sys, agent1_usr, pub, schema
        )
        _write_text(f"{run_tag}-schema-lit-extraction.txt", response)

        publication_2, schema_2 = stage2_inputs(response, schema)
        response_2 = _run_agent(
            "knowledge_extraction_agent", model_name, agent2_sys, agent2_usr,
            publication_2, schema_2
        )
        _write_text(f"{run_tag}-knowledge-extraction.csv", response_2)
        return

    # Schema split into modules: one CSV file per module in the "csv" directory.
    module_files = sorted(
        filename for filename in os.listdir("csv") if filename.endswith(".csv")
    )
    stage1_dir = f"{run_tag}-schema-lit-extraction"
    stage2_dir = f"{run_tag}-knowledge-extraction"

    # Stage 1: pare the schema module or the publication for every module.
    schemas = {}
    pared = {}
    for filename in module_files:
        schema = load_file_to_string(os.path.join("csv", filename))
        response = _run_agent(
            "schema_extraction_agent", model_name, agent1_sys, agent1_usr, pub, schema
        )

        base, _ = os.path.splitext(filename)
        _write_text(os.path.join(stage1_dir, base + ".txt"), response)

        schemas[filename] = schema
        pared[filename] = response

    # Stage 2: extract data triples per module from the stage-1 results.
    for filename in module_files:
        publication_2, schema_2 = stage2_inputs(pared[filename], schemas[filename])
        response_2 = _run_agent(
            "knowledge_extraction_agent", model_name, agent2_sys, agent2_usr,
            publication_2, schema_2
        )

        # agent_call returns the reply text itself, so it is printed and
        # written directly.
        print(response_2)
        _write_text(os.path.join(stage2_dir, filename), response_2)

In [None]:
# Model tags to evaluate. None of them matches the hosted-model names checked
# in interact_with_agent, so each one is routed to chat_with_model.
models = [
    "phi4:latest-64k",
    "llama3.2:latest-64k",
    "phi4-mini:latest-64k",
    "granite3.3:latest-64k",
    "gemma3:27b-64k",
    "mistral-small3.2:latest-64k",
    "deepseek-r1:latest-64k",
]

In [None]:
# Combinations (Images = No)

# 1. Schema = Complete; Schema Paring = Yes; Literature Paring = No; Examples = No
# 2. Schema = Complete; Schema Paring = No; Literature Paring = Yes; Examples = No
# 3. Schema = Complete; Schema Paring = Yes; Literature Paring = No; Examples = Yes
# 4. Schema = Complete; Schema Paring = No; Literature Paring = Yes; Examples = Yes

# 5. Schema = Modules; Schema Paring = Yes; Literature Paring = No; Examples = No
# 6. Schema = Modules; Schema Paring = No; Literature Paring = Yes; Examples = No
# 7. Schema = Modules; Schema Paring = Yes; Literature Paring = No; Examples = Yes
# 8. Schema = Modules; Schema Paring = No; Literature Paring = Yes; Examples = Yes

# Run all eight configurations for every model. The call order is: complete
# schema first, then schema modules; within each, runs without examples come
# before runs with examples; and schema paring comes before literature paring.
for model in models:
    for schema_complete in (True, False):
        for example_provision in (False, True):
            for schema_paring in (True, False):
                master_func(
                    model_name=model,
                    schema_complete=schema_complete,
                    schema_paring=schema_paring,
                    # Paring modes are mutually exclusive: exactly one is on.
                    lit_paring=not schema_paring,
                    # master_func's parameter is `example_provision`; passing
                    # `Examples=` raises TypeError.
                    example_provision=example_provision,
                )