In [2]:
# WARNING: This API is under development and may undergo changes in future releases.
# Backwards compatibility is not guaranteed at this time.

from pathlib import Path

from datashaper import NoopVerbCallbacks
from pydantic import PositiveInt, validate_call

from graphrag.config.models.graph_rag_config import GraphRagConfig
from graphrag.index.llm import load_llm
from graphrag.index.progress import PrintProgressReporter
from graphrag.prompt_tune.generator import (
    MAX_TOKEN_COUNT,
    create_community_summarization_prompt,
    create_entity_extraction_prompt,
    create_entity_summarization_prompt,
    detect_language,
    generate_community_report_rating,
    generate_community_reporter_role,
    generate_domain,
    generate_entity_relationship_examples,
    generate_entity_types,
    generate_persona,
)
from graphrag.prompt_tune.generator.community_report_summarization import COMMUNITY_SUMMARIZATION_FILENAME
from graphrag.prompt_tune.generator.entity_extraction_prompt import ENTITY_EXTRACTION_FILENAME
from graphrag.prompt_tune.generator.entity_summarization_prompt import ENTITY_SUMMARIZATION_FILENAME
from graphrag.prompt_tune.loader import (
    MIN_CHUNK_SIZE,
    load_docs_in_chunks,
    read_config_parameters,
)
from graphrag.prompt_tune.types import DocSelectionType

In [3]:
# --- Project locations ---------------------------------------------------
root = "./"
config_file = "./settings.yaml"
# Directory the generated prompt files are written to in the final cell.
output = "./prompts"

# Progress reporter shared by every step below, and the settings read from
# `config_file` relative to `root`.
reporter = PrintProgressReporter("")
config = read_config_parameters(root, reporter, config_file)

# --- Document sampling (all forwarded to load_docs_in_chunks) --------------
selection_method = DocSelectionType.AUTO
chunk_size = 256
n_subset_max = 1000
# The PositiveInt annotations are documentation only; nothing validates
# these plain assignments at runtime.
# NOTE(review): the recorded run loaded 20 chunks with limit=15 and k=20, so
# `k` (not `limit`) appears to set the sample size under AUTO selection --
# confirm against load_docs_in_chunks.
limit: PositiveInt = 15
k: PositiveInt = 20

# --- Prompt generation -------------------------------------------------------
# Leave `domain` or `language` empty to have the LLM infer them from the
# sampled documents in the cells below.
domain = "Materials Science and Computational Chemistry"
language = "English"
# Passed as max_token_count when building the entity extraction prompt.
max_tokens = 2048
skip_entity_types = False
min_examples_required = 10




INFO: Reading settings from settings.yaml


In [4]:
# Retrieve documents
doc_list = await load_docs_in_chunks(
    root=root,
    config=config,
    limit=limit,
    select_method=selection_method,
    reporter=reporter,
    chunk_size=chunk_size,
    n_subset_max=n_subset_max,
    k=k,
)
print(f"Loaded {len(doc_list)} documents")

# Create LLM from config
llm = load_llm(
    "prompt_tuning",
    config.llm.type,
    NoopVerbCallbacks(),
    None,
    config.llm.model_dump(),
)

if not domain:
    reporter.info("Generating domain...")
    domain = await generate_domain(llm, doc_list)
    reporter.info(f"Generated domain: {domain}")

print(f"Domain: {domain}")


Loading Input (InputFileType.csv).Loaded 20 documents
Domain: Materials Science and Computational Chemistry


In [5]:
# Preview only the first few sampled chunks; displaying the whole list
# floods the notebook output with raw document text.
doc_list[:3]

['## b) Multi-fidelity co-Kriging\n\n\n\n',
 '## A. Monomer/Oligomer scale: The scales of chemical specifity\n\n\n\nThe basic building blocks are the monomers. They can have a simple chemical structure, as in the case of many commodity polymers such as polystyrene, or a rather complicated structure, as in the case of biopolymers such as RNA, DNA, or proteins. The structure of the monomers on the monomer scale determines local properties such as the charges and the polarization, the solubility in a solvent [101], the existence and structure of a hydra- tion shell [102], the local affinity to surfaces [103], or - in studies of polymer reactions, the monomer re- activity [104]. In general, these properties are also influenced by the larger scale structure of polymer systems. For example, the effective monomer re- activity depends on the accessibility of the reactive sites, which is determined not only by the local elec- tronic and steric monomer structure, but also by the polymer conforma

In [6]:
if not language:
    reporter.info("Detecting language...")
    language = await detect_language(llm, doc_list)

reporter.info("Generating persona...")
persona = await generate_persona(llm, domain)
persona


INFO: Generating persona...


'You are an expert in Materials Science and Computational Chemistry. You are skilled at analyzing complex data sets, mapping out relationships within scientific communities, and understanding the intricate structures of research networks. You are adept at helping people identify key influencers, collaboration patterns, and the overall structure of the community of interest in these specialized fields.'

In [7]:
community_report_ranking = await generate_community_report_rating(
        llm, domain=domain, persona=persona, docs=doc_list
    )

community_report_ranking

'A float score between 0-10 that represents the relevance of the text to materials science and computational chemistry, focusing on the significance of the information for understanding chemical structures, properties, and predictive modeling, with 1 being trivial or irrelevant and 10 being highly significant, insightful, and impactful for advancing knowledge and research in the field.'

In [8]:
entity_types = None

if not skip_entity_types:
    reporter.info("Generating entity types...")
    entity_types = await generate_entity_types(
        llm,
        domain=domain,
        persona=persona,
        docs=doc_list,
        json_mode=config.llm.model_supports_json or False,
    )

    print(f"Entity types: {len(entity_types)}")
    print(','.join(entity_types))


INFO: Generating entity types...
Entity types: 51
chemical_structure,polymer,monomer,property,method,model,simulation,descriptor,interaction,dataset,algorithm,technique,tool,theory,representation,prediction,trend,confidence_interval,residual_plot,conductivity,interaction_parameter,potential,field_model,graph,hypergraph,node,edge,hyperedge,particle,force_field,energy_landscape,correlation_function,architecture,neural_network,fine_tuning,modality,Monte_Carlo,lattice,scission_energy,stereoregularity,tacticity,transition,functional_group,production_rule,generative_model,constraint,support_vector_machine,recurrent_neural_network,generative_adversarial_network,autoencoder,reinforcement_learning


In [9]:
# Show the entity types that will be fed into the extraction prompt.
# `entity_types` is None when skip_entity_types is set, and may be a plain
# string rather than a list (NOTE(review): when JSON mode is off -- confirm
# against the installed graphrag version); joining either directly would
# raise or mangle the text.
if isinstance(entity_types, str):
    print(entity_types)
else:
    print(','.join(entity_types or []))

chemical_structure,polymer,monomer,property,method,model,simulation,descriptor,interaction,dataset,algorithm,technique,tool,theory,representation,prediction,trend,confidence_interval,residual_plot,conductivity,interaction_parameter,potential,field_model,graph,hypergraph,node,edge,hyperedge,particle,force_field,energy_landscape,correlation_function,architecture,neural_network,fine_tuning,modality,Monte_Carlo,lattice,scission_energy,stereoregularity,tacticity,transition,functional_group,production_rule,generative_model,constraint,support_vector_machine,recurrent_neural_network,generative_adversarial_network,autoencoder,reinforcement_learning


In [10]:
reporter.info("Generating entity relationship examples...")
examples = await generate_entity_relationship_examples(
    llm,
    persona=persona,
    entity_types=entity_types,
    docs=doc_list,
    language=language,
    json_mode=False,  # config.llm.model_supports_json should be used, but this prompts are used in non-json by the index engine
)

print(f"Examples: {len(examples)}")
examples[0]


INFO: Generating entity relationship examples...


Examples: 5


'("entity"{tuple_delimiter}MULTI-FIDELITY CO-KRIGING{tuple_delimiter}METHOD{tuple_delimiter}Multi-fidelity co-Kriging is a statistical method used to combine data from multiple sources of varying fidelity to improve prediction accuracy)\n{record_delimiter}\n{completion_delimiter}'

In [11]:
# Assemble the entity extraction prompt from the entity types, the sampled
# chunks and the generated examples, using the token and example limits set
# in the config cell.
reporter.info("Generating entity extraction prompt...")
entity_extraction_prompt = create_entity_extraction_prompt(
    docs=doc_list,
    entity_types=entity_types,
    examples=examples,
    min_examples_required=min_examples_required,
    language=language,
    # config.llm.model_supports_json should be used, but these prompts are
    # used in non-JSON mode by the index engine.
    json_mode=False,
    max_token_count=max_tokens,
    encoding_model=config.encoding_model,
)

entity_extraction_prompt


INFO: Generating entity extraction prompt...


'\n-Goal-\nGiven a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.\n\n-Steps-\n1. Identify all entities. For each identified entity, extract the following information:\n- entity_name: Name of the entity, capitalized\n- entity_type: One of the following types: [chemical_structure, polymer, monomer, property, method, model, simulation, descriptor, interaction, dataset, algorithm, technique, tool, theory, representation, prediction, trend, confidence_interval, residual_plot, conductivity, interaction_parameter, potential, field_model, graph, hypergraph, node, edge, hyperedge, particle, force_field, energy_landscape, correlation_function, architecture, neural_network, fine_tuning, modality, Monte_Carlo, lattice, scission_energy, stereoregularity, tacticity, transition, functional_group, production_rule, generative_model, constraint, support_vector_

In [12]:
# Build the entity summarization prompt for the chosen persona and language.
reporter.info("Generating entity summarization prompt...")
entity_summarization_prompt = create_entity_summarization_prompt(language=language, persona=persona)

entity_summarization_prompt


INFO: Generating entity summarization prompt...


"\nYou are an expert in Materials Science and Computational Chemistry. You are skilled at analyzing complex data sets, mapping out relationships within scientific communities, and understanding the intricate structures of research networks. You are adept at helping people identify key influencers, collaboration patterns, and the overall structure of the community of interest in these specialized fields.\nUsing your expertise, you're asked to generate a comprehensive summary of the data provided below.\nGiven one or two entities, and a list of descriptions, all related to the same entity or group of entities.\nPlease concatenate all of these into a single, concise description in English. Make sure to include information collected from all the descriptions.\nIf the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary.\nMake sure it is written in third person, and include the entity names so we have the full context.\n\nEnrich i

In [13]:
reporter.info("Generating community reporter role...")
community_reporter_role = await generate_community_reporter_role(
    llm, domain=domain, persona=persona, docs=doc_list
)

community_reporter_role


INFO: Generating community reporter role...


'A community analyst that is examining the field of Materials Science and Computational Chemistry, given a list of entities that belong to the community as well as their relationships and optional associated claims. The analysis will be used to inform decision-makers about key influencers, collaboration patterns, and the overall structure of the community, as well as significant developments and their potential impact.'

In [14]:
# Combine the persona, reporter role and rating description generated above
# into the community summarization prompt.
reporter.info("Generating community summarization prompt...")
community_summarization_prompt = create_community_summarization_prompt(
    language=language,
    persona=persona,
    report_rating_description=community_report_ranking,
    role=community_reporter_role,
)

community_summarization_prompt


INFO: Generating community summarization prompt...


'\nYou are an expert in Materials Science and Computational Chemistry. You are skilled at analyzing complex data sets, mapping out relationships within scientific communities, and understanding the intricate structures of research networks. You are adept at helping people identify key influencers, collaboration patterns, and the overall structure of the community of interest in these specialized fields.\n\n# Goal\nWrite a comprehensive assessment report of a community taking on the role of a A community analyst that is examining the field of Materials Science and Computational Chemistry, given a list of entities that belong to the community as well as their relationships and optional associated claims. The analysis will be used to inform decision-makers about key influencers, collaboration patterns, and the overall structure of the community, as well as significant developments and their potential impact.. The content of this report includes an overview of the community\'s key entities

In [15]:
# Persist the three generated prompts, or skip when no output directory is set.
# The guard tests the raw `output` setting: a Path object is always truthy
# (an empty string even becomes Path('.')), so checking the Path never skips.
output_path = Path(output) if output else None
if output_path is not None:
    reporter.info(f"Writing prompts to {output_path}")
    output_path.mkdir(parents=True, exist_ok=True)
    entity_extraction_prompt_path = output_path / ENTITY_EXTRACTION_FILENAME
    entity_summarization_prompt_path = output_path / ENTITY_SUMMARIZATION_FILENAME
    community_summarization_prompt_path = (
        output_path / COMMUNITY_SUMMARIZATION_FILENAME
    )
    # Write UTF-8 bytes rather than text so the prompts land on disk exactly
    # as generated, with no platform newline translation.
    entity_extraction_prompt_path.write_bytes(
        entity_extraction_prompt.encode(encoding="utf-8", errors="strict")
    )
    entity_summarization_prompt_path.write_bytes(
        entity_summarization_prompt.encode(encoding="utf-8", errors="strict")
    )
    community_summarization_prompt_path.write_bytes(
        community_summarization_prompt.encode(encoding="utf-8", errors="strict")
    )


INFO: Writing prompts to prompts
