In [1]:
import sys
import os

# Add the parent directory of 'wiki' to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from wiki.api.lib.pipelines.hybrid_pipeline import hybrid_pipeline   # same instance of the pipeline will be loaded in all imports
from wiki.api.lib.pipelines.graph_pipeline_v2 import graph_pipeline
from wiki.api.lib.models.p1_qa import P1QA
from wiki.api.lib.models.p2_qa import P2QA
from wiki.api.lib.models.hierarchy_path import HierarchyPathData

  from .autonotebook import tqdm as notebook_tqdm


### Question-Answer: Invoke pipeline, capture response, return answer with references

In [2]:
import json


def question_answer(question: str) -> dict:
    """Answer ``question`` in up to two phases and return the answer with its context.

    Phase 1 runs ``hybrid_pipeline`` and parses the first ``p1_qa_generator``
    reply as a ``P1QA`` object. If that reply does not ask for more context,
    the phase-1 result is returned. Otherwise phase 2 runs ``graph_pipeline``,
    seeded with the documents produced by ``reciprocal_rank_fusion_joiner`` in
    phase 1, and the first ``p2_qa_generator`` reply becomes the answer.

    Args:
        question: The question text; used as the query for every component.

    Returns:
        A dict with three keys:
        ``"answer"``: ``{"text": <first generator reply>, "phase": 1 or 2}``.
        ``"context_docs"``: the ``reciprocal_rank_fusion_joiner`` documents
        (phase 1) or the ``wiki_context_creator`` documents (phase 2).
        ``"metadata"``: the full pipeline outputs under ``"phase-1"`` and,
        when phase 2 ran, ``"phase-2"``.
    """
    # ---- Phase 1: hybrid retrieval + first answer attempt ----
    phase1_outputs = hybrid_pipeline.run(
        data={
            "elasticsearch_retriever": {"query": question},
            "text_embedder": {"text": question},
            "p1_qa_prompt_builder": {
                "p1_qa_schema": P1QA.model_json_schema(),
                "query": question,
            },
        },
        include_outputs_from={
            "text_embedder",
            "weaviate_retriever",
            "elasticsearch_retriever",
            "reciprocal_rank_fusion_joiner",
            "p1_qa_prompt_builder",
            "p1_qa_generator",
        },
    )

    p1_reply_text = phase1_outputs["p1_qa_generator"]["replies"][0]
    p1_reply = P1QA(**json.loads(p1_reply_text))

    # Guard clause: the phase-1 answer is complete, so stop here.
    if not p1_reply.need_more_context:
        return {
            "answer": {"text": p1_reply_text, "phase": 1},
            "context_docs": phase1_outputs["reciprocal_rank_fusion_joiner"]["documents"],
            "metadata": {"phase-1": phase1_outputs},
        }

    # ---- Phase 2: the answer is incomplete, fetch more context via the graph pipeline ----
    grounding_docs = phase1_outputs["reciprocal_rank_fusion_joiner"]["documents"]
    phase2_outputs = graph_pipeline.run(
        data={
            "hierarchy_prompt_builder": {
                "query": question,
                "hierarchy_path_schema": HierarchyPathData.model_json_schema(),
            },
            "wiki_hierarchy_builder": {"documents": grounding_docs},
            "p2_qa_prompt_builder": {
                "p2_qa_schema": P2QA.model_json_schema(),
                "query": question,
            },
        },
        include_outputs_from={
            "wiki_hierarchy_builder",
            "hierarchy_prompt_builder",
            "hierarchy_generator",
            "wiki_context_creator",
            "p2_qa_prompt_builder",
            "p2_qa_generator",
        },
    )

    return {
        "answer": {"text": phase2_outputs["p2_qa_generator"]["replies"][0], "phase": 2},
        "context_docs": phase2_outputs["wiki_context_creator"]["documents"],
        "metadata": {"phase-1": phase1_outputs, "phase-2": phase2_outputs},
    }

In [3]:
def build_metadata_for_doc(doc):
    """Extract the title/heading entries of a document's metadata.

    Args:
        doc: An object exposing a ``meta`` mapping.

    Returns:
        A dict containing whichever of ``title``, ``h2``, ``h3`` and ``h4``
        are present in ``doc.meta`` with a non-``None`` value, in that order.
    """
    heading_keys = ('title', 'h2', 'h3', 'h4')
    return {
        key: doc.meta[key]
        for key in heading_keys
        if key in doc.meta and doc.meta[key] is not None
    }

In [4]:
def build_answer_with_reference(answer: dict, context_docs: list) -> dict:
    """Turn a raw pipeline answer into the answer text plus its cited references.

    Args:
        answer: Dict with ``"text"`` (the generator reply as a JSON string) and
            ``"phase"`` (``1`` selects the ``P1QA`` model, anything else ``P2QA``).
        context_docs: Sequence of context documents, indexed by position. The
            ``document_ids`` in the parsed answer refer to it using 1-based ids.

    Returns:
        ``{"answer": <answer text>, "references": [<metadata dict>, ...]}`` with
        one metadata dict (see ``build_metadata_for_doc``) per cited document id.

    Raises:
        IndexError: If a cited document id is outside ``1..len(context_docs)``.
    """
    phase = answer["phase"]
    answer_json = json.loads(answer["text"])

    # Phase 1 replies follow the P1QA schema; every other phase uses P2QA.
    answer_model = P1QA if phase == 1 else P2QA
    answer_obj = answer_model(**answer_json)

    doc_count = len(context_docs)
    references = []
    for doc_id in answer_obj.document_ids:
        # Document ids are 1-indexed. Reject ids below 1 explicitly: after the
        # -1 shift they would become negative indexes, which Python resolves
        # from the end of the list, silently citing the wrong document.
        if not 1 <= doc_id <= doc_count:
            raise IndexError(
                f"document id {doc_id} is out of range for {doc_count} context documents"
            )
        references.append(build_metadata_for_doc(context_docs[doc_id - 1]))
    return {"answer": answer_obj.answer, "references": references}

In [9]:
# Essay-style question sent through the two-phase question_answer flow.
response_dinosaur_extinction = question_answer(
    "Write a short essay on the extinction of dinosaurs. "
    "Include all probable causes, the main hyppthesis as well as alternative theories if any."
)

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()


In [10]:
# Show which phase produced the answer: question_answer sets 1 when the hybrid
# pipeline's answer was returned, 2 when the graph pipeline had to be run.
response_dinosaur_extinction["answer"]["phase"]

2

In [11]:
# Resolve the cited document ids into reference metadata, then display the result.
api_response_dinosaur_extinction = build_answer_with_reference(
    answer=response_dinosaur_extinction["answer"],
    context_docs=response_dinosaur_extinction["context_docs"],
)

api_response_dinosaur_extinction

{'answer': "The extinction of dinosaurs, occurring approximately 66 million years ago at the Cretaceous–Paleogene (K-Pg) boundary, has been attributed primarily to the Chicxulub impact event. This hypothesis, famously posited by Luis Alvarez and his team in 1980, suggests that a bolide, estimated to be 5 to 15 kilometers wide, struck the Yucatán Peninsula, leading to catastrophic environmental changes that would have extinguished most dinosaur species. Evidence supporting this theory includes a global layer of sediment enriched with iridium, shocked quartz, and other impact-related materials found in rock strata globally, alongside the discovery of the Chicxulub crater itself, which matches the calculated size and age of the impact. The aftermath of this impact likely involved immediate catastrophic events such as wildfires and tsunamis, followed by longer-term climatic disturbances characterized by an 'impact winter' that blocked sunlight and halted photosynthesis, resulting in collap

***SUCCESS!***

***Save the complete pipeline response to a JSON file for reference***

In [21]:
import json
from haystack import Document  # Import the Document class from Haystack

def custom_serializer(obj):
    """``default`` hook for ``json.dumps`` that can encode Haystack documents.

    Args:
        obj: A value the JSON encoder could not serialize on its own.

    Returns:
        ``obj.to_dict()`` when ``obj`` is a ``Document``.

    Raises:
        TypeError: For any other type, raised by the stock ``JSONEncoder.default``.
    """
    if not isinstance(obj, Document):
        # Defer to the stock encoder, which rejects types it does not support.
        return json.JSONEncoder().default(obj)
    return obj.to_dict()

# Serialize first: if some object cannot be encoded, the error surfaces before
# the output file is opened, so no empty or truncated file is left behind.
serialized_response = json.dumps(response_dinosaur_extinction, default=custom_serializer)
with open('phase_2_response_dinosaur_extinction_5_multipage_rag_v2.json', 'w', encoding='utf-8') as file:
    file.write(serialized_response)
    

In [16]:
# Ask the question, resolve the cited document ids into reference metadata,
# and display the API-style response.
response_ornithosceldia = question_answer("What is Ornithoscelida?")

api_response_ornithosceldia = build_answer_with_reference(
    answer=response_ornithosceldia["answer"],
    context_docs=response_ornithosceldia["context_docs"],
)

api_response_ornithosceldia

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()


{'answer': "Ornithoscelida is a proposed clade that includes various major groupings of dinosaurs, originally proposed by Thomas Henry Huxley and later revived in 2017 after a new cladistic analysis. It refers to a clade characterized by 'bird hipped' features, including specific pelvic bone configurations.",
 'references': [{'title': 'Ornithoscelida'},
  {'title': 'Ornithoscelida', 'h2': 'Details'}]}

In [22]:
import json
from haystack import Document  # Import the Document class from Haystack

def custom_serializer(obj):
    """``default`` hook for ``json.dumps`` that can encode Haystack documents.

    Args:
        obj: A value the JSON encoder could not serialize on its own.

    Returns:
        ``obj.to_dict()`` when ``obj`` is a ``Document``.

    Raises:
        TypeError: For any other type, raised by the stock ``JSONEncoder.default``.
    """
    if not isinstance(obj, Document):
        # Defer to the stock encoder, which rejects types it does not support.
        return json.JSONEncoder().default(obj)
    return obj.to_dict()

# Serialize first: if some object cannot be encoded, the error surfaces before
# the output file is opened, so no empty or truncated file is left behind.
serialized_response = json.dumps(response_ornithosceldia, default=custom_serializer)
with open('phase_1_response_ornithosceldia_5_multipage_rag_v2.json', 'w', encoding='utf-8') as file:
    file.write(serialized_response)

In [19]:
# Ask the question, resolve the cited document ids into reference metadata,
# and display the API-style response.
response_chixulub = question_answer(
    "Write a short essay on the following topic: "
    "'Chixulub crater and the end of the dinosaurs'"
)

api_response_chixulub = build_answer_with_reference(
    answer=response_chixulub["answer"],
    context_docs=response_chixulub["context_docs"],
)

api_response_chixulub

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()


{'answer': "The Chicxulub crater, formed over 66 million years ago by a ten-kilometer-diameter asteroid impact, is widely believed to have played a pivotal role in the mass extinction event that led to the demise of the dinosaurs at the K–Pg boundary. This crater, located beneath the Yucatán Peninsula in Mexico, is the second largest impact structure on Earth and is significant for scientific research due to its intact peak ring. The event triggered a series of catastrophic changes to the climate and environment, ultimately contributing to the extinction of approximately 75% of Earth's species, including dinosaurs. While some dissenting experts highlight the role of volcanism from the Deccan Traps, the consensus remains that the Chicxulub impact was a critical factor in this extinction event, marking the end of the age of dinosaurs and the beginning of a new era in Earth's biological history.",
 'references': [{'title': 'Chicxulub crater', 'h2': 'Discovery'},
  {'title': 'Chicxulub cra

In [23]:
import json
from haystack import Document  # Import the Document class from Haystack

def custom_serializer(obj):
    """``default`` hook for ``json.dumps`` that can encode Haystack documents.

    Args:
        obj: A value the JSON encoder could not serialize on its own.

    Returns:
        ``obj.to_dict()`` when ``obj`` is a ``Document``.

    Raises:
        TypeError: For any other type, raised by the stock ``JSONEncoder.default``.
    """
    if not isinstance(obj, Document):
        # Defer to the stock encoder, which rejects types it does not support.
        return json.JSONEncoder().default(obj)
    return obj.to_dict()

# Serialize first: if some object cannot be encoded, the error surfaces before
# the output file is opened, so no empty or truncated file is left behind.
serialized_response = json.dumps(response_chixulub, default=custom_serializer)
with open('phase_1_response_chixulub_5_multipage_rag_v2.json', 'w', encoding='utf-8') as file:
    file.write(serialized_response)

In [20]:
# Ask the question, resolve the cited document ids into reference metadata,
# and display the API-style response.
response_paleobiology = question_answer(
    "Write a short essay on the paleobiology of dinosaurs."
)

api_response_paleobiology = build_answer_with_reference(
    answer=response_paleobiology["answer"],
    context_docs=response_paleobiology["context_docs"],
)

api_response_paleobiology

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())
  body["sentAt"] = datetime.utcnow().replace(tzinfo=tzutc()).isoformat()


{'answer': 'Paleobiology encompasses the study of the biology of ancient organisms through fossil and non-fossil records. In the case of dinosaurs, evidence is provided by fossilized remains, trackways, and even soft tissues. Various fields contribute to this study, including biology, chemistry, and paleontology. Dinosaur sizes varied significantly across different periods, with predatory theropods generally weighing between 100 to 1,000 kg. Some of the largest dinosaurs, such as the sauropods, were substantially larger than any modern terrestrial animals, with proposed benefits of their size including dietary efficiency and protection from predation. Behavior may have included gregariousness, as suggested by herd tracks and mass death sites. Evidence indicates some dinosaurs hunted collaboratively, while others displayed complex social behaviors. Communication likely involved visual signals and possibly non-vocal sounds, as their closest living relatives, birds and crocodilians, utili

In [24]:
import json
from haystack import Document  # Import the Document class from Haystack

def custom_serializer(obj):
    """``default`` hook for ``json.dumps`` that can encode Haystack documents.

    Args:
        obj: A value the JSON encoder could not serialize on its own.

    Returns:
        ``obj.to_dict()`` when ``obj`` is a ``Document``.

    Raises:
        TypeError: For any other type, raised by the stock ``JSONEncoder.default``.
    """
    if not isinstance(obj, Document):
        # Defer to the stock encoder, which rejects types it does not support.
        return json.JSONEncoder().default(obj)
    return obj.to_dict()

# Serialize first: if some object cannot be encoded, the error surfaces before
# the output file is opened, so no empty or truncated file is left behind.
serialized_response = json.dumps(response_paleobiology, default=custom_serializer)
with open('phase_2_response_paleobiology_5_multipage_rag_v2.json', 'w', encoding='utf-8') as file:
    file.write(serialized_response)