In [113]:
from pathlib import Path
from typing import List
from langchain.chains.openai_functions import create_structured_output_chain

# from langchain.chat_models import ChatOpenAI
from langchain_community.chat_models import ChatOpenAI
# !pip install -U langchain-openai
from langchain_openai import ChatOpenAI

# from langchain.document_loaders import WikipediaLoader, PyPDFLoader, TextLoader
from langchain_community.document_loaders import WikipediaLoader, PyPDFLoader, TextLoader
from langchain.docstore.document import Document
# from langchain.embeddings.openai import OpenAIEmbeddings
# from langchain.embeddings.azure_openai import OpenAIEmbeddings
from langchain_community.embeddings import OpenAIEmbeddings

# from langchain.graphs import Neo4jGraph
from langchain_community.graphs import Neo4jGraph
from langchain.prompts import ChatPromptTemplate, PromptTemplate, HumanMessagePromptTemplate

from langchain.pydantic_v1 import BaseModel, Field
from langchain.text_splitter import TokenTextSplitter, CharacterTextSplitter

from langchain.callbacks import get_openai_callback
from langchain.schema.output_parser import StrOutputParser


from neo4j.exceptions import ClientError
import os
from langchain.schema import HumanMessage, ChatMessage, AIMessage

import dotenv
dotenv.load_dotenv()

graph = Neo4jGraph()


In [17]:

all_data = WikipediaLoader(query="Removal_of_Sam_Altman_from_OpenAI").load()

# Embeddings & LLM models
embeddings = OpenAIEmbeddings()
embedding_dimension = 1536
llm = ChatOpenAI(
    temperature=0,
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    openai_api_version=os.getenv("OPENAI_API_VERSION"),
    openai_api_base=os.getenv("OPENAI_API_BASE"),
    deployment_name="gpt-4",
)

# Process All Data
parent_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
child_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=24)

# Ingest Parent-Child node pairs
for document in all_data:
    parent_documents = parent_splitter.split_documents([document])
    for i, parent in enumerate(parent_documents):
        child_documents = child_splitter.split_documents([parent])
        params = {
            "parent_text": parent.page_content,
            "parent_id": i,
            "parent_embedding": embeddings.embed_query(parent.page_content),
            "children": [
                {
                    "text": c.page_content,
                    "id": f"{i}-{ic}",
                    "embedding": embeddings.embed_query(c.page_content),
                }
                for ic, c in enumerate(child_documents)
            ],
        }
        print(f"Processing {len(params['children'])} children")
        # Ingest data
        graph.query(
            """
        MERGE (p:Parent {id: $parent_id})
        SET p.text = $parent_text
        WITH p
        CALL db.create.setVectorProperty(p, 'embedding', $parent_embedding)
        YIELD node
        WITH p 
        UNWIND $children AS child
        MERGE (c:Child {id: child.id})
        SET c.text = child.text
        MERGE (c)<-[:HAS_CHILD]-(p)
        WITH c, child
        CALL db.create.setVectorProperty(c, 'embedding', child.embedding)
        YIELD node
        RETURN count(*)
        """,
            params,
        )
        # Create vector index for child
        try:
            graph.query(
                "CALL db.index.vector.createNodeIndex('parent_document', "
                "'Child', 'embedding', $dimension, 'cosine')",
                {"dimension": embedding_dimension},
            )
        except ClientError:  # already exists
            pass
        # Create vector index for parents
        try:
            graph.query(
                "CALL db.index.vector.createNodeIndex('typical_rag', "
                "'Parent', 'embedding', $dimension, 'cosine')",
                {"dimension": embedding_dimension},
            )
        except ClientError:  # already exists
            pass
    # Ingest hypothethical questions
print('Done;')

Processing 7 children
Processing 4 children
Processing 7 children
Processing 6 children
Processing 7 children
Processing 5 children
Processing 7 children
Processing 2 children
Processing 7 children
Processing 5 children
Processing 7 children
Processing 5 children
Processing 3 children
Processing 7 children
Processing 4 children
Processing 7 children
Processing 3 children
Processing 7 children
Processing 5 children
Processing 7 children
Processing 4 children
Processing 7 children
Processing 5 children
Processing 7 children
Processing 5 children
Processing 7 children
Processing 5 children
Processing 7 children
Processing 4 children
Processing 7 children
Processing 4 children
Processing 7 children
Processing 4 children
Processing 7 children
Processing 7 children
Processing 1 children
Processing 7 children
Processing 5 children
Done;


In [73]:
from langchain.output_parsers import ResponseSchema, StructuredOutputParser, PydanticOutputParser, NumberedListOutputParser
from langchain.prompts import PromptTemplate

# Define your desired data structure.
# class Questions(BaseModel):
#     """Generating hypothetical questions about text."""

#     questions: List[str] = Field(
#         ...,
#         description=(
#             "Generated hypothetical questions based on " "the information from the text"
#         ),
#     )



# Set up a parser + inject instructions into the prompt template.
# pydantic_parser = PydanticOutputParser(pydantic_object=Questions)


In [114]:
## This is a barebones manner, without 'chat' contexts
# prompt = """You are generating hypothetical questions based on the information 
#             found in the text. Make sure to provide full context in the generated 
#             questions. 

#             Use the given format to generate hypothetical questions from the "
#                 "following input: {input}"""
# questions_prompt = ChatPromptTemplate.from_template(prompt) 

## This is preferred for chat contexts
questions_prompt = ChatPromptTemplate.from_messages(
    [
        AIMessage(content="You are generating hypothetical questions based on the information "
                "found in the text. Make sure to provide full context in the generated "
                "questions." 
        ),
        HumanMessagePromptTemplate.from_template( "Use the given format to generate hypothetical questions from the "
                "following input: {input}"
        )

    ]
)

print(questions_prompt.invoke({'input':'test data'}))

question_chain = questions_prompt | llm | NumberedListOutputParser()



messages=[AIMessage(content='You are generating hypothetical questions based on the information found in the text. Make sure to provide full context in the generated questions.'), HumanMessage(content='Use the given format to generate hypothetical questions from the following input: test data')]


In [110]:
## This is just an example of how to use  the invoke of the chains
# from langchain.prompts import HumanMessagePromptTemplate
# from langchain_core.messages import SystemMessage
# from langchain_openai import ChatOpenAI

# chat_template = ChatPromptTemplate.from_messages(
#     [
#         SystemMessage(
#             content=(
#                 "You are a helpful assistant that re-writes the user's text to "
#                 "sound more upbeat."
#             )
#         ),
#         HumanMessagePromptTemplate.from_template("{text}"),
#     ]
# )
# # messages = chat_template.format_messages(text="I don't like eating tasty things")
# messages = chat_template.invoke({"text":"I don't like eating tasty things"})
# print(messages)

messages=[SystemMessage(content="You are a helpful assistant that re-writes the user's text to sound more upbeat."), HumanMessage(content="I don't like eating tasty things")]


In [104]:
# parent.page_content

' its hyperbolic trajectory and estimated initial high velocity, to be from beyond the Solar System. The 2014 meteorite was detected three years earlier than the more recent and widely known interstellar objects, ʻOumuamua in 2017 and  2I/Borisov in 2019. Further related studies were reported on 1 September 2023.\nThe first known dinosaur fossil linked to the very day of the Chicxulub impact is reported by paleontologists at the Tanis site in North Dakota.\nOne science journalist reflects on the global management of the COVID-19 pandemic in relation to science, investigating the question "Why the WHO took two years to say COVID is airborne" – a finding hundreds of scientists reaffirmed in an open letter in July 2020 – with one indication that this may be one valid major concern to many expert scientists being several writings published by news outlets.\nA study decodes electrical communication between fungi into word-like components via spiking characteristics.\nResearchers demonstrate

In [115]:

for i, parent in enumerate(parent_documents):
    if i > 0:
        break
    print(f"Processing parent {i}")# with page content\n {parent.page_content}")
    print(f"prompt:\n {str(questions_prompt.invoke({'input': parent.page_content}))}")
    questions = question_chain.invoke({'input': parent.page_content})
    print(f"There were {len(questions)} questions generated")
    params = {
        "parent_id": i,
        "questions": [
            {"text": q, "id": f"{i}-{iq}", "embedding": embeddings.embed_query(q)}
            for iq, q in enumerate(questions)
            if q
        ],
    }
    graph.query(
        """
    MERGE (p:Parent {id: $parent_id})
    WITH p
    UNWIND $questions AS question
    CREATE (q:Question {id: question.id})
    SET q.text = question.text
    MERGE (q)<-[:HAS_QUESTION]-(p)
    WITH q, question
    CALL db.create.setVectorProperty(q, 'embedding', question.embedding)
    YIELD node
    RETURN count(*)
    """,
        params,
    )
    # Create vector index
    try:
        graph.query(
            "CALL db.index.vector.createNodeIndex('hypothetical_questions', "
            "'Question', 'embedding', $dimension, 'cosine')",
            {"dimension": embedding_dimension},
        )
    except ClientError:  # already exists
        pass



Processing parent 0
prompt:


AttributeError: 'list' object has no attribute 'questions'

In [84]:
graph.query(
    """
MERGE (p:Parent {id: $parent_id})
WITH p
UNWIND $questions AS question
CREATE (q:Question {id: question.id})
SET q.text = question.text
MERGE (q)<-[:HAS_QUESTION]-(p)
WITH q, question
CALL db.create.setVectorProperty(q, 'embedding', question.embedding)
YIELD node
RETURN count(*)
""",
    params,
)
# Create vector index
try:
    graph.query(
        "CALL db.index.vector.createNodeIndex('hypothetical_questions', "
        "'Question', 'embedding', $dimension, 'cosine')",
        {"dimension": embedding_dimension},
    )
except ClientError:  # already exists
    pass

In [93]:

# Ingest summaries

summary_prompt = ChatPromptTemplate.from_messages(
    [
        AIMessage(
            content="You are generating concise and accurate summaries based on the "\
                "information found in the text."
        ),
        HumanMessagePromptTemplate.from_template( 
         "Generate a summary of the following input: \n {input}\n" "Summary:\n"
        ),
    ]
)

summary_chain = summary_prompt | llm | StrOutputParser()

for i, parent in enumerate(parent_documents):
    if i > 0:
        break
    summary = summary_chain.invoke({"input": parent.page_content})
    params = {
        "parent_id": i,
        "summary": summary,
        "embedding": embeddings.embed_query(summary),
    }
    graph.query(
        """
    MERGE (p:Parent {id: $parent_id})
    MERGE (p)-[:HAS_SUMMARY]->(s:Summary)
    SET s.text = $summary
    WITH s
    CALL db.create.setVectorProperty(s, 'embedding', $embedding)
    YIELD node
    RETURN count(*)
    """,
        params,
    )
    # Create vector index
    try:
        graph.query(
            "CALL db.index.vector.createNodeIndex('summary', "
            "'Summary', 'embedding', $dimension, 'cosine')",
            {"dimension": embedding_dimension},
        )
    except ClientError:  # already exists
        pass

In [89]:
parent.page_content

' its hyperbolic trajectory and estimated initial high velocity, to be from beyond the Solar System. The 2014 meteorite was detected three years earlier than the more recent and widely known interstellar objects, ʻOumuamua in 2017 and  2I/Borisov in 2019. Further related studies were reported on 1 September 2023.\nThe first known dinosaur fossil linked to the very day of the Chicxulub impact is reported by paleontologists at the Tanis site in North Dakota.\nOne science journalist reflects on the global management of the COVID-19 pandemic in relation to science, investigating the question "Why the WHO took two years to say COVID is airborne" – a finding hundreds of scientists reaffirmed in an open letter in July 2020 – with one indication that this may be one valid major concern to many expert scientists being several writings published by news outlets.\nA study decodes electrical communication between fungi into word-like components via spiking characteristics.\nResearchers demonstrate

In [94]:
summary = summary_chain.invoke({"input": parent.page_content})

In [95]:
print(summary)

The text doesn't provide any information to summarize. Please provide a text.


In [101]:
summary_prompt.format_messages(input=parent.page_content)

[AIMessage(content='You are generating concise and accurate summaries based on the information found in the text.'),
 HumanMessage(content='Generate a summary of the following input: \n {input}\nSummary:\n')]