In [1]:
import chainlit as cl

from dotenv import load_dotenv # type: ignore
import os
import openai

# Common data processing
import json
import textwrap

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain.schema.runnable.config import RunnableConfig
from langchain.tools.retriever import create_retriever_tool

from yfiles_jupyter_graphs import GraphWidget
from neo4j import GraphDatabase

from PyPDF2 import PdfReader
from langchain.schema import Document

from typing import Tuple, List, Optional

from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_experimental.graph_transformers import LLMGraphTransformer

#getting chat history
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.messages import BaseMessage, AIMessage
from langchain_core.tools import tool
from langchain.agents import AgentExecutor, create_react_agent
from langchain.chains import GraphCypherQAChain

import logging

# Warning control
import warnings
warnings.filterwarnings("ignore")

from io import BytesIO


from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)


from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers import PydanticToolsParser
from langchain_core.prompts import ChatPromptTemplate

from langchain.chains import RetrievalQA, GraphCypherQAChain
from typing import Literal


2024-08-29 09:23:03 - Loaded .env file


In [None]:
# Load settings from the local .env file; override=True lets values from the file
# replace variables that are already present in the process environment.
load_dotenv(".env", override=True)

# Azure OpenAI settings. Each lookup yields None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = os.environ.get("AZURE_OPENAI_API_VERSION")
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME = os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME_MODEL = os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME_MODEL")

# Neo4j connection settings.
NEO4J_URI = os.environ.get("NEO4J_URI")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE")

In [None]:
# Neo4j graph handle built from the connection settings read from the environment.
graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

In [None]:
# Azure-hosted chat model shared by the chains defined further down.
llm = AzureChatOpenAI(
    api_key=OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=AZURE_OPENAI_CHAT_DEPLOYMENT_NAME_MODEL,
    temperature=0.3,
)

In [None]:
# Azure-hosted embedding model used to build the Neo4j vector index.
# NOTE(review): the deployment comes from AZURE_OPENAI_CHAT_DEPLOYMENT_NAME even
# though this is an embeddings client, and no api_version is passed here (the chat
# model passes AZURE_OPENAI_API_VERSION) — confirm both are intentional.
embeddings = AzureOpenAIEmbeddings(
    model="text-embedding-ada-002",
    api_key=OPENAI_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=AZURE_OPENAI_CHAT_DEPLOYMENT_NAME,
)

In [None]:
# Hybrid-search vector index over the `text` property of `Document` nodes.
# No url/username/password is passed here, so the connection details must be
# resolved elsewhere (presumably from the NEO4J_* environment variables — confirm).
vector_index = Neo4jVector.from_existing_graph(
    embedding=embeddings,
    index_name="neo4j",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    search_type="hybrid",
)

In [None]:
# Schema for one decomposed sub-query. It is bound to the chat model as a tool
# (`llm.bind_tools([SubQuery])`) and used by `PydanticToolsParser` to parse the
# model's tool calls. The class docstring and the Field description below are
# presumably forwarded to the model as part of the tool schema, so treat them as
# prompt text rather than ordinary documentation.
class SubQuery(BaseModel):
    """Decompose a given question/query into sub-queries"""

    # Required field (the `...` default marks it as mandatory).
    sub_query: str = Field(
        ...,
        description="A unique paraphrasing of the original questions.",
    )
    
# System prompt for the query-decomposition chain: it asks the model to split the
# user question into two sub-queries (one for similarity search, one for a graph
# query) and gives a single worked example. The text is sent to the model as-is.
system = """You are an expert at converting user questions into Neo4j Cypher queries. \

Perform query decomposition. Given a user question, break it down into two distinct subqueries that \
you need to answer in order to answer the original question.

For the given input question, create a query for similarity search and create a query to perform neo4j graph query.
Here is example:
Question: Find the articles about the photosynthesis and return their titles.
Answers:
sub_query1 : Find articles related to photosynthesis.
sub_query2 : Return titles of the articles
"""
# Two-message chat prompt: the decomposition instructions, then the user question.
prompt = ChatPromptTemplate.from_messages(
    [("system", system), ("human", "{question}")]
)

# The chat model is offered SubQuery as a tool; the parser converts the resulting
# tool calls back into SubQuery instances.
llm_with_tools = llm.bind_tools([SubQuery])
parser = PydanticToolsParser(tools=[SubQuery])

# Full decomposition pipeline: prompt -> tool-calling model -> parsed sub-queries.
query_analyzer = prompt | llm_with_tools | parser

In [None]:
# String identifiers, presumably used as node names when the workflow graph is
# wired up (no usage is visible in this part of the notebook — confirm).

# Query pre-processing
DECOMPOSER = "decomposer"

# Retrieval and grading
RETRIEVE = "retrieve"
VECTOR_SEARCH = "vector_search"
GRADE_DOCUMENTS = "grade_documents"

# Prompt construction
CREATE_PREFIX = "create_prefix"
CREATE_CONTEXT = "create_context"
PROMPT_TEMPLATE = "prompt_template"
PROMPT_TEMPLATE_WITH_CONTEXT = "prompt_template_with_context"

# Graph question answering and final generation
GRAPH_QA = "graph_qa"
GRAPH_QA_WITH_CONTEXT = "graph_qa_with_context"
GENERATE = "generate"

In [None]:
from typing import List, TypedDict


class GraphState(TypedDict):
    """Typed dictionary describing the state of the workflow graph.

    The chain builders in this notebook read ``prompt`` and
    ``prompt_with_context`` from it; each key is described next to its
    declaration below.
    """

    # The question being answered.
    question: str
    # Result of the chain.
    documents: dict
    # Article ids returned by the vector search.
    article_ids: List[str]
    # Prompt template object.
    prompt: object
    # Prompt template that also carries context from the vector search.
    prompt_with_context: object
    # Decomposed queries.
    subqueries: object

In [None]:
def _build_graph_cypher_chain(cypher_prompt, verbose: bool):
    """Build a GraphCypherQAChain over the module-level ``graph`` and ``llm``.

    Shared implementation for the two public builders below, which differ only
    in the Cypher-generation prompt they use and in verbosity.

    Args:
        cypher_prompt: Prompt template used to generate the Cypher statement.
        verbose: Forwarded to the chain's ``verbose`` flag.

    Returns:
        The chain produced by ``GraphCypherQAChain.from_llm``.
    """
    return GraphCypherQAChain.from_llm(
        cypher_llm=llm,  # should use gpt-4 for production
        qa_llm=llm,
        validate_cypher=True,
        graph=graph,
        verbose=verbose,
        cypher_prompt=cypher_prompt,
        # Hand back the graph query result directly (see GraphCypherQAChain docs).
        return_direct=True,
    )


def get_graph_qa_chain(state: GraphState):
    """Create a Neo4j Graph Cypher QA chain from ``state["prompt"]``.

    Args:
        state: Workflow state; must contain the key ``"prompt"``.

    Returns:
        A verbose GraphCypherQAChain that generates Cypher with that prompt.

    Raises:
        KeyError: If ``"prompt"`` is missing from ``state``.
    """
    return _build_graph_cypher_chain(state["prompt"], verbose=True)


def get_graph_qa_chain_with_context(state: GraphState):
    """Create a Neo4j Graph Cypher QA chain from ``state["prompt_with_context"]``.

    Args:
        state: Workflow state; must contain the key ``"prompt_with_context"``.

    Returns:
        A non-verbose GraphCypherQAChain that generates Cypher with that prompt.

    Raises:
        KeyError: If ``"prompt_with_context"`` is missing from ``state``.
    """
    return _build_graph_cypher_chain(state["prompt_with_context"], verbose=False)

In [None]:
# Structured-output schema for the router: the model must pick exactly one of the
# two literal datasource values. The docstring and Field description are presumably
# passed to the model as part of the schema, so treat them as prompt text.
class RouteQuery(BaseModel):
    """Route a user query to the most relevant datasource."""

    # Required field; only "vector search" or "graph query" are accepted.
    datasource: Literal["vector search", "graph query"] = Field(
        ...,
        description="Given a user question choose to route it to vectorstore or graphdb.",
    )
    
# NOTE(review): this rebinds the module-level `llm`, replacing the AzureChatOpenAI
# instance created in an earlier cell. Functions that look up the global `llm` when
# they are called (e.g. get_graph_qa_chain) will use this ChatOpenAI model if they
# run after this cell — confirm that is intended, or give the router its own name.
llm = ChatOpenAI(temperature=0)
# Router model whose output is structured according to the RouteQuery schema.
structured_llm_router = llm.with_structured_output(RouteQuery)

# System prompt for the routing chain; the text is sent to the model as-is.
# This assignment reuses the name `system`, which the decomposition cell also
# assigns; the earlier `prompt` object was already built, so it is unaffected.
# NOTE(review): the prompt announces three routing situations and lists three
# example groups, while RouteQuery only allows two datasource values — confirm
# the intended mapping.
system = """You are an expert at routing a user question to perform vector search or graph query. 
The vector store contains documents related article title, abstracts and topics. Here are three routing situations:
If the user question is about similarity search, perform vector search. The user query may include term like similar, related, relvant, identitical, closest etc to suggest vector search. For all else, use graph query.

Example questions of Vector Search Case: 
    Find articles about photosynthesis
    Find similar articles that is about oxidative stress
    
Example questions of Graph DB Query: 
    MATCH (n:Article) RETURN COUNT(n)
    MATCH (n:Article) RETURN n.title

Example questions of Graph QA Chain: 
    Find articles published in a specific year and return it's title, authors
    Find authors from the institutions who are located in a specific country, e.g Japan
"""

# Two-message chat prompt: the routing instructions, then the user question.
route_prompt = ChatPromptTemplate.from_messages(
    [("system", system), ("human", "{question}")]
)

# Routing pipeline: prompt -> structured-output model.
question_router = route_prompt | structured_llm_router

In [None]:
def _neo4j_vector_index_for(index_name: str, node_label: str, text_node_properties: List[str]):
    """Instantiate a Neo4jVector from existing graph nodes.

    Shared implementation for the four public index builders below. It uses the
    notebook's own ``embeddings`` model and ``NEO4J_*`` connection settings; the
    previous code referenced ``EMBEDDING_MODEL`` and ``AURA_*`` names that are
    never defined in this notebook, so every call raised ``NameError``.

    Args:
        index_name: Name of the vector index in Neo4j.
        node_label: Label of the nodes to index.
        text_node_properties: Node properties whose text is embedded.

    Returns:
        The Neo4jVector returned by ``Neo4jVector.from_existing_graph``.
    """
    return Neo4jVector.from_existing_graph(
        embedding=embeddings,
        url=NEO4J_URI,
        username=NEO4J_USERNAME,
        password=NEO4J_PASSWORD,
        index_name=index_name,
        node_label=node_label,
        text_node_properties=text_node_properties,
        # All four indexes store their vectors under the same node property.
        embedding_node_property="embedding_vectors",
    )


def get_neo4j_vector_index():
    """Instantiate the Neo4j vector index over `Article` title and abstract."""
    return _neo4j_vector_index_for(
        index_name="title_abstract_vector",
        node_label="Article",
        text_node_properties=["title", "abstract"],
    )


def get_neo4j_title_vector_index():
    """Instantiate the Neo4j vector index over the `text` of `Title` nodes."""
    return _neo4j_vector_index_for(
        index_name="title_vector",
        node_label="Title",
        text_node_properties=["text"],
    )


def get_neo4j_abstract_vector_index():
    """Instantiate the Neo4j vector index over the `text` of `Abstract` nodes."""
    return _neo4j_vector_index_for(
        index_name="abstract_vector",
        node_label="Abstract",
        text_node_properties=["text"],
    )


def get_neo4j_topic_vector_index():
    """Instantiate the Neo4j vector index over the `text` of `Topic` nodes."""
    return _neo4j_vector_index_for(
        index_name="topic_vector",
        node_label="Topic",
        text_node_properties=["text"],
    )

In [None]:
# NOTE(review): this rebinds `vector_index`, which an earlier cell already set to
# the hybrid index over `Document` nodes. From here on the name refers to the
# `Article` title/abstract index, and get_vector_graph_chain() below reads it as a
# global — confirm the overwrite is intended.
vector_index = get_neo4j_vector_index()

def get_vector_graph_chain(k: int = 3):
    """Create a Neo4j-backed RetrievalQA chain.

    Reads the module-level ``llm`` and ``vector_index`` at call time, so the
    chain uses whatever those names are bound to when this function runs.

    Args:
        k: Number of documents requested from the retriever via
            ``search_kwargs``. Defaults to 3, the previously hard-coded value.

    Returns:
        A verbose "stuff" RetrievalQA chain that also returns its source documents.
    """
    retriever = vector_index.as_retriever(search_kwargs={"k": k})
    return RetrievalQA.from_chain_type(
        llm,
        chain_type="stuff",
        retriever=retriever,
        verbose=True,
        return_source_documents=True,
    )