In [None]:
#import
import os
import pandas as pd
from langchain_community.graphs import Neo4jGraph
from langchain.document_loaders import TextLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_openai import ChatOpenAI
from langchain.schema import Document 
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget

# Credentials and connection settings.
# setdefault() keeps any value that is already exported in the environment, so
# running this cell no longer replaces real credentials with an empty string.
# The second argument is only a fallback for local experiments - do not commit
# real secrets here.
os.environ.setdefault("OPENAI_API_KEY", '')
os.environ.setdefault("NEO4J_USERNAME", '')
os.environ.setdefault("NEO4J_URI", '')
os.environ.setdefault("NEO4J_PASSWORD", '')

In [None]:
# Preprocessing: read the CSV, wrap every non-null 'Text' value in a Document,
# then cut the documents into token-based chunks.
csv_file = 'PATH'
df = pd.read_csv(csv_file)

# Rows whose 'Text' cell is missing are dropped before wrapping.
translated_texts = df['Text'].dropna().tolist()
raw_documents = [Document(page_content=body) for body in translated_texts]

# 512-token chunks; consecutive chunks share 24 tokens.
text_splitter = TokenTextSplitter(chunk_overlap=24, chunk_size=512)
documents = text_splitter.split_documents(raw_documents)

# Quick sanity check: show the first five chunks.
for doc in documents[0:5]:
    print(doc.page_content)

In [None]:
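# graph extraction: an LLM converts each text chunk into a graph document (no embeddings are computed in this cell)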
from tqdm import tqdm

# Chat model that drives the graph extraction; temperature 0 is requested.
llm = ChatOpenAI(model_name="MODELNAME", temperature=0)  # ex) gpt-4o
llm_transformer = LLMGraphTransformer(llm=llm)

# Convert one chunk per call and append as we go, so whatever has been
# converted so far stays in `graph_documents` if a later call raises.
graph_documents = []
for doc in tqdm(documents, desc="Transforming documents to graph format"):
    graph_documents += llm_transformer.convert_to_graph_documents([doc])

In [None]:
# Import into Neo4j: open a connection from the NEO4J_* environment variables
# and write the extracted graph documents one at a time.
graph = Neo4jGraph(
    database="DATABASE",  # Enter the name of the database you wish to specify here
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"],
)

# One document per call keeps the progress bar meaningful.
for graph_doc in tqdm(graph_documents, desc="Adding graph documents to Neo4j"):
    graph.add_graph_documents([graph_doc], include_source=True, baseEntityLabel=True)

In [None]:
#GraphRAG Settings
from typing import List
from pydantic import BaseModel, Field
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnableParallel, RunnablePassthrough
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.graphs import Neo4jGraph
from langchain.embeddings import OpenAIEmbeddings
from langchain.output_parsers import PydanticOutputParser
from langchain.schema.output_parser import StrOutputParser

# Connect using the NEO4J_* environment variables.
# NOTE(review): no database name is passed here, while the ingestion cell passes
# one explicitly - presumably this resolves to NEO4J_DATABASE or the server
# default; confirm both cells point at the same database.
graph = Neo4jGraph()

# Full-text index over entity ids; structured_retriever() looks entities up
# through it by the name 'entity'.
graph.query(
    "CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]"
)

# Vector index used by retriever() for the "global" and "hybrid" modes.
# It was referenced there but never defined, so those modes raised NameError.
# Assumes the source chunks were stored as `Document` nodes with a `text`
# property (what include_source=True is expected to produce) - verify against
# the database. Building the index may call the embeddings API for nodes that
# have no `embedding` property yet.
vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(),
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
)

# Output schema for entity extraction: a single required field, `names`,
# holding a list of strings. A PydanticOutputParser is built from this model.
# NOTE(review): documented with `#` comments on purpose - a class docstring
# would presumably be picked up as the schema description and change the
# format instructions sent to the model; verify before adding one.
class Entities(BaseModel):
    names: List[str] = Field(
        ...,  # Ellipsis: the field has no default, i.e. it is required
        description="All the person, organization, or business entities that appear in the text",
    )

# Parser that turns the model's reply into an `Entities` instance and also
# supplies the format instructions embedded in the prompt below.
parser = PydanticOutputParser(pydantic_object=Entities)

# Two-message prompt; `{question}` is filled per call, while
# `{format_instructions}` is bound once from the parser.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are extracting organization and person entities from the text."),
    ("human", "Extract information from the following input in JSON format:\n{question}\n\n{format_instructions}"),
])
prompt = prompt.partial(format_instructions=parser.get_format_instructions())

# This rebinds `llm`; the functions below use this model.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

# question -> prompt -> chat model -> parsed Entities
entity_chain = prompt | llm | parser

# Characters that carry operator meaning in Lucene query syntax. They are
# replaced with spaces before the text is handed to the full-text index.
_LUCENE_SPECIAL_CHARS = '+-&|!(){}[]^"~*?:\\/'


def _to_fulltext_query(text: str) -> str:
    """Return `text` with Lucene operator characters blanked and whitespace collapsed."""
    blanked = text.translate({ord(ch): " " for ch in _LUCENE_SPECIAL_CHARS})
    return " ".join(blanked.split())


def structured_retriever(question: str) -> str:
    """Collect graph relationships around the entities mentioned in `question`.

    Entities are extracted with `entity_chain`; each one is looked up in the
    'entity' full-text index (at most 2 matching nodes) and the non-MENTIONS
    relationships of those nodes are rendered as
    ``source - TYPE -> target`` lines (at most 50 per entity).

    Args:
        question: Free-text user question.

    Returns:
        All relationship lines joined by newlines, or an empty string when
        nothing was found.
    """
    entities = entity_chain.invoke({"question": question})
    lines = []
    for entity in entities.names:
        # The full-text procedure parses its argument as a Lucene query, so an
        # entity such as 'Foo (Bar' could fail to parse or act as an operator.
        search_terms = _to_fulltext_query(entity)
        if not search_terms:
            continue  # nothing searchable left; an empty query is not valid
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
            YIELD node, score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' + node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": search_terms},
        )
        # Skip null outputs (string concatenation with a missing id yields null).
        lines.extend(el["output"] for el in response if el["output"] is not None)
    # Join once at the end so lines from different entities are also separated
    # by a newline instead of being glued together.
    return "\n".join(lines)

def retriever(question: str, mode: str = "hybrid"):
    """Build the context string handed to the answering prompt.

    Modes:
        "gpt_only" - no retrieval; returns an empty string.
        "local"    - graph relationships from structured_retriever() only.
        "global"   - passages from `vector_index.similarity_search` only.
        "hybrid"   - both of the above.
    Any other mode yields the two section headers with empty bodies.

    `vector_index` is a module-level name that must exist before the
    "global" or "hybrid" mode is used. The query and the assembled context
    are printed as a side effect.
    """
    print(f"Search query: {question} | Mode: {mode}")

    if mode == "gpt_only":
        return ""

    use_graph = mode in ("local", "hybrid")
    use_vectors = mode in ("global", "hybrid")

    # Graph lookup runs first, then the vector search.
    graph_context = structured_retriever(question) if use_graph else ""
    if use_vectors:
        passages = [hit.page_content for hit in vector_index.similarity_search(question)]
    else:
        passages = []

    context = f"""Structured data:
{graph_context}
Unstructured data:
{"#Document ".join(passages)}
"""
    print(context)
    return context

# Answering prompt: the model is told to rely only on the retrieved context.
# Placeholders: {context} (retrieved text) and {question} (user question).
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "Use natural language and be concise.\n"
    "Answer:"
)
# Rebinds `prompt`; qa_chain() reads this module-level name when it runs.
prompt = ChatPromptTemplate.from_template(template)

def qa_chain(question: str, mode: str = "hybrid"):
    """Answer `question`, optionally grounding the model with retrieved context.

    For mode "gpt_only" the question goes straight to `llm` and the raw result
    of `llm.invoke` is returned (presumably a chat message object rather than
    a plain string - confirm before relying on it). For every other mode a
    chain is assembled that fetches context via retriever(question, mode=mode),
    fills the answering prompt, calls `llm` and returns the output of
    StrOutputParser.
    """
    if mode != "gpt_only":
        # Both branches receive the raw question: one turns it into context,
        # the other passes it through unchanged.
        inputs = RunnableParallel(
            {
                "context": lambda q: retriever(q, mode=mode),
                "question": RunnablePassthrough(),
            }
        )
        pipeline = inputs | prompt | llm | StrOutputParser()
        return pipeline.invoke(question)

    return llm.invoke(question)

In [None]:
# Response generation: ask the same question in every retrieval mode so the
# answers can be compared side by side.
if __name__ == "__main__":
    question = "PROMPT" #prompt

    # (banner, mode) pairs, in display order.
    runs = (
        ("=== GPT ONLY ===", "gpt_only"),
        ("\n=== LOCAL SEARCH ===", "local"),
        ("\n=== GLOBAL SEARCH ===", "global"),
        ("\n=== HYBRID SEARCH ===", "hybrid"),
    )
    for banner, mode in runs:
        print(banner)
        answer = qa_chain(question, mode=mode)
        # "gpt_only" returns the raw result of llm.invoke, while the other
        # modes return plain text. Print the message text when there is one so
        # all four answers are shown in the same form.
        print(getattr(answer, "content", answer))