In [None]:


import fitz  # PyMuPDF
import asyncio
import nest_asyncio
import os
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers.llm import LLMGraphTransformer
from langchain.schema import Document
from neo4j import GraphDatabase
from langchain.graphs import Neo4jGraph
from langchain_core.prompts import ChatPromptTemplate

# functions
def split_text(text, chunk_size=4000):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    The text is tokenised with ``str.split()`` (any run of whitespace is a
    separator), so the original spacing and line breaks are not preserved:
    words inside each chunk are re-joined with a single space.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of chunk strings, in order; empty when *text* has no words.
    """
    tokens = text.split()
    pieces = []
    for offset in range(0, len(tokens), chunk_size):
        pieces.append(" ".join(tokens[offset:offset + chunk_size]))
    return pieces

async def process_text_in_batches(text, chapter_name, batch_size=10):
    """Convert *text* into graph documents, tagging every node with its chapter.

    The text is chunked with ``split_text``, each chunk is wrapped in a
    ``Document``, and the documents are sent to the module-level
    ``llm_transformer`` in groups of *batch_size*.

    Args:
        text: Raw text to convert.
        chapter_name: Value stored under the ``'chapter'`` key of each
            extracted node's ``properties``.
        batch_size: Number of documents passed to the transformer per call.

    Returns:
        A flat list of the graph documents returned for all batches, in order.
    """
    documents = [Document(page_content=piece) for piece in split_text(text)]
    collected = []

    for offset in range(0, len(documents), batch_size):
        group = documents[offset:offset + batch_size]
        # The extraction prompt is whatever llm_transformer was configured with.
        converted = await llm_transformer.aconvert_to_graph_documents(group)
        # Progress/debug output: nodes extracted from the first document of the batch.
        print(converted[0].nodes)

        for graph_doc in converted:
            for entity in graph_doc.nodes:
                entity.properties['chapter'] = chapter_name

        collected.extend(converted)

    return collected


def read_text_from_pdf(file_path, pages):
    """Extract plain text from the given page ranges of a PDF.

    Args:
        file_path: Path of the PDF file to open.
        pages: Iterable of ``(start, end)`` pairs. Each pair is treated as a
            1-based inclusive range: page indices ``start - 1`` through
            ``end - 1`` are loaded.

    Returns:
        The text of all successfully read pages, each followed by a newline,
        concatenated in order.

    A ``ValueError`` raised while loading or reading a page is reported on
    stdout and that page is skipped; any other exception propagates.
    """
    pdf_document = fitz.open(file_path)
    page_texts = []
    try:
        for start, end in pages:
            print(f"Extracting text from pages {start} to {end}")
            for page_num in range(start - 1, end):
                try:
                    page = pdf_document.load_page(page_num)
                    page_texts.append(page.get_text("text") + "\n")
                except ValueError as e:
                    print(f"Error loading page {page_num + 1}: {e}")
    finally:
        # Release the file handle even if an unexpected error interrupts extraction.
        pdf_document.close()
    return "".join(page_texts)

async def main():
    """Extract nodes and relationships for every configured chapter.

    Pairs each entry of the module-level ``chapter_names`` with the page range
    at the same position in ``pages_to_extract``, reads that range from the
    PDF, converts the text to graph documents, and gathers their contents.

    Returns:
        A tuple ``(nodes, relationships)`` of two flat lists accumulated over
        all chapters.
    """
    pdf_path = r"/Users/titanyanlev/Desktop/future intelligence .pdf"

    all_graph_documents_nodes = []
    all_graph_documents_relationships = []

    for chapter_name, pages in zip(chapter_names, pages_to_extract):
        print(f"Processing chapter: {chapter_name} with pages: {pages}")
        text = read_text_from_pdf(pdf_path, [pages])

        graph_documents = await process_text_in_batches(text, chapter_name)

        # Gather from every graph document, not just the first: a chapter is
        # split into several chunks, and each chunk yields its own document.
        # Looping also copes with a chapter that produced no documents.
        for graph_document in graph_documents:
            all_graph_documents_nodes.extend(graph_document.nodes)
            all_graph_documents_relationships.extend(graph_document.relationships)

    return all_graph_documents_nodes, all_graph_documents_relationships

def create_node(tx, node):
    """MERGE a node described by *node* into the graph.

    Args:
        tx: Object exposing ``run(query, **params)`` (e.g. a Neo4j transaction).
        node: Mapping with keys ``'type'`` (used as the node label), ``'id'``
            (stored as both the ``id`` and ``name`` properties) and
            ``'chapter'``.
    """
    # Labels cannot be supplied as query parameters, so the label is embedded
    # in the query text. Quote it with backticks (doubling any embedded
    # backtick) so labels containing spaces or punctuation stay valid and
    # cannot alter the query.
    label = str(node['type']).replace("`", "``")
    query = f"""
    MERGE (n:`{label}` {{id: $id, name: $id, chapter: $chapter}})
    RETURN n
    """
    tx.run(query, id=node["id"], chapter=node['chapter'])

def create_relationship(tx, edge):
    """MERGE a relationship between two existing nodes matched by ``id``.

    Args:
        tx: Object exposing ``run(query, **params)`` (e.g. a Neo4j transaction).
        edge: Dict with keys ``'type'`` (relationship type), ``'source'`` and
            ``'target'`` (``id`` values of the end nodes) and, optionally,
            ``'chapter'`` (defaults to ``'Unknown'``).
    """
    # Relationship types cannot be supplied as query parameters, so the type
    # is embedded in the query text inside backticks. Double any embedded
    # backtick so it cannot terminate the quoted identifier early.
    relationship_type = str(edge['type']).replace("`", "``")
    query = f"""
    MATCH (a {{id: $source_id}})
    MATCH (b {{id: $target_id}})
    MERGE (a)-[r:`{relationship_type}` {{chapter: $chapter}}]->(b)
    RETURN r
    """
    tx.run(query, source_id=edge["source"], target_id=edge["target"], chapter=edge.get('chapter', 'Unknown'))

# Prompt: a sequence of system instructions followed by the human message,
# whose "{input}" placeholder is filled in when the template is formatted.
template = ChatPromptTemplate.from_messages(
    [
        ("system", instruction)
        for instruction in (
            "You're working on an advanced project to transform PDF text into unique nodes and relationships for a Neo4j database.",
            "This project requires creating over 1000 unique nodes and 1000 unique relationships, ensuring high accuracy and efficiency.",
            "make such nodes and relationships so that when using the graph information, one can get all the information the was provided in the source",
            "include more properties, not only id and name, e.g. year etc.",
        )
    ]
    + [("human", "{input}")]
)

# Property names handed to the graph transformer as ``node_properties``.
node_properties = [
    "type",
    "name",
    "chapter",
]

In [None]:

# --- LLM setup ---------------------------------------------------------------
# NOTE(review): this unconditionally overwrites any OPENAI_API_KEY already set
# in the environment with the placeholder below. Replace the placeholder before
# running, and avoid committing a real key to source control.
os.environ['OPENAI_API_KEY'] = "your-api-key"
your_llm_instance = ChatOpenAI(temperature=0.7,api_key=os.environ['OPENAI_API_KEY'], model_name="gpt-4-turbo")
# Graph transformer built from the LLM, the chat prompt `template`, and the
# `node_properties` list.
llm_transformer = LLMGraphTransformer(llm=your_llm_instance,prompt=template,node_properties=node_properties)

# --- Chapters to process -----------------------------------------------------
# chapter_names[i] is paired with pages_to_extract[i]; keep both lists the same
# length and in the same order. Each page range is a (start, end) pair.
chapter_names = ["Future of Healthcare", "Future of Consumption", "Future of Food", "Future of Entertainment"]
pages_to_extract = [(60, 72), (139,147), (148, 160), (236, 242)]  # Adjust page ranges as needed

# --- Neo4j connection --------------------------------------------------------
# NOTE(review): credentials are hard-coded placeholders; replace them before
# running and keep real credentials out of source control.
url = "neo4j+s://a1e9ad6b.databases.neo4j.io"
username = "neo4j"
password = "your_password"
# NOTE(review): not referenced anywhere in the visible code (name is also
# misspelled) — confirm whether a later cell uses it before renaming/removing.
enviromental_variable = "NEO4J_URL"

# Driver used to talk to the Neo4j instance at `url`.
driver = GraphDatabase.driver(url, auth=(username, password))
# Presumably applied so coroutines such as main() can be run from inside the
# notebook's already-running event loop — TODO confirm.
nest_asyncio.apply()