In [3]:
from dotenv import load_dotenv # type: ignore
import os
import openai

# Common data processing
import json
import textwrap

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI
from langchain_openai import AzureChatOpenAI

# Warning control
import warnings
warnings.filterwarnings("ignore")

In [2]:
from langchain.document_loaders import WikipediaLoader
raw_document = WikipediaLoader(query = "Elizabeth 1").load()


In [3]:
# Extract text from each document
text_data = [doc.page_content for doc in raw_document]

# Join the text content into a single string
raw_document = ''.join(text_data)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
)

item1_text_chunks = text_splitter.split_text(raw_document)

In [4]:
len(item1_text_chunks)

68

In [4]:
chunks_with_metadata = []
chunk_seq_id = 0
for chunk in item1_text_chunks[:20]:
    chunk_dict = {'text': chunk, 'chunkId': f'chunk{chunk_seq_id:04d}'}
    chunks_with_metadata.append(chunk_dict)
    chunk_seq_id += 1
    print(f'\tSplit into {chunk_seq_id} chunks')
                                

	Split into 1 chunks
	Split into 2 chunks
	Split into 3 chunks
	Split into 4 chunks
	Split into 5 chunks
	Split into 6 chunks
	Split into 7 chunks
	Split into 8 chunks
	Split into 9 chunks
	Split into 10 chunks
	Split into 11 chunks
	Split into 12 chunks
	Split into 13 chunks
	Split into 14 chunks
	Split into 15 chunks
	Split into 16 chunks
	Split into 17 chunks
	Split into 18 chunks
	Split into 19 chunks
	Split into 20 chunks


In [6]:
print(chunks_with_metadata)

[{'text': 'Elizabeth I (7 September 1533 – 24 March 1603) was Queen of England and Ireland from 17 November 1558 until her death in 1603. She was the last monarch of the House of Tudor.\nElizabeth was the only surviving child of Henry VIII and his second wife, Anne Boleyn. When Elizabeth was two years old, her parents\' marriage was annulled, her mother was executed, and Elizabeth was declared illegitimate. Henry restored her to the line of succession when she was 10, via the Third Succession Act 1543. After Henry\'s death in 1547, Elizabeth\'s younger half-brother Edward VI ruled until his own death in 1553, bequeathing the crown to a Protestant cousin, Lady Jane Grey, and ignoring the claims of his two half-sisters, the Catholic Mary and the younger Elizabeth, in spite of statutes to the contrary. Edward\'s will was set aside within weeks of his death and Mary became queen, deposing and executing Jane. During Mary\'s reign, Elizabeth was imprisoned for nearly a year on suspicion of s

In [9]:
merge_chunk_node_query = """
MERGE(mergedChunk:Chunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET  
        mergedChunk.text = $chunkParam.text
RETURN mergedChunk
"""

In [4]:

from langchain.text_splitter import TokenTextSplitter
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
documents = text_splitter.split_documents(raw_document[:3])

In [5]:
type(documents)

list

In [9]:
item1_text_chunks[1]

'In government, Elizabeth was more moderate than her father and siblings had been. One of her mottoes was video et taceo ("I see and keep silent"). In religion, she was relatively tolerant and avoided systematic persecution. After the pope declared her illegitimate in 1570, which in theory released English Catholics from allegiance to her, several conspiracies threatened her life, all of which were defeated with the help of her ministers\' secret service, run by Sir Francis Walsingham. Elizabeth was cautious in foreign affairs, manoeuvring between the major powers of France and Spain. She half-heartedly supported a number of ineffective, poorly resourced military campaigns in the Netherlands, France, and Ireland. By the mid-1580s, England could no longer avoid war with Spain.\nAs she grew older, Elizabeth became celebrated for her virginity. A cult of personality grew around her which was celebrated in the portraits, pageants, and literature of the day. Elizabeth\'s reign became known 

In [4]:
# Load from environment
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE') or 'neo4j'
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
AZURE_OPENAI_API_VERSION = os.getenv('AZURE_OPENAI_API_VERSION')
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME = os.getenv('AZURE_OPENAI_CHAT_DEPLOYMENT_NAME')

In [6]:
kg = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
)

In [13]:
# Process each chunk individually
for chunk_data in chunks_with_metadata:
    kg.query(merge_chunk_node_query, params={'chunkParam': chunk_data})

In [14]:
kg.query("""
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (c:Chunk) REQUIRE c.chunkId IS UNIQUE
""")


[]

In [15]:
kg.query("SHOW INDEXES")

[{'id': 1,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 8, 16, 4, 46, 37, 244000000, tzinfo=<UTC>),
  'readCount': 20},
 {'id': 2,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 3,
  'name': 'unique_chunk',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'RANGE',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['chunkId'],
  'indexProvider': 'range-1.0',
  'owningConstraint': 'unique_chunk',
  'lastRead': None,
  'readCount': 0}]

In [16]:
node_count = 0
for chunk in chunks_with_metadata:
    print(f"Creating `:Chunk` node for chunk ID {chunk['chunkId']}")
    kg.query(merge_chunk_node_query, 
            params={
                'chunkParam': chunk
            })
    node_count += 1
print(f"Created {node_count} nodes")

Creating `:Chunk` node for chunk ID chunk0000
Creating `:Chunk` node for chunk ID chunk0001
Creating `:Chunk` node for chunk ID chunk0002
Creating `:Chunk` node for chunk ID chunk0003
Creating `:Chunk` node for chunk ID chunk0004
Creating `:Chunk` node for chunk ID chunk0005
Creating `:Chunk` node for chunk ID chunk0006
Creating `:Chunk` node for chunk ID chunk0007
Creating `:Chunk` node for chunk ID chunk0008
Creating `:Chunk` node for chunk ID chunk0009
Creating `:Chunk` node for chunk ID chunk0010
Creating `:Chunk` node for chunk ID chunk0011
Creating `:Chunk` node for chunk ID chunk0012
Creating `:Chunk` node for chunk ID chunk0013
Creating `:Chunk` node for chunk ID chunk0014
Creating `:Chunk` node for chunk ID chunk0015
Creating `:Chunk` node for chunk ID chunk0016
Creating `:Chunk` node for chunk ID chunk0017
Creating `:Chunk` node for chunk ID chunk0018
Creating `:Chunk` node for chunk ID chunk0019
Created 20 nodes


In [17]:
kg.query("""
         MATCH (n)
         RETURN count(n) as nodeCount
         """)

[{'nodeCount': 20}]

In [7]:
kg.query("""
         CREATE VECTOR INDEX `form_10k_chunks` IF NOT EXISTS
          FOR (c:Chunk) ON (c.textEmbedding) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
         }}
""")

[]

In [8]:
kg.query("SHOW INDEXES")

[{'id': 5,
  'name': 'form_10k_chunks',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['textEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 1,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 8, 16, 4, 59, 56, 940000000, tzinfo=<UTC>),
  'readCount': 23},
 {'id': 2,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 3,
  'name': 'unique_chunk',
  'state': 'ONLI

In [7]:
# from langchain_openai import AzureChatOpenAI
# from langchain_openai import AzureOpenAI
# from langchain_openai import AzureOpenAIEmbeddings
import openai
from openai import AzureOpenAI


client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=AZURE_OPENAI_CHAT_DEPLOYMENT_NAME,
    api_version=AZURE_OPENAI_API_VERSION,
    api_key=OPENAI_API_KEY
)


In [10]:
def generate_embedding(text):
  response = client.embeddings.create(
    model="text-embedding-ada-002",  
    input=text,
  )
  return response.model_dump_json(indent=2)

In [18]:
from langchain.embeddings import OpenAIEmbeddings

for chunk in chunks_with_metadata:
  # embedding = OpenAIEmbeddings(model="text-embedding-ada-002", chunk_size=1)
  # e = embedding.embed_query(chunk['text'])
  embedding = generate_embedding(chunk['text'])
  # Update the query to set the embedding property
  kg.query("""
    MATCH (chunk:Chunk {chunkId: $chunkId})
    SET chunk.textEmbedding = $embedding
  """, params={"chunkId": chunk['chunkId'], "embedding": embedding})

In [11]:
kg.refresh_schema()
print(kg.schema)

Node properties:
Chunk {chunkId: STRING, text: STRING, textEmbedding: STRING}
Relationship properties:

The relationships:



In [8]:
def neo4j_vector_search(question):
  """Search for similar nodes using the Neo4j vector index"""
  vector_search_query = """
    WITH genai.vector.encode(
      $question, 
      "OpenAI", 
      {
        token: $openAiApiKey,
        endpoint: $openAiEndpoint
      }) AS question_embedding
    CALL db.index.vector.queryNodes($index_name, $top_k, question_embedding) yield node, score
    RETURN score, node.text AS text
  """
  similar = kg.query(vector_search_query, 
                     params={
                      'question': question, 
                      'openAiApiKey':OPENAI_API_KEY,
                      'openAiEndpoint': 'dialog-crm-gpt4o-westus', 
                      'top_k': 10})
  return similar

In [11]:
def neo4j_vector_search(question):
  """Search for similar nodes using pre-computed embeddings"""
  question_embedding = generate_embedding(question)

  vector_search_query = """
  MATCH (n)
  WHERE apoc.vector.cosineDistance(n.embedding, $question_embedding) < 0.5
  RETURN n, apoc.vector.cosineDistance(n.embedding, $question_embedding) AS distance
  ORDER BY distance ASC
  LIMIT 5
  """

  similar = kg.query(vector_search_query, params={'question_embedding': question_embedding})
  return similar

In [12]:
search_results = neo4j_vector_search(
    'In a single sentence, tell me about Elizabeth 1.'
)

ValueError: Generated Cypher Statement is not valid
{code: Neo.ClientError.Statement.SyntaxError} {message: Unknown function 'apoc.vector.cosineDistance' (line 3, column 11 (offset: 21))
"  WHERE apoc.vector.cosineDistance(n.embedding, $question_embedding) < 0.5"
         ^}

In [11]:
kg.query("""
    MATCH (chunk:Chunk) WHERE chunk.textEmbedding IS NULL
    WITH chunk, genai.vector.encode(
      chunk.text, 
      "OpenAI", 
      {
        token: $openAiApiKey, 
        endpoint: $openAiEndpoint
      }) AS vector
    CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", vector)
    """, 
    params={"openAiApiKey":OPENAI_API_KEY, "openAiEndpoint": AZURE_OPENAI_ENDPOINT} )

ValueError: Generated Cypher Statement is not valid
{code: Neo.ClientError.Statement.SyntaxError} {message: Unknown function 'genai.vector.encode' (line 3, column 21 (offset: 75))
"    WITH chunk, genai.vector.encode("
                 ^}

In [5]:
import os
from openai import AzureOpenAI

client = AzureOpenAI(
  api_key = OPENAI_API_KEY, 
  azure_deployment=AZURE_OPENAI_CHAT_DEPLOYMENT_NAME,
  api_version = AZURE_OPENAI_API_VERSION,
  azure_endpoint = AZURE_OPENAI_ENDPOINT
)

response = client.embeddings.create(
    input = "Your text string goes here",
    model= "text-embedding-ada-002"
)

print(response.model_dump_json(indent=2))

{
  "data": [
    {
      "embedding": [
        -0.007578954566270113,
        -0.0055061643943190575,
        0.011402026750147343,
        -0.0247525442391634,
        -0.024873483926057816,
        0.039802949875593185,
        -0.010347154922783375,
        -0.009480412118136883,
        -0.013384111225605011,
        -0.00991042423993349,
        -0.011717816814780235,
        0.008217253722250462,
        -0.014338199980556965,
        0.0077603659592568874,
        0.010044802911579609,
        -0.0049820877611637115,
        0.022777177393436432,
        -0.0017082883277907968,
        0.015319163911044598,
        -0.010279965586960316,
        0.0049182577058672905,
        0.012208298780024052,
        0.004834271036088467,
        0.010568879544734955,
        -0.006574474740773439,
        -0.00044596908264793456,
        0.005707732401788235,
        -0.012853316031396389,
        0.01634044200181961,
        0.004454651847481728,
        0.0063762660138309,
        -0.0

In [1]:
!pip install PyPDF2




[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
from PyPDF2 import PdfReader
from langchain.schema import Document
def read_pdf(pdf_path):
    pdf_reader = PdfReader(pdf_path)
    documents = []
    
    for page_num, page in enumerate(pdf_reader.pages):
        text = page.extract_text()
        metadata = {
            "source": pdf_path,
            "page": page_num + 1  # Page numbers are typically 1-based
        }
        documents.append(Document(page_content=text, metadata=metadata))
    
    return documents

pdf_path = 'Tesla.pdf'
  
raw_document = read_pdf(pdf_path)

# print(type(raw_document))

from langchain.text_splitter import TokenTextSplitter
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
documents = text_splitter.split_documents(raw_document[:3])

documents[0]




Document(metadata={'source': 'Tesla.pdf', 'page': 1}, page_content="Tesla, Inc.   is an American multinational  automotive  and  clean energy  company. \nHeadquartered in  Austin, Texas , it designs, manufactures and sells  battery electric \nvehicles  (BEVs), stationary battery  energy storage  devices from home to  grid-scale, solar \npanels  and  solar shingles , and related products and services.  \nTesla was incorporated in July 2003 by  Martin Eberhard  and  Marc Tarpenning  as Tesla Motors . \nIts name is a tribute to inventor and electrical engineer  Nikola Tesla . In February 2004,  Elon \nMusk  joined as Tesla's largest shareholder; in 2008, he was named  chief executive officer . In \n2008, the company began production of its first car model, the  Roadster  sports car, followed by \nthe Model S  sedan in 2012, the  Model X  SUV in 2015, the  Model 3  sedan in 2017, the  Model \nY crossover in 2020, the  Tesla Semi  truck in 2022 and the  Cybertruck  pickup truck in 2023. The