In [7]:
import os
from string import Template
import json
from neo4j import GraphDatabase
import glob
from timeit import default_timer as timer
from dotenv import load_dotenv
from time import sleep
import os

import streamlit as st
from langchain.chains import create_retrieval_chain
from langchain.callbacks.base import BaseCallbackHandler
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Neo4jVector
from streamlit.logger import get_logger
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from typing import Dict, List
from langchain_community.llms import Ollama
from pprint import pprint

In [8]:
config={"ollama_base_url": "http://localhost:11434",
        "llm_name": "llama3",
        "neo4j_url": "neo4j://localhost:7687",
        "neo4j_username": "neo4j",
        "neo4j_password": "password",
        "file_path": "data/people_profiles/people-profiles1.md",		
        }

In [9]:
llm = Ollama(model="llama3", temperature=0.2, format="json", base_url="http://localhost:11434")
llm_no_json = Ollama(model="llama3", temperature=0.3, base_url="http://localhost:11434")

In [10]:
# Neo4j configuration & constraints
url=config["neo4j_url"] #database url
username=config["neo4j_username"] #neo4j username
password=config["neo4j_password"] #neo4j password
gds = GraphDatabase.driver(url, auth=(username, password))

In [11]:
file_loc = config["file_path"]
with open(file_loc, 'r') as file:
    text = file.read().rstrip()

In [12]:
prompt = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>Extract all entities from this text: <|start_header_id|>user<|end_header_id|> Text: " + text + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
entities = llm.invoke(prompt)
print(entities)

{"entities": [
  {"type": "person", "value": "Sarah Johnson"},
  {"type": "skill", "value": "Machine Learning"},
  {"type": "skill", "value": "Data Analytics"},
  {"type": "skill", "value": "Azure"},
  {"type": "skill", "value": "Python"},
  {"type": "project", "value": "BetaHealth Secure Healthcare Data Analytics Platform on Azure"},
  {"type": "person", "value": "David Patel"},
  {"type": "skill", "value": "AWS"},
  {"type": "skill", "value": "Cloud Computing"},
  {"type": "skill", "value": "DevOps"},
  {"type": "skill", "value": "Data Warehousing"},
  {"type": "person", "value": "Amanda Rodriguez"},
  {"type": "skill", "value": "Data Security"},
  {"type": "skill", "value": "Compliance"},
  {"type": "skill", "value": "Healthcare Regulations"},
  {"type": "project", "value": "BetaHealth Secure Healthcare Data Analytics Platform on Azure"}
]}


In [13]:
prompt = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>Extract all relationships from the text and entities given: <|start_header_id|>user<|end_header_id|> Text: " + text + "Entities:" + entities + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
rs = llm.invoke(prompt)
print(rs)

{ "relationships": [
  {"type": "works_on", "person": "Sarah Johnson", "project": "BetaHealth Secure Healthcare Data Analytics Platform on Azure"},
  {"type": "has_skill", "person": "Sarah Johnson", "skill": "Machine Learning"},
  {"type": "has_skill", "person": "Sarah Johnson", "skill": "Data Analytics"},
  {"type": "has_skill", "person": "Sarah Johnson", "skill": "Azure"},
  {"type": "has_skill", "person": "Sarah Johnson", "skill": "Python"},
  {"type": "works_on", "person": "David Patel", "project": ""},
  {"type": "has_skill", "person": "David Patel", "skill": "AWS"},
  {"type": "has_skill", "person": "David Patel", "skill": "Cloud Computing"},
  {"type": "has_skill", "person": "David Patel", "skill": "DevOps"},
  {"type": "has_skill", "person": "David Patel", "skill": "Data Warehousing"},
  {"type": "works_on", "person": "Amanda Rodriguez", "project": "BetaHealth Secure Healthcare Data Analytics Platform on Azure"},
  {"type": "has_skill", "person": "Amanda Rodriguez", "skill": "D

In [14]:
# Prompt for generating Cypher queries from text
cypher_generation_template_0 = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Based on the entities and relationships given, format them into the following output. You can include additional context from the text as well.

0. ALWAYS FINISH THE OUTPUT. Never send partial responses.

1. The output should look like:
{
    "entities": [{"label":"EntityType","id":string,"name":string,"summary":string, "additional_properties": string}],
    "relationships": ["entityid1|RELATIONSHIP_VERB|entityid2"]
}

2. Some other formatting:
    - RELATIONSHIP_VERB should be in uppercase.
    - ids should not have spaces.

<|start_header_id|>user<|end_header_id|>
Text to Process:
$text

Entities:
$entities

Relationships:
$relationships
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

prompt = Template(cypher_generation_template_0).substitute(text=text, entities=entities, relationships=rs)
result = llm.invoke(prompt)
print(result)


{
    "entities": [
        {
            "label": "Person",
            "id": "sarahjohnson",
            "name": "Sarah Johnson",
            "summary": "",
            "additional_properties": ""
        },
        {
            "label": "Skill",
            "id": "machinelearning",
            "name": "Machine Learning",
            "summary": "",
            "additional_properties": ""
        },
        {
            "label": "Skill",
            "id": "dataanalytics",
            "name": "Data Analytics",
            "summary": "",
            "additional_properties": ""
        },
        {
            "label": "Skill",
            "id": "azure",
            "name": "Azure",
            "summary": "",
            "additional_properties": ""
        },
        {
            "label": "Skill",
            "id": "python",
            "name": "Python",
            "summary": "",
            "additional_properties": ""
        },
        {
            "label": "Project",
            

In [15]:
json_obj = json.loads(result)

In [16]:
json_objS = []
json_objS.append(json_obj)

In [17]:
for i, obj in enumerate(json_objS):
    print(obj)
    print(i)
    print(obj['entities'])
    print(obj['relationships'])

{'entities': [{'label': 'Person', 'id': 'sarahjohnson', 'name': 'Sarah Johnson', 'summary': '', 'additional_properties': ''}, {'label': 'Skill', 'id': 'machinelearning', 'name': 'Machine Learning', 'summary': '', 'additional_properties': ''}, {'label': 'Skill', 'id': 'dataanalytics', 'name': 'Data Analytics', 'summary': '', 'additional_properties': ''}, {'label': 'Skill', 'id': 'azure', 'name': 'Azure', 'summary': '', 'additional_properties': ''}, {'label': 'Skill', 'id': 'python', 'name': 'Python', 'summary': '', 'additional_properties': ''}, {'label': 'Project', 'id': 'betahhealthsecurehealthcaredataanalyticsplatformonazure', 'name': 'BetaHealth Secure Healthcare Data Analytics Platform on Azure', 'summary': '', 'additional_properties': ''}, {'label': 'Person', 'id': 'davidpatel', 'name': 'David Patel', 'summary': '', 'additional_properties': ''}, {'label': 'Skill', 'id': 'aws', 'name': 'AWS', 'summary': '', 'additional_properties': ''}, {'label': 'Skill', 'id': 'cloudcomputing', 'na

In [18]:
e_statements = []
r_statements = []
e_label_map = {}

# loop through our json object
for i, obj in enumerate(json_objS):
    # for each dictionary item, do this...
    # entities is a dictionary of dictionaries
    print(f"Generating cypher for file {i+1} of {len(json_obj)}")
    print()
    for entity in obj["entities"]:
        print(i, type(entity))
        label = entity["label"]
        id = entity["id"]
        id = id.replace("-", "").replace("_", "")
        # classifying all the information of a node into just labels, id, and properties (whcih stores the rest of the information)
        properties = {k: v for k, v in entity.items() if k not in ["label", "id"]}

        cypher = f'MERGE (n:{label} {{id: "{id}"}})'
        if properties:
            props_str = ", ".join(
                [f'n.{key} = "{val}"' for key, val in properties.items()]
            )
            cypher += f" ON CREATE SET {props_str}"
        e_statements.append(cypher)
        e_label_map[id] = label

    for rs in obj["relationships"]:
        src_id, rs_type, tgt_id = rs.split("|")
        src_id = src_id.replace("-", "").replace("_", "")
        print("src_id", src_id) 
        tgt_id = tgt_id.replace("-", "").replace("_", "")
        print("tgt_id", tgt_id)
        pprint(e_label_map)
        print("rs_type", rs_type)

        src_label = e_label_map[src_id]
        if tgt_id == "":
            continue
        else:
            tgt_label = e_label_map[tgt_id]

        cypher = f'MERGE (a:{src_label} {{id: "{src_id}"}}) MERGE (b:{tgt_label} {{id: "{tgt_id}"}}) MERGE (a)-[:{rs_type}]->(b)'
        r_statements.append(cypher)

Generating cypher for file 1 of 2

0 <class 'dict'>
0 <class 'dict'>
0 <class 'dict'>
0 <class 'dict'>
0 <class 'dict'>
0 <class 'dict'>
0 <class 'dict'>
0 <class 'dict'>
0 <class 'dict'>
0 <class 'dict'>
0 <class 'dict'>
0 <class 'dict'>
0 <class 'dict'>
0 <class 'dict'>
0 <class 'dict'>
src_id sarahjohnson
tgt_id betahhealthsecurehealthcaredataanalyticsplatformonazure
{'amandarodriguez': 'Person',
 'aws': 'Skill',
 'azure': 'Skill',
 'betahhealthsecurehealthcaredataanalyticsplatformonazure': 'Project',
 'cloudcomputing': 'Skill',
 'compliance': 'Skill',
 'dataanalytics': 'Skill',
 'datasecurity': 'Skill',
 'datawarehousing': 'Skill',
 'davidpatel': 'Person',
 'devops': 'Skill',
 'healthcareregulations': 'Skill',
 'machinelearning': 'Skill',
 'python': 'Skill',
 'sarahjohnson': 'Person'}
rs_type WORKS_ON
src_id sarahjohnson
tgt_id machinelearning
{'amandarodriguez': 'Person',
 'aws': 'Skill',
 'azure': 'Skill',
 'betahhealthsecurehealthcaredataanalyticsplatformonazure': 'Project',
 'c

In [19]:
e_statements

['MERGE (n:Person {id: "sarahjohnson"}) ON CREATE SET n.name = "Sarah Johnson", n.summary = "", n.additional_properties = ""',
 'MERGE (n:Skill {id: "machinelearning"}) ON CREATE SET n.name = "Machine Learning", n.summary = "", n.additional_properties = ""',
 'MERGE (n:Skill {id: "dataanalytics"}) ON CREATE SET n.name = "Data Analytics", n.summary = "", n.additional_properties = ""',
 'MERGE (n:Skill {id: "azure"}) ON CREATE SET n.name = "Azure", n.summary = "", n.additional_properties = ""',
 'MERGE (n:Skill {id: "python"}) ON CREATE SET n.name = "Python", n.summary = "", n.additional_properties = ""',
 'MERGE (n:Project {id: "betahhealthsecurehealthcaredataanalyticsplatformonazure"}) ON CREATE SET n.name = "BetaHealth Secure Healthcare Data Analytics Platform on Azure", n.summary = "", n.additional_properties = ""',
 'MERGE (n:Person {id: "davidpatel"}) ON CREATE SET n.name = "David Patel", n.summary = "", n.additional_properties = ""',
 'MERGE (n:Skill {id: "aws"}) ON CREATE SET n.n

In [20]:
r_statements

['MERGE (a:Person {id: "sarahjohnson"}) MERGE (b:Project {id: "betahhealthsecurehealthcaredataanalyticsplatformonazure"}) MERGE (a)-[:WORKS_ON]->(b)',
 'MERGE (a:Person {id: "sarahjohnson"}) MERGE (b:Skill {id: "machinelearning"}) MERGE (a)-[:HAS_SKILL]->(b)',
 'MERGE (a:Person {id: "sarahjohnson"}) MERGE (b:Skill {id: "dataanalytics"}) MERGE (a)-[:HAS_SKILL]->(b)',
 'MERGE (a:Person {id: "sarahjohnson"}) MERGE (b:Skill {id: "azure"}) MERGE (a)-[:HAS_SKILL]->(b)',
 'MERGE (a:Person {id: "sarahjohnson"}) MERGE (b:Skill {id: "python"}) MERGE (a)-[:HAS_SKILL]->(b)',
 'MERGE (a:Person {id: "davidpatel"}) MERGE (b:Skill {id: "aws"}) MERGE (a)-[:HAS_SKILL]->(b)',
 'MERGE (a:Person {id: "davidpatel"}) MERGE (b:Skill {id: "cloudcomputing"}) MERGE (a)-[:HAS_SKILL]->(b)',
 'MERGE (a:Person {id: "davidpatel"}) MERGE (b:Skill {id: "devops"}) MERGE (a)-[:HAS_SKILL]->(b)',
 'MERGE (a:Person {id: "davidpatel"}) MERGE (b:Skill {id: "datawarehousing"}) MERGE (a)-[:HAS_SKILL]->(b)',
 'MERGE (a:Person {i

In [21]:
with open("cyphers.txt", "w") as outfile:
    outfile.write("\n".join(e_statements + r_statements))

In [22]:
# Generate and execute cypher statements
cypher_statements = e_statements + r_statements
for i, stmt in enumerate(cypher_statements):
    print(f"Executing cypher statement {i+1} of {len(cypher_statements)}")
    try:
        gds.execute_query(stmt)
    except Exception as e:
        with open("failed_statements.txt", "w") as f:
            f.write(f"{stmt} - Exception: {e}\n")

Executing cypher statement 1 of 29
Executing cypher statement 2 of 29
Executing cypher statement 3 of 29
Executing cypher statement 4 of 29
Executing cypher statement 5 of 29
Executing cypher statement 6 of 29
Executing cypher statement 7 of 29
Executing cypher statement 8 of 29
Executing cypher statement 9 of 29
Executing cypher statement 10 of 29
Executing cypher statement 11 of 29
Executing cypher statement 12 of 29
Executing cypher statement 13 of 29
Executing cypher statement 14 of 29
Executing cypher statement 15 of 29
Executing cypher statement 16 of 29
Executing cypher statement 17 of 29
Executing cypher statement 18 of 29
Executing cypher statement 19 of 29
Executing cypher statement 20 of 29
Executing cypher statement 21 of 29
Executing cypher statement 22 of 29
Executing cypher statement 23 of 29
Executing cypher statement 24 of 29
Executing cypher statement 25 of 29
Executing cypher statement 26 of 29
Executing cypher statement 27 of 29
Executing cypher statement 28 of 29
E