In [None]:
import os, json
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from neo4j import GraphDatabase

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

In [None]:
# Load variables via python-dotenv, then read every credential the notebook needs.
load_dotenv()
GOOGLE_API_KEY, NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD = (
    os.getenv(var_name)
    for var_name in ("GOOGLE_API_KEY", "NEO4J_URI", "NEO4J_USER", "NEO4J_PW")
)

In [None]:
# Fail fast when a required credential is missing or empty.
assert GOOGLE_API_KEY, "Set GOOGLE_API_KEY"
assert all((NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)), "Set Neo4j credentials"

In [None]:
# Gemini chat model shared by the extraction chain and the answering functions.
# NOTE(review): GOOGLE_API_KEY is not passed explicitly here — presumably the
# client reads it from the environment; confirm.
llm = ChatGoogleGenerativeAI(model="models/gemini-2.5-flash", temperature=0)

In [None]:
# Extraction prompt: asks the model for a JSON array of
# {subject, relation, object} objects. `text` is the only input variable; the
# doubled braces in the example are literal braces, not placeholders.
prompt = PromptTemplate(
    input_variables=["text"],
    template="""
Extract all (subject, relation, object) triples that represent knowledge facts.
Return strictly JSON array of triples like:
[
  {{"subject": "A", "relation": "is a", "object": "B"}},
  {{"subject": "X", "relation": "treats", "object": "Y"}}
]

Text:
{text}
"""
)

# Chain that feeds the rendered prompt to `llm`; generate_triplets() invokes it.
chain = LLMChain(llm=llm, prompt=prompt)

In [None]:
def _strip_code_fence(raw):
    """Return ``raw`` without a surrounding Markdown code fence or language tag."""
    cleaned = raw.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
        cleaned = cleaned.lstrip()
    # Remove only a leading "json" tag. Valid JSON never starts with that word,
    # whereas "json" appearing inside the payload is data and must be kept.
    if cleaned[:4].lower() == "json":
        cleaned = cleaned[4:]
    return cleaned.strip()


def generate_triplets(text):
    """Extract knowledge triples from ``text`` with the LLM chain.

    Args:
        text: Free-form passage to mine for facts.

    Returns:
        The list decoded from the model's JSON reply (expected to hold dicts
        with "subject", "relation" and "object" keys). An empty list is
        returned when the reply is not valid JSON or is not a JSON array, so
        callers can always iterate over the result.
    """
    response = chain.invoke({"text": text})
    raw = response["text"] if isinstance(response, dict) else response
    try:
        triples = json.loads(_strip_code_fence(raw))
    except json.JSONDecodeError as e:
        print("JSON parsing failed:", e)
        return []
    if not isinstance(triples, list):
        print("JSON parsing failed: expected a JSON array of triples, got", type(triples).__name__)
        return []
    print("\nParsed Triples:\n", triples)
    return triples

In [None]:
# First source passage (AstraZeneca COVID-19 vaccine) fed to generate_triplets().
text = '''
The AstraZeneca COVID-19 Vaccine was developed to prevent COVID-19 caused by the SARS-CoV-2 virus. It was previously known as ChAdOx1 nCoV-19 or AZD1222. The ChAdOx1 viral vector was created at the University of Oxford and was also studied for use in vaccines against another coronavirus, MERS-CoV. The vaccine was produced through a partnership between the University of Oxford Jenner Institute and the Italian pharmaceutical company Advent Srl.
This vaccine contains a replication-deficient adenovirus that can produce the spike (S) protein of SARS-CoV-2. This allows the body to make antibodies and an immune response against the virus. In May 2020, the University of Oxford made a development and distribution deal with AstraZeneca. The vaccine was later approved and used in many countries, including Canada, Mexico, the UK, the EU, and Australia.
In March 2021, some EU countries temporarily stopped using the vaccine due to reports of blood clots with low platelet counts. The European Medicines Agency (EMA) reviewed these cases and found no increase in the overall risk of blood clots from the vaccine. However, it noted that very rare cases of blood clots with thrombocytopenia might be linked to the vaccine. The EMA stated that a causal link was possible but unproven, and it concluded that the benefits of the vaccine outweighed the risks.
A phase I/II clinical trial began in April 2020 in the UK to study the vaccine safety, efficacy, and immune response. It was a single-blinded, randomized, placebo-controlled trial involving healthy adults aged 18 to 55 years. The vaccine is given as an intramuscular injection and is approved for use in adults aged 18 years and older.
The AstraZeneca vaccine is given in two doses, spaced 4 to 12 weeks apart. Its overall efficacy in preventing COVID-19 is about 67%, and it is 100 percent effective in preventing severe illness, hospitalizations, and deaths. Studies suggested that longer intervals between doses—12 weeks or more—might improve efficacy to around 80%, although this finding is still under review.
In November 2021, Health Canada issued a warning about a rare risk of immune thrombocytopenia after vaccination. People were advised to seek medical help if they experienced unusual bleeding, bruising, shortness of breath, chest pain, leg pain or swelling, or persistent abdominal pain after receiving the vaccine.
The AstraZeneca vaccine uses a chimpanzee adenovirus vector called ChAdOx1. This vector carries the genetic instructions for the spike protein of SARS-CoV-2 in its pre-fusion form. After injection, the spike protein is produced in the body, triggering the immune system to create antibodies and immune cells that recognize and fight the virus. This immune response provides protection against future infection.
'''

In [None]:
# Extract triples from the first passage; this list is reused later by get_answer().
triplets_graph = generate_triplets(text)

In [None]:
# Single Neo4j driver shared by every session opened in this notebook.
# NOTE(review): no visible cell closes it — call driver.close() when finished.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

In [None]:
def create_constraints():
    """Ensure a uniqueness constraint on ``Entity.name`` exists, then report it."""
    constraint_cypher = "CREATE CONSTRAINT IF NOT EXISTS FOR (e:Entity) REQUIRE e.name IS UNIQUE"
    with driver.session() as db_session:
        db_session.run(constraint_cypher)
        print("Constraints created")

# Run up front so the Entity.name uniqueness constraint exists before triples are merged.
create_constraints()

In [None]:
def _relation_type(relation):
    """Turn a free-text relation into an UPPER_SNAKE_CASE relationship type.

    Every run of non-alphanumeric characters (spaces, hyphens, slashes,
    quotes, parentheses, ...) becomes a single underscore, so a relation such
    as "developed by/with" keeps its meaning instead of degrading to the
    generic fallback. Non-string input, empty results and names that are not
    valid identifiers (for example ones starting with a digit) fall back to
    "RELATED_TO".
    """
    if not isinstance(relation, str):
        return "RELATED_TO"
    normalized = "".join(ch if ch.isalnum() else "_" for ch in relation.strip().upper())
    candidate = "_".join(part for part in normalized.split("_") if part)
    return candidate if candidate.isidentifier() else "RELATED_TO"


def insert_triple(tx, subject, relation, obj):
    """Merge one (subject)-[relation]->(object) triple into the graph.

    Args:
        tx: Transaction-like object exposing ``run(query, **params)``.
        subject: Value stored as the ``name`` of the source ``Entity`` node.
        relation: Free-text relation; normalised by ``_relation_type``.
        obj: Value stored as the ``name`` of the target ``Entity`` node.
    """
    relation_type = _relation_type(relation)
    # The relationship type is interpolated into the query text rather than
    # passed as a parameter, so it is restricted to identifier characters above
    # and backtick-quoted so that reserved words stay valid type names.
    query = f"""
        MERGE (s:Entity {{name: $subject}})
        MERGE (o:Entity {{name: $object}})
        MERGE (s)-[:`{relation_type}`]->(o)
    """
    tx.run(query, subject=subject, object=obj)


def add_triples_to_neo4j(triples):
    """Write every well-formed triple to Neo4j, one write transaction per triple.

    Args:
        triples: Iterable of dicts with "subject", "relation" and "object"
            keys. ``None`` or an empty collection is accepted and writes
            nothing.

    Entries that are not dicts, or whose subject/relation/object is missing or
    empty, are reported and skipped instead of aborting the whole upload.
    """
    if not triples:
        print("No triples to add to Neo4j")
        return

    required_keys = ("subject", "relation", "object")
    inserted = 0
    skipped = 0
    with driver.session() as session:
        for t in triples:
            is_valid = isinstance(t, dict) and all(
                t.get(key) not in (None, "") for key in required_keys
            )
            if not is_valid:
                print("Skipping malformed triple:", t)
                skipped += 1
                continue
            session.execute_write(insert_triple, t["subject"], t["relation"], t["object"])
            inserted += 1

    if skipped:
        print(f"{inserted} triples added to Neo4j, {skipped} skipped")
    else:
        print("All triples added to Neo4j successfully")

In [None]:
# Write the first passage's triples into Neo4j.
add_triples_to_neo4j(triplets_graph)

In [None]:
# Second source passage (Janssen COVID-19 vaccine). Its triples must be extracted
# and uploaded to Neo4j (next cells) before the retrieval cells further down are run.
new_text = '''
The Janssen COVID-19 Vaccine, also known as Ad26.COV2.S, is a recombinant vaccine that uses an adenovirus serotype 26 (Ad26) vector to deliver a stabilized SARS-CoV-2 spike protein. The vaccine was developed through a collaboration between Johnson & Johnson (J&J), Janssen Pharmaceuticals, and the Beth Israel Deaconess Medical Center.
Preclinical studies in animals showed promising results. In hamsters infected with SARS-CoV-2, a single dose of the vaccine triggered neutralizing antibodies and protected against pneumonia and death caused by the virus. Further studies in rhesus monkeys showed that the Ad26 vaccine produced a strong immune response and provided near-complete protection, as seen in nasal and lung samples taken after exposure to the virus.
In June 2020, a Phase 1/2 clinical trial was launched to test the vaccine safety, immune response, and effectiveness in humans. The trial included 1,045 healthy adults between the ages of 18 and 55. The vaccine later received Emergency Use Authorization (EUA) in the United States.
The Janssen COVID-19 Vaccine is indicated for active immunization to prevent COVID-19 caused by the SARS-CoV-2 virus in adults aged 18 years and older. In the U.S., it is authorized under an EUA for individuals 18 years or older who either do not have access to, or are not medically suited for, other FDA-approved COVID-19 vaccines. It is also available to those who choose to receive it because they would otherwise not get vaccinated.
The Janssen vaccine uses a recombinant, replication-incompetent human adenovirus type 26 vector. This vector carries the genetic code for the SARS-CoV-2 spike protein in a stabilized form. After vaccination, the body cells produce the spike protein, which triggers the immune system to create antibodies and activate immune cells against it. This immune response provides protection from future infection with the SARS-CoV-2 virus.
'''

In [None]:
# Extract triples from the second (Janssen) passage.
new_triplets = generate_triplets(new_text)

In [None]:
# Write the second passage's triples into Neo4j.
add_triples_to_neo4j(new_triplets)

In [None]:
# Fold the newly extracted triples into the in-memory list. The cell is safe to
# re-run: triples already present are not appended again, and a failed
# extraction (None) on either side is treated as "nothing to add".
if triplets_graph is None:
    triplets_graph = []
# Mutate the existing list in place rather than rebinding the name, so any
# reference already holding this list sees the new triples.
for item in new_triplets or []:
    if item not in triplets_graph:
        triplets_graph.append(item)
# Stop run here after uploading

In [None]:
# Answer retrieval and generation system

In [None]:
def get_all_triples():
    """Fetch every (subject, relation, object) triple stored in Neo4j.

    Returns:
        A list of dicts with "subject", "relation" and "object" keys, where
        the subject/object are the ``name`` properties of the two endpoint
        nodes and the relation is the relationship type.

    Relationships whose start or end node has no ``name`` are dropped:
    they would otherwise come back as ``None`` and be indexed downstream as
    the literal text "None".
    """
    query = """
        MATCH (s)-[r]->(o)
        RETURN s.name AS subject, type(r) AS relation, o.name AS object
        """
    with driver.session() as session:
        return [
            {"subject": rec["subject"], "relation": rec["relation"], "object": rec["object"]}
            for rec in session.run(query)
            if rec["subject"] is not None and rec["object"] is not None
        ]

In [None]:
# Pull the whole graph back out of Neo4j and stop early if it is empty.
triples = get_all_triples()
print(f"Retrieved {len(triples)} triples from Neo4j")
if len(triples) == 0:
    raise ValueError("No triples found! Please ensure Neo4j contains nodes and relationships.")

# One plain-text sentence per triple ("IS_A" -> "is a"); these are what gets embedded.
triple_texts = list(
    map(
        lambda t: f"{t['subject']} {t['relation'].replace('_', ' ').lower()} {t['object']}",
        triples,
    )
)

In [None]:
import spacy
from sentence_transformers import SentenceTransformer
import faiss 

# NOTE(review): `nlp` is not referenced by any cell visible here — confirm it is
# still needed before paying the model-load cost.
nlp = spacy.load("en_core_web_sm")
# Sentence-embedding model used to vectorise both the stored triples and the questions.
EMBEDDING_MODEL = "multi-qa-MiniLM-L6-cos-v1"
embedder = SentenceTransformer(EMBEDDING_MODEL)

In [None]:
# Import kept so that any cell calling normalize() keeps working.
from sklearn.preprocessing import normalize

# normalize_embeddings=True already yields unit-length vectors, so a second
# normalisation pass is unnecessary; inner product on them is cosine similarity.
embeddings = embedder.encode(triple_texts, normalize_embeddings=True)
# FAISS expects a C-contiguous float32 matrix; enforce it here the same way the
# query vector is cast at search time.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)
print("FAISS index rebuilt successfully")


In [None]:
# Second Gemini client with a slightly higher temperature.
# NOTE(review): no visible cell references `llm_new`; answer_from_graph() and
# get_answer() both call `llm` — confirm which model was intended.
llm_new = ChatGoogleGenerativeAI(model="models/gemini-2.5-flash", temperature=0.2)

In [None]:
def retrieve_relevant_triples(question, top_k=10, threshold=0.12, verbose=False):
    """Retrieve the most relevant triples for a given question using FAISS.

    Args:
        question: Natural-language question to embed and search with.
        top_k: Maximum number of neighbours to request from the index.
        threshold: Minimum similarity score (exclusive) a hit must reach.
        verbose: When True, print every candidate with its score, including
            those below ``threshold``.

    Returns:
        Triple sentences from ``triple_texts`` ordered as returned by the
        index, limited to hits scoring above ``threshold``. Empty when the
        index holds no vectors or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than the index stores.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    # Normalise the query the same way the indexed embeddings were produced.
    q_emb = embedder.encode([question], normalize_embeddings=True)
    q_emb = np.asarray(q_emb, dtype=np.float32)

    scores, indices = index.search(q_emb, k)
    # Guard against ids that do not map to a stored sentence (e.g. -1 padding).
    hits = [
        (float(score), int(idx))
        for score, idx in zip(scores[0], indices[0])
        if 0 <= idx < len(triple_texts)
    ]

    if verbose:
        print("\nRetrieval Results:")
        for score, idx in hits:
            print(f"{score:.3f} → {triple_texts[idx]}")

    return [triple_texts[idx] for score, idx in hits if score > threshold]


def answer_from_graph(question):
    """Answer ``question`` from retrieved graph triples (retrieval-augmented).

    Looks up the relevant triples, and when any are found asks the LLM to
    answer strictly from them. Returns the fixed sentence
    "No relevant information found." without calling the LLM when retrieval
    comes back empty.
    """
    matches = retrieve_relevant_triples(question)
    if matches:
        triple_block = "\n".join(matches)
        rag_prompt = f"""
You are a factual and expert biomedical assistant that answers based ONLY on the provided triples.
If the question cannot be answered, say exactly:
"No relevant information found."

Question: {question}

Relevant triples:
{triple_block}

Give a clear answer using only the triples.
"""
        reply = llm.invoke(rag_prompt)
        return reply.content.strip()
    return "No relevant information found."

In [None]:
# Example query answered through FAISS retrieval over the graph triples.
question = "What does AZD1222 protect against?"
print(answer_from_graph(question))

In [None]:
# Fallback: run the cells below if the retrieval-based answer above does not work

In [None]:
def get_answer(question, triples=None):
    """Ask a question over structured triples using LLM reasoning.

    Args:
        question: Natural-language question.
        triples: Triples to use as context. When omitted, the current
            module-level ``triplets_graph`` is looked up at call time, so the
            function always sees the latest list even if that name was
            rebound after this function was defined.

    Returns:
        The model's answer with surrounding whitespace removed, or
        "No relevant information found." when there are no triples to
        reason over (the LLM is not called in that case).
    """
    if triples is None:
        triples = triplets_graph
    if not triples:
        return "No relevant information found."

    context = json.dumps(triples, indent=2)

    prompt = f"""
You are a factual assistant answering questions based ONLY on the provided triples.
If you cannot find relevant information, say exactly:
"No relevant information found."

Question: {question}

Triples:
{context}

Answer concisely based strictly on these triples.
"""

    response = llm.invoke(prompt)
    return response.content.strip()

In [None]:
# Example query answered by passing the in-memory triples straight to the LLM.
question = "What do you know about Janssen Covid19 vaccine?"
answer = get_answer(question)
print(answer)