In [None]:
#from langchain.chat_models import ChatOpenAI
from openai import OpenAI
import requests
from bs4 import BeautifulSoup

from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

import json
import os
os.environ["USER_AGENT"] = "agent1"

USER_AGENT environment variable not set, consider setting it to identify your requests.


### AgenticRAG

In [3]:
def fetch_ncbi_bookshelf_articles(gene, disease):
    """Fetch up to five review-article hits for a gene-disease pair from NCBI Bookshelf.

    NOTE: a later cell in this notebook defines a function with the same name
    that also ranks hits by citation count; once that cell has run it shadows
    this definition.

    Args:
        gene: Gene symbol used in the search query (e.g. "BRCA1").
        disease: Disease name used in the search query.

    Returns:
        dict: ``{"review_articles": [{"title": ..., "url": ...}, ...]}`` on
        success, or ``{"error": "NCBI Bookshelf request failed"}`` when the
        request raises or comes back with a non-200 status.
    """
    try:
        # Let requests build the query string so characters such as "&" or "#"
        # in the gene/disease text are escaped instead of corrupting the URL.
        response = requests.get(
            "https://www.ncbi.nlm.nih.gov/books/",
            params={"term": f"{gene} {disease} review"},
            timeout=10,  # never hang the notebook on a stalled connection
        )
    except requests.RequestException:
        return {"error": "NCBI Bookshelf request failed"}

    if response.status_code != 200:
        return {"error": "NCBI Bookshelf request failed"}

    soup = BeautifulSoup(response.text, "html.parser")

    review_articles = []
    for rev in soup.find_all("p", class_="title")[:5]:  # top 5 search hits
        anchor = rev.find("a")
        href = anchor.get("href") if anchor is not None else None
        if not href:
            # A title without a link cannot be scraped later; skip it.
            continue

        review_articles.append({
            "title": rev.text.strip(),
            # Search results carry site-relative links; make them absolute.
            "url": f"https://www.ncbi.nlm.nih.gov{href}",
        })

    return {"review_articles": review_articles}

# Disabled manual smoke test for fetch_ncbi_bookshelf_articles, kept as a bare
# string literal: running this cell only echoes the text, it does not call
# the function.
# NOTE(review): the last print line inside the string ends with a stray `'`;
# drop it before turning this snippet back into live code.
'''
# ✅ Test Function
gene = "BRCA1"
disease = "Breast Cancer"
articles = fetch_ncbi_bookshelf_articles(gene, disease)

# ✅ Print Retrieved Articles
for idx, article in enumerate(articles["review_articles"], 1):
    print(f"{idx}. {article['title']}")
    print(f"   URL: {article['url']}")'
'''


'\n# ✅ Test Function\ngene = "BRCA1"\ndisease = "Breast Cancer"\narticles = fetch_ncbi_bookshelf_articles(gene, disease)\n\n# ✅ Print Retrieved Articles\nfor idx, article in enumerate(articles["review_articles"], 1):\n    print(f"{idx}. {article[\'title\']}")\n    print(f"   URL: {article[\'url\']}")\'\n'

In [None]:
# from scholarly import scholarly
# import requests
# from bs4 import BeautifulSoup

# def get_citations_from_scholar(article_title):
#     """Fetch citation count for a review article using Google Scholar."""
#     try:
#         search_query = scholarly.search_pubs(article_title)
#         result = next(search_query, None)  # Get the first result
        
        
#         if result:
#             return result["num_citations"]  # Extract the citation count
#         else:
#             return 0  # No citations found
#     except Exception as e:
#         print(f"❌ Error fetching citations for {article_title}: {e}")
#         return 0


In [4]:
def get_citations(article_title):
    """Return the citation count for a review article.

    Only CrossRef is consulted at the moment (via
    ``get_citations_from_crossref``); no other source is queried.

    Args:
        article_title: Title of the article to look up.

    Returns:
        int: The positive citation count reported by CrossRef, or 0 when
        CrossRef reports none.
    """
    count = get_citations_from_crossref(article_title)
    # Always return an integer. Previously a zero count fell off the end of the
    # function and produced None, which breaks arithmetic and sorting downstream.
    return count if count and count > 0 else 0


# ✅ Fetch Citations via CrossRef API
def get_citations_from_crossref(article_title):
    """Look up an article title on the CrossRef works API and return its citation count.

    Args:
        article_title: Free-text title passed as CrossRef's ``query.title``.

    Returns:
        int: ``is-referenced-by-count`` of the first returned item that carries
        that field; 0 when no item has it or when the request/parsing fails
        (the error is printed, not raised).
    """
    try:
        # Pass the title through `params` so requests URL-encodes it; titles
        # containing "&", "#" or "?" would otherwise truncate or split the query.
        response = requests.get(
            "https://api.crossref.org/works",
            params={"query.title": article_title},
            timeout=10,
        )
        # Surface HTTP errors (rate limiting, 5xx) with a clear message instead
        # of an opaque JSON/KeyError failure further down.
        response.raise_for_status()
        data = response.json()

        for item in data["message"]["items"]:
            if "is-referenced-by-count" in item:
                return item["is-referenced-by-count"]

        return 0  # No item exposes a citation count

    except Exception as e:
        # Best-effort lookup: a failed citation query must not abort the pipeline.
        print(f"❌ Error fetching citations from CrossRef for {article_title}: {e}")
        return 0

In [5]:
def fetch_ncbi_bookshelf_articles(gene, disease, max_results=5):
    """Fetch review articles for a gene-disease pair from NCBI Bookshelf, ranked by citations.

    The Bookshelf search page is scraped for paragraph elements with class
    "title"; each hit's title is then looked up with
    ``get_citations_from_crossref``.

    Args:
        gene: Gene symbol used in the search query (e.g. "BRCA1").
        disease: Disease name used in the search query.
        max_results: Maximum number of search hits to process (default 5).

    Returns:
        dict: ``{"review_articles": [...]}`` holding one
        ``{"title", "url", "citations"}`` dict per usable hit, sorted by
        ``citations`` in descending order; or
        ``{"error": "NCBI Bookshelf request failed"}`` when the request raises
        or comes back with a non-200 status.
    """
    try:
        # Let requests build the query string so characters such as "&" or "#"
        # in the gene/disease text are escaped instead of corrupting the URL.
        response = requests.get(
            "https://www.ncbi.nlm.nih.gov/books/",
            params={"term": f"{gene} {disease} review"},
            timeout=10,  # never hang the notebook on a stalled connection
        )
    except requests.RequestException:
        return {"error": "NCBI Bookshelf request failed"}

    if response.status_code != 200:
        return {"error": "NCBI Bookshelf request failed"}

    soup = BeautifulSoup(response.text, "html.parser")

    review_articles = []
    for rev in soup.find_all("p", class_="title")[:max_results]:
        anchor = rev.find("a")
        href = anchor.get("href") if anchor is not None else None
        if not href:
            # A title without a link cannot be scraped later; skip it.
            continue

        review_title = rev.text.strip()
        review_articles.append({
            "title": review_title,
            # Search results carry site-relative links; make them absolute.
            "url": f"https://www.ncbi.nlm.nih.gov{href}",
            # Citation count comes from CrossRef, matched on the title text.
            "citations": get_citations_from_crossref(review_title),
        })

    # Most-cited first; ties keep their search-result order (stable sort).
    review_articles.sort(key=lambda x: x["citations"], reverse=True)

    return {"review_articles": review_articles}


### RAG for top 5 cited articles

In [7]:
def load_articles_with_webloader(review_articles):
    """Download each article's page with LangChain's WebBaseLoader.

    Args:
        review_articles: Iterable of dicts that each carry a "url" entry.

    Returns:
        list: Everything the loaders returned, in article order. URLs whose
        load raised are reported on stdout and contribute nothing.
    """
    loaded_docs = []

    for link in (entry["url"] for entry in review_articles):
        print(f"🌐 Scraping: {link}")

        try:
            loaded_docs += WebBaseLoader(link).load()
        except Exception as err:
            # One unreachable page should not abort the remaining downloads.
            print(f"❌ Error loading {link}: {err}")

    return loaded_docs


In [8]:
def split_articles(docs):
    """Break loaded documents into overlapping chunks.

    Args:
        docs: Documents to hand to the splitter's ``split_documents``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns for
        ``docs`` with a chunk size of 1000 and an overlap of 200.
    """
    splitter_options = {
        "chunk_size": 1000,    # size of each chunk
        "chunk_overlap": 200,  # amount shared between neighbouring chunks
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(docs)

In [9]:
def store_in_chromadb(chunks):
    """Embed document chunks with OpenAI and index them in a Chroma vector store.

    Args:
        chunks: Document chunks passed to ``Chroma.from_documents``.

    Returns:
        The vector store object returned by ``Chroma.from_documents``.
    """
    embedder = OpenAIEmbeddings(model="text-embedding-ada-002")
    index_kwargs = dict(documents=chunks, embedding=embedder)
    return Chroma.from_documents(**index_kwargs)

In [10]:
def retrieve_relevant_docs(vectorstore, query, top_k=3):
    """Run a similarity search against the vector store.

    Args:
        vectorstore: Object exposing ``similarity_search(query, k=...)``.
        query: Search text.
        top_k: Number of results requested from the store (default 3).

    Returns:
        The result of ``vectorstore.similarity_search(query, k=top_k)``.
    """
    matches = vectorstore.similarity_search(query, k=top_k)
    return matches

In [None]:
# import openai
# import json

# def predict_mechanism_with_rag(gene, disease, retrieved_docs):
#     """Predicts disease mechanism using retrieved literature and GPT-4."""
    
#     evidence_text = ""
#     for doc in retrieved_docs:
#         evidence_text += f"\n\n{doc.page_content[:2000]}"  # ✅ Limit to 2000 chars

#     # ✅ Prompt GPT-4 with retrieved evidence
#     prompt = f"""
#     Based on the following literature, determine the mechanism of disease for {gene} in {disease}:

#     {evidence_text}

#     Classify as:
#     1. Loss of Function (LoF) - Mutation leads to reduced/absent protein function.
#     2. Gain of Function (GoF) - Mutation leads to enhanced/new protein function.
#     3. Dominant Negative (DN) - Mutant protein interferes with wild-type protein.

#     Response format (JSON):
#     {
#         "mechanism": "...",
#         "justification": "..."
#     }
#     """
    

#     response = client.chat.completions.create(
#         model="gpt-4o",
#         messages=[{"role": "user", "content": prompt}]
#     )

#     return response.choices[0].message.content


In [11]:
#import openai

# ✅ Initialize OpenAI client
#client = openai.OpenAI()

def predict_mechanism_with_rag(gene, disease, retrieved_docs):
    """Ask the chat model to classify the disease mechanism from retrieved evidence.

    Args:
        gene: Gene symbol inserted into the prompt.
        disease: Disease name inserted into the prompt.
        retrieved_docs: Objects with a ``page_content`` string; the first 2000
            characters of each are used as evidence.

    Returns:
        dict: The model's reply parsed from JSON, or
        ``{"error": "Failed to predict mechanism"}`` if the API call or the
        JSON parsing raises (the exception is printed).
    """
    # Cap every document at 2000 characters, then separate them by blank lines.
    snippets = []
    for doc in retrieved_docs:
        snippets.append(doc.page_content[:2000])
    evidence_text = "\n\n".join(snippets)

    # Prompt text is sent verbatim; doubled braces render as literal braces.
    prompt = f"""
    Based on the following scientific literature, determine the mechanism of disease for {gene} in {disease}:

    {evidence_text}

    Choose one of the following classifications:
    - Loss of Function (LoF): Mutation leads to reduced/absent protein function.
    - Gain of Function (GoF): Mutation leads to enhanced/new protein function.
    - Dominant Negative (DN): Mutant protein interferes with wild-type protein.

    Return a **valid JSON response** in this exact format:
    {{
        "mechanism": "Loss of Function (LoF) | Gain of Function (GoF) | Dominant Negative (DN)",
        "justification": "A concise explanation based on extracted evidence."
    }}
    """

    try:
        llm = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        completion = llm.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            # Request JSON mode so the reply can be parsed directly.
            response_format={"type": "json_object"},
        )
        raw_answer = completion.choices[0].message.content
        return json.loads(raw_answer)
    except Exception as e:
        print(f"❌ Error predicting mechanism: {e}")
        return {"error": "Failed to predict mechanism"}


In [12]:
def score_confidence_with_rag(gene, disease, review_articles):
    """Derive a 0-10 confidence score from the citation counts of the first three articles.

    The score is one point per 100 total citations, capped at 10, so it can be
    0 when the articles are lightly cited.

    Args:
        gene: Unused; kept for a uniform pipeline signature.
        disease: Unused; kept for a uniform pipeline signature.
        review_articles: List of article dicts. Only the first three entries
            are counted; a missing or ``None`` "citations" value counts as 0.

    Returns:
        dict: ``{"mechanism": "Predicted Mechanism", "confidence_score": int}``.
        The "mechanism" value is a fixed placeholder string, not a prediction.
    """
    # Tolerate article dicts without citation data (for example, results that
    # were fetched without citation ranking) instead of raising KeyError.
    total_citations = sum(
        (article.get("citations") or 0) for article in review_articles[:3]
    )
    confidence_score = min(10, total_citations // 100)

    return {"mechanism": "Predicted Mechanism", "confidence_score": confidence_score}


In [13]:
def autonomous_mechanism_discovery(gene, disease):
    """Run the full mechanism-discovery pipeline for one gene-disease pair.

    Stages: fetch ranked review articles, load their text, split it into
    chunks, index the chunks in a vector store, retrieve the most relevant
    chunks, ask the LLM for a mechanism, and score confidence from citations.

    Args:
        gene: Gene symbol (e.g. "BRCA1").
        disease: Disease name (e.g. "Breast Cancer").

    Returns:
        dict: Report with keys ``gene``, ``disease``, ``mechanism``,
        ``confidence_score`` and ``justification``. If a stage yields nothing
        usable, the same keys are returned with ``None`` for the last three,
        plus an ``error`` key describing what failed.
    """

    def _abort(reason):
        # Keep the report shape stable so callers can always read the same keys.
        failed_report = {
            "gene": gene,
            "disease": disease,
            "mechanism": None,
            "confidence_score": None,
            "justification": None,
            "error": reason,
        }
        print(f"\n❌ Mechanism discovery aborted: {reason}")
        print(failed_report)
        return failed_report

    print("\n🔍 Step 1: Fetching Highly Cited Review Articles...")
    search_results = fetch_ncbi_bookshelf_articles(gene, disease)
    # The fetcher reports failure as {"error": ...} without a "review_articles" key.
    if "error" in search_results:
        return _abort(search_results["error"])
    review_articles = search_results.get("review_articles") or []
    if not review_articles:
        return _abort("No review articles found")

    print("\n🌐 Step 2: Scraping Articles Using LangChain WebLoader...")
    docs = load_articles_with_webloader(review_articles)
    if not docs:
        return _abort("No article text could be loaded")

    print("\n📄 Step 3: Splitting Articles for Storage...")
    chunks = split_articles(docs)
    if not chunks:
        # Nothing to index: skip the embedding call rather than send empty input.
        return _abort("Loaded articles produced no text chunks")

    print("\n🗄️ Step 4: Storing in ChromaDB...")
    vectorstore = store_in_chromadb(chunks)

    print("\n🔎 Step 5: Retrieving Top Relevant Evidence...")
    retrieved_docs = retrieve_relevant_docs(vectorstore, f"{gene} {disease} mechanism", top_k=3)

    print("\n🧠 Step 6: Predicting Mechanism Using RAG & GPT-4...")
    mechanism_result = predict_mechanism_with_rag(gene, disease, retrieved_docs)
    if not isinstance(mechanism_result, dict):
        return _abort("Mechanism prediction returned an unexpected payload")
    # The predictor reports failure as {"error": ...} without the report keys.
    if "error" in mechanism_result:
        return _abort(mechanism_result["error"])

    print("\n📊 Step 7: Scoring Confidence Based on Citations...")
    confidence_score = score_confidence_with_rag(gene, disease, review_articles)

    print("\n✅ Final Mechanism Report:")
    final_report = {
        "gene": gene,
        "disease": disease,
        # .get(): the model is asked for these keys but its JSON is not schema-checked.
        "mechanism": mechanism_result.get("mechanism"),
        "confidence_score": confidence_score["confidence_score"],
        "justification": mechanism_result.get("justification"),
    }

    print(final_report)
    return final_report

In [None]:
if __name__ == "__main__":
    # Demo run of the whole pipeline for the BRCA1 / breast cancer pair.
    report = autonomous_mechanism_discovery(gene="BRCA1", disease="Breast Cancer")


🔍 Step 1: Fetching Highly Cited Review Articles...

🌐 Step 2: Scraping Articles Using LangChain WebLoader...
🌐 Scraping: https://www.ncbi.nlm.nih.gov/books/NBK545867/?term=BRCA1%20Breast%20Cancer%20review
🌐 Scraping: https://www.ncbi.nlm.nih.gov/books/NBK12354/?term=BRCA1%20Breast%20Cancer%20review
🌐 Scraping: https://www.ncbi.nlm.nih.gov/books/NBK82221/?term=BRCA1%20Breast%20Cancer%20review
🌐 Scraping: https://www.ncbi.nlm.nih.gov/books/NBK430685/?term=BRCA1%20Breast%20Cancer%20review
🌐 Scraping: https://www.ncbi.nlm.nih.gov/books/NBK179201/?term=BRCA1%20Breast%20Cancer%20review

📄 Step 3: Splitting Articles for Storage...

🗄️ Step 4: Storing in ChromaDB...


  embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")



🔎 Step 5: Retrieving Top Relevant Evidence...

🧠 Step 6: Predicting Mechanism Using RAG & GPT-4...

📊 Step 7: Scoring Confidence Based on Citations...

✅ Final Mechanism Report:
{'gene': 'BRCA1', 'disease': 'Breast Cancer', 'mechanism': 'Loss of Function (LoF)', 'confidence_score': 2, 'justification': 'BRCA1 mutations in breast cancer typically result in a loss of function, as these mutations often lead to reduced or absent protein function which disrupts the DNA repair mechanism, thereby increasing the risk for cancer development.'}
