<a href="https://www.kaggle.com/code/jpthirumalai/pharmacovigilance-ver1-hcls?scriptVersionId=235117177" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

"""
Capstone Project: Multi-Agent System for Real-Time Pharmacovigilance Signal Detection<br>
Date: 2025-04-08
"""

In [2]:


# %% [markdown]
# # Capstone: Real-Time Pharmacovigilance Signal Detection Agent System
#
# **Goal:** Monitor diverse data sources (literature, news, simulated regulatory/social feeds) to identify potential drug safety signals using a multi-agent system powered by Gemini and Vector Search.
#
# **Core Components:**
# 1.  **Data Ingestion:** Fetch/Simulate data from PubMed, News, Social Media, FAERS.
# 2.  **Embedding & Vector Store:** Embed relevant text data and store in ChromaDB.
# 3.  **Knowledge Store:** Basic info on drugs, known ADRs (simulated).
# 4.  **Generative AI Agents:** Specialized agents for scanning, context analysis, and synthesis.
# 5.  **Agent Communication:** Simple function calls or message passing.
# 6.  **LLM:** Google Gemini (`gemini-1.5-flash-latest` or `gemini-1.5-pro-latest`).


In [3]:
!pip uninstall -qqy jupyterlab kfp  # Remove unused conflicting packages
!pip install -qU "google-genai==1.7.0" "chromadb==0.6.3"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.9/100.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m4.4 MB/s[0

In [4]:
!pip install feedparser
!pip install biopython

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6047 sha256=a8b0150514d6b45296e4f9e0c50d806963aaccff3093ccc259ee9fcb02e218d2
  Stored in directory: /root/.cache/pip/wheels/f0/69/93/a47e9d621be168e9e33c7ce60524393c0b92ae83cf6c6e89c5
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0
Collecting biopython
  Downloading biopython-1.85-cp310-cp310-man

In [5]:
from google import genai
from google.genai import types
import google.generativeai as ogenai

from IPython.display import Markdown

genai.__version__

'1.7.0'

In [6]:
!pip install phidata
!pip install groq

Collecting phidata
  Downloading phidata-2.7.10-py3-none-any.whl.metadata (38 kB)
Collecting pydantic-settings (from phidata)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting typing-inspection>=0.4.0 (from pydantic-settings->phidata)
  Downloading typing_inspection-0.4.0-py3-none-any.whl.metadata (2.6 kB)
Downloading phidata-2.7.10-py3-none-any.whl (716 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m716.9/716.9 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydantic_settings-2.9.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading typing_inspection-0.4.0-py3-none-any.whl (14 kB)
Installing collected packages: typing-inspection, pydantic-settings, phidata
Successfully installed phidata-2.7.10 pydantic-settings-2.9.1 typing-inspection-0.4.0
Collecting groq
  Downloading groq-0.22.0-py3-none-any.

In [7]:

import os
import json
import re
import datetime
import time
import hashlib # For generating consistent IDs
from typing import List, Dict, Any  # Import List, Dict and Any
from phi.agent import Agent
# from phi.model.groq import Groq
from groq import Groq
from phi.tools.tool import Tool
import requests
import hashlib
import traceback
from typing import List, Dict, Any  # Import List, Dict and Any
from kaggle_secrets import UserSecretsClient

# Core AI/VectorStore Libs
# import google.generativeai as genai
import chromadb
from chromadb.utils import embedding_functions

# Data Source Libs (Install as needed)
import requests # For NewsAPI, other web APIs
# from Bio import Entrez # For PubMed - Requires setup
import feedparser # For RSS Feeds (News, some journals)
# import praw # For Reddit - Requires API setup

print(f"Notebook started on: {datetime.datetime.now()}")
print(f"Current date context: Tuesday, April 8, 2025") # As per user context


Notebook started on: 2025-04-20 22:58:29.027635
Current date context: Tuesday, April 8, 2025


In [8]:
# --- Securely load API keys ---
# Recommended: Use environment variables or Colab secrets
# Example:
# GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
# NEWS_API_KEY = os.environ.get("NEWS_API_KEY")
# REDDIT_CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID")
# etc.

In [9]:
from kaggle_secrets import UserSecretsClient

google_client = UserSecretsClient().get_secret("GOOGLE_API_KEY")
news_client = UserSecretsClient().get_secret("NEWS_API_KEY")

2. Initialize LLM & Embedding Model (Using Google AI)

In [10]:
generation_config =[ 
    types.GenerateContentConfig(
      temperature = 0.7, # Adjust for creativity vs consistency
      top_p= 1.0,
      top_k= 32,
      max_output_tokens= 8192,
    )
]


In [11]:
# Safety settings for Gemini
# safety_settings = [
#     {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
#     {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
#     {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
#     {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
# ]

safety_settings=[
        types.SafetySetting(
            category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            threshold=types.HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
        ),]

In [12]:
# import google.generativeai as genai2
client = None
try:
    client = genai.Client(api_key=google_client)
    # Using Flash for speed, consider Pro for more complex reasoning
    model_name="gemini-2.0-flash-001",
    response = client.models.generate_content(
        model="gemini-2.0-flash-001",
        contents="Will this pfizer or abbvie solve any of their use cases with Pharmacovigilance",
        # config = generation_config,
        config = types.GenerateContentConfig(
            temperature = 0.7, # Adjust for creativity vs consistency
            top_p= 1.0,
            top_k= 32,
            max_output_tokens= 8192,
            safety_settings=[
            types.SafetySetting(
                category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
                threshold=types.HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
            ),]
        )
        # safetySettings = safety_settings
    )
    print(response.text)
    print(f"Gemini model '{model_name}' initialized.")

    # Using Google's text embedding model via the Generative AI SDK
    embedding_model_name = "models/text-embedding-004"
    # Note: Direct embedding function via genai SDK might be simpler for some use cases
    # Or use ChromaDB's helper with GoogleGenerativeAiEmbeddingFunction if needed
    google_ef = embedding_functions.GoogleGenerativeAiEmbeddingFunction(api_key=google_client, model_name=embedding_model_name)
    print(f"Google Embedding model '{embedding_model_name}' ready via ChromaDB helper.")

except Exception as e:
    print(f"Error initializing Google AI services: {e}")
    # Handle error appropriately (e.g., exit or fallback)


To answer whether Pfizer or AbbVie can solve any of their use cases with Pharmacovigilance (PV), we need to understand:

1. **What Pharmacovigilance is:**  It's the science and activities relating to the detection, assessment, understanding, and prevention of adverse effects or any other drug-related problem.  It's a crucial part of drug development and post-market surveillance.

2. **Common Use Cases in Pharmacovigilance:**  These are the tasks and challenges PV departments face. Here are some examples:

    *   **Adverse Event (AE) Case Management:**  Collecting, processing, and reporting individual case safety reports (ICSRs) from various sources (clinical trials, post-market surveillance, literature, etc.).
    *   **Signal Detection:**  Identifying potential new safety concerns by analyzing large datasets of AE reports.  This involves looking for patterns and trends that might not be obvious from individual cases.
    *   **Risk Management:**  Developing and implementing strategie

### Helper function for LLM calls

In [13]:
def call_gemini(prompt, llm_model=client, is_json_output=False):
    """Sends a prompt to the Gemini model and returns the text response."""
    try:
        client = genai.Client(api_key=google_client)
        # model_name="gemini-2.0-flash-001",
        # Add instruction for JSON output if requested
        if is_json_output:
             prompt += "\n\nPlease format your response strictly as a JSON object."

        response = client.models.generate_content(
            model = "gemini-2.0-flash",
            contents = prompt,
            # config=generation_config,
            # safetySettings=safety_settings
            config = types.GenerateContentConfig(
                temperature = 0.7, # Adjust for creativity vs consistency
                top_p= 1.0,
                top_k= 32,
                max_output_tokens= 8192,
                safety_settings=[
                types.SafetySetting(
                    category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
                    threshold=types.HarmBlockThreshold.BLOCK_LOW_AND_ABOVE,
                )]
            )
        )
        # Basic check for blocked content
        if not response.candidates:
             print("Warning: Response was blocked or empty.")
             return None
        # Attempt to extract text, handle potential errors
        try:
            result_text = response.text
            if is_json_output:
                # Clean potential markdown ```json ... ```
                result_text = re.sub(r'^```json\s*|\s*```$', '', result_text, flags=re.MULTILINE)
                return json.loads(result_text) # Parse JSON
            return result_text
        except (ValueError, json.JSONDecodeError) as json_err:
             print(f"Warning: Could not parse expected JSON output. Error: {json_err}")
             print(f"Raw response: {response.text}")
             return None # Or return raw text if preferred fallback
        except Exception as resp_err:
            print(f"Warning: Error extracting text from response. Error: {resp_err}")
            return None

    except Exception as e:
        print(f"Error calling Gemini API: {e}")
        print(traceback.format_exc())
               
        return None

In [14]:
gorq_client = Groq(
            api_key=UserSecretsClient().get_secret("GROQ_API_KEY"),  
        )

In [15]:
def call_groq(prompt, llm_model=gorq_client, is_json_output=False):
    """Sends a prompt to the Groq llama model and returns the text response."""
    try:
        
        # model_name="gemini-2.0-flash-001",
        # Add instruction for JSON output if requested
        if is_json_output:
             prompt += "\n\nPlease format your response strictly as a JSON object."

        
        
        chat_completion = gorq_client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model="llama3-8b-8192",
            #
            # Optional parameters
            #
        
            # Controls randomness: lowering results in less random completions.
            # As the temperature approaches zero, the model will become deterministic
            # and repetitive.
            temperature=0.5,
        
            # The maximum number of tokens to generate. Requests can use up to
            # 32,768 tokens shared between prompt and completion.
            max_completion_tokens=1024,
        
            # Controls diversity via nucleus sampling: 0.5 means half of all
            # likelihood-weighted options are considered.
            top_p=1,
        
            # A stop sequence is a predefined or user-specified text string that
            # signals an AI to stop generating content, ensuring its responses
            # remain focused and concise. Examples include punctuation marks and
            # markers like "[end]".
            stop=None,
        )
        response = chat_completion.choices[0].message.content

        # Basic check for blocked content
        if not response:
             print("Warning: Response was blocked or empty.")
             return None
        # Attempt to extract text, handle potential errors
        # print(response)
        try:
            result_text = response
            if is_json_output:
                # Clean potential markdown ```json ... ```
                result_text = re.sub(r'^```json\s*|\s*```$', '', result_text, flags=re.MULTILINE)
                return json.loads(result_text) # Parse JSON
            return result_text
        except (ValueError, json.JSONDecodeError) as json_err:
             print(f"Warning: Could not parse expected JSON output. Error: {json_err}")
             print(f"Raw response: {response.text}")
             return None # Or return raw text if preferred fallback
        except Exception as resp_err:
            print(f"Warning: Error extracting text from response. Error: {resp_err}")
            return None

    except Exception as e:
        print(f"Error calling Gemini API: {e}")
        print(traceback.format_exc())
               
        return None

### Helper function for Embeddings

In [16]:
def get_embedding(text, model_name=embedding_model_name):
    """Generates embeddings for a given text using Google's model."""
    try:
        result = ogenai.embed_content(model=f"{model_name}", content=text, task_type="RETRIEVAL_DOCUMENT")
        return result['embedding']
    except Exception as e:
        print(f"Error generating embedding: {e}")
        return None


## 3. Initialize Vector Store (ChromaDB - Local)

In [17]:

client = chromadb.PersistentClient(path="./chroma_pv_db") # Creates directory if not exists

# Using Google Generative AI embeddings with ChromaDB
# Note: If using google_ef helper, pass it here. Otherwise, generate embeddings separately.
try:
    # Collection for Literature Abstracts/Snippets
    literature_collection = client.get_or_create_collection(
        name="literature_pv",
        embedding_function=google_ef # Use the helper function
        # metadata={"hnsw:space": "cosine"} # Optional: Specify distance metric
    )
    print(f"ChromaDB collection 'literature_pv' ready. Item count: {literature_collection.count()}")

    # Collection for News/Social Media Posts (potentially shorter text)
    feeds_collection = client.get_or_create_collection(
        name="feeds_pv",
        embedding_function=google_ef
    )
    print(f"ChromaDB collection 'feeds_pv' ready. Item count: {feeds_collection.count()}")

except Exception as e:
    print(f"Error initializing ChromaDB: {e}")

ChromaDB collection 'literature_pv' ready. Item count: 0
ChromaDB collection 'feeds_pv' ready. Item count: 0


### Helper function to add to ChromaDB

In [18]:
import traceback

In [19]:
def add_to_vector_store(collection, documents, metadatas, ids):
    """Adds documents and metadata to the specified ChromaDB collection."""
    if not documents:
        print("No documents to add.")
        return
    try:
        collection.add(
            # embeddings=embeddings, # Not needed if embedding_function is set
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )
        print(f"Added {len(documents)} items to collection '{collection.name}'.")
    except Exception as e:
        print(f"Error adding to ChromaDB collection '{collection.name}': {e}")
        print(traceback.format_exc())



### Helper function to search ChromaDB

In [20]:
def search_vector_store(collection, query_text, n_results=5):
    """Searches the collection for text similar to the query text."""
    try:
        results = collection.query(
            query_texts=[query_text],
            n_results=n_results,
            include=['documents', 'metadatas', 'distances']
        )
        return results
    except Exception as e:
        print(f"Error searching ChromaDB collection '{collection.name}': {e}")
        return None

## 4. Define Knowledge Store (using agents to get literature from pubmed & news from newsapi)
### In a real system, this could be a database or more structured files.

In [21]:
knowledge_store = {
    "drugs": {
        "Levothyroxine": {
            "class": "Thyroid hormone",
            "indication": "Hypothyroidism",
            "known_adrs": ["Headache", "Nausea", "Palpitations"],
        },
        "Lisinopril": {
            "class": "ACE inhibitor",
            "indication": "Hypertension",
            "known_adrs": ["Cough", "Dizziness", "Hypotension"],
        },
        "Metformin": {
            "class": "Biguanide",
            "indication": "Type 2 Diabetes",
            "known_adrs": ["Diarrhea", "Nausea", "Abdominal Pain"],
        },
        "Simvastatin": {
            "class": "Statin",
            "indication": "Hypercholesterolemia",
            "known_adrs": ["Muscle Pain", "Elevated Liver Enzymes"],
        },
        "Sertraline": {
            "class": "SSRI",
            "indication": "Depression",
            "known_adrs": ["Nausea", "Insomnia", "Anxiety"],
        },
        "Amoxicillin": {
            "class": "Penicillin",
            "indication": "Bacterial Infection",
            "known_adrs": ["Diarrhea", "Rash"],
        },
        "Warfarin": {
            "class": "Anticoagulant",
            "indication": "Blood Clot Prevention",
            "known_adrs": ["Bleeding"],
        },
        "Prednisone": {
            "class": "Corticosteroid",
            "indication": "Inflammation",
            "known_adrs": ["Weight Gain", "Mood Changes", "Increased risk of infection"],
        },
        "Insulin": {
            "class": "Hormone",
            "indication": "Diabetes",
            "known_adrs": ["Hypoglycemia"],
        },
        "Oxycodone": {
            "class": "Opioid",
            "indication": "Pain",
            "known_adrs": ["Constipation", "Nausea", "Drowsiness"],
        },
    },
    "meddra_mapping": {  # Example mapping, add more!
        "cardiovascular complications": "Cardiovascular Disorder",
        "osteoporosis": "Bone Density Decreased",
        "testicular dysfunction": "Testicular Disorder",
        "hypotension":"Low blood pressure",
        "gastrointestinal discomfort": "Gastrointestinal Disorder",
        "hepatic impairments":"Liver Disorder"
    },
    "seriousness_keywords": [
        "severe",
        "life-threatening",
        "hospitalization",
        "disability",
        "death",
        "permanent damage",
        "ICU",
        "intensive care",
        "suicide",
        "hemorrhage",
        "liver failure"
    ],
}

In [22]:
def get_drug_info(drug_name):
    return knowledge_store["drugs"].get(drug_name)

In [23]:
def map_to_meddra(term):
    # Simple keyword matching - real system needs fuzzy matching / NLP model
    term_lower = term.lower()
    for key, value in knowledge_store["meddra_mapping"].items():
        if key in term_lower:
            return value
    return term # Return original if no simple map found

In [24]:
def check_seriousness(text):
    text_lower = text.lower()
    for keyword in knowledge_store["seriousness_keywords"]:
        if keyword in text_lower:
            return True
    return False

In [25]:
import datetime
import hashlib
from Bio import Entrez  # Import Entrez
from Bio import Medline # Import Medline

def literature_scanner_agent(drugs_of_interest, search_terms, max_results=10):
    """
    Monitors PubMed for new relevant literature using Entrez.
    Extracts potential ADRs and drug mentions.
    Adds findings to vector store.
    Returns list of findings (e.g., AnalyzedItem objects or dicts).
    """

    print(f"\n--- Running Literature Scanner Agent ({datetime.datetime.now()}) ---")
    findings = []

    Entrez.email = UserSecretsClient().get_secret("THIRU_EMAIL")#"your.email@example.com"  # Always tell NCBI who you are! Replace with your email.

    for drug in drugs_of_interest:
        for keyword in search_terms:
            search_term = f"{drug} AND {keyword}"
            try:
                handle = Entrez.esearch(db="pubmed", term=search_term, retmax=max_results)
                record = Entrez.read(handle)
                handle.close()
                
                if record["IdList"]:
                    print(f"  Found {len(record['IdList'])} abstracts for: {search_term}")
                    
                    # Fetch the actual abstracts
                    handle = Entrez.efetch(db="pubmed", id=",".join(record["IdList"]), retmode="text", rettype="medline")
                    records = Medline.parse(handle)
                    
                    for article in records:
                        try:
                            if 'AB' in article and 'TI' in article:
                                abstract_text = article['TI'] + ". " + article['AB']
                                pmid = article.get('PMID', 'N/A')
                                # Basic ADR extraction (Improve with LLM!)
                                potential_adr = None
                                adr_keywords = ["side effect", "adverse reaction", "toxicity", "complication"]
                                for adr in adr_keywords:
                                    if adr in abstract_text.lower():
                                        #  Crude extraction, get creative/LLM here
                                        sentences = abstract_text.split('.')
                                        for sentence in sentences:
                                            if adr in sentence.lower():
                                                potential_adr = sentence.strip()
                                                break
                                        if potential_adr:
                                            break
                                
                                if potential_adr:
                                    finding = {
                                        "source": "pubmed",
                                        "source_id": pmid,
                                        "text": abstract_text,
                                        "potential_adr": potential_adr,
                                        "drug_mentioned": drug,
                                        "timestamp": datetime.datetime.now()
                                    }
                                    findings.append(finding)
                                    # print(f"    Relevant abstract: {pmid}")
                                    
                                    docs_to_embed = []
                                    metadatas = []
                                    ids = []
                                    
                                    # Prepare for vector store
                                    docs_to_embed.append(abstract_text)
                                    metadatas.append({
                                        "source": "pubmed",
                                        "source_id": pmid,
                                        "drug": drug,
                                        "adr_mention": potential_adr,
                                        "timestamp": finding["timestamp"].isoformat()
                                    })
                                    # Generate a unique, deterministic ID based on content or source ID
                                    hash_id = hashlib.sha256(pmid.encode()).hexdigest()
                                    ids.append(f"pubmed_{hash_id}")
                                    
                                    # Add findings to Vector Store
                                    if docs_to_embed:
                                        add_to_vector_store(literature_collection, documents=docs_to_embed, metadatas=metadatas, ids=ids)
                        except Exception as e:
                            print(f"  Error processing article: {e}")
            except Exception as e:
                print(f"  Error searching PubMed for {search_term}: {e}")

    return findings

### --- Agent Function Definitions ---

In [26]:
# print(drugs_of_interest)

In [27]:

# def literature_scanner_agent(drugs_of_interest, search_terms, max_results=10):
#     """
#     Monitors PubMed (simulated here) for new relevant literature.
#     Extracts potential ADRs and drug mentions.
#     Adds findings to vector store.
#     Returns list of findings (e.g., AnalyzedItem objects or dicts).
#     """
#     print(f"\n--- Running Literature Scanner Agent ({datetime.datetime.now()}) ---")
#     findings = []
#     # TODO: Implement actual PubMed API call using Entrez
#     # Entrez.email = "Your.Name.Here@example.org" # Always tell NCBI who you are
#     # handle = Entrez.esearch(db="pubmed", term="YourComplexSearchQuery", retmax=max_results)
#     # record = Entrez.read(handle)
#     # handle.close()
#     # etc... fetch abstracts

#     # --- Simulation ---
#     simulated_abstracts = [
#         {"id": "pmid1", "text": "A study on DrugA found increased reports of severe skin reactions, previously unknown.", "drug": "DrugA", "adr": "severe skin reactions"},
#         {"id": "pmid2", "text": "DrugB effectiveness was confirmed, common side effects like Fatigue were noted.", "drug": "DrugB", "adr": "Fatigue"},
#         {"id": "pmid3", "text": "Interesting case report linking DrugA to sudden onset Vertigo.", "drug": "DrugA", "adr": "Vertigo"},
#     ]
#     print(f"Simulating PubMed search, found {len(simulated_abstracts)} abstracts.")

#     docs_to_embed = []
#     metadatas = []
#     ids = []

#     for abstract in simulated_abstracts:
#          # Basic check if drug is relevant
#         if abstract["drug"] in drugs_of_interest:
#             finding = {
#                 "source": "pubmed",
#                 "source_id": abstract["id"],
#                 "text": abstract["text"],
#                 "potential_adr": abstract["adr"],
#                 "drug_mentioned": abstract["drug"],
#                 "timestamp": datetime.datetime.now()
#             }
#             findings.append(finding)
#             print(f"  Found relevant abstract: {abstract['id']}")

#             # Prepare for vector store
#             docs_to_embed.append(abstract["text"])
#             metadatas.append({
#                 "source": "pubmed",
#                 "source_id": abstract["id"],
#                 "drug": abstract["drug"],
#                 "adr_mention": abstract["adr"],
#                 "timestamp": finding["timestamp"].isoformat()
#             })
#             # Generate a unique, deterministic ID based on content or source ID
#             hash_id = hashlib.sha256(abstract["id"].encode()).hexdigest()
#             ids.append(f"pubmed_{hash_id}")

#     embedded_docs=get_embedding(docs_to_embed)
#     # Add findings to Vector Store
#     if docs_to_embed:
#         add_to_vector_store(literature_collection, documents=docs_to_embed, metadatas=metadatas, ids=ids)

#     return findings


In [28]:
# def news_listener_agent(drugs_of_interest, keywords, max_results=20):
#     """
#     Monitors NewsAPI (or RSS) for relevant articles.
#     Uses LLM to extract potential ADRs and drug mentions.
#     Adds findings to vector store.
#     Returns list of findings.
#     """
#     try:
#         print(f"\n--- Running News Listener Agent ({datetime.datetime.now()}) ---")
#         findings = []
#         # --- Actual NewsAPI Call ---
#         # url = f"https://newsapi.org/v2/everything?q={'+OR+'.join(keywords)}&apiKey={NEWS_API_KEY}&pageSize={max_results}&sortBy=publishedAt"
#         # try:
#         #     response = requests.get(url)
#         #     response.raise_for_status() # Raise HTTPError for bad responses (4XX, 5XX)
#         #     articles = response.json().get('articles', [])
#         # except requests.exceptions.RequestException as e:
#         #     print(f"Error fetching news: {e}")
#         #     articles = []
    
#         # --- Simulation ---
#         articles = [
#             {"source": {"name": "HealthNews"}, "title": "Concerns grow over DrugA side effects", "description": "Patients report unexpected issues like Vertigo after taking DrugA.", "url": "http://example.com/news1", "publishedAt": datetime.datetime.now().isoformat()},
#             {"source": {"name": "Tech Chronicle"}, "title": "New AI for Drug Discovery", "description": "Mentions DrugC development.", "url": "http://example.com/news2", "publishedAt": datetime.datetime.now().isoformat()},
#             {"source": {"name": "Med Journal"}, "title": "DrugB trial results positive", "description": "Standard side effects noted, effectiveness confirmed.", "url": "http://example.com/news3", "publishedAt": datetime.datetime.now().isoformat()},
#             {"source": {"name": "Med Journal"}, "title": "DrugB trial serious outcome", "description": "Patient reported fatigue, experienced spinning room patient hospitalized", "url": "http://simulated.thiru.com/news4", "publishedAt": datetime.datetime.now().isoformat()},
#             {"source": {"name": "Med Journal"}, "title": "DrugB trial serious outcome", "description": "Patient reported spinning room, experienced spinning room patient hospitalized", "url": "http://simulated1.thiru.com/news1", "publishedAt": datetime.datetime.now().isoformat()},
#             {"source": {"name": "Med Journal"}, "title": "DrugB trial serious outcome", "description": "Patient reported spinning room, experienced  patient hospitalized", "url": "http://simulated2.thiru.com/news4", "publishedAt": datetime.datetime.now().isoformat()},
#             {"source": {"name": "Med Journal"}, "title": "DrugB trial serious outcome", "description": "Patient reported spinning room, experienced spinning room patient hospitalized", "url": "http://simulated3.thiru.com/news5", "publishedAt": datetime.datetime.now().isoformat()},
#             {"source": {"name": "Med Journal"}, "title": "DrugB trial serious outcome", "description": "Patient reported spinning room, experienced spinning room patient hospitalized", "url": "http://simulated4.thiru.com/news6", "publishedAt": datetime.datetime.now().isoformat()},
        
#         ]
#         print(f"Simulating News search, found {len(articles)} articles.")
    
#         docs_to_embed = []
#         metadatas = []
#         ids = []
    
#         for article in articles:
#             content_to_analyze = f"Title: {article.get('title', '')}\nDescription: {article.get('description', '')}"
    
#             # Use LLM to check relevance and extract info
#             prompt = f"""Analyze the following news snippet. Does it mention any specific drugs from the list [{', '.join(drugs_of_interest)}]? Does it mention any potential adverse drug reactions or side effects?
    
#             News Snippet:
#             "{content_to_analyze}"
    
#             If it mentions both a relevant drug AND a potential side effect, respond in JSON format with keys "relevant": true, "drug_mentioned": ["list of drugs"], "potential_adr": ["list of adrs"].
#             Otherwise, respond with "relevant": false.
#             """
#             llm_response = call_gemini(prompt, is_json_output=True)
    
#             if llm_response and llm_response.get("relevant"):
#                 drug = llm_response.get("drug_mentioned", [None])[0] # Take first mentioned relevant drug
#                 adr = llm_response.get("potential_adr", [None])[0] # Take first mentioned relevant adr
    
#                 if drug in drugs_of_interest and adr:
#                     finding = {
#                         "source": "news",
#                         "source_id": article.get('url', f"news_{hashlib.sha256(content_to_analyze.encode()).hexdigest()}"),
#                         "text": content_to_analyze,
#                         "potential_adr": adr,
#                         "drug_mentioned": drug,
#                         "timestamp": datetime.datetime.fromisoformat(article.get('publishedAt').replace("Z", "+00:00")) if article.get('publishedAt') else datetime.datetime.now()
#                     }
#                     findings.append(finding)
#                     print(f"  Relevant news item found: {article.get('url')}")
    
#                     # Prepare for vector store
#                     docs_to_embed.append(content_to_analyze)
#                     metadatas.append({
#                         "source": "news",
#                         "source_id": finding["source_id"],
#                         "drug": drug,
#                         "adr_mention": adr,
#                         "timestamp": finding["timestamp"].isoformat()
#                     })
#                     ids.append(f"news_{hashlib.sha256(finding['source_id'].encode()).hexdigest()}")
    
    
#         # Add findings to Vector Store
#         if docs_to_embed:
#             add_to_vector_store(feeds_collection, documents=docs_to_embed, metadatas=metadatas, ids=ids)
    
#         return findings
#     except Exception as e:
#         print(traceback.format_exc())
#         return None


In [29]:
# Not using this as it result in quota limit if you are using free-tier Eg. : google.genai.errors.ClientError: 429 RESOURCE_EXHAUSTED. {'error'
# def news_listener_agent(drugs_of_interest, keywords, max_results=2):
#     """
#     Monitors NewsAPI for relevant articles.
#     Uses LLM to extract potential ADRs and drug mentions.
#     Adds findings to vector store.
#     Returns list of findings.
#     """
#     try:
#         print(f"\n--- Running News Listener Agent ({datetime.datetime.now()}) ---")
#         findings = []

#         # Securely load NewsAPI key
#         news_api_key = UserSecretsClient().get_secret("NEWS_API_KEY")  # Ensure you have added the key to secrets.

#         # Construct the query. We'll search for articles containing the drug name and ADR-related keywords
#         # The '+OR+' is crucial for combining the search terms correctly in the NewsAPI query.
#         base_url = "https://newsapi.org/v2/everything"
#         all_findings = []
#         for drug in drugs_of_interest:
#           query_terms = [drug] + keywords
#           query = " OR ".join([f'"{term}"' for term in query_terms])
#           url = f"{base_url}?q={query}&apiKey={news_api_key}&pageSize={max_results}&sortBy=relevancy"

#           try:
#               response = requests.get(url)
#               response.raise_for_status()  # Raise HTTPError for bad responses
#               articles = response.json().get('articles', [])
#           except requests.exceptions.RequestException as e:
#               print(f"Error fetching news for {drug}: {e}")
#               print(f"URL was: {url}")  # Print the URL to help with debugging
#               continue  # Skip to the next drug on error

#           print(f"Fetched {len(articles)} articles from NewsAPI for {drug}.")

#           for article in articles:
#               content_to_analyze = f"Title: {article.get('title', '')}\nDescription: {article.get('description', '')}"

#               # Use LLM to check relevance and extract info
#               prompt = f"""
#                   Analyze the following news snippet.
#                   Does it mention the drug {drug}?
#                   Does it mention any potential adverse drug reactions or side effects?
#                   News Snippet: "{content_to_analyze}"

#                   If it mentions both the drug and a potential side effect, respond in JSON format with keys:
#                   "relevant": true,
#                   "drug_mentioned": "{drug}",
#                   "potential_adr": ["list of adrs"],
#                   "text": "Full text of the snippet".
#                   Otherwise, respond with "relevant": false.
#                   Ensure the response is valid JSON.
#                   """
#               llm_response = call_gemini(prompt, is_json_output=True) # this may result in quota limit if you are using free-tier Eg. : google.genai.errors.ClientError: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.0-flash'}, 'quotaValue': '15'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '50s'}]}}


#               if llm_response and llm_response.get("relevant"):
#                   adrs = llm_response.get("potential_adr", [])
#                   text = llm_response.get("text", "")

#                   if adrs:
#                       for adr in adrs:
#                           finding = {
#                               "source": "news",
#                               "source_id": article.get('url', f"news_{hashlib.sha256(content_to_analyze.encode()).hexdigest()}"),
#                               "text": text,
#                               "potential_adr": adr,
#                               "drug_mentioned": drug,
#                               "timestamp": datetime.datetime.fromisoformat(article.get('publishedAt').replace("Z", "+00:00"))
#                               if article.get('publishedAt') else datetime.datetime.now(),
#                           }
#                           findings.append(finding)
#                           print(f"  Relevant news item found for {drug}: {article.get('url')}")
#         all_findings.extend(findings)
#         return all_findings

#     except Exception as e:
#         print(traceback.format_exc())
#         return []

In [30]:
# using groq cloud and llama3-8b

import os
import asyncio
from groq import AsyncGroq

def news_listener_groq_agent(drugs_of_interest, keywords, max_results=2):
    """
    Monitors NewsAPI for relevant articles.
    Uses LLM to extract potential ADRs and drug mentions.
    Adds findings to vector store.
    Returns list of findings.
    """
    try:
        print(f"\n--- Running News Listener Agent ({datetime.datetime.now()}) ---")
        findings = []

        # Securely load NewsAPI key
        news_api_key = UserSecretsClient().get_secret("NEWS_API_yKEY")  # Ensure you have added the key to secrets. used various news apikeys to overcome 429 error

        # Construct the query. We'll search for articles containing the drug name and ADR-related keywords
        # The '+OR+' is crucial for combining the search terms correctly in the NewsAPI query.
        base_url = "https://newsapi.org/v2/everything"
        all_findings = []
        for drug in drugs_of_interest:
          query_terms = [drug] + keywords
          query = " OR ".join([f'"{term}"' for term in query_terms])
          url = f"{base_url}?q={query}&apiKey={news_api_key}&pageSize={max_results}&sortBy=relevancy"

          try:
              response = requests.get(url)
              response.raise_for_status()  # Raise HTTPError for bad responses
              articles = response.json().get('articles', [])
          except requests.exceptions.RequestException as e:
              print(f"Error fetching news for {drug}: {e}")
              print(f"URL was: {url}")  # Print the URL to help with debugging
              continue  # Skip to the next drug on error

          print(f"Fetched {len(articles)} articles from NewsAPI for {drug}.")

          for article in articles:
              content_to_analyze = f"Title: {article.get('title', '')}\nDescription: {article.get('description', '')}"

              # Use LLM to check relevance and extract info
              prompt = f"""
                  Analyze the following news snippet.
                  Does it mention the drug {drug}?
                  Does it mention any potential adverse drug reactions or side effects?
                  News Snippet: "{content_to_analyze}"

                  If it mentions both the drug and a potential side effect, respond in JSON format with keys:
                  "relevant": true,
                  "drug_mentioned": "{drug}",
                  "potential_adr": ["list of adrs"],
                  "text": "Full text of the snippet".
                  Otherwise, respond with "relevant": false.
                  Ensure the response is valid JSON.
                  """
              llm_response = call_groq(prompt, is_json_output=True) # this may result in quota limit if you are using free-tier Eg. : google.genai.errors.ClientError: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.0-flash'}, 'quotaValue': '15'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '50s'}]}}
                

              if llm_response and llm_response.get("relevant"):
                  adrs = llm_response.get("potential_adr", [])
                  text = llm_response.get("text", "")

                  if adrs:
                      for adr in adrs:
                          finding = {
                              "source": "news",
                              "source_id": article.get('url', f"news_{hashlib.sha256(content_to_analyze.encode()).hexdigest()}"),
                              "text": text,
                              "potential_adr": adr,
                              "drug_mentioned": drug,
                              "timestamp": datetime.datetime.fromisoformat(article.get('publishedAt').replace("Z", "+00:00"))
                              if article.get('publishedAt') else datetime.datetime.now(),
                          }
                          findings.append(finding)
                          print(f"  Relevant news item found for {drug}: {article.get('url')}")
        all_findings.extend(findings)
        return all_findings

    except Exception as e:
        print(traceback.format_exc())
        return []

### Using GroqCloud and Phidata (an alternative to LangChain) as I am exceeding gemini-flash (Gooogle) quota in free-tier

### Helper function which will be used by tool from phidata

In [31]:
# # using phidata framework - agantic AI framework
# def get_drug_news(drugs_of_interest: List[str], keywords: List[str], max_results: int = 3) -> str:
#     """
#     Use this function to fetch news articles from the NewsAPI.

#     Args:
#     drugs_of_interest (List[str]): List of drugs for which to get the results from the News API.
#     keywords (List[str]): Keywords for which relevant search is to be made.
#     max_results (int): Number of news results to return. Defaults to 3.

#     Returns:
#         str: JSON string of top stories.
#     """
#     try:
#         print(f"\n--- Running NewsAPITool ({datetime.datetime.now()}) ---")
#         findings = []
#         base_url = "https://newsapi.org/v2/everything"
#         try:
#             # news_api_key = UserSecretsClient().get_secret("NEWS_API_KEY")
#             news_api_key = UserSecretsClient().get_secret("NEWS_API_hotKEY")
#             if not news_api_key:
#                 raise ValueError("NEWS_API_KEY environment variable not set.")
#         except KeyError:
#             raise ValueError("NEWS_API_KEY environment variable not set.")

#         for drug in drugs_of_interest:
#             query_terms = [drug] + keywords
#             query = " OR ".join([f'"{term}"' for term in query_terms])
#             url = f"{base_url}?q={query}&apiKey={news_api_key}&pageSize={max_results}&sortBy=relevancy"

#             try:
#                 response = requests.get(url)
#                 response.raise_for_status()
#                 articles = response.json().get('articles', [])
#             except requests.exceptions.RequestException as e:
#                 print(f"Error fetching news for {drug}: {e}")
#                 print(f"URL was: {url}")
#                 continue

#             print(f"Fetched {len(articles)} articles from NewsAPI for {drug}.")

#             for article in articles:
#                 # print(article) # Removed the print here to avoid cluttering output
#                 content_to_analyze = f"Title: {article.get('title', '')}\nDescription: {article.get('description', '')}"

#                 published_at = article.get('publishedAt')
#                 if published_at:
#                     if isinstance(published_at, str):
#                         try:
#                             timestamp = datetime.datetime.fromisoformat(published_at.replace("Z", "+00:00")).isoformat()
#                         except ValueError:
#                             print(f"Error parsing date string: {published_at} for drug {drug}.  Setting to now().")
#                             timestamp = datetime.datetime.now().isoformat()

#                     elif isinstance(published_at, datetime.datetime):
#                         timestamp = published_at.isoformat()
#                     else:
#                         print(f"Published at was not string or datetime for drug {drug}.  It was {type(published_at)}. Setting to now().")
#                         timestamp = datetime.datetime.now().isoformat()
#                 else:
#                     timestamp = datetime.datetime.now().isoformat()

#                 finding: Dict[str, Any] = {
#                     "source": "newsapi",
#                     "source_id": article.get('url', f"news_{hashlib.sha256(content_to_analyze.encode()).hexdigest()}"),
#                     "text": content_to_analyze,
#                     "timestamp": timestamp,
#                     "drug_mentioned": drug,
#                 }
#                 # print(f"*************\n{finding}############")  # Removed the print here to avoid cluttering output
#                 findings.append(finding)
#                 print(f"  News item found for {drug}: {article.get('url')}")  # Added a space for better formatting
#         # print(findings)
#         return json.dumps(findings)  # Convert the list of dictionaries to a JSON string
#         # return findings
#     except Exception as e:
#         print(traceback.format_exc())
#         return json.dumps([])  # Return an empty JSON string in case of an error

### TODO: Add similar agents for Social Media (Reddit/PRAW), simulated FAERS data

In [32]:
def clinical_context_agent(items):
    """
    Analyzes findings from other agents.
    Standardizes ADR terms using Knowledge Store (MedDRA map).
    Checks if ADR is known for the drug.
    Assesses potential seriousness.
    Returns list of contextualized findings.
    """
    print(f"\n--- Running Clinical Context Agent ({datetime.datetime.now()}) ---")
    contextualized_findings = []
    for item in items:
        drug = item['drug_mentioned']
        adr_raw = item['potential_adr']

        # Standardize ADR term
        adr_standardized = map_to_meddra(adr_raw)

        # Check if known ADR for this drug
        drug_info = get_drug_info(drug)
        is_known_adr = False
        if drug_info:
            # Basic check - real system might need fuzzy matching
            is_known_adr = any(known.lower() in adr_standardized.lower() for known in drug_info['known_adrs'])

        # Check seriousness
        is_serious = check_seriousness(item['text']) or check_seriousness(adr_raw)

        # Add context to the finding
        item['adr_standardized'] = adr_standardized
        item['is_known_adr'] = is_known_adr
        item['is_serious'] = is_serious
        contextualized_findings.append(item)
        print(f"  Contextualized: {item['source_id']} - ADR: {adr_standardized} (Known: {is_known_adr}, Serious: {is_serious})")

    return contextualized_findings


In [33]:
def signal_synthesizer_agent(contextualized_items, time_window_days=7, min_reports_for_signal=3):
    """
    Looks for patterns and correlations in contextualized findings.
    Flags potential signals based on criteria (e.g., multiple reports of unexpected ADR).
    Uses Vector Store to find similar past reports.
    Returns list of potential signals (e.g., Signal objects or dicts).
    """
    print(f"\n--- Running Signal Synthesizer Agent ({datetime.datetime.now()}) ---")
    potential_signals = []
    cutoff_date = datetime.datetime.now() - datetime.timedelta(days=time_window_days)

    # Group findings by Drug and Standardized ADR within the time window
    adr_groups = {}
    for item in contextualized_items:
        # Ensure timestamp is timezone-aware or convert naive to aware for comparison
        item_ts = item['timestamp']
        if item_ts.tzinfo is None:
             # Assuming UTC if naive, adjust as needed based on source data timezone
             item_ts = item_ts.replace(tzinfo=datetime.timezone.utc)

        if item_ts < cutoff_date.replace(tzinfo=datetime.timezone.utc): # Make cutoff aware too
            continue

        key = (item['drug_mentioned'], item['adr_standardized'])
        if key not in adr_groups:
            adr_groups[key] = []
        adr_groups[key].append(item)

    # Analyze groups
    for (drug, adr), items in adr_groups.items():
        # Basic Signal Criteria: Multiple reports of an *unexpected* and potentially *serious* ADR
        num_reports = len(items)
        is_unexpected = not items[0]['is_known_adr'] # Assumes consistency within group
        has_serious_report = any(item['is_serious'] for item in items)

        # Example Rule: Signal if >= min_reports of an unexpected ADR, OR if >= N reports of a serious ADR (even if known)
        if (is_unexpected and num_reports >= min_reports_for_signal) or \
           (has_serious_report and num_reports >= min_reports_for_signal + 2): # Stricter threshold for serious

            # Use Vector Store to find related historical items (optional enhancement)
            # query = f"Reports related to {drug} and {adr}"
            # similar_historical_reports = search_vector_store(literature_collection, query, n_results=5)
            # print(f"Found similar historical reports: {similar_historical_reports}")

            # Generate Signal
            evidence_summary = f"Found {num_reports} reports of '{adr}' for {drug} within the last {time_window_days} days. "
            evidence_summary += f"This ADR is considered {'unexpected' if is_unexpected else 'known'}. "
            if has_serious_report:
                evidence_summary += "At least one report mentioned serious outcomes. "
            source_ids = [item['source_id'] for item in items]

            signal = {
                "drugs": [drug],
                "adr_term": adr,
                "evidence": evidence_summary,
                "sources": source_ids,
                "confidence_score": 0.6 + min(0.4, (num_reports / (min_reports_for_signal * 2))), # Simple confidence heuristic
                "timestamp": datetime.datetime.now()
            }
            potential_signals.append(signal)
            print(f"  Potential Signal Identified: {drug} - {adr}")
            print(f"    Evidence: {evidence_summary}")

    return potential_signals



### 6. Main Workflow Orchestration


Test data : considered 10 commonly used medicines

In [34]:
DRUGS_TO_MONITOR = ["Levothyroxine", "Lisinopril", "Metformin", "Simvastatin",
                        "Sertraline", "Amoxicillin", "Warfarin", "Prednisone",
                        "Insulin", "Oxycodone"]
#KEYWORDS_FOR_LITERATURE = ["adverse effects", "side effects", "drug interactions"] # You can expand these
KEYWORDS_FOR_LITERATURE = [
    "adverse effects",
    "side effects",
    "drug interactions",
    "toxicity",
    "complications",
    "safety",
    "risk assessment",
    "case report",  # To catch individual unusual events
    "clinical trial", # broaden the search
    "long-term effects"
]

# literature_findings = literature_scanner_agent(DRUGS_TO_MONITOR, KEYWORDS_FOR_LITERATURE, max_results=5) # Adjust max_results as needed

# if literature_findings:
#     print("\n--- Literature Findings ---")
#     for finding in literature_findings:
#         print(finding)

### --- Run Agents Sequentially (Simple Orchestration) ---
### In a real system, this could run on a schedule (e.g., daily, hourly)

#### 1. Data Ingestion / Scanning

In [35]:
# lit_findings = literature_scanner_agent(DRUGS_TO_MONITOR, KEYWORDS_FOR_LITERATURE)
# literature_findings = literature_scanner_agent(DRUGS_TO_MONITOR, KEYWORDS_FOR_LITERATURE, max_results=5)
# news_findings = news_listener_agent(DRUGS_TO_MONITOR, KEYWORDS_FOR_NEWS)
# social_findings = social_listener_agent(...) # TODO
# faers_findings = faers_processor_agent(...) # TODO
# all_raw_findings = lit_findings #+ news_findings # + social_findings + faers_findings

In [36]:
UNIT_KEYWORDS_FOR_NEWS = [
    "side effect"
]
UNIT_DRUGS_TO_MONITOR = ["Levothyroxine"]

In [37]:
KEYWORDS_FOR_NEWS = [
    "side effect",
    "adverse reaction",
    "drug interaction",
    "safety concern",
    "recall",
    "warning",
    "patient experience",
    "complication",
    "lawsuit", # To catch legal news related to drugs
    "FDA",     # To capture regulatory news
    "risk",
    # Terms specific to some of the drugs (add more as needed)
    "thyroid problem",  # Levothyroxine
    "blood pressure issue", # Lisinopril
    "diabetes side effect", # Metformin, Insulin
    "muscle pain",      # Simvastatin
    "mental health",    # Sertraline
    "allergic reaction", # Amoxicillin
    "bleeding risk",    # Warfarin
    "steroid side effect", # Prednisone
    "opioid crisis",    # Oxycodone
    "addiction"         #Oxycodone
]

In [38]:
news_findings = news_listener_groq_agent(DRUGS_TO_MONITOR, KEYWORDS_FOR_NEWS)


--- Running News Listener Agent (2025-04-20 22:58:37.673691) ---
Fetched 1 articles from NewsAPI for Levothyroxine.
Fetched 1 articles from NewsAPI for Lisinopril.
Fetched 1 articles from NewsAPI for Metformin.
Fetched 1 articles from NewsAPI for Simvastatin.
Fetched 1 articles from NewsAPI for Sertraline.
Fetched 1 articles from NewsAPI for Amoxicillin.
Fetched 1 articles from NewsAPI for Warfarin.
Fetched 1 articles from NewsAPI for Prednisone.
Fetched 1 articles from NewsAPI for Insulin.
Fetched 1 articles from NewsAPI for Oxycodone.


In [39]:
print(news_findings)

[]


In [40]:
# from pydantic import BaseModel, Field
# from typing import List, Dict, Any # Import List, Dict and Any

# class NewsFormat(BaseModel): 
#     source: str = Field(..., description="website location"),
#     source_id: str = Field(..., description="ID received from news api."),
#     text: str = Field(..., description="3 sentence from for the description which includes keywords."),
#     timestamp: str = Field(..., description="date and time of news report, if not found use time now in GMT "),
#     drug_mentioned: str = Field(..., description="name of the drug"),

# class NewsFormatList(BaseModel):
#     finding : List[NewsFormat]

In [41]:
# from phi.run.response import RunResponse

# try:
#     GROQ_API_KEY = UserSecretsClient().get_secret("GROQ_API_KEY")
#     instructions = "look for the keywords in new {0}".join(KEYWORDS_FOR_NEWS)
#     news_listener_agent_groc = Agent(

#         # model=Groq(id="llama-3.3-70b-versatile", api_key=GROQ_API_KEY),
#          model=Groq(id="llama-3.1-8b-instant", api_key=GROQ_API_KEY), #used this mmodel to avoid TMP limitation in llama-3.3...versatile
#         tools=[get_drug_news],
#         show_tool_calls=True,
#         markdown=True,
#         instructions=[
#             # "compose result in json format",
#         ],
#         # response_model=NewsFormat,
#         # structured_outputs=True, #only for OpenAI models
#         stream = True
#     )
    
#     response = news_listener_agent_groc.run("Get the latest top 2 news about the drugs ".join(DRUGS_TO_MONITOR)+"and also look for the kewords ".join(KEYWORDS_FOR_NEWS))
#     # run_response = news_listener_agent_groc.run("Get the latest top 2 news about the drugs ".join(UNIT_DRUGS_TO_MONITOR)+", for each drug look description any one of the word in the kewords ".join(UNIT_KEYWORDS_FOR_NEWS))
#     # run_history = run_response.run_history

#     # # Now we can iterate through the run_history to access the individual runs
#     # for run in run_history:
#     #     # Access properties of each run, such as the step_response
#     #     step_response = run.step_response

#     # # Alternatively, if we want the entire list of step_responses in one shot:
#     # news_findings = [run.step_response for run in run_history]
#     # news_findings= ""
#     # print(f"response.messages = {response.messages}")
#     # for message in response.messages:
#     #     if message.role == "assistant":
#     #         print(f"Tool: {message.name}")
#     #         news_findings = message.content
#     #         # print(f"Response: {message.content}")
#     # print(f"news_findings = {news_findings}")

    
#     # news_findings = run_response.content
#     print(run_response.content)
#     # news_listener_agent_groc.print_response()


# except ValueError as e:
#     print(f"Error: {e}")
#     print(traceback.format_exc())
# except Exception as e:
#     print(f"An unexpected error occurred: {e}")
#     print(traceback.format_exc())

#### <font color="maroon">Note to my self address this token issue later<font>
Received the error while using  model=Groq(id="llama-3.3-70b-versatile", api_key=GROQ_API_KEY),<br>An unexpected error occurred: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.3-70b-versatile` in organization `org_01jfyjcz68ebd8rgty1gn60par` service tier `on_demand` on tokens per minute (TPM): Limit 12000, Requested 12658, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Traceback (most recent call last):
  File "<ipython-input-35-b991e83cf5f5>", line 15, in <cell line: 1>
    news_findings = news_listener_agent_groc.run("Get the latest top 5 news about the drugs ".join(DRUGS_TO_MONITOR)+"and also look for the kewords ".join(KEYWORDS_FOR_NEWS))

In [42]:
# news_findings = news_listener_agent(DRUGS_TO_MONITOR, KEYWORDS_FOR_NEWS)

In [43]:
literature_findings = literature_scanner_agent(DRUGS_TO_MONITOR, KEYWORDS_FOR_LITERATURE, max_results=5)


--- Running Literature Scanner Agent (2025-04-20 22:58:41.861210) ---
  Found 5 abstracts for: Levothyroxine AND adverse effects
Added 1 items to collection 'literature_pv'.
  Found 5 abstracts for: Levothyroxine AND side effects
Added 1 items to collection 'literature_pv'.
Added 1 items to collection 'literature_pv'.
  Found 5 abstracts for: Levothyroxine AND drug interactions
  Found 5 abstracts for: Levothyroxine AND toxicity
Added 1 items to collection 'literature_pv'.
  Found 5 abstracts for: Levothyroxine AND complications
Added 1 items to collection 'literature_pv'.
Added 1 items to collection 'literature_pv'.
  Found 5 abstracts for: Levothyroxine AND safety
Added 1 items to collection 'literature_pv'.
  Found 5 abstracts for: Levothyroxine AND risk assessment
Added 1 items to collection 'literature_pv'.
  Found 5 abstracts for: Levothyroxine AND case report
  Found 5 abstracts for: Levothyroxine AND clinical trial
Added 1 items to collection 'literature_pv'.
  Found 5 abstrac

In [44]:
print(type(literature_findings))
print(type(news_findings))

<class 'list'>
<class 'list'>


In [45]:
all_raw_findings = literature_findings + news_findings

In [46]:
# print(all_raw_findings)

### TODO: Add similar agents for Social Media (Reddit/PRAW), simulated FAERS data


In [47]:
def clinical_context_agent(items):
    """
    Analyzes findings from other agents.
    Standardizes ADR terms using Knowledge Store (MedDRA map).
    Checks if ADR is known for the drug.
    Assesses potential seriousness.
    Returns list of contextualized findings.
    """
    print(f"\n--- Running Clinical Context Agent ({datetime.datetime.now()}) ---")
    contextualized_findings = []
    for item in items:
        drug = item['drug_mentioned']
        adr_raw = item['potential_adr']

        # Standardize ADR term
        adr_standardized = map_to_meddra(adr_raw)

        # Check if known ADR for this drug
        drug_info = get_drug_info(drug)
        is_known_adr = False
        if drug_info:
            # Basic check - real system might need fuzzy matching
            # print(f"Checking {drug} and {adr_raw} in {adr_standardized} and {drug_info['known_adrs']}")
            is_known_adr = any(known.lower() in adr_standardized.lower() for known in drug_info['known_adrs'])

        # Check seriousness
        is_serious = check_seriousness(item['text']) or check_seriousness(adr_raw)

        # Add context to the finding
        item['adr_standardized'] = adr_standardized
        item['is_known_adr'] = is_known_adr
        item['is_serious'] = is_serious
        contextualized_findings.append(item)
        print(f"  Contextualized: {item['source_id']} - ADR: {adr_standardized} (Known: {is_known_adr}, Serious: {is_serious})")

    return contextualized_findings

In [48]:
def signal_synthesizer_agent(contextualized_items, time_window_days=7, min_reports_for_signal=3):
    """
    Looks for patterns and correlations in contextualized findings.
    Flags potential signals based on criteria (e.g., multiple reports of unexpected ADR).
    Uses Vector Store to find similar past reports.
    Returns list of potential signals (e.g., Signal objects or dicts).
    """
    print(f"\n--- Running Signal Synthesizer Agent ({datetime.datetime.now()}) ---")
    potential_signals = []
    cutoff_date = datetime.datetime.now() - datetime.timedelta(days=time_window_days)

    # Group findings by Drug and Standardized ADR within the time window
    adr_groups = {}
    for item in contextualized_items:
        # Ensure timestamp is timezone-aware or convert naive to aware for comparison
        item_ts = item['timestamp']
        if item_ts.tzinfo is None:
             # Assuming UTC if naive, adjust as needed based on source data timezone
             item_ts = item_ts.replace(tzinfo=datetime.timezone.utc)

        if item_ts < cutoff_date.replace(tzinfo=datetime.timezone.utc): # Make cutoff aware too
            continue

        key = (item['drug_mentioned'], item['adr_standardized'])
        if key not in adr_groups:
            adr_groups[key] = []
        adr_groups[key].append(item)

    # Analyze groups
    for (drug, adr), items in adr_groups.items():
        # Basic Signal Criteria: Multiple reports of an *unexpected* and potentially *serious* ADR
        num_reports = len(items)
        is_unexpected = not items[0]['is_known_adr'] # Assumes consistency within group
        has_serious_report = any(item['is_serious'] for item in items)

        # Example Rule: Signal if >= min_reports of an unexpected ADR, OR if >= N reports of a serious ADR (even if known)
        if (is_unexpected and num_reports >= min_reports_for_signal) or \
           (has_serious_report and num_reports >= min_reports_for_signal + 2): # Stricter threshold for serious

            # Use Vector Store to find related historical items (optional enhancement)
            # query = f"Reports related to {drug} and {adr}"
            # similar_historical_reports = search_vector_store(literature_collection, query, n_results=5)
            # print(f"Found similar historical reports: {similar_historical_reports}")

            # Generate Signal
            evidence_summary = f"Found {num_reports} reports of '{adr}' for {drug} within the last {time_window_days} days. "
            evidence_summary += f"This ADR is considered {'unexpected' if is_unexpected else 'known'}. "
            if has_serious_report:
                evidence_summary += "At least one report mentioned serious outcomes. "
            source_ids = [item['source_id'] for item in items]

            signal = {
                "drugs": [drug],
                "adr_term": adr,
                "evidence": evidence_summary,
                "sources": source_ids,
                "confidence_score": 0.6 + min(0.4, (num_reports / (min_reports_for_signal * 2))), # Simple confidence heuristic
                "timestamp": datetime.datetime.now()
            }
            potential_signals.append(signal)
            print(f"  Potential Signal Identified: {drug} - {adr}")
            print(f"    Evidence: {evidence_summary}")

    return potential_signals

### 6. Main Workflow Orchestration

### --- Run Agents Sequentially (Simple Orchestration) ---
### In a real system, this could run on a schedule (e.g., daily, hourly)

### 1. Data Ingestion / Scanning

In [49]:
# lit_findings = literature_scanner_agent(DRUGS_TO_MONITOR, KEYWORDS_FOR_LITERATURE)
# news_findings = news_listener_agent(DRUGS_TO_MONITOR, KEYWORDS_FOR_NEWS)
# # social_findings = social_listener_agent(...) # TODO
# # faers_findings = faers_processor_agent(...) # TODO

# all_raw_findings = lit_findings + news_findings # + social_findings + faers_findings

In [50]:
all_raw_findings = literature_findings

In [51]:
# 2. Contextualization
contextualized_findings = clinical_context_agent(all_raw_findings)


--- Running Clinical Context Agent (2025-04-20 23:00:19.664110) ---
  Contextualized: 40199841 - ADR: Cardiovascular Disorder (Known: False, Serious: False)
  Contextualized: 40201048 - ADR: The patient was prescribed anti-thyroid drug therapy through methimazole but elected not to take it due to concerns about side effects (Known: False, Serious: False)
  Contextualized: 40199841 - ADR: Cardiovascular Disorder (Known: False, Serious: False)
  Contextualized: 40165683 - ADR: The foremost concerns are HDRI side effects, radioiodine refractoriness, macronodular lung, and bone metastasis (Known: False, Serious: False)
  Contextualized: 40213017 - ADR: However, despite its efficacy, it has inherent risks and complications (Known: False, Serious: False)
  Contextualized: 40206496 - ADR: The case highlights the importance of early recognition and prompt discontinuation of the drug, which are crucial to preventing severe complications (Known: False, Serious: True)
  Contextualized: 40188564 

In [52]:
# 3. Synthesis / Signal Detection
detected_signals = signal_synthesizer_agent(contextualized_findings)


--- Running Signal Synthesizer Agent (2025-04-20 23:00:19.769197) ---
  Potential Signal Identified: Metformin - RESULTS: Our study reveals significant side effects associated with combining ICIs with metformin or statins, including gastrointestinal, respiratory, hepatobiliary, and renal disorders
    Evidence: Found 4 reports of 'RESULTS: Our study reveals significant side effects associated with combining ICIs with metformin or statins, including gastrointestinal, respiratory, hepatobiliary, and renal disorders' for Metformin within the last 7 days. This ADR is considered unexpected. 
  Potential Signal Identified: Metformin - Drug-drug interactions (DDIs), which can result in side effects and reduced treatment efficacy, have increased
    Evidence: Found 3 reports of 'Drug-drug interactions (DDIs), which can result in side effects and reduced treatment efficacy, have increased' for Metformin within the last 7 days. This ADR is considered unexpected. 
  Potential Signal Identified: 

In [53]:
# # 7. Output / Demonstration

print("\n\n--- Final Detected Signals ---")
if detected_signals:
    for i, signal in enumerate(detected_signals):
        print(f"\nSignal {i+1}:")
        print(f"  Drug(s): {', '.join(signal['drugs'])}")
        print(f"  ADR Term: {signal['adr_term']}")
        print(f"  Evidence: {signal['evidence']}")
        print(f"  Confidence: {signal['confidence_score']:.2f}")
        print(f"  Supporting Sources: {', '.join(signal['sources'])}")
        print(f"  Timestamp: {signal['timestamp']}")
else:
    print("No significant signals detected in this run.")



--- Final Detected Signals ---

Signal 1:
  Drug(s): Metformin
  ADR Term: RESULTS: Our study reveals significant side effects associated with combining ICIs with metformin or statins, including gastrointestinal, respiratory, hepatobiliary, and renal disorders
  Evidence: Found 4 reports of 'RESULTS: Our study reveals significant side effects associated with combining ICIs with metformin or statins, including gastrointestinal, respiratory, hepatobiliary, and renal disorders' for Metformin within the last 7 days. This ADR is considered unexpected. 
  Confidence: 1.00
  Supporting Sources: 40232293, 40232293, 40232293, 40232293
  Timestamp: 2025-04-20 23:00:19.770117

Signal 2:
  Drug(s): Metformin
  ADR Term: Drug-drug interactions (DDIs), which can result in side effects and reduced treatment efficacy, have increased
  Evidence: Found 3 reports of 'Drug-drug interactions (DDIs), which can result in side effects and reduced treatment efficacy, have increased' for Metformin within the 