# Biotech News and Trends Concierge Agent

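## Objective

Fetch biotech articles from the configured RSS feeds, summarize them with a Gemini model, and store the results for later trend analysis.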

## Import Libraries

In [None]:
import feedparser
import datetime
import json
from pathlib import Path
import pandas as pd
from bs4 import BeautifulSoup
from google import genai
import time

## Fetch RSS Articles

In [1]:
import feedparser
import datetime
import json
from pathlib import Path

class RSSFetcher:
    """Fetch entries from the RSS feeds listed in a JSON config and store them raw.

    The config file must contain a JSON object whose ``"feeds"`` key holds the
    list of feed URLs. Every call to :meth:`fetch` overwrites ``storage_path``
    with the articles fetched by that call.
    """

    # (lower-case URL substring, source label) pairs, checked in order;
    # the first match wins.
    _SOURCE_MARKERS = (
        ("pubmed", "PubMed"),
        ("biorxiv", "bioRxiv"),
        ("arxiv", "arXiv"),
        ("nature.com", "Nature"),
        ("sciencedaily", "ScienceDaily"),
    )

    def __init__(self, config_path="config/rss_feeds.json", storage_path="../data/rss_raw.json"):
        """Load the feed list and make sure the output directory exists.

        Args:
            config_path: Path to the JSON config holding the ``"feeds"`` list.
            storage_path: Path of the JSON file that :meth:`fetch` writes.

        Raises:
            OSError: If the config file cannot be opened.
            KeyError: If the config has no ``"feeds"`` key.
        """
        self.config_path = Path(config_path)
        self.storage_path = Path(storage_path)
        self.storage_path.parent.mkdir(parents=True, exist_ok=True)

        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(self.config_path, "r", encoding="utf-8") as f:
            self.rss_urls = json.load(f)["feeds"]

    def _infer_source(self, url: str) -> str:
        """Infer a source name from the URL, or ``"Unknown"`` if none matches."""
        # Host names are case-insensitive, so compare in lower case.
        lowered = url.lower()
        for marker, label in self._SOURCE_MARKERS:
            if marker in lowered:
                return label
        return "Unknown"

    def fetch(self):
        """Fetch articles from all RSS URLs, save them, and return them.

        Returns:
            A list of dicts with the keys ``title``, ``summary``, ``link``,
            ``published``, ``source`` and ``fetched_at``.
        """
        all_articles = []

        for url in self.rss_urls:
            source = self._infer_source(url)

            # Announce the feed before the (possibly slow) download starts.
            print(f"Fetching from {source}: {url}")
            feed = feedparser.parse(url)

            # feedparser reports download/parse problems through the "bozo"
            # flag rather than by raising; surface a feed that produced nothing.
            if getattr(feed, "bozo", False) and not feed.entries:
                reason = getattr(feed, "bozo_exception", "unknown error")
                print(f"Warning: no entries parsed from {url}: {reason}")

            # Timezone-aware UTC timestamp, taken once per feed
            # (datetime.utcnow() is deprecated and returns a naive value).
            fetched_at = datetime.datetime.now(datetime.timezone.utc).isoformat()

            for entry in feed.entries:
                article = {
                    "title": entry.get("title"),
                    "summary": entry.get("summary", ""),
                    "link": entry.get("link"),
                    # Fall back to the "updated" date when "published" is absent.
                    "published": entry.get("published") or entry.get("updated") or None,
                    "source": source,
                    "fetched_at": fetched_at,
                }
                all_articles.append(article)

        self._save(all_articles)
        return all_articles

    def _save(self, articles):
        """Save raw fetched articles as JSON, overwriting ``storage_path``."""
        with open(self.storage_path, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file.
            json.dump(articles, f, indent=2, ensure_ascii=False)

        print(f"Saved {len(articles)} articles to {self.storage_path}")

In [7]:
# Fetch every configured RSS feed (PubMed included).
# Uses the RSSFetcher class defined in this notebook; the import from
# src.fetcher is currently disabled.

# With no arguments the feed list is read from config/rss_feeds.json.
fetcher = RSSFetcher()

# Wrap the fetched records in a DataFrame for the later cleaning and
# summarizing cells.
articles = pd.DataFrame(fetcher.fetch())

print(f"Fetched {len(articles)} articles.")
print(articles.head(3))  # preview the first 3 rows

Fetching from PubMed: https://pubmed.ncbi.nlm.nih.gov/rss/search/1A2B3C
Fetching from Unknown: https://another-feed.com/rss
Fetching from bioRxiv: https://biorxiv.org/covid19.xml
Fetching from PubMed: https://pubmed.ncbi.nlm.nih.gov/rss/feed/atom?limit=20
Fetching from Unknown: https://pmc.ncbi.nlm.nih.gov/about/new-in-pmc/?format=rss
Saved 83 articles to ..\data\rss_raw.json
Fetched 83 articles.
                                               title  \
0             Updated Full-Text Search Now Available   
1                            PMC OAI-PMH API Updated   
2  Updates in Support of the 2024 NIH Public Acce...   

                                             summary  \
0  <p>As <a href="https://ncbiinsights.ncbi.nlm.n...   
1  <p>The PMC OAI-PMH API, a tool that allows use...   
2  <p>The National Center for Biotechnology Infor...   

                                                link  \
0  https://pmc.ncbi.nlm.nih.gov/about/new-in-pmc/...   
1  https://pmc.ncbi.nlm.nih.gov/about/

In [None]:
# Strip the HTML markup from the feed summaries and preview the result
from bs4 import BeautifulSoup


def _strip_html(markup):
    """Return the text content of an HTML fragment."""
    return BeautifulSoup(markup, "html.parser").get_text()


# Plain-text version of every summary, kept next to the original HTML
articles['summary_text'] = articles['summary'].map(_strip_html)

# Show the first 5 titles, then the first 5 cleaned summaries
preview = articles.head(5)
for title in preview['title']:
    print("Title:", title)
for summary_text in preview['summary_text']:
    print("Summary text:", summary_text)

Title: Updated Full-Text Search Now Available
Title: PMC OAI-PMH API Updated
Title: Updates in Support of the 2024 NIH Public Access Policy
Title: PubMed Central's Updated Full-Text Search Preview Now Available
Title: Preview of Upcoming Changes to PMC's eFetch Output


## Summarize Articles (Agent)

In [None]:
import os
import time

from google import genai

# The Gemini API key is read from the environment so the secret never lives in
# the notebook. SECURITY: never paste a key into this cell -- notebooks are
# routinely shared and committed. Any key that was ever hard-coded here must be
# treated as compromised and revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY (or GEMINI_API_KEY) environment variable "
        "before running this cell."
    )
client = genai.Client(api_key=GOOGLE_API_KEY)

MODEL_NAME = "gemini-2.5-flash"

# Seconds to pause after each API call (passed to time.sleep)
THROTTLE = 1

def summarize_article(title: str, summary: str) -> dict:
    """Ask the Gemini model for a three-bullet summary of one article.

    Args:
        title: Article headline, inserted into the prompt.
        summary: Article summary text, inserted into the prompt.

    Returns:
        A dict with the keys ``"title"``, ``"raw_summary"`` (the ``summary``
        argument unchanged) and ``"ai_summary"`` (the text of the model reply).
    """
    # The prompt is sent exactly as written, leading indentation included.
    request_text = f"""
    You are an AI biotech assistant. Summarize this article in 3 bullet points.
    Extract: 
    1. Main finding
    2. Key biological targets (genes, proteins, pathways)
    3. Application area (diagnostics, therapeutics, biotech tools, etc.)

    Title: {title}
    Summary: {summary}
    """

    reply = client.models.generate_content(contents=request_text, model=MODEL_NAME)

    # Pause after every call to stay under the API rate limit
    time.sleep(THROTTLE)

    result = {"title": title, "raw_summary": summary}
    result["ai_summary"] = reply.text
    return result

# Try the summarizer on one hand-written example first
ai_summary_sample = summarize_article(
    title="Updated Full-Text Search Now Available",
    summary="As previously announced, NCBI has updated the PubMed Central (PMC) full-text search functionality and user experience...",
)

print(ai_summary_sample)


def _summarize_row(row):
    """Return only the AI summary text for one DataFrame row."""
    return summarize_article(row["title"], row["summary_text"])["ai_summary"]


# Summarize rows 0-4 only. A .loc label slice includes its end label, so with
# the default 0..n-1 index this covers the first 5 articles.
first_rows = articles.loc[:4]
articles.loc[:4, "ai_summary"] = first_rows.apply(_summarize_row, axis=1)

# Inspect the titles next to their new summaries
print(articles.loc[:4, ["title", "ai_summary"]])

{'title': 'Updated Full-Text Search Now Available', 'raw_summary': 'As previously announced, NCBI has updated the PubMed Central (PMC) full-text search functionality and user experience...', 'ai_summary': 'Here is the summary based on your request:\n\n*   **Main Finding:** NCBI has updated the full-text search functionality and user experience for PubMed Central (PMC).\n*   **Key biological targets:** Not applicable, as the article discusses an information search tool, not biological entities.\n*   **Application area:** Biotech resource / Informatics tool (specifically, enhancing access to scientific literature for research and development).'}
                                               title  \
0             Updated Full-Text Search Now Available   
1                            PMC OAI-PMH API Updated   
2  Updates in Support of the 2024 NIH Public Acce...   
3  PubMed Central's Updated Full-Text Search Prev...   
4  Preview of Upcoming Changes to PMC's eFetch Ou...   

           

In [None]:
# Persist the articles (including any AI summaries) as JSON

from pathlib import Path
import json
import math


def _nan_to_none(value):
    """Map a float NaN to None so it is written as JSON ``null``."""
    if isinstance(value, float) and math.isnan(value):
        return None
    return value


# One dict per DataFrame row. Rows that were never summarized hold NaN in
# "ai_summary"; json.dump would write those as a bare NaN token, which is not
# valid JSON and is rejected by strict parsers, so they are converted first.
articles_list = [
    {key: _nan_to_none(value) for key, value in record.items()}
    for record in articles.to_dict(orient="records")
]

# NOTE(review): RSSFetcher's default output goes to ../data/ while this cell
# writes to data/ relative to the working directory -- confirm both are intended.
output_path = Path("data/rss_summarized.json")
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(articles_list, f, indent=2, ensure_ascii=False)

print(f"Saved {len(articles_list)} articles to {output_path}")

# TODO: add a step that appends newly fetched articles without creating duplicates, so the stored set can be read by the trend agent later.


Saved 83 articles to data\rss_summarized.json


## Trend Analysis (Agent)