In [1]:
import feedparser
import datetime
import json
from pathlib import Path

class RSSFetcher:
    def __init__(self, config_path="config/rss_feeds.json", storage_path="../data/rss_raw.json"):
        self.config_path = Path(config_path)
        self.storage_path = Path(storage_path)
        self.storage_path.parent.mkdir(parents=True, exist_ok=True)

        # Load feeds from config file
        with open(self.config_path, "r") as f:
            self.rss_urls = json.load(f)["feeds"]

    def _infer_source(self, url: str) -> str:
        """Infer source name from URL."""
        if "pubmed" in url:
            return "PubMed"
        elif "biorxiv" in url:
            return "bioRxiv"
        elif "arxiv" in url:
            return "arXiv"
        elif "nature.com" in url:
            return "Nature"
        elif "sciencedaily" in url:
            return "ScienceDaily"
        else:
            return "Unknown"

    def fetch(self):
        """Fetch articles from all RSS URLs."""
        all_articles = []

        for url in self.rss_urls:
            feed = feedparser.parse(url)
            source = self._infer_source(url)

            print(f"Fetching from {source}: {url}")

            for entry in feed.entries:
                article = {
                    "title": entry.get("title"),
                    "summary": entry.get("summary", ""),
                    "link": entry.get("link"),
                    "published": entry.get("published") or entry.get("updated") or None,
                    "source": source,
                    "fetched_at": datetime.datetime.utcnow().isoformat()
                }
                all_articles.append(article)

        self._save(all_articles)
        return all_articles

    def _save(self, articles):
        """Save raw fetched articles."""
        with open(self.storage_path, "w", encoding="utf-8") as f:
            json.dump(articles, f, indent=2)

        print(f"Saved {len(articles)} articles to {self.storage_path}")

In [7]:
# Run Fetcher for generic RSS (including PubMed)
#from src.fetcher import RSSFetcher

# This will load feeds from config/rss_feeds.json by default
fetcher = RSSFetcher()

articles = fetcher.fetch()

print(f"Fetched {len(articles)} articles.")
print(articles[:3])  # show first 3

Fetching from PubMed: https://pubmed.ncbi.nlm.nih.gov/rss/search/1A2B3C
Fetching from Unknown: https://another-feed.com/rss
Fetching from bioRxiv: https://biorxiv.org/covid19.xml
Fetching from PubMed: https://pubmed.ncbi.nlm.nih.gov/rss/feed/atom?limit=20
Fetching from Unknown: https://pmc.ncbi.nlm.nih.gov/about/new-in-pmc/?format=rss
Saved 83 articles to ..\data\rss_raw.json
Fetched 83 articles.
[{'title': 'Updated Full-Text Search Now Available', 'summary': '<p>As <a href="https://ncbiinsights.ncbi.nlm.nih.gov/2025/08/19/new-pmc-search/">previously announced</a>, NCBI has updated the PubMed Central (PMC) full-text search functionality and user experience. To prepare for this transition, <a href="https://ncbiinsights.ncbi.nlm.nih.gov/2025/04/08/pmc-full-text-search-available">a Beta version was released in April</a> for users to preview and test, and a number of improvements were made based on user feedback.</p>\n<p>NCBI will continue to add to and improve the new search, with priorit

In [42]:
# Clean up articles and extract titles and summaries
from bs4 import BeautifulSoup

for article in articles:
    title = article['title']
    summary_html = article['summary']
    summary_text = BeautifulSoup(summary_html, "html.parser").get_text()
    article['summary_text'] = summary_text

# Print just the top 5 titles
for article in articles[:5]:
    print("Title:", article['title'])

Title: Updated Full-Text Search Now Available
Title: PMC OAI-PMH API Updated
Title: Updates in Support of the 2024 NIH Public Access Policy
Title: PubMed Central's Updated Full-Text Search Preview Now Available
Title: Preview of Upcoming Changes to PMC's eFetch Output


In [36]:
from google import genai
from google.genai import types

GOOGLE_API_KEY = "AIzaSyCot4EY1kxTXTaHiTS6mZY01rFu93ReB0s"
client = genai.Client(api_key=GOOGLE_API_KEY)

MODEL_NAME = "gemini-2.5-flash"

def summarize_article(title: str, summary: str) -> dict:
    prompt = f"""
    You are an AI biotech assistant. Summarize this article in 3 bullet points.
    Extract: 
    1. Main finding
    2. Key biological targets (genes, proteins, pathways)
    3. Application area (diagnostics, therapeutics, biotech tools, etc.)

    Title: {title}
    Summary: {summary}
    """

    response = client.models.generate_content(
        model=MODEL_NAME,
        contents=prompt
    )

    return {
        "title": title,
        "raw_summary": summary,
        "ai_summary": response.text
    }

# Example usage
ai_summary_sample = summarize_article(
    "Updated Full-Text Search Now Available",
    "As previously announced, NCBI has updated the PubMed Central (PMC) full-text search functionality and user experience..."
)

print(ai_summary_sample)

# Example usage from actual article list
for article in articles[:5]:
    article["ai_summary"] = summarize_article(article["title"], article["summary_text"])

print(article['ai_summary'])

{'title': 'Updated Full-Text Search Now Available', 'raw_summary': 'As previously announced, NCBI has updated the PubMed Central (PMC) full-text search functionality and user experience...', 'ai_summary': "Here's a summary of the article:\n\n*   **Main finding**: NCBI has updated and improved the full-text search functionality and user experience for PubMed Central (PMC).\n*   **Key biological targets**: None specified; the article focuses on a digital resource rather than specific biological entities.\n*   **Application area**: Bioinformatics tool improvement, enhancing literature search and access for biomedical research."}
{'title': "Preview of Upcoming Changes to PMC's eFetch Output", 'raw_summary': 'In April 2025, the data formats for PMC articles available through eFetch will be updated as part of NLM\'s ongoing efforts to modernize its products and services. While most of the data will remain unchanged, there are several key differences in the XML and PubMed (formerly MEDLINE) d