In [1]:
import feedparser
import datetime
import json
from pathlib import Path

# Define PubMed RSS fetcher
class PubMedFetcher:
    def __init__(self, rss_urls, storage_path="../data/pubmed_raw.json"):
        self.rss_urls = rss_urls
        self.storage_path = Path(storage_path)
        self.storage_path.parent.mkdir(parents=True, exist_ok=True)

    def fetch(self):
        """Fetch articles from all RSS URLs."""
        all_articles = []

        for url in self.rss_urls:
            feed = feedparser.parse(url)

            for entry in feed.entries:
                article = {
                    "title": entry.get("title"),
                    "summary": entry.get("summary", ""),
                    "link": entry.get("link"),
                    "published": entry.get("published"),
                    "source": "PubMed",
                    "fetched_at": datetime.datetime.utcnow().isoformat()
                }
                all_articles.append(article)

        self._save(all_articles)
        return all_articles

    def _save(self, articles):
        """Save raw fetched articles."""
        with open(self.storage_path, "w", encoding="utf-8") as f:
            json.dump(articles, f, indent=2)

In [2]:
# Run Fetcher for PubMed RSS
from src.fetcher import PubMedFetcher

# Example RSS feeds (you will replace these with real PubMed RSS URLs)
rss_feeds = [
    "https://www.nlm.nih.gov/rss/nlmnews.rss",  # NLM News
    "https://pubmed.ncbi.nlm.nih.gov/rss/search/3k2398dkf.../",         # Gene therapy
]

fetcher = PubMedFetcher(rss_urls=rss_feeds)
articles = fetcher.fetch()

print(f"Fetched {len(articles)} articles.")
print(articles)

Fetched 5 articles.
[{'title': 'NLM Announces 2025 Michael E. DeBakey Fellows in the History of Medicine', 'summary': 'Following its May 6, 2024, call for applications to the National Library of Medicine (NLM) Michael E. DeBakey Fellowship in the History of Medicine, NLM is pleased to announce its 2025 DeBakey Fellows.', 'link': 'https://www.nlm.nih.gov/news/2025_DeBakey_Fellows_History_of_Medicine.html', 'published': 'Tue, 17 Dec 2024 00:00:00 EST', 'source': 'PubMed', 'fetched_at': '2025-11-20T15:54:30.270350'}, {'title': 'Michael Huerta, PhD, Acting Deputy Director for Operations and Innovation at the National Library of Medicine (NLM) Retires', 'summary': 'Michael Huerta, PhD, NLM Deputy Director for Operations and Innovation (Acting), Associate Director for Strategy at the National Institutes of Health (NIH)â€™s National Library of Medicine (NLM) will retire effective January 11.', 'link': 'https://www.nlm.nih.gov/news/Michael_Huerta_retires.html', 'published': 'Mon, 25 Nov 2024 0

In [None]:
import google.generativeai as genai
import os

GOOGLE_API_KEY1 = "AIzaSyD9pSyHHcZxeS8rRZUgP3Xpn7ely_G63G4"
GOOGLE_API_KEY2 = ""

# Load API key
genai.configure(api_key=GOOGLE_API_KEY1)

MODEL_NAME = "gemini-1.5-flash"

def summarize_article(title: str, abstract: str) -> dict:
    prompt = f"""
    You are an AI biotech assistant. Summarize this article in 3 bullet points.
    Extract: 
    1. Main finding
    2. Key biological targets (genes, proteins, pathways)
    3. Application area (diagnostics, therapeutics, biotech tools, etc.)
    
    Title: {title}
    Abstract: {abstract}
    """

    response = genai.GenerativeModel(MODEL_NAME).generate_content(prompt)
    text = response.text

    return {
        "title": title,
        "raw_summary": abstract,
        "ai_summary": text
    }

ai_summary = summarize_article("title", "summary")

print(response)

  from .autonotebook import tqdm as notebook_tqdm


NotFound: 404 models/gemini-1.5-flash is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.