# PubMed data


In [38]:
from Bio import Entrez
import time
from pathlib import Path

# Contact email for NCBI E-utilities requests (required by NCBI).
# NOTE(review): this is a hard-coded personal address; consider loading it
# from configuration before the notebook is shared.
Entrez.email = "divyanshr141@email.com"

# Output folder to save abstracts. The path is relative, so it is created
# under the current working directory; exist_ok=True makes re-runs harmless.
output_dir = Path("pubmed_abstracts")
output_dir.mkdir(exist_ok=True)

def fetch_pubmed_abstracts(query, max_results=10):
    """Search PubMed for ``query`` and return the matching abstracts as text.

    Runs an ESearch to collect up to ``max_results`` PubMed IDs, then one
    EFetch for those IDs in plain-text abstract format. Both response handles
    are closed before returning, whether or not reading them succeeded.

    Args:
        query: Search term handed to PubMed unchanged.
        max_results: Upper bound on the number of IDs requested (``retmax``).

    Returns:
        The text produced by EFetch on success; the message
        ``"No results found for query: ..."`` when the search matched
        nothing; or ``"Error fetching data for query '...': ..."`` when any
        step raised an ``Exception``. Errors are reported through the return
        value rather than propagated, so callers must inspect the text.
    """
    try:
        search_handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
        try:
            search_results = Entrez.read(search_handle)
        finally:
            # Release the connection even when parsing the response fails.
            search_handle.close()

        id_list = search_results["IdList"]
        if not id_list:
            return f"No results found for query: {query}"

        fetch_handle = Entrez.efetch(
            db="pubmed", id=",".join(id_list), rettype="abstract", retmode="text"
        )
        try:
            return fetch_handle.read()
        finally:
            fetch_handle.close()
    except Exception as e:
        # Best-effort by design: the failure is described in the return value.
        return f"Error fetching data for query '{query}': {e}"

# Conditions and allergies
conditions = [
    "high blood pressure", "diabetes", "heart disease", "celiac disease",
    "obesity", "IBS", "lactose intolerance", "arthritis", "asthma", "eczema"
]
allergies = ["dairy", "gluten", "nuts", "shellfish", "soy", "eggs", "wheat", "corn"]

# Loop through all combinations: one PubMed query and one output file per
# (condition, allergy) pair.
for condition in conditions:
    for allergy in allergies:
        query = f"{condition} {allergy} allergy"
        print(f"\n--- Fetching for: {query} ---")
        abstracts = fetch_pubmed_abstracts(query, max_results=10)

        filename = f"{condition}_{allergy}_allergy.txt".replace(" ", "_").lower()
        filepath = output_dir / filename

        # fetch_pubmed_abstracts reports failures through its return value.
        # Do not overwrite a file from an earlier successful run with an error
        # message, and do not log the failure as a successful save.
        fetch_failed = isinstance(abstracts, str) and abstracts.startswith(
            "Error fetching data for query"
        )
        if fetch_failed:
            print(f"❌ {abstracts}")
        else:
            # Save abstracts to a .txt file
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(abstracts)
            print(f"✅ Saved to: {filepath}")

        time.sleep(1)  # Respectful delay between queries, success or failure



--- Fetching for: high blood pressure dairy allergy ---
✅ Saved to: pubmed_abstracts\high_blood_pressure_dairy_allergy.txt

--- Fetching for: high blood pressure gluten allergy ---
✅ Saved to: pubmed_abstracts\high_blood_pressure_gluten_allergy.txt

--- Fetching for: high blood pressure nuts allergy ---
✅ Saved to: pubmed_abstracts\high_blood_pressure_nuts_allergy.txt

--- Fetching for: high blood pressure shellfish allergy ---
✅ Saved to: pubmed_abstracts\high_blood_pressure_shellfish_allergy.txt

--- Fetching for: high blood pressure soy allergy ---
✅ Saved to: pubmed_abstracts\high_blood_pressure_soy_allergy.txt

--- Fetching for: high blood pressure eggs allergy ---
✅ Saved to: pubmed_abstracts\high_blood_pressure_eggs_allergy.txt

--- Fetching for: high blood pressure wheat allergy ---
✅ Saved to: pubmed_abstracts\high_blood_pressure_wheat_allergy.txt

--- Fetching for: high blood pressure corn allergy ---
✅ Saved to: pubmed_abstracts\high_blood_pressure_corn_allergy.txt

--- Fet

# USDA FoodData Central (FDC) data

In [29]:
import os

# FoodData Central API key. It is read from the FDC_API_KEY environment
# variable so the credential is never stored in the notebook itself.
# A key that was previously saved in a shared notebook should be treated as
# exposed and regenerated.
apikeyfdc = os.environ.get("FDC_API_KEY", "")
if not apikeyfdc:
    print("⚠️ FDC_API_KEY is not set; FoodData Central requests will fail without a key.")


In [39]:
import requests
import json
from pathlib import Path

def fetch_fdc_foods(query, api_key, max_results=5, timeout=30):
    """Query the FoodData Central food-search endpoint.

    Args:
        query: Free-text search string.
        api_key: FoodData Central API key, sent as the ``api_key`` parameter.
        max_results: Page size requested from the API.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        dict: The decoded JSON response on success, or ``{"error": "<message>"}``
        when the request fails (connection problem, timeout, HTTP error
        status, or a body that is not valid JSON). The API key is masked in
        the error message, because HTTP error text includes the request URL.
    """
    url = "https://api.nal.usda.gov/fdc/v1/foods/search"
    params = {
        "api_key": api_key,
        "query": query,
        "pageSize": max_results,
        "dataType": ["Foundation", "SR Legacy", "Survey (FNDDS)"]
    }

    try:
        # The request itself is inside the try so that network failures are
        # reported the same way as HTTP errors.
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as e:
        message = str(e)
        if api_key:
            message = message.replace(str(api_key), "***")
        return {"error": message}

# API key defined in the cell above
api_key = apikeyfdc

# Define diseases and allergies
conditions = [
    "high blood pressure", "diabetes", "heart disease", "celiac disease",
    "obesity", "IBS", "lactose intolerance", "arthritis", "asthma", "eczema"
]

allergies = [
    "dairy", "gluten", "nuts", "shellfish", "soy", "eggs", "wheat", "corn"
]

# Create output folder
output_dir = Path("fdc_data")
output_dir.mkdir(exist_ok=True)

# Loop through all combinations of condition and allergy
for condition in conditions:
    for allergy in allergies:
        query = f"{condition} {allergy}"
        print(f"\n🔍 Querying FDC: {query}")
        result = fetch_fdc_foods(query, api_key)

        filename = f"{condition}_{allergy}".replace(" ", "_").lower() + ".txt"
        filepath = output_dir / filename

        # fetch_fdc_foods signals failure with an {"error": ...} payload.
        # Report it instead of saving it over an earlier good result.
        if isinstance(result, dict) and "error" in result:
            detail = str(result["error"])
            if api_key:
                # HTTP error text can contain the request URL, key included.
                detail = detail.replace(api_key, "***")
            print(f"❌ Request failed, nothing saved: {detail}")
            continue

        # Save the result
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2)

        print(f"✅ Saved to: {filepath}")



🔍 Querying FDC: high blood pressure dairy
✅ Saved to: fdc_data\high_blood_pressure_dairy.txt

🔍 Querying FDC: high blood pressure gluten
✅ Saved to: fdc_data\high_blood_pressure_gluten.txt

🔍 Querying FDC: high blood pressure nuts
✅ Saved to: fdc_data\high_blood_pressure_nuts.txt

🔍 Querying FDC: high blood pressure shellfish
✅ Saved to: fdc_data\high_blood_pressure_shellfish.txt

🔍 Querying FDC: high blood pressure soy
✅ Saved to: fdc_data\high_blood_pressure_soy.txt

🔍 Querying FDC: high blood pressure eggs
✅ Saved to: fdc_data\high_blood_pressure_eggs.txt

🔍 Querying FDC: high blood pressure wheat
✅ Saved to: fdc_data\high_blood_pressure_wheat.txt

🔍 Querying FDC: high blood pressure corn
✅ Saved to: fdc_data\high_blood_pressure_corn.txt

🔍 Querying FDC: diabetes dairy
✅ Saved to: fdc_data\diabetes_dairy.txt

🔍 Querying FDC: diabetes gluten
✅ Saved to: fdc_data\diabetes_gluten.txt

🔍 Querying FDC: diabetes nuts
✅ Saved to: fdc_data\diabetes_nuts.txt

🔍 Querying FDC: diabetes shellf

# eatright.org data

In [40]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from pathlib import Path
import time
import json

def scrape_eatright_selenium(condition, allergy, max_results=5):
    """Collect article titles and links from an eatright.org search page.

    Opens the site's search-results page for ``"<condition> <allergy>"`` in a
    headless Chrome session, waits a fixed three seconds for the page to
    render, and reads the first link of each ``.search-results__item`` element.

    Args:
        condition: Health condition to search for.
        allergy: Allergy term appended to the condition.
        max_results: Maximum number of result items to inspect.

    Returns:
        list[dict]: ``{"title", "link"}`` entries for extracted articles and
        ``{"error": ...}`` entries for items or sessions that failed. When
        nothing at all was collected the list is ``[{"note": "No articles found."}]``.
        Exceptions are reported in the list rather than raised.
    """
    from urllib.parse import quote

    # Percent-encode the whole query so characters such as "&" or "#" cannot
    # change the meaning of the URL; spaces still become %20.
    query = quote(f"{condition} {allergy}", safe="")
    search_url = f"https://www.eatright.org/search-results?search={query}"

    options = Options()
    # Options.headless is deprecated and was removed in later Selenium 4
    # releases; the command-line flag works across versions.
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")

    results = []

    try:
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(search_url)

            time.sleep(3)  # wait for JS to load content

            articles = driver.find_elements(By.CSS_SELECTOR, ".search-results__item")

            for article in articles[:max_results]:
                try:
                    link_elem = article.find_element(By.TAG_NAME, "a")
                    title = link_elem.text.strip()
                    link = link_elem.get_attribute("href")
                    results.append({"title": title, "link": link})
                except Exception as e:
                    results.append({"error": f"Error extracting article: {e}"})
        finally:
            # Always shut the browser down; otherwise every failed page load
            # leaves a Chrome process running.
            driver.quit()

    except Exception as e:
        results.append({"error": f"Selenium crash or session error: {e}"})

    return results if results else [{"note": "No articles found."}]

# === Main ===
if __name__ == "__main__":
    # Scrape eatright.org once per (condition, allergy) pair and store each
    # result list as indented JSON in its own file under eatright_articles/.
    conditions = [
        "high blood pressure",
        "diabetes",
        "heart disease",
        "celiac disease",
        "obesity",
        "IBS",
        "lactose intolerance",
        "arthritis",
        "asthma",
        "eczema",
    ]
    allergies = [
        "dairy",
        "gluten",
        "nuts",
        "shellfish",
        "soy",
        "eggs",
        "wheat",
        "corn",
    ]

    output_dir = Path("eatright_articles")
    output_dir.mkdir(exist_ok=True)

    # Conditions form the outer loop, allergies the inner one.
    pairs = ((c, a) for c in conditions for a in allergies)
    for condition, allergy in pairs:
        print(f"\n🔍 Scraping: {condition} + {allergy}")
        articles = scrape_eatright_selenium(condition, allergy)

        # File name: lower-cased pair with spaces turned into underscores.
        filename = (condition + "_" + allergy).replace(" ", "_").lower() + ".txt"
        filepath = output_dir.joinpath(filename)
        filepath.write_text(json.dumps(articles, indent=2), encoding="utf-8")

        print(f"✅ Saved: {filepath}")



🔍 Scraping: high blood pressure + dairy
✅ Saved: eatright_articles\high_blood_pressure_dairy.txt

🔍 Scraping: high blood pressure + gluten
✅ Saved: eatright_articles\high_blood_pressure_gluten.txt

🔍 Scraping: high blood pressure + nuts
✅ Saved: eatright_articles\high_blood_pressure_nuts.txt

🔍 Scraping: high blood pressure + shellfish
✅ Saved: eatright_articles\high_blood_pressure_shellfish.txt

🔍 Scraping: high blood pressure + soy
✅ Saved: eatright_articles\high_blood_pressure_soy.txt

🔍 Scraping: high blood pressure + eggs
✅ Saved: eatright_articles\high_blood_pressure_eggs.txt

🔍 Scraping: high blood pressure + wheat
✅ Saved: eatright_articles\high_blood_pressure_wheat.txt

🔍 Scraping: high blood pressure + corn
✅ Saved: eatright_articles\high_blood_pressure_corn.txt

🔍 Scraping: diabetes + dairy
✅ Saved: eatright_articles\diabetes_dairy.txt

🔍 Scraping: diabetes + gluten
✅ Saved: eatright_articles\diabetes_gluten.txt

🔍 Scraping: diabetes + nuts
✅ Saved: eatright_articles\diabet

# Harvard Nutrition Source data

In [41]:
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import time

def scrape_harvard_nutrition(query, timeout=15):
    """Search the Harvard Nutrition Source site and list the matching articles.

    Args:
        query: Free-text search string, sent as the ``s`` query parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        list[dict]: One ``{"title", "link"}`` entry for every ``<article>``
        whose ``<h2>`` contains a link with an ``href``; links are made
        absolute against the site root. On failure the list holds a single
        ``{"error": ...}`` entry instead of raising.
    """
    from urllib.parse import urljoin

    base_url = "https://nutritionsource.hsph.harvard.edu"
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        # Passing the query through `params` lets requests do the URL encoding
        # (spaces become "+", reserved characters are escaped).
        res = requests.get(
            f"{base_url}/", params={"s": query}, headers=headers, timeout=timeout
        )
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        results = []
        for article in soup.select("article"):
            title_tag = article.find("h2")
            link_tag = title_tag.find("a") if title_tag else None
            href = link_tag.get("href") if link_tag else None
            if not href:
                continue
            results.append({
                "title": title_tag.get_text(strip=True),
                # Absolute URLs pass through unchanged; relative ones are
                # resolved against the site root.
                "link": urljoin(f"{base_url}/", href),
            })
        return results

    except requests.RequestException as e:
        return [{"error": f"Request failed: {e}"}]
    except Exception as e:
        return [{"error": f"Parsing failed: {e}"}]

def extract_article_content(url, timeout=15):
    """Download an article page and return its paragraph text.

    The text is taken from the ``<p>`` elements inside the page's
    ``div.entry-content`` or, when that is absent, its ``<main>`` element.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        str: The non-empty paragraphs joined with newlines. When the page has
        no recognisable content container or no readable paragraphs, a message
        starting with "⚠️" is returned; when the request or parsing fails, a
        message starting with "❌" is returned. Nothing is raised.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        content_div = soup.find("div", class_="entry-content") or soup.find("main")
        if not content_div:
            return "⚠️ Could not locate main content."

        # Extract each paragraph's text once, then drop the empty ones.
        texts = (p.get_text(strip=True) for p in content_div.find_all("p"))
        article_text = "\n".join(text for text in texts if text)
        return article_text if article_text else "⚠️ No readable paragraphs found."

    except requests.RequestException as e:
        return f"❌ Request failed: {e}"
    except Exception as e:
        return f"❌ Error parsing article: {e}"

# === Main Script ===
if __name__ == "__main__":
    # For every (condition, allergy) pair: search the site, then write each
    # hit's title, link and article text into one file under harvard_articles/.
    conditions = [
        "high blood pressure", "diabetes", "heart disease", "celiac disease",
        "obesity", "IBS", "lactose intolerance", "arthritis", "asthma", "eczema"
    ]
    allergies = ["dairy", "gluten", "nuts", "shellfish", "soy", "eggs", "wheat", "corn"]

    output_dir = Path("harvard_articles")
    output_dir.mkdir(exist_ok=True)

    # Different searches return many of the same articles, so each URL is
    # downloaded at most once per run and reused afterwards.
    article_cache = {}

    for condition in conditions:
        for allergy in allergies:
            query = f"{condition} {allergy}"
            print(f"\n🔍 Searching for: {query}")
            results = scrape_harvard_nutrition(query)
            time.sleep(1)  # Respectful delay after the search request

            filename = f"{condition}_{allergy}".replace(" ", "_").lower() + ".txt"
            filepath = output_dir / filename

            with open(filepath, "w", encoding="utf-8") as f:
                for item in results:
                    if "error" in item:
                        f.write(f"❌ Error: {item['error']}\n")
                        print("❌", item["error"])
                        continue

                    f.write(f"🔗 {item['title']}\n{item['link']}\n\n")

                    link = item["link"]
                    if link in article_cache:
                        content = article_cache[link]
                    else:
                        content = extract_article_content(link)
                        # Failures come back as text starting with "❌"; leave
                        # them uncached so a later query can retry the URL.
                        if not content.startswith("❌"):
                            article_cache[link] = content
                        time.sleep(1)  # Respectful delay after a real download

                    f.write(content + "\n\n" + "="*80 + "\n\n")
                    print(f"✅ {item['title']}")

            print(f"💾 Saved to: {filepath}")



🔍 Searching for: high blood pressure dairy
✅ Dairy
✅ Avocados
✅ Iodine
✅ Aquatic Foods
✅ Cheese
✅ Milk
✅ Calcium
✅ Choline
✅ Magnesium
✅ Processed Foods and Health
💾 Saved to: harvard_articles\high_blood_pressure_dairy.txt

🔍 Searching for: high blood pressure gluten
✅ Precision Nutrition
✅ Rice
✅ Diet Review: Paleo Diet for Weight Loss
✅ Oats
✅ Diet Reviews
✅ Whole Grains
✅ Protein
💾 Saved to: harvard_articles\high_blood_pressure_gluten.txt

🔍 Searching for: high blood pressure nuts
✅ Electrolyte Drinks
✅ WHO releases updated guidelines on defining healthy diets
✅ Chloride
✅ Copper
✅ Diet Review: MIND Diet
✅ Avocados
✅ Dairy
✅ Cheese
✅ Milk
✅ Calcium
💾 Saved to: harvard_articles\high_blood_pressure_nuts.txt

🔍 Searching for: high blood pressure shellfish
✅ Copper
✅ Iodine
✅ Aquatic Foods
✅ Diet Review: Paleo Diet for Weight Loss
✅ Fish: Friend or Foe?
✅ Ask the Expert: Healthy Fats
💾 Saved to: harvard_articles\high_blood_pressure_shellfish.txt

🔍 Searching for: high blood pressure so