In [3]:
import requests
from bs4 import BeautifulSoup
from newspaper import Article
import pandas as pd
import time
from urllib.parse import urljoin
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
 # List of Indian health news URLs
news_sources = {
    "The Hindu": "https://www.thehindu.com/sci-tech/health/",
    "Indian Express": "https://indianexpress.com/section/lifestyle/health/",
    "Times of India": "https://timesofindia.indiatimes.com/life-style/health-fitness/health-news",
    "The Wire": "https://thewire.in/health",
    "Scroll": "https://scroll.in/topic/451/health",
    "Swarajya": "https://swarajyamag.com/health",
    "OpIndia": "https://www.opindia.com/category/health/",
    "The Quint": "https://www.thequint.com/news/health",
    "NDTV Health": "https://www.ndtv.com/health",
    "News18": "https://www.news18.com/health/",
    "ABP Live": "https://news.abplive.com/health",
    "Deccan Herald": "https://www.deccanherald.com/specials/health/",
    "Business Standard": "https://www.business-standard.com/category/health",
    "Hindustan Times": "https://www.hindustantimes.com/health",
    "Livemint": "https://www.livemint.com/health",
    "Economic Times Health": "https://economictimes.indiatimes.com/industry/healthcare/biotech",
    "India Today Health": "https://www.indiatoday.in/health",
    "DNA India": "https://www.dnaindia.com/health",
    "Firstpost": "https://www.firstpost.com/health",
    "Asian Age": "https://www.asianage.com/health",
    "Outlook India": "https://www.outlookindia.com/topic/health",
    "Zee News Health": "https://zeenews.india.com/health",
    "One India Health": "https://www.oneindia.com/health",
    "Rediff Health": "https://www.rediff.com/getahead/health.html",
    "Mid-Day": "https://www.mid-day.com/lifestyle/health",
    "The Pioneer": "https://www.dailypioneer.com/health",
    "Greater Kashmir": "https://www.greaterkashmir.com/health",
    "Tribune India": "https://www.tribuneindia.com/section/health",
    "ET Health World": "https://health.economictimes.indiatimes.com",
    "Medical Dialogues": "https://medicaldialogues.in",
    "Pharmabiz": "http://www.pharmabiz.com/",
    "The Statesman Health": "https://www.thestatesman.com/health",
    "The Print": "https://theprint.in/health",
    "The Sentinel Assam": "https://www.sentinelassam.com/health",
    "The Shillong Times": "https://theshillongtimes.com/health",
    "North East Today": "https://www.northeasttoday.in/health",
    "Morung Express": "https://morungexpress.com/health",
    "Assam Tribune": "https://www.assamtribune.com/health",
    "Nagaland Post": "https://nagalandpost.com/category/health",
    "The Citizen": "https://www.thecitizen.in/topic/health",
    "IndiaSpend": "https://www.indiaspend.com/health",
    "Healthworld": "https://health.economictimes.indiatimes.com",
    "People Matters": "https://www.peoplematters.in/topic/health",
    "Business Today Health": "https://www.businesstoday.in/latest/trends/health",
    "Indian Health Journal": "https://indianhealthjournal.com/",
    "Express Healthcare": "https://www.expresshealthcare.in/",
    "India Science Wire": "https://indiasciencewire.org/health",
    "Research Matters": "https://researchmatters.in/section/health",
    "Down to Earth": "https://www.downtoearth.org.in/category/health",
    "The Better India Health": "https://www.thebetterindia.com/topics/health/",
    "Youth Ki Awaaz": "https://www.youthkiawaaz.com/category/health/",
    "The Logical Indian": "https://thelogicalindian.com/health"
}
 # Fast Chrome user-agent
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}
 # Thread-safe storage
lock = threading.Lock()
all_articles = []
 def fetch_article(url, source):
   """Fetch article using either Newspaper or BS4."""
    try:
        article = Article(url)
        article.download()
        article.parse()
        if len(article.text.split()) > 200:
            return {
                "source": source,
                "title": article.title,
                "text": article.text,
                "url": url
            }
    except Exception:
        try:
            res = requests.get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(res.content, "html.parser")
            title_tag = soup.find("title")
            title = title_tag.text.strip() if title_tag else ""
            paragraphs = soup.find_all('p')

            text = " ".join(p.get_text(strip=True) for p in paragraphs)

            if len(text.split()) > 200:

                return {

                    "source": source,

                    "title": title,

                    "text": text,

                    "url": url

                }

        except:

            return None
 
def scrape_source(source, base_url):

    """Scrape all articles from a single source."""
    urls_collected = set()
   max_pages = 30  # Adjust as needed
     # Parallelize pagination
    def collect_links(page):
        page_url = f"{base_url}?page={page}"
        try:
            response = requests.get(page_url, headers=headers, timeout=10)
            if response.status_code != 200:
                return []
            soup = BeautifulSoup(response.content, "html.parser")
            links = soup.find_all('a', href=True)
            return [
                urljoin(base_url, link['href'])
                for link in links
                if "health" in link['href'] and not link['href'].endswith(('.jpg', '.png', '.pdf'))
            ]
        except:
           return []
     with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(collect_links, page) for page in range(1, max_pages + 1)]
        for future in as_completed(futures):
            urls_collected.update(future.result())
 
    # Parallelize article extraction
    with ThreadPoolExecutor(max_workers=20) as executor:
        futures = [executor.submit(fetch_article, url, source) for url in urls_collected]
        for future in as_completed(futures):
            article = future.result()
            if article:
                with lock:
                    all_articles.append(article)
                if len(all_articles) >= 10000:  # Stop if target is reached
                    return
 
# Scrape all sources
for source, base_url in tqdm(news_sources.items(), desc="News Sources"):
    scrape_source(source, base_url)
 
# Save to CSV
df = pd.DataFrame(all_articles)
df.drop_duplicates(subset="url", inplace=True)
df.to_csv("indian_health_bias_new2.csv", index=False)
print(f"\n✅ Saved {len(df)} articles to 'indian_health_bias_news_optimized.csv'")

 

IndentationError: unindent does not match any outer indentation level (<string>, line 117)