In [1]:
from bs4 import BeautifulSoup

In [2]:
pip install newspaper3k


Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collec

In [3]:
!pip install lxml_html_clean

Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Downloading lxml_html_clean-0.4.2-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean
Successfully installed lxml_html_clean-0.4.2


In [4]:
import requests
from bs4 import BeautifulSoup

def extract_news_links(url):

    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')


        links = []
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            links.append(href)
        return links
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return []

if __name__ == "__main__":
    toi_url = "https://timesofindia.indiatimes.com/"
    print(f"Extracting links from: {toi_url}\n")
    all_links = extract_news_links(toi_url)

    if all_links:
        print(f"Found {len(all_links)} links. Here are a few examples:")
        for i, link in enumerate(all_links[:20]):
            print(f"{i+1}. {link}")
        if len(all_links) > 20:
            print("\n(Showing first 20 links. Many more links were found.)")
    else:
        print("No links were extracted or an error occurred.")



Extracting links from: https://timesofindia.indiatimes.com/

Found 818 links. Here are a few examples:
1. https://timesofindia.indiatimes.com/
2. https://timesofindia.indiatimes.com/us
3. https://timesofindia.indiatimes.com/
4. https://navbharattimes.indiatimes.com/
5. https://marathi.indiatimes.com/
6. https://vijaykarnataka.com/
7. https://tamil.samayam.com/
8. https://bangla.indiatimes.com/
9. https://malayalam.samayam.com/
10. https://telugu.samayam.com/
11. https://www.iamgujarat.com/
12. https://timesofindia.indiatimes.com/weather
13. https://timesofindia.indiatimes.com
14. https://timesofindia.indiatimes.com/toi-plus
15. https://timesofindia.indiatimes.com/games?src=top_nav&camp=games
16. https://timesofindia.indiatimes.com/videos
17. https://timesofindia.indiatimes.com/city
18. https://timesofindia.indiatimes.com/city/mumbai
19. https://timesofindia.indiatimes.com/city/delhi
20. https://timesofindia.indiatimes.com/city/bangalore

(Showing first 20 links. Many more links were fo

In [5]:
len(all_links)

818

In [8]:
def extract_article_links_from_section(all_links, base_url):

    print(f"  Visiting section: {all_links}")
    html_content = fetch_page_content(all_links)
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    article_links = []

    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']


        if href.startswith('/'):
            full_url = base_url.rstrip('/') + href
        elif href.startswith('./'):
            full_url = base_url.rstrip('/') + href[1:]
        else:
            full_url = href


        if (full_url.startswith(base_url) and
            ('/news/' in full_url or
             '/articles/' in full_url) and
            full_url.endswith('.cms')):
            article_links.append(full_url)
    return list(set(article_links))

In [9]:
def fetch_page_content(url):

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return ""

In [10]:
if __name__ == "__main__":
    toi_base_url = "https://timesofindia.indiatimes.com/"
    print("Visiting each section to extract news article links (this may take a while)...")

    all_news_article_links = []

    if all_links:
        for section_link in all_links:

            if toi_base_url in section_link or "timesofindia.indiatimes.com" in section_link:
                article_links_in_section = extract_article_links_from_section(section_link, toi_base_url)
                all_news_article_links.extend(article_links_in_section)
            else:
                print(f"  Skipping external link: {section_link}")

        all_news_article_links = list(set(all_news_article_links))

        if all_news_article_links:
            print(f"\nSuccessfully extracted {len(all_news_article_links)} unique news/video article links:")

            for i, link in enumerate(all_news_article_links[:20]):
                print(f"{i + 1}. {link}")
            if len(all_news_article_links) > 20:
                print("\n(Showing first 20 links. Many more links were found.)")
        else:
            print("\nNo news/video article links were found after visiting the sections.")
    else:
        print("No main section links were extracted from the provided HTML snippet.")


Visiting each section to extract news article links (this may take a while)...
  Visiting section: https://timesofindia.indiatimes.com/
  Visiting section: https://timesofindia.indiatimes.com/us
  Visiting section: https://timesofindia.indiatimes.com/
  Skipping external link: https://navbharattimes.indiatimes.com/
  Skipping external link: https://marathi.indiatimes.com/
  Skipping external link: https://vijaykarnataka.com/
  Skipping external link: https://tamil.samayam.com/
  Skipping external link: https://bangla.indiatimes.com/
  Skipping external link: https://malayalam.samayam.com/
  Skipping external link: https://telugu.samayam.com/
  Skipping external link: https://www.iamgujarat.com/
  Visiting section: https://timesofindia.indiatimes.com/weather
  Visiting section: https://timesofindia.indiatimes.com
  Visiting section: https://timesofindia.indiatimes.com/toi-plus
  Visiting section: https://timesofindia.indiatimes.com/games?src=top_nav&camp=games
  Visiting section: https:

In [11]:
len(all_news_article_links)

2240

In [12]:
from newspaper import Article

def extract_news_body_only(url):
    try:
        article = Article(url)
        article.download()
        article.parse()

        full_text = article.text.strip()


        lines = full_text.split('\n')
        if len(lines) > 1:
            body = '\n'.join(lines[1:]).strip()
        else:
            body = full_text
        return body

    except Exception as e:
        return f"Error: {str(e)}"
s = input('enter link')
r = extract_news_body_only(s)
print(r)

enter linkhttps://www.ndtv.com/india-news/truck-loses-control-on-mumbai-pune-expressway-rams-20-cars-8955914?pfrom=home-ndtv_topscroll
The accident happened under a bridge on the Mumbai-bound lane of the expressway, just after the Khopoli exit near the toll booth.

Anita Ekhande, 35, a resident of Osmanabad, died on the spot.

The 21 injured people were initially given first-aid at Khopoli Municipal Hospital and later shifted to MGM Hospital in Kamothe for further treatment.

Among the injured is the wife of a Bombay High Court judge.

The accident happened when the brakes of a trailer failed while it was headed toward Mumbai, causing the driver to lose control.

As a result, the vehicles behind collided with each other over a stretch of nearly 3 km.

Teams from the India Reserve Battalion, Devdoot, highway police and volunteers from Help Foundation carried out the rescue operations. They cleared the expressway within 45 minutes by moving the damaged vehicles to the side.

"The driver 

In [13]:
from newspaper import Article
import time
import pandas as pd

def extract_news_body_only(url):
    try:
        article = Article(url)
        article.download()
        article.parse()


        lines = article.text.strip().split('\n')
        body = '\n'.join(lines[1:]).strip() if len(lines) > 1 else article.text.strip()
        return body

    except Exception as e:
        return f"Error: {str(e)}"


data = []

for idx, link in enumerate(all_news_article_links):
    print(f"[{idx + 1}/{len(all_news_article_links)}] Processing: {link}")
    body = extract_news_body_only(link)
    data.append({"url": link, "body": body})


    time.sleep(1)

df = pd.DataFrame(data)


df.to_csv("extracted_articles.csv", index=False, encoding='utf-8-sig')

print("\n✅ Done! Data saved to 'extracted_articles.csv'")


[1/2240] Processing: https://timesofindia.indiatimes.com/sports/nfl/news/las-vegas-raiders-christian-wilkins-blown-away-the-internet-days-after-his-abrupt-removal/articleshow/122909811.cms
[2/2240] Processing: https://timesofindia.indiatimes.com/sports/nhl/news/florida-panthers-sign-jeff-petry-for-775k-as-key-defensive-addition-veteran-calls-it-a-no-brainer-move-to-chase-stanley-cup/articleshow/122248665.cms
[3/2240] Processing: https://timesofindia.indiatimes.com/entertainment/hindi/bollywood/news/karan-johar-confirms-takht-is-on-hold-not-cancelled-its-my-best-screenplay-to-date/articleshow/121249091.cms
[4/2240] Processing: https://timesofindia.indiatimes.com/entertainment/hindi/bollywood/news/kartik-aaryan-says-hell-faint-if-he-experiences-anything-supernatural/articleshow/122919600.cms
[5/2240] Processing: https://timesofindia.indiatimes.com/sports/nhl/news/chicago-blackhawks-avoid-arbitration-sign-arvid-soderblom-to-two-year-5-5m-deal/articleshow/122927755.cms
[6/2240] Processing:

In [14]:
#bbc

In [16]:
import requests
from bs4 import BeautifulSoup

def extract_news_links(url):

    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')


        links = []
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            if href.startswith('/'):
                href = url + href
            links.append(href)
        return links
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return []

if __name__ == "__main__":
    toi_url = "https://www.bbc.com"
    print(f"Extracting links from: {toi_url}\n")
    all_links = extract_news_links(toi_url)

    if all_links:
        print(f"Found {len(all_links)} links. Here are a few examples:")
        # Print only the first 20 links for brevity
        for i, link in enumerate(all_links[:20]):
            print(f"{i+1}. {link}")
        if len(all_links) > 20:
            print("\n(Showing first 20 links. Many more links were found.)")
    else:
        print("No links were extracted or an error occurred.")



Extracting links from: https://www.bbc.com

Found 243 links. Here are a few examples:
1. #main-content
2. https://www.bbc.com/
3. https://www.bbc.com/
4. https://www.bbc.com/news
5. https://www.bbc.com/sport
6. https://www.bbc.com/business
7. https://www.bbc.com/innovation
8. https://www.bbc.com/culture
9. https://www.bbc.com/arts
10. https://www.bbc.com/travel
11. https://www.bbc.com/future-planet
12. https://www.bbc.com/audio
13. https://www.bbc.com/video
14. https://www.bbc.com/live
15. https://www.bbc.com/home
16. https://www.bbc.com/news
17. https://www.bbc.com/news/topics/c2vdnvdg6xxt
18. https://www.bbc.com/news/war-in-ukraine
19. https://www.bbc.com/news/us-canada
20. https://www.bbc.com/news/uk

(Showing first 20 links. Many more links were found.)


In [None]:
import requests
from bs4 import BeautifulSoup
import re

# Step 1: Extract all links from homepage
def extract_news_links(base_url):
    try:
        response = requests.get(base_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        links = []
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            if href.startswith('/'):
                href = "https://www.bbc.com" + href
            if href.startswith("https://www.bbc.com"):
                links.append(href)
        return list(set(links))  # Remove duplicates
    except Exception as e:
        print(f"Error: {e}")
        return []

# Step 2: Extract valid article links from a section
def extract_bbc_article_links_from_section(section_url):
    try:
        response = requests.get(section_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        article_links = set()

        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']

            if href.startswith('/'):
                full_url = "https://www.bbc.com" + href
            else:
                full_url = href

            # BBC article URL pattern filtering
            if (full_url.startswith("https://www.bbc.com/") and
                not re.search(r'\.(jpg|jpeg|png|gif|svg|css|js|webp)', full_url, re.IGNORECASE) and
                ("/news/" in full_url or "/articles/" in full_url or
                 re.search(r'/[a-z-]+-\d{7,}', full_url))):
                article_links.add(full_url)

        return sorted(list(article_links))
    except Exception as e:
        print(f"Error extracting from {section_url}: {e}")
        return []

# Main script
if __name__ == "__main__":
    homepage_url = "https://www.bbc.com"
    print(f"Step 1: Extracting section links from homepage: {homepage_url}")
    section_links = extract_news_links(homepage_url)

    if not section_links:
        print("No section links found.")
    else:
        print(f"\n✅ Found {len(section_links)} section-level links.")
        all_article_links = set()

        print(f"\nStep 2: Visiting each section to find article links...\n")
        for i, section_url in enumerate(section_links):
            print(f"  [{i+1}/{len(section_links)}] Visiting section: {section_url}")
            links = extract_bbc_article_links_from_section(section_url)
            all_article_links.update(links)

        final_article_links = sorted(list(all_article_links))

        if final_article_links:
            print(f"\n✅ Total unique article links extracted: {len(final_article_links)}")
            for i, link in enumerate(final_article_links[:20]):
                print(f"{i+1}. {link}")
            if len(final_article_links) > 20:
                print("\n(Only showing first 20 links. More links were found.)")
        else:
            print("\n❌ No article links were found after visiting the sections.")


Step 1: Extracting section links from homepage: https://www.bbc.com

✅ Found 171 section-level links.

Step 2: Visiting each section to find article links...

  [1/171] Visiting section: https://www.bbc.com/culture/article/20250714-why-the-virgin-queen-never-married
  [2/171] Visiting section: https://www.bbc.com/reel/video/p0lrndgw/should-we-be-drinking-more-matcha-
  [3/171] Visiting section: https://www.bbc.com/travel/destinations/central-america
  [4/171] Visiting section: https://www.bbc.com/travel/destinations/middle-east
  [5/171] Visiting section: https://www.bbc.com/news/videos/c98wxjzezkgo
  [6/171] Visiting section: https://www.bbc.com/travel/destinations
  [7/171] Visiting section: https://www.bbc.com/news/world/australia
  [8/171] Visiting section: https://www.bbc.com/audio/play/w3ct6s2c
  [9/171] Visiting section: https://www.bbc.com/reel/video/p0lqj325/the-ancient-mexican-lake-home-to-earth-s-oldest-lifeforms
  [10/171] Visiting section: https://www.bbc.com/travel/articl

In [None]:
from newspaper import Article
import time
import pandas as pd

def extract_news_body_only(url):
    try:
        article = Article(url)
        article.download()
        article.parse()

        # Get the full text and remove the first line (usually the headline)
        lines = article.text.strip().split('\n')
        body = '\n'.join(lines[1:]).strip() if len(lines) > 1 else article.text.strip()
        return body

    except Exception as e:
        return f"Error: {str(e)}"

# ✅ Example: Make sure this list exists
# all_news_article_links = [...]  # Your list of 1222 URLs

# Create a list to store results
data = []

for idx, link in enumerate(final_article_links):
    print(f"[{idx + 1}/{len(final_article_links)}] Processing: {link}")
    body = extract_news_body_only(link)
    data.append({"url": link, "body": body})

    # Optional delay to avoid IP blocking
    time.sleep(1)

# ✅ Convert to pandas DataFrame
df = pd.DataFrame(data)

# ✅ Save to CSV
df.to_csv("extracted_articles.csv", index=False, encoding='utf-8-sig')

print("\n✅ Done! Data saved to 'extracted_articles.csv'")


NameError: name 'final_article_links' is not defined

In [17]:
import requests
from bs4 import BeautifulSoup

def extract_news_links(url):

    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')


        links = []
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            links.append(href)
        return links
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return []

if __name__ == "__main__":
    toi_url = "https://www.ndtv.com"
    print(f"Extracting links from: {toi_url}\n")
    all_links = extract_news_links(toi_url)

    if all_links:
        print(f"Found {len(all_links)} links. Here are a few examples:")
        # Print only the first 20 links for brevity
        for i, link in enumerate(all_links[:20]):
            print(f"{i+1}. {link}")
        if len(all_links) > 20:
            print("\n(Showing first 20 links. Many more links were found.)")

    else:
        print("No links were extracted or an error occurred.")



Extracting links from: https://www.ndtv.com

Found 561 links. Here are a few examples:
1. https://www.ndtv.com/?pfrom=home-ndtv_globalnav
2. https://www.ndtv.com/world/?pfrom=home-ndtv_globalnav
3. https://www.ndtvprofit.com/?pfrom=home-ndtv_globalnav
4. https://ndtv.in/?pfrom=home-ndtv_globalnav
5. https://www.ndtv.com/entertainment?pfrom=home-ndtv_globalnav
6. https://sports.ndtv.com/cricket?pfrom=home-ndtv_globalnav
7. https://food.ndtv.com/?pfrom=home-ndtv_globalnav
8. https://www.ndtv.com/lifestyle?pfrom=home-ndtv_globalnav
9. https://doctor.ndtv.com/?pfrom=home-ndtv_globalnav
10. https://www.gadgets360.com/?pfrom=home-ndtv_globalnav
11. https://www.ndtvgames.com/?pfrom=home-ndtv_globalnav
12. https://www.ndtvshopping.com/?pfrom=home-ndtv_globalnav
13. https://www.ndtv.com/apps?pfrom=home-ndtv_globalnav
14. https://rajasthan.ndtv.in/?pfrom=home-ndtv_globalnav
15. https://mpcg.ndtv.in/?pfrom=home-ndtv_globalnav
16. https://marathi.ndtv.com/?pfrom=home-ndtv_globalnav
17. https://www

In [18]:
all_links = set(all_links)
len(all_links)

415

In [None]:
all_links

{'',
 '#',
 'http://ndtvshopping.com/beauty-and-grooming/essentials-you-need-for-a-peaceful-sleep-8938893?pfrom=home-ndtv_shopping',
 'https://archives.ndtv.com/?pfrom=home-ndtv_footer',
 'https://doctor.ndtv.com/?pfrom=home-ndtv_footer',
 'https://doctor.ndtv.com/?pfrom=home-ndtv_globalnav',
 'https://doctor.ndtv.com/?pfrom=home-ndtv_nav_wap',
 'https://doctor.ndtv.com/webstories/health/how-to-consume-chia-seeds-safely-44045?pfrom=home-ndtv_webstories',
 'https://food.ndtv.com/?pfrom=home-ndtv_footer',
 'https://food.ndtv.com/?pfrom=home-ndtv_globalnav',
 'https://food.ndtv.com/food-drinks/6-best-restaurants-in-maldives-3494623?pfrom=home-ndtv_food',
 'https://food.ndtv.com/food-drinks/6-best-restaurants-in-maldives-3494623?pfrom=home-ndtv_food_foodimg',
 'https://food.ndtv.com/food-drinks/why-a2-ghee-is-being-called-a-modern-superfood-8922740?pfrom=home-ndtv_food',
 'https://food.ndtv.com/food-drinks/why-a2-ghee-is-being-called-a-modern-superfood-8922740?pfrom=home-ndtv_food_foodimg'

In [20]:
import time

def extract_article_links_from_section(section_url):
    """Visit a section URL and extract NDTV news article links"""
    print(f"  Visiting section: {section_url}")
    try:
        response = requests.get(section_url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        article_links = set()

        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']


            if href.startswith("https://www.ndtv.com/") and (
                "/news/" in href or
                "/india-news/" in href or
                "/world-news/" in href or
                "/entertainment/" in href or
                "/sports/" in href or
                "/science/" in href or
                "/education/" in href
            ):
                article_links.add(href)

        return list(article_links)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {section_url}: {e}")
        return []

if __name__ == "__main__":

    all_article_links = set()

    for i, section_url in enumerate(all_links):
        print(f"\nProcessing {i+1}/{len(all_links)}: {section_url}")
        time.sleep(1)
        links = extract_article_links_from_section(section_url)
        all_article_links.update(links)

    print(f"\n Extracted {len(all_article_links)} unique article links.\n")


    for i, link in enumerate(list(all_article_links)[:20]):
        print(f"{i+1}. {link}")



Processing 1/415: 
  Visiting section: 
Error fetching : Invalid URL '': No scheme supplied. Perhaps you meant https://?

Processing 2/415: https://www.ndtvshopping.com/?pfrom=home-ndtv_globalnav
  Visiting section: https://www.ndtvshopping.com/?pfrom=home-ndtv_globalnav

Processing 3/415: https://www.ndtv.com/photos?pfrom=home-ndtv_nav_wap
  Visiting section: https://www.ndtv.com/photos?pfrom=home-ndtv_nav_wap

Processing 4/415: https://www.ndtv.com/webstories/feature/rbi-cancels-this-banks-license-what-you-need-to-know-44081?pfrom=home-ndtv_webstories
  Visiting section: https://www.ndtv.com/webstories/feature/rbi-cancels-this-banks-license-what-you-need-to-know-44081?pfrom=home-ndtv_webstories

Processing 5/415: https://www.ndtv.com/world-news/gaza-doctor-details-how-starvation-destroys-body-8959203?pfrom=home-ndtv_topscroll
  Visiting section: https://www.ndtv.com/world-news/gaza-doctor-details-how-starvation-destroys-body-8959203?pfrom=home-ndtv_topscroll

Processing 6/415: https

In [21]:
len(all_article_links)

1038

In [22]:
def extract_news_body_only(url):

    try:
        article = Article(url)
        article.download()
        article.parse()


        lines = article.text.strip().split('\n')
        body = '\n'.join(lines[1:]).strip() if len(lines) > 1 else article.text.strip()
        return body

    except Exception as e:
        return f"Error: {str(e)}"


data = []

for idx, link in enumerate(all_article_links):
    print(f"[{idx + 1}/{len(all_article_links)}] Processing: {link}")
    body = extract_news_body_only(link)
    data.append({"url": link, "body": body})


    time.sleep(1)

df = pd.DataFrame(data)


df.to_csv("extracted_articles.csv", index=False, encoding='utf-8-sig')

print("\n Done! Data saved to 'extracted_articles.csv'")


[1/1038] Processing: https://www.ndtv.com/world-news/who-is-yahya-sinwar-the-hamas-leader-killed-by-israel-6819037
[2/1038] Processing: https://www.ndtv.com/india-news/vistaras-mumbai-amritsar-flight-diverted-to-chandigarh-due-to-bad-weather-6926158
[3/1038] Processing: https://www.ndtv.com/education/qs-best-student-cities-ranking-2026-seoul-tops-list-4-indian-cities-in-global-top-150-8880688
[4/1038] Processing: https://www.ndtv.com/world-news/watch-donald-trump-argues-with-fed-chair-jerome-powell-on-live-tv-over-cost-of-fed-building-renovation-8950817
[5/1038] Processing: https://www.ndtv.com/education/webstories/iits-vs-iiits-vs-nits-what-s-the-real-difference-43687
[6/1038] Processing: https://www.ndtv.com/world-news/pm-modi-uk-maldives-visit-live-updates-narendra-modi-india-uk-free-trade-deal-meet-british-pm-keir-starmer-king-charles-khalistani-threat-mohamed-muiz-8930160
[7/1038] Processing: https://www.ndtv.com/entertainment/tanushree-dutta-sparks-debate-over-shravan-fast-and-mu

In [23]:
import pandas as pd

df = pd.DataFrame(data)


df.to_csv("extracted_articles.csv", index=False, encoding='utf-8-sig')

print("\n Done! Data saved to 'extracted_articles.csv'")



 Done! Data saved to 'extracted_articles.csv'


In [24]:
len( all_article_links)

1038

In [26]:
from newspaper import Article
import time
import pandas as pd

def extract_news_body_only(url):
    try:
        article = Article(url)
        article.download()
        article.parse()


        lines = article.text.strip().split('\n')
        body = '\n'.join(lines[1:]).strip() if len(lines) > 1 else article.text.strip()
        return body

    except Exception as e:
        return f"Error: {str(e)}"


data = []

for idx, link in enumerate( all_article_links):
    print(f"[{idx + 1}/{len( all_article_links)}] Processing: {link}")
    body = extract_news_body_only(link)
    data.append({"url": link, "body": body})


    time.sleep(1)


df = pd.DataFrame(data)


df.to_csv("extracted_articles_ndtv.csv", index=False, encoding='utf-8-sig')

print("\n Done! Data saved to 'extracted_articles.csv'")


[1/1038] Processing: https://www.ndtv.com/world-news/who-is-yahya-sinwar-the-hamas-leader-killed-by-israel-6819037
[2/1038] Processing: https://www.ndtv.com/india-news/vistaras-mumbai-amritsar-flight-diverted-to-chandigarh-due-to-bad-weather-6926158
[3/1038] Processing: https://www.ndtv.com/education/qs-best-student-cities-ranking-2026-seoul-tops-list-4-indian-cities-in-global-top-150-8880688
[4/1038] Processing: https://www.ndtv.com/world-news/watch-donald-trump-argues-with-fed-chair-jerome-powell-on-live-tv-over-cost-of-fed-building-renovation-8950817
[5/1038] Processing: https://www.ndtv.com/education/webstories/iits-vs-iiits-vs-nits-what-s-the-real-difference-43687
[6/1038] Processing: https://www.ndtv.com/world-news/pm-modi-uk-maldives-visit-live-updates-narendra-modi-india-uk-free-trade-deal-meet-british-pm-keir-starmer-king-charles-khalistani-threat-mohamed-muiz-8930160
[7/1038] Processing: https://www.ndtv.com/entertainment/tanushree-dutta-sparks-debate-over-shravan-fast-and-mu

Hindustan times


In [27]:
import requests
from bs4 import BeautifulSoup

def extract_section_links(url):
    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')


        section_container = soup.find('ul', class_='topics-menu')

        if not section_container:
            print("Could not find the 'topics-menu' section.")
            return []

        section_links = []
        for a_tag in section_container.find_all('a', href=True):
            text = a_tag.get_text(strip=True)
            href = a_tag['href']
            section_links.append((text, href))

        return section_links

    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return []
    except Exception as e:
        print(f"Unexpected error: {e}")
        return []

if __name__ == "__main__":
    url = "https://indianexpress.com/"
    print(f"Extracting section links from: {url}\n")
    sections = extract_section_links(url)

    if sections:
        print(f"Found {len(sections)} section links:\n")
        for i, (name, link) in enumerate(sections, 1):
            print(f"{i}. {name}: {link}")
    else:
        print("No section links found.")


Extracting section links from: https://indianexpress.com/

Request error: 403 Client Error: Forbidden for url: https://indianexpress.com/
No section links found.


In [28]:
import requests
from bs4 import BeautifulSoup
import time

def extract_section_links(url):
    try:
        print("Waiting 2 seconds before making the request...")
        time.sleep(2)
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')


        section_container = soup.find('ul', class_='topics-menu')

        if not section_container:
            print("Could not find the 'topics-menu' section.")
            return []

        section_links = []
        for a_tag in section_container.find_all('a', href=True):
            text = a_tag.get_text(strip=True)
            href = a_tag['href']
            section_links.append((text, href))

        return section_links

    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return []
    except Exception as e:
        print(f"Unexpected error: {e}")
        return []

if __name__ == "__main__":
    url = "https://www.indiatoday.in/"
    print(f"Extracting section links from: {url}\n")
    sections = extract_section_links(url)

    if sections:
        print(f"Found {len(sections)} section links:\n")
        for i, (name, link) in enumerate(sections, 1):
            print(f"{i}. {name}: {link}")
    else:
        print("No section links found.")


Extracting section links from: https://www.indiatoday.in/

Waiting 2 seconds before making the request...
Could not find the 'topics-menu' section.
No section links found.


In [29]:
import requests
from bs4 import BeautifulSoup

url = "https://thehindu.com.in/"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")


container = soup.find('div', class_='penci_bottombar penci-desktop-bottombar penci_navbar penci_container bg-normal pcmiddle-normal pc-hasel')

if not container:
    container = soup.select_one("div.penci_bottombar.penci-desktop-bottombar.penci_navbar.penci_container.bg-normal.pcmiddle-normal.pc-hasel")

if container:
    links = [a['href'] for a in container.find_all('a', href=True)]
    print(f"Found {len(links)} links:\n")
    for link in links:
        print(link)
else:
    print("Container with the given classes not found.")


Found 8 links:

https://thehindu.com.in/
https://thehindu.com.in/category/news/
https://thehindu.com.in/category/india/
https://thehindu.com.in/category/business/
https://thehindu.com.in/category/elections/
https://thehindu.com.in/category/cities/
https://thehindu.com.in/category/society/
https://thehindu.com.in/category/technology/


In [30]:


article_links = set()
for idx, section_url in enumerate(links, 1):
    try:
        print(f"🔄 Visiting section {idx}: {section_url}")
        response = requests.get(section_url, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')
        links = soup.find_all('a', href=True)

        for tag in links:
            href = tag['href']
            if href.startswith('http') and "thehindu.com.in" in href:
                article_links.add(href)

        print(f" Found {len(links)} links (Current total articles: {len(article_links)})")
        time.sleep(1)

    except Exception as e:
        print(f" Error visiting {section_url}: {e}")
        continue


print(f"\n Total  article links collected: {len(article_links)}")


🔄 Visiting section 1: https://thehindu.com.in/
 Found 329 links (Current total articles: 64)
🔄 Visiting section 2: https://thehindu.com.in/category/news/
 Found 165 links (Current total articles: 69)
🔄 Visiting section 3: https://thehindu.com.in/category/india/
 Found 185 links (Current total articles: 73)
🔄 Visiting section 4: https://thehindu.com.in/category/business/
 Found 162 links (Current total articles: 80)
🔄 Visiting section 5: https://thehindu.com.in/category/elections/
 Found 177 links (Current total articles: 83)
🔄 Visiting section 6: https://thehindu.com.in/category/cities/
 Found 147 links (Current total articles: 83)
🔄 Visiting section 7: https://thehindu.com.in/category/society/
 Found 171 links (Current total articles: 83)
🔄 Visiting section 8: https://thehindu.com.in/category/technology/
 Found 168 links (Current total articles: 87)

 Total  article links collected: 87


In [31]:
all_article_links = article_links

In [32]:
from newspaper import Article
import time
import pandas as pd

def extract_news_body_only(url):
    try:
        article = Article(url)
        article.download()
        article.parse()

        lines = article.text.strip().split('\n')
        body = '\n'.join(lines[1:]).strip() if len(lines) > 1 else article.text.strip()
        return body

    except Exception as e:
        return f"Error: {str(e)}"


data = []

for idx, link in enumerate( all_article_links):
    print(f"[{idx + 1}/{len( all_article_links)}] Processing: {link}")
    body = extract_news_body_only(link)
    data.append({"url": link, "body": body})


    time.sleep(1)


df = pd.DataFrame(data)


df.to_csv("extracted_articles_hindu.csv", index=False, encoding='utf-8-sig')

print("\n Done! Data saved to 'extracted_articles_hindu.csv'")


[1/87] Processing: https://thehindu.com.in/need-double-engine-govt-in-maha-for-investment-says-jaishankar-in-campaign-pitch/
[2/87] Processing: https://thehindu.com.in/record-number-of-women-win-seats-in-japan-election-a-historic-step-for-gender-equality/
[3/87] Processing: https://thehindu.com.in/one-terrorist-killed-as-army-vehicle-targeted-in-j-ks-akhnoor-gunfight-on/
[4/87] Processing: https://thehindu.com.in/author/joseph-j-johnson/
[5/87] Processing: https://thehindu.com.in/spanish-pm-in-india-seeking-to-bolster-trade-ties/
[6/87] Processing: https://thehindu.com.in/category/india/page/2/
[7/87] Processing: https://thehindu.com.in
[8/87] Processing: https://thehindu.com.in/maharashtra-elections-2024-cm-eknath-shinde-to-file-nomination-from-kopri-pachpakhadi/
[9/87] Processing: https://thehindu.com.in/microsoft-fires-employees-who-organized-vigil-for-palestinians-killed-in-gaza-a-controversial-move/
[10/87] Processing: https://thehindu.com.in/indian-americans-still-back-democratic