In [15]:
import os
import pickle
import uuid
import csv
import re
from datetime import datetime
import requests as rq
from bs4 import BeautifulSoup

In [2]:
domain_urls = {
    "bollywood": "https://www.indiatoday.in/movies/bollywood",
    "technology": "https://www.indiatoday.in/technology",
    "lifestyle": "https://www.indiatoday.in/lifestyle",
    "sports": "https://www.indiatoday.in/sports",
    "business": "https://www.indiatoday.in/business",
}


In [4]:
# Directory for storing data
DATA_DIR = "../data"
os.makedirs(DATA_DIR, exist_ok=True)

In [5]:
# Function to fetch and parse articles from a domain
def fetch_all_articles(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Example article extraction logic (update selectors based on site structure)
        articles = []
        for article_tag in soup.select("article"):
            headline = article_tag.select_one("h2").text.strip()
            link = article_tag.select_one("a")["href"]
            content = article_tag.select_one("p").text.strip()
            articles.append({"headline": headline, "link": link, "content": content})

        return articles
    except Exception as e:
        print(f"Error fetching articles from {url}: {e}")
        return []

In [8]:
all_articles = fetch_articles(domain_urls["business"])

In [13]:
all_articles[0]

{'headline': 'Will 2025 bring a bull run? Key trends for Nifty to watch',
 'link': '/business/story/icici-securities-says-bullish-2025-ahead-nifty-outlook-to-hit-28800-with-sectoral-leaders-in-focus-2651753-2024-12-18',
 'content': 'ICICI Securities projects a 20% rise in Nifty for 2025, driven by historical trends, sectoral strength, and strategic stock picks.'}

In [11]:
# Function to process articles and identify new ones
def process_articles(domain, articles):
    pickle_file = os.path.join(DATA_DIR, f"{domain}_headlines.pkl")
    unique_headlines = set()

    # Attempt to load existing headlines, create the pickle file if not found
    try:
        with open(pickle_file, "rb") as f:
            unique_headlines = pickle.load(f)
    except FileNotFoundError:
        # If the file does not exist, just continue with an empty set of unique headlines
        print(f"Pickle file for domain '{domain}' not found. Creating a new one.")

    new_articles = []

    for article in articles:
        if article["headline"] in unique_headlines:
            continue  # Skip this article if the headline is already processed
        unique_headlines.add(article["headline"])
        new_articles.append(article)

    # Save updated unique headlines back to pickle file
    with open(pickle_file, "wb") as f:
        pickle.dump(unique_headlines, f)
    if not new_articles:
        print(f"NO new articles were added in {domain}")
    return new_articles

In [12]:
process_articles("business",all_articles)

NO new articles were added in business


[]

In [14]:
# Function to get detailed article information
def get_article_details(url):
    try:
        data = rq.get(url)
        soup = BeautifulSoup(data.text, "html.parser")
        soup.prettify()
        text_find = soup.find("h1")
        date_time_find = soup.find("span", class_="jsx-ace90f4eca22afc7 strydate")

        if text_find and date_time_find:
            head_lines = text_find.text
            content = " ".join([para.text for para in soup.find_all("p")[1:]])
            date_time = date_time_find.text

            pattern = r"UPDATED:\s+(\w{3}\s\d{1,2},\s\d{4}\s\d{2}:\d{2}\sIST)"
            match = re.findall(pattern, date_time)

            if match:
                date_format = "%b %d, %Y %H:%M IST"
                article_time = datetime.strptime(match[0], date_format)
                return content, article_time

    except Exception as e:
        print(f"Error extracting article details from {url}: {e}")
    return None

In [19]:
def add_content_and_time(new_articles):
    for article in new_articles:
        news_link = article['link']
        if "https" not in news_link:
            news_link = "https://www.indiatoday.in" + news_link
        details = get_article_details(news_link)

        if details:
            content, article_time = details
            article["content"] = content
            article["date_time"] = article_time
            article["link"] = news_link
    return new_articles
    

In [22]:
new_articles = add_content_and_time(all_articles)

In [26]:
# Function to save new articles to a CSV file
def save_to_csv(domain, articles):
    csv_file = os.path.join(DATA_DIR, f"{domain}_articles.csv")
    print(csv_file)
    is_new_file = not os.path.exists(csv_file)

    with open(csv_file, "a", newline='', encoding="utf-8") as f:
        writer = csv.writer(f)

        # Write header if file is new
        if is_new_file:
            writer.writerow(["UUID", "Headline", "Link", "Content", "Category", "DateTime"])

        for article in articles:
            writer.writerow([
                str(uuid.uuid4()),
                article["headline"],
                article["link"],
                article["content"],
                domain,
                article["date_time"].isoformat()
            ])

In [27]:
save_to_csv("business" , new_articles)

../data\business_articles.csv


### MY test

In [62]:
import re
import requests as rq
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

In [66]:
url = "https://www.indiatoday.in/movies/bollywood"
# url = "https://www.indiatoday.in/technology"
# url = "https://www.indiatoday.in/lifestyle"
url = "https://www.indiatoday.in/sports"
url = "https://www.indiatoday.in/business"

In [67]:
data = rq.get(url)
if data.status_code == 200:
    print(url, "executed")
    s = BeautifulSoup(data.text, "html.parser")
    all_articles = s.find_all("div", {"class": "B1S3_content__wrap__9mSB6"})
    category = url.split("/")[-1] if url.split("/")[-1] else "General News"

https://www.indiatoday.in/movies/bollywood executed


In [68]:
all_articles[0]

<div class="B1S3_content__wrap__9mSB6"><h2 class=""><a href="/movies/bollywood/story/meet-thaman-s-man-behind-tracks-like-ramuloo-ramulaa-nain-mattaka-baby-john-varun-dhawan-2651892-2024-12-18" title="Thaman S: The creative force behind hits like Ramuloo Ramulaa and Nain Mattaka">Thaman S: The creative force behind hits like Ramuloo Ramulaa and Nain Mattaka</a></h2><div class="B1S3_sub__title__with__rating__7rtuJ sub__title__with__rating"></div><div class="B1S3_story__shortcont__inicf"><p>Thaman S, famed for his South Indian hits, is set to debut his first full Bollywood album with Baby John this Christmas. The film stars Varun Dhawan in the lead and the will be released on December 25.</p></div></div>

In [69]:
data_frame = {"headlines": [], "link": [], "content": [], "category": [] , "date_time" : []}

In [70]:
def get_article_details(url):
    data = rq.get(url)
    s = BeautifulSoup(data.text, "html.parser")
    s.prettify()
    text_find , date_time_find = s.find("h1") , s.find("span" , class_ = "jsx-ace90f4eca22afc7 strydate")
    if text_find and date_time_find:
      head_lines = text_find.text
      content = " ".join([para.text for para in s.find_all("p")[1:]])
      date_time = date_time_find.text
      pattern = r"UPDATED:\s+(\w{3}\s\d{1,2},\s\d{4}\s\d{2}:\d{2}\sIST)"
      match = re.findall(pattern, date_time)
      date_format = "%b %d, %Y %H:%M IST"
      article_time = datetime.strptime(match[0], date_format)
      
      return content,article_time
    return None

In [71]:
temp_article = "https://www.indiatoday.in/movies/bollywood/story/meet-thaman-s-man-behind-tracks-like-ramuloo-ramulaa-nain-mattaka-baby-john-varun-dhawan-2651892-2024-12-18"
get_article_details(temp_article)

("If you’re a fan of South Indian cinema and its infectious music, chances are you’ve found yourself dancing to Ramuloo Ramulaa from Ala Vaikunthapurramuloo. But do you know the musical genius behind this chartbuster? Enter Thaman S, the maestro whose compositions have become chartbusters. He is set to make waves in Bollywood with his debut full album for Varun Dhawan’s Christmas release, Baby John. Here’s everything you need to know about this prolific composer and his extraordinary journey. Born as Ghantasala Sai Srinivas in Nellore, Andhra Pradesh, Thaman comes from a lineage steeped in music. His grandfather, Ghantasala Balaramayya, was a celebrated director and producer, while his father, Ghantasala Siva Kumar, was a renowned drummer who worked in over 700 films with legendary composer K Chakravarthy. His mother, Ghantasala Savitri, sister Yamini, and aunt B. Vasantha have all lent their voices as playback singers. Growing up in a Telugu-speaking family of musicians in Chennai, Th

In [72]:
def data_frame_creation(data_frame, all_articles, category):
    for article in all_articles:
        link = article.find("a").get("href")
        news_link = link
        if "https" not in news_link:
            news_link = "https://www.indiatoday.in" + news_link
        content, article_date_time = get_article_details(news_link)
        if content:
            data_frame["content"].append(content)
            data_frame["link"].append(news_link)
            data_frame["headlines"].append(article.text)
            data_frame["category"].append(category)
            data_frame["date_time"].append(article_date_time)

In [73]:
data_frame_creation(data_frame, all_articles, category)

In [75]:
df = pd.DataFrame(data_frame)

In [80]:
print(df.iloc[0]["date_time"])

2024-12-18 20:55:00


In [51]:
## code snippet to add time and run tests on it
temp_article = all_articles[0]
print(temp_article)
for article in all_articles:
    news_link = article.find("a").get("href")
    if "https" not in news_link:
        news_link = "https://www.indiatoday.in" + news_link
    data = rq.get(news_link)
    s = BeautifulSoup(data.text, "html.parser")
    s.prettify()
    if s.find("span" , class_ = "jsx-ace90f4eca22afc7 strydate"):
        print(s.find("span" , class_ = "jsx-ace90f4eca22afc7 strydate").text)
    else:
        print("not found the exact class")


<div class="B1S3_content__wrap__9mSB6"><h2 class=""><a href="/technology/news/story/sam-altman-is-apparently-trying-to-remove-the-organisation-that-tried-to-remove-him-from-openai-last-year-2651451-2024-12-18" title="Sam Altman aims to oust the group that tried to fire him from OpenAI last year">Sam Altman aims to oust the group that tried to fire him from OpenAI last year</a></h2><div class="B1S3_sub__title__with__rating__7rtuJ sub__title__with__rating"></div><div class="B1S3_story__shortcont__inicf"><p>In a bid to restructure OpenAI from a non-profit to a for-profit organisation, the company's CEO Sam Altman is apparently trying to remove the organisation that tried to remove him from the company last year.</p></div></div>
UPDATED: Dec 18, 2024 09:04 IST
UPDATED: Dec 18, 2024 10:46 IST
UPDATED: Dec 18, 2024 10:28 IST
UPDATED: Dec 18, 2024 18:49 IST
UPDATED: Dec 18, 2024 16:43 IST
UPDATED: Dec 18, 2024 16:15 IST
UPDATED: Dec 18, 2024 16:18 IST
UPDATED: Dec 18, 2024 14:28 IST
UPDATED: 

In [43]:
news_link

'https://www.indiatoday.in/lifestyle/wellness/story/with-kendall-and-hailey-backing-it-will-nad-witness-a-boom-in-indian-market-2651846-2024-12-18'

In [42]:
head_lines

'With Kendall and Hailey backing it, will NAD+ witness a boom in Indian market?'

In [41]:
content

'In the ever-evolving world of health and wellness, a molecule called NAD+ (Nicotinamide Adenine Dinucleotide) has emerged as the new buzzword. Celebrities worldwide, from Hollywood A-listers to Indian fitness enthusiasts, are singing its praises, calling it the “magic molecule” for youth, energy, and longevity. But what is NAD+, why are stars obsessed with it, and how is it impacting the Indian wellness market? Dr Anjali Hooda, a renowned MD and Internist, explains, “NAD+ is a coenzyme found in all living cells. It plays a critical role in energy production, DNA repair, and slowing the ageing process. As we age, NAD+ levels decline, leading to fatigue, reduced cognitive function, and visible signs of ageing. Boosting NAD+ levels can help rejuvenate the body at a cellular level.”Why celebrities are hooked From Kendall Jenner to Hailey Beiber, celebrities are reportedly using NAD+ therapies for their anti-ageing and energy-boosting effects. Intravenous (IV) NAD+ drips have become a soug

In [46]:
s.find("span" , class_ = "jsx-ace90f4eca22afc7 strydate").text

'UPDATED: Dec 18, 2024 19:15 IST'

In [35]:
df.sample(5)

Unnamed: 0,headlines,link,content,category
2,You've been misunderstanding Trauma Bonding al...,https://www.indiatoday.in/lifestyle/relationsh...,"When we say ""trauma bonding""—what comes to min...",lifestyle
14,How long can you wear makeup without risking s...,https://www.indiatoday.in/lifestyle/beauty/sto...,"You might be the 'just a little blush, and I'm...",lifestyle
12,Is winter depression real? Experts weigh inSea...,https://www.indiatoday.in/lifestyle/wellness/s...,"You are not alone if the gloomy, cold weather ...",lifestyle
16,Why a turtleneck is the fashion BFF you need t...,https://www.indiatoday.in/lifestyle/fashion/st...,"POV: It's time to get up! The alarm rings, but...",lifestyle
1,Can microdosing on Ozempic help you lose weigh...,https://www.indiatoday.in/lifestyle/wellness/s...,The weight-loss market is evolving more than e...,lifestyle


In [36]:
len(df.iloc[0]["headlines"])

187

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   headlines  20 non-null     object
 1   link       20 non-null     object
 2   content    20 non-null     object
 3   category   20 non-null     object
dtypes: object(4)
memory usage: 768.0+ bytes
