In [1]:
import pandas as pd
import os
import pickle
from dataclasses import dataclass
from pathlib import Path
from newspaper import Article
from newsdataapi import NewsDataApiClient

from dataclasses import dataclass


In [2]:
%pwd

'c:\\Users\\Admin\\Projects\\ML Projects\\stock_sentiment\\notebooks'

In [3]:
PROJECT_ROOT = Path(os.getcwd()).parent
print(f"Project root: {PROJECT_ROOT}")

CONFIG_FILE_PATH = PROJECT_ROOT/'config'/'config.yaml'
print(f"Config path: {CONFIG_FILE_PATH}")

Project root: c:\Users\Admin\Projects\ML Projects\stock_sentiment
Config path: c:\Users\Admin\Projects\ML Projects\stock_sentiment\config\config.yaml


In [4]:
os.chdir("..\\src")
%pwd

'c:\\Users\\Admin\\Projects\\ML Projects\\stock_sentiment\\src'

In [5]:
from utils.common import read_yaml, create_directories

In [37]:
@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    query: list[str] = None

class ConfigurationManager:
    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        self.config = read_yaml(config_filepath)

    def get_data_ingestion_config(self):
        config = self.config.data_ingestion
        create_directories([config.root_dir])
        data_ingestion_config = DataIngestionConfig(
            root_dir=config.root_dir,
            query= config.query
        )
        return data_ingestion_config

class DataIngestion:
    def __init__(self, config:DataIngestionConfig):
        self.config = config
        self.all_news_articles = None
    
    def newsdata_connect(self):
        try:
            api = NewsDataApiClient(apikey='pub_965468a202be412d80928d294b632639')
            print('connected to API')
            return api
        except Exception as e:
            print("Not connected to API")
            return None
        

    def extract_news(self, query:list[str], limit:int=5, country:str='in'):
        api = self.newsdata_connect()
        search_query = query or (self.config.query if self.config.query else 'finance')
        # Combine queries into a single OR-separated string with quotes
        combined_query = " OR ".join([f'"{q}"' for q in search_query])
        print(combined_query)
        try:
            response = api.latest_api(
                q=combined_query,
                country=country,
                category='business',
                size=limit,
                language='en'
            )
            articles = response.get('results',[])
            print('reached here')
            print(articles)
        except Exception as e:
            print("No articles returned")
            return []

        # step 2: scrape full content from each URL
        full_articles = []
        for idx, article in enumerate(articles, 1):
            try:
                print(f"Processing {idx}/{len(articles)}: {article['title'][:50]}...")
                url = article.get('link') # article['link']
                if not url:
                    print("Skipping: No url")
                    continue
                # get full content
                news_article = Article(url)
                news_article.download()
                news_article.parse()
                full_articles.append({
                    'article_id': article['article_id'],
                    'title': article['title'],
                    'description': article['description'],
                    'source': article['source_name'],
                    'url': article['link'],
                    'pubDate': article['pubDate'],
                    'category': article['category'],
                    'full_content': news_article.text,
                    'authors': ', '.join(news_article.authors),
                    'image_url': article['image_url']
                })
            except Exception as e:
                print(f"Error scraping {article['link']}: {e}")
        self.all_news_articles = full_articles
        return self.all_news_articles

    def save_newsdata(self):
        filepath = os.path.join(self.config.root_dir, 'news_articles.pkl')
        with open(filepath, 'wb') as f:
            pickle.dump(self.all_news_articles, f)
    
    def load_newsdata(self):
        filepath = os.path.join(self.config.root_dir, 'news_articles.pkl')
        with open(filepath, 'rb') as f:
            self.news_articles = pickle.load(f)
        return self.news_articles

In [38]:
config_manager = ConfigurationManager()
print(config_manager.config)

{'data_ingestion': {'root_dir': 'artifacts/data_ingestion', 'query': ['India', 'Economy', 'Stock']}}


In [39]:
data_ingestion_config = config_manager.get_data_ingestion_config()

data_ingestion = DataIngestion(config=data_ingestion_config)


In [40]:
query = ['BSE', 'Energy']
articles = data_ingestion.extract_news(query=query)

connected to API
"BSE" OR "Energy"
reached here
[{'article_id': '49feb3529593c33a848fea7d1d209c82', 'link': 'https://www.livemint.com/market/stock-market-news/bse-share-price-jumps-almost-7-after-q2-results-details-here-11762920669665.html', 'title': 'BSE share price jumps almost 7% after Q2 results; details here', 'description': 'BSE share price jumped almost 7 per cent in early trade on the NSE on Wednesday, November 12, a day after the company reported its Q2 results for the financial year 2025-26.', 'content': 'ONLY AVAILABLE IN PAID PLANS', 'keywords': ['stock market news today', 'bse q2 results 2025', 'bse share price', 'bse share price trend', 'indian stock market'], 'creator': ['Nishant Kumar'], 'language': 'english', 'country': ['india'], 'category': ['business'], 'pubDate': '2025-11-12 04:14:48', 'pubDateTZ': 'UTC', 'image_url': 'https://www.livemint.com/lm-img/img/2025/11/12/1600x900/logo/Earnings_1759910110675_1759910115968_1762920839563.png', 'video_url': None, 'source_id'

In [20]:
data_ingestion.save_newsdata()

In [28]:
news_articles = data_ingestion.load_newsdata()