In [1]:
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import pandas as pd
import requests
import yfinance as yf

In [2]:
%pwd

'c:\\Users\\Admin\\PhD Projects\\balance_continuous\\research'

In [3]:
# Move from the notebook's research/ folder up to the project root so that the
# relative config/ and artifacts/ paths used below resolve. The guard makes the
# cell idempotent: re-running it no longer climbs one more directory each time.
if Path.cwd().name == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\Admin\\PhD Projects\\balance_continuous'

In [5]:
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage.

    Attributes:
        root_dir: Directory that holds the stage's artifacts.
        local_data_file: Location of the downloaded data file.
        tickers_url: Where the ticker list is read from, or None if unset.
        start_date: First date to request, or None if unset.
        interval: Bar size for the download; defaults to '1d'.
        auto_adjust: Whether prices should be auto-adjusted; defaults to False.
    """
    root_dir: Path
    local_data_file: Path
    # ConfigurationManager.get_data_ingestion_config passes the four fields
    # below as keyword arguments; without them construction raises TypeError.
    # They carry defaults so two-argument construction keeps working.
    tickers_url: Optional[str] = None
    start_date: Optional[str] = None
    interval: str = "1d"
    auto_adjust: bool = False

In [24]:
import pandas as pd
import requests
from io import StringIO

# Add proper headers to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

link = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies#S&P_500_component_stocks"

# Make the request with headers; the timeout stops the cell from hanging
# indefinitely on a stalled connection.
response = requests.get(link, headers=headers, timeout=30)
response.raise_for_status()  # Raises an exception for bad status codes

# Parse the HTML content; the first table on the page is used and its first
# row supplies the column names.
df = pd.read_html(StringIO(response.text), header=0)[0]

# Write one symbol per line. header=False states the intent explicitly
# (the previous header=0 only worked because 0 is falsy), and index=False
# keeps the row numbers out of the file so that column 0 holds the symbols.
df['Symbol'].to_csv('SandP500Ticker.csv', header=False, index=False)

In [19]:
# Preview the first rows of the parsed table to check the column layout.
df.head()

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


In [8]:
from sb_project.utils.common import read_yaml, create_directories

In [18]:
# Default locations of the YAML files that ConfigurationManager reads.
# Both are relative paths, so they resolve against the current working directory.
CONFIG_FILE_PATH = Path("config/config.yaml")
PARAMS_FILE_PATH = Path("config/params.yaml")

In [36]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects.

    The loaded documents are kept on ``self.config`` and ``self.params`` and
    are read with attribute access (for example ``self.config.artifacts_root``).
    """

    def __init__(
            self, 
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH
    ):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Path of the main configuration file.
            params_filepath: Path of the parameters file.
        """
        self.config = read_yaml(config_filepath) 
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the loaded configuration.

        Reads the ``data_ingestion`` and ``yfinance_config`` sections, creates
        the stage's root directory, and returns a ``DataIngestionConfig``.

        NOTE(review): ``DataIngestionConfig`` must declare ``tickers_url``,
        ``start_date``, ``interval`` and ``auto_adjust`` for this call to
        succeed.
        """
        config = self.config.data_ingestion
        yf_config = self.config.yfinance_config

        create_directories([config.root_dir])

        return DataIngestionConfig(
            # The YAML values are plain strings; convert them so the fields
            # match the Path annotations on DataIngestionConfig.
            root_dir=Path(config.root_dir),
            local_data_file=Path(config.local_data_file),
            tickers_url=yf_config.tickers,
            start_date=yf_config.start_date,
            interval=yf_config.interval,
            auto_adjust=yf_config.auto_adjust,
        )

In [40]:
# Smoke-check the loaded configuration: show the data_ingestion section and
# the configured ticker-list location.
config_manager = ConfigurationManager()
print(config_manager.config.data_ingestion)
print(config_manager.config.yfinance_config.tickers)

[2025-10-21 16:58:09,383: INFO: yaml file: config\config.yaml loaded successfully]
[2025-10-21 16:58:09,384: INFO: yaml file: config\params.yaml loaded successfully]
[2025-10-21 16:58:09,385: INFO: created directory at: artifacts]
{'root_dir': 'artifacts/data_ingestion', 'local_data_file': 'artifacts/data/sandp500.zip'}
https://raw.githubusercontent.com/krishna-das-m/balance_continuous/refs/heads/main/data/SandP500Ticker.csv


In [6]:
from curl_cffi import requests

In [None]:
def get_tickers_list(file)-> list:
    """Read ticker symbols from a header-less CSV.

    The symbols are taken from the LAST column. That covers both layouts the
    file can have: a single column of symbols, or an index column followed by
    the symbols (what ``Series.to_csv`` writes when the index is kept). Reading
    column 0 of the second layout would return the row numbers instead.

    Args:
        file: Path, URL or file-like object accepted by ``pd.read_csv``.

    Returns:
        list: Symbols as stripped strings, in file order, with blank entries
        removed.
    """
    # dtype=str / keep_default_na=False keep every symbol as literal text
    # instead of letting pandas coerce values to numbers or NaN.
    tickers = pd.read_csv(file, header=None, dtype=str, keep_default_na=False)
    symbols = tickers.iloc[:, -1].str.strip()
    return [symbol for symbol in symbols if symbol]

def download_data(ticker:str, start_date=None, interval:str='1d', auto_adjust:bool=False, session=None):
    """
    Download stock data from Yahoo Finance.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.download``.
    start_date : optional
        First date to request. ``None`` means ``'2000-01-01'``; any other
        value is converted with ``pd.to_datetime``.
    interval : str
        Bar size forwarded to ``yf.download`` (default ``'1d'``).
    auto_adjust : bool
        Forwarded to ``yf.download``. Defaults to ``False``, the value that
        used to be hard-coded.
    session : optional
        Session object handed to ``yf.download``. When omitted, a curl_cffi
        session impersonating Chrome is created for this call.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``yf.download``; if its columns are a
        MultiIndex, the second level is dropped.
    """
    if session is None:
        # Imported here under an alias because this notebook binds the bare
        # name `requests` to both the plain `requests` package and
        # `curl_cffi.requests`, depending on which cell ran last. Only the
        # curl_cffi Session accepts `impersonate`.
        from curl_cffi import requests as curl_requests
        session = curl_requests.Session(impersonate="chrome")

    start = '2000-01-01' if start_date is None else pd.to_datetime(start_date)
    data = yf.download(ticker, start=start, interval=interval, auto_adjust=auto_adjust, session=session)

    # Drop the second column level (index 1) when there is one; calling
    # droplevel on flat columns would raise.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.droplevel(1)
    return data

def ticker_data(ticker_list):
    """Download every ticker in ``ticker_list`` and stack the results.

    Each per-ticker frame has its index moved into a regular column and gets
    a ``Ticker`` column before stacking. Without that, concatenating with
    ``ignore_index=True`` throws the original index away and leaves no way to
    tell which rows belong to which symbol.
    (Presumably the index holds the bar dates — confirm against the frames
    ``download_data`` returns.)

    Args:
        ticker_list: Iterable of symbols passed one by one to ``download_data``.

    Returns:
        pandas.DataFrame: All non-empty downloads stacked under a fresh
        0..n-1 index, or an empty DataFrame when nothing was downloaded.
    """
    frames = []
    for ticker in ticker_list:
        ticker_history = download_data(ticker)
        # Skip symbols for which nothing came back.
        if ticker_history is None or ticker_history.empty:
            continue
        frames.append(ticker_history.reset_index().assign(Ticker=ticker))

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)

In [None]:
class DataIngestion:
    """Data-ingestion stage configured by a ``DataIngestionConfig``."""

    def __init__(self, config: DataIngestionConfig):
        """Store the stage settings on the instance."""
        self.config = config

    def download_data(self):
        """Placeholder for the download step; does nothing and returns None."""
        return None