In [1]:
from dataclasses import dataclass
from pathlib import Path
import os

import yfinance as yf
from curl_cffi import requests
import time
import pandas as pd

In [2]:
%pwd

'c:\\Users\\krishnadas\\Projects\\PhD Project\\balance_continuous\\research'

In [6]:
# Hand-picked tickers used for the quick download experiment below.
ticker_list = [
    'MSFT', 'AAPL', 'GOOG', 'NVDA', 'AMZN', 'META',
    'BRK-B', 'LLY', 'AVGO', 'V', 'JPM',
]

In [12]:
def get_tickers_list(file)-> list:
    """
    Return the first column of a header-less CSV as a plain Python list.

    `file` is handed straight to `pd.read_csv(..., header=None)`, so the
    first row of the file is treated as data, not as column names.
    """
    first_column = pd.read_csv(file, header=None)[0]
    return first_column.tolist()

def download_data(ticker:str, start_date=None, interval:str='1d', auto_adjust:bool=False):
    """
    Download the price history of one ticker from Yahoo Finance.

    Parameters
    ----------
    ticker : str
        Symbol passed to `yf.download`.
    start_date : optional
        First date to request. When None, '2000-01-01' is used; any other
        value is converted with `pd.to_datetime`.
    interval : str
        Bar interval forwarded to `yf.download` (default '1d').
    auto_adjust : bool
        Forwarded to `yf.download`. Defaults to False, which is the value
        this function previously hard-coded.

    Returns
    -------
    The frame returned by `yf.download`; when its columns are a MultiIndex
    the level at position 1 is removed so the columns are flat.
    """
    # Imported locally under an alias: a later cell in this notebook runs
    # `import requests`, which rebinds the notebook-global name `requests`
    # to a different library. Resolving curl_cffi here keeps this function
    # independent of cell execution order.
    from curl_cffi import requests as curl_requests

    session = curl_requests.Session(impersonate="chrome")
    if start_date is None:
        start_date = '2000-01-01'
    else:
        start_date = pd.to_datetime(start_date)
    data = yf.download(ticker, start=start_date, interval=interval, auto_adjust=auto_adjust, session=session)
    # droplevel(1) removes the SECOND column level (presumably the ticker
    # level in yfinance's default layout - verify against the installed
    # version). Guarded because flat columns have no level 1 to drop.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.droplevel(1)
    return data

def ticker_data(ticker_list):
    """
    Download every ticker in `ticker_list` and stack the results row-wise.

    For each ticker the frame returned by `download_data` gets five extra
    columns: 'Ticker', and 'Year' / 'Month' / 'Weekday' / 'Date' derived
    from its index (the index is expected to be datetime-like).

    Tickers whose download comes back empty are reported and skipped, so a
    failed symbol cannot break the calendar-column derivation or add an
    empty frame to the concatenation.

    Returns
    -------
    pd.DataFrame
        All non-empty downloads concatenated with a fresh 0..n-1 index
        (the dates remain available in the 'Date' column). The index is
        reset regardless of how many tickers succeeded. An empty DataFrame
        is returned when nothing was downloaded.
    """
    frames = []
    for i, ticker in enumerate(ticker_list):
        print(i, ticker)
        if i > 0:
            # sleep 1 sec between downloads - not to overload the API server
            time.sleep(1)
        ticker_history = download_data(ticker)
        if ticker_history.empty:
            print(f"  no data returned for {ticker}; skipping")
            continue

        ticker_history['Ticker'] = ticker
        ticker_history['Year'] = ticker_history.index.year
        ticker_history['Month'] = ticker_history.index.month
        ticker_history['Weekday'] = ticker_history.index.weekday
        ticker_history['Date'] = ticker_history.index.date
        frames.append(ticker_history)

    if not frames:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the accumulated frame on
    # every iteration.
    return pd.concat(frames, ignore_index=True)

In [11]:
# Store the result under its own name: assigning it to `ticker_data` would
# overwrite the function of the same name and make this cell fail on re-run.
ticker_history_df = ticker_data(ticker_list)

0 MSFT


[*********************100%***********************]  1 of 1 completed


1 AAPL


[*********************100%***********************]  1 of 1 completed


2 GOOG


[*********************100%***********************]  1 of 1 completed


3 NVDA


[*********************100%***********************]  1 of 1 completed


4 AMZN


[*********************100%***********************]  1 of 1 completed


5 META


[*********************100%***********************]  1 of 1 completed


6 BRK-B


[*********************100%***********************]  1 of 1 completed


7 LLY


[*********************100%***********************]  1 of 1 completed


8 AVGO


[*********************100%***********************]  1 of 1 completed


9 V


[*********************100%***********************]  1 of 1 completed


10 JPM


[*********************100%***********************]  1 of 1 completed


In [11]:
# Get project root (one level up from research/ or src/).
# "src" is accepted as well because a later cell changes the working
# directory to <project root>/src; without it, re-running this cell after
# that chdir would resolve the root to the src directory itself.
PROJECT_ROOT = Path(os.getcwd())
if PROJECT_ROOT.name in ("research", "src"):
    PROJECT_ROOT = PROJECT_ROOT.parent

# Both config files are anchored at the project root so they resolve to the
# same location regardless of the current working directory.
CONFIG_FILE_PATH = PROJECT_ROOT / "config" / "config.yaml"
PARAMS_FILE_PATH = PROJECT_ROOT / "config" / "params.yaml"

In [12]:
# Show the resolved project root and config file location, one per line.
print(PROJECT_ROOT, CONFIG_FILE_PATH, sep="\n")

c:\Users\krishnadas\Projects\PhD Project\balance_continuous
c:\Users\krishnadas\Projects\PhD Project\balance_continuous\config\config.yaml


In [14]:
%pwd

'c:\\Users\\krishnadas\\Projects\\PhD Project\\balance_continuous\\research'

In [15]:
# Switch to <project root>/src using the absolute PROJECT_ROOT instead of a
# cwd-relative string with a hard-coded Windows separator, so the cell gives
# the same result on any OS and from any current directory.
os.chdir(PROJECT_ROOT / "src")

In [16]:
import sys

src_path = PROJECT_ROOT / "src"
# Put src/ on the import path explicitly; otherwise `src_path` is unused and
# the import below only resolves when the working directory happens to be src/.
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
from sb_project.utils.common import read_yaml, create_directories

In [17]:
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of the settings the data-ingestion step reads."""

    # directory the ingestion step joins its parquet file name onto
    root_dir: Path
    # symbols that are downloaded one after another
    tickers: list[str]
    # first date to request; converted with pd.to_datetime before use
    start_date: str
    # bar interval handed to yf.download
    interval: str
    # auto-adjust flag taken from the yfinance section of the config
    auto_adjust: bool

In [28]:
# attribute: config parsed from the config file path
# method: get the data-ingestion configs from the file
class ConfigurationManager:
    """
    Loads the YAML config once and builds typed config objects from it.

    Constructing the manager reads the file with `read_yaml` and calls
    `create_directories` for the configured `artifacts_root`.
    """

    def __init__(self, config_filepath = CONFIG_FILE_PATH):
        parsed = read_yaml(config_filepath)
        self.config = parsed

        create_directories([parsed.artifacts_root])

    def get_data_ingestion_config(self):
        """
        Build the `DataIngestionConfig` for the ingestion step.

        `root_dir` comes from the `data_ingestion` section (and is created
        via `create_directories`); tickers, start date, interval and
        auto-adjust flag come from the `yfinance_config` section, with the
        tickers read from its `financials` entry.
        """
        ingestion_section = self.config.data_ingestion
        yfinance_section = self.config.yfinance_config
        create_directories([ingestion_section.root_dir])
        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            tickers=yfinance_section.financials,
            start_date=yfinance_section.start_date,
            interval=yfinance_section.interval,
            auto_adjust=yfinance_section.auto_adjust,
        )


In [33]:
class DataIngestion:
    """
    Downloads price history for the configured tickers and persists it.

    Attributes
    ----------
    config
        Settings object; the methods read `tickers`, `start_date`,
        `interval`, `auto_adjust` and `root_dir` from it.
    ticker_df
        Combined frame, None until `get_ticker_data()` or `load()` ran.
    """

    # Seconds to wait between two consecutive Yahoo Finance requests.
    REQUEST_PAUSE_SECONDS = 1
    # File name, inside config.root_dir, shared by save_file() and load().
    OUTPUT_FILENAME = 'tickers_financial.parquet'

    def __init__(self, config: "DataIngestionConfig"):
        self.config = config
        self.ticker_df = None

    def download_data(self, ticker):
        """
        Download the price history of one ticker from Yahoo Finance.

        Start date, interval and auto-adjust flag are taken from the config.
        When the returned columns are a MultiIndex, the level at position 1
        is removed so the columns are flat.
        """
        # Imported locally under an alias: a later cell in this notebook runs
        # `import requests`, which rebinds the notebook-global name `requests`
        # to a different library. Resolving curl_cffi here keeps the method
        # independent of cell execution order.
        from curl_cffi import requests as curl_requests

        session = curl_requests.Session(impersonate="chrome")
        start_date = pd.to_datetime(self.config.start_date)
        data = yf.download(
            ticker,
            start=start_date,
            interval=self.config.interval,
            # Honour the configured flag; it used to be ignored in favour of
            # a hard-coded False.
            auto_adjust=self.config.auto_adjust,
            session=session,
        )
        # droplevel(1) removes the SECOND column level (presumably the ticker
        # level in yfinance's default layout - verify against the installed
        # version). Guarded because flat columns have no level 1 to drop.
        if isinstance(data.columns, pd.MultiIndex):
            data.columns = data.columns.droplevel(1)
        return data

    def get_ticker_data(self):
        """
        Download all configured tickers and store the stacked result.

        Each non-empty download gets 'Ticker', 'Year', 'Month', 'Weekday'
        and 'Date' columns (the last four derived from its index). Empty
        downloads, e.g. a symbol Yahoo Finance does not recognise, are
        reported and skipped. The result is written to `self.ticker_df`
        with a fresh 0..n-1 index; it is an empty DataFrame when nothing
        was downloaded.
        """
        frames = []
        for i, ticker in enumerate(self.config.tickers):
            print(i, ticker)
            if i > 0:
                # pause between downloads - not to overload the API server
                time.sleep(self.REQUEST_PAUSE_SECONDS)
            ticker_history = self.download_data(ticker)
            if ticker_history.empty:
                print(f"  no data returned for {ticker}; skipping")
                continue

            ticker_history['Ticker'] = ticker
            ticker_history['Year'] = ticker_history.index.year.astype(int)
            ticker_history['Month'] = ticker_history.index.month.astype(int)
            ticker_history['Weekday'] = ticker_history.index.weekday.astype(int)
            ticker_history['Date'] = pd.to_datetime(ticker_history.index.date)
            frames.append(ticker_history)

        # One concat at the end instead of re-copying the accumulated frame
        # on every iteration.
        self.ticker_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    def _output_path(self):
        """Return the parquet path inside the configured root directory."""
        return os.path.join(self.config.root_dir, self.OUTPUT_FILENAME)

    def save_file(self):
        """
        Write `self.ticker_df` to the parquet file (index not stored).

        Raises
        ------
        ValueError
            If no data has been downloaded or loaded yet.
        """
        if self.ticker_df is None:
            raise ValueError("No data to save: call get_ticker_data() or load() first.")
        self.ticker_df.to_parquet(self._output_path(), engine='fastparquet', compression='brotli', index=False)

    def load(self):
        """Read the parquet file back into `self.ticker_df`."""
        self.ticker_df = pd.read_parquet(self._output_path())

In [30]:
%pwd

'c:\\Users\\krishnadas\\Projects\\PhD Project\\balance_continuous\\src'

In [34]:
# Load the config file; constructing the manager also creates the artifacts root directory.
config_manager = ConfigurationManager()

# Build the frozen settings object for the ingestion step.
data_ingestion_config = config_manager.get_data_ingestion_config()

# Download every configured ticker; the combined frame is stored on the instance.
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.get_ticker_data()
# NOTE: this rebinds the notebook-level name `ticker_data`, which an earlier
# cell defines as a function.
ticker_data = data_ingestion.ticker_df


[2025-11-10 16:13:09,415: INFO: yaml file: c:\Users\krishnadas\Projects\PhD Project\balance_continuous\config\config.yaml loaded successfully]
[2025-11-10 16:13:09,418: INFO: created directory at: artifacts]
[2025-11-10 16:13:09,420: INFO: created directory at: artifacts/data_ingestion]
0 ACGL


[*********************100%***********************]  1 of 1 completed


1 AFL


[*********************100%***********************]  1 of 1 completed


2 AIG


[*********************100%***********************]  1 of 1 completed


3 AIZ


[*********************100%***********************]  1 of 1 completed


4 AJG


[*********************100%***********************]  1 of 1 completed


5 ALL


[*********************100%***********************]  1 of 1 completed


6 AMP


[*********************100%***********************]  1 of 1 completed


7 AON


[*********************100%***********************]  1 of 1 completed


8 APO


[*********************100%***********************]  1 of 1 completed


9 AXP


[*********************100%***********************]  1 of 1 completed


10 BAC


[*********************100%***********************]  1 of 1 completed


11 BEN


[*********************100%***********************]  1 of 1 completed


12 BK


[*********************100%***********************]  1 of 1 completed


13 BLK


[*********************100%***********************]  1 of 1 completed


14 BRK.B


[*********************100%***********************]  1 of 1 completed

[2025-11-10 16:13:30,030: ERROR: 
1 Failed download:]
[2025-11-10 16:13:30,032: ERROR: ['BRK.B']: YFTzMissingError('possibly delisted; no timezone found')]





15 BRO


[*********************100%***********************]  1 of 1 completed


16 BX


[*********************100%***********************]  1 of 1 completed


17 C


[*********************100%***********************]  1 of 1 completed


18 CB


[*********************100%***********************]  1 of 1 completed


19 CBOE


[*********************100%***********************]  1 of 1 completed


20 CFG


[*********************100%***********************]  1 of 1 completed


21 CINF


[*********************100%***********************]  1 of 1 completed


22 CME


[*********************100%***********************]  1 of 1 completed


23 COF


[*********************100%***********************]  1 of 1 completed


24 COIN


[*********************100%***********************]  1 of 1 completed


25 CPAY


[*********************100%***********************]  1 of 1 completed


26 EG


[*********************100%***********************]  1 of 1 completed


27 ERIE


[*********************100%***********************]  1 of 1 completed


28 FDS


[*********************100%***********************]  1 of 1 completed


29 FI


[*********************100%***********************]  1 of 1 completed


30 FIS


[*********************100%***********************]  1 of 1 completed


31 FITB


[*********************100%***********************]  1 of 1 completed


32 GL


[*********************100%***********************]  1 of 1 completed


33 GPN


[*********************100%***********************]  1 of 1 completed


34 GS


[*********************100%***********************]  1 of 1 completed


35 HBAN


[*********************100%***********************]  1 of 1 completed


36 HIG


[*********************100%***********************]  1 of 1 completed


37 HOOD


[*********************100%***********************]  1 of 1 completed


38 IBKR


[*********************100%***********************]  1 of 1 completed


39 ICE


[*********************100%***********************]  1 of 1 completed


40 IVZ


[*********************100%***********************]  1 of 1 completed


41 JKHY


[*********************100%***********************]  1 of 1 completed


42 JPM


[*********************100%***********************]  1 of 1 completed


43 KEY


[*********************100%***********************]  1 of 1 completed


44 KKR


[*********************100%***********************]  1 of 1 completed


45 L


[*********************100%***********************]  1 of 1 completed


46 MA


[*********************100%***********************]  1 of 1 completed


47 MCO


[*********************100%***********************]  1 of 1 completed


48 MET


[*********************100%***********************]  1 of 1 completed


49 MMC


[*********************100%***********************]  1 of 1 completed


50 MS


[*********************100%***********************]  1 of 1 completed


51 MSCI


[*********************100%***********************]  1 of 1 completed


52 MTB


[*********************100%***********************]  1 of 1 completed


53 NDAQ


[*********************100%***********************]  1 of 1 completed


54 NTRS


[*********************100%***********************]  1 of 1 completed


55 PFG


[*********************100%***********************]  1 of 1 completed


56 PGR


[*********************100%***********************]  1 of 1 completed


57 PNC


[*********************100%***********************]  1 of 1 completed


58 PRU


[*********************100%***********************]  1 of 1 completed


59 PYPL


[*********************100%***********************]  1 of 1 completed


60 RF


[*********************100%***********************]  1 of 1 completed


61 RJF


[*********************100%***********************]  1 of 1 completed


62 SCHW


[*********************100%***********************]  1 of 1 completed


63 SPGI


[*********************100%***********************]  1 of 1 completed


64 STT


[*********************100%***********************]  1 of 1 completed


65 SYF


[*********************100%***********************]  1 of 1 completed


66 TFC


[*********************100%***********************]  1 of 1 completed


67 TROW


[*********************100%***********************]  1 of 1 completed


68 TRV


[*********************100%***********************]  1 of 1 completed


69 USB


[*********************100%***********************]  1 of 1 completed


70 V


[*********************100%***********************]  1 of 1 completed


71 WFC


[*********************100%***********************]  1 of 1 completed


72 WRB


[*********************100%***********************]  1 of 1 completed


73 WTW


[*********************100%***********************]  1 of 1 completed


74 XYZ


[*********************100%***********************]  1 of 1 completed


In [35]:
# Write the combined frame to 'tickers_financial.parquet' inside the configured root_dir.
data_ingestion.save_file()

In [36]:
# Read the parquet file back and expose the reloaded frame as `ticker_data`.
data_ingestion.load()
ticker_data = data_ingestion.ticker_df

In [22]:
# Check what kind of object `ticker_data` currently holds.
print(type(ticker_data))

<class 'pandas.core.frame.DataFrame'>


In [6]:
import pandas as pd
from scipy.stats import kendalltau
import yfinance as yf # Or another data source

# Step 1: Get stock data
ticker1 = 'AAPL'
ticker2 = 'MSFT'
# auto_adjust is passed explicitly so the result does not depend on the
# library default, which yfinance has changed between releases.
# NOTE(review): True is presumably what produced the recorded output -
# confirm against the installed yfinance version.
data = yf.download([ticker1, ticker2], start='2022-01-01', end='2024-01-01', auto_adjust=True)

# Step 2: Prepare data
# Keep only the dates on which both tickers have a close: kendalltau's
# default nan_policy propagates NaN into the result.
close_prices = data['Close'][[ticker1, ticker2]].dropna()
prices1 = close_prices[ticker1]
prices2 = close_prices[ticker2]

# Step 3: Calculate correlation
# NOTE(review): this ranks price LEVELS; two trending series can look
# strongly related for that reason alone. Consider returns
# (close_prices.pct_change().dropna()) if co-movement is the question.
correlation, p_value = kendalltau(prices1, prices2)

# Step 4: Interpret results
print(f"Kendall's Tau correlation: {correlation}")
print(f"P-value: {p_value}")

# You can also calculate a full correlation matrix for multiple stocks.
# data['Close'] already has one column per ticker, so .corr() applies directly:
# correlation_matrix = data['Close'].corr(method='kendall')
# print(correlation_matrix)

  data = yf.download([ticker1, ticker2], start='2022-01-01', end='2024-01-01')
[*********************100%***********************]  2 of 2 completed

Kendall's Tau correlation: 0.8027737745546579
P-value: 6.847450485803656e-159





In [2]:
#!/usr/bin/env python3
"""
sp500_by_sector.py

Downloads the current "List of S&P 500 companies" table from Wikipedia,
groups tickers by GICS sector, and saves the mapping to sp500_by_sector.json.

Output:
 - prints each of the 11 GICS sectors and the list of tickers in each
 - saves JSON file with mapping {"Information Technology": [...], ...}
"""

import json
import sys

import pandas as pd
import requests

# Wikipedia page whose first table is parsed for the S&P 500 constituents.
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
# Output file for the sector -> tickers mapping; the relative path resolves against the current working directory.
OUT_JSON = "sp500_by_sector.json"

def fetch_sp500_table(url: str) -> pd.DataFrame:
    """
    Fetch the main S&P 500 companies table from Wikipedia and return as DataFrame.

    Parameters
    ----------
    url : str
        Page to download (30 second timeout).

    Returns
    -------
    pd.DataFrame
        A copy of the first table parsed from the page.

    Raises
    ------
    requests.HTTPError
        If the response status is an error (via `raise_for_status`).
    RuntimeError
        If the parsed table list is empty.
    """
    from io import StringIO

    # Add headers to avoid 403 Forbidden
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    # Wrap the markup in a file-like object: passing literal HTML text to
    # read_html is deprecated in recent pandas and emits a FutureWarning.
    tables = pd.read_html(StringIO(resp.text))
    if not tables:
        raise RuntimeError("No tables found on the Wikipedia page.")
    # The first table on the page is the S&P 500 constituents table
    df = tables[0].copy()
    return df

def normalize_symbol_col(df: pd.DataFrame) -> str:
    """
    Return the DataFrame column name that contains tickers.
    Accept common variants used across sources.

    Known header names are tried first. Failing that, the first column whose
    first value looks like a ticker is returned: 1-6 characters, unchanged by
    upper-casing, and containing at least one letter (so purely numeric
    columns such as ids or prices are not mistaken for tickers).

    Raises
    ------
    KeyError
        If no column qualifies. The message lists the available columns.
    """
    possible_names = ["Symbol", "Ticker symbol", "Ticker", "Symbol[1]"]
    for name in possible_names:
        if name in df.columns:
            return name
    # fallback: first column typically contains ticker-like values; try heuristics
    for col in df.columns:
        # simple check: many tickers are short uppercase strings
        sample = str(df[col].iloc[0]) if len(df) > 0 else ""
        looks_like_ticker = (
            1 <= len(sample) <= 6
            and sample.upper() == sample
            and any(ch.isalpha() for ch in sample)
        )
        if looks_like_ticker:
            return col
    # Column labels may be non-strings (e.g. integer positions), so convert
    # them before joining; otherwise this raises TypeError instead of KeyError.
    raise KeyError("Could not find the ticker/symbol column in the table. Columns: " + ", ".join(map(str, df.columns)))

def normalize_sector_col(df: pd.DataFrame) -> str:
    """
    Return the DataFrame column name that contains the GICS sector.
    Common names: 'GICS Sector' or 'Sector' or similar.

    Raises
    ------
    KeyError
        If none of the known names is present. The message lists the
        available columns.
    """
    possible = ["GICS Sector", "GICS sector", "Sector"]
    for name in possible:
        if name in df.columns:
            return name
    # Column labels may be non-strings (e.g. integer positions), so convert
    # them before joining; otherwise this raises TypeError instead of KeyError.
    raise KeyError("Could not find the GICS Sector column. Columns: " + ", ".join(map(str, df.columns)))

def build_sector_mapping(df: pd.DataFrame) -> dict:
    """
    Build a dict mapping 'GICS Sector' -> list of tickers (strings).

    The ticker and sector columns are located with `normalize_symbol_col`
    and `normalize_sector_col`. Values are converted to strings and
    stripped; each sector's tickers are de-duplicated and sorted.

    The input frame is left untouched: the cleaned values live in local
    Series rather than in helper columns added to `df`.
    """
    sym_col = normalize_symbol_col(df)
    sector_col = normalize_sector_col(df)

    # sanitize tickers (strip whitespace)
    tickers = df[sym_col].astype(str).str.strip()
    sectors = df[sector_col].astype(str).str.strip()

    # Both Series share df's index, so grouping one by the other pairs each
    # ticker with the sector from its own row.
    mapping = tickers.groupby(sectors).apply(lambda s: sorted(set(s))).to_dict()
    return mapping

def filter_to_11_sectors(mapping: dict) -> dict:
    """
    Project `mapping` onto the 11 canonical S&P/GICS sectors.

    The result always has exactly those 11 keys, in the fixed order below.
    A sector missing from `mapping` gets an empty list; keys of `mapping`
    outside the canonical set are dropped.
    """
    canonical_order = (
        "Information Technology",
        "Financials",
        "Health Care",
        "Consumer Discretionary",
        "Communication Services",
        "Industrials",
        "Consumer Staples",
        "Energy",
        "Real Estate",
        "Materials",
        "Utilities",
    )
    out = {}
    for sec in canonical_order:
        out[sec] = mapping.get(sec, [])
    return out

def main():
    """
    Fetch the constituents table, group tickers by sector, save and print.

    A failure while fetching/parsing, or while building the mapping, is
    reported on stderr and ends the run with exit status 1. On success the
    11-sector mapping is written to OUT_JSON and echoed to stdout.
    """
    try:
        df = fetch_sp500_table(WIKI_URL)
    except Exception as fetch_error:
        print("Error fetching/parsing S&P 500 table:", fetch_error, file=sys.stderr)
        sys.exit(1)

    try:
        mapping = build_sector_mapping(df)
    except Exception as mapping_error:
        print("Error building mapping:", mapping_error, file=sys.stderr)
        print("Available columns:", df.columns.tolist(), file=sys.stderr)
        sys.exit(1)

    # Restrict to the 11 canonical sectors, in canonical order
    sector_map_11 = filter_to_11_sectors(mapping)

    # Persist the mapping as JSON
    with open(OUT_JSON, "w", encoding="utf-8") as json_file:
        json.dump(sector_map_11, json_file, indent=2, ensure_ascii=False)

    # Summary header, then one heading line and one ticker line per sector
    print(f"Saved mapping for 11 sectors to: {OUT_JSON}\n")
    for sector_name, symbols in sector_map_11.items():
        print(f"{sector_name} ({len(symbols)}):")
        if symbols:
            # all tickers on one comma-separated line (might be long)
            sector_line = "  " + ", ".join(symbols) + "\n"
        else:
            sector_line = "  (no tickers found for this sector in the table)\n"
        print(sector_line)

# Run the script when this code is executed as the main module.
if __name__ == "__main__":
    main()


  tables = pd.read_html(resp.text)


Saved mapping for 11 sectors to: sp500_by_sector.json

Information Technology (69):
  AAPL, ACN, ADBE, ADI, ADSK, AKAM, AMAT, AMD, ANET, APH, APP, AVGO, CDNS, CDW, CRM, CRWD, CSCO, CTSH, DDOG, DELL, EPAM, FFIV, FICO, FSLR, FTNT, GDDY, GEN, GLW, HPE, HPQ, IBM, INTC, INTU, IT, JBL, KEYS, KLAC, LRCX, MCHP, MPWR, MSFT, MSI, MU, NOW, NTAP, NVDA, NXPI, ON, ORCL, PANW, PLTR, PTC, Q, QCOM, ROP, SMCI, SNPS, STX, SWKS, TDY, TEL, TER, TRMB, TXN, TYL, VRSN, WDAY, WDC, ZBRA

Financials (75):
  ACGL, AFL, AIG, AIZ, AJG, ALL, AMP, AON, APO, AXP, BAC, BEN, BK, BLK, BRK.B, BRO, BX, C, CB, CBOE, CFG, CINF, CME, COF, COIN, CPAY, EG, ERIE, FDS, FI, FIS, FITB, GL, GPN, GS, HBAN, HIG, HOOD, IBKR, ICE, IVZ, JKHY, JPM, KEY, KKR, L, MA, MCO, MET, MMC, MS, MSCI, MTB, NDAQ, NTRS, PFG, PGR, PNC, PRU, PYPL, RF, RJF, SCHW, SPGI, STT, SYF, TFC, TROW, TRV, USB, V, WFC, WRB, WTW, XYZ

Health Care (60):
  A, ABBV, ABT, ALGN, AMGN, BAX, BDX, BIIB, BMY, BSX, CAH, CI, CNC, COO, COR, CRL, CVS, DGX, DHR, DVA, DXCM, ELV, EW,