In [1]:
from dataclasses import dataclass
from pathlib import Path
import os

import yfinance as yf
from curl_cffi import requests
import time
import pandas as pd

In [2]:
%pwd

'c:\\Users\\Admin\\PhD Projects\\balance_continuous\\research'

In [3]:
# Work from the sibling "src" directory so that the relative paths used in
# later cells (e.g. config/config.yaml) resolve against it.
os.chdir("../src")

In [4]:
%pwd

'c:\\Users\\Admin\\PhD Projects\\balance_continuous\\src'

In [6]:
# Hand-picked ticker symbols passed to the standalone ticker_data() helper.
ticker_list = ['MSFT', 'AAPL', 'GOOG', 'NVDA', 'AMZN', 'META', 'BRK-B', 'LLY', 'AVGO','V', 'JPM']

In [10]:
def get_tickers_list(file) -> list:
    """
    Return the first column of a header-less CSV as a plain Python list.

    `file` is handed straight to `pd.read_csv`, so it may be a path or a
    file-like object. Every row is treated as data (`header=None`).
    """
    first_column = pd.read_csv(file, header=None)[0]
    return first_column.tolist()

def download_data(ticker:str, start_date=None, interval:str='1d', auto_adjust:bool=False):
    """
    Download price history for one ticker from Yahoo Finance.

    Parameters:
        ticker: symbol passed to `yf.download`.
        start_date: first date to request; anything `pd.to_datetime` accepts.
            Defaults to '2000-01-01' when omitted.
        interval: bar size passed to `yf.download` (default '1d').
        auto_adjust: forwarded to `yf.download`; defaults to False, the value
            that used to be hard-coded here.

    Returns:
        The DataFrame produced by `yf.download`, with single-level columns.
    """
    # curl_cffi session that impersonates Chrome; handed to yfinance for its requests.
    session = requests.Session(impersonate="chrome")
    # Normalise both the default and a caller-supplied value to the same type.
    start = pd.to_datetime('2000-01-01' if start_date is None else start_date)
    data = yf.download(ticker, start=start, interval=interval, auto_adjust=auto_adjust, session=session)
    # Drop the second column level (position 1 -- presumably the ticker symbol)
    # only when it exists; a frame with flat columns is returned unchanged
    # instead of raising.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.droplevel(1)
    return data

def ticker_data(ticker_list):
    """
    Download every ticker in `ticker_list` and stack the results in one frame.

    Each ticker's history comes from `download_data(ticker)` and is tagged with
    a 'Ticker' column plus 'Year', 'Month', 'Weekday' and 'Date' columns derived
    from its index. Tickers whose download is empty are reported and skipped.

    Returns:
        One DataFrame with a fresh 0..n-1 index, or an empty DataFrame when
        nothing was downloaded.
    """
    frames = []
    for i, ticker in enumerate(ticker_list):
        print(i, ticker)
        # sleep 1 sec between downloads - not to overload the API server
        if i:
            time.sleep(1)
        ticker_history = download_data(ticker)
        if ticker_history.empty:
            # Nothing to derive calendar columns from; leave this ticker out.
            print(f"no data returned for {ticker}; skipping")
            continue

        ticker_history['Ticker'] = ticker
        ticker_history['Year'] = ticker_history.index.year
        ticker_history['Month'] = ticker_history.index.month
        ticker_history['Weekday'] = ticker_history.index.weekday
        ticker_history['Date'] = ticker_history.index.date
        frames.append(ticker_history)

    if not frames:
        return pd.DataFrame()
    # Concatenate once: linear in the number of rows, and the index is reset
    # whether one ticker or many were requested.
    return pd.concat(frames, ignore_index=True)

In [11]:
# Bind the result to its own name: assigning it to `ticker_data` would replace
# the function of that name and make this cell fail when it is run again.
ticker_history_df = ticker_data(ticker_list)

0 MSFT


[*********************100%***********************]  1 of 1 completed


1 AAPL


[*********************100%***********************]  1 of 1 completed


2 GOOG


[*********************100%***********************]  1 of 1 completed


3 NVDA


[*********************100%***********************]  1 of 1 completed


4 AMZN


[*********************100%***********************]  1 of 1 completed


5 META


[*********************100%***********************]  1 of 1 completed


6 BRK-B


[*********************100%***********************]  1 of 1 completed


7 LLY


[*********************100%***********************]  1 of 1 completed


8 AVGO


[*********************100%***********************]  1 of 1 completed


9 V


[*********************100%***********************]  1 of 1 completed


10 JPM


[*********************100%***********************]  1 of 1 completed


In [5]:
from sb_project.utils.common import read_yaml, create_directories

In [5]:
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable bundle of settings for the data-ingestion step."""

    root_dir: Path  # directory that holds the ingested data file
    tickers: list[str]  # symbols to download
    start_date: str  # first date to request
    interval: str  # bar size used for the download
    auto_adjust: bool  # auto_adjust value taken from the yfinance config section

In [7]:
# Locations of the project's YAML files, relative to the current working directory.
CONFIG_FILE_PATH = Path("config/config.yaml")
# NOTE(review): not referenced by any other cell visible in this notebook.
PARAMS_FILE_PATH = Path("config/params.yaml")

In [8]:
class ConfigurationManager:
    """Loads the YAML configuration and builds config objects from it."""

    def __init__(self, config_filepath = CONFIG_FILE_PATH):
        """
        Read the configuration file and create the artifacts root directory.

        Parameters:
            config_filepath: path handed to `read_yaml`; defaults to
                CONFIG_FILE_PATH.
        """
        self.config = read_yaml(config_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Build the DataIngestionConfig from the loaded configuration.

        Reads the `data_ingestion` and `yfinance_config` sections, creates the
        ingestion root directory, and returns the populated config object.
        """
        config = self.config.data_ingestion
        yf_config = self.config.yfinance_config
        create_directories([config.root_dir])
        return DataIngestionConfig(
            # The field is annotated as Path, so convert the raw config value.
            root_dir=Path(config.root_dir),
            tickers=yf_config.tickers,
            start_date=yf_config.start_date,
            interval=yf_config.interval,
            auto_adjust=yf_config.auto_adjust,
        )


In [16]:
class DataIngestion:
    """Downloads price history for the configured tickers and stores it as parquet."""

    # File name, inside config.root_dir, shared by save_file() and load().
    _PARQUET_NAME = 'tickers_df.parquet'

    def __init__(self, config: "DataIngestionConfig"):
        """Keep the ingestion settings; no data is fetched until asked for."""
        self.config = config
        # Combined frame for all tickers; set by get_ticker_data() or load().
        self.ticker_df = None

    def _parquet_path(self):
        """Return the parquet file location under the configured root directory."""
        return os.path.join(self.config.root_dir, self._PARQUET_NAME)

    def download_data(self, ticker):
        """
        Download price history for one ticker from Yahoo Finance.

        Start date, interval and auto_adjust all come from the config.
        Returns the DataFrame produced by `yf.download` with single-level
        columns.
        """
        # curl_cffi session that impersonates Chrome; handed to yfinance for its requests.
        session = requests.Session(impersonate="chrome")
        data = yf.download(
            ticker,
            start=pd.to_datetime(self.config.start_date),
            interval=self.config.interval,
            # Honour the configured flag instead of a hard-coded False.
            auto_adjust=self.config.auto_adjust,
            session=session,
        )
        # Drop the second column level (position 1 -- presumably the ticker
        # symbol) only when it exists, so flat columns do not raise.
        if isinstance(data.columns, pd.MultiIndex):
            data.columns = data.columns.droplevel(1)
        return data

    def get_ticker_data(self):
        """
        Download every configured ticker and store the stacked result.

        Each ticker's frame is tagged with 'Ticker', 'Year', 'Month', 'Weekday'
        and 'Date' columns derived from its index. Tickers whose download is
        empty are reported and skipped. The combined frame, with a fresh
        0..n-1 index, is stored in `self.ticker_df`.
        """
        frames = []
        for i, ticker in enumerate(self.config.tickers):
            print(i, ticker)
            # sleep 1 sec between downloads - not to overload the API server
            if i:
                time.sleep(1)
            ticker_history = self.download_data(ticker)
            if ticker_history.empty:
                # Nothing to derive calendar columns from; leave this ticker out.
                print(f"no data returned for {ticker}; skipping")
                continue

            ticker_history['Ticker'] = ticker
            ticker_history['Year'] = ticker_history.index.year.astype(int)
            ticker_history['Month'] = ticker_history.index.month.astype(int)
            ticker_history['Weekday'] = ticker_history.index.weekday.astype(int)
            ticker_history['Date'] = pd.to_datetime(ticker_history.index.date)
            frames.append(ticker_history)

        # Concatenate once: linear in the number of rows, and the index is
        # reset whether one ticker or many were configured.
        self.ticker_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    def save_file(self):
        """
        Write `self.ticker_df` to the parquet file (fastparquet, brotli, no index).

        Raises:
            ValueError: if no data has been downloaded or loaded yet.
        """
        if self.ticker_df is None:
            raise ValueError("No data to save: call get_ticker_data() or load() first.")
        self.ticker_df.to_parquet(self._parquet_path(), engine='fastparquet',compression='brotli', index=False)

    def load(self):
        """Read the parquet file back into `self.ticker_df` (pandas' default engine)."""
        self.ticker_df = pd.read_parquet(self._parquet_path())

In [23]:
# Load the YAML configuration (uses the default CONFIG_FILE_PATH).
config_manager = ConfigurationManager()

# Build the ingestion settings from the loaded configuration.
data_ingestion_config = config_manager.get_data_ingestion_config()

data_ingestion = DataIngestion(config=data_ingestion_config)
# Download every configured ticker; the combined frame is stored on the object.
data_ingestion.get_ticker_data()
# NOTE(review): `ticker_data` is also the name of a function defined in an
# earlier cell; this assignment rebinds it to the DataFrame.
ticker_data = data_ingestion.ticker_df


[2025-10-22 13:29:52,402: INFO: yaml file: config\config.yaml loaded successfully]
[2025-10-22 13:29:52,403: INFO: created directory at: artifacts]
[2025-10-22 13:29:52,404: INFO: created directory at: artifacts/data_ingestion]
0 AAPL


[*********************100%***********************]  1 of 1 completed


1 MSFT


[*********************100%***********************]  1 of 1 completed


2 NVDA


[*********************100%***********************]  1 of 1 completed


3 GOOGL


[*********************100%***********************]  1 of 1 completed


4 AMZN


[*********************100%***********************]  1 of 1 completed


5 META


[*********************100%***********************]  1 of 1 completed


6 BRK-B


[*********************100%***********************]  1 of 1 completed


7 AVGO


[*********************100%***********************]  1 of 1 completed


8 LLY


[*********************100%***********************]  1 of 1 completed


9 TSLA


[*********************100%***********************]  1 of 1 completed


10 WMT


[*********************100%***********************]  1 of 1 completed


11 JPM


[*********************100%***********************]  1 of 1 completed


12 V


[*********************100%***********************]  1 of 1 completed


13 UNH


[*********************100%***********************]  1 of 1 completed


14 XOM


[*********************100%***********************]  1 of 1 completed


15 MA


[*********************100%***********************]  1 of 1 completed


16 ORCL


[*********************100%***********************]  1 of 1 completed


17 COST


[*********************100%***********************]  1 of 1 completed


18 HD


[*********************100%***********************]  1 of 1 completed


19 PG


[*********************100%***********************]  1 of 1 completed


20 NFLX


[*********************100%***********************]  1 of 1 completed


21 JNJ


[*********************100%***********************]  1 of 1 completed


22 BAC


[*********************100%***********************]  1 of 1 completed


23 ABBV


[*********************100%***********************]  1 of 1 completed


24 CRM


[*********************100%***********************]  1 of 1 completed


25 KO


[*********************100%***********************]  1 of 1 completed


26 CVX


[*********************100%***********************]  1 of 1 completed


27 MRK


[*********************100%***********************]  1 of 1 completed


28 AMD


[*********************100%***********************]  1 of 1 completed


29 ADBE


[*********************100%***********************]  1 of 1 completed


30 PEP


[*********************100%***********************]  1 of 1 completed


31 TMO


[*********************100%***********************]  1 of 1 completed


32 WFC


[*********************100%***********************]  1 of 1 completed


33 CSCO


[*********************100%***********************]  1 of 1 completed


34 MCD


[*********************100%***********************]  1 of 1 completed


35 ACN


[*********************100%***********************]  1 of 1 completed


36 LIN


[*********************100%***********************]  1 of 1 completed


37 ABT


[*********************100%***********************]  1 of 1 completed


38 DHR


[*********************100%***********************]  1 of 1 completed


39 TXN


[*********************100%***********************]  1 of 1 completed


40 PM


[*********************100%***********************]  1 of 1 completed


41 VZ


[*********************100%***********************]  1 of 1 completed


42 QCOM


[*********************100%***********************]  1 of 1 completed


43 IBM


[*********************100%***********************]  1 of 1 completed


44 INTU


[*********************100%***********************]  1 of 1 completed


45 NOW


[*********************100%***********************]  1 of 1 completed


46 DIS


[*********************100%***********************]  1 of 1 completed


47 NEE


[*********************100%***********************]  1 of 1 completed


48 CAT


[*********************100%***********************]  1 of 1 completed


49 GE


[*********************100%***********************]  1 of 1 completed


In [24]:
# Persist the combined frame as tickers_df.parquet under the configured root_dir.
data_ingestion.save_file()

In [25]:
# Read tickers_df.parquet back from disk and work with the reloaded frame.
data_ingestion.load()
ticker_data = data_ingestion.ticker_df

In [22]:
# Sanity check: show what kind of object `ticker_data` currently refers to.
print(type(ticker_data))

<class 'pandas.core.frame.DataFrame'>


In [6]:
import pandas as pd
from scipy.stats import kendalltau
import yfinance as yf # Or another data source

# Step 1: Download price history for the two tickers being compared.
ticker1 = 'AAPL'
ticker2 = 'MSFT'
# auto_adjust is passed explicitly. The recorded run of this cell emitted a
# warning on this call -- presumably yfinance's notice that the auto_adjust
# default changed to True (TODO confirm) -- so the value is pinned here.
data = yf.download([ticker1, ticker2], start='2022-01-01', end='2024-01-01', auto_adjust=True)

# Step 2: Keep only the dates on which both tickers have a closing price, so
# the two series are aligned and free of NaN before they are compared.
closes = data['Close'][[ticker1, ticker2]].dropna()
prices1 = closes[ticker1]
prices2 = closes[ticker2]

# Step 3: Kendall's tau rank correlation between the two closing-price series.
correlation, p_value = kendalltau(prices1, prices2)

# Step 4: Report the statistic and its p-value.
print(f"Kendall's Tau correlation: {correlation}")
print(f"P-value: {p_value}")

# A full correlation matrix for several tickers can be computed the same way:
# data['Close'] has one column per ticker, so .corr(method='kendall') on it
# yields the pairwise matrix.
# correlation_matrix = data['Close'].corr(method='kendall')
# print(correlation_matrix)

  data = yf.download([ticker1, ticker2], start='2022-01-01', end='2024-01-01')
[*********************100%***********************]  2 of 2 completed

Kendall's Tau correlation: 0.8027737745546579
P-value: 6.847450485803656e-159



