# Data Retrieval

In this notebook, we collect the relevant data sets from different sources and store them on disk. Apart from renaming the columns, no cleaning is performed.

## General Setup

In [None]:
import os
import time

import holidays
import pandas as pd
import yfinance as yf
from dotenv import load_dotenv
from fredapi import Fred

from signal_sigma.config.cfg_legacy import *

In [None]:
# Load variables from a local .env file into the process environment;
# the FRED API key is later read from there via os.getenv("FRED_API_KEY").
load_dotenv()

# Number of this notebook; handed to store_df_as_csv with every file that
# is written (presumably to tag the output's origin — confirm in the helper).
NB_NUMBER = 1

## Stocks from `yfinance`

In [None]:
# Download window and bar size used for the per-stock price histories.
# NOTE(review): yfinance documents `end` as exclusive, so the last bar
# fetched is presumably the trading day before END_DATE — confirm.
START_DATE = "2000-01-01"
END_DATE = "2025-04-22"
INTERVAL = "1d"

In [None]:
# Download the price history of every stock ticker and store each one in
# its own CSV file (<ticker>.csv, lowercase) below DATA_STOCKS_DIR_RELPATH.
for ticker in STOCK_TICKERS:
    # Short pause between requests so we do not hammer the data provider.
    time.sleep(0.123)

    df = yf.download(
        ticker,
        start=START_DATE,
        end=END_DATE,
        interval=INTERVAL,
        # auto_adjust=True,
    )

    # yfinance reports failed downloads by returning an empty frame.
    # Skip those instead of writing (and possibly overwriting a previously
    # stored, valid file with) a header-only CSV.
    if df is None or df.empty:
        print(f"WARNING: no data returned for {ticker!r}; skipping.")
        continue

    # Get rid of the two-level column name scheme
    # (one level only indicates the ticker).
    df.columns = df.columns.get_level_values(0).rename(None)

    # Turn the date index into a regular "date" column and name the new
    # integer index consistently across all stored frames.
    df = df.reset_index(names="date")
    df.index.name = IDX

    # Convert column names to lowercase
    df.columns = df.columns.str.lower()

    csvpath_rel = os.path.join(DATA_STOCKS_DIR_RELPATH, ticker.lower() + ".csv")
    store_df_as_csv(df, csvpath_rel, NB_NUMBER)

In [None]:
# Quick sanity check: `df` still holds the frame of the last ticker
# processed by the download loop above.
df.info()

## Market Indices/Indexoids from `yfinance`

In [None]:
# Download window for the index/indicator tickers fetched from yfinance.
# NOTE(review): this END_DATE_YF is one day later than END_DATE used for the
# stocks; yfinance documents `end` as exclusive — confirm the offset is intended.
START_DATE_YF = "2000-01-01"
END_DATE_YF = "2025-04-23"

# Macro indicators and market indices.
# Maps the yfinance ticker symbol to the column label used in the stored table
# (labels are lowercased before the table is written).
MACRO_TICKERS = {
    # Indices
    "^GSPC": "S&P500_Index",
    "^DJI": "Dow_Jones_Index",
    "^IXIC": "NASDAQ_Composite",
    "^RUT": "Russell2000_Index",
    "^VIX": "VIX_Index",
    # Commodities
    # (the first entry is a currency index rather than a commodity)
    "DX-Y.NYB": "Dollar_Index_DXY",
    "GC=F": "Gold_Futures",
    "CL=F": "WTI_Oil_Futures",
    "HG=F": "Copper_Futures",
    "BZ=F": "Brent_Crude_Futures",
    # Sector ETFs (Proxies)
    "XLK": "Tech_Sector_ETF",
    "XLE": "Energy_Sector_ETF",
    "XLF": "Financial_Sector_ETF",
    "XLY": "ConsumerDiscretionary_ETF",
    # Other Market Metrics
    "LIT": "Lithium_ETF",
    "SMH": "Semiconductor_ETF",
    "XLU": "Electricity_Proxy",
}

In [None]:
# Build one wide table holding the closing value of every macro ticker.
df = pd.DataFrame()

for ticker, label in MACRO_TICKERS.items():
    # NOTE: Although yfinance provides open/close and high/low values
    # for each indicator, only the close value is kept here.
    # Column assignment aligns each new column to the index the table
    # already has, i.e. to the dates of the first non-empty download.
    df_tmp = yf.download(ticker, start=START_DATE_YF, end=END_DATE_YF)
    df[label] = df_tmp["Close"]

# Tickers whose download failed leave an all-NaN column behind; remove those.
df = df.dropna(axis=1, how="all")

# Keep only the first column level and clear the columns' name.
df.columns = df.columns.get_level_values(0).rename(None)

# Move the dates into a regular "date" column, name the fresh integer
# index and lowercase all column names.
df = df.reset_index(names="date")
df.index.name = IDX
df.columns = df.columns.str.lower()

relpath_df = DATA_YF_MIF_RELPATH
store_df_as_csv(df, relpath_df, NB_NUMBER)

In [None]:
# NOTE: Alternative to the wide close-price table: process every
# indicator like a stock, i.e. keep open/close and high/low and store
# each ticker in a separate file.
# TODO: Set up this processing in complete analogy to the stock case
# (or simply merge the two ticker lists ...).
# The store call at the end is currently disabled, so this cell only
# downloads and reshapes the frames without writing anything.

for ticker in MACRO_TICKERS:
    # Short pause between consecutive requests.
    time.sleep(0.123)
    df = yf.download(ticker, start=START_DATE_YF, end=END_DATE_YF)

    # Collapse the two-level column scheme to its first level
    # (the other level only repeats the ticker).
    first_level = df.columns.get_level_values(0)
    df.columns = first_level.rename(None)

    # Dates become a regular "date" column; the new integer index gets
    # the project-wide index name.
    df = df.reset_index(names="date")
    df.index.name = IDX

    # Lowercase all column names.
    df.columns = df.columns.str.lower()

    # csvpath_rel = os.path.join(DIR_DATA_INDICATORS, ticker.lower() + ".csv")
    # store_df_as_csv(df, csvpath_rel, NB_NUMBER)

## Market Indicators from `fred`

In [None]:
# FRED API key, read from the environment (populated by load_dotenv()).
API_KEY_FED = os.getenv("FRED_API_KEY")

# Maps the FRED series ID to the column name used in the stored table.
# NOTE(review): the series descriptions below follow the FRED catalogue as
# far as the reviewer knows it; where a column name does not match the
# series, that is flagged — confirm against FRED before relying on it.
INDICATOR_TICKERS = {
    # Consumer Price Index (CPI): Measures inflation and purchasing power.
    "CPIAUCSL": "cpi",
    # Federal Funds Rate: rate for overnight lending between banks.
    # NOTE(review): FEDFUNDS is presumably the *effective* rate, not the target.
    "FEDFUNDS": "fed_rate",
    # Crude Oil Prices: Reflects energy costs and global economic conditions.
    "DCOILWTICO": "oil",
    # Gross Domestic Product (GDP): Measures overall economic activity and growth.
    "GDP": "gdp",
    # Nonfarm Payrolls.
    # NOTE(review): PAYEMS is presumably the employment *level*, not the
    # number of jobs added or lost — take differences downstream if needed.
    "PAYEMS": "nonfarm_payrolls",
    # 10-Year Treasury Yield: Reflects long-term interest rates.
    "DGS10": "treasury_yield",
    # Industrial Production Index: Measures output of industrial sectors.
    "INDPRO": "industrial_production",
    # Retail Sales: Reflects consumer spending and economic health.
    "RSXFS": "retail_sales",
    # NOTE(review): the column is called "pmi", but MANEMP appears to be
    # FRED's "All Employees, Manufacturing" series, not a Manufacturing
    # PMI — confirm which series is intended.
    "MANEMP": "pmi",
    # Consumer sentiment and spending outlook.
    # NOTE(review): UMCSENT appears to be the University of Michigan
    # Consumer Sentiment index, not the Consumer Confidence Index.
    "UMCSENT": "consumer_confidence",
}

# Date window for the FRED data. These assignments overwrite the identically
# named constants of the stock section (same values); INTERVAL is not
# referenced by the FRED cells that follow.
START_DATE = "2000-01-01"
END_DATE = "2025-04-22"
INTERVAL = "1d"

In [None]:
# Fetch every FRED indicator and combine them into one daily table.

# Create a Fred client using the API key
fred = Fred(api_key=API_KEY_FED)

# Fixed daily calendar covering the requested window (both ends inclusive).
# Every series is aligned to this calendar. Previously the table inherited
# the date range of the first series that was inserted, which silently
# dropped newer observations of the other series and left slower series
# empty after their last release.
daily_index = pd.date_range(start=START_DATE, end=END_DATE, freq="D")

columns = {}
for series_id, name in INDICATOR_TICKERS.items():
    # Fetch the full history of the indicator using its series ID.
    # Observations before START_DATE are needed to fill the first days.
    series = fred.get_series(series_id).sort_index()
    # Carry the most recent observation forward onto every calendar day.
    # Values that are already missing in the source series are not filled.
    columns[name] = series.reindex(daily_index, method="ffill")

df = pd.DataFrame(columns, index=daily_index)

# Turn the date index into a regular "date" column and name the new
# integer index.
df = df.reset_index(names="date")
df.index.name = IDX

In [None]:
# Restrict the FRED table to the requested date window, drop non-working
# days and store the result.

# NOTE: Further date-related features for the merged data frame
# will be introduced in the feature engineering notebook.

# XXX: Does it make a difference to first filter out non-working days
# and then to perform an inner join for all frames or first to perform
# an outer join and then to filter out non-working days?

df["date"] = pd.to_datetime(df["date"])

# Keep only rows inside [START_DATE, END_DATE]. The explicit copy makes the
# result an independent frame, so the column assignments below do not
# operate on a slice of the unfiltered table (SettingWithCopyWarning).
in_window = (df["date"] >= pd.to_datetime(START_DATE)) & (
    df["date"] <= pd.to_datetime(END_DATE)
)
df = df.loc[in_window].copy()

# 0 = Monday, ..., 6 = Sunday.
df["weekday"] = df["date"].dt.weekday

# US holidays indicator
# NOTE(review): holidays.US() is a public-holiday calendar; it presumably
# differs from the stock exchange calendar on some days — confirm whether
# an exchange calendar should be used for "is_trading_day".
us_holidays = holidays.US()
df["is_holiday"] = df["date"].apply(lambda dt: dt in us_holidays)

# Working day indicator: neither a holiday nor a Saturday/Sunday.
df["is_trading_day"] = ~df["is_holiday"] & ~df["weekday"].isin([5, 6])

# Filter the DataFrame to include only working days for analysis purposes
df = df[df["is_trading_day"]]

relpath_df = DATA_FED_CEI_RELPATH
store_df_as_csv(df, relpath_df, NB_NUMBER)