# Notebook 1 - Development work for pulling data from yahoo finance's API and loading it into bronze table tbl_api_payloads_yfinance_daily

### Import Libraries

In [1]:
# Standard libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
import json

# Typing
from typing import Tuple, List, Set

# Connect to AWS RDS
from dotenv import load_dotenv
import psycopg2
from psycopg2.extensions import connection as Connection
from psycopg2.extensions import cursor as Cursor
from psycopg2 import OperationalError, ProgrammingError, Error

# Connect to yahoo finance's API
import yfinance as yf

# Render floats in displayed frames with thousands separators and 2 decimals (no scientific notation)
pd.options.display.float_format = '{:,.2f}'.format

### Configuration & Constants

In [2]:
# Load .env file (searches in the current directory or parent)
load_dotenv()

# Connection settings for the PostgreSQL database, read from environment variables
rds_host = os.getenv("rds_host")
rds_dbname = os.getenv("rds_dbname")
rds_username = os.getenv("rds_username")
rds_password = os.getenv("rds_password")

# The port has to be converted to int. os.getenv returns None for an unset variable,
# and int(None) fails with an opaque TypeError, so report the actual problem instead.
_rds_port_raw = os.getenv("rds_port")
if _rds_port_raw is None:
    raise RuntimeError(
        "Environment variable 'rds_port' is not set; define it in .env or in the environment"
    )
try:
    rds_port = int(_rds_port_raw)
except ValueError as exc:
    raise ValueError(
        f"Environment variable 'rds_port' must be an integer, got {_rds_port_raw!r}"
    ) from exc

### Helper Functions - DB Connection

In [3]:
def connect_to_rds(rds_host: str, rds_port: int, rds_dbname: str, rds_username: str, rds_password: str) -> Tuple[Connection, Cursor]:
    """
    Open a psycopg2 connection with the given settings and create a cursor on it.

    Returns the pair (connection, cursor).  Any exception raised along the way is
    described on stdout and then re-raised unchanged.
    """
    # Checked in order, most specific exception class first
    failure_prefixes = (
        (OperationalError, "❌ Operational error (e.g. bad credentials, unreachable host):"),
        (ProgrammingError, "❌ Programming error (e.g. bad DB name or SQL syntax):"),
        (Error, "❌ psycopg2 general error:"),
    )
    connect_kwargs = {
        "host": rds_host,
        "port": rds_port,
        "dbname": rds_dbname,
        "user": rds_username,
        "password": rds_password,
    }

    try:
        db_connection = psycopg2.connect(**connect_kwargs)
        db_cursor = db_connection.cursor()
        print("✅ Connected successfully!")
        return db_connection, db_cursor
    except Exception as exc:
        # Anything not matching a psycopg2 class falls through to the generic prefix
        prefix = next(
            (text for exc_type, text in failure_prefixes if isinstance(exc, exc_type)),
            "❌ Unknown error:",
        )
        print(prefix, exc)
        raise

In [4]:
def sql_query_as_df(sql_query: str, cursor) -> pd.DataFrame:
    """
    Execute ``sql_query`` on ``cursor`` and return all fetched rows as a Pandas dataframe,
    with column labels taken from ``cursor.description``
    """
    cursor.execute(sql_query)
    fetched_rows = cursor.fetchall()

    # The first item of every description entry is the column name
    headers = []
    for column_meta in cursor.description:
        headers.append(column_meta[0])

    return pd.DataFrame(fetched_rows, columns=headers)

In [5]:
def create_tbl_api_payloads_yfinance_daily(cursor, conn) -> None:
    """
    Run CREATE TABLE IF NOT EXISTS for tbl_api_payloads_yfinance_daily on ``cursor``,
    then commit on ``conn``.  The table holds one JSONB payload per business_date.
    """
    # One row per business_date (primary key); ingestion_timestamp defaults to NOW()
    ddl_statement = """
    CREATE TABLE IF NOT EXISTS tbl_api_payloads_yfinance_daily (
        business_date DATE NOT NULL,
        ingestion_timestamp TIMESTAMPTZ DEFAULT NOW(),
        raw_payload JSONB,
        PRIMARY KEY (business_date)
    );
    """
    cursor.execute(ddl_statement)
    conn.commit()

In [6]:
def insert_yfinance_payload_by_date(
    df_yahoo_finance_api: pd.DataFrame,
    cursor: Cursor,
    conn: Connection,
    table_name: str="tbl_api_payloads_yfinance_daily"
) -> None:
    """
    Insert Pandas dataframe (containing yahoo finance API call) into PostgreSQL table tbl_api_payloads_yfinance_daily,
    with each row of table containing a business_date's data

    Each index entry of the dataframe is serialised with ``to_json(orient="split")`` and inserted
    together with its date.  Dates already present in the table are left untouched
    (ON CONFLICT ... DO NOTHING).  All inserts are committed together at the end; if anything
    fails, the transaction is rolled back and the original exception is re-raised, so the
    connection is not left in a half-finished or aborted transaction.

    Parameters:
        df_yahoo_finance_api: dataframe whose index values provide ``.date()`` (e.g. a DatetimeIndex)
        cursor: cursor used to execute the inserts
        conn: connection on which commit / rollback is called
        table_name: target table; interpolated into the SQL text, so it must come from trusted code

    TODO: implement reverse transformer, which extracts from tbl_api_payloads_yfinance_daily and
        re-creates the original multi-index dataframe from the yfinance API calls.  Useful for future audit/debugging purposes, but not required now
    TODO: add retry logic
    TODO: Log how many rows were inserted vs skipped
    TODO: Add unit test using a mock Postgres or sqlite test instance
    TODO: Hook into audit table (record insert status + timestamp)
    """

    # NOTE(review): identifiers cannot be bound as query parameters, hence the f-string.
    # Never pass user-controlled text as table_name.
    # The statement does not change between rows, so it is built once outside the loop.
    insert_sql = f"""
            INSERT INTO {table_name} (business_date, raw_payload)
            VALUES (%s, %s)
            ON CONFLICT (business_date) DO NOTHING;
            """

    try:
        for timestamp in df_yahoo_finance_api.index:

            # Double brackets keep the selection a dataframe, not a series
            row_of_df_yahoo_finance_api = df_yahoo_finance_api.loc[[timestamp]]

            # orient="split" yields a list of lists such as [["SPY","Open"],["SPY","High"],...] for the columns
            json_payload = row_of_df_yahoo_finance_api.to_json(orient="split")

            # Convert timestamp into date (without time component)
            business_date = timestamp.date()

            cursor.execute(insert_sql, (business_date, json_payload))

        conn.commit()
    except Exception:
        # Discard the partial batch so later statements on this connection can still run
        conn.rollback()
        raise

### Tickers and Dates for Experiment

In [7]:
# Ticker universe and date window for this experiment.
# NOTE(review): the download output in this notebook stops at 2025-04-10, so end_date
# itself appears to be excluded from the result - confirm against the yfinance docs.
tickers = ["SPY", "QQQ", "GLD"]
start_date, end_date = "2025-04-01", "2025-04-11"

### Call Yahoo Finance API to load some stock prices

In [8]:
# Open the connection and cursor reused by the cells that follow
conn, cursor = connect_to_rds(
    rds_host=rds_host,
    rds_port=rds_port,
    rds_dbname=rds_dbname,
    rds_username=rds_username,
    rds_password=rds_password,
)

✅ Connected successfully!


In [9]:
# Make sure the bronze table exists before loading into it
create_tbl_api_payloads_yfinance_daily(cursor=cursor, conn=conn)

In [10]:
# "1d" is the bar size, which yfinance takes as `interval`.  `period` is a look-back window
# and is an alternative to an explicit start/end range, so it is not passed here.
# auto_adjust is pinned to True, the default this run actually used according to yfinance's
# notice, so results do not change silently between library versions.
df_yahoo_finance_api = yf.download(
    tickers,
    start=start_date,
    end=end_date,
    interval="1d",
    auto_adjust=True,
    group_by="ticker",
)

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  3 of 3 completed


In [11]:
# Peek at the first rows of the downloaded frame
df_yahoo_finance_api.head(5)

Ticker,SPY,SPY,SPY,SPY,SPY,GLD,GLD,GLD,GLD,GLD,QQQ,QQQ,QQQ,QQQ,QQQ
Price,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
2025-04-01,557.45,562.94,553.68,560.97,54609600,288.54,289.13,285.91,287.57,15923600,467.3,473.63,464.42,472.7,41156200
2025-04-02,555.05,567.42,554.81,564.52,76014500,288.28,289.03,287.36,288.16,11074800,466.12,479.56,465.86,476.15,49894500
2025-04-03,545.11,547.97,536.7,536.7,125986000,282.78,289.09,282.45,286.42,20524400,456.44,460.07,450.14,450.66,70456300
2025-04-04,523.67,525.87,505.06,505.28,217965100,283.64,284.03,278.02,279.72,21517200,438.14,440.37,422.67,422.67,117088400
2025-04-07,489.19,523.17,481.8,504.38,256611400,278.86,280.14,272.58,273.71,19807000,408.66,443.14,402.39,423.69,161557000


In [12]:
# Row index of the download; its entries are what the insert helper turns into business_date
df_yahoo_finance_api.index

DatetimeIndex(['2025-04-01', '2025-04-02', '2025-04-03', '2025-04-04',
               '2025-04-07', '2025-04-08', '2025-04-09', '2025-04-10'],
              dtype='datetime64[ns]', name='Date', freq=None)

In [13]:
# Column labels of the download (two levels when group_by="ticker" is used, as shown below)
df_yahoo_finance_api.columns

MultiIndex([('SPY',   'Open'),
            ('SPY',   'High'),
            ('SPY',    'Low'),
            ('SPY',  'Close'),
            ('SPY', 'Volume'),
            ('GLD',   'Open'),
            ('GLD',   'High'),
            ('GLD',    'Low'),
            ('GLD',  'Close'),
            ('GLD', 'Volume'),
            ('QQQ',   'Open'),
            ('QQQ',   'High'),
            ('QQQ',    'Low'),
            ('QQQ',  'Close'),
            ('QQQ', 'Volume')],
           names=['Ticker', 'Price'])

### Write results of the yahoo finance API call to the "bronze" table

In [14]:
# Load the downloaded frame into the bronze table, one row per business date
insert_yfinance_payload_by_date(
    df_yahoo_finance_api=df_yahoo_finance_api,
    cursor=cursor,
    conn=conn,
    table_name="tbl_api_payloads_yfinance_daily",
)

In [15]:
# ORDER BY makes the preview reproducible: without it the database may return any
# 5 rows in any order, so row 0 would not reliably be the earliest business_date.
view_table_query = """
SELECT * FROM tbl_api_payloads_yfinance_daily
ORDER BY business_date
LIMIT 5;
"""
df_view = sql_query_as_df(sql_query = view_table_query, cursor = cursor)
df_view

Unnamed: 0,business_date,ingestion_timestamp,raw_payload
0,2025-04-01,2025-04-17 18:25:52.444728+00:00,"{'data': [[557.450012207, 562.9400024414, 553...."
1,2025-04-02,2025-04-17 18:25:52.444728+00:00,"{'data': [[555.049987793, 567.4199829102, 554...."
2,2025-04-03,2025-04-17 18:25:52.444728+00:00,"{'data': [[545.1099853516, 547.9699707031, 536..."
3,2025-04-04,2025-04-17 18:25:52.444728+00:00,"{'data': [[523.6699829102, 525.8699951172, 505..."
4,2025-04-07,2025-04-17 18:25:52.444728+00:00,"{'data': [[489.1900024414, 523.1699829102, 481..."


In [33]:
# Payload stored in the first previewed row (single label-based lookup)
df_view.loc[0, "raw_payload"]

{'data': [[557.450012207,
   562.9400024414,
   553.6799926758,
   560.9699707031,
   54609600,
   288.5400085449,
   289.1300048828,
   285.9100036621,
   287.5700073242,
   15923600,
   467.299987793,
   473.6300048828,
   464.4200134277,
   472.700012207,
   41156200]],
 'index': [1743465600000],
 'columns': [['SPY', 'Open'],
  ['SPY', 'High'],
  ['SPY', 'Low'],
  ['SPY', 'Close'],
  ['SPY', 'Volume'],
  ['GLD', 'Open'],
  ['GLD', 'High'],
  ['GLD', 'Low'],
  ['GLD', 'Close'],
  ['GLD', 'Volume'],
  ['QQQ', 'Open'],
  ['QQQ', 'High'],
  ['QQQ', 'Low'],
  ['QQQ', 'Close'],
  ['QQQ', 'Volume']]}

### Create an ETL Pipeline to send data from "bronze" (raw payloads) to "silver" (staging) table 

In [35]:
# DDL for the silver (staging) table: one row per (ticker, business_date), with
# open/low/high/close prices and volume as NUMERIC columns.  created_timestamp
# defaults to NOW().  IF NOT EXISTS makes the statement safe to re-run.
create_price_table_staging = """
CREATE TABLE IF NOT EXISTS tbl_yfinance_prices_daily_staging (
    ticker TEXT NOT NULL,
    business_date DATE NOT NULL,
    price_open NUMERIC,
    price_low NUMERIC,
    price_high NUMERIC,
    price_close NUMERIC,
    volume NUMERIC,
    created_timestamp TIMESTAMPTZ DEFAULT NOW(),
    PRIMARY KEY (ticker, business_date)
);
"""

# Run the DDL on the shared cursor and commit it on the shared connection
cursor.execute(create_price_table_staging)
conn.commit()

In [36]:
## TODO: Next, create an ETL pipeline from bronze table tbl_api_payloads_yfinance_daily to silver table tbl_yfinance_prices_daily_staging, and wrap it inside Docker and Airflow

Okay, please summarize what we are doing in each of the 3 stages (bronze, silver, and gold tables) for me.  To my knowledge (and correct me if I am wrong):
- Raw API payload is 3D, dimensionalized over: (ticker, price/volume/etc, date)
- Bronze (tbl_api_payloads_yfinance_daily): we are still re-shaping the raw API payload, because the rows of the bronze table are keyed on date, so each row/date has a JSON structure containing (ticker, price/volume/etc).  Additionally, because we cannot store a multi-index in JSON directly, we also have to flatten/unroll the multi-index into a list of (ticker, price/volume/etc) keys, each mapping to a value such as $100
- Silver (tbl_yfinance_prices_daily_staging): Now we've moved 1 more dimension over into the key, so the key is now (date, ticker) and maps to a value which is (price/volume/etc)
- Gold (tbl_prices_daily_prod or something like that): still the same dimensions, keyed on (date, ticker) mapping to a (price/volume/etc), but we've adjusted for dividends, stock splits, etc such that the gold table's values can be directly fed into a machine learning algorithm and/or financial engineering models