# Notebook 2 - Development work for ETL'ing data from bronze table tbl_api_payloads_yfinance_daily to silver table tbl_prices_daily_staging

### Setup

In [None]:
import sys
import os
from dotenv import load_dotenv

# /home/ubuntu/financial-etl-poc/this_folder
current_folder = os.path.dirname(os.path.abspath("__file__"))
# /home/ubuntu/financial-etl-poc/
project_root_folder = os.path.abspath(os.path.join(current_folder, ".."))
sys.path.append(project_root_folder)
# For loading credentials from .env under financial-etl-poc
dotenv_path = os.path.join(project_root_folder, ".env")

if project_root_folder not in sys.path:
    sys.path.append(project_root_folder)

from utils.db_utils import *
from psycopg2.extras import execute_values  # Bulk insertion of Pandas dataframes
from datetime import datetime, date, timezone

In [2]:
load_dotenv(dotenv_path)
conn, cursor = connect_to_rds()

✅ Connected successfully!


### Create a function for pulling data from tbl_api_payloads_yfinance_daily (extract part of ETL)

In [3]:
def extract_raw_payloads_from_tbl_api_payloads_yfinance_daily(start_date: date, end_date: date, cursor: Cursor) -> pd.DataFrame:
    """
    Extract the raw json payloads from bronze table tbl_api_payloads_yfinance_daily between start_date and end_date
    """

    query = f"""
    SELECT business_date, raw_payload
    FROM tbl_api_payloads_yfinance_daily
    WHERE business_date BETWEEN '{start_date}' and '{end_date}';
    """

    df_api_payloads_yfinance_daily = sql_query_as_df(query, cursor)

    if df_api_payloads_yfinance_daily.empty:
        raise ValueError(f"Querying for json payloads from bronze table tbl_api_payloads_yfinance_daily between {start_date} and {end_date} returned no rows")

    print(f"Fetched {len(df_api_payloads_yfinance_daily)} rows from bronze table tbl_api_payloads_yfinance_daily for dates {start_date} to {end_date}")
    
    return df_api_payloads_yfinance_daily

In [4]:
# Test extract_raw_payloads_from_tbl_api_payloads_yfinance_daily
start_date = date(2025, 4, 1)
end_date = date(2025, 5, 10)
df_api_payloads_yfinance_daily = extract_raw_payloads_from_tbl_api_payloads_yfinance_daily(start_date, end_date, cursor)
df_api_payloads_yfinance_daily


Fetched 12 rows from bronze table tbl_api_payloads_yfinance_daily for dates 2025-04-01 to 2025-05-10


Unnamed: 0,business_date,raw_payload
0,2025-04-01,"{'data': [[557.450012207, 562.9400024414, 553...."
1,2025-04-02,"{'data': [[555.049987793, 567.4199829102, 554...."
2,2025-04-03,"{'data': [[545.1099853516, 547.9699707031, 536..."
3,2025-04-04,"{'data': [[523.6699829102, 525.8699951172, 505..."
4,2025-04-07,"{'data': [[489.1900024414, 523.1699829102, 481..."
5,2025-04-08,"{'data': [[521.8599853516, 524.9799804688, 489..."
6,2025-04-09,"{'data': [[493.4400024414, 548.6199951172, 493..."
7,2025-04-10,"{'data': [[532.1699829102, 533.5, 509.32000732..."
8,2025-04-25,"{'data': [[67.9199981689, 68.1299972534, 67.56..."
9,2025-04-23,"{'data': [[104.5199966431, 104.8000030518, 102..."


### Exploration on how to recover multiIndex dataframe from json payload, and then flatten

In [5]:
# Each raw_payload is a dictionary, with keys: data, index, and columns.  Convert this back into a multi-index pandas dataframe
raw_payload = df_api_payloads_yfinance_daily["raw_payload"][0]
raw_payload

{'data': [[557.450012207,
   562.9400024414,
   553.6799926758,
   560.9699707031,
   54609600,
   288.5400085449,
   289.1300048828,
   285.9100036621,
   287.5700073242,
   15923600,
   467.299987793,
   473.6300048828,
   464.4200134277,
   472.700012207,
   41156200]],
 'index': [1743465600000],
 'columns': [['SPY', 'Open'],
  ['SPY', 'High'],
  ['SPY', 'Low'],
  ['SPY', 'Close'],
  ['SPY', 'Volume'],
  ['GLD', 'Open'],
  ['GLD', 'High'],
  ['GLD', 'Low'],
  ['GLD', 'Close'],
  ['GLD', 'Volume'],
  ['QQQ', 'Open'],
  ['QQQ', 'High'],
  ['QQQ', 'Low'],
  ['QQQ', 'Close'],
  ['QQQ', 'Volume']]}

In [6]:
data = raw_payload["data"]
data

[[557.450012207,
  562.9400024414,
  553.6799926758,
  560.9699707031,
  54609600,
  288.5400085449,
  289.1300048828,
  285.9100036621,
  287.5700073242,
  15923600,
  467.299987793,
  473.6300048828,
  464.4200134277,
  472.700012207,
  41156200]]

In [7]:
index = pd.to_datetime(raw_payload["index"], unit = "ms")
index


DatetimeIndex(['2025-04-01'], dtype='datetime64[ns]', freq=None)

In [8]:
columns = pd.MultiIndex.from_tuples(raw_payload["columns"], names = ["Ticker", "Field"])
columns

MultiIndex([('SPY',   'Open'),
            ('SPY',   'High'),
            ('SPY',    'Low'),
            ('SPY',  'Close'),
            ('SPY', 'Volume'),
            ('GLD',   'Open'),
            ('GLD',   'High'),
            ('GLD',    'Low'),
            ('GLD',  'Close'),
            ('GLD', 'Volume'),
            ('QQQ',   'Open'),
            ('QQQ',   'High'),
            ('QQQ',    'Low'),
            ('QQQ',  'Close'),
            ('QQQ', 'Volume')],
           names=['Ticker', 'Field'])

In [9]:
# Create a Pandas dataframe from data, index, and columns
df_payload = pd.DataFrame(
    data = data,
    index = index,
    columns = columns
)

df_payload

Ticker,SPY,SPY,SPY,SPY,SPY,GLD,GLD,GLD,GLD,GLD,QQQ,QQQ,QQQ,QQQ,QQQ
Field,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
2025-04-01,557.450012,562.940002,553.679993,560.969971,54609600,288.540009,289.130005,285.910004,287.570007,15923600,467.299988,473.630005,464.420013,472.700012,41156200


In [10]:
df_payload.stack(level = "Ticker", future_stack = True)

Unnamed: 0_level_0,Field,Open,High,Low,Close,Volume
Unnamed: 0_level_1,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-04-01,SPY,557.450012,562.940002,553.679993,560.969971,54609600
2025-04-01,GLD,288.540009,289.130005,285.910004,287.570007,15923600
2025-04-01,QQQ,467.299988,473.630005,464.420013,472.700012,41156200


In [21]:
df_flattened = df_payload.stack(level = "Ticker", future_stack = True).reset_index()
df_flattened

Field,level_0,Ticker,Open,High,Low,Close,Volume
0,2025-04-01,SPY,557.450012,562.940002,553.679993,560.969971,54609600
1,2025-04-01,GLD,288.540009,289.130005,285.910004,287.570007,15923600
2,2025-04-01,QQQ,467.299988,473.630005,464.420013,472.700012,41156200


In [22]:
df_flattened = df_flattened.rename(columns = {
    "level_0": "business_date",
    "Ticker": "ticker",
    "Open": "price_open",
    "High": "price_high",
    "Low": "price_low",
    "Close": "price_close",
    "Volume": "volume"
})
df_flattened

Field,business_date,ticker,price_open,price_high,price_low,price_close,volume
0,2025-04-01,SPY,557.450012,562.940002,553.679993,560.969971,54609600
1,2025-04-01,GLD,288.540009,289.130005,285.910004,287.570007,15923600
2,2025-04-01,QQQ,467.299988,473.630005,464.420013,472.700012,41156200


### Version 1: Formalizing into a function (for n = 1) in the case where we want to pass in a one-row dataframe

In [25]:
# Test for a one-row dataframe
row_api_payloads_yfinance_daily = df_api_payloads_yfinance_daily.iloc[[0]]
row_api_payloads_yfinance_daily

Unnamed: 0,business_date,raw_payload
0,2025-04-01,"{'data': [[557.450012207, 562.9400024414, 553...."


In [26]:
# Test to see if this works for a single row (n = 1)
def transform_raw_payload_to_row_keyed_on_ticker_and_business_date(row_api_payloads_yfinance_daily: pd.DataFrame) -> pd.DataFrame:

    """
    Transforms a single raw_payload JSON (from yfinance) into a DataFrame keyed on ticker and business_date.
    Validates that payload dates match the metadata business_date.
    """

    if len(row_api_payloads_yfinance_daily) != 1:
        raise ValueError(f"Length of row_api_payloads_yfinance_daily is {len(row_api_payloads_yfinance_daily)} instead of 1. Def transform_raw_payload_to_row_keyed_on_ticker_and_business_date needs to be given a dataframe of one row to unroll")

    business_date_of_row = row_api_payloads_yfinance_daily["business_date"].iloc[0]
    raw_payload = row_api_payloads_yfinance_daily["raw_payload"].iloc[0]

    data = raw_payload["data"]
    index = pd.to_datetime(raw_payload["index"], unit = "ms")
    columns = pd.MultiIndex.from_tuples(raw_payload["columns"], names = ["Ticker", "Field"])
    df_multiIndex = pd.DataFrame(data = data, index = index, columns = columns)
    df_yfinance_prices_daily_staging = df_multiIndex.stack(level = "Ticker").reset_index()
    df_yfinance_prices_daily_staging = df_yfinance_prices_daily_staging.rename(columns = {
        "level_0": "business_date",
        "Ticker": "ticker",
        "Open": "price_open",
        "High": "price_high",
        "Low": "price_low",
        "Close": "price_close",
        "Volume": "volume"
    })

    business_date_of_payload = df_yfinance_prices_daily_staging["business_date"].unique()[0].date()
    if business_date_of_row != business_date_of_payload:
        raise ValueError(f"Business_date column of row_api_payloads_yfinance_daily {business_date_of_row} does not match the business date of the raw json payload {business_date_of_payload}")

    return df_yfinance_prices_daily_staging


In [27]:
df_yfinance_prices_daily_staging = transform_raw_payload_to_row_keyed_on_ticker_and_business_date(row_api_payloads_yfinance_daily)
df_yfinance_prices_daily_staging

  df_yfinance_prices_daily_staging = df_multiIndex.stack(level = "Ticker").reset_index()


Field,business_date,ticker,price_open,price_high,price_low,price_close,volume
0,2025-04-01,GLD,288.540009,289.130005,285.910004,287.570007,15923600
1,2025-04-01,QQQ,467.299988,473.630005,464.420013,472.700012,41156200
2,2025-04-01,SPY,557.450012,562.940002,553.679993,560.969971,54609600


### Version 2: Creating a function for the case where we pass in a series instead.  Will use series version with iterrows().  This is the transformation part of ETL

In [29]:
# Test for a series, representing one row of df_api_payloads_yfinance_daily
row_api_payloads_yfinance_daily = df_api_payloads_yfinance_daily.iloc[0]
row_api_payloads_yfinance_daily

business_date                                           2025-04-01
raw_payload      {'data': [[557.450012207, 562.9400024414, 553....
Name: 0, dtype: object

In [None]:
# Test to see if this works for a single row (n = 1)
def transform_raw_payload_to_row_keyed_on_ticker_and_business_date(row_api_payloads_yfinance_daily: pd.Series) -> pd.DataFrame:

    """
    Transforms a single raw_payload JSON (from yfinance) into a DataFrame keyed on ticker and business_date.
    Validates that payload dates match the metadata business_date.
    """

    business_date_of_row = row_api_payloads_yfinance_daily["business_date"]
    raw_payload = row_api_payloads_yfinance_daily["raw_payload"]

    data = raw_payload["data"]
    index = pd.to_datetime(raw_payload["index"], unit = "ms")
    columns = pd.MultiIndex.from_tuples(raw_payload["columns"], names = ["Ticker", "Field"])
    df_multiIndex = pd.DataFrame(data = data, index = index, columns = columns)
    df_yfinance_prices_daily_staging = df_multiIndex.stack(level = "Ticker").reset_index()
    df_yfinance_prices_daily_staging = df_yfinance_prices_daily_staging.rename(columns = {
        "level_0": "business_date",
        "Ticker": "ticker",
        "Open": "price_open",
        "High": "price_high",
        "Low": "price_low",
        "Close": "price_close",
        "Volume": "volume"
    })

    business_date_of_payload = df_yfinance_prices_daily_staging["business_date"].unique()[0].date()
    if business_date_of_row != business_date_of_payload:
        raise ValueError(f"Business_date column of row_api_payloads_yfinance_daily {business_date_of_row} does not match the business date of the raw json payload {business_date_of_payload}")
    
    # TODO: Should I add a check for NAs, to check if any of the prices or volumes for any tickers are NULL?

    return df_yfinance_prices_daily_staging


In [31]:
df_yfinance_prices_daily_staging = transform_raw_payload_to_row_keyed_on_ticker_and_business_date(row_api_payloads_yfinance_daily)
df_yfinance_prices_daily_staging

  df_yfinance_prices_daily_staging = df_multiIndex.stack(level = "Ticker").reset_index()


Field,business_date,ticker,price_open,price_high,price_low,price_close,volume
0,2025-04-01,GLD,288.540009,289.130005,285.910004,287.570007,15923600
1,2025-04-01,QQQ,467.299988,473.630005,464.420013,472.700012,41156200
2,2025-04-01,SPY,557.450012,562.940002,553.679993,560.969971,54609600


### Create function insert_into_yfinance_prices_staging_by_date (load part of ETL)

In [14]:
# We need to convert that payload from bronze table into the following format for silver table
query = """
SELECT * FROM tbl_yfinance_prices_daily_staging
LIMIT 3;
"""

df_silver = sql_query_as_df(query, cursor)
df_silver.head(n = 3)

Unnamed: 0,ticker,business_date,price_open,price_low,price_high,price_close,volume,created_timestamp


In [None]:
column_order = ["ticker", "business_date", "price_open", "price_low", "price_high", "price_close", "volume"]
df_yfinance_prices_daily_staging = df_yfinance_prices_daily_staging[column_order]

list_of_tuples_to_insert = [
    (*row, datetime.now(timezone.utc)) for row in df_yfinance_prices_daily_staging.itertuples(index = False, name = None)
]
list_of_tuples_to_insert

[('GLD',
  Timestamp('2025-04-01 00:00:00'),
  288.5400085449,
  285.9100036621,
  289.1300048828,
  287.5700073242,
  15923600,
  datetime.datetime(2025, 5, 18, 14, 12, 25, 983968)),
 ('QQQ',
  Timestamp('2025-04-01 00:00:00'),
  467.299987793,
  464.4200134277,
  473.6300048828,
  472.700012207,
  41156200,
  datetime.datetime(2025, 5, 18, 14, 12, 25, 983973)),
 ('SPY',
  Timestamp('2025-04-01 00:00:00'),
  557.450012207,
  553.6799926758,
  562.9400024414,
  560.9699707031,
  54609600,
  datetime.datetime(2025, 5, 18, 14, 12, 25, 983975))]

In [None]:
def insert_into_yfinance_prices_staging_by_date(
    df_yfinance_prices_daily_staging: pd.DataFrame,
    cursor: Cursor, 
    conn: Connection    
):
    """
    Inserts unrolled json payload, in the form of dataframe, into silver table df_yfinance_prices_daily_staging
    """

    column_order = ["ticker", "business_date", "price_open", "price_low", "price_high", "price_close", "volume"]
    df_yfinance_prices_daily_staging = df_yfinance_prices_daily_staging[column_order]

    list_of_tuples_to_insert = [
        (*row, datetime.now(timezone.utc)) for row in df_yfinance_prices_daily_staging.itertuples(index = False, name = None)
    ]

    # TODO: There is a possibility where we would want to run this driver explicitly to override existing price/volumes for a given (ticker, business_date) to correct a wrong json payload, so we need to change "do nothing" to instead prompt the user "What you are writing conflicts with existing data, do you want to override or skip?"
    insert_query = """
        INSERT INTO tbl_yfinance_prices_daily_staging (
            ticker, business_date, price_open, price_low, price_high, price_close, volume, created_timestamp
        )
        VALUES %s
        ON CONFLICT (ticker, business_date) DO NOTHING;
    """

    execute_values(cursor, insert_query, list_of_tuples_to_insert)
    conn.commit()

    # TODO: Formalize this print statement using Python's logging functionality instead
    if list_of_tuples_to_insert:
        print(f"✅ Inserted {len(list_of_tuples_to_insert)} rows into tbl_yfinance_prices_daily_staging")
    else:
        print("⚠️ No rows to insert into tbl_yfinance_prices_daily_staging")

In [None]:
# Test new function
insert_into_yfinance_prices_staging_by_date(df_yfinance_prices_daily_staging, cursor, conn)

In [None]:
# Verify that it worked
query = """
SELECT * from tbl_yfinance_prices_daily_staging
"""

result = sql_query_as_df(query, cursor)
result

Unnamed: 0,ticker,business_date,price_open,price_low,price_high,price_close,volume,created_timestamp
0,GLD,2025-04-01,288.5400085449,285.9100036621,289.1300048828,287.5700073242,15923600,2025-05-18 14:33:23.780097+00:00
1,QQQ,2025-04-01,467.299987793,464.4200134277,473.6300048828,472.700012207,41156200,2025-05-18 14:33:23.780105+00:00
2,SPY,2025-04-01,557.450012207,553.6799926758,562.9400024414,560.9699707031,54609600,2025-05-18 14:33:23.780107+00:00
