# Notebook 2 - Development work for the ETL from bronze table tbl_api_payloads_yfinance_daily to silver table tbl_yfinance_prices_daily_staging

### Setup

In [10]:
import sys
import os
from dotenv import load_dotenv

# Notebooks do not define __file__, so paths are resolved from the current
# working directory, i.e. the folder this notebook is run from.
# /home/ubuntu/financial-etl-poc/this_folder
current_folder = os.getcwd()
# /home/ubuntu/financial-etl-poc/
project_root_folder = os.path.abspath(os.path.join(current_folder, ".."))
# For loading credentials from .env under financial-etl-poc
dotenv_path = os.path.join(project_root_folder, ".env")

# Make project-level packages (e.g. utils) importable.  The membership check
# keeps sys.path from gaining a duplicate entry each time this cell is re-run.
if project_root_folder not in sys.path:
    sys.path.append(project_root_folder)

from utils.db_utils import *

In [11]:
# Load the variables from the project-root .env file, then open the database connection.
# connect_to_rds is presumably provided by the wildcard import from utils.db_utils;
# it returns the (conn, cursor) pair used by the rest of the notebook.
load_dotenv(dotenv_path)
conn, cursor = connect_to_rds()

✅ Connected successfully!


### Exploration: how to rebuild the MultiIndex DataFrame from the JSON payload, and then flatten it

In [13]:
# Pull a small sample of bronze rows to inspect the payload structure.
# NOTE: there is no ORDER BY, so which 3 rows come back is up to the database.
query = """
SELECT * FROM tbl_api_payloads_yfinance_daily
LIMIT 3;
"""

df_bronze = sql_query_as_df(query, cursor)
df_bronze.head(n = 3)

Unnamed: 0,business_date,ingestion_timestamp,raw_payload
0,2025-04-01,2025-04-17 18:25:52.444728+00:00,"{'data': [[557.450012207, 562.9400024414, 553...."
1,2025-04-02,2025-04-17 18:25:52.444728+00:00,"{'data': [[555.049987793, 567.4199829102, 554...."
2,2025-04-03,2025-04-17 18:25:52.444728+00:00,"{'data': [[545.1099853516, 547.9699707031, 536..."


In [None]:
# Each raw_payload is a dictionary with keys: data, index, and columns
# (this looks like pandas' orient="split" JSON layout - TODO confirm against the ingestion code).
# Goal: convert it back into a multi-index pandas DataFrame.  Start with the first sampled row.
raw_payload = df_bronze["raw_payload"][0]
raw_payload

{'data': [[557.450012207,
   562.9400024414,
   553.6799926758,
   560.9699707031,
   54609600,
   288.5400085449,
   289.1300048828,
   285.9100036621,
   287.5700073242,
   15923600,
   467.299987793,
   473.6300048828,
   464.4200134277,
   472.700012207,
   41156200]],
 'index': [1743465600000],
 'columns': [['SPY', 'Open'],
  ['SPY', 'High'],
  ['SPY', 'Low'],
  ['SPY', 'Close'],
  ['SPY', 'Volume'],
  ['GLD', 'Open'],
  ['GLD', 'High'],
  ['GLD', 'Low'],
  ['GLD', 'Close'],
  ['GLD', 'Volume'],
  ['QQQ', 'Open'],
  ['QQQ', 'High'],
  ['QQQ', 'Low'],
  ['QQQ', 'Close'],
  ['QQQ', 'Volume']]}

In [None]:
# We need to convert that payload from the bronze table into the following format for the silver table.
# The query is run only to display the target column layout (the table has no rows yet).
query = """
SELECT * FROM tbl_yfinance_prices_daily_staging
LIMIT 3;
"""

df_silver = sql_query_as_df(query, cursor)
df_silver.head(n = 3)

Unnamed: 0,ticker,business_date,price_open,price_low,price_high,price_close,volume,created_timestamp


In [27]:
# "data" holds one list of values per entry in "index"; the values follow the order of "columns".
data = raw_payload["data"]
data

[[557.450012207,
  562.9400024414,
  553.6799926758,
  560.9699707031,
  54609600,
  288.5400085449,
  289.1300048828,
  285.9100036621,
  287.5700073242,
  15923600,
  467.299987793,
  473.6300048828,
  464.4200134277,
  472.700012207,
  41156200]]

In [28]:
# "index" holds epoch timestamps in milliseconds; convert them into a DatetimeIndex.
index = pd.to_datetime(raw_payload["index"], unit = "ms")
index


DatetimeIndex(['2025-04-01'], dtype='datetime64[ns]', freq=None)

In [29]:
# "columns" is a list of [ticker, field] pairs; rebuild the two-level column index from it.
columns = pd.MultiIndex.from_tuples(raw_payload["columns"], names = ["Ticker", "Field"])
columns

MultiIndex([('SPY',   'Open'),
            ('SPY',   'High'),
            ('SPY',    'Low'),
            ('SPY',  'Close'),
            ('SPY', 'Volume'),
            ('GLD',   'Open'),
            ('GLD',   'High'),
            ('GLD',    'Low'),
            ('GLD',  'Close'),
            ('GLD', 'Volume'),
            ('QQQ',   'Open'),
            ('QQQ',   'High'),
            ('QQQ',    'Low'),
            ('QQQ',  'Close'),
            ('QQQ', 'Volume')],
           names=['Ticker', 'Field'])

In [38]:
# Create a pandas DataFrame from data, index, and columns.
# Result: one row per date, with a two-level (Ticker, Field) column header.
df_payload = pd.DataFrame(
    data = data,
    index = index,
    columns = columns
)

df_payload

Ticker,SPY,SPY,SPY,SPY,SPY,GLD,GLD,GLD,GLD,GLD,QQQ,QQQ,QQQ,QQQ,QQQ
Field,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
2025-04-01,557.450012,562.940002,553.679993,560.969971,54609600,288.540009,289.130005,285.910004,287.570007,15923600,467.299988,473.630005,464.420013,472.700012,41156200


In [46]:
# Move the "Ticker" column level into the row index, giving one row per (date, ticker).
# future_stack = True opts into pandas' newer stack implementation; here it keeps the tickers in payload order.
df_payload.stack(level = "Ticker", future_stack = True)

Unnamed: 0_level_0,Field,Open,High,Low,Close,Volume
Unnamed: 0_level_1,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2025-04-01,SPY,557.450012,562.940002,553.679993,560.969971,54609600
2025-04-01,GLD,288.540009,289.130005,285.910004,287.570007,15923600
2025-04-01,QQQ,467.299988,473.630005,464.420013,472.700012,41156200


In [49]:
# reset_index turns the (date, ticker) row index into ordinary columns.
# The date index has no name, so its column comes out as "level_0".
df_flattened = df_payload.stack(level = "Ticker", future_stack = True).reset_index()
df_flattened

Field,level_0,Ticker,Open,High,Low,Close,Volume
0,2025-04-01,SPY,557.450012,562.940002,553.679993,560.969971,54609600
1,2025-04-01,GLD,288.540009,289.130005,285.910004,287.570007,15923600
2,2025-04-01,QQQ,467.299988,473.630005,464.420013,472.700012,41156200


### Formalizing into a function (for n = 1)

In [80]:
# Check what type the payload arrives as: it is already a dict, not a JSON string.
raw_payload = df_bronze["raw_payload"][0]
type(raw_payload)

dict

In [84]:
# Test to see if this works for a single row (n = 1)
def transform_raw_payload_to_row_keyed_on_ticker_and_business_date(raw_payload: dict) -> pd.DataFrame:

    data = raw_payload["data"]
    index = pd.to_datetime(raw_payload["index"], unit = "ms")
    columns = pd.MultiIndex.from_tuples(raw_payload["columns"], names = ["Ticker", "Field"])
    df_multiIndex = pd.DataFrame(data = data, index = index, columns = columns)
    df_flattened = df_multiIndex.stack(level = "Ticker").reset_index()
    df_flattened.columns = ["business_date", "ticker", "price_open", "price_high", "price_low", "price_close", "volume"]

    return df_flattened

# Smoke-test the transform on the first sampled bronze row.
raw_payload = df_bronze["raw_payload"][0]
df_flattened = transform_raw_payload_to_row_keyed_on_ticker_and_business_date(raw_payload)
df_flattened


  df_flattened = df_multiIndex.stack(level = "Ticker").reset_index()


Unnamed: 0,business_date,ticker,price_open,price_high,price_low,price_close,volume
0,2025-04-01,GLD,288.540009,289.130005,285.910004,287.570007,15923600
1,2025-04-01,QQQ,467.299988,473.630005,464.420013,472.700012,41156200
2,2025-04-01,SPY,557.450012,562.940002,553.679993,560.969971,54609600


In [71]:
# Target format for the silver table, shown again for comparison with df_flattened above.
# NOTE: the silver table lists price_low before price_high, while df_flattened has
# price_high before price_low - the columns need re-ordering before insertion.
query = """
SELECT * FROM tbl_yfinance_prices_daily_staging
LIMIT 3;
"""

df_silver = sql_query_as_df(query, cursor)
df_silver.head(n = 3)

Unnamed: 0,ticker,business_date,price_open,price_low,price_high,price_close,volume,created_timestamp


In [None]:
def insert_into_yfinance_prices_staging_by_date(
    df_api_payloads_yfinance_daily: pd.DataFrame,
    # Cursor / Connection are quoted because this notebook never imports them
    # explicitly; quoting keeps the definition from failing if they are not in scope.
    cursor: "Cursor",
    conn: "Connection",
) -> None:
    """
    df_api_payloads_yfinance_daily is a DataFrame extracted from tbl_api_payloads_yfinance_daily, with columns:
        business_date, ingestion_timestamp, raw_payload
    Transform the schema of this data and insert into tbl_yfinance_prices_daily_staging, with columns:
        ticker, business_date, price_open, price_low, price_high, price_close, volume, created_timestamp

    Work in progress: the payloads are flattened and concatenated, but the INSERT
    itself is not written yet (see the TODOs below), so cursor is currently unused
    and conn.commit() has nothing to commit.

    An empty input DataFrame is a no-op: nothing is transformed or committed.
    """

    # Nothing to stage.  Without this guard pd.concat raises ValueError on an empty list.
    if df_api_payloads_yfinance_daily.empty:
        return

    # One flattened frame (one row per ticker and business_date) per bronze row.
    list_of_df_flattened = [
        transform_raw_payload_to_row_keyed_on_ticker_and_business_date(raw_payload)
        for raw_payload in df_api_payloads_yfinance_daily["raw_payload"]
    ]

    # ignore_index gives the combined frame one clean 0..n-1 index instead of
    # repeating each payload's own 0..k labels.
    df_yfinance_prices_daily_staging = pd.concat(list_of_df_flattened, ignore_index = True)

    # TODO: Create created_timestamp
    # TODO: Re-order columns to match those of tbl_yfinance_prices_daily_staging
    # TODO: Cursor execute insertion of df_yfinance_prices_daily_staging into tbl_yfinance_prices_daily_staging
    # TODO: Sanity-checks, for example, does the "business_date" column of the bronze row match the "index" (date in ms) inside its raw_payload? If not, how to handle?
    # TODO: Sanity-check that dimensions of the payload are rectangular, i.e. - same number of tickers per row, same number of fields per row, etc.
    conn.commit()