# This notebook is a POC for an ETL pipeline in which I load prices from Yahoo Finance into a PostgreSQL staging table, transform them, and then load them into a PostgreSQL production table

### Import Libraries

In [1]:
# Standard libraries
import os
import numpy as np
import pandas as pd
from datetime import datetime
import json

# Connect to AWS RDS
from dotenv import load_dotenv
import psycopg2
from psycopg2.extensions import connection as Connection
from psycopg2 import OperationalError, ProgrammingError, Error

# Connect to yahoo finance's API
import yfinance as yf

### Configuration & Constants

In [2]:
# Load .env file (searches in the current directory or parent)
load_dotenv()

# Connection settings for the PostgreSQL database, read from environment variables.
rds_host = os.getenv("rds_host")

# os.getenv returns None when the variable is missing, and int(None) fails with an
# unhelpful TypeError. Fall back to PostgreSQL's standard port (5432) when rds_port
# is unset or empty, and report a clear error when it is set but not numeric.
_rds_port_raw = os.getenv("rds_port") or "5432"
try:
    rds_port = int(_rds_port_raw)
except ValueError as exc:
    raise ValueError(
        f"Environment variable 'rds_port' must be an integer, got {_rds_port_raw!r}"
    ) from exc

rds_dbname = os.getenv("rds_dbname")
rds_username = os.getenv("rds_username")
rds_password = os.getenv("rds_password")

### Helper Functions - DB Connection

In [3]:
def connect_to_rds(rds_host: str, rds_port: int, rds_dbname: str, rds_username: str, rds_password: str) -> Connection:
    """Open a psycopg2 connection to the PostgreSQL database and return it.

    Args:
        rds_host: Host name of the database server.
        rds_port: TCP port of the database server.
        rds_dbname: Name of the database to connect to.
        rds_username: User name used to authenticate.
        rds_password: Password used to authenticate.

    Returns:
        The open psycopg2 connection.

    Raises:
        Exception: Any error raised while connecting is printed with a
            category-specific message and then re-raised unchanged.
    """
    connection_params = {
        "host": rds_host,
        "port": rds_port,
        "dbname": rds_dbname,
        "user": rds_username,
        "password": rds_password,
    }

    try:
        db_conn = psycopg2.connect(**connection_params)
        print("✅ Connected successfully!")
        return db_conn
    except Exception as exc:
        # Check the most specific psycopg2 classes first: OperationalError and
        # ProgrammingError both derive from psycopg2.Error.
        if isinstance(exc, OperationalError):
            message = "❌ Operational error (e.g. bad credentials, unreachable host):"
        elif isinstance(exc, ProgrammingError):
            message = "❌ Programming error (e.g. bad DB name or SQL syntax):"
        elif isinstance(exc, Error):
            message = "❌ psycopg2 general error:"
        else:
            message = "❌ Unknown error:"
        print(message, exc)
        raise

### Set Up Connection

In [6]:
# Open the database connection using the settings loaded from the environment.
conn = connect_to_rds(
    rds_host=rds_host,
    rds_port=rds_port,
    rds_dbname=rds_dbname,
    rds_username=rds_username,
    rds_password=rds_password,
)
# One cursor, reused by the SQL cells below.
cursor = conn.cursor()

✅ Connected successfully!


In [9]:
# DDL for the table that holds one raw JSONB payload per business date
# (business_date is the primary key). IF NOT EXISTS makes the cell safe to re-run.
create_tbl_api_payloads_yfinance_daily = """
CREATE TABLE IF NOT EXISTS tbl_api_payloads_yfinance_daily (
    business_date DATE NOT NULL,
    ingestion_timestamp TIMESTAMPTZ DEFAULT NOW(),
    raw_payload JSONB,
    PRIMARY KEY (business_date)
);
"""

# `with conn` commits when the block succeeds and rolls back when it raises, so a
# failed statement does not leave the shared connection stuck in an aborted
# transaction for the following cells. It does not close the connection.
with conn:
    cursor.execute(create_tbl_api_payloads_yfinance_daily)

In [10]:
# Symbols to download and the date range to request.
tickers = ["SPY", "QQQ", "GLD"]
start_date = "2025-04-01"
# The download output below stops at 2025-04-10, i.e. the end date itself is not included.
end_date = "2025-04-11"

In [11]:
# Download daily bars for all tickers in one call.
# - `interval` selects the bar size. `period` is a look-back window and is an
#   alternative to an explicit start/end range, so it must not be combined with it.
# - `auto_adjust=True` is passed explicitly to pin the adjusted-price behaviour
#   instead of relying on the library default.
# - `group_by="ticker"` puts the ticker on the outer column level.
df_yf_raw = yf.download(
    tickers,
    start=start_date,
    end=end_date,
    interval="1d",
    group_by="ticker",
    auto_adjust=True,
)

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  3 of 3 completed


In [12]:
# Display the raw download: one row per date, columns grouped by ticker and then by price field.
df_yf_raw

Ticker,SPY,SPY,SPY,SPY,SPY,QQQ,QQQ,QQQ,QQQ,QQQ,GLD,GLD,GLD,GLD,GLD
Price,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
2025-04-01,557.450012,562.940002,553.679993,560.969971,54609600,467.299988,473.630005,464.420013,472.700012,41156200,288.540009,289.130005,285.910004,287.570007,15923600
2025-04-02,555.049988,567.419983,554.809998,564.52002,76014500,466.119995,479.559998,465.859985,476.149994,49894500,288.279999,289.029999,287.359985,288.160004,11074800
2025-04-03,545.109985,547.969971,536.700012,536.700012,125986000,456.440002,460.070007,450.140015,450.660004,70456300,282.779999,289.089996,282.450012,286.420013,20524400
2025-04-04,523.669983,525.869995,505.059998,505.279999,217965100,438.140015,440.369995,422.670013,422.670013,117088400,283.640015,284.029999,278.019989,279.720001,21517200
2025-04-07,489.190002,523.169983,481.799988,504.380005,256611400,408.660004,443.140015,402.390015,423.690002,161557000,278.859985,280.140015,272.579987,273.709991,19807000
2025-04-08,521.859985,524.97998,489.160004,496.480011,165816600,438.160004,443.140015,409.790009,416.059998,101248100,277.869995,278.320007,274.23999,275.200012,12639500
2025-04-09,493.440002,548.619995,493.049988,548.619995,241867300,415.570007,467.829987,415.429993,466.0,142876900,282.220001,285.869995,281.040009,285.380005,25342200
2025-04-10,532.169983,533.5,509.320007,524.580017,162331200,453.559998,455.589996,432.630005,446.179993,108384100,288.809998,292.859985,287.230011,292.350006,19837800


In [13]:
# Inspect the row index (a DatetimeIndex named 'Date').
df_yf_raw.index

DatetimeIndex(['2025-04-01', '2025-04-02', '2025-04-03', '2025-04-04',
               '2025-04-07', '2025-04-08', '2025-04-09', '2025-04-10'],
              dtype='datetime64[ns]', name='Date', freq=None)

In [14]:
# Inspect the column index (a two-level MultiIndex of (Ticker, Price) pairs).
df_yf_raw.columns

MultiIndex([('SPY',   'Open'),
            ('SPY',   'High'),
            ('SPY',    'Low'),
            ('SPY',  'Close'),
            ('SPY', 'Volume'),
            ('QQQ',   'Open'),
            ('QQQ',   'High'),
            ('QQQ',    'Low'),
            ('QQQ',  'Close'),
            ('QQQ', 'Volume'),
            ('GLD',   'Open'),
            ('GLD',   'High'),
            ('GLD',    'Low'),
            ('GLD',  'Close'),
            ('GLD', 'Volume')],
           names=['Ticker', 'Price'])

In [18]:
# Take the first date in the index as a sample row to explore.
index_0 = df_yf_raw.index[0]
# A single row is a pandas Series with a MultiIndex (Ticker, Price)
df_yf_raw_0 = df_yf_raw.loc[index_0]

In [26]:
# Move the innermost index level (Price) into columns, then convert to a nested
# dict of the form {price_field: {ticker: value}}.
df_yf_raw_0.unstack().to_dict()

{'Close': {'GLD': 287.57000732421875,
  'QQQ': 472.70001220703125,
  'SPY': 560.969970703125},
 'High': {'GLD': 289.1300048828125,
  'QQQ': 473.6300048828125,
  'SPY': 562.9400024414062},
 'Low': {'GLD': 285.9100036621094,
  'QQQ': 464.4200134277344,
  'SPY': 553.6799926757812},
 'Open': {'GLD': 288.5400085449219,
  'QQQ': 467.29998779296875,
  'SPY': 557.4500122070312},
 'Volume': {'GLD': 15923600.0, 'QQQ': 41156200.0, 'SPY': 54609600.0}}

In [20]:
# The columns form a (Ticker, Price) MultiIndex. Move the Ticker level from the
# columns into the rows, then turn the resulting (Date, Ticker) index back into
# ordinary columns so each row is one date/ticker pair.
df_flattened = (
    df_yf_raw
    .stack(level="Ticker", future_stack=True)
    .reset_index()
)

In [22]:
# Preview the first rows of the flattened frame (one row per date and ticker).
df_flattened.head(n = 5)

Price,Date,Ticker,Open,High,Low,Close,Volume
0,2025-04-01,SPY,557.450012,562.940002,553.679993,560.969971,54609600
1,2025-04-01,QQQ,467.299988,473.630005,464.420013,472.700012,41156200
2,2025-04-01,GLD,288.540009,289.130005,285.910004,287.570007,15923600
3,2025-04-02,SPY,555.049988,567.419983,554.809998,564.52002,76014500
4,2025-04-02,QQQ,466.119995,479.559998,465.859985,476.149994,49894500


In [23]:
# Inspect the row index of the flattened frame (a plain RangeIndex).
df_flattened.index

RangeIndex(start=0, stop=24, step=1)

In [25]:
# Inspect the flattened columns; the columns Index is still named 'Price'.
df_flattened.columns

Index(['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Volume'], dtype='object', name='Price')

In [None]:
# DDL for the staging table: one row per (ticker, timestamp), which is the primary
# key. IF NOT EXISTS makes the cell safe to re-run.
create_price_table_staging = """
CREATE TABLE IF NOT EXISTS prices_staging (
    ticker TEXT NOT NULL,
    timestamp TIMESTAMPTZ NOT NULL,
    price NUMERIC,
    volume NUMERIC,
    frequency TEXT DEFAULT 'daily',
    source_api TEXT,
    created_timestamp TIMESTAMPTZ DEFAULT NOW(),
    raw_payload JSONB,
    PRIMARY KEY (ticker, timestamp)
);
"""

# `with conn` commits when the block succeeds and rolls back when it raises, so a
# failed statement does not leave the shared connection stuck in an aborted
# transaction for later cells. It does not close the connection.
with conn:
    cursor.execute(create_price_table_staging)