In [76]:
import os
import sys

import pandas as pd
from datetime import date

# Required to import other modules from this project, in folders such as utils/ or notebooks/
current_folder = os.getcwd()
project_root_folder = os.path.abspath(os.path.join(current_folder, ".."))
sys.path.append(project_root_folder)
# Import functionalities from other modules in this project
from utils.db_utils import *

# Load .env file for AWS RDS login credentials
from dotenv import load_dotenv
dotenv_path = os.path.join(project_root_folder, ".env")
load_dotenv(dotenv_path)

# Global variables
conn, cursor = connect_to_rds()

✅ Connected successfully!


### Extract from staging table

In [19]:
# Test data
start_date = '2025-06-02'
end_date = '2025-06-03'
tickers = ["SPY", "GLD"]
ticker_list_as_str = "'" + "','".join(ticker for ticker in tickers) + "'"

In [27]:
def extract_raw_data_from_staging(start_date: date, end_date: date, tickers: List[str], cursor: Cursor, conn: Connection):
    """
    Given start_date, end_date, and list of tickers, extract raw/unadjusted price & volume data from tbl_tiingo_daily_staging, as a Pandas dataframe
    """

    # Example: 'SPY','GLD'
    ticker_list_as_str = "'" + "','".join(ticker for ticker in tickers) + "'"

    query = f"""
    select
        ticker, business_date,
        open, high, low, close, volume,
        div_cash, split_factor
    from tbl_tiingo_daily_staging
    where business_date between '{start_date}' and '{end_date}' and ticker in ({ticker_list_as_str})
    order by ticker, business_date;
    """

    df = sql_query_as_df(query, cursor)
    return df

In [None]:
extract_raw_data_from_staging(start_date, end_date, tickers, cursor, conn)


    select
        ticker, business_date,
        open, high, low, close, volume,
        div_cash, split_factor
    from tbl_tiingo_daily_staging
    where business_date between '2025-06-02' and '2025-06-03' and ticker in ('SPY','GLD')
    order by ticker, business_date;
    


Unnamed: 0,ticker,business_date,open,high,low,close,volume,div_cash,split_factor
0,GLD,2025-06-02,309.21,311.72,308.72,311.67,13593232,0.0,1.0
1,GLD,2025-06-03,309.56,309.56,307.15,308.91,9315081,0.0,1.0
2,SPY,2025-06-02,587.76,592.79,585.06,592.71,61630502,0.0,1.0
3,SPY,2025-06-03,592.34,597.08,591.85,596.09,63606204,0.0,1.0


### Adjust for corporate actions for a single stock

In [24]:
# Create example dataframe of raw prices + corporate actions, where the price moves strictly due to corporate actions and actually has zero returns every day for an investor of the stock
df = pd.DataFrame({
    "business_date": pd.to_datetime([
        "2025-06-01", "2025-06-02", "2025-06-03", "2025-06-04", "2025-06-05", "2025-06-06", "2025-06-07", "2025-06-08"
    ]),
    "raw_price": [306, 306, 300, 100, 100, 98, 49, 49],
    "div_cash": [0, 0, 6, 0, 0, 2, 0, 0],
    "split_factor": [1, 1, 1, 3, 1, 1, 2, 1]
})

In [25]:
df["adj_price"] = df["raw_price"].copy()

In [26]:
# Test works: we prove that the adjusted price for this stock is $49 the whole way through
for i in range(len(df)):
    dividend = df.iloc[i]["div_cash"]
    split_ratio = df.iloc[i]["split_factor"]
    
    if dividend > 0:
        df.loc[:i-1, "adj_price"] = df.loc[:i-1, "adj_price"] - dividend
    
    if split_ratio != 1:
        df.loc[:i-1, "adj_price"] = df.loc[:i-1, "adj_price"] / split_ratio
    
    print("i:",i)
    print("dividend:",dividend)
    print("split_ratio:",split_ratio)
    print(df)
    print()

i: 0
dividend: 0
split_ratio: 1
  business_date  raw_price  div_cash  split_factor  adj_price
0    2025-06-01        306         0             1        306
1    2025-06-02        306         0             1        306
2    2025-06-03        300         6             1        300
3    2025-06-04        100         0             3        100
4    2025-06-05        100         0             1        100
5    2025-06-06         98         2             1         98
6    2025-06-07         49         0             2         49
7    2025-06-08         49         0             1         49

i: 1
dividend: 0
split_ratio: 1
  business_date  raw_price  div_cash  split_factor  adj_price
0    2025-06-01        306         0             1        306
1    2025-06-02        306         0             1        306
2    2025-06-03        300         6             1        300
3    2025-06-04        100         0             3        100
4    2025-06-05        100         0             1        100
5    

In [None]:
# Show final result
df

Unnamed: 0,business_date,raw_price,div_cash,split_factor,adj_price
0,2025-06-01,306,0,1,49
1,2025-06-02,306,0,1,49
2,2025-06-03,300,6,1,49
3,2025-06-04,100,0,3,49
4,2025-06-05,100,0,1,49
5,2025-06-06,98,2,1,49
6,2025-06-07,49,0,2,49
7,2025-06-08,49,0,1,49


### Try another example

In [43]:
# Create example dataframe of raw prices + corporate actions, where the price moves strictly due to corporate actions and actually has zero returns every day for an investor of the stock
df = pd.DataFrame({
    "business_date": pd.to_datetime([
        "2025-06-01", "2025-06-02", "2025-06-03", "2025-06-04", "2025-06-05", "2025-06-06", "2025-06-07", "2025-06-08"
    ]),
    "raw_price": [100, 99, 101, 50, 49, 50, 51, 25],
    "div_cash": [0, 1, 0, 0, 1, 0, 0, 0],
    "split_factor": [1, 1, 1, 2, 1, 1, 1, 2]
})

In [44]:
df["adj_price"] = df["raw_price"].copy()

In [45]:
# Test works: we prove that the adjusted price for this stock is $49 the whole way through
for i in range(len(df)):
    dividend = df.iloc[i]["div_cash"]
    split_ratio = df.iloc[i]["split_factor"]
    
    if dividend > 0:
        df.loc[:i-1, "adj_price"] = df.loc[:i-1, "adj_price"] - dividend
    
    if split_ratio != 1:
        df.loc[:i-1, "adj_price"] = df.loc[:i-1, "adj_price"] / split_ratio
    
    print("i:",i)
    print("dividend:",dividend)
    print("split_ratio:",split_ratio)
    print(df)
    print()

i: 0
dividend: 0
split_ratio: 1
  business_date  raw_price  div_cash  split_factor  adj_price
0    2025-06-01        100         0             1        100
1    2025-06-02         99         1             1         99
2    2025-06-03        101         0             1        101
3    2025-06-04         50         0             2         50
4    2025-06-05         49         1             1         49
5    2025-06-06         50         0             1         50
6    2025-06-07         51         0             1         51
7    2025-06-08         25         0             2         25

i: 1
dividend: 1
split_ratio: 1
  business_date  raw_price  div_cash  split_factor  adj_price
0    2025-06-01        100         0             1         99
1    2025-06-02         99         1             1         99
2    2025-06-03        101         0             1        101
3    2025-06-04         50         0             2         50
4    2025-06-05         49         1             1         49
5    

  df.loc[:i-1, "adj_price"] = df.loc[:i-1, "adj_price"] / split_ratio


### Adjust for corporate actions for multiple stocks

In [90]:
df = pd.DataFrame({
    "ticker":["ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC", "ABC",
              "XYZ", "XYZ", "XYZ", "XYZ", "XYZ", "XYZ", "XYZ", "XYZ"],
    "business_date": pd.to_datetime([
        "2025-06-01", "2025-06-02", "2025-06-03", "2025-06-04", "2025-06-05", "2025-06-06", "2025-06-07", "2025-06-08",
        "2025-06-01", "2025-06-02", "2025-06-03", "2025-06-04", "2025-06-05", "2025-06-06", "2025-06-07", "2025-06-08"]),
    "raw_price": [306, 306, 300, 100, 100, 98, 49, 49,
                  100, 99, 101, 50, 49, 50, 51, 25],
    "div_cash": [0, 0, 6, 0, 0, 2, 0, 0,
                 0, 1, 0, 0, 1, 0, 0, 0],
    "split_factor": [1, 1, 1, 3, 1, 1, 2, 1,
                     1, 1, 1, 2, 1, 1, 1, 2]
})

In [91]:
df

Unnamed: 0,ticker,business_date,raw_price,div_cash,split_factor
0,ABC,2025-06-01,306,0,1
1,ABC,2025-06-02,306,0,1
2,ABC,2025-06-03,300,6,1
3,ABC,2025-06-04,100,0,3
4,ABC,2025-06-05,100,0,1
5,ABC,2025-06-06,98,2,1
6,ABC,2025-06-07,49,0,2
7,ABC,2025-06-08,49,0,1
8,XYZ,2025-06-01,100,0,1
9,XYZ,2025-06-02,99,1,1


In [95]:
def adj_corp_actions_for_one_stock(df_ticker: pd.DataFrame) -> pd.DataFrame:

    # Make a copy to avoid mutating the original dataframe
    df_ticker = df_ticker.copy()

    # Reset index of df_ticker to start from 0. Otherwise, if we don't do this, we will inherit index of df, and this iterative algorithm will fail for stock 2 onwards, and only work for the first stock in df
    df_ticker = df_ticker.reset_index(drop=True)

    # Cast to float to avoid issues with division and subtract later on
    df_ticker["adj_price"] = df_ticker["raw_price"].astype(float)
    
    # Iterate through every business_date for this stock, from past to present
    for i in df_ticker.index:
        
        dividend = df_ticker.loc[i,"div_cash"]
        split_ratio = df_ticker.loc[i,"split_factor"]
        
        # If there is a dividend payment today, subtract all prices from yesterday & before by dividend amount to make history comparable to today
        if dividend > 0:
            df_ticker.loc[:i-1, "adj_price"] -= dividend
        
        # If there is a stock split today, divide all prices from yesterday & before by split_ratio to make history comparable to today
        if split_ratio != 1:
            df_ticker.loc[:i-1, "adj_price"] /= split_ratio
    
    return df_ticker

In [96]:
# group_keys = False to avoid multi-layer index, i.e. - to keep rows keyed on (ticker, business_date)
# include_groups = True to include ticker as a column in the resulting output dataframe. If false, ticker will not appear in result
def adj_corp_actions(df: pd.DataFrame) -> pd.DataFrame:
    result = df.groupby("ticker", group_keys = False).apply(adj_corp_actions_for_one_stock, include_groups = True)
    return result

In [97]:
adj_corp_actions(df)

  result = df.groupby("ticker", group_keys = False).apply(adj_corp_actions_for_one_stock, include_groups = True)


Unnamed: 0,ticker,business_date,raw_price,div_cash,split_factor,adj_price
0,ABC,2025-06-01,306,0,1,49.0
1,ABC,2025-06-02,306,0,1,49.0
2,ABC,2025-06-03,300,6,1,49.0
3,ABC,2025-06-04,100,0,3,49.0
4,ABC,2025-06-05,100,0,1,49.0
5,ABC,2025-06-06,98,2,1,49.0
6,ABC,2025-06-07,49,0,2,49.0
7,ABC,2025-06-08,49,0,1,49.0
0,XYZ,2025-06-01,100,0,1,24.25
1,XYZ,2025-06-02,99,1,1,24.25


### Test that adj_corp_actions actually computes the adjusted fields correctly, using Nvidia as an example

In [24]:
def adj_corp_actions_for_one_stock(df_ticker: pd.DataFrame) -> pd.DataFrame:
    """
    Given a dataframe of raw/unadjusted price history for a single ticker, sorted by business_date, iterate from oldest business_date to most recent business_date, using dividend payment and stock split ratio information, to calculate adjusted fields that reflect the true return an investor holding the stock would have earned. Dataframe must contain columns:
    - Business_date: business_date (ascending / from oldest to newest)
    - Unadjusted fields: open (opening price), high (high price), low (low price), close (closing price), volume
    - Dollar of dividends paid that day: div_cash. 0 = no dividend that day (default), 1 = $1 dividend was paid out that day, etc.
    - Stock split ratio: split_factor. 1 = no stock split that day (default), 2 = a stock of $100 split into 2 shares of $50 that date, etc.
    Returns dataframe with additional columns: adj_open, adj_high, adj_low, adj_close, adj_volume
    """

    # TODO: I need to actually test this and validate that it works on Tingo's stock price data for at least one stock for a few business_date

    # Make a copy to avoid mutating the original dataframe
    df_ticker = df_ticker.copy()

    # Reset index of df_ticker to start from 0. Otherwise, if we don't do this, we will inherit index of df, and this iterative algorithm will fail for stock 2 onwards, and only work for the first stock in df
    df_ticker = df_ticker.reset_index(drop=True)

    # There are 4 flavors of price to adjust for corporate actions
    price_fields = ["open", "high", "low", "close"]
    volume_fields = ["volume"]  # I know this redundant but in the future we might have more volume fields such as short_volume, etc.
    fields = price_fields + volume_fields

    # Cast to float to avoid issues with division and subtract later on
    for field in fields:
        df_ticker[f"adj_{field}"] = df_ticker[field].astype(float)

    # Iterate through every business_date for this stock, from past to present
    for i in df_ticker.index:
        
        dividend = df_ticker.loc[i,"div_cash"]
        split_ratio = df_ticker.loc[i,"split_factor"]
        
        # If there is a dividend payment today, subtract all prices from yesterday & before by dividend amount to make history comparable to today
        if dividend > 0:
            for price_field in price_fields:
                df_ticker.loc[:i-1, f"adj_{price_field}"] -= dividend        
        
        # If there is a stock split today, divide all prices from yesterday & before by split_ratio to make history comparable to today
        if split_ratio != 1:
            for price_field in price_fields:
                df_ticker.loc[:i-1, f"adj_{price_field}"] /= split_ratio
            for volume_field in volume_fields:
                df_ticker.loc[:i-1, f"adj_{volume_field}"] *= split_ratio
    
    return df_ticker

In [25]:
# group_keys = False to avoid multi-layer index, i.e. - to keep rows keyed on (ticker, business_date)
# include_groups = True to include ticker as a column in the resulting output dataframe. If false, ticker will not appear in result
def adj_corp_actions(df: pd.DataFrame) -> pd.DataFrame:
    """
    Applies function adj_corp_actions_for_one_stock to multiple stocks. Input dataframe df must contain columns:
    - Keys: ticker, business_date
    - Unadjusted fields: open (opening price), high (high price), low (low price), close (closing price), volume
    - Dollar of dividends paid that day: div_cash. 0 = no dividend that day (default), 1 = $1 dividend was paid out that day, etc.
    - Stock split ratio: split_factor. 1 = no stock split that day (default), 2 = a stock of $100 split into 2 shares of $50 that date, etc.
    """
    result = df.groupby("ticker", group_keys = False).apply(adj_corp_actions_for_one_stock, include_groups = True)
    return result

In [30]:
# Test this on Nvidia's raw stock price, which experienced several stock split and dividends
start_date = '2024-01-02'
end_date = '2025-06-03'
tickers = ['NVDA']
df_NVDA_raw = extract_raw_data_from_staging(start_date, end_date, tickers, cursor, conn)

In [None]:
# This is what my calculated adj_fields look like
df_NVDA_adj_calc = adj_corp_actions(df_NVDA_raw)
df_NVDA_adj_calc.head(n = 5)

  result = df.groupby("ticker", group_keys = False).apply(adj_corp_actions_for_one_stock, include_groups = True)


Unnamed: 0,ticker,business_date,open,high,low,close,volume,div_cash,split_factor,adj_open,adj_high,adj_low,adj_close,adj_volume
0,NVDA,2024-01-02,492.44,492.95,475.95,481.68,41125422,0.0,1.0,49.2,49.251,47.551,48.124,411254220.0
1,NVDA,2024-01-03,474.85,481.841,473.2,475.69,32089617,0.0,1.0,47.441,48.1401,47.276,47.525,320896170.0
2,NVDA,2024-01-04,477.67,485.0,475.08,479.98,30653489,0.0,1.0,47.723,48.456,47.464,47.954,306534890.0
3,NVDA,2024-01-05,484.62,495.47,483.0601,490.97,41080455,0.0,1.0,48.418,49.503,48.26201,49.053,410804550.0
4,NVDA,2024-01-08,495.12,522.75,494.79,522.53,64250990,0.0,1.0,49.468,52.231,49.435,52.209,642509900.0


In [None]:
# Get the actual adj_fields from Tiingo's API and compare
query = """
select ticker, business_date, adj_close, adj_volume
from tbl_tiingo_daily_staging
where ticker = 'NVDA'
order by business_date;
"""

df_NVDA_adj_tiingo = sql_query_as_df(query, cursor)
df_NVDA_adj_tiingo.head(n = 5)

Unnamed: 0,ticker,business_date,adj_close,adj_volume
0,NVDA,2024-01-02,48.146883,411254220
1,NVDA,2024-01-03,47.548145,320896170
2,NVDA,2024-01-04,47.976957,306534890
3,NVDA,2024-01-05,49.075475,410804550
4,NVDA,2024-01-08,52.230092,642509900


In [44]:
cols = ["business_date","adj_close", "adj_volume"]
corp_action_cols = ["div_cash", "split_factor"]

df_NVDA_adj_comparison = pd.merge(
    left = df_NVDA_adj_calc[cols + corp_action_cols], 
    right = df_NVDA_adj_tiingo[cols],
    how = "inner",
    on = "business_date",
    suffixes = ["_calc", "_tiingo"]
)

In [60]:
# TODO: Is this difference of a few cents due to rounding error, or is my algorithm actually incorrect?
# Hard to tell when the dividend payment itself is literally $0.01 in some cases
df_NVDA_adj_comparison["adj_close_diff"] = df_NVDA_adj_comparison["adj_close_calc"] - df_NVDA_adj_comparison["adj_close_tiingo"]

In [64]:
df_NVDA_adj_comparison.to_excel("NVDA_adj_comparison.xlsx")

### Try another stock? Tesla

In [74]:
# Test this on Nvidia's raw stock price, which experienced several stock split and dividends
def compare_adj_price(ticker, start_date, end_date):

    tickers = [ticker]
    df_raw = extract_raw_data_from_staging(start_date, end_date, tickers, cursor, conn)
    df_adj_calc = adj_corp_actions(df_raw)

    query = f"""
    select ticker, business_date, adj_close, adj_volume
    from tbl_tiingo_daily_staging
    where ticker = '{ticker}' and business_date between '{start_date}' and '{end_date}'
    order by business_date;
    """
    df_adj_tiingo = sql_query_as_df(query, cursor)

    cols = ["business_date","adj_close", "adj_volume"]
    corp_action_cols = ["div_cash", "split_factor"]

    df_adj_comparison = pd.merge(
        left = df_adj_calc[cols + corp_action_cols], 
        right = df_adj_tiingo[cols],
        how = "inner",
        on = "business_date",
        suffixes = ["_calc", "_tiingo"]
    )

    df_adj_comparison["adj_close_diff_cents"] = 100 * (df_adj_comparison["adj_close_calc"] - df_adj_comparison["adj_close_tiingo"])

    return df_adj_comparison

In [77]:
df_adj_comparison_TSLA = compare_adj_price("TSLA", "2020-01-01", "2022-10-01")

  result = df.groupby("ticker", group_keys = False).apply(adj_corp_actions_for_one_stock, include_groups = True)


In [None]:
# Good to go!
df_adj_comparison_TSLA.to_excel("TSLA_adj_comparison.xlsx")


### Test percentage change calculator

In [79]:
def calculate_daily_returns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Given a dataframe of raw/unadjusted price history for a single ticker, sorted by business_date, iterate from oldest business_date to most recent business_date, using dividend payment and stock split ratio information, to calculate adjusted fields that reflect the true return an investor holding the stock would have earned. Dataframe must contain columns:
    - Business_date: business_date (ascending / from oldest to newest)
    - Unadjusted fields: open (opening price), high (high price), low (low price), close (closing price), volume
    - Dollar of dividends paid that day: div_cash. 0 = no dividend that day (default), 1 = $1 dividend was paid out that day, etc.
    - Stock split ratio: split_factor. 1 = no stock split that day (default), 2 = a stock of $100 split into 2 shares of $50 that date, etc.
    Returns dataframe with additional columns: adj_open, adj_high, adj_low, adj_close, adj_volume
    """

    """
    Calculate daily percentage change for adj_open, adj_close, adj_volume, over business_date, for each ticker in df
    """

    # TODO: This might not be necessary because I already sort by (ticker, business_date) in extract_raw_data_from_staging
    df.sort_values(["ticker", "business_date"], inplace = True)

    cols = ["adj_open", "adj_close", "adj_volume"]

    for col in cols:
        df[f"{col}_pct_chg"] = df.groupby("ticker")[col].pct_change()
        df[f"{col}_pct_chg"]  = 100 * df[f"{col}_pct_chg"]

    return df

In [80]:
df = pd.DataFrame({
    "ticker":["ABC", "ABC", "ABC", "ABC", "ABC",
              "XYZ", "XYZ", "XYZ", "XYZ", "XYZ"],
    "business_date": pd.to_datetime([
        "2025-06-01", "2025-06-02", "2025-06-03", "2025-06-04", "2025-06-05", 
        "2025-06-01", "2025-06-02", "2025-06-03", "2025-06-04", "2025-06-05"]),
    "adj_open": [100, 99, 100, 102, 101,
                 200, 100, 200, 150, 200],
    "adj_close": [105, 100, 105, 100, 95,
                 205, 200, 205, 200, 200],
    "adj_volume": [1000, 1000, 1200, 1050, 1000,
                   1000, 1100, 1000, 1500, 1000]
})

In [81]:
df_returns = calculate_daily_returns(df)

In [None]:
df_returns

Unnamed: 0,ticker,business_date,adj_open,adj_close,adj_volume,adj_open_pct_chg,adj_close_pct_chg,adj_volume_pct_chg
0,ABC,2025-06-01,100,105,1000,,,
1,ABC,2025-06-02,99,100,1000,-1.0,-4.761905,0.0
2,ABC,2025-06-03,100,105,1200,1.010101,5.0,20.0
3,ABC,2025-06-04,102,100,1050,2.0,-4.761905,-12.5
4,ABC,2025-06-05,101,95,1000,-0.980392,-5.0,-4.761905
5,XYZ,2025-06-01,200,205,1000,,,
6,XYZ,2025-06-02,100,200,1100,-50.0,-2.439024,10.0
7,XYZ,2025-06-03,200,205,1000,100.0,2.5,-9.090909
8,XYZ,2025-06-04,150,200,1500,-25.0,-2.439024,50.0
9,XYZ,2025-06-05,200,200,1000,33.333333,0.0,-33.333333
