Filling data from `1996 - 2025` of S & P 500 data using `yfinance`.

In [35]:
import yfinance as yf
from dotenv import load_dotenv
import os
from sqlalchemy import create_engine
import pandas as pd
from datetime import date

load_dotenv()

db_url = os.getenv('DB_URL')
db_conn = create_engine(db_url)

constituents_table = os.getenv('CONSTIUENTS_TABLE')
prices_table = os.getenv('PRICES_TABLE')

data_path = os.getenv('DATA_PATH')

Up to you to have an appropiate database url and table set up. There is:
1. `Date`: a `date` column.
2. `Close, High, Low, Open`: price columns; I would take 4 decimal points.
3. `Volum`: volume columns of type integer.

In [60]:
def get_ticker_data(ticker: str, start: str, end: str):
    df = yf.download(ticker, start=start, end=end, auto_adjust=True)
    df.columns = df.columns.map(lambda x: x[0] if isinstance(x, tuple) else x)
    df = df.reset_index()
    df.index.name = None
    df['ticker'] = ticker

    for col in ['Close', 'High', 'Low', 'Open']:
        df[col] = round(df[col], 4)

    df.rename(columns = {
        'Date': 'date',
        'Close': 'adj_close',
        'High': 'high',
        'Low': 'low',
        'Open': 'open',
        'Volume': 'volume',
        'ticker': 'ticker'
    }, inplace=True)

    return df

In [None]:
# quick e.g of how this works
get_ticker_data('AAPL', '2023-01-01', '2025-01-01')

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,date,adj_close,high,low,open,volume,ticker
0,2023-01-03,123.470604,129.226044,122.582112,128.613978,112117500,AAPL
1,2023-01-04,124.744110,127.014701,123.480480,125.267331,89113600,AAPL
2,2023-01-05,123.421249,126.136083,123.164580,125.504267,80962700,AAPL
3,2023-01-06,127.962410,128.623840,123.292902,124.398582,87754700,AAPL
4,2023-01-09,128.485626,131.703947,128.228957,128.801541,70790800,AAPL
...,...,...,...,...,...,...,...
497,2024-12-24,257.578674,257.588630,254.675658,254.875189,23234700,AAPL
498,2024-12-26,258.396667,259.474086,257.010028,257.568678,27237100,AAPL
499,2024-12-27,254.974930,258.077462,252.451019,257.209530,42355300,AAPL
500,2024-12-30,251.593094,252.889969,250.146586,251.623020,35557500,AAPL


The below code changes a historical database of `S & P 500` constituents I found online (![]())

In [16]:
s_and_p_500_historical = pd.read_csv(f'{data_path}/s_and_p_500.csv')

s_and_p_500_historical['tickers'] = s_and_p_500_historical['tickers'].str.split(',')
s_and_p_500_historical['date'] = pd.to_datetime(s_and_p_500_historical['date'])

s_and_p_500_historical = s_and_p_500_historical.sort_values('date')

active = set()
records = []

for i, row in s_and_p_500_historical.iterrows():
    current_date = row['date']
    current_tickers = set(row['tickers'])

    entered = current_tickers - active
    for ticker in entered:
        records.append({'ticker': ticker, 'start_date': current_date, 'end_date': None})

    left = active - current_tickers
    for ticker in left:
        for rec in reversed(records):
            if rec['ticker'] == ticker and rec['end_date'] is None:
                rec['end_date'] = current_date
                break

    active = current_tickers

s_and_p_timeline = pd.DataFrame(records)

Write the constituents to a database table if you wish. The `psql` table I used is located in `/data/sql/constituents.sql`

In [None]:
s_and_p_timeline.to_sql(constituents_table, db_conn, if_exists='append', index=False)

230

Okay now we fill the actual prices. I worry that I will be blocked pretty quickly from pulling, so I'll add a try catch and loop. 

In [68]:
aapl_idx = s_and_p_timeline[s_and_p_timeline['ticker'] == 'AAPL'].index[0]

In [69]:
continue_fill = s_and_p_timeline.iloc[aapl_idx:]

In [71]:
for _, row in continue_fill.iterrows():
    if row['start_date'] is pd.NaT:
        continue # something has gone wrong here... but constituents table should be fairly good.

    # actually we can only pull currently listed stocks

    ticker = row['ticker']
    start_date = pd.to_datetime(row['start_date']).strftime('%Y-%m-%d')
    end_date = row['end_date'] if row['end_date'] is not pd.NaT else date.today().strftime('%Y-%m-%d')

    ticker_price_data = get_ticker_data(ticker, start_date, end_date)
    ticker_price_data.to_sql(prices_table, db_conn, if_exists='append', index=False)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['CGP']: YFPricesMissingError('possibly delisted; no price data found  (1d 1996-01-02 -> 2001-01-30 00:00:00) (Yahoo error = "Data doesn\'t exist for startDate = 820558800, endDate = 980830800")')
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['TMK']: YFTzMissingError('possibly delisted; no timezone found')
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[******

In [None]:
# break up into dump files
total_rows = pd.read_sql("SELECT COUNT(*) FROM adjusted_historical", db_conn).iloc[0, 0]
num_parts = 5
chunk_size = total_rows // num_parts

for i in range(num_parts):
    offset = i * chunk_size
    limit = chunk_size if i < num_parts - 1 else total_rows - offset  # ensure we get all remaining rows

    query = f"""
        SELECT * FROM adjusted_historical
        ORDER BY date, ticker
        OFFSET {offset} LIMIT {limit}
    """

    df_chunk = pd.read_sql(query, db_conn)
    df_chunk.to_csv(f"../dump/adjusted_historical_part_{i+1}.csv", index=False)

In [74]:
total_rows

2670101

In [45]:
ticker_price_data

Unnamed: 0,date,Adj Close,adj_close,high,low,open,volume,ticker


In [42]:
start_date

'2025-07-09'