In [1]:
# Import required libraries
import yfinance as yf
import pandas as pd
import sqlite3
import os
from datetime import datetime
from tqdm.notebook import tqdm

# Define the data directory
DATA_DIR = "./data"

In [8]:
def ensure_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)


def get_and_store_data(tickers):
    valid_tickers = []

    # Ensure data directory exists
    ensure_dir(DATA_DIR)

    # Create subdirectories for SQLite and Parquet files
    sqlite_dir = os.path.join(DATA_DIR, "sqlite")
    parquet_dir = os.path.join(DATA_DIR, "parquet")
    ensure_dir(sqlite_dir)
    ensure_dir(parquet_dir)

    # SQLite database path
    db_path = os.path.join(sqlite_dir, "full_stock_data.sqlite")
    conn = sqlite3.connect(db_path)

    # Create progress bar
    pbar = tqdm(tickers, desc="Processing Stocks", unit="stock")

    for ticker in pbar:
        try:
            # if file already exists, skip
            if os.path.exists(os.path.join(parquet_dir, f"{ticker}.parquet")):
                pbar.set_description(f"Skipping {ticker} (already processed)")
                valid_tickers.append(ticker)
                continue

            stock = yf.Ticker(ticker)

            # Get all available history
            history = stock.history(period="max")

            if not history.empty:
                valid_tickers.append(ticker)

                # Store in SQLite
                history.to_sql(ticker, conn, if_exists="replace")

                # Store as Parquet
                parquet_path = os.path.join(parquet_dir, f"{ticker}.parquet")
                history.to_parquet(parquet_path)

                # Update progress bar description
                pbar.set_description(
                    f"Processed {ticker} ({history.index[0].date()} to {history.index[-1].date()})"
                )
            else:
                pbar.set_description(f"No data for {ticker}")

        except Exception as e:
            pbar.set_description(
                f"Error with {ticker}: {str(e)[:50]}..."
            )  # Truncate long error messages

    conn.close()
    return valid_tickers


# Get S&P 500 components
sp500 = pd.read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")[0]
sp500_tickers = sp500["Symbol"].tolist()

print("Starting data collection and storage process...")
valid_tickers = get_and_store_data(sp500_tickers)

print(f"\nProcess completed. Total valid tickers: {len(valid_tickers)}")
print(f"First 10 valid tickers: {valid_tickers[:10]}")


Starting data collection and storage process...


Processing Stocks:   0%|          | 0/503 [00:00<?, ?stock/s]

$BRK.B: possibly delisted; no timezone found
$BF.B: possibly delisted; no price data found  (1d 1925-09-16 -> 2024-08-22)



Process completed. Total valid tickers: 501
First 10 valid tickers: ['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A']


In [9]:
# read data from sqlite

def read_data(ticker, source='sqlite'):
    if source == 'sqlite':
        db_path = os.path.join(DATA_DIR, "sqlite", 'full_stock_data.sqlite')
        with sqlite3.connect(db_path) as conn:
            return pd.read_sql(f"SELECT * FROM '{ticker}'", conn)
    elif source == 'parquet':
        parquet_path = os.path.join(DATA_DIR, "parquet", f"{ticker}.parquet")
        return pd.read_parquet(parquet_path)
    else:
        raise ValueError("Invalid source. Choose 'sqlite' or 'parquet'.")


data = read_data('AAPL')
print(data.head())


                        Date      Open      High       Low     Close  \
0  1980-12-12 00:00:00-05:00  0.098943  0.099373  0.098943  0.098943   
1  1980-12-15 00:00:00-05:00  0.094211  0.094211  0.093781  0.093781   
2  1980-12-16 00:00:00-05:00  0.087328  0.087328  0.086898  0.086898   
3  1980-12-17 00:00:00-05:00  0.089049  0.089479  0.089049  0.089049   
4  1980-12-18 00:00:00-05:00  0.091630  0.092061  0.091630  0.091630   

      Volume  Dividends  Stock Splits  
0  469033600        0.0           0.0  
1  175884800        0.0           0.0  
2  105728000        0.0           0.0  
3   86441600        0.0           0.0  
4   73449600        0.0           0.0  
