In [6]:
# Import required libraries
import yfinance as yf
import pandas as pd
import sqlite3
import os
from datetime import datetime
from tqdm.notebook import tqdm

# Define the data directory
DATA_DIR = "./data"

Starting data collection and storage process...


Processed BDX (1973-02-21 to 2024-08-22):  12%|█▏        | 62/503 [00:33<05:02,  1.46stock/s]  $BRK.B: possibly delisted; no timezone found
Processed BRO (1981-02-11 to 2024-08-22):  15%|█▌        | 77/503 [00:43<03:43,  1.91stock/s] $BF.B: possibly delisted; no price data found  (1d 1925-09-16 -> 2024-08-22)
Processed ZTS (2013-02-01 to 2024-08-22): 100%|██████████| 503/503 [05:48<00:00,  1.44stock/s]  


Process completed. Total valid tickers: 501
First 10 valid tickers: ['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A']

Example data for MMM:
Data from SQLite - First 5 rows:
                        Date  Open      High       Low     Close  Volume  \
0  1962-01-02 00:00:00-05:00   0.0  0.587004  0.569739  0.574055  254509   
1  1962-01-03 00:00:00-05:00   0.0  0.578371  0.564344  0.578371  505190   
2  1962-01-04 00:00:00-05:00   0.0  0.588083  0.578371  0.578371  254509   
3  1962-01-05 00:00:00-05:00   0.0  0.576213  0.561107  0.563265  376979   
4  1962-01-08 00:00:00-05:00   0.0  0.564343  0.556790  0.560027  399942   

   Dividends  Stock Splits  
0        0.0           0.0  
1        0.0           0.0  
2        0.0           0.0  
3        0.0           0.0  
4        0.0           0.0  

Data from SQLite - Last 5 rows:
                            Date        Open        High         Low  \
15763  2024-08-16 00:00:00-04:00  126.739998  127.339996  126.019997




In [None]:
def ensure_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

def get_and_store_data(tickers):
    valid_tickers = []
    
    # Ensure data directory exists
    ensure_dir(DATA_DIR)
    
    # Create subdirectories for SQLite and Parquet files
    sqlite_dir = os.path.join(DATA_DIR, "sqlite")
    parquet_dir = os.path.join(DATA_DIR, "parquet")
    ensure_dir(sqlite_dir)
    ensure_dir(parquet_dir)
    
    # SQLite database path
    db_path = os.path.join(sqlite_dir, 'full_stock_data.sqlite')
    conn = sqlite3.connect(db_path)
    
    # Create progress bar
    pbar = tqdm(tickers, desc="Processing Stocks", unit="stock")
    
    for ticker in pbar:
        try:
            stock = yf.Ticker(ticker)
            
            # Get all available history
            history = stock.history(period="max")
            
            if not history.empty:
                valid_tickers.append(ticker)
                
                # Store in SQLite
                history.to_sql(ticker, conn, if_exists='replace')
                
                # Store as Parquet
                parquet_path = os.path.join(parquet_dir, f"{ticker}.parquet")
                history.to_parquet(parquet_path)
                
                # Update progress bar description
                pbar.set_description(f"Processed {ticker} ({history.index[0].date()} to {history.index[-1].date()})")
            else:
                pbar.set_description(f"No data for {ticker}")
        
        except Exception as e:
            pbar.set_description(f"Error with {ticker}: {str(e)[:50]}...")  # Truncate long error messages
    
    conn.close()
    return valid_tickers

def read_data(ticker, source='sqlite'):
    if source == 'sqlite':
        db_path = os.path.join(DATA_DIR, "sqlite", 'full_stock_data.sqlite')
        with sqlite3.connect(db_path) as conn:
            return pd.read_sql(f"SELECT * FROM '{ticker}'", conn)
    elif source == 'parquet':
        parquet_path = os.path.join(DATA_DIR, "parquet", f"{ticker}.parquet")
        return pd.read_parquet(parquet_path)
    else:
        raise ValueError("Invalid source. Choose 'sqlite' or 'parquet'.")

# Get S&P 500 components
sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
sp500_tickers = sp500['Symbol'].tolist()

print("Starting data collection and storage process...")
valid_tickers = get_and_store_data(sp500_tickers)

print(f"\nProcess completed. Total valid tickers: {len(valid_tickers)}")
print(f"First 10 valid tickers: {valid_tickers[:10]}")


In [None]:

# Example of reading data
if valid_tickers:
    example_ticker = valid_tickers[0]
    sqlite_data = read_data(example_ticker, 'sqlite')
    parquet_data = read_data(example_ticker, 'parquet')
    
    print(f"\nExample data for {example_ticker}:")
    print(f"Data from SQLite - First 5 rows:")
    print(sqlite_data.head())
    print(f"\nData from SQLite - Last 5 rows:")
    print(sqlite_data.tail())
    print(f"\nTotal rows in SQLite: {len(sqlite_data)}")
    
    print(f"\nData from Parquet - First 5 rows:")
    print(parquet_data.head())
    print(f"\nData from Parquet - Last 5 rows:")
    print(parquet_data.tail())
    print(f"\nTotal rows in Parquet: {len(parquet_data)}")
else:
    print("No valid tickers found.")