In [None]:
# Final Revision of Capstone Stock Data Download

In [None]:
# Raw Stock Data Download Notes:
# File sequence # 1

# The purpose of this notebook is to download raw stock data from the
# yfinance library

# Inputs:
# A. Stock ticker symbols
# (imported from nyse_tickers)

# Outputs:
# A. Stock historical data for selected tickers



In [None]:
# y finance is the source of raw stock data for this project:
!pip install yfinance



In [None]:
# library imports

# data source
import yfinance as yf
# calculations
import pandas as pd
import numpy as np
# timing
import time
# file management
import os
from pathlib import Path
# csv grouping
import glob


In [None]:
# Mount Google Drive so the notebook can read and write files under
# /content/drive (Colab only).
from google.colab import drive
# Flush and unmount any existing mount first so that re-running this cell
# starts from a clean state (nothing happens on a fresh runtime, as the
# recorded output below shows).
drive.flush_and_unmount()
# mount/remount at /content/drive
drive.mount('/content/drive')

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [None]:

# Load the ticker symbols from the shared .txt file, one list entry per line.
txt_file = r"/content/drive/MyDrive/Capstone_Docs_Shared/nyse_tickers.txt"
# splitlines() removes the line terminators from each entry
lines = Path(txt_file).read_text().splitlines()
print("Number of Ticker Symbols Imported: ")
print(len(lines))

Number of Ticker Symbols Imported: 
3118


In [None]:
# Important note: the nature of data availability in yfinance necessitates
# an iterative, batch-by-batch download approach.
# Not all stock tickers are available at all times, and some tickers from the
# imported list are unavailable entirely.

In [None]:

# Tickers downloaded so far. Starts empty here and is repopulated from the
# saved CSV files by a later cell (via get_dl_tickers).
u_tickers = []

# Google Drive folder where the downloaded CSV files are written (the same
# folder the ticker list is read from).
dest_direct = r"/content/drive/MyDrive/Capstone_Docs_Shared"


In [None]:
# function to check stored csv files...
# ...for tickers already downloaded

def get_dl_tickers(dest_direct):
    """Collect the ticker symbols found in previously saved download files.

    Scans ``dest_direct`` for the per-batch CSV files and reads only their
    second column, which holds the ticker symbol in the flattened frame
    produced by ``data_download``.

    Args:
        dest_direct: Directory that holds the saved CSV files.

    Returns:
        set: Unique ticker symbols (as strings) found across all files;
        empty when no files exist yet.
    """
    ext = ".csv"
    # "stock_output" is the prefix save_to_csv() actually writes. The old
    # "yf_stock_output" prefix is still accepted so files saved under that
    # name (if any) keep being recognised.
    base_names = ("stock_output", "yf_stock_output")

    # get list of files in directory
    files = []
    for base_name in base_names:
        files.extend(glob.glob(f"{dest_direct}/{base_name}_*{ext}"))

    # set to store ticker symbols
    dl_tickers = set()

    for file in files:
        try:
            # only need to read the ticker column; read it as plain text so
            # that symbols such as "NA" are not converted to missing values
            df = pd.read_csv(file, usecols=[1], dtype=str, keep_default_na=False)
        except pd.errors.EmptyDataError:
            # a zero-byte file (e.g. an interrupted save) holds no tickers
            continue

        # take only unique, non-blank tickers:
        dl_tickers.update(t for t in df.iloc[:, 0].unique().tolist() if t)

    return dl_tickers



In [None]:
# Recover the ticker symbols that earlier runs already saved to disk.
# Note: the result is empty the first time users run the notebook.
u_tickers = list(get_dl_tickers(dest_direct))

print("Total Num Unique Tickers Downloaded So Far:")
print(len(u_tickers))

# note: the max num downloadable tickers is ~2350-2370 uniques

Total Num Unique Tickers Downloaded So Far:
0


In [None]:
# function to select current batch of tickers to dL
# want to use batches, because of rate limits ...
# ... plus some tickers become avail/unavail within minutes, ...
# ... so we want to capture those too

# assemble batch of tickers
def assemble_batch(lines, u_tickers, batch_size=500):
    """Select the next batch of tickers that still need to be requested.

    Tickers are taken in the order they appear in ``lines`` so that batches
    are the same on every run (a plain set difference would return them in
    arbitrary order).

    Args:
        lines: All ticker symbols, in file order.
        u_tickers: Tickers to leave out (any iterable; already handled).
        batch_size: Maximum number of tickers to return. Defaults to 500.

    Returns:
        list: Up to ``batch_size`` tickers from ``lines`` that are not in
        ``u_tickers``, without duplicates or blank entries. Empty when
        nothing is left.
    """
    # tickers that must not be returned (grows as the batch is built so that
    # duplicates within `lines` are only returned once)
    excluded = set(u_tickers)

    ticker_batch = []
    for ticker in lines:
        if len(ticker_batch) >= batch_size:
            break
        # skip blank lines and anything already handled
        if not ticker or ticker in excluded:
            continue
        excluded.add(ticker)
        ticker_batch.append(ticker)

    return ticker_batch


In [None]:
# function to download data

def data_download(ticker_batch):
    """Download the maximum available price history for a batch of tickers.

    Prints the elapsed download time and flattens the multi-index result of
    ``yf.download`` into a long table suitable for CSV export.

    Args:
        ticker_batch: List of ticker symbols to request in one call.

    Returns:
        pandas.DataFrame: One row per (date, ticker) pair that has at least
        one non-missing value; the date and ticker come first, followed by
        the price/volume columns.
    """
    # perf_counter() is a monotonic clock, so the measured duration cannot be
    # skewed by system clock adjustments during a multi-minute download
    beg_time = time.perf_counter()

    # download data
    # outputs "Multi-Index" dataframe object type
    # auto_adjust is passed explicitly: its default changed to True in recent
    # yfinance releases, and pinning it keeps prices consistent across versions
    stock_data = yf.download(
        ticker_batch,
        period='max',
        group_by='ticker',
        auto_adjust=True,
    )

    tot_time = time.perf_counter() - beg_time

    print(f'Total DL Time: {tot_time:.6f} seconds')

    # flatten M-I dataframe for csv format
    try:
        # new stack implementation (avoids the FutureWarning of the old one)
        long_data = stock_data.stack(level=0, future_stack=True)
    except TypeError:
        # older pandas without the future_stack keyword
        long_data = stock_data.stack(level=0)

    # The new implementation keeps rows where every value is missing (dates
    # on which a ticker has no data); drop them explicitly so the output is
    # the same under both implementations.
    long_data = long_data.dropna(how="all")

    return long_data.reset_index()


In [None]:
# function to save to csv
def save_to_csv(stock_data,destination_directory):
    """Write a DataFrame to the next free ``stock_output_<n>.csv`` file.

    The directory is created when missing. Numbering starts at 1 and the
    first number whose file does not exist yet is used, so existing files
    are never overwritten.

    Args:
        stock_data: DataFrame to save (written without its index).
        destination_directory: Folder that receives the CSV file.
    """
    prefix, suffix = "stock_output", ".csv"

    # make sure the target folder is there before probing for file names
    os.makedirs(destination_directory, exist_ok=True)
    folder = Path(destination_directory)

    # walk the numbered file names until an unused one is found
    file_number = 1
    output_path = folder / f"{prefix}_{file_number}{suffix}"
    while output_path.exists():
        file_number += 1
        output_path = folder / f"{prefix}_{file_number}{suffix}"

    stock_data.to_csv(output_path, index=False)

    print(f"Saved to {output_path}")


In [None]:
# loop through tickers until all are processed:

# Tickers requested so far in this run, whether or not data came back.
# Tracking them separately from u_tickers lets the loop terminate (failed
# tickers are not requested again) without recording failed tickers as
# downloaded.
attempted_tickers = set(u_tickers)

while True:
    # assemble batch from tickers that have not been requested yet
    ticker_batch = assemble_batch(lines, attempted_tickers)
    # stop once nothing is left to request
    if not ticker_batch:
        break

    # download data
    stock_data = data_download(ticker_batch)
    attempted_tickers.update(ticker_batch)

    # a batch in which every ticker failed yields no rows; skip the save so
    # no empty CSV file is left behind
    if stock_data.empty:
        continue

    # save to csv
    save_to_csv(stock_data, dest_direct)

    # record only the tickers that actually returned data; column 1 of the
    # flattened frame holds the ticker symbol (the same column that
    # get_dl_tickers reads back from the saved files)
    u_tickers.extend(stock_data.iloc[:, 1].unique().tolist())



YF.download() has changed argument auto_adjust default to True


[******************    38%                       ]  190 of 500 completedERROR:yfinance:404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/SEAL?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=SEAL&crumb=v%2FeVMHWPgT2
[*********************100%***********************]  500 of 500 completed
ERROR:yfinance:
4 Failed downloads:
ERROR:yfinance:['ICR', 'Q']: YFPricesMissingError('possibly delisted; no price data found  (1d 1926-05-14 -> 2025-04-19)')
ERROR:yfinance:['SEAL']: AttributeError("'NoneType' object has no attribute 'update'")
ERROR:yfinance:['GLOP']: YFTzMissingError('possibly delisted; no timezone found')


Total DL Time: 124.046124 seconds


  stock_data = stock_data.stack(level=0).reset_index()


Saved to /content/drive/MyDrive/Capstone_Docs_Shared/stock_output_6.csv


[*********************100%***********************]  500 of 500 completed
ERROR:yfinance:
3 Failed downloads:
ERROR:yfinance:['BF']: YFPricesMissingError('possibly delisted; no price data found  (1d 1926-05-14 -> 2025-04-19)')
ERROR:yfinance:['TRNO']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')
ERROR:yfinance:['B']: YFTzMissingError('possibly delisted; no timezone found')


Total DL Time: 118.006225 seconds


  stock_data = stock_data.stack(level=0).reset_index()


Saved to /content/drive/MyDrive/Capstone_Docs_Shared/stock_output_7.csv


[*********************100%***********************]  500 of 500 completed
ERROR:yfinance:
4 Failed downloads:
ERROR:yfinance:['ATH', 'ANG', 'OAK']: YFTzMissingError('possibly delisted; no timezone found')
ERROR:yfinance:['AKO']: YFPricesMissingError('possibly delisted; no price data found  (1d 1926-05-14 -> 2025-04-19)')


Total DL Time: 111.724125 seconds


  stock_data = stock_data.stack(level=0).reset_index()


Saved to /content/drive/MyDrive/Capstone_Docs_Shared/stock_output_8.csv


[*********************100%***********************]  500 of 500 completed
ERROR:yfinance:
5 Failed downloads:
ERROR:yfinance:['CDR', 'ATCO', 'ARGO', 'Y', 'TRTN']: YFTzMissingError('possibly delisted; no timezone found')


Total DL Time: 109.071615 seconds


  stock_data = stock_data.stack(level=0).reset_index()


Saved to /content/drive/MyDrive/Capstone_Docs_Shared/stock_output_9.csv


[*********************100%***********************]  394 of 394 completed
ERROR:yfinance:
4 Failed downloads:
ERROR:yfinance:['AHL', 'I']: YFTzMissingError('possibly delisted; no timezone found')
ERROR:yfinance:['SFB']: YFInvalidPeriodError("SFB: Period 'max' is invalid, must be of the format 1d, 5d, etc.")
ERROR:yfinance:['N']: YFPricesMissingError('possibly delisted; no price data found  (1d 1926-05-14 -> 2025-04-19)')


Total DL Time: 89.442306 seconds


  stock_data = stock_data.stack(level=0).reset_index()


Saved to /content/drive/MyDrive/Capstone_Docs_Shared/stock_output_10.csv


In [None]:
# list of unique tickers-- comment this out once it gets too long...
# ... helpful for early checks
print(u_tickers)
print("Total Num Unique Tickers Downloaded:")
print(len(u_tickers))

# check undownloaded tickers:
difference = set(lines) - set(u_tickers)
# each value is printed directly under its own label; the symbols are sorted
# so the listing is the same on every run
print("Tickers Not Downloaded:")
print(sorted(difference))
print("Num Tickers Not Downloaded:")
print(len(difference))

['BOW', 'INFY', 'XYZ', 'ED', 'RA', 'ACP', 'HLX', 'MEGI', 'LND', 'AMG', 'GBTG', 'HMC', 'CDLR', 'NTST', 'KNSL', 'TAP', 'MTAL', 'BEPJ', 'AFB', 'FHI', 'AMP', 'GPC', 'IHD', 'NSP', 'MSI', 'CLB', 'CDE', 'RF', 'TNL', 'PAI', 'RQI', 'ASA', 'IMAX', 'ICL', 'HUN', 'WTS', 'HSHP', 'TFSA', 'FBP', 'KFY', 'HASI', 'WT', 'PFN', 'STEM', 'FCN', 'WKC', 'ANF', 'MCK', 'MYTE', 'CUZ', 'PEB', 'HLLY', 'GPK', 'HBI', 'ROL', 'PKG', 'QGEN', 'BUI', 'VIK', 'FMS', 'NVS', 'CTA', 'OPAD', 'DELL', 'GTY', 'PAGS', 'ANET', 'TUYA', 'EHAB', 'MANU', 'SOR', 'GCI', 'IBP', 'EEX', 'SCE', 'TVE', 'NCDL', 'BARK', 'GBAB', 'TFPM', 'MPC', 'QVCC', 'KBH', 'BXMT', 'OMI', 'PG', 'FTI', 'GWH', 'HYAC', 'BSBR', 'AMC', 'TY', 'HLT', 'DVN', 'RY', 'WLK', 'DEC', 'TEN', 'PPL', 'TPR', 'FUN', 'IPG', 'BTA', 'BCH', 'FMX', 'MVT', 'USNA', 'AZEK', 'TEO', 'MMD', 'VTS', 'BWMX', 'WAB', 'SRG', 'UVE', 'Z', 'TKC', 'IOT', 'HIPO', 'DK', 'FRA', 'LCII', 'SILA', 'CLDT', 'MCD', 'MXE', 'BDJ', 'HSY', 'DKS', 'MBI', 'BNL', 'ODV', 'NOVA', 'DTM', 'TG', 'MGF', 'BRK', 'ENZ', 'MDV'

In [None]:
# combine all downloaded ticker files into a single CSV:

# take source files name pattern (the per-batch files written by save_to_csv)
pattern = os.path.join(dest_direct, 'stock_output_*.csv')

# list of file paths for data to merge; sorted so the merge order is the same
# on every run (glob returns matches in arbitrary order)
file_paths = sorted(glob.glob(pattern))

# Output file path:
merged_file = r"/content/drive/MyDrive/Capstone_Docs_Shared/merged_hist_stock_output.csv"

# if the merged file already exists, delete it (start fresh)
if os.path.exists(merged_file):
  os.remove(merged_file)

# Process two files at a time ...
# ... this is necessary to avoid crashing the shared workspace
for i in range(0, len(file_paths), 2):
  # select the batch of 2 files
  batch_files = file_paths[i:i+2]

  # read the batch into a list containing (up to) two dataframes.
  # Only empty fields are treated as missing, so ticker symbols that look
  # like NA markers (e.g. "NA") survive the round trip unchanged.
  df_list = [
    pd.read_csv(file, keep_default_na=False, na_values=[""])
    for file in batch_files
  ]

  # actual concatenation of the files in the list
  batch_df = pd.concat(df_list, ignore_index=True)

  # Create final file or ID it for further manipulation
  if i == 0:
    # create on first iteration (writes the header row)
    batch_df.to_csv(merged_file, index=False)
  else:
    # csv append to end of existing file, no headers:
    batch_df.to_csv(merged_file, mode='a', index=False, header=False)

if file_paths:
  print("File Merging Process Complete")
  # f-string so the actual path is printed rather than the placeholder text
  print(f"Final Merged File Name: {merged_file}")
else:
  print(f"No files matching {pattern} were found; nothing was merged")


  batch_df = pd.concat(df_list, ignore_index=True)


File Merging Process Complete
Final Merged File Name: {merged_file}
