In [1]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt

from concurrent import futures
import datetime as dt
import numpy as np
from scipy.stats import gaussian_kde
from pandas_datareader import data as pdr
import yfinance as yf
yf.pdr_override() # <== that's all it takes :-)

data_dir = "./data/mostattractive_stocks"
os.makedirs(data_dir, exist_ok=True)

In [2]:
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')

In [3]:
sp500_df = tables[0]
second_table = tables[1]
print(sp500_df.shape)

# rename symbol to escape symbol error
sp500_df["Symbol"] = sp500_df["Symbol"].map(lambda x: x.replace(".", "-"))
sp500_df.to_csv("./data/SP500_20230104.csv", index=False)
sp500_df = pd.read_csv("./data/SP500_20230104.csv")
print(sp500_df.shape)
sp500_tickers = list(sp500_df["Symbol"])
sp500_df.head()

(503, 9)
(503, 9)


Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M,reports,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1976-08-09,66740,1902
1,AOS,A. O. Smith,reports,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
3,ABBV,AbbVie,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


In [4]:
""" set the download window """
start_date = "2017-01-01"
end_date = "2017-04-30"

bad_names = [] # to keep track of failed queries

def download_stock(stock):
    try: 
        # download dataframe
        data = pdr.get_data_yahoo(stock, start=start_date, end=end_date)
        data['Name'] = stock
        output_name = f"{data_dir}//{stock}.csv"
        data.to_csv(output_name)
    except BaseException:
        bad_names.append(stock)
        print('bad: %s' % (stock))

In [5]:
# set the maximum thread number
max_workers = 20
now = dt.datetime.now()
path_failed_queries = f'{data_dir}/failed_queries.txt'
if os.path.exists(path_failed_queries):
    with open(path_failed_queries) as f:
        failed_queries = f.read().split("\n")[:-1]
        sp500_tickers_ = failed_queries
else:
    sp500_tickers_ = sp500_tickers
print("number of stockes to download:", len(sp500_tickers_))

number of stockes to download: 503


In [7]:
# in case a smaller number of stocks than threads was passed in
workers = min(max_workers, len(sp500_tickers_))

In [8]:
with futures.ThreadPoolExecutor(workers) as executor:
    res = executor.map(download_stock, sp500_tickers_)

[*********************100%***********************]  1 of 1 completed



















bad: ABBV
bad: ADP
bad: AMT
bad: ABC
bad: LNT
bad: ADM
[*********************100%***********************]  1 of 1 completed





bad: AES
bad: AMGN
[*********************100%***********************]  1 of 1 completed

bad: AOS
bad: ADI
[*********************100%***********************]  1 of 1 completed

bad: ALLE
bad: ADBE
[*********************100%***********************]  1 of 1 completed

bad: AMP
bad: GOOG
bad: AKAM
bad: AAPL
bad: ARE
bad: AON
bad: GOOGL
bad: APH
bad: AXP
bad: ALL
bad: ACN
bad: A
bad: MMM
bad: AWK
bad: ACGL
bad: AMZN
bad: ATVI
bad: APA
bad: AAP
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


[*********************100%***********************]  1 of 1 completed














bad: AMCR
bad: APD
bad: ALGN
bad: BBY
bad: AIZ
bad: WRB
bad: AJG
bad: ALK
bad: CDW
[*********************100%*******

In [10]:
with futures.ThreadPoolExecutor(workers) as executor:
    res = executor.map(download_stock, sp500_tickers_)
    
""" Save failed queries to a text file to retry """
if len(bad_names) > 0:
    with open(path_failed_queries, 'w') as outfile:
        for name in bad_names:
            outfile.write(name + "\n")
                          
finish_time = dt.datetime.now()
duration = finish_time - now
minutes, seconds = divmod(duration.seconds, 60)
print(f'The threaded script took {minutes} minutes and {seconds} seconds to run.')
print(f'{len(bad_names)} stocks failed: ', bad_names)                          

number of stockes to download: 503
[*********************100%***********************]  1 of 1 completed






[*********************100%***********************]  1 of 1 completed

[*********************100%***********************]  1 of 1 completed










bad: ALL
bad: AKAM
bad: GOOGL
bad: AMZN
bad: LNT
bad: AAP
bad: MMM
bad: AES
bad: ADBE
bad: A
bad: GOOG
bad: ADP
bad: AMT
bad: ABC
bad: ABBV
bad: AXP
bad: AMGN
[*********************100%***********************]  1 of 1 completed
















bad: ADM
bad: APA
bad: AON
bad: BBY
bad: AOS
bad: ALLE
bad: ADI
bad: AAPL
bad: ACN
bad: ACGL
bad: AWK
bad: AMP
bad: APH
bad: ARE
bad: ATVI
bad: APD
bad: AAL
bad: BAC
bad: AJG
bad: AFL
[*********************100%***********************]  1 of 1 completed





[*********************100%***********************]  1 of 1 completed













bad: ALB
bad: BIO
bad: ATO
bad: AMCR
bad: AIZ
[*********************100%***********************]  1 of 1 completed
[*********************100%****************

In [9]:
historial_stock_data_files

NameError: name 'historial_stock_data_files' is not defined

In [None]:
historial_stock_data_files = glob.glob(f"{data_dir}/*.csv")
highest_day_list = []
for files in historial_stock_data_files:
    price = pd.read_csv(files, index_col="Date", parse_dates = True)
    ticker = os.path.splitext(os.path.basename(files))[0]
    price_close = price[["Close"]]
    highest_day = price_close.idxmax()[0]
    highest_price = price_close.max()[0]
    print(f"{price_close}:{highest_day}:{highest_price}")
    # highest_day_list.append(
    #     pd.DataFrame({"highest_day": highest_day, "ticker": ticker, "highest_price": highest_price }))

In [18]:
highest_day

NameError: name 'highest_day' is not defined

In [13]:

df = pd.concat(highest_day_list).reset_index(drop=True)
print(df.shape)
df.head()

ValueError: No objects to concatenate