In [66]:
import warnings
from io import StringIO

import pandas as pd
import requests

In [2]:
# Silence every warning for the rest of the kernel session: no category,
# module or message filter is given, so this applies to all of them.
# NOTE(review): this also hides any pandas deprecation / chained-assignment
# warnings raised further down — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [3]:
# pandas.read_html needs an HTML parser backend; if none is installed, run once:
# ! pip3 install lxml html5lib

Get list of S&P 500 symbols

In [80]:
def scrape_yahoo_finance(
    stock: str,
    period1: int = 1712523837,
    period2: int = 1720386234,
    timeout: float = 30.0,
) -> pd.DataFrame:
    """Download the price-history table for one ticker from Yahoo Finance.

    Parameters
    ----------
    stock:
        Ticker symbol as it appears in Yahoo Finance URLs (e.g. ``"TSLA"``).
    period1, period2:
        Values sent as the ``period1`` / ``period2`` query parameters that
        bound the history window. The defaults are the window this notebook
        was written against (they read as Unix timestamps for roughly
        2024-04-07 to 2024-07-07 UTC — confirm before relying on this).
    timeout:
        Seconds to wait for the HTTP response before giving up, so a stalled
        connection cannot hang the notebook.

    Returns
    -------
    pd.DataFrame
        The first HTML table on the page, restricted to the columns
        ``Date, Open, High, Low, Close, Volume``.

    Raises
    ------
    requests.HTTPError
        If Yahoo answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    ValueError
        If the page contains no HTML table (raised by ``pd.read_html``).
    KeyError
        If the table lacks one of the expected columns.
    """
    url = f"https://finance.yahoo.com/quote/{stock}/history/"
    response = requests.get(
        url,
        params={"period1": period1, "period2": period2},
        headers={
            # Yahoo rejects the default python-requests agent, so pose as a browser.
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        },
        timeout=timeout,
    )
    # Fail here with the real HTTP status instead of a later "No tables found".
    response.raise_for_status()

    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    tables = pd.read_html(StringIO(response.text))

    # The page labels the close column with its tooltip text; normalise it.
    history = tables[0].rename(
        columns={"Close Close price adjusted for splits.": "Close"}
    )
    return history[["Date", "Open", "High", "Low", "Close", "Volume"]]

In [81]:
# Fetch the raw TSLA price-history table (Date/Open/High/Low/Close/Volume)
# and display it as a sanity check of the scraper.
df = scrape_yahoo_finance("TSLA")
df

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,"Jul 5, 2024",249.81,252.37,242.46,251.52,154170000
1,"Jul 3, 2024",234.56,248.35,234.25,246.39,166561500
2,"Jul 2, 2024",218.89,231.30,218.06,231.26,205047900
3,"Jul 1, 2024",201.02,213.23,200.85,209.86,135691400
4,"Jun 28, 2024",199.55,203.20,195.26,197.88,95438100
...,...,...,...,...,...,...
57,"Apr 12, 2024",172.34,173.81,170.36,171.05,64506600
58,"Apr 11, 2024",172.55,175.88,168.51,174.60,94516000
59,"Apr 10, 2024",173.04,174.93,170.01,171.76,84532400
60,"Apr 9, 2024",172.91,179.22,171.92,176.88,103232700


In [71]:
def format_table(df_in: pd.DataFrame, stock: str = "") -> pd.DataFrame:
    """Reduce a scraped history table to one date-indexed closing-price column.

    Parameters
    ----------
    df_in:
        Table with at least a ``Date`` column (parseable by
        ``pd.to_datetime``) and a ``Close`` column. It is not modified.
    stock:
        Name given to the resulting price column (normally the ticker).

    Returns
    -------
    pd.DataFrame
        A single-column frame named ``stock``, indexed by the parsed ``Date``.
        When ``Close`` arrives as text, rows whose value contains a space
        (dividend / split annotation rows) are dropped and the remaining
        values are converted to ``float``.

    Raises
    ------
    KeyError
        If ``df_in`` lacks a ``Date`` or ``Close`` column.
    ValueError
        If a remaining text value cannot be parsed as a number.
    """
    # Work on an explicit copy so the assignment below neither touches the
    # caller's frame nor triggers a chained-assignment warning.
    closes = df_in.loc[:, ["Date", "Close"]].copy()
    closes["Date"] = pd.to_datetime(closes["Date"])
    closes = closes.set_index("Date").rename(columns={"Close": stock})

    # Test for "not numeric" rather than dtype == "O" so that pandas'
    # dedicated string dtypes are cleaned up as well.
    if not pd.api.types.is_numeric_dtype(closes[stock]):
        as_text = closes[stock].astype(str)
        # Rows whose closing price contains a space are dividend adjustment
        # lines; na=False keeps missing entries from breaking the mask.
        is_price = ~as_text.str.contains(" ", regex=False, na=False)
        closes = (
            as_text[is_price]
            .str.replace(",", "", regex=False)  # "1,234.56" -> "1234.56"
            .apply(float)
            .to_frame()
        )
    return closes

In [74]:
# Reduce the raw TSLA table to a single date-indexed "TSLA" close column.
# NOTE(review): this rebinds `df` to the formatted result, so the cell is not
# re-runnable on its own — format_table selects the "Date" and "Close" columns
# from its input, and the formatted frame no longer has them. Re-run the
# scrape cell first.
df = format_table(df, stock="TSLA")
df

Unnamed: 0_level_0,TSLA
Date,Unnamed: 1_level_1
2024-07-05,251.52
2024-07-03,246.39
2024-07-02,231.26
2024-07-01,209.86
2024-06-28,197.88
...,...
2024-04-12,171.05
2024-04-11,174.60
2024-04-10,171.76
2024-04-09,176.88


In [77]:
# Draw a reproducible sample of 10 tickers from the S&P 500 constituents file;
# the fixed random_state makes every run pick the same symbols.
some_symbols = (
    pd.read_csv("./data/sp_500_companies.csv")
    .loc[:, "Symbol"]
    .sample(n=10, random_state=42)
)
some_symbols

268     JNJ
73      BMY
289    LDOS
155     DOV
104     CVX
280     KKR
392     DGX
124     STZ
68       BX
244    HBAN
Name: Symbol, dtype: object

In [78]:
# Scrape each sampled ticker and add its closing prices to `df` as a new
# column; assignment aligns on the shared Date index. A ticker whose download
# or parsing fails is reported and skipped, so one failure does not stop the run.
for stock in some_symbols:
    print(f"Processing stock {stock}")
    try:
        df_stock = format_table(scrape_yahoo_finance(stock=stock), stock=stock)
    except Exception as e:
        print(f"Error retrieving data for {stock} - error was {str(e)}")
    else:
        # Only reached when scraping and formatting both succeeded.
        df[stock] = df_stock[stock]

Processing stock JNJ
Processing stock BMY
Processing stock LDOS
Processing stock DOV
Processing stock CVX
Processing stock KKR
Processing stock DGX
Processing stock STZ
Processing stock BX
Processing stock HBAN


In [79]:
# Display the combined table: one closing-price column per ticker, indexed by Date.
df

Unnamed: 0_level_0,TSLA,JNJ,BMY,LDOS,DOV,CVX,KKR,DGX,STZ,BX,HBAN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-07-05,251.52,146.48,39.66,145.51,176.68,154.31,106.08,140.45,259.14,122.72,12.84
2024-07-03,246.39,145.69,40.06,146.64,176.97,156.71,106.06,140.90,250.37,123.40,13.04
2024-07-02,231.26,146.03,40.45,145.86,176.96,156.75,105.93,136.64,258.94,123.21,13.21
2024-07-01,209.86,146.44,41.30,145.56,176.99,156.52,104.51,137.09,256.29,122.23,13.16
2024-06-28,197.88,146.16,41.53,145.88,180.45,156.42,105.24,136.88,257.28,123.80,13.18
...,...,...,...,...,...,...,...,...,...,...,...
2024-04-12,171.05,147.52,48.28,125.73,172.06,158.96,97.29,128.53,262.24,123.47,13.31
2024-04-11,174.60,148.79,49.12,126.64,173.28,161.89,100.45,130.02,268.34,126.87,13.49
2024-04-10,171.76,150.20,50.29,126.22,173.83,162.67,99.01,131.62,264.92,126.08,13.58
2024-04-09,176.88,152.29,51.60,127.10,177.35,162.00,100.27,132.75,262.97,132.00,14.04


In [65]:
# Summarise the combined frame: index range, per-column non-null counts, dtypes.
# NOTE(review): the saved output below appears to come from an earlier
# execution than the cells above (lower execution count, different row
# count) — re-run after the merge loop before trusting it.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 62 entries, 2024-07-05 to 2024-04-08
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   TSLA    62 non-null     float64
 1   JNJ     62 non-null     float64
 2   BMY     62 non-null     float64
 3   LDOS    62 non-null     float64
 4   DOV     62 non-null     float64
 5   CVX     62 non-null     float64
 6   KKR     62 non-null     float64
 7   DGX     62 non-null     float64
 8   STZ     62 non-null     float64
 9   BX      62 non-null     float64
 10  HBAN    62 non-null     float64
dtypes: float64(11)
memory usage: 5.8 KB
