In [35]:
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# URL for the Wikipedia page with the S&P 500 list
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

# Send a request to the website.
# - An explicit User-Agent identifies the client; Wikimedia's User-Agent policy
#   allows generic library agents to be blocked.
# - The timeout keeps a stalled connection from hanging the notebook.
response = requests.get(
    url,
    headers={"User-Agent": "sp500-ticker-scraper/1.0 (python-requests)"},
    timeout=30,
)
# Fail here on an HTTP error instead of parsing an error page below
response.raise_for_status()

# Parse the page content
soup = BeautifulSoup(response.content, 'html.parser')

# Find the table containing the S&P 500 companies (first 'wikitable' on the page)
table = soup.find('table', {'class': 'wikitable'})
if table is None:
    raise ValueError(f"No 'wikitable' table found at {url}; the page layout may have changed")

# Use pandas to parse the table. The HTML is wrapped in StringIO because
# passing a literal HTML string to read_html is deprecated.
df = pd.read_html(StringIO(str(table)))[0]

# Get the list of tickers from the 'Symbol' column.
# Share-class tickers are written with a dot on Wikipedia (BRK.B) but with a
# dash on Yahoo Finance (BRK-B); convert them rather than dropping them.
tickers_list = [
    str(symbol).strip().replace(".", "-")
    for symbol in df['Symbol'].dropna()
]


In [None]:
import yfinance as yf
import time

# Specify the number of years back
years_back = 1
cooldown = 1  # minimum seconds spent per ticker, to space out API calls
output_file = './data/stock_data.csv'

# Initialize variables
all_stock_data = []
skipped_stocks = []
# Read the clock once so the window is exactly `years_back` long
end_date = pd.Timestamp.now()
start_date = end_date - pd.DateOffset(years=years_back)


def _fetch_rows(ticker, start, end):
    """Fetch the sector and daily prices for one ticker.

    Args:
        ticker: Symbol to look up through yfinance.
        start: Start of the download window.
        end: End of the download window.

    Returns:
        A list of ``[ticker, sector, date, close, open, low, high, volume]``
        rows (date formatted as ``YYYY-MM-DD``), or ``None`` when the ticker
        has no sector information or the download came back empty.

    Raises:
        Exception: Whatever yfinance/pandas raise; the caller handles it.
    """
    # Check the sector first so no price download is spent on a ticker
    # that would be skipped anyway.
    sector_name = yf.Ticker(ticker).info.get("sector")
    if not sector_name:
        return None

    prices = yf.download(ticker, start=start, end=end)
    if prices.empty:
        return None

    # Depending on the yfinance version, a single-ticker download has either
    # flat columns ("Close") or MultiIndex columns ("Close", ticker). Keep only
    # the outer level so both shapes are handled the same way.
    if isinstance(prices.columns, pd.MultiIndex):
        prices.columns = prices.columns.get_level_values(0)

    # Iterating the columns directly keeps each column's dtype (iterrows would
    # upcast the whole row, turning integer volumes into floats).
    return [
        [ticker, sector_name, day, close, open_, low, high, volume]
        for day, close, open_, low, high, volume in zip(
            prices.index.strftime("%Y-%m-%d"),
            prices["Close"],
            prices["Open"],
            prices["Low"],
            prices["High"],
            prices["Volume"],
        )
    ]


# Loop through each ticker
for ticker in tickers_list:
    start_time = time.monotonic()
    try:
        rows = _fetch_rows(ticker, start_date, end_date)
        if rows is None:
            # Skip stock if we can't get fundamental or price data on it
            skipped_stocks.append(ticker)
        else:
            all_stock_data.extend(rows)
    except Exception as e:
        # Best effort: one failing ticker must not abort the whole run
        print(f"Error processing {ticker}: {e}")
        skipped_stocks.append(ticker)
    finally:
        # Apply the cooldown on every path, including skips and errors,
        # so failed requests are not followed immediately by the next one.
        elapsed_time = time.monotonic() - start_time
        if elapsed_time < cooldown:
            time.sleep(cooldown - elapsed_time)

# Create DataFrame from new data
columns = ["ticker", "sector", "date", "close", "open", "low", "high", "volume"]
stock_data_df = pd.DataFrame(all_stock_data, columns=columns)

# Save the updated data; create the target directory first so a missing
# './data' folder does not discard the results of the whole run.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
stock_data_df.to_csv(output_file, index=False)
print(f"Skipped tickers: {skipped_stocks}")
print(f"Stock data saved to {output_file}")