# S&P 500 Dataset

This notebook scrapes Wikipedia for the list of current S&P 500 companies, then downloads historical price data for each company with yfinance and merges it into a single CSV file.

In [None]:
import contextlib
import io
import os

import pandas as pd # library for data analysis
import requests # library to handle requests
import yfinance as yf
from bs4 import BeautifulSoup # library to parse HTML documents

In [None]:
# Look-back window handed to yf.download for every ticker.
# valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max
period = '5y'

In [None]:
# Fetch the Wikipedia page that lists the S&P 500 constituents as raw HTML.
wikiurl = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
# Not referenced by the code visible in this notebook; kept in case other cells use it.
table_class = "wikitable sortable jquery-tablesorter"

# Wikimedia's User-Agent policy asks clients to identify themselves, and generic
# library defaults such as python-requests may be rejected.
# NOTE(review): replace this with a string that includes your own contact details.
request_headers = {"User-Agent": "sp500-dataset-notebook/1.0 (python-requests)"}

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(wikiurl, headers=request_headers, timeout=30)

# Fail here on 4xx/5xx instead of handing an error page to the HTML parser.
response.raise_for_status()

In [None]:
# Parse the downloaded HTML and pick the first table carrying the "wikitable"
# class, which is the one the following cell turns into a DataFrame.
soup = BeautifulSoup(response.text, 'html.parser')
stocks = soup.find('table', class_='wikitable')

# find() returns None when nothing matches; stop here with a clear message
# rather than letting the next cell try to parse the string 'None'.
if stocks is None:
    raise ValueError('No table with class "wikitable" found at {}'.format(response.url))

In [None]:
# Turn the HTML table into a DataFrame.
# read_html expects a path, URL or file-like object; passing literal HTML as a
# string is deprecated in recent pandas, so wrap the markup in StringIO.
tables = pd.read_html(io.StringIO(str(stocks)))

# read_html returns a list of DataFrames, one per <table>; `stocks` holds exactly one.
df = tables[0]

# cols to keep
cols = ['Symbol', 'Security', 'GICS Sector', 'GICS Sub-Industry', 'Date first added']

# Raises KeyError if the page no longer has one of these column headers.
sp500 = df.loc[:, cols]

In [None]:
# Ticker symbols of the constituents, in the order they appear in the table.
symbols = sp500['Symbol'].tolist()

# Slice of `symbols` the download cell processes: at most `limit` tickers,
# starting at index `offset`.
limit = 3000
offset = 0

In [None]:
# Download price history for symbols[offset:end] and write one CSV per ticker.
limit = limit if limit else len(symbols)  # a falsy limit (0 / None) means "all symbols"
end = min(offset + limit, len(symbols))
is_valid = [False] * len(symbols)  # is_valid[i] becomes True once symbols[i] is saved

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('stocks', exist_ok=True)

# force silencing of verbose API
with open(os.devnull, 'w') as devnull, contextlib.redirect_stdout(devnull):
    for i in range(offset, end):
        s = symbols[i]
        # Yahoo Finance writes class shares with a dash (BRK-B, BF-B) where the
        # Wikipedia table uses a dot, so translate the symbol for the lookup only.
        data = yf.download(s.replace('.', '-'), period=period)
        if data.empty:
            # Nothing came back for this symbol; leave it flagged as invalid.
            continue

        is_valid[i] = True
        # The file keeps the original (dotted) symbol so later cells can find it.
        data.to_csv('stocks/{}.csv'.format(s))

print('Total number of valid symbols downloaded = {}'.format(sum(is_valid)))

In [None]:
# Merge the per-ticker CSV files into one long table with a 'Ticker' column.
stock_list = []
missing = []  # symbols that have no CSV under stocks/

for ticker in symbols:
    path = 'stocks/{}.csv'.format(ticker)
    # Skip any symbol without a file (failed download, or outside the
    # offset/limit window) instead of hard-coding individual tickers.
    if not os.path.exists(path):
        missing.append(ticker)
        continue
    df = pd.read_csv(path, index_col=None, header=0)
    df['Ticker'] = ticker
    stock_list.append(df)

# Report what was left out so that gaps in the dataset are visible.
if missing:
    print('Skipped {} symbols with no CSV file: {}'.format(
        len(missing), ', '.join(map(str, missing))))

if not stock_list:
    raise ValueError('No CSV files found under stocks/; run the download cell first')

dataframe = pd.concat(stock_list, axis=0, ignore_index=True)

# Final column order; raises KeyError if a CSV lacks one of these columns.
order = ['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

dataframe = dataframe.loc[:, order]

In [None]:
# Write the combined table to disk, leaving the DataFrame index out of the file.
dataframe.to_csv('sp500.csv', index=False)