The S&P 500 is an American stock market index based on the market capitalizations of 500 large companies having common stock listed on the NYSE, NASDAQ, or the Cboe BZX Exchange.  

In [80]:
from mpl_finance import candlestick_ohlc
import bs4 as bs
import datetime as dt
import numpy as np
import pandas as pd
import pandas_datareader.data as pdr
import pickle
import requests
import os

To download the stock data of these 500 companies, we first need the ticker symbol of each company. We scrape these symbols from the following URL.

In [2]:
# Wikipedia page that lists the S&P 500 companies.
URL = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
# A timeout keeps this cell from hanging indefinitely on a stalled connection.
response = requests.get(URL, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page below.
response.raise_for_status()
soup = bs.BeautifulSoup(response.content, 'html.parser')

In [17]:
# The class name below was found by inspecting the page in a browser.
# find() returns only the first matching <table>, which is the one we want;
# we keep just its <tbody>.
table = soup.find('table', attrs={'class': 'wikitable sortable'}).tbody

In [30]:
# Find the position of the "Symbol" column in the table's header row.
rows = table.find_all('tr')
# Header text may contain newlines from the page markup; remove them (and any
# surrounding whitespace) so the names compare cleanly.
columns = [c.text.replace('\n', '').strip() for c in rows[0].find_all('th')]
# Fall back to the first column (index 0) when no header is named "Symbol".
idx = columns.index('Symbol') if 'Symbol' in columns else 0

In [37]:
#Extract the Ticker Symbols into a list.
def getTSymbolList():
    """Return the ticker symbols found in the scraped table.

    Reads the module-level ``table`` (the parsed ``<tbody>``) and ``idx``
    (the position of the symbol column). The first row is skipped because it
    is the header row.

    Returns:
        list[str]: One symbol per data row, with surrounding whitespace
        removed.
    """
    tSymbols = []
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        # A row with too few <td> cells has no symbol to read; skip it rather
        # than fail with an IndexError.
        if len(cells) <= idx:
            continue
        # Cell text can carry whitespace from the markup (typically a trailing
        # newline); strip it so it does not end up in file names or queries.
        tSymbols.append(cells[idx].text.strip())
    return tSymbols

In [54]:
# Save the ticker symbol list to TSymbols.pickle using pickle.
# The with-block closes the file even if scraping or pickling raises.
with open("TSymbols.pickle", "wb") as f:
    pickle.dump(getTSymbolList(), f)

The following function downloads the Yahoo stock data of all S&P 500 companies from the start date (`start`) to the end date (`end`) and stores each company's data as a CSV file in a `StockData` folder, which is created if it does not already exist. We use `pandas_datareader.data` to do this.

In [94]:
def getYahooData(start, end):
    """Download Yahoo data for every saved ticker symbol into StockData/.

    Reads the symbol list from ``TSymbols.pickle`` in the current working
    directory, creates the ``StockData`` folder if needed, and writes one
    ``StockData/<symbol>.csv`` per symbol. Symbols whose CSV already exists
    are skipped, so an interrupted run can be resumed by calling again.

    Args:
        start: Start of the requested date range, passed to ``pdr.DataReader``.
        end: End of the requested date range, passed to ``pdr.DataReader``.

    Returns:
        None. If ``TSymbols.pickle`` is missing, a message is printed and
        nothing is downloaded.
    """
    if not os.path.isfile('TSymbols.pickle'):
        print('Ticker Symbols file doesn\'t exist, please create one.')
        return

    # NOTE: unpickling can execute arbitrary code; only load a TSymbols.pickle
    # that was produced locally.
    with open("TSymbols.pickle", "rb") as g:
        tSymbols = pickle.load(g)

    os.makedirs('StockData', exist_ok=True)

    for rawSymbol in tSymbols:
        # Scraped symbols may carry stray whitespace (e.g. a trailing newline)
        # that must not end up in the file name or the query.
        tSymbol = rawSymbol.strip()
        print(tSymbol)
        csvPath = 'StockData/{}.csv'.format(tSymbol)
        if os.path.exists(csvPath):
            print('Already have {}'.format(tSymbol))
            continue
        # Yahoo writes share-class tickers with a dash (BRK.B -> BRK-B);
        # simply dropping the dot would query a different symbol.
        df = pdr.DataReader(tSymbol.replace('.', '-'), 'yahoo', start, end)
        df.to_csv(csvPath)

In [95]:
# Date range to download: from 1 January 2000 up to the moment this cell runs.
start = dt.datetime(year=2000, month=1, day=1)
end = dt.datetime.now()

In [98]:
# Run the download for the date range defined above.
getYahooData(start=start, end=end)