### Data Preprocessing

Download price data for the S&P 500 constituents covering 2016–2019 and save it to a CSV file.

In [1]:
import time
import yfinance as yf

import pandas as pd
from pandas_datareader import data as pdr

In [2]:
def get_tickers():
    """Fetch the S&P 500 ticker symbols listed on Wikipedia.

    Reads every HTML table on the "List of S&P 500 companies" page and
    returns the ``Symbol`` column of the first table as an array
    (the column's ``.values``).
    """
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    # The code relies on the first table on the page holding the constituents.
    constituents = pd.read_html(url)[0]
    return constituents['Symbol'].values

def get_data(start,end,ref='Close',out_path='equity_data_2021.csv'):
    """Download one price column per S&P 500 ticker and write them to a CSV.

    Parameters
    ----------
    start, end :
        Date bounds forwarded unchanged to ``pdr.get_data_yahoo``.
    ref : str
        Column of each downloaded frame to keep (default ``'Close'``).
    out_path : str
        Destination CSV path. Defaults to the file name this function has
        always written, so existing callers are unaffected.

    Returns
    -------
    None
        The result is the CSV written to ``out_path`` (one column per
        successfully downloaded ticker).

    Raises
    ------
    ValueError
        If no ticker could be downloaded, so there is nothing to save.
    """
    start_time = time.time()
    ticker_names = get_tickers()
    data_dict = {}
    failed = []

    for ct, ticker in enumerate(ticker_names):
        try:
            series = pdr.get_data_yahoo(ticker, start=start, end=end)[ref]
            data_dict[ticker] = series.rename(ticker)
        except Exception as exc:
            # Best-effort download: skip this ticker but keep a record of it.
            # Catching Exception (not a bare except) lets Ctrl-C / kernel
            # interrupt stop this long-running loop.
            failed.append(ticker)
            print(f"Skipping {ticker}: {exc!r}")
        # Report progress even when the current ticker failed.
        if ct % 25 == 0:
            print(f"{ct} out of {len(ticker_names)} tickers downloaded")

    if not data_dict:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(
            f"No data downloaded for any of {len(ticker_names)} tickers; nothing to save"
        )

    # Each series is already named after its ticker, so the concatenated
    # frame's columns are the ticker symbols without any further renaming.
    data_df = pd.concat(list(data_dict.values()), axis=1)
    data_df.to_csv(out_path)

    stop_time = time.time()
    if failed:
        print(f"{len(failed)} tickers failed to download: {failed}")
    print(f"Total time taken for {len(ticker_names)} is {stop_time-start_time}")
    return


In [3]:
# Date bounds handed to get_data, which writes the downloaded prices to CSV.
start, end = "2016-01-01", "2020-01-01"
get_data(start, end)

0 out of 502 tickers downloaded
25 out of 502 tickers downloaded
50 out of 502 tickers downloaded
75 out of 502 tickers downloaded
100 out of 502 tickers downloaded
125 out of 502 tickers downloaded
150 out of 502 tickers downloaded
175 out of 502 tickers downloaded
200 out of 502 tickers downloaded
225 out of 502 tickers downloaded
250 out of 502 tickers downloaded
275 out of 502 tickers downloaded
300 out of 502 tickers downloaded
325 out of 502 tickers downloaded
350 out of 502 tickers downloaded
375 out of 502 tickers downloaded
400 out of 502 tickers downloaded
425 out of 502 tickers downloaded
450 out of 502 tickers downloaded
475 out of 502 tickers downloaded
500 out of 502 tickers downloaded
Total time taken for 502 is 900.1830995082855)
