In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from yahoo_fin.stock_info import get_data
from get_all_tickers import get_tickers as gt # want to use this for tickers data

In [2]:
def load_csv(path):
    '''
    Reads the csv file located at `path` and hands it back as a pandas
    dataframe. The path is passed to pd.read_csv unchanged, with no extra
    parsing options.
    '''
    return pd.read_csv(path)

In [3]:
# Load the S&P 500 data: read the constituents CSV and keep its "Symbol"
# column as a set of ticker symbols.
# NOTE(review): stock_twits_path is still the placeholder "UPDATE" and is not
# read anywhere in the visible notebook.
stock_twits_path = "UPDATE"
# NOTE(review): absolute, machine-specific path; this cell only runs where
# that file exists.
sp500 = set(load_csv("/Users/kelseyesposito/finBERT/stocktwits/s&p500.csv")["Symbol"])
# Small hand-picked ticker list; it is the default `stocks` argument of get_stocks.
stocks = ["AMZN", "AAPL", "GOOG", "EBAY"]

In [4]:
def pull_yahoo_data(ticker, start, end, dur):
    '''
    Fetches price history for a single ticker through yahoo_fin's get_data and
    keeps only the date, adjusted close, and ticker columns.

    `start` and `end` bound the requested period; `dur` is the sampling
    interval forwarded to get_data (daily, weekly, or monthly data).
    The date is returned as a regular column rather than as the index.
    '''
    wanted_columns = ["date", "adjclose", "ticker"]

    history = get_data(
        ticker,
        start_date=start,
        end_date=end,
        index_as_date=False,
        interval=dur,
    )

    return history[wanted_columns]

In [5]:
# Per-ticker frames are collected in a list and combined once at the end, so
# the combining cost no longer grows with every ticker added; each ticker
# still needs its own pull_yahoo_data call.
def get_stocks(stocks=stocks, dur="1d", start=None, end=None): # 1d, 1wk, 1mo
    '''
    Pulls yahoo finance data for every ticker in `stocks` and returns it as a
    single dataframe.

    stocks -- iterable of ticker symbols (defaults to the notebook-level
              `stocks` list)
    dur    -- interval string forwarded to pull_yahoo_data: "1d", "1wk" or "1mo"
    start  -- start date forwarded to pull_yahoo_data (None is passed through)
    end    -- end date forwarded to pull_yahoo_data (None is passed through)

    Returns a dataframe with the columns date, ticker, adjclose, sorted by
    date with a fresh 0..n-1 index. Rows sharing a date keep the order in
    which their tickers were pulled. When nothing could be pulled, an empty
    dataframe with those three columns is returned.

    A ticker whose pull raises is reported with a printed message and skipped.
    Only Exception subclasses are handled, so KeyboardInterrupt still stops a
    long download instead of being counted as a failed ticker.
    '''
    columns = ['date', 'ticker', 'adjclose']
    frames = []

    for ticker in stocks:
        try:
            frames.append(pull_yahoo_data(ticker, start, end, dur))
        except Exception:
            print("Could not access data for ", ticker)

    # Nothing was fetched: keep the original "empty frame with named columns" result.
    if not frames:
        return pd.DataFrame(columns=columns)

    # One concat and one sort replace the per-ticker DataFrame.append
    # (removed in pandas 2.0) and the repeated re-sorting of the growing frame.
    combined = pd.concat(frames, ignore_index=True).reindex(columns=columns)

    # mergesort is a stable sort, so ties on date are ordered deterministically.
    return combined.sort_values(by="date", kind="mergesort").reset_index(drop=True)

In [6]:
# Set the start date, end date, and duration of returns we want from yahoo finance.
# start_date is fixed at 2000-01-01.
start_date = dt.datetime.strptime("2000-01-01", "%Y-%m-%d")
# end_date is the moment this cell is executed, so the pulled range changes
# from run to run.
end_date = dt.datetime.now()
# Interval used for the daily pull; the get_stocks signature lists 1d, 1wk, 1mo.
duration = '1d'

In [7]:
# Create the yahoo finance dataframe of daily data (~20 years at the moment)
# for every symbol in sp500 between start_date and end_date.
# Symbols that cannot be pulled are reported with a
# "Could not access data for" message and are left out of the result.
daily = get_stocks(sp500, duration, start_date, end_date)
# Show the resulting dataframe as the cell output.
daily

Could not access data for  CTL
Could not access data for  MYL
Could not access data for  BF.B
Could not access data for  ETFC
Could not access data for  NBL
Could not access data for  BRK.B


Unnamed: 0,date,ticker,adjclose
0,2000-01-03,CB,10.392402
1,2000-01-03,EFX,10.855853
2,2000-01-03,J,7.692913
3,2000-01-03,CI,23.833805
4,2000-01-03,COP,9.057103
...,...,...,...
2380559,2021-02-05,PHM,48.130001
2380560,2021-02-05,HIG,50.439999
2380561,2021-02-05,CME,191.389999
2380562,2021-02-05,SEE,45.209999


In [8]:
# ~20 years of weekly data at the moment: same symbols and date range as the
# daily pull, with the interval set to '1wk'.
weekly = get_stocks(sp500, '1wk', start_date, end_date)

Could not access data for  CTL
Could not access data for  MYL
Could not access data for  BF.B
Could not access data for  ETFC
Could not access data for  NBL
Could not access data for  BRK.B


In [9]:
# ~20 years of monthly data at the moment: same symbols and date range as the
# daily pull, with the interval set to '1mo'.
monthly = get_stocks(sp500, '1mo', start_date, end_date)

Could not access data for  CTL
Could not access data for  MYL
Could not access data for  BF.B
Could not access data for  ETFC
Could not access data for  NBL
Could not access data for  BRK.B


In [10]:
def df_to_csv(df, path):
    '''
    Saves the given dataframe to a csv using the provided file path.

    df   -- object exposing to_csv(path), e.g. a pandas dataframe
    path -- destination handed to df.to_csv unchanged

    Best-effort: prints "Success." once the write completes. If the write
    raises, prints "Error in csv creation." followed by the exception type and
    message, and returns normally instead of raising.

    Only Exception subclasses are handled, so KeyboardInterrupt and SystemExit
    propagate and are never reported as a csv error.
    '''

    try:
        df.to_csv(path)
    except Exception as exc:
        # Keep the original message as the prefix and add the cause after it.
        print("Error in csv creation.", type(exc).__name__ + ":", exc)
    else:
        print("Success.")
        

In [11]:
# Save the daily, weekly, and monthly datasets to file, one csv per interval.
# NOTE(review): file_path is still the placeholder "UPDATE" and is not used by
# the calls below, which write to absolute, machine-specific paths.
file_path = "UPDATE"
df_to_csv(daily, "/Users/kelseyesposito/finBERT/yahoo/yahoo_daily_sp500.csv")
df_to_csv(weekly, "/Users/kelseyesposito/finBERT/yahoo/yahoo_weekly_sp500.csv")
df_to_csv(monthly, "/Users/kelseyesposito/finBERT/yahoo/yahoo_monthly_sp500.csv")

Success.
Success.
Success.
