In [12]:


import copy
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import stockstats
import yfinance as yf

In [2]:

###This class is from FinRL. Basically uses yahoo finance API to pull in stock data
class YahooDownloader:
    """Retrieve daily stock data from the Yahoo Finance API (via ``yfinance``).

    Attributes
    ----------
        start_date : str
            start date of the data (passed to ``yf.download`` as ``start``)
        end_date : str
            end date of the data (passed to ``yf.download`` as ``end``)
        ticker_list : list
            stock tickers to download; one request is made per ticker

    Methods
    -------
    fetch_data()
        Downloads every ticker and returns one long-format DataFrame
    select_equal_rows_stock(df)
        Keeps only the tickers that have at least the mean number of rows

    """

    # Standardized names, assigned positionally to the downloaded frame after
    # reset_index(): the former index first, the added "tic" column last.
    # NOTE(review): this assumes yf.download returns exactly Open, High, Low,
    # Close, Adj Close, Volume in that order - confirm for the installed
    # yfinance version.
    _COLUMN_NAMES = [
        "date",
        "open",
        "high",
        "low",
        "close",
        "adjcp",
        "volume",
        "tic",
    ]

    def __init__(self, start_date: str, end_date: str, ticker_list: list):

        self.start_date = start_date
        self.end_date = end_date
        self.ticker_list = ticker_list

    def fetch_data(self) -> pd.DataFrame:
        """Download every ticker in ``ticker_list`` and stack the results.

        Returns
        -------
        `pd.DataFrame`
            8 columns: date (``YYYY-MM-DD`` string), open, high, low, close,
            volume, tic and day (day of week, monday = 0). ``close`` holds the
            adjusted close price. Rows with missing values are dropped and the
            frame is sorted by date, then tic.

        Raises
        ------
        ValueError
            If ``ticker_list`` is empty, or if the downloaded frame does not
            have the expected number of columns.
        """
        # Collect the per-ticker frames and concatenate once: DataFrame.append
        # was removed in pandas 2.0, and appending in a loop is quadratic.
        frames = []
        for tic in self.ticker_list:
            temp_df = yf.download(tic, start=self.start_date, end=self.end_date)
            temp_df["tic"] = tic
            frames.append(temp_df)
        # Test the collected list, not ticker_list itself: ticker_list may be
        # a numpy array, whose truth value is ambiguous.
        if not frames:
            raise ValueError("ticker_list is empty; nothing to download")

        # reset the index, we want to use numbers as index instead of dates
        data_df = pd.concat(frames).reset_index()
        # convert the column names to standardized names (raises ValueError on
        # a column-count mismatch)
        data_df.columns = list(self._COLUMN_NAMES)
        # use adjusted close price instead of close price
        data_df["close"] = data_df["adjcp"]
        # drop the adjusted close price column (keyword form: the positional
        # axis argument was removed in pandas 2.0)
        data_df = data_df.drop(columns="adjcp")
        # create day of the week column (monday = 0)
        data_df["day"] = data_df["date"].dt.dayofweek
        # convert date to standard string format, easy to filter
        data_df["date"] = data_df["date"].dt.strftime("%Y-%m-%d")
        # drop missing data
        data_df = data_df.dropna().reset_index(drop=True)
        print("Shape of DataFrame: ", data_df.shape)

        return data_df.sort_values(by=["date", "tic"]).reset_index(drop=True)

    def select_equal_rows_stock(self, df):
        """Keep only the tickers that have at least the mean number of rows.

        Parameters
        ----------
        df : pd.DataFrame
            Frame with a ``tic`` column.

        Returns
        -------
        pd.DataFrame
            The rows of ``df`` whose ticker appears at least as often as the
            mean per-ticker row count.
        """
        rows_per_tic = df.tic.value_counts()
        at_least_mean = (rows_per_tic >= rows_per_tic.mean()).to_numpy()
        select_stocks_list = list(rows_per_tic.index[at_least_mean])
        return df[df.tic.isin(select_stocks_list)]

In [4]:
# sptickers.csv holds the S&P 500 ticker names; draw 10 of them at random
# (without replacement) and set up a downloader for that sample.
# NOTE(review): the draw is unseeded, so every run selects different tickers.
sp500_tickers = pd.read_csv('sptickers.csv')['Tickers']
tickers = np.random.choice(sp500_tickers, size=10, replace=False)

downloader = YahooDownloader(
    start_date='2000-01-01',
    end_date='2021-04-10',
    ticker_list=tickers,
)

In [5]:
# Download the price data for every selected ticker into a single pandas
# DataFrame (one request per ticker; each prints a progress bar below).
df = downloader.fetch_data()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
Shape of DataFrame:  (46679, 8)


In [94]:
# Display the downloaded frame.
# NOTE(review): the saved output below has no `day` column, which suggests this
# cell was last run after the indicator cell dropped it - confirm with a fresh
# Restart & Run All.
df

Unnamed: 0,date,open,high,low,close,volume,tic
0,2000-01-03,32.791668,32.833332,28.645832,31.083332,2792700,BIIB
1,2000-01-03,28.855125,29.533344,28.361876,23.115248,8402230,DIS
2,2000-01-03,33.875000,33.875000,33.187500,14.784572,12200,ESS
3,2000-01-03,37.625000,37.937500,35.000000,22.202518,312300,IFF
4,2000-01-03,44.000000,44.000000,41.687500,22.493465,703500,PNC
...,...,...,...,...,...,...,...
46674,2021-04-09,139.089996,140.309998,138.419998,140.279999,1108200,IFF
46675,2021-04-09,26.830000,26.950001,26.450001,26.680000,2209000,NWSA
46676,2021-04-09,179.759995,180.360001,178.300003,179.054428,1302300,PNC
46677,2021-04-09,81.190002,81.680000,80.080002,80.860001,1311100,SYY


In [6]:
# Keep only the "valid" tickers: those with one row for every date present in
# df, so that no missing data has to be handled later.
num_days = df['date'].nunique()
rows_per_tic = df['tic'].value_counts()
valid_tics = [tic for tic in tickers if rows_per_tic.get(tic, 0) == num_days]


    
    

7


In [8]:
# Pick two (or more, by changing the count) of the valid tickers at random,
# without replacement.
# NOTE(review): this rebinds `tickers` from the downloaded sample to the chosen
# pair, so re-running the validity cell afterwards only checks these two.
tickers = np.random.choice(valid_tics, 2, replace = False)


In [95]:
# Show which tickers were drawn.
tickers

array(['DIS', 'IFF'], dtype='<U4')

In [75]:
# This cell calculates the technical indicators for each selected ticker and
# attaches them to that ticker's dataframe.
# The main output is i_dfs, a dictionary of {ticker: stock_dataframe}
# (see the output below for an example). i_ss keeps the matching stockstats
# frames, keyed the same way.

TECHNICAL_INDICATORS_LIST = ["macd", "macds",
                             "boll_ub", "boll_lb",
                             "rsi_5", "rsi_14", "rsi_30",
                             "cci_30", "dx_30",
                             "open_5_sma", "open_14_sma", "open_30_sma"]

# Number of leading rows removed from every ticker's frame.
# NOTE(review): presumably a warm-up period for the 30-period indicators -
# confirm before changing.
WARMUP_ROWS = 30

# errors='ignore' keeps the cell re-runnable once 'day' has already been dropped.
df = df.drop(columns=['day'], errors='ignore')

i_dfs = {}
i_ss = {}
for tic in tickers:
    # Work on an explicit copy so the column assignments below write to an
    # independent frame instead of a slice of df (avoids SettingWithCopy).
    tic_df = df.loc[df['tic'] == tic].copy()

    # stockstats gets its own copy, as the original deepcopy did, so whatever
    # retype/get do to their frame never touches tic_df.
    ss = stockstats.StockDataFrame.retype(tic_df.copy())
    i_ss[tic] = ss

    for ta in TECHNICAL_INDICATORS_LIST:
        # .values attaches the indicator by position, ignoring the index.
        tic_df[ta] = ss.get(ta).values

    # Expose the open price as 'price' and drop the raw OHLC columns.
    tic_df['price'] = tic_df['open'].values
    tic_df = tic_df.drop(columns=['open', 'high', 'low', 'close'])

    i_dfs[tic] = tic_df.iloc[WARMUP_ROWS:].reset_index(drop=True)


In [98]:
# Preview the processed frame of the first selected ticker. Indexing through
# `tickers` instead of a hard-coded symbol keeps the cell working whichever
# tickers the random draw selected.
i_dfs[tickers[0]]

Unnamed: 0,date,volume,tic,macd,macds,boll_ub,boll_lb,rsi_5,rsi_14,rsi_30,cci_30,dx_30,open_5_sma,open_14_sma,open_30_sma,price
0,2000-02-15,7082717,DIS,0.468651,0.533624,30.740582,25.737044,43.279101,56.173007,59.215025,33.836419,18.493594,36.808781,36.681065,34.998143,36.500500
1,2000-02-16,4475215,DIS,0.423849,0.511652,30.745950,25.900930,44.815170,56.474978,59.366748,47.463648,19.044432,36.673138,36.698681,35.224216,36.377190
2,2000-02-17,4756310,DIS,0.358868,0.481076,30.602087,26.267242,36.229531,53.686265,57.809329,21.280336,8.109662,36.512832,36.681065,35.396853,36.377190
3,2000-02-18,6493867,DIS,0.246309,0.434099,30.223446,26.858659,23.413340,47.867735,54.432701,-10.194273,3.743984,36.401851,36.676661,35.509890,35.883938
4,2000-02-22,12744146,DIS,0.032385,0.353723,30.209182,26.877759,12.071003,38.354924,48.238879,-112.038965,20.767556,36.019582,36.615005,35.635257,34.959095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5316,2021-04-05,7405300,DIS,-0.647872,0.158946,201.491807,181.164194,55.051145,49.439677,53.367550,-39.615883,7.946517,186.560001,189.557143,192.159333,191.470001
5317,2021-04-06,7926200,DIS,-0.482324,0.030692,199.590555,181.847446,61.440533,51.389234,54.123670,-36.614633,7.185582,187.212000,189.014285,192.384666,188.500000
5318,2021-04-07,6238700,DIS,-0.520231,-0.079493,199.161912,181.581089,46.776997,47.881442,52.568051,-55.336930,3.911218,188.406000,188.754286,192.258333,189.800003
5319,2021-04-08,7763000,DIS,-0.563145,-0.176223,198.585272,181.383729,45.282955,47.495314,52.395757,-71.753356,0.739450,188.679999,188.200714,191.912333,187.199997


In [91]:
# Split each ticker's frame chronologically: the first train_frac of the rows
# go to training, the remainder to test. Both halves get a fresh 0-based index.
train_frac = .8
train_dfs, test_dfs = {}, {}
for ticker, frame in i_dfs.items():
    split_at = int(train_frac * len(frame))
    train_dfs[ticker] = frame.iloc[:split_at].reset_index(drop=True)
    test_dfs[ticker] = frame.iloc[split_at:].reset_index(drop=True)
    

In [93]:
# Save the training/test data.
# `with` closes (and flushes) each file as soon as its dump finishes; the
# original left both handles open.
with open('train_ta', 'wb') as train_file:
    pickle.dump(train_dfs, train_file)
with open('test_ta', 'wb') as test_file:
    pickle.dump(test_dfs, test_file)
