In [1]:


import pandas as pd
import yfinance as yf
import numpy as np
import stockstats
import matplotlib.pyplot as plt
import copy

In [2]:

###This class is from FinRL. Basically uses yahoo finance API to pull in stock data
class YahooDownloader:
    """Provides methods for retrieving daily stock data from
    Yahoo Finance API

    Attributes
    ----------
        start_date : str
            start date of the data (modified from config.py)
        end_date : str
            end date of the data (modified from config.py)
        ticker_list : list
            a list of stock tickers (modified from config.py)

    Methods
    -------
    fetch_data()
        Fetches data from yahoo API

    """

    def __init__(self, start_date: str, end_date: str, ticker_list: list):

        self.start_date = start_date
        self.end_date = end_date
        self.ticker_list = ticker_list

    def fetch_data(self) -> pd.DataFrame:
        """Fetches data from Yahoo API
        Parameters
        ----------

        Returns
        -------
        `pd.DataFrame`
            7 columns: A date, open, high, low, close, volume and tick symbol
            for the specified stock ticker
        """
        # Download and save the data in a pandas DataFrame:
        data_df = pd.DataFrame()
        for tic in self.ticker_list:
            temp_df = yf.download(tic, start=self.start_date, end=self.end_date)
            temp_df["tic"] = tic
            data_df = data_df.append(temp_df)
        # reset the index, we want to use numbers as index instead of dates
        data_df = data_df.reset_index()
        try:
            # convert the column names to standardized names
            data_df.columns = [
                "date",
                "open",
                "high",
                "low",
                "close",
                "adjcp",
                "volume",
                "tic",
            ]
            # use adjusted close price instead of close price
            data_df["close"] = data_df["adjcp"]
            # drop the adjusted close price column
            data_df = data_df.drop("adjcp", 1)
        except NotImplementedError:
            print("the features are not supported currently")
        # create day of the week column (monday = 0)
        data_df["day"] = data_df["date"].dt.dayofweek
        # convert date to standard string format, easy to filter
        data_df["date"] = data_df.date.apply(lambda x: x.strftime("%Y-%m-%d"))
        # drop missing data
        data_df = data_df.dropna()
        data_df = data_df.reset_index(drop=True)
        print("Shape of DataFrame: ", data_df.shape)
        # print("Display DataFrame: ", data_df.head())

        data_df = data_df.sort_values(by=['date','tic']).reset_index(drop=True)

        return data_df

    def select_equal_rows_stock(self, df):
        df_check = df.tic.value_counts()
        df_check = pd.DataFrame(df_check).reset_index()
        df_check.columns = ["tic", "counts"]
        mean_df = df_check.counts.mean()
        equal_list = list(df.tic.value_counts() >= mean_df)
        names = df.tic.value_counts().index
        select_stocks_list = list(names[equal_list])
        df = df[df.tic.isin(select_stocks_list)]
        return df

In [3]:
#sptickers is a csv containing S&P 500 ticker names. Here we're just choosing 10 random ones to pull in data for

tickers = np.random.choice(pd.read_csv('sptickers.csv')['Tickers'], 5, replace = False)
#tickers = ['DIS', 'IFF']
downloader = YahooDownloader(start_date='2000-01-01',
                            end_date = '2021-04-10',
                            ticker_list=tickers)

In [4]:
#this downloads the data into a pandas dataframe
df = downloader.fetch_data()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
Shape of DataFrame:  (26298, 8)


In [5]:
df

Unnamed: 0,date,open,high,low,close,volume,tic,day
0,2000-01-03,0.936384,1.004464,0.907924,0.859423,535796800,AAPL,0
1,2000-01-03,34.187500,34.250000,33.812500,14.228443,61900,AVB,0
2,2000-01-03,7.703125,7.734375,7.437500,5.862040,394800,DGX,0
3,2000-01-03,12.467476,12.494579,11.898309,5.283997,684882,FMC,0
4,2000-01-04,0.966518,0.987723,0.903460,0.786965,512377600,AAPL,1
...,...,...,...,...,...,...,...,...
26293,2021-04-09,129.800003,133.039993,129.470001,132.774475,106686700,AAPL,4
26294,2021-04-09,354.679993,361.329987,353.529999,361.010010,833600,ANTM,4
26295,2021-04-09,186.539993,186.899994,185.119995,185.940002,538600,AVB,4
26296,2021-04-09,126.910004,127.300003,125.690002,126.690002,1062100,DGX,4


In [6]:
# This finds "valid" tickers, that is stocks that have data for all days in the specified range. 
# We don't want to deal with missing data
num_days = len(np.unique(df['date']))
valid_tics = []
for tic in tickers:

    if len(df.loc[df['tic'] == tic]) == num_days:
        valid_tics.append(tic)


print (len(valid_tics))
    

4


In [7]:
#Pick two (or more maybe) of these valid tickers
tickers = np.random.choice(valid_tics, 3, replace = False)


In [8]:
tickers

array(['DGX', 'AAPL', 'AVB'], dtype='<U4')

In [9]:
#This cell essentailly calculates the technical indicators and attaches them to the dataframes
#The main output is i_dfs, which is a dictionary of {ticker: stock_dataframe}
#See below output for example

i_dfs = {}
i_ss = {}

df = df.drop(columns = ['day'])
for tic in tickers:
    mask = df['tic'] == tic

    i_dfs[tic] = df[mask]
    i_ss[tic] = stockstats.StockDataFrame.retype(copy.deepcopy(df[mask]))
    
TECHNICAL_INDICATORS_LIST = ["macd", "macds",
                             "boll_ub","boll_lb", 
                             "rsi_14", "cci_14", "dx_14",
                             "open_14_sma", "pdi", "mdi",
                            "dx", "adx", "vr", "wr_14"]

for key, ss in i_ss.items():

   
    for ta in TECHNICAL_INDICATORS_LIST:

        i_dfs[key][ta] = ss.get(ta).values
        
        
for key, val in i_dfs.items():
    val['price'] = val['close'].values
    i_dfs[key] = val.drop(columns = ['open', 'high', 'low', 'close'])

for key, val in i_dfs.items():
    i_dfs[key] = val[30:].reset_index(drop=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [10]:
#split into training and test datasets
train_dfs = {}
test_dfs = {}
train_frac = .8
for key, val in i_dfs.items():
    data_len = len(val)
    train_idx = int(train_frac*data_len)
    
    train_dfs[key] = val[:train_idx].reset_index(drop=True)
    test_dfs[key] = val[train_idx:].reset_index(drop=True)
    

In [11]:
#Save the training/test data
import pickle
pickle.dump(train_dfs, open('train_ta', 'wb'))
pickle.dump(test_dfs, open('test_ta', 'wb'))
