<a href="https://colab.research.google.com/github/kconstable/market_predictions/blob/main/predict_market_prices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict Market Prices

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import requests
import time
import pickle
import re
import datetime
from datetime import timedelta
from tabulate import tabulate
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.stattools import adfuller


# alphavalue key
with open('/content/drive/MyDrive/Colab Notebooks/data/av_key.txt') as f:
    key = f.read().strip()

Mounted at /content/drive



pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



## Functions

In [2]:
def get_economic_indicators(funct,key,interval=None,maturity=None,throttle=0):
  """
  Returns Economic Indicator Data with missing values interpolated between dates
  Monthly Data:
    NONFARM_PAYROLL, INFLATION_EXPECTATION,CONSUMER_SENTIMENT,UNEMPLOYMENT
  Daily, Weekly, Monthly Data:  
    FEDERAL_FUNDS_RATE = interval (daily,weekly,monthly)
    TREASURY_YIELD = interval (daily, weekly, monthly), 
                     maturity (3month, 5year, 10year, and 30year)
  """
  
  # query strings
  # Monthly Data:
  if funct in ['NONFARM_PAYROLL','INFLATION_EXPECTATION','CONSUMER_SENTIMENT','UNEMPLOYMENT']:
    url = f'https://www.alphavantage.co/query?function={funct}&apikey={key}'

  # Daily, Weekly or Monthly Data:
  # Interest Rates
  if funct == 'FEDERAL_FUNDS_RATE':
    url = f'https://www.alphavantage.co/query?function={funct}&interval={interval}&apikey={key}'

  # Treasury Yield  
  if funct == 'TREASURY_YIELD':
    url = f'https://www.alphavantage.co/query?function={funct}&interval={interval}&maturity={maturity}&apikey={key}'

  # pull data
  r = requests.get(url)
  time.sleep(throttle)
  d = r.json()

  # convert to df
  df = pd.DataFrame(d['data'])

  # move date to a datetime index
  df.date = pd.to_datetime(df.date)
  df.set_index('date',inplace=True)

  # add the ticker name and frequency
  df['name'] = d['name']
  df['interval']=d['interval'] 

  # clean data & interpolate missing values
  # missing data encoded with '.'
  # change datatype to float
  df.replace('.',np.nan,inplace=True)
  df.value = df.value.astype('float')

  # missing data stats
  missing =sum(df.value.isna())
  total =df.shape[0]
  missing_pct = round(missing/total*100,2)

  # interpolate using the time index
  if missing >0:
    df.value.interpolate(method='time',inplace=True)
    action = 'interpolate'
  else:
    action = 'none'

  # Print the results
  if maturity is not None:
    summary = ['Economic Indicator',funct+':'+maturity,str(total),str(missing),str(missing_pct)+'%',action]
  else:
    summary = ['Economic Indicator',funct,str(total),str(missing),str(missing_pct)+'%',action]


  return {'summary':summary,'data':df}

In [3]:
def get_technical_indicators(symbol,funct,key,interval,time_period=None,throttle=0):
  """
  Returns Technical Indicators (only works for stocks, not cyrpto)
  MACD:   symbol,interval
  RSI:    symbol,interval,time_period
  BBANDS: symbol,interval,time_period

  Parameters:
          interval: (1min, 5min, 15min, 30min, 60min, daily, weekly, monthly)
          series_type: (open, close,high,low)-default to close
          timer_periods: Integer
  """
  # build the query string
  if funct =='MACD':
    url = f'https://www.alphavantage.co/query?function={funct}&symbol={symbol}&interval={interval}&series_type=close&apikey={key}'
  if funct in ['RSI','BBANDS']:
    url = f'https://www.alphavantage.co/query?function={funct}&symbol={symbol}&interval={interval}&series_type=close&time_period={time_period}&apikey={key}'

  # request data as json, convert to dict, pause request to avoid the data throttle
  r = requests.get(url)
  time.sleep(throttle)
  d = r.json()

  # extract to a df, add the indicator name, convert the index to datetime
  df = pd.DataFrame(d[f'Technical Analysis: {funct}']).T
  df.index = pd.to_datetime(df.index)

  # convert the data to float
  for col in df.columns:
    df[col] = df[col].astype('float')

  # check for missing data
  missing = df.isnull().any().sum()
  total = len(df)
  missing_pct = round(missing/total*100,2)


  # Print the results
  summary=['Technical Indicator',funct,str(total),str(missing),str(missing_pct)+'%','none']

  return {'summary':summary,'data':df}

In [4]:
def get_crypto_data(symbol,key):
  """
  Pulls daily crypto prices from alpha advantage.
  Inputs:
    symbol: ETH, BTC, DOGE
    key:    The alpha advantage API key
  Output:
    a dataframe of crypto prices: open,high,low, close, volume
  """
  # build query string, get data as json and convert to a dict
  url = f'https://www.alphavantage.co/query?function=DIGITAL_CURRENCY_DAILY&symbol={symbol}&market=CAD&apikey={key}'
  r = requests.get(url)
  d = r.json()

  # extract data to df
  df=pd.DataFrame(d['Time Series (Digital Currency Daily)']).T

  # remove columns not required
  # returns the price in two currencies, just keep USD
  cols = [c for c in df.columns if '(CAD)' not in c]
  df=df.loc[:, cols]
  df.columns = ['open','high','low','close','volume','marketcap']
  df.drop(['marketcap'],axis=1,inplace=True)

  # change data types
  df.index = pd.to_datetime(df.index)

  # convert datatype to float
  for col in ['open','high','low','close','volume']:
    df[col] = df[col].astype('float')

  # add the cyrpto name
  df['symbol'] = d['Meta Data']['3. Digital Currency Name']

  return df

In [5]:
def calc_bollinger(df,feature,window=20,st=2):
  """
  Calculates bollinger bands for a price time-series.  Used for crypto currencies
  Input: 
    df     : A dataframe of time-series prices
    feature: The name of the feature in the df to calculate the bands for
    window : The size of the rolling window.  Defaults to 20 days with is standard
    st     : The number of standard deviations to use in the calculation. 2 is standard 
  Output: 
    Returns the df with the bollinger band columns added
  """

  # rolling mean and stdev
  rolling_m  = df[feature].rolling(window).mean()
  rolling_st = df[feature].rolling(window).std()

  # add the upper/lower and middle bollinger bands
  df['b-upper']  = rolling_m + (rolling_st * st)
  df['b-middle'] = rolling_m 
  df['b-lower']  = rolling_m - (rolling_st * st)

In [6]:
def calc_rsi(df,feature='close',window=14):
  """
  Calculates the RSI for the input feature
  Input:
    df      : A dataframe with a time-series of prices
    feature : The name of the feature in the df to calculate the bands for
    window  : The size of the rolling window.  Defaults to 14 days which is standard
  Output: 
    Returns the df with the rsi band column added
  """
  # RSI
  # calc the diff in daily prices, exclude nan
  diff =df[feature].diff()
  diff.dropna(how='any',inplace=True)

  # separate positive and negitive changes
  pos_m, neg_m = diff.copy(),diff.copy()
  pos_m[pos_m<0]=0
  neg_m[neg_m>0]=0

  # positive/negative rolling means
  prm = pos_m.rolling(window).mean()
  nrm = neg_m.abs().rolling(window).mean()

  # calc the rsi and add to the df
  ratio = prm /nrm
  rsi = 100.0 - (100.0 / (1.0 + ratio))
  df['rsi']=rsi

In [7]:
def calc_macd(df,feature='close'):
  """
  Calculates the MACD and signial for the input feature
  Input:
    df      : A dataframe with a time-series of prices
    feature : The name of the feature in the df to calculate the bands for
  Output: 
    Returns the df with the macd columns added
  """
  ema12 = df[feature].ewm(span=12,adjust=False).mean()
  ema26 = df[feature].ewm(span=26,adjust=False).mean()
  df['macd']=ema12-ema26
  df['macd_signal'] = df['macd'].ewm(span=9,adjust=False).mean()

In [8]:
def get_ticker_data(symbol,key,outputsize='compact',throttle=0):
  """
  Returns daily data for a stock (symbol)
    outputsize: compact(last 100) or full (20 years)
    key: apikey
    symbols: OILK (oil ETF),BAR(gold ETF),VXZ (volatility ETF)
  """
  url = f'https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol={symbol}&outputsize={outputsize}&apikey={key}'
  r = requests.get(url)
  time.sleep(throttle)
  d = r.json()

  # extract data to a df
  df = pd.DataFrame(d['Time Series (Daily)']).T
  df.columns = ['open','high','low','close','volume']
  df['symbol'] = d['Meta Data']['2. Symbol']

  # change data types
  df.index = pd.to_datetime(df.index)

  # convert datatype to float
  for col in ['open','high','low','close','volume']:
    df[col] = df[col].astype('float')

  # Calculate missing data
  missing = sum(df.close.isna())
  total = df.shape[0]
  missing_pct = round(missing/total*100,2)

  # Print the results
  summary = ['Ticker',symbol,str(total),str(missing),str(missing_pct)+'%','none']

  return {'summary':summary,'data':df}

In [9]:
def get_consolidated_stock_data(symbol,key,config,outputsize='compact',throttle=30,dropna=True):
  """
  Pulls data from alpha advantage and consolidates
  API Limitations: 5 API requests per minute and 500 requests per day
  Inputs:
    symbol: stock ticker
    key   : api key
    config: dictionary which lists the economic, technical and commodities to pull
    outputsize: compact(latest 100) or full (up to 20 years of daily data)
    throttle: number of seconds to wait between api requests
    dropna: True/False, drops any records with nan
  Output:
    A dataframe with consolidated price data for the symbol + economic/technical
    indicators and commodity prices
  """

  # Result header and accumulator
  header = ['Type','Data','Total','Missing',' % ','Action']
  summary =[]

  # Get stock prices
  try:
    results  = get_ticker_data(symbol,key,outputsize,0)
    dff = results['data']
    summary.append(results['summary'])
    print(f'Complete:===>Ticker:{symbol}')
  except:
    print(f'Error:===>Ticker:{symbol}')



  # Get Commodity prices
  # ****************************************************************************
  for commodity in config['Commodities']:
    try:
      # get prices
      results = get_ticker_data(commodity,key,outputsize,throttle)
      df = results['data']
      summary.append(results['summary'])
      print(f'Complete:===>Commodity:{commodity}')


      # rename close to commodity name, remove unneeded columns and join with 
      # the stock prices by date
      df.rename(columns={'close':commodity},inplace=True)
      df.drop(['open','high','low','volume','symbol'],axis=1,inplace=True)
      dff = dff.join(df,how='left')
    except:
      print(f"Error===>Commodity:{commodity}")


  # Economic Indicators
  # ****************************************************************************
  # loop through the config to pull the requested data
  for indicator,values in config['Economic'].items():
    if indicator == 'TREASURY_YIELD':
      for tr in values:
        try:
          results = get_economic_indicators(indicator,key,interval=tr['interval'],maturity=tr['maturity'],throttle=throttle)
          summary.append(results['summary'])
          print(f"Complete:===>{indicator}:{tr['maturity']}")

          df = results['data']
          dff = dff.join(df,how='left')
          dff.rename(columns={"value": tr['name']},inplace=True)
          dff.drop(['name', 'interval'], axis=1,inplace = True)
        except:
          print(f"Error===>{indicator}:{tr['maturity']}")
   
    else: 
      # daily
      if values['interval']=='daily':
        try:
          results = get_economic_indicators(indicator,key,interval=values['interval'],throttle=throttle)
          df = results['data']
          summary.append(results['summary'])
          print(f"Complete:===>{indicator}")

          dff = dff.join(df,how='left')
          dff.rename(columns={"value": values['name']},inplace=True)
          dff.drop(['name', 'interval'], axis=1,inplace = True)
        except:
          print(f"Error===>{indicator}")
  
      else: 
        try:
          # monthly or weekly
          results = get_economic_indicators(indicator,key,throttle=throttle)
          summary.append(results['summary'])
          df = results['data']
          print(f"Complete:===>{indicator}")

          # reindex to daily, fill missing values forward
          days = pd.date_range(start = min(df.index),end =max(df.index),freq='D')
          df =df.reindex(days,method = 'ffill')
      
          # join with the other data
          dff = dff.join(df,how='left')
          dff.rename(columns={"value": values['name']},inplace=True)
          dff.drop(['name', 'interval'], axis=1,inplace = True)
        except:
          print(f"Error===>{indicator}")

  # # Technical Indicators
  # ****************************************************************************
  for indicator,values in config['Technical'].items():
    try:
      results = get_technical_indicators(symbol,indicator,key,values['interval'],values['time_period'],throttle)
      df = results['data']
      summary.append(results['summary'])

      dff = dff.join(df,how='left')
      print(f"Complete:===>{indicator}")
    except:
      print(f"Error===>{indicator}")

  
  # clean column names
  dff.rename(columns={"Real Upper Band":'b-upper',
                      "Real Lower Band":'b-lower',
                      "Real Middle Band":"b-middle",
                      "RSI":"rsi",
                      "MACD_Hist":"macd_hist",
                      "MACD_Signal":"macd_signal",
                      "MACD":"macd"
                      },inplace=True)
      

  # Fill in any missing data after joining all datasets
  dff.fillna(method='bfill',inplace=True,axis = 0)

  # drop rows with missing commodity prices
  if dropna:
    dff.dropna(how='any',inplace=True)

  # print the results table
  print("\n\n")
  print(tabulate(summary,header))

  return dff

In [10]:
def get_consolidated_crypto_data(symbol,key,config,boll_window=20,boll_std=2,rsi_window=14,throttle=30,dropna=True):
  """
  Pulls data from alpha advantage and consolidates
  API Limitations: 5 API requests per minute and 500 requests per day
  Inputs:
    symbol: crypto ticker
    key   : api key
    config: dictionary which lists the economic indicators and commodities to pull
    throttle: number of seconds to wait between api requests
    dropna: True/False, drops any records with nan
  Output:
    A dataframe with consolidated price data for the symbol + economic/technical
    indicators and commodity prices
  """

  # Result header and accumulator
  header = ['Type','Data','Total','Missing',' % ','Action']
  summary =[]

  # Get crypto prices
  try:
    dff  = get_crypto_data(symbol,key)
    
    # add month feature
    dff['month'] = dff.index.month

    print(f'Complete:===>Crypto:{symbol}')

  except:
    print(f'Error:===>Crypto:{symbol}')


  # Get Commodity prices
  # ****************************************************************************
  for commodity in config['Commodities']:
    try:
      # get prices
      results = get_ticker_data(commodity,key,'full',throttle)
      df = results['data']
      summary.append(results['summary'])
      print(f'Complete:===>Commodity:{commodity}')


      # rename close to commodity name, remove unneeded columns and join with 
      # the stock prices by date
      df.rename(columns={'close':commodity},inplace=True)
      df.drop(['open','high','low','volume','symbol'],axis=1,inplace=True)
      dff = dff.join(df,how='left')
    except:
      print(f"Error===>Commodity:{commodity}")


  # Economic Indicators
  # ****************************************************************************
  # loop through the config to pull the requested data
  for indicator,values in config['Economic'].items():
    if indicator == 'TREASURY_YIELD':
      for tr in values:
        try:
          results = get_economic_indicators(indicator,key,interval=tr['interval'],maturity=tr['maturity'],throttle=throttle)
          summary.append(results['summary'])
          print(f"Complete:===>{indicator}:{tr['maturity']}")

          df = results['data']
          dff = dff.join(df,how='left')
          dff.rename(columns={"value": tr['name']},inplace=True)
          dff.drop(['name', 'interval'], axis=1,inplace = True)
        except:
          print(f"Error===>{indicator}:{tr['maturity']}")
   
    else: 
      # daily
      if values['interval']=='daily':
        try:
      
          results = get_economic_indicators(indicator,key,interval=values['interval'],throttle=throttle)
          df = results['data']
          summary.append(results['summary'])
          print(f"Complete:===>{indicator}")

          dff = dff.join(df,how='left')
          dff.rename(columns={"value": values['name']},inplace=True)
          dff.drop(['name', 'interval'], axis=1,inplace = True)
        except:
          print(f"Error===>{indicator}")
  
      else: 
        try:
          # monthly or weekly
         
          results = get_economic_indicators(indicator,key,throttle=throttle)
          summary.append(results['summary'])
          df = results['data']
          print(f"Complete:===>{indicator}")

          # reindex to daily, fill missing values forward
          days = pd.date_range(start = min(df.index),end =max(df.index),freq='D')
          df =df.reindex(days,method = 'ffill')
      
          # join with the other data
          dff = dff.join(df,how='left')
          dff.rename(columns={"value": values['name']},inplace=True)
          dff.drop(['name', 'interval'], axis=1,inplace = True)
        except:
          print(f"Error===>{indicator}")

  # # Technical Indicators
  # ****************************************************************************
  calc_rsi(dff,'close',rsi_window)
  calc_bollinger(dff,'close',boll_window,boll_std)
  calc_macd(dff,'close')
      

  # Fill in any missing data after joining all datasets
  dff.fillna(method='bfill',inplace=True,axis = 0)

  # drop rows with missing commodity prices
  if dropna:
    dff.dropna(how='any',inplace=True)

  # print the results table
  print("\n\n")
  print(tabulate(summary,header))

  return dff

In [11]:
def transform_stationary(df,features_to_transform,transform='log'):
  """
  Transform time-series data using a log or boxcox transform.  Calculate the augmented
  dickey-fuller (ADF) test for stationarity after the transform
  Inputs:
    df: a dataframe of features
    features_to_transform: A list of features to apply the transform
    transform: The transform to apply (log, boxbox)
  Output
    Applies the transforms inplace in df
  """
  # transform each column in the features_to_transform list
  for feature in df.columns:
    if feature in features_to_transform:
      # log transform
      if transform=='log':
        df[feature] = df[feature].apply(np.log)

      # boxcox transform  
      elif transform=='boxcox':
        bc,_ = stats.boxcox(df[feature])
        df[feature] = bc

      else:
        print("Transformation not recognized")

  # check the closing price for stationarity using the augmented dicky fuller test
  t_stat, p_value, _, _, critical_values, _  = adfuller(df.close.values, autolag='AIC')
  print('\n\nAugmented Dicky Fuller Test for Stationarity')
  print("="*60)
  print(f'ADF Statistic: {t_stat:.2f}')
  for key, value in critical_values.items():
    print('Critial Values:')
    if t_stat < value:
      print(f'   {key}, {value:.2f} => non-stationary')
    else:
      print(f'   {key}, {value:.2f} => stationary')


In [12]:
def shift_features(df,features_shift):
  """
  Shifts features by time periods to convert them to lagged indicators
  Input:
    df: dataframe of features
    features_shift: dictionary  of {feature:period shift}
  Output:
    df: original dataframe + the shifted features
  """
  dff = df.copy()
  for feature,shift in features_shift.items():
    t_shift = pd.DataFrame(dff[feature].shift(periods=shift))
    dff =dff.join(t_shift,how='left',rsuffix='_shift')

  # remove nan introducted with lag features
  dff.dropna(how='any',inplace=True)

  return dff

In [13]:
def prepare_data(df,n_steps,features=[]):
  """
  Filter, scale and convert dataframe data to numpy arrays

  Inputs: 
    df       => A dataframe of observations with features and y-labels
    y        => The name of the column that is the truth labels
    features => A list of features.  Used to subset columns

  Outputs:
    scaled_y => numpy array of the y-label data
    scaled_x => numpy array of the training features

  """

  # subset the latest n_steps rows to be used for prediction
  df = df.iloc[0:n_steps,:]

  # reverse the index such that dates are in chronological order
  df = df.iloc[::-1]

  # Subset features, get the y-label values
  df_y = df['close']
  df_X = df[features]

  # replace the date index with an integer index
  idx_dates = df.index
  df_X.reset_index(drop=True,inplace=True)

  # convert to numpay arrays
  array_X = np.array(df_X)
  array_y = np.array(df_y).reshape(-1,1)


  # print the output
  print("\nData Preparation")
  print("="*60)
  print(f"=> {len(features)} Features")
  print(f"=> Input Dimensions :{array_X.shape}")
  print("\n")

  return idx_dates, array_y,array_X

In [14]:
def make_predictions(model,scaler,scaled_X,n_steps,n_features,n_pred,start_date):
  """
  Predict the next n_pred days with n_steps of daily data
  Input:
    model: A trained LSTM model
    scaler: The scaler used
    scaled_X: scaled input features
    n_steps: the number of input days used in the model
    n_features: the number of features used in the model
    n_pred: the number of days predicted in the model
    start_date: the start date of the prediction window
  Output:
    a data frame of predicted prices
  """

  # Predict the prices
  y_pred_scaled = model.predict(scaled_X.reshape(1,n_steps,n_features))

  # convert units back to the original scale
  y_pred_unscaled = scaler.inverse_transform(y_pred_scaled)

  # convert from log transform back to original scale
  y_pred_np = np.exp(y_pred_unscaled)

  # set the date index
  pred_dates = pd.date_range(start_date + datetime.timedelta(days=1), periods=n_pred,freq='D').tolist()


  # convert to dataframe
  df_pred = pd.DataFrame(y_pred_np.T,columns=['pred'])
  df_pred['actual']= np.nan
  df_pred['date'] = pred_dates
  df_pred.set_index(['date'],inplace=True)
  df_pred.index = pd.to_datetime(df_pred.index)
  df_pred =df_pred[['actual','pred']]

  return df_pred

In [None]:
def roll_predictions(df_new,df_pred=None,df_hist =None):
  """
  Updates previous predicted prices with actual prices, and adds
  the next n_pred prediction window
  Input:
    df_new: The new dataset of inputs
    df_pred: A dataframe of predicted prices
    df_hist: A dataframe that stores the actual/predicted prices 
            (the output of this function)
  Output:
    A dataframe of up to date prices with the next prediction window.
    Incluces the daily and cumulative prediction error
  """
  # First time creating history file
  if df_pred is None and df_hist is None:
    # create the initial history of prices (without predictions)
    df_hist_new = pd.DataFrame(df_new['close'],columns=['close'])
    df_hist_new.columns = ['actual']
    df_hist_new['pred']=np.nan
    df_hist_new['diff']=np.nan
    df_hist_new['diff_cum']=np.nan
    df_hist_new['actual_pct']=np.nan
    df_hist_new['pred_pct']=np.nan


  else:
    # append to existing history file
    # make a copy of df_hist
    df = df_hist.copy()

    # Get yesterdays closing price
    yesterday = df_new.index.max()
    yesterdays_close = df_new.loc[yesterday,'close'].item()

    # update df_hist with yesterdays_close,
    update_price(df,yesterday,yesterdays_close,'actual')


    # remove old predictions 
    # yesterdays nan should have been replaced in the prevous step
    df = df[~df['actual'].isnull()]

    # add new predictions
    df_hist_new =pd.concat([df,df_pred])

    # calculate the difference between actual/predicted values
    # for current period and cumulative
    df_hist_new['diff'] = df_hist_new['pred']-df_hist_new['actual']
    df_hist_new['diff_cum'] = df_hist_new['diff'].cumsum()

    # calculate the percent change in actual / predicted values
    df_hist_new['actual_pct'] = df_hist_new['actual'].pct_change() * 100
    df_hist_new['pred_pct'] = df_hist_new['pred'].pct_change() * 100

    #sort by date
    df_hist_new.sort_index(inplace=True)

  return df_hist_new

In [120]:
# def roll_predictions2(df_new,df_pred=None,df_hist =None):
#   """
#   Updates previous predicted prices with actual prices, and adds
#   the next n_pred prediction window
#   Input:
#     df_new: The new dataset of inputs
#     df_pred: A dataframe of predicted prices
#     df_hist: A dataframe that stores the actual/predicted prices 
#             (the output of this function)
#   Output:
#     A dataframe of up to date prices with the next prediction window.
#     Incluces the daily and cumulative prediction error
#   """
#   # reverse the order of the dataframe
#   df_new = df_new.iloc[::-1]


#   # First time creating history file
#   if df_pred is None and df_hist is None:
#     # create the initial history of prices (without predictions)
#     df_hist_new = pd.DataFrame(df_new['close'],columns=['close'])
#     df_hist_new.columns = ['actual']
#     df_hist_new['pred']=np.nan
#     df_hist_new['diff']=np.nan
#     df_hist_new['diff_cum']=np.nan
#     df_hist_new['actual_pct']=np.nan
#     df_hist_new['diff_pct']=np.nan
#     df_hist_new['naive']=np.nan
#     df_hist_new['naive_diff']=np.nan
#     df_hist_new['naive_diff_cum']=np.nan
#     df_hist_new['naive_diff_pct']=np.nan

#   else:
#     # append to existing history file
#     # make a copy of df_hist
#     df = df_hist.copy()

#     # Get yesterdays closing price
#     yesterday = df_new.index.max()
#     yesterdays_close = df_new.loc[yesterday,'close'].item()
#     naive_close = df_new.loc[yesterday - datetime.timedelta(days=1),'close'].item()

#     # update df_hist with yesterdays_close,
#     update_price(df,yesterday,yesterdays_close,'actual')

#     # update df_hist naive prediction with yesterays close
#     update_price(df,df_new.index.max(),naive_close,'naive')


#     # remove old predictions 
#     # yesterdays nan should have been replaced in the prevous step
#     df = df[~df['actual'].isnull()]

#     # add new predictions
#     df_hist_new =pd.concat([df,df_pred])

#     # calculate the difference between actual/predicted values
#     # for current period and cumulative
#     df_hist_new['diff'] = df_hist_new['pred']-df_hist_new['actual']
#     df_hist_new['diff_cum'] = df_hist_new['diff'].cumsum()
#     df_hist_new['naive_diff'] = df_hist_new['naive']-df_hist_new['actual']
#     df_hist_new['naive_diff_cum'] = df_hist_new['naive_diff'].cumsum()

#     # calculate the percent change in actual / predicted values
#     df_hist_new['actual_pct'] = df_hist_new['actual'].pct_change() * 100
#     df_hist_new['diff_pct'] = df_hist_new['diff'] / df_hist_new['actual'] *100
#     df_hist_new['naive_diff_pct'] = df_hist_new['naive_diff'] / df_hist_new['actual']*100


#     #sort by date
#     df_hist_new.sort_index(inplace=True)

#   return df_hist_new

In [16]:
def update_price(df_hist,date,value,type='actual'):
  """
  Updates the price in the history dataframe
  Input: 
    df_hist: the dataframe that contains the history of prices/predicitons/metrics
    date: the date to update
    value: the price to update
    type: actual or predicted price to update
  """
  # update the price as of the date
  # should be yesterdays price
  df_hist.at[date,type]=value


In [139]:
def plot_actual_predicted(name,df):
  """
  Plots the prices as a time-series showing actual/predicted values with daily and 
  cumulative prediction errors
  Input:
    name: the name of the stock/crypto
    df: the historical dataframe (output from roll_predictions)
  """
  fig = make_subplots(rows=3, 
                      cols=1,
                      shared_xaxes=True,
                      vertical_spacing=0.1,
                      subplot_titles = ('Actual vs Predicted Closing Price','Daily Error','Cumulative Error'))
  # Actual prices
  fig.add_trace(go.Scatter(
      x=df.index,
      y=df.actual,
      fill='tozeroy',
      mode = 'lines',
      line =dict(color="#ccc"),
      name = 'Actual'),
      row=1,col=1
  )
  # predicted prices
  fig.add_trace(go.Scatter(
      x=df.index,
      y=df.pred,
      fill = 'tozeroy',
      mode = 'lines+markers',
      line = dict(color='rgba(247, 12, 55, 0.1)'),
      marker =dict(size=5),
      name= 'Predicted'),
      row=1,col=1
  )
  # daily error
  fig.add_trace(go.Bar(
      x = df.index,
      y = df['diff'],
      name = 'Error',
      marker_color = 'rgba(247, 12, 55, 0.5)'
  ),row=2,col=1)

  # cumulative error
  fig.add_trace(go.Scatter( 
      x=df.index,
      y=df['diff_cum'],
      fill = 'tozeroy',
      line = dict(color='rgba(247, 12, 55, 0.5)'),
      mode = 'lines',
      name = 'Cumulative Error-LSTM Model'
  ),row=3,col=1)
  
  fig.update_layout(height=600, 
                    width=800, 
                    template = 'plotly_white',
                    title_text=f"{name}: Actual Vs. Predicted Prices")
  fig.show()


In [140]:
# def plot_actual_predicted2(name,df):
#   """
#   Plots the prices as a time-series showing actual/predicted values with daily and 
#   cumulative prediction errors
#   Input:
#     name: the name of the stock/crypto
#     df: the historical dataframe (output from roll_predictions)
#   """
#   fig = make_subplots(rows=3, 
#                       cols=1,
#                       shared_xaxes=True,
#                       vertical_spacing=0.1,
#                       subplot_titles = ('Actual vs Predicted Closing Price','Daily Error','Cumulative Error'))
#   # Actual prices
#   fig.add_trace(go.Scatter(
#       x=df.index,
#       y=df.actual,
#       fill='tozeroy',
#       mode = 'lines',
#       line =dict(color="#ccc"),
#       name = 'Actual'),
#       row=1,col=1
#   )
#   # predicted prices
#   fig.add_trace(go.Scatter(
#       x=df.index,
#       y=df.pred,
#       fill = 'tozeroy',
#       mode = 'lines+markers',
#       line = dict(color='rgba(247, 12, 55, 0.1)'),
#       marker =dict(size=5),
#       name= 'Predicted'),
#       row=1,col=1
#   )
#   # daily error
#   fig.add_trace(go.Bar(
#       x = df.index,
#       y = df['diff'],
#       name = 'Error',
#       marker_color = 'rgba(247, 12, 55, 0.5)'
#   ),row=2,col=1)

#   # cumulative error
#   fig.add_trace(go.Scatter( 
#       x=df.index,
#       y=df['diff_cum'],
#       fill = 'tozeroy',
#       line = dict(color='rgba(247, 12, 55, 0.5)'),
#       mode = 'lines',
#       name = 'Cumulative Error-LSTM Model'
#   ),row=3,col=1)
  
#   fig.add_trace(go.Scatter(
#       x=df.index,
#       y=df['naive_diff_cum'],
#       fill = 'tozeroy',
#       line = dict(color='rgba(53, 81, 92, 0.5)'),
#       mode = 'lines',
#       name = 'Cumulative Error- Naive Model'
#   ),row=3,col=1)

# Historical Back Test
+ pull all data to current date
+ start in jan 1,2021 and make the 5 day prediction
+ roll the model forward 1 day, make the next 5 day prediction
+ increment by 1 day, and repeat until the current date is reached

## VM Ware


In [None]:
# initial hist file
stock = 'VMW'
features_to_transform = ['open','close','high','low']
transform = 'log'
n_steps = 40
n_predict = 5
config ={'Economic':
         {'TREASURY_YIELD':[{'interval':'daily','maturity':'5year','name':'yield5y'},
                            {'interval':'daily','maturity':'10year','name':'yield10y'},
                            {'interval':'daily','maturity':'30year','name':'yield30y'},
                            {'interval':'daily','maturity':'3month','name':'yield3m'}
                            ],
          'FEDERAL_FUNDS_RATE':{'interval':'daily','name':'ir'},
          'NONFARM_PAYROLL':{'interval':'monthly','name':'nfp'},
          'UNEMPLOYMENT':{'interval':'monthly','name':'unemployment'},
          'CONSUMER_SENTIMENT':{'interval':'monthly','name':'cs'},
          'INFLATION_EXPECTATION':{'interval':'monthly','name':'infl'},
          },
         'Technical':{
           'BBANDS':{'interval':'daily','time_period':20},
           'RSI':{'interval':'daily','time_period':14},
           'MACD':{'interval':'daily','time_period':None}
           },
         'Commodities':['GLD','OIL','SPY','VXX','QQQ','SKYY','VGT']
         }

# get full data history
df_new = get_consolidated_stock_data(stock,key,config,'full')


# get the trained model
model = keras.models.load_model(f'/content/drive/MyDrive/Colab Notebooks/models/model_VWM_final')

# Get the features used in the final model
df_vmw_features = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_market_data.pickle')
features = [f for f in df_vmw_features.columns if f not in ['symbol']]

# # Backtest to Jan 1,2021
start_date = '2021-1-1'

# Create first df_hist file
df_tmp = df_new.loc[df_new.index <=start_date]
df_hist_new =roll_predictions(df_tmp)
df_hist_new.to_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_hist.pickle')

# backtest
# datelist = pd.date_range(start_date, periods=266).tolist()
datelist = pd.date_range(start='2021-01-01',end='2021-09-24',freq='B').tolist()
for dt in datelist:

  print(dt)
  print("="*60)
  df_tmp = df_new.loc[df_new.index <=dt]
  df_orig = df_tmp.copy()

  # transform
  print("=> log transpose\n")
  transform_stationary(df_tmp,features_to_transform,transform)

  #prepare
  print("=>prepare data\n")
  idx_dates, array_y, array_X = prepare_data(df_tmp,n_steps,features)

  # scale the input and outputs
  print("=>scale data\n")
  scaler_X = MinMaxScaler(feature_range=(0,1))
  scaled_X = scaler_X.fit_transform(array_X)
  scaler_y = MinMaxScaler(feature_range=(0,1))
  scaled_y = scaler_y.fit_transform(array_y)


  # make predictions
  print("=>make predictions\n")
  df_pred =make_predictions(model,scaler_y,scaled_X,n_steps,len(features),n_predict,df_tmp.index.max())

  # get the previous df_hist data
  print("=>get previous file\n")
  df_hist = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_hist.pickle')

  # update the hist file with yesterdays close price, and add the new predictions
  print("=>roll file forward\n")
  df_hist_new =roll_predictions(df_orig,df_pred,df_hist)
  df_hist_new.to_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_hist.pickle')


# plot
plot_actual_predicted(stock,df_hist_new)


## Bitcoin

In [None]:
stock = 'BTC'
features_to_transform = ['open','close','high','low','b-upper','b-lower','b-middle']
transform = 'log'
n_steps = 40
n_predict = 5

config ={'Economic':
         {'TREASURY_YIELD':[
                            {'interval':'daily','maturity':'5year','name':'yield5y'},
                            {'interval':'daily','maturity':'10year','name':'yield10y'}
                            ],
          },
         'Commodities':['SPY']
         }

get full data history
df_new = get_consolidated_crypto_data(stock,key)


get the trained model
model = keras.models.load_model(f'/content/drive/MyDrive/Colab Notebooks/models/model_BTC_log_features_optimized')

# Get the features used in the final model
df_btc_features = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_market_data_features.pickle')
features = [f for f in df_btc_features.columns if f not in ['symbol']]


# initial hist file
start_date = '2021-01-01'
df_tmp = df_new.loc[df_new.index <=start_date]
df_hist_new =roll_predictions(df_tmp)
df_hist_new.to_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_hist.pickle')

# backtest for sep 2021
datelist = pd.date_range(start_date, periods=270).tolist()
for dt in datelist:

  print(dt)
  print("="*60)
  df_tmp = df_new.loc[df_new.index <=dt]
  df_orig = df_tmp.copy()

  # transform
  print("=> log transpose\n")
  transform_stationary(df_tmp,features_to_transform,transform)

  #prepare
  print("=>prepare data\n")
  idx_dates, array_y, array_X = prepare_data(df_tmp,n_steps,features)

  # scale the input and outputs
  print("=>scale data\n")
  scaler_X = MinMaxScaler(feature_range=(0,1))
  scaled_X = scaler_X.fit_transform(array_X)
  scaler_y = MinMaxScaler(feature_range=(0,1))
  scaled_y = scaler_y.fit_transform(array_y)


  # make predictions
  print("=>make predictions\n")
  # df_pred =make_predictions(model,scaler_y,scaled_X,n_steps,len(features),n_predict,df_tmp.index.max()-datetime.timedelta(days=1))
  df_pred =make_predictions(model,scaler_y,scaled_X,n_steps,len(features),n_predict,df_tmp.index.max())

  # get the previous df_hist data
  print("=>get previous file\n")
  df_hist = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_hist.pickle')

  # update the hist file with yesterdays close price, and add the new predictions
  print("=>roll file forward\n")
  df_hist_new =roll_predictions(df_orig,df_pred,df_hist)
  df_hist_new.to_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_hist.pickle')


# plot
plot_actual_predicted(stock,df_hist_new)


In [None]:
# BTC- with naive model
# stock = 'BTC'
# features_to_transform = ['open','close','high','low','b-upper','b-lower','b-middle']
# transform = 'log'
# n_steps = 40
# n_predict = 5

# config ={'Economic':
#          {'TREASURY_YIELD':[
#                             {'interval':'daily','maturity':'5year','name':'yield5y'},
#                             {'interval':'daily','maturity':'10year','name':'yield10y'}
#                             ],
#           },
#          'Commodities':['SPY']
#          }

# get full data history
# df_new = get_consolidated_crypto_data(stock,key)


# get the trained model
# model = keras.models.load_model(f'/content/drive/MyDrive/Colab Notebooks/models/model_BTC_log_features_optimized')

# # Get the features used in the final model
# df_btc_features = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_market_data_features.pickle')
# features = [f for f in df_btc_features.columns if f not in ['symbol']]


# # initial hist file
# start_date = '2021-01-01'
# df_tmp = df_new.loc[df_new.index <=start_date]
# df_hist_new =roll_predictions2(df_tmp)
# df_hist_new.to_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_hist2.pickle')

# # backtest for sep 2021
# datelist = pd.date_range(start_date, periods=270).tolist()
# for dt in datelist:

#   print(dt)
#   print("="*60)
#   df_tmp = df_new.loc[df_new.index <=dt]
#   df_orig = df_tmp.copy()

#   # transform
#   print("=> log transpose\n")
#   transform_stationary(df_tmp,features_to_transform,transform)

#   #prepare
#   print("=>prepare data\n")
#   idx_dates, array_y, array_X = prepare_data(df_tmp,n_steps,features)

#   # scale the input and outputs
#   print("=>scale data\n")
#   scaler_X = MinMaxScaler(feature_range=(0,1))
#   scaled_X = scaler_X.fit_transform(array_X)
#   scaler_y = MinMaxScaler(feature_range=(0,1))
#   scaled_y = scaler_y.fit_transform(array_y)


#   # make predictions
#   print("=>make predictions\n")
#   # df_pred =make_predictions(model,scaler_y,scaled_X,n_steps,len(features),n_predict,df_tmp.index.max()-datetime.timedelta(days=1))
#   df_pred =make_predictions(model,scaler_y,scaled_X,n_steps,len(features),n_predict,df_tmp.index.max())

#   # get the previous df_hist data
#   print("=>get previous file\n")
#   df_hist = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_hist2.pickle')

#   # update the hist file with yesterdays close price, and add the new predictions
#   print("=>roll file forward\n")
#   df_hist_new =roll_predictions2(df_orig,df_pred,df_hist)
#   df_hist_new.to_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_hist2.pickle')


# # plot
# plot_actual_predicted2(stock,df_hist_new)


## BLX.TO

In [None]:
stock = 'BLX.TO'
features_to_transform = ['open','close','high','low']
transform = 'log'
n_steps = 40
n_predict = 5

config ={'Economic':
         {'TREASURY_YIELD':[{'interval':'daily','maturity':'3month','name':'yield3m'}]},
         'Technical':{
           'BBANDS':{'interval':'daily','time_period':20},
           'MACD':{'interval':'daily','time_period':None}
           },
         'Commodities':['PBD']
         }
df_new = get_consolidated_stock_data(stock,key,config,'full')


# shift features to match the model
df_new = shift_features(df_new,{'yield3m': -60})

# get the features
features = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_market_data.pickle')
features = features.columns

# get the model
model = keras.models.load_model(f'/content/drive/MyDrive/Colab Notebooks/models/model_{stock}_log_features_optimized')

# initial hist file
start_date = '2021-1-1'
num_rows = df_new.loc[df_new.index > start_date].shape[0]

df_tmp = df_new.loc[df_new.index <=start_date]
df_hist_new =roll_predictions(df_tmp)
df_hist_new.to_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_hist.pickle')

# backtest for sep 2021
datelist = pd.date_range(start_date, periods=266).tolist()
for dt in datelist:

  print(dt)
  print("="*60)
  df_tmp = df_new.loc[df_new.index <=dt]
  df_orig = df_tmp.copy()

  # transform
  print("=> log transpose\n")
  transform_stationary(df_tmp,features_to_transform,transform)

  #prepare
  print("=>prepare data\n")
  idx_dates, array_y, array_X = prepare_data(df_tmp,n_steps,features)

  # scale the input and outputs
  print("=>scale data\n")
  scaler_X = MinMaxScaler(feature_range=(0,1))
  scaled_X = scaler_X.fit_transform(array_X)
  scaler_y = MinMaxScaler(feature_range=(0,1))
  scaled_y = scaler_y.fit_transform(array_y)


  # make predictions
  print("=>make predictions\n")
  df_pred =make_predictions(model,scaler_y,scaled_X,n_steps,len(features),n_predict,df_tmp.index.max())

  # get the previous df_hist data
  print("=>get previous file\n")
  df_hist = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_hist.pickle')

  # update the hist file with yesterdays close price, and add the new predictions
  print("=>roll file forward\n")
  df_hist_new =roll_predictions(df_orig,df_pred,df_hist)
  df_hist_new.to_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_hist.pickle')


# plot
plot_actual_predicted(stock,df_hist_new)

In [146]:
# BLX.TO - with naive model
stock = 'BLX.TO'
features_to_transform = ['open','close','high','low']
transform = 'log'
n_steps = 40
n_predict = 5

config ={'Economic':
         {'TREASURY_YIELD':[{'interval':'daily','maturity':'3month','name':'yield3m'}]},
         'Technical':{
           'BBANDS':{'interval':'daily','time_period':20},
           'MACD':{'interval':'daily','time_period':None}
           },
         'Commodities':['PBD']
         }
# df_new = get_consolidated_stock_data(stock,key,config,'full')


# # shift features to match the model
# df_new = shift_features(df_new,{'yield3m': -60})

# # get the features
# features = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_market_data.pickle')
# features = features.columns

# # get the model
# model = keras.models.load_model(f'/content/drive/MyDrive/Colab Notebooks/models/model_{stock}_log_features_optimized')


# initial hist file
start_date = '2021-01-01'
df_tmp = df_new.loc[df_new.index <=start_date]
df_hist_new =roll_predictions2(df_tmp)
df_hist_new.to_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_hist2.pickle')

# backtest for sep 2021
datelist = pd.date_range(start_date, periods=2,freq='B').tolist()
for dt in datelist:

  print(dt)
  print("="*60)
  df_tmp = df_new.loc[df_new.index <=dt]
  df_orig = df_tmp.copy()

  # transform
  print("=> log transpose\n")
  transform_stationary(df_tmp,features_to_transform,transform)

  #prepare
  print("=>prepare data\n")
  idx_dates, array_y, array_X = prepare_data(df_tmp,n_steps,features)

  # scale the input and outputs
  print("=>scale data\n")
  scaler_X = MinMaxScaler(feature_range=(0,1))
  scaled_X = scaler_X.fit_transform(array_X)
  scaler_y = MinMaxScaler(feature_range=(0,1))
  scaled_y = scaler_y.fit_transform(array_y)


  # make predictions
  print("=>make predictions\n")
  # df_pred =make_predictions(model,scaler_y,scaled_X,n_steps,len(features),n_predict,df_tmp.index.max()-datetime.timedelta(days=1))
  df_pred =make_predictions(model,scaler_y,scaled_X,n_steps,len(features),n_predict,df_tmp.index.max())

  # get the previous df_hist data
  print("=>get previous file\n")
  df_hist = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_hist2.pickle')

  # update the hist file with yesterdays close price, and add the new predictions
  print("=>roll file forward\n")
  df_hist_new =roll_predictions2(df_orig,df_pred,df_hist)
  df_hist_new.to_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_hist2.pickle')


# plot
plot_actual_predicted2(stock,df_hist_new)

2021-01-01 00:00:00
=> log transpose



Augmented Dicky Fuller Test for Stationarity
ADF Statistic: -2.35
Critial Values:
   1%, -3.43 => stationary
Critial Values:
   5%, -2.86 => stationary
Critial Values:
   10%, -2.57 => stationary
=>prepare data


Data Preparation
=> 10 Features
=> Input Dimensions :(40, 10)


=>scale data

=>make predictions





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



=>get previous file

=>roll file forward

2021-01-04 00:00:00
=> log transpose





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy





Augmented Dicky Fuller Test for Stationarity
ADF Statistic: -2.42
Critial Values:
   1%, -3.43 => stationary
Critial Values:
   5%, -2.86 => stationary
Critial Values:
   10%, -2.57 => stationary
=>prepare data


Data Preparation
=> 10 Features
=> Input Dimensions :(40, 10)


=>scale data

=>make predictions

=>get previous file

=>roll file forward



KeyError: ignored