<a href="https://colab.research.google.com/github/kconstable/crypto-ensemble-model-predictions/blob/main/feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import requests
import time
import pickle

from datetime import date, timedelta,datetime

import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# load saved market data
ticker = 'BTC'
df = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/capstone2/data/{ticker}_market_data.pickle')
print(f"\n{ticker}")
print("="*60)
print(f"{ticker}-all features: ",df.shape)


BTC
BTC-all features:  (2125, 66)


# Create Lag Features
The features may have more influence if they are lagged vs current.  For example, the unemployment rate from 40 days ago may influence the price to a greater extent than the current unemployment rate.  

To explore this possibility, the correlation between closing prices and current/lagged features are calculated and compared.  The number of days lagged was determined iteratively until the correlation with the largest magnitude was found.

The calc_optimal_feature_lags function calculates the correlation between each feature and the closing price. It loops over lags from zero days to 60 days and returns a list of features with higher higher correlations when lagged. 

The shift_features function adds a lagged version of the input features to the features data frame.

RFE is then used to select from all availble features (current and lagged) to find the most influential features to be included in the LSTM model.

In [5]:
def calc_optimal_feature_lags(df,features_to_lag,lags):
  """
  Calculates the correlation between each feature and the closing price over a series of lags
  Inputs:
    df: A data frame of feartures.  Expects 'close' price to be included
    features_to_lag: A list of features to shift by each lag in lags
    lags: A list of lags (in days)
  Outputs:
    A dictionary of {feature name:lag (in days)}. Only returns features with 
    higher correlations with lags
  """
  # get a copy of the df
  dff = df.copy()

  # dict to store the optimal lag for each feature
  d = {}
  
  # loop each feature, each lag
  for feature in features_to_lag:
    # maximum abs correlation with close price, and the associated lag
    # reset after each feature
    max_corr = 0
    max_lag = 0
    for lag in lags:
      if feature != 'close':
        # name of shifted column: feature_shift_lag
        shift = f"_shift_{str(lag)}"

        # shift the feature by the lag,join with dff
        tshift=dff[feature].shift(lag)
        dff =dff.join(tshift,how='left',rsuffix=shift)

        # calc the corr with the close price
        corr = dff[['close',feature+shift]].corr()[feature+shift]['close']

        # update the max abs corr and associated lag
        if abs(corr)>max_corr:
          max_corr = corr
          max_lag = lag
    # save the max abs corr and associated lag
    if max_lag <0:
      d[feature] = max_lag

  return d

In [6]:
def shift_features(df,features_shift):
  """
  Shifts features by time periods to convert them to lagged indicators
  Input:
    df: dataframe of features
    features_shift: dictionary  of {feature:period shift}
  Output:
    df: original dataframe + the shifted features
  """
  dff = df.copy()
  for feature,shift in features_shift.items():
    t_shift = pd.DataFrame(dff[feature].shift(periods=shift))
    dff =dff.join(t_shift,how='left',rsuffix='_shift')

  # remove nan introducted with lag features
  dff.dropna(how='any',inplace=True)

  return dff

In [18]:
def plot_corr(df,features,title):
  """
  Plots correlations in a heatmap
  Input: 
    df: a dataframe of features to plot
  """
    
  # calculate correlations
  cm =df[features].corr()

  # plot a heatmap of correlations
  fig = go.Figure()
  fig.add_trace(go.Heatmap(
        z = cm,
        x = cm.columns.values,
        y = cm.columns.values,
        # colorscale = 'Bluyl'
        colorscale = 'YlOrRd'
      )
  )
  fig.update_layout(
      title_text=f"Feature Correlations:{title}", 
      title_x=0.5, 
      width=600, 
      height=600,
      yaxis_autorange='reversed',
      template = 'plotly_white'
  )

  fig.show()

## Economic Indicators

In [39]:
# economic indicators
features_to_lag = ['gdp','nfp','unemployment','cs','infl','ir','yield3m','yield5y','yield10y','yield30y']

# try shifting from zero to -60 days prior
lags = list(range(0,-70,-10))

feature_shifts =calc_optimal_feature_lags(df,features_to_lag,lags)
print(feature_shifts)

# add the optimal feature lags
features_to_lag.insert(0,'close')
dff = shift_features(df[features_to_lag],feature_shifts)

# plot corr
plot_corr(dff,dff.columns,'Economic Indicators')


{'gdp': -60, 'nfp': -60, 'cs': -60, 'infl': -60, 'ir': -60, 'yield3m': -60, 'yield5y': -60, 'yield10y': -60, 'yield30y': -60}


## Sentiment

In [44]:
# sentiment 
features_to_lag = ['weighted_sentiment','ma_sentiment_10','ma_sentiment_20','ma_sentiment_40','news_count','ma_news_count','sentiment_title','ma_sentiment_title','idx_fear_greed','google_trends']

# try shifting from zero to -60 days prior
lags = list(range(0,-70,-10))

feature_shifts =calc_optimal_feature_lags(df,features_to_lag,lags)
print(feature_shifts)

# add the optimal feature lags
features_to_lag.insert(0,'close')
dff = shift_features(df[features_to_lag],feature_shifts)

# plot corr
plot_corr(dff,dff.columns,'Sentiment')

{'sentiment_title': -60}


## Technical Indicators

In [41]:
# technical indicators 
features_to_lag = ['b-upper','b-lower','rsi','stoch_high','stoch_low','stoch_D','stoch_K','macd','macd_signal']

# try shifting from zero to -60 days prior
lags = list(range(0,-70,-10))

feature_shifts =calc_optimal_feature_lags(df,features_to_lag,lags)
print(feature_shifts)

# add the optimal feature lags
features_to_lag.insert(0,'close')
dff = shift_features(df[features_to_lag],feature_shifts)

# plot corr
plot_corr(dff,dff.columns,'Technical Indicators')

{'b-upper': -10, 'b-lower': -10, 'rsi': -60, 'stoch_high': -10, 'stoch_low': -10, 'stoch_D': -60, 'stoch_K': -60}


## Commodities, FX Rates and Indexes

In [43]:
# technical indicators 
features_to_lag =['close','GLD','OIL','BOIL','VXX','SPY','XLE','QQQ','USDEUR','USDGBP','USDJPY']

# try shifting from zero to -60 days prior
lags = list(range(0,-70,-10))

feature_shifts =calc_optimal_feature_lags(df,features_to_lag,lags)
print(feature_shifts)

# add the optimal feature lags
features_to_lag.insert(0,'close')
dff = shift_features(df[features_to_lag],feature_shifts)

# plot corr
plot_corr(dff,dff.columns,'Commodities, FX Rates & Indexes')


{'OIL': -60, 'VXX': -60, 'XLE': -60, 'USDEUR': -60, 'USDGBP': -60, 'USDJPY': -60}


In [None]:

# List of features to lag
# include economic and commodities (exclude technical indicators)

features_to_lag =[f for f in df_btc.columns if f not in ['open','high','low','symbol','b-lower','b-middle','b-upper','rsi','macd_hist','macd','macd_signal']]

# try shifting from zero to -60 days prior
lags = list(range(0,-70,-10))

# Find the lag with the largest absolute correlation with closing price
feature_shifts =calc_optimal_feature_lags(df_btc,features_to_lag,lags)
print(feature_shifts)

# shift the features according to 
# df_btc_shifted = shift_features(df_btc,feature_shifts)

# # plot the correlations
# plot_corr(df_btc_shifted)

# # plot feature importance and ranks
# selected = plot_feature_importance(df_btc_shifted,10,1985)


# # subset features, add back the price features
# [selected.append(f) for f in ['open','close','high','low']] 
# df_btc_features = df_btc_shifted[selected]

# # save to google drive
# df_btc_features.to_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_market_data_features.pickle')
