<a href="https://colab.research.google.com/github/kconstable/crypto-ensemble-model-predictions/blob/main/feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import requests
import time
import pickle

from datetime import date, timedelta,datetime

import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

from google.colab import files
from google.colab import drive
# Mount Google Drive at /content/drive; the data files read and written by the
# cells below live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [78]:
# Load the previously saved market data for the chosen ticker.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
ticker = 'BTC'
market_data_path = f'/content/drive/MyDrive/Colab Notebooks/capstone2/data/{ticker}_market_data.pickle'
df = pd.read_pickle(market_data_path)

# Report the ticker and the (rows, columns) shape of the loaded frame
print(f"\n{ticker}")
print("="*60)
print(f"{ticker}-all features: ",df.shape)


BTC
BTC-all features:  (2125, 69)


# Feature Engineering

## Create Lag Features
The features may have more influence if they are lagged vs current.  For example, the unemployment rate from 40 days ago may influence the price to a greater extent than the current unemployment rate.  

To explore this possibility, the correlations between the closing price and the current/lagged features are calculated and compared.  The number of days lagged was determined iteratively until the correlation with the largest magnitude was found.

The calc_optimal_feature_lags function calculates the correlation between each feature and the closing price. It loops over the supplied list of lags (for example, from zero days to 60 days) and returns a dictionary that maps each feature with a higher correlation when lagged to the lag that produced it.

The shift_features function adds a lagged version of the input features to the features data frame.

RFE is then used to select from all available features (current and lagged) to find the most influential features to be included in the LSTM model.

In [53]:
def calc_optimal_feature_lags(df,features_to_lag,lags):
  """
  Finds, for each feature, the lag whose shifted values have the strongest
  absolute correlation with the closing price.

  Inputs:
    df: A data frame of features.  Expects a 'close' column to be included.
        The frame is not modified
    features_to_lag: A list of column names to test. 'close' itself is skipped
    lags: An iterable of integer lags. Each lag is passed unchanged to
          Series.shift, so a positive lag pairs each row with the feature value
          `lag` rows earlier and a negative lag with the value `abs(lag)` rows
          later. Include 0 so the unlagged correlation acts as the baseline
  Outputs:
    A dictionary of {feature name: lag}. Only features whose strongest absolute
    correlation occurs at a non-zero lag are returned. Ties keep the lag that
    appears first in `lags`

  NOTE(review): if the index is sorted oldest-to-newest, negative lags use
  values from later rows (look-ahead) - confirm the sort order of df.
  """
  # materialise once so a generator/iterator can be reused for every feature
  lags = list(lags)
  close = df['close']

  # dict to store the optimal lag for each feature
  d = {}

  for feature in features_to_lag:
    # the target is never lagged against itself
    if feature == 'close':
      continue

    # maximum abs correlation with close price, and the associated lag
    # reset for each feature
    max_corr = 0
    max_lag = 0
    for lag in lags:
      # Series.corr ignores rows where either value is missing, so the NaNs
      # introduced by the shift do not need to be dropped first
      corr = abs(close.corr(df[feature].shift(lag)))

      # a NaN correlation (e.g. a constant column) never compares greater,
      # so it can never become the maximum
      if corr > max_corr:
        max_corr = corr
        max_lag = lag

    # keep the feature only when a lagged version beats the unlagged one;
    # testing != 0 (not < 0) so positive lags are reported as well
    if max_lag != 0:
      d[feature] = max_lag

  return d

In [5]:
def shift_features(df,features_shift):
  """
  Appends a shifted copy of each requested feature to a copy of the data frame
  Input:
    df: dataframe of features (left unmodified)
    features_shift: dictionary of {feature: periods}; periods is passed to
      Series.shift for that feature
  Output:
    A new dataframe holding the original columns plus one '<feature>_shift'
    column per dictionary entry, with every row that contains a NaN removed
  """
  result = df.copy()

  for name in features_shift:
    # the shifted column keeps the feature's name, so the join's rsuffix
    # renames it to '<feature>_shift'
    lagged = result[name].shift(periods=features_shift[name]).to_frame()
    result = result.join(lagged,how='left',rsuffix='_shift')

  # drop rows with missing values, including those introduced by the shift
  return result.dropna(how='any')

In [6]:
def plot_corr(df,features,title):
  """
  Draws a heatmap of the pairwise correlations between the selected columns
  Input:
    df: a dataframe containing the columns to compare
    features: the column labels of df to include in the correlation matrix
    title: text appended to the "Feature Correlations:" figure title
  Output:
    None. The figure is displayed with fig.show()
  """
  # pairwise correlations of the requested columns
  corr_matrix = df[features].corr()
  labels = corr_matrix.columns.values

  # the same labels are used on both axes of the square matrix
  heatmap = go.Heatmap(z=corr_matrix, x=labels, y=labels, colorscale='YlOrRd')

  layout_options = dict(
      title_text=f"Feature Correlations:{title}",
      title_x=0.5,
      width=600,
      height=600,
      # reverse the y axis so the first feature is drawn at the top
      yaxis_autorange='reversed',
      template='plotly_white',
  )

  fig = go.Figure(data=[heatmap])
  fig.update_layout(**layout_options)
  fig.show()

### Economic Indicators
+ most economic indicators have higher correlations when lagged 100 days or more

In [31]:
# Economic indicators to test for a lagged relationship with the close price
features_to_lag = ['gdp','nfp','unemployment','cs','infl','ir','yield3m','yield5y','yield10y','yield30y']

# Candidate shifts: 0, -10, -20, ... down to -110 (steps of 10)
lags = list(range(0, -120, -10))

# Best lag per feature; only features that improve when lagged are returned
feature_shifts = calc_optimal_feature_lags(df, features_to_lag, lags)
print(feature_shifts)

# Put 'close' first, then append the lagged columns for this feature group
features_to_lag = ['close', *features_to_lag]
df_econ = shift_features(df[features_to_lag], feature_shifts)

# Correlation heatmap of close, the raw features and their lagged versions
plot_corr(df_econ, df_econ.columns, 'Economic Indicators')


{'gdp': -100, 'nfp': -110, 'cs': -110, 'infl': -110, 'ir': -110, 'yield3m': -110, 'yield5y': -110, 'yield10y': -110, 'yield30y': -110}


### Sentiment
+ the news count 2 days prior to the current day has a higher correlation with closing price than the current news count

In [33]:
# Sentiment features to test for a lagged relationship with the close price
features_to_lag = ['weighted_sentiment','ma_sentiment_10','ma_sentiment_20','ma_sentiment_40','news_count','ma_news_count','sentiment_title','ma_sentiment_title','idx_fear_greed','google_trends']

# Candidate shifts: 0, -1, -2, ... down to -20 (steps of 1)
lags = list(range(0, -21, -1))

# Best lag per feature; only features that improve when lagged are returned
feature_shifts = calc_optimal_feature_lags(df, features_to_lag, lags)
print(feature_shifts)

# Put 'close' first, then append the lagged columns for this feature group
features_to_lag = ['close', *features_to_lag]
df_sent = shift_features(df[features_to_lag], feature_shifts)

# Correlation heatmap of close, the raw features and their lagged versions
plot_corr(df_sent, df_sent.columns, 'Sentiment')

{'news_count': -2}


### Commodities, FX Rates and Indexes
+ OIL, natural gas (BOIL) and DOGE have significantly higher correlations when lagged

In [54]:
# Commodities, FX rates, indexes and peer coins to test for a lagged
# relationship with the close price
features_to_lag = ['GLD','OIL','BOIL','VXX','SPY','XLE','QQQ','USDEUR','USDGBP','USDJPY','ETH','LTC','DOGE']

# Candidate shifts: 0, -10, -20, ... down to -110 (steps of 10)
lags = list(range(0, -120, -10))

# Best lag per feature; only features that improve when lagged are returned
feature_shifts = calc_optimal_feature_lags(df, features_to_lag, lags)
print(feature_shifts)

# Put 'close' first, then append the lagged columns for this feature group
features_to_lag = ['close', *features_to_lag]
df_com = shift_features(df[features_to_lag], feature_shifts)

# Correlation heatmap of close, the raw features and their lagged versions
plot_corr(df_com, df_com.columns, 'Commodities, FX Rates & Indexes')


{'OIL': -110, 'BOIL': -110, 'VXX': -110, 'USDEUR': -10, 'USDGBP': -100, 'ETH': -30, 'DOGE': -60}


### Add the Lagged Features

In [79]:


# Add the shifted economic, commodity and sentiment columns to the dataset,
# in that order. Only columns df does not already have are joined, so the
# unshifted columns of each group are not duplicated.
for lagged_frame in (df_econ, df_com, df_sent):
  add_cols = [c for c in lagged_frame.columns if c not in df.columns]
  df = df.join(lagged_frame[add_cols], how='left')

print(f"Total Features:{df.shape[1]}")
print("="*60)
print(df.columns)

# save the dataset
features_path = f'/content/drive/MyDrive/Colab Notebooks/capstone2/data/{ticker}_market_data_features.pickle'
df.to_pickle(features_path)


Total Features:86
Index(['ticker', 'open', 'high', 'low', 'close', 'volume', 'month', 'weekday',
       'futures_open', 'futures_high', 'futures_low', 'futures_close',
       'futures_volume', 'idx_fear_greed', 'idx_classification', 'ETH', 'DOGE',
       'LTC', 'weighted_sentiment', 'ma_sentiment_10', 'ma_sentiment_20',
       'ma_sentiment_40', 'b-upper-ma_sentiment_10',
       'b-middle-ma_sentiment_10', 'b-lower-ma_sentiment_10', 'news_count',
       'sentiment_title', 'ma_news_count', 'ma_sentiment_title',
       'b-upper-ma_news_count', 'b-middle-ma_news_count',
       'b-lower-ma_news_count', 'b-upper-ma_sentiment_title',
       'b-middle-ma_sentiment_title', 'b-lower-ma_sentiment_title',
       'google_trends', 'b-upper-google_trends', 'b-middle-google_trends',
       'b-lower-google_trends', 'USDEUR', 'USDJPY', 'USDGBP', 'GLD', 'OIL',
       'BOIL', 'VXX', 'SPY', 'XLE', 'QQQ', 'b-upper', 'b-middle', 'b-lower',
       'rsi', 'stoch_high', 'stoch_low', 'stoch_K', 'stoch_D', 'macd

# Feature Selection

In [121]:
def plot_feature_importance(df_features,num_features,seed):
  """
  Ranks features with RFE and plots the random forest importance of the
  features that RFE selects
  Input:
    df_features: dataframe of features. Expects a 'close' column, which is
      used as the target variable
    num_features: the number of features RFE should select (rank 1)
    seed: random_state used for both random forests, for reproducibility
  Output:
    A dataframe indexed by feature name with 'rank' and 'score' columns,
    limited to the RFE-selected (rank 1) features and sorted by score,
    highest first. The selected feature names are also printed and a bar
    chart of the importance scores is shown
  """
  # price columns and the non-feature label columns are never used as predictors
  excluded = ['ticker','close','open','high','low','idx_classification']

  # work on complete rows only
  data = df_features.dropna(how='any')

  # target and predictors
  y = data['close']
  cols = [c for c in data.columns if c not in excluded]
  X = data[cols]

  # a forest fitted on every predictor supplies the importance scores
  forest = RandomForestRegressor(n_estimators=500, random_state=seed)
  forest.fit(X, y)

  # a separate RFE run (same estimator settings) supplies the ranking
  selector = RFE(RandomForestRegressor(n_estimators=500, random_state=seed), n_features_to_select=num_features)
  selector = selector.fit(X, y)

  # names of the features RFE kept
  selected = [name for name, keep in zip(cols, selector.support_) if keep]

  print("Top Ranked Features")
  print("="*60)
  print(selected)

  # combine rank and score per feature, order by score, keep the selected ones
  ranked = pd.DataFrame(list(zip(cols,selector.ranking_,forest.feature_importances_)),
                        columns=['feature','rank','score'])
  ranked = ranked.sort_values(by=['score','rank'],ascending=False).set_index('feature')
  ranked = ranked[ranked['rank']==1]

  # horizontal bar chart of the importance scores
  importance_bars = go.Bar(
      name='Importance',
      x=ranked['score'],
      y=ranked.index,
      orientation='h',
      marker=dict(color='orange'),
      opacity=0.5
  )
  fig = go.Figure(data=[importance_bars])
  fig.update_layout(
      title='Feature Importance',
      template='plotly_white',
      width=700,
      height=800,
      yaxis={'categoryorder':'total descending'}
  )

  fig.show()
  return ranked

In [122]:
# Select the 50 top-ranked features with a fixed seed so the run is repeatable
df_features = plot_feature_importance(df_features=df, num_features=50, seed=1985)


Top Ranked Features
['volume', 'futures_open', 'futures_high', 'futures_low', 'futures_close', 'idx_fear_greed', 'ETH', 'DOGE', 'LTC', 'ma_sentiment_40', 'b-upper-ma_sentiment_10', 'b-middle-ma_sentiment_10', 'b-lower-ma_sentiment_10', 'news_count', 'ma_sentiment_title', 'b-upper-ma_news_count', 'b-middle-ma_news_count', 'b-lower-ma_news_count', 'b-upper-ma_sentiment_title', 'b-middle-ma_sentiment_title', 'b-lower-ma_sentiment_title', 'b-upper-google_trends', 'b-middle-google_trends', 'b-lower-google_trends', 'USDEUR', 'USDJPY', 'USDGBP', 'OIL', 'BOIL', 'VXX', 'QQQ', 'b-upper', 'b-middle', 'b-lower', 'rsi', 'stoch_high', 'stoch_low', 'stoch_K', 'macd', 'macd_signal', 'yield5y_shift', 'yield10y_shift', 'yield30y_shift', 'OIL_shift', 'BOIL_shift', 'VXX_shift', 'USDEUR_shift', 'ETH_shift', 'DOGE_shift', 'news_count_shift']



+ Bitcoin: open,high,low,close,volume
+ Bitcoin Futures: open, high,low, close
+ Peers: ETH, DOGE, LTC, fear-greed-index
+ Sentiment: Sentiment (40 day moving average), Sentiment (10-day bollinger), news_count (bollinger bands), google-trends (bollinger), title-sentiment (bollinger)
+ Technical: RSI, MACD, Stoch, Bollinger bands
+ Economic: 30-year-bond-yields-shifted, 5-year-bond-yields-shifted
+ Commodities: Oil-shifted, BOIL-shifted, SPY, QQQ
+ FX Rates: USDGBP, USDEUR-shifted