In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import pandas_ta as ta
import matplotlib.pyplot as plt
import os

from os.path import join
from os import getcwd
from pathlib import Path
from sys import path

# Make the directory two levels above the current working directory importable,
# so that the `functions` package imported below can be resolved.
full_path = getcwd()
functions_path = str(Path(full_path).parents[1])
path.append(functions_path)

from functions.feature_engineering import FEATURES_CONFIG_IDS
import functions.feature_engineering as fe

import seaborn as sns

%matplotlib inline

In [25]:
import yaml
from pathlib import Path
# Parse `config.yml` (resolved against the working directory) into a plain dict.
with open('config.yml') as config_file:
    CONF = yaml.safe_load(config_file)


In [3]:
# Inspect the indicator specs of feature configuration 1 (later passed to ta.Strategy).
CONF['Feature_Engineering'][1]['ta']

[{'kind': 'ema', 'length': 3},
 {'kind': 'ema', 'length': 15},
 {'kind': 'ema', 'length': 60},
 {'kind': 'bbands', 'length': 20},
 {'kind': 'bbands', 'length': 20},
 {'kind': 'macd', 'fast': 8, 'slow': 21},
 {'kind': 'rsi', 'length': 14},
 {'kind': 'atr', 'length': 14},
 {'kind': 'pdist'}]

In [4]:
# Display the complete configuration loaded from config.yml.
CONF

{'Feature_Engineering': {1: {'ta': [{'kind': 'ema', 'length': 3},
    {'kind': 'ema', 'length': 15},
    {'kind': 'ema', 'length': 60},
    {'kind': 'bbands', 'length': 20},
    {'kind': 'bbands', 'length': 20},
    {'kind': 'macd', 'fast': 8, 'slow': 21},
    {'kind': 'rsi', 'length': 14},
    {'kind': 'atr', 'length': 14},
    {'kind': 'pdist'}],
   'include_lags': True,
   'lags': [1, 3, 5, 15, 30],
   'ref_variable_lags': ['Close'],
   'drop_lags': False,
   'Volume_Features': True,
   'Volume_Windows': [5, 60],
   'Volume_Returns': True,
   'Volume_Returns_lags': [5, 15],
   'Volume_Returns_binary': True,
   'EntryPrice_PrevClose': False,
   'lags_diff': [1, 15, 30, 60],
   'binary_lags': True,
   'Return_Features': True,
   'return_lags': [1, 5, 15, 60, 240],
   'Momentum_Features': False,
   'use_prob_features': True,
   'probability_features': ['entry_type', 'risk_type'],
   'Prob_Features_Windows': [2, 6]}}}

# Description 
The goal is to generate features for predicting the variables `risk_type` and `entry_type`.

This code produces different features for different scenarios. The feature scenario is selected through the global variable `FEATURES_CONFIG_IDS`.

Select the forecast horizon `fh`; the columns `risk_type` and `entry_type` are then shifted `fh` steps into the future. These shifted columns are the target variables for the ML models.

In [5]:
# Known locations of the TBM parquet store on the machines this notebook has
# been run on, in order of preference. Previously both paths were assigned one
# after the other, so the first assignment was dead code.
STORAGE_FOLDER_CANDIDATES = [
    '/mnt/Data/Tensor_Invest_Fund/data/Cryptos/TBM/',
    '/media/john/Data/Tensor_Invest_Fund/data/Cryptos/TBM/',
]

# Use the first candidate that exists; fall back to the preferred one so that a
# missing mount still fails later with a path the user recognises.
storage_folder = next(
    (folder for folder in STORAGE_FOLDER_CANDIDATES if os.path.isdir(folder)),
    STORAGE_FOLDER_CANDIDATES[0],
)

In [6]:
# List the files stored for one ticker (ADAUSDT) to see which strategy variants exist.
os.listdir(os.path.join(storage_folder, 'ADAUSDT'))

['Tripe_Barrier_Method_ADAUSDT_ptsl_1-1_vb_15m.parquet',
 'Tripe_Barrier_Method_ADAUSDT_ptsl_2-1_vb_15m.parquet',
 'Tripe_Barrier_Method_ADAUSDT_ptsl_2-2_vb_15m.parquet']

In [7]:
# Key of the feature configuration to use (looked up as CONF['Feature_Engineering'][feature_id]).
# NOTE(review): the same assignment is repeated at the top of the next cell.
feature_id = 1

In [8]:
# --- Experiment selection -------------------------------------------------
feature_id = 1  # key into the feature-engineering configuration
strategy = '1-1_vb_15m' # '2-1_vb_15m' or 2-2_vb_15m

# Full symbol universe. It used to be assigned to SYMBOLS and then immediately
# overwritten by the subset below; it is kept under its own name so the
# information is not lost and the effective selection is unambiguous.
ALL_SYMBOLS = ['ADAUSDT',
 'BNBBTC',
 'BNBUSDT',
 'BTCUSDT',
 'DOGEUSDT',
 'EOSUSDT',
 'ETCUSDT',
 'ETHUSDT',
 'IOTAUSDT',
 'LTCUSDT',
 'MKRUSDT',
 'TRXUSDT',
 'XLMUSDT',
 'XMRBTC']

# Symbols actually processed by this notebook.
SYMBOLS = ['ADAUSDT',
 'BNBBTC',
 'BNBUSDT',
 'ETHUSDT']

input_folder_db = storage_folder

# --- Load one parquet file per symbol and stack them ----------------------
dfs = []

for ticker in SYMBOLS:

    # File layout: <storage>/<ticker>/Tripe_Barrier_Method_<ticker>_ptsl_<strategy>.parquet
    # (the "Tripe" spelling is how the files are named on disk).
    data_path = os.path.join(input_folder_db, ticker,
                             f'Tripe_Barrier_Method_{ticker}_ptsl_{strategy}.parquet')

    dfs.append(pd.read_parquet(data_path))

# ignore_index=True gives the stacked frame a fresh 0..n-1 index.
data = pd.concat(dfs, ignore_index=True)

In [9]:
# Summarise column names, dtypes and non-null counts of the stacked frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050253 entries, 0 to 1050252
Data columns (total 19 columns):
 #   Column                        Non-Null Count    Dtype         
---  ------                        --------------    -----         
 0   Open Time                     1050253 non-null  int64         
 1   Open                          1050253 non-null  float64       
 2   High                          1050253 non-null  float64       
 3   Low                           1050253 non-null  float64       
 4   Close                         1050253 non-null  float64       
 5   Volume                        1050253 non-null  float64       
 6   Clos Time                     1050253 non-null  int64         
 7   Quote Asset Volume            1050253 non-null  float64       
 8   Number of Trades              1050253 non-null  int32         
 9   Taker Buy Base Asset Volume   1050253 non-null  float64       
 10  Taker Buy Quote Asset Volume  1050253 non-null  float64       
 11

In [10]:
# Time span covered by the data, displayed as (latest, earliest).
data['Date'].max(), data['Date'].min()

(Timestamp('2022-03-25 14:59:00'), Timestamp('2021-09-01 01:02:00'))

In [11]:
# Sanity check: tickers present in the loaded data.
data['Ticker'].unique()

array(['ADAUSDT', 'BNBBTC', 'BNBUSDT', 'ETHUSDT'], dtype=object)

## Technical Indicators Strategy

In [13]:
# Ref: https://github.com/twopirllc/pandas-ta/blob/main/examples/PandasTA_Strategy_Examples.ipynb

# The strategy only depends on the configuration, so it is built once instead
# of once per ticker.
# NOTE(review): the configured indicator list contains the same bbands entry
# twice (length 20) — confirm whether that is intended.
MNQ_strategy = ta.Strategy(
    name="MNQ Strategy",
    description="Non Multiprocessing Strategy by rename Columns",
    ta = CONF['Feature_Engineering'][feature_id]['ta']
)

dfs = []

for ticker in SYMBOLS:

    # Indicators are computed per ticker so that rolling windows never span
    # two different instruments.
    _df = data[data['Ticker'] == ticker].copy()

    # Adds the indicator columns to _df in place.
    _df.ta.strategy(MNQ_strategy)

    dfs.append(_df)

new_data = pd.concat(dfs, ignore_index=True)

In [14]:
# Continue with the indicator-augmented frame; the copy keeps `data` independent of `new_data`.
data = new_data.copy()

# Lag Features

In [16]:
# Read the lag configuration up front so that `n_lags`, `ref_variable_lags`,
# `drop` and `lags_features` are defined whether or not lags are enabled
# (the following cell reads `drop` and `lags_features`).
feature_conf = CONF['Feature_Engineering'][feature_id]
include_lags = feature_conf['include_lags']
n_lags = feature_conf['lags']
ref_variable_lags = feature_conf['ref_variable_lags']
drop = feature_conf['drop_lags']

# Names of every lag column created below, across ALL reference variables
# (previously the list was reset for each reference variable, so only the
# last variable's lag columns were remembered).
lags_features = []

if include_lags:

    lags_features = [f'{ref_variable}_lag_{lag}'
                     for ref_variable in ref_variable_lags
                     for lag in (n_lags or [])]

    dfs = []

    for ticker in SYMBOLS:

        print("Calculating lags for ticker", ticker)

        # Shift within a single ticker so a lag never reads another
        # instrument's prices.
        _df = data[data['Ticker'] == ticker].copy()

        for ref_variable in ref_variable_lags:
            for lag in (n_lags or []):
                _df[f'{ref_variable}_lag_{lag}'] = _df[ref_variable].shift(lag)

        dfs.append(_df)

    # Only replace `data` when lag features were actually computed; previously
    # this ran unconditionally and relied on a stale `new_data` from an
    # earlier cell when lags were disabled.
    data = pd.concat(dfs, ignore_index=True)

Calculating lags for ticker ADAUSDT
Calculating lags for ticker BNBBTC
Calculating lags for ticker BNBUSDT
Calculating lags for ticker ETHUSDT


# Entry Price - Previous Close Features

In [20]:
# This cell reads everything it needs from the configuration instead of relying
# on variables (`drop`, `lags_features`, `ref_variable_lags`) leaked by the
# lag-feature cell.
feature_conf = CONF['Feature_Engineering'][feature_id]

if feature_conf['EntryPrice_PrevClose']:

    lags_diff = feature_conf['lags_diff']
    # `ref_variable_lags` is a list; formatting the list itself into the column
    # name (as before) produced "['Close']_lag_1" and a KeyError. The first
    # configured reference variable is used.
    # NOTE(review): confirm that the first entry is the intended "previous close".
    ref_variable = feature_conf['ref_variable_lags'][0]
    target_variable = 'Entry_PrevClose_diff'
    short = 5
    long = 15
    variables = ['sma', 'std', 'bbands']
    drop_columns = []

    for lag in lags_diff:

        target_variable_name = f'{target_variable}_{lag}'

        # `lags_diff` may contain lags for which no lag column was created;
        # in that case derive the lagged series on the fly, per ticker.
        lag_column = f'{ref_variable}_lag_{lag}'
        if lag_column in data.columns:
            lagged_reference = data[lag_column]
        else:
            lagged_reference = data.groupby('Ticker')[ref_variable].shift(lag)

        data[target_variable_name] = data['entry_market'] - lagged_reference

        data = fe.calculate_rolling_features(data, target_variable_name, short, long, variables, drop_columns, drop_target_variable = True )


# Optionally discard the raw lag columns once the derived features exist.
if feature_conf['drop_lags']:
    lag_columns = [f'{ref_variable}_lag_{lag}'
                   for ref_variable in feature_conf['ref_variable_lags']
                   for lag in (feature_conf['lags'] or [])]
    # Only drop columns that are really present (lags may be disabled).
    data = data.drop(columns=[column for column in lag_columns if column in data.columns])



# Return Features

In [7]:
# Show the columns available before the return features are added.
# NOTE(review): the stored output lists 'Vol'/'ticker', while the code in this notebook uses 'Volume'/'Ticker' — the output looks stale; re-run to confirm.
data.columns

Index(['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Vol', 'year', 'month',
       'day', 'hour', 'minute', 'ticker', 'datetime', 'Time_tuple',
       'entry_market', 'target', 'stop', 'entry_type', 'risk_type', 'NATR_5',
       'NATR_60'],
      dtype='object')

In [33]:
def calculate_returns(data: pd.DataFrame, variable: str, lags: list, binary_lags: bool, date_col : str = 'Date', outlier_cutoff : float = 0.01):
    """Append per-period returns of ``variable`` for several lags to ``data``.

    For every ``lag`` the ``lag``-period percentage change of ``variable``
    (ordered by ``date_col``) is clipped to the
    ``[outlier_cutoff, 1 - outlier_cutoff]`` quantile range and converted to a
    per-period geometric return ``(1 + r) ** (1 / lag) - 1``. The result is
    stored in a column named ``f'{variable}_return_{lag}m'``.

    Args:
        data (pd.DataFrame): Input frame containing ``date_col`` and ``variable``.
        variable (str): Column the returns are computed from.
        lags (list): Lags, in rows, passed to ``pct_change``. May be empty, in
            which case no return column is added.
        binary_lags (bool): If True, store 1 where the return is strictly
            positive and 0 otherwise. Undefined (NaN) returns of the first
            ``lag`` rows also become 0, so they are not removed by the final
            ``dropna``.
        date_col (str, optional): Column used as index for ordering and for
            joining the returns back. Defaults to 'Date'.
        outlier_cutoff (float, optional): Lower quantile used for clipping; the
            upper quantile is ``1 - outlier_cutoff``. Defaults to 0.01.

    Returns:
        pd.DataFrame: ``data`` with the return columns appended, rows holding
        any NaN removed, and ``date_col`` restored as a regular column.
    """

    indexed = data.set_index([date_col])
    # Returns are computed on the date-sorted series; the join below aligns
    # them back to the original row order through the index.
    series = indexed[variable].sort_index()

    returns = []

    for lag in lags:
        lag_return = (series
                      .pct_change(lag) # Percentage change over `lag` rows
                      .pipe(lambda x: x.clip(lower=x.quantile(outlier_cutoff),
                                             upper=x.quantile(1 - outlier_cutoff))) # Cut off outliers
                      .add(1)
                      .pow(1 / lag) # lag-th root -> per-period return
                      .sub(1))

        if binary_lags:
            # Vectorised form of `1 if x > 0 else 0`; NaN compares False -> 0.
            lag_return = lag_return.gt(0).astype('int64')

        # Each frame is appended exactly once. The previous code also appended
        # the `None` returned by `list.append`, which only worked because
        # `pd.concat` silently ignores `None` entries.
        returns.append(lag_return.to_frame(f'{variable}_return_{lag}m'))

    if returns:
        indexed = indexed.join(pd.concat(returns, axis = 1))

    return indexed.dropna().reset_index()
    

In [35]:
if CONF['Feature_Engineering'][feature_id]['Return_Features']:

    # Settings do not depend on the ticker, so they are read once.
    outlier_cutoff = 0.01
    lags = CONF['Feature_Engineering'][feature_id]['return_lags']
    binary_lags = CONF['Feature_Engineering'][feature_id]['binary_lags']
    variable = 'Close'

    dfs = []

    for ticker in SYMBOLS:

        print("Calculating returns for ticker", ticker)

        # Returns are computed per ticker so pct_change never spans two instruments.
        _df = data[data['Ticker'] == ticker].copy()

        # `outlier_cutoff` was previously defined but never handed to the
        # function; it is now passed explicitly (same value as the default).
        _df = calculate_returns(_df, variable, lags, binary_lags, outlier_cutoff=outlier_cutoff)

        dfs.append(_df)

    data = pd.concat(dfs, ignore_index=True)

Calculating returns for ticker ADAUSDT
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 296038 entries, 2021-09-01 01:02:00 to 2022-03-25 14:59:00
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Close_return_1m    296037 non-null  float64
 1   Close_return_5m    296033 non-null  float64
 2   Close_return_15m   296023 non-null  float64
 3   Close_return_60m   295978 non-null  float64
 4   Close_return_240m  295798 non-null  float64
dtypes: float64(5)
memory usage: 13.6 MB
Calculating returns for ticker BNBBTC
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 162139 entries, 2021-12-03 00:41:00 to 2022-03-25 14:59:00
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Close_return_1m    162138 non-null  float64
 1   Close_return_5m    162134 non-null  float64
 2   Close_return_15m   162124 non-null  float64
 3   C

# Momentum Features

In [11]:
# Momentum = difference between a longer-lag return and one of the two shortest
# configured return lags. It needs the return columns, hence the double guard.
# NOTE(review): this guard reads FEATURES_CONFIG_IDS while the return columns
# are produced from CONF — confirm both hold the same settings.
if FEATURES_CONFIG_IDS[feature_id]['Return_Features'] and FEATURES_CONFIG_IDS[feature_id]['Momentum_Features']:

    # The return columns are named '<variable>_return_<lag>m' by
    # calculate_returns; the previous 'return_<lag>m' lookups raised KeyError.
    return_variable = 'Close'
    # Read the lags from the same place the return-feature cell does, instead
    # of relying on whatever `lags` another cell last assigned.
    return_lags = CONF['Feature_Engineering'][feature_id]['return_lags']
    # At most two base lags; slicing avoids an IndexError for a single lag.
    base_lags = return_lags[:2]

    dfs = []

    for ticker in SYMBOLS:

        print("Calculating momemtum for ticker", ticker)

        _df = data[data['Ticker'] == ticker].copy()

        for lag in return_lags:
            for base_lag in base_lags:
                if lag > base_lag:
                    # Both operands come from the per-ticker frame.
                    _df[f'momentum_{base_lag}_{lag}'] = (
                        _df[f'{return_variable}_return_{lag}m']
                        .sub(_df[f'{return_variable}_return_{base_lag}m'])
                    )

        dfs.append(_df)

    data = pd.concat(dfs, ignore_index=True)

# Volume Features

In [12]:
# --- Rolling statistics of the traded volume ------------------------------
if FEATURES_CONFIG_IDS[feature_id]['Volume_Features']:

    # Window lengths and helper arguments are ticker-independent.
    short = FEATURES_CONFIG_IDS[feature_id]['Volume_Windows'][0]
    long = FEATURES_CONFIG_IDS[feature_id]['Volume_Windows'][1]
    drop_columns = []

    target_variable = 'Volume'
    variables = ['sma', 'std']

    dfs = []

    for ticker in SYMBOLS:

        # The message used to say "momemtum" (copied from the momentum cell).
        print("Calculating volume rolling features for ticker", ticker)

        _df = data[data['Ticker'] == ticker].copy()

        _df = fe.calculate_rolling_features(_df, target_variable, short, long, variables, drop_columns, drop_target_variable=False )

        dfs.append(_df)

    data = pd.concat(dfs, ignore_index=True)

# --- Returns of the traded volume ------------------------------------------
if FEATURES_CONFIG_IDS[feature_id]['Volume_Returns']:

    lags = FEATURES_CONFIG_IDS[feature_id]['Volume_Returns_lags']
    binary_lags = FEATURES_CONFIG_IDS[feature_id]['Volume_Returns_binary']

    variable = 'Volume'

    dfs = []

    for ticker in SYMBOLS:

        print("Calculating volume returns for ticker", ticker)

        _df = data[data['Ticker'] == ticker].copy()

        _df = calculate_returns(_df, variable, lags, binary_lags)

        dfs.append(_df)

    data = pd.concat(dfs, ignore_index=True)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 38417 entries, 2021-06-01 16:10:00 to 2021-10-14 21:44:00
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Vol_return_5m   38417 non-null  int64
 1   Vol_return_15m  38417 non-null  int64
dtypes: int64(2)
memory usage: 900.4 KB


In [13]:
# Show the columns after the return and volume features were added.
# NOTE(review): the stored output lists 'Vol_*' columns, while the code above uses 'Volume' — the output looks stale; re-run to confirm.
data.columns

Index(['datetime', 'Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Vol',
       'year', 'month', 'day', 'hour', 'minute', 'ticker', 'Time_tuple',
       'entry_market', 'target', 'stop', 'entry_type', 'risk_type', 'NATR_5',
       'NATR_60', 'Close_return_1m', 'Close_return_5m', 'Close_return_15m',
       'Close_return_60m', 'Close_return_240m', 'Vol_sma_5', 'Vol_sma_60',
       'Vol_std_5', 'Vol_std_60', 'Vol_return_5m', 'Vol_return_15m'],
      dtype='object')

# Time Features

In [14]:
# Replace the calendar columns with Fourier (sin/cos) time features via the project helper.
# Presumably `max_levels` holds the period for each entry of `time_levels` — confirm in fe.build_fourier_time_features.
# NOTE(review): 'day' is paired with 30 although months can have 31 days — confirm this is intended.
data = fe.build_fourier_time_features(data, time_levels = ['month', 'day', 'hour', 'minute'], max_levels = [12, 30, 24, 60], drop_columns = True)

# Probability Distribution Features of Target Variable

In [15]:
# Short and long rolling windows for the probability-distribution features.
# The target variable can be entry_type or risk_type.
prob_windows = FEATURES_CONFIG_IDS[feature_id]['Prob_Features_Windows']
short = prob_windows[0]
long = prob_windows[1]

In [16]:
# Targets for which distribution features are built: entry_type, risk_type, or both.
probability_features = FEATURES_CONFIG_IDS[feature_id]['probability_features']
use_prob_features = FEATURES_CONFIG_IDS[feature_id]['use_prob_features']

if use_prob_features:
    for target_variable in probability_features:

        # Distribution statistics of the target computed by the project helper.
        daily_distribution = fe.calculate_prob_distribution_features(
            data = data,
            target_variable = target_variable,
            short = short,
            long = long,
        )

        # Columns carried over: sma and std for both windows, cv for the short
        # window only.
        stat_windows = (('sma', short), ('sma', long),
                        ('std', short), ('std', long),
                        ('cv', short))
        merge_columns = ['Date'] + [f'{target_variable}_{stat}_{window}'
                                    for stat, window in stat_windows]

        # Left merge keeps every row of `data`.
        merged = pd.merge(data, daily_distribution[merge_columns],
                          on = ['Date'],
                          how = 'left')
        data = merged.copy(deep = True)

In [17]:
# (daily_distribution[daily_distribution[target_variable] == 1]
# .plot(x = 'Date', 
# y = ['distribution',f'{target_variable}_sma_{short}', f'{target_variable}_cv_{short}'] , figsize = (15,10) )) #,

In [18]:
# Extra features that exist only for specific feature configurations.
# Both branches use the `short` / `long` windows currently in scope.

if feature_id == 4:

    # Distance between the bar's low and the stop level.
    # (Rolling SMA features of Low_diff were prototyped here but are disabled.)
    data['Low_diff'] = data['Low'] - data['stop']


if feature_id == 5:

    # closed='left' excludes the current row from each window, so a feature
    # only uses earlier rows; the leading NaNs are back-filled.
    # `.bfill()` replaces the deprecated `fillna(method='backfill')`.
    # NOTE(review): the rolling windows run over the whole frame, not per
    # ticker — confirm this is intended for multi-ticker data.
    for window in (short, long):
        rolling_low = data['Low'].rolling(window = window, closed = 'left')
        data[f'Low_sma_{window}'] = rolling_low.mean().bfill()
        data[f'Low_std_{window}'] = rolling_low.std().bfill()

    # Coefficient of variation = std / mean for the matching window.
    data[f'Low_cv_{short}'] = (data[f'Low_std_{short}'] /
                                        data[f'Low_sma_{short}'])

    # Previously this used the SHORT-window std and sma, so the long-window
    # column was a copy of the short-window one.
    data[f'Low_cv_{long}'] = (data[f'Low_std_{long}'] /
                                        data[f'Low_sma_{long}'])

    # The std columns are only intermediates for the cv features.
    data = data.drop(columns = [f'Low_std_{short}',
                                f'Low_std_{long}'])


In [19]:
# TODO: 

# Improve Feature Engineering on Risk Forecast
# Daily Risk Forecast Distribution 
# Hourly Risk Forecast Distribution 

# # Hourly Risk Distribution
# data['hour'] = data['datetime'].dt.hour

# hourly_distribution = (data.groupby(['hour'])['risk_type'].value_counts()
#                     .to_frame()
#                     .rename(columns = {'risk_type': 'counts'})
#                     .reset_index())

# hourly_distribution['hourly_sum'] = hourly_distribution.groupby(['hour'])['counts'].transform(np.sum)

# hourly_distribution['distribution'] = np.round( hourly_distribution['counts'] / hourly_distribution['hourly_sum'] , 4)

# Set Forecast Horizon

In [20]:
# Review the feature columns before the forecast target is built.
# NOTE(review): the stored output lists 'Vol'/'ticker' columns, while the code in this notebook uses 'Volume'/'Ticker' — the output looks stale; re-run to confirm.
data.columns

Index(['datetime', 'Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Vol',
       'year', 'ticker', 'Time_tuple', 'entry_market', 'target', 'stop',
       'entry_type', 'risk_type', 'NATR_5', 'NATR_60', 'Close_return_1m',
       'Close_return_5m', 'Close_return_15m', 'Close_return_60m',
       'Close_return_240m', 'Vol_sma_5', 'Vol_sma_60', 'Vol_std_5',
       'Vol_std_60', 'Vol_return_5m', 'Vol_return_15m', 'month_sin',
       'month_cos', 'day_sin', 'day_cos', 'hour_sin', 'hour_cos', 'minute_sin',
       'minute_cos', 'entry_type_sma_2', 'entry_type_sma_6',
       'entry_type_std_2', 'entry_type_std_6', 'entry_type_cv_2',
       'risk_type_sma_2', 'risk_type_sma_6', 'risk_type_std_2',
       'risk_type_std_6', 'risk_type_cv_2'],
      dtype='object')

In [21]:
# Optional binary forecast target: 1 if Close is higher `fh` rows ahead, else 0.
forecast_variable = FEATURES_CONFIG_IDS[feature_id].get('forecast_variable', None)

if forecast_variable is not None:

    fh = FEATURES_CONFIG_IDS[feature_id].get('forecast_shift')
    if fh is None:
        raise ValueError("'forecast_shift' must be configured when 'forecast_variable' is set")

    forecast_target = f'{forecast_variable}_target'

    # Look ahead within each ticker only, so the last rows of one instrument
    # are not compared with the first rows of the next one.
    future_close = data.groupby('Ticker')['Close'].shift(-fh)
    price_change = future_close.sub(data['Close'])

    # Binarise the price change itself. The previous code binarised
    # `data[target_variable]` (a variable left over from another cell) and so
    # overwrote the target with an unrelated column. Rows without a future
    # price stay NaN (instead of becoming 1) and are removed by the dropna
    # that follows.
    data[forecast_target] = price_change.gt(0).astype(float).where(price_change.notna())

In [22]:
# Remove every row that has a missing value in any column
# (presumably the warm-up rows of lag/rolling features — verify the row count before and after).
data = data.dropna()

## Data Storage

In [23]:
#data['return_5m_target'].value_counts()

In [24]:
# Number of columns in the final feature table.
len(data.columns)

47

In [25]:
# Root folder for the results and a tag identifying the feature configuration used.
output_folder_db = '../../data/Model2'
sub_experiment_type = 'feature_config{}'.format(feature_id)

In [26]:
# Target file: <output_folder_db>/Feature_Engineering/NQ/<file name>, where the
# file name encodes the feature configuration and the strategy.
output_file_name = f'Model2_Feature_Engineering__{sub_experiment_type}_{strategy}.parquet'
output_location = os.path.join(output_folder_db, 'Feature_Engineering', 'NQ', output_file_name)

print(output_location)


../../data/Model2\Feature_Engineering\NQ\Model2_Feature_Engineering__feature_config16_t10-r10_w5.parquet


In [27]:
# Create the destination folder first; to_parquet does not create missing
# directories and would otherwise fail on a fresh checkout.
output_dir = os.path.dirname(output_location)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Persist the final feature table as a gzip-compressed parquet file.
data.to_parquet(output_location , engine = 'fastparquet', compression = 'gzip')