In [1]:
import pandas as pd
import numpy as np
import datetime

from xgboost import XGBClassifier

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils import class_weight
from sklearn.model_selection import GridSearchCV

from os.path import join
from os import getcwd
from pathlib import Path
from sys import path

full_path = getcwd()
functions_path = join( Path(full_path).parents[0].parents[0] )
path.append( functions_path  )

import mlflow
import dateutil

from joblib import parallel_backend


from functions import utils, modelling
import tempfile
import os

#Hyperparameter Optimization
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

In [2]:
# Load data
feature_config = 14
strategy = '1-1_vb_15m' #'t20-r10_w15'
file_path = "/mnt/d/Tensor_Database/01_Cryptos/Features_Eng/Feature_Engineering_conf_1_Tickers_4_Stategy_1-1_vb_15m.parquet"

input_file_path = os.path.join( file_path)
df = pd.read_parquet(input_file_path)

In [None]:
def generate_labels():

    # The time window for the features - N
    feature_window = 60
    # The time window for the labels (also called horizon) - M
    label_window = 5

    # Calculate daily returns on the adjusted close
    df['daily_returns'] = df['Close'].pct_change()
    # Calculate the threshold dynamically from the daily returns on a rolling basis on the feature_window
    # The threshold is a function of the rolling standard deviation
    df['return_threshold'] = 0.125 * \
        np.sqrt(feature_window)*df.daily_returns.rolling(feature_window).std()
    
    
    df['return_threshold'] = df.daily_returns.ewm(span = feature_window).std()
    
    # Calculate the future return from current price to the end of the label window
    df['fut_returns'] = df['Close'].pct_change(
        +label_window).shift(-label_window)
    # Assign class label to the feature window based on the returns of the label window compared to a dynamic threshold
    df['target_class'] = np.where(df.fut_returns > df.return_threshold, 1, np.where(
        df.fut_returns < -df.return_threshold, -1, 0))
    
    
    

In [25]:
df.loc[:,'sma_signal'] = df['sma_cross_over'].apply(lambda x: 1 if x == 1 else 0)

In [26]:
def feature_selection(df: pd.DataFrame, 
                    X_columns: list, 
                    start_train_date : datetime.datetime,
                    end_train_date : datetime.datetime,
                    forecast_variable : str ):
    """ Feature Selection Cases

    Args: 
        df (pandas.DataFrame): Input data frame containg target variable and features
        case (int): if target is entry_type, select among different features scenarios
        train_start_date (datetime.datetime): start training date
    """
    
    
    df = df.dropna().copy()

    # Create Dummy Columns for Tickers
    X_columns += list(df['Ticker'].unique())
    df = df.drop(columns = ['Ticker']).join(pd.get_dummies(df['Ticker']))

    X_train = (df.loc[(df['Date'] >= start_train_date) 
                    & (df['Date'] < end_train_date),:]
                    .filter(X_columns)
                    .reset_index(drop = True))

    X_test = (df.loc[(df['Date'] >= end_train_date),:]
                .filter(X_columns).reset_index(drop = True))

    Y_train = np.squeeze((df.loc[(df['Date'] >= start_train_date) 
                    & (df['Date'] < end_train_date),:]
                    .filter([forecast_variable])
                    .astype(int)))

    Y_test = np.squeeze(df.loc[(df['Date'] >= end_train_date),:]
                .filter([forecast_variable])
                .reset_index(drop = True)
                .astype(int))

    # X_train.drop(columns = ['Date'], inplace = True)
    # X_test.drop(columns = ['Date'], inplace = True)

    X_train.set_index(['Date'], inplace = True)
    X_test.set_index(['Date'], inplace = True)

    dfs = [X_train, X_test, Y_train, Y_test]

    return dfs

In [27]:
X_columns = ['Ticker', 'Date', 
                'Volume_standard','Number of Trades_standard', 'max_trades_standard',
                'month', 'day', 'hour', 'minute', 
                'Close_lag_1', 'Close_lag_3', 'Close_lag_5', 'Close_lag_15',
                'Close_lag_30', 'VWMA_60', 'VWMA_15', 'Volume_sma_5', 'Volume_sma_30', 'Volume_std_5',
                'Volume_std_30','ATRr_15', 'PDIST','BBU_15_2.0', 'BBB_15_2.0', 'BBP_15_2.0', 'BBL_120_2.0', 'BBM_120_2.0',
                'BBU_120_2.0', 'BBB_120_2.0', 'BBP_120_2.0', 'MACD_15_120_9',
                'MACDh_15_120_9', 'MACDs_15_120_9','sma_signal'] 


forecast_variable = 'metalabel'

end_train_date = datetime.datetime(2022, 1, 31) 
start_train_date = end_train_date  - dateutil.relativedelta.relativedelta(months = 6)

# ---------------------------------------------- #
# (1) Feature Selection - Train/Test Split
# ---------------------------------------------- # 

X_train, X_test, Y_train, Y_test = feature_selection(df.copy(),
                                                     X_columns, 
                                                     start_train_date,
                                                     end_train_date,
                                                     forecast_variable)


# ---------------------------------------------- #
# (2) Synthetic Minority Oversampling Technique
# ---------------------------------------------- # 
from imblearn.over_sampling import SMOTE

# Ref: https://arxiv.org/pdf/1106.1813.pdf
sm = SMOTE()

X_train_res, y_train_res = sm.fit_resample(X_train, Y_train)

In [28]:
labels, counts = np.unique(y_train_res, return_counts=True)

In [29]:
print("Labels are: ", labels )
print("Counts are: ", counts )

Labels are:  [0 1]
Counts are:  [765280 765280]


In [30]:
60*24

1440

In [31]:
test_size = 20000
n_splits = 10
max_train_size = None
gap=30

# Create Temporal Cross Validation Object
tscv = TimeSeriesSplit(n_splits = n_splits, 
                      max_train_size  = max_train_size, 
                      test_size = test_size, 
                      gap = gap)

param_grid = {
              'learning_rate': Real(0.005, 0.05, prior='log-uniform'),
              'max_depth': Integer(3, 25, prior='log-uniform'),
              'n_estimators': Integer(10, 100, prior='log-uniform'),
            #   'gamma': Real(0.1, 0.3, prior='log-uniform'),
            #   'reg_alpha': Real(0.01, 1.6, prior='log-uniform'),
            #   'reg_lambda': Real(0.01, 1.6, prior='log-uniform'),
            #   'min_child_weight': Real(0.01, 5, prior='log-uniform'),
            #   'max_delta_step' : Real(0.01, 5, prior='log-uniform')
               }

# ---------------------------------------------- #
# Experiment Parameters 
# ---------------------------------------------- #
n_jobs = 4
n_iter = 10
random_state = 123
experiment_name = f'TBM-{strategy}' 
experiment_data_folder = 'experiment_data'
model_name = 'XGboost SMA Cross + XGBoost'


for train_index, test_index in tscv.split(X_train):
     print("TRAIN:", train_index, "TEST:", test_index)


xgb_model = XGBClassifier(objective="binary:logistic", 
                            booster='gbtree',
                            eval_metric='auc',
                            tree_method='hist', 
                            grow_policy='lossguide',
                            use_label_encoder=False)
                            
# xgb_model = XGBClassifier(objective="binary:logistic", 
#                             booster='gbtree',
#                             eval_metric='auc',
#                             tree_method='gpu_hist',
#                             gpu_id=0,
#                             grow_policy='lossguide',
#                             use_label_encoder=False)
# Bayesian Grid Search
bayes_search = BayesSearchCV(
    xgb_model,
    param_grid,
    n_iter = n_iter,
    random_state = random_state,
    n_jobs = n_jobs,
    cv = tscv
)

# executes bayesian optimization
model = bayes_search.fit(X_train, Y_train)

# Predict training set
Y_pred = model.predict(X_test)

#print(Y_pred)

# Evaluate Predictions
conf_matrix = confusion_matrix(Y_test, Y_pred)/len(Y_pred)
class_accuracy = utils.cal_label_accuracy(conf_matrix)

print(class_accuracy)

TRAIN: [     0      1      2 ... 570631 570632 570633] TEST: [570664 570665 570666 ... 590661 590662 590663]
TRAIN: [     0      1      2 ... 590631 590632 590633] TEST: [590664 590665 590666 ... 610661 610662 610663]
TRAIN: [     0      1      2 ... 610631 610632 610633] TEST: [610664 610665 610666 ... 630661 630662 630663]
TRAIN: [     0      1      2 ... 630631 630632 630633] TEST: [630664 630665 630666 ... 650661 650662 650663]
TRAIN: [     0      1      2 ... 650631 650632 650633] TEST: [650664 650665 650666 ... 670661 670662 670663]
TRAIN: [     0      1      2 ... 670631 670632 670633] TEST: [670664 670665 670666 ... 690661 690662 690663]
TRAIN: [     0      1      2 ... 690631 690632 690633] TEST: [690664 690665 690666 ... 710661 710662 710663]
TRAIN: [     0      1      2 ... 710631 710632 710633] TEST: [710664 710665 710666 ... 730661 730662 730663]
TRAIN: [     0      1      2 ... 730631 730632 730633] TEST: [730664 730665 730666 ... 750661 750662 750663]
TRAIN: [     0     

In [35]:
Importance = model.best_estimator_.feature_importances_
df_importance = pd.DataFrame({'Variable': X_columns, 'Importance':Importance})
df_importance.sort_values(by = ['Importance'], ascending = False, inplace = True)

ValueError: All arrays must be of the same length

In [14]:
t, l = np.unique(Y_test, return_counts=True)

In [18]:
t, l

(array([0, 1]), array([306707,   2173]))

In [12]:
add_params = { 'feature_config' : feature_config,
                'forecast_variable': forecast_variable,
                'end_train_date': end_train_date,
                'start_train_date': start_train_date
                }

# Store Experiment in MLFlow
print("Storing Model Results...")
modelling.log_results(gridsearch =  model, 
                class_accuracy = class_accuracy, 
                add_params = add_params,
                training_columns = X_columns,
                experiment_name = experiment_name, 
                experiment_data_folder = experiment_data_folder, 
                model_name = model_name, 
                tags={}, 
                log_only_best=True) 

Storing Model Results...
 Additional Parameters
 Logging metrics
 Logging class accuracy
 Logging model
 Logging CV results matrix
 Logging Feature Importance


ValueError: All arrays must be of the same length

In [19]:
# param_grid = {
#               'learning_rate': Real(0.005, 0.05, prior='log-uniform'),
#               'max_depth': Integer(3, 25, prior='log-uniform'),
#               'n_estimators': Integer(10, 100, prior='log-uniform'),
#             #   'gamma': Real(0.1, 0.3, prior='log-uniform'),
#             #   'reg_alpha': Real(0.01, 1.6, prior='log-uniform'),
#             #   'reg_lambda': Real(0.01, 1.6, prior='log-uniform'),
#             #   'min_child_weight': Real(0.01, 5, prior='log-uniform'),
#             #   'max_delta_step' : Real(0.01, 5, prior='log-uniform')
#                }

# # ---------------------------------------------- #
# # Experiment Parameters 
# # ---------------------------------------------- #
# n_jobs = -2
# n_iter = 35
# random_state = 123
# experiment_name = f'EntryType-BayesianCV-Feat14-Classweight2-60-{strategy}' 
# experiment_data_folder = 'experiment_data'
# model_name = 'Risk-Profit Trading Classification'


# # ---------------------------------------------- #
# # Outer CV Parameters 
# # ---------------------------------------------- #
# forecast_variable = 'entry_type' # risk_type , entry_type or return_5m_target
# case = 4
# n_splits = 60
# test_period_length = 60
# train_period_length = 5000 
# lookahead = 1
# n_points_w = 480

# # ---------------------------------------------- #
# # Inner CV Parameters 
# # ---------------------------------------------- #
# test_period_length_cv = 30
# n_splits_cv = 4

# # 11 Monate - Training (CV Time Series)
# # 1 Monat - Test (Retraining Zeitraum)

# cv_bayesian_search = modelling.MultipleTimeSeriesCV(n_splits = n_splits,
#                                 train_period_length = train_period_length, 
#                                 test_period_length = test_period_length, 
#                                 lookahead = lookahead, 
#                                 date_idx = 'datetime'
#                                 )


# train_start_date = None
# max_test_date = datetime.datetime(2021, 10, 14)

# df_model = df.set_index(['ticker', 'datetime']).copy(deep = True)

# test_days = []

# # modelling.check_test_training_indeces(cv_bayesian_search, df_model)

# for train_idx, test_idx in cv_bayesian_search.split(df_model):

#     print("Testing for", f"{min(train_idx)}-{max(train_idx)}:{min(test_idx)}-{max(test_idx)}")
    
#     dfs = feature_selection(df, 
#                             case, 
#                             train_start_date ,
#                             max_test_date,
#                             train_idx , 
#                             test_idx , 
#                             forecast_variable)

    

#     print(" Running Model...")

#     # # Optimize Model and test
#     model, class_accuracy, X_columns, classes_weights = bayesian_gridcv_xgb_model(dfs,
#                                                               forecast_variable ,
#                                                               param_grid,
#                                                               n_jobs,
#                                                               n_iter,
#                                                               random_state,
#                                                               test_period_length_cv,                        
#                                                               n_splits_cv,
#                                                               n_points_w,
#                                                                 )

#     print(class_accuracy)

#     Y_train = dfs[3]
    
#     add_params = { 'feature_config' : feature_config,
#                     'forecast_variable': forecast_variable,
#                     'train_period_length':train_period_length,
#                     'test_period_length':test_period_length,
#                     'counts_0': Y_train[Y_train[forecast_variable] == 0].shape[0],
#                     'counts_1': Y_train[Y_train[forecast_variable] == 1].shape[0],
#                     'test_period_length_cv':test_period_length_cv,
#                     'n_splits_cv': n_splits_cv,
#                     'case':case,
#                     'n_last_values_weight_class': n_points_w

#                     }

#     # Store Experiment in MLFlow
#     print("Storing Model Results...")
#     modelling.log_results(gridsearch =  model, 
#                     class_accuracy = class_accuracy, 
#                     add_params = add_params,
#                     training_columns = X_columns,
#                     experiment_name = experiment_name, 
#                     experiment_data_folder = experiment_data_folder, 
#                     model_name = model_name, 
#                     tags={}, 
#                     log_only_best=True) 
    
             

Testing for 32371-37370:37371-37430
Index(['datetime', 'Vol', 'ticker', 'entry_market', 'target', 'stop', 'SMA_15',
       'SMA_60', 'entry_type_lag_15', 'return_1m', 'return_15m', 'return_30m',
       'return_60m', 'Vol_sma_5', 'Vol_sma_60', 'Vol_std_5', 'Vol_std_60',
       'month_sin', 'month_cos', 'hour_sin', 'hour_cos', 'minute_sin',
       'minute_cos', 'entry_type_sma_2', 'entry_type_sma_6',
       'entry_type_std_2', 'entry_type_std_6', 'entry_type_cv_2',
       'risk_type_sma_2', 'risk_type_sma_6', 'risk_type_std_2',
       'risk_type_std_6', 'risk_type_cv_2'],
      dtype='object')
 Running Model...
Target variable is:  entry_type
0.0    2940
1.0    2060
Name: entry_type, dtype: int64
Cross validation on: 
Testing for  211-4969:4970-4999
Testing for  181-4939:4940-4969
Testing for  151-4909:4910-4939
Testing for  121-4879:4880-4909
{'class_0': 98.21, 'class_1': 0.0}
{'class_0': 98.21, 'class_1': 0.0}
Storing Model Results...
INFO: 'EntryType-BayesianCV-Feat14-Classweight2-60-

In [10]:
# # Model for Sep 22-2021
# logged_model = 'runs:/34776846208744728b0bb34eda7260e6/Risk-Profit Trading Classification'

# # Load model as a PyFuncModel.
# loaded_model = mlflow.pyfunc.load_model(logged_model)

# # Predict on a Pandas DataFrame.
# Y_pred = loaded_model.predict(X_test)

In [11]:
Y_train = dfs[2]

In [12]:
X_train.shape[0]

NameError: name 'X_train' is not defined

In [None]:
df.groupby(['Date'])['entry_type'].value_counts()

Date        entry_type
2021-06-01  0.0           229
            1.0            48
2021-06-02  0.0           313
            1.0            78
2021-06-03  0.0           268
                         ... 
2021-10-12  1.0           145
2021-10-13  0.0           233
            1.0           158
2021-10-14  0.0           321
            1.0            69
Name: entry_type, Length: 192, dtype: int64

In [None]:
df[df['Date'] == '2021-09-22']['entry_type_fh_2'].value_counts()

KeyError: 'entry_type_fh_2'

In [None]:
df[df['Date'] == '2021-09-23']['entry_type_fh_2'].value_counts()

0.0    268
1.0    106
Name: entry_type_fh_2, dtype: int64

In [None]:
df[df['Date'] == '2021-09-24']['entry_type_fh_2'].value_counts()

0.0    259
1.0    115
Name: entry_type_fh_2, dtype: int64

In [None]:
# Importance = model.best_estimator_.feature_importances_

# df_importance = pd.DataFrame({'Variable': X_train.columns, 'Importance':Importance})

# df_importance.sort_values(by = ['Importance'], ascending = False, inplace = True)

In [None]:
# Y_test_pred = Y_test.copy(deep = True)
# Y_test_pred['pred'] = Y_pred