In [1]:
import pandas as pd
import numpy as np
import datetime

from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils import class_weight
from sklearn.model_selection import GridSearchCV

from os.path import join
from os import getcwd
from pathlib import Path
from sys import path

full_path = getcwd()
functions_path = join( Path(full_path).parents[0].parents[0] )
path.append( functions_path  )

import mlflow
import dateutil

from joblib import parallel_backend


from functions import utils, modelling
import tempfile
import os

#Hyperparameter Optimization
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load data
feature_config = 14
strategy = '1-1_vb_15m' #'t20-r10_w15'
file_path = "/mnt/d/Tensor_Database/01_Cryptos/Features_Eng/Feature_Engineering_conf_1_Tickers_4_Stategy_1-1_vb_15m.parquet"

input_file_path = os.path.join( file_path)
df = pd.read_parquet(input_file_path)

In [15]:
def feature_selection(df: pd.DataFrame, 
                    case: int, 
                    train_start_date : datetime.datetime,
                    max_test_date : datetime.datetime,
                    train_idx : list , 
                    test_idx: list , 
                    forecast_variable : str ):
    """ Feature Selection Cases

    Args: 
        df (pandas.DataFrame): Input data frame containg target variable and features
        case (int): if target is entry_type, select among different features scenarios
        train_start_date (datetime.datetime): start training date
    """
                              
    if forecast_variable == 'risk_type':
        X_columns = df.drop(columns = ['Date', 'Time', 
                                        'ticker','datetime', 
                                        'Open', 'High', 'Low', 'Close', 'year',
                                        'day_sin', 'day_cos',
                                        'minute_sin', 'minute_cos'
                                        'Time_tuple', forecast_variable,
                                        'entry_type','entry_type_fh_2','risk_type_fh_2'] ).columns
    
    elif forecast_variable == 'entry_type':
        if case == 1:
            X_columns = df.drop(columns = ['Date', 'Time', 
                                            'ticker','datetime', 
                                            'Open', 'High', 'Low', 'Close', 'year',
                                            'day_sin', 'day_cos',
                                            'Time_tuple', forecast_variable,
                                            'risk_type',
                                            'entry_type_fh_2','risk_type_fh_2'] ).columns
        elif case == 2:
            X_columns = df.drop(columns = ['Date', 'Time', 
                                            'ticker', 
                                            'Open', 'High', 'Low','year',
                                            'day_sin', 'day_cos',
                                            'Time_tuple', forecast_variable,
                                            'risk_type',
                                            ] ).columns
        elif case == 3:
            X_columns = df.drop(columns = ['Date', 'Time', 'datetime', 'Close',
                                            'ticker', 
                                            'Open', 'High', 'Low','year',
                                            'day_sin', 'day_cos',
                                            'Time_tuple', forecast_variable,
                                            'risk_type'] ).columns
        elif case == 4:
            X_columns = df.drop(columns = ['Date', 'Time',  
                                            'Open', 'High', 'Low','year',
                                            'day_sin', 'day_cos',
                                            'Time_tuple', forecast_variable,
                                            'risk_type'] ).columns

        elif case == 5:        
            cols_to_drop = [x for x in df.columns if 'NATR' in x]   
            X_columns = df.drop(columns = ['Date', 'Time',  
                                            'Open', 'High', 'Low','year'
                                            'day_sin', 'day_cos',
                                            'Time_tuple', forecast_variable,
                                            'risk_type'] + cols_to_drop ).columns

    elif forecast_variable == 'return_5m_target':
        if case == 1:
            print("Forecast Variable: ", forecast_variable)
            
            X_columns = df.drop(columns = ['Date', 'Time', 
                                            'Open', 'High', 
                                            'Low', 'Close', 'year',
                                            'day_sin', 'day_cos',
                                            'Time_tuple', forecast_variable,
                                            'risk_type', 'entry_type'] ).columns

                                            
        
    else: 
        ValueError("Invalid Forecast Variable")
        


    print(X_columns)
    

    # Split: Test/Training
    if train_start_date is not None:
        df = df[df['Date'] >= train_start_date].copy(deep = True)
    if max_test_date is not None:
        df = df[df['Date'] <= max_test_date].copy(deep = True)

    X_train = df.iloc[train_idx].filter(X_columns).reset_index(drop = True)
    X_test = df.iloc[test_idx].filter(X_columns).reset_index(drop = True)

    Y_train = df.iloc[train_idx].filter([forecast_variable]).reset_index(drop = True)
    Y_test = df.iloc[test_idx].filter([forecast_variable]).reset_index(drop = True)

    X_train.set_index(['ticker', 'datetime'], inplace = True)
    X_test.set_index(['ticker', 'datetime'], inplace = True)

    dfs = [X_train, X_test, Y_train, Y_test]

    return dfs

In [16]:
def bayesian_gridcv_xgb_model(dfs : list,
                        forecast_variable : str ,
                        param_grid: dict,
                        n_jobs : int = -2,
                        n_iter: int = 50,
                        random_state: int = 0,
                        test_period_length_cv: int = 60,
                        n_splits_cv: int = 5,
                        n_points_w: int = 200,
                        ):
    """Hyperparameter Optimization for XGboost using GridSearch
    exahustive search
    
    Args:
        dfs (list): Data frame containing X_train, X_test, Y_test, Y_train
        forecast_variable (str): Target variable to train for
        param_grid (dict): parameter grid
        n_jobs (int): Number of jobs to run in parallel.
        n_iter (int): Number of parameter settings that are sampled
        random_state (int): random state seeder 
    """

    X_train, X_test, Y_train, Y_test = dfs[0], dfs[1], dfs[2], dfs[3]
 
    print("Target variable is: ", forecast_variable)
    print(Y_train[forecast_variable].value_counts())

    # # TODO: Check for improvement in class weight functions - has a big influence on accuracy
    classes_weights = class_weight.compute_sample_weight(
        class_weight = calculate_classes_weight(Y_train, n_points_w),
        y = Y_train
    )

    # ---------------------------------------------- #
    # Inner CV Parameters 
    # ---------------------------------------------- #
    train_period_length_cv = X_train.shape[0] - test_period_length * n_splits_cv -1 
    lookahead = 1

    cv_bayesian_search = modelling.MultipleTimeSeriesCV(n_splits = n_splits_cv,
                                    train_period_length = train_period_length_cv, 
                                    test_period_length = test_period_length_cv, 
                                    lookahead = lookahead, 
                                    date_idx = 'datetime'
                                    )

    xgb_model = XGBClassifier(objective="binary:logistic", 
                                booster='gbtree',
                                eval_metric='auc',
                                tree_method='hist', 
                                grow_policy='lossguide',
                                use_label_encoder=False)

    # Update X_columns
    X_columns = X_train.columns

    print("Cross validation on: ")
    modelling.check_test_training_indeces(cv_bayesian_search, X_train)

    # Bayesian Grid Search
    bayes_search = BayesSearchCV(
        xgb_model,
        param_grid,
        n_iter = n_iter,
        random_state = random_state,
        n_jobs = n_jobs,
        cv = cv_bayesian_search
    )

    # executes bayesian optimization
    model = bayes_search.fit(X_train, Y_train, sample_weight=classes_weights)

    # Predict training set
    Y_pred = model.predict(X_test)

    #print(Y_pred)

    # Evaluate Predictions
    conf_matrix = confusion_matrix(Y_test, Y_pred)/len(Y_pred)
    class_accuracy = utils.cal_label_accuracy(conf_matrix)

    print(class_accuracy)

    return model, class_accuracy, X_columns, classes_weights

In [17]:
def calculate_classes_weight(Y_train, n_points_w):
    """
    Caculate classes weights from most recent n points in training
    """

    class_weights = (Y_train.astype(int).iloc[-n_points_w:]
                    .value_counts()
                    .rename('counts')
                    .to_frame()
                    .reset_index())

    class_weights['weight'] =  1/(class_weights['counts'] / class_weights['counts'].sum())

    weight_0 = class_weights[class_weights['entry_type'] == 0]['weight'].values[0]
    weight_1 = class_weights[class_weights['entry_type'] == 1]['weight'].values[0]

    return {0: weight_0, 1: weight_1}

In [19]:
param_grid = {
              'learning_rate': Real(0.005, 0.05, prior='log-uniform'),
              'max_depth': Integer(3, 25, prior='log-uniform'),
              'n_estimators': Integer(10, 100, prior='log-uniform'),
            #   'gamma': Real(0.1, 0.3, prior='log-uniform'),
            #   'reg_alpha': Real(0.01, 1.6, prior='log-uniform'),
            #   'reg_lambda': Real(0.01, 1.6, prior='log-uniform'),
            #   'min_child_weight': Real(0.01, 5, prior='log-uniform'),
            #   'max_delta_step' : Real(0.01, 5, prior='log-uniform')
               }

# ---------------------------------------------- #
# Experiment Parameters 
# ---------------------------------------------- #
n_jobs = -2
n_iter = 35
random_state = 123
experiment_name = f'EntryType-BayesianCV-Feat14-Classweight2-60-{strategy}' 
experiment_data_folder = 'experiment_data'
model_name = 'Risk-Profit Trading Classification'


# ---------------------------------------------- #
# Outer CV Parameters 
# ---------------------------------------------- #
forecast_variable = 'entry_type' # risk_type , entry_type or return_5m_target
case = 4
n_splits = 60
test_period_length = 60
train_period_length = 5000 
lookahead = 1
n_points_w = 480

# ---------------------------------------------- #
# Inner CV Parameters 
# ---------------------------------------------- #
test_period_length_cv = 30
n_splits_cv = 4

# 11 Monate - Training (CV Time Series)
# 1 Monat - Test (Retraining Zeitraum)

cv_bayesian_search = modelling.MultipleTimeSeriesCV(n_splits = n_splits,
                                train_period_length = train_period_length, 
                                test_period_length = test_period_length, 
                                lookahead = lookahead, 
                                date_idx = 'datetime'
                                )


train_start_date = None
max_test_date = datetime.datetime(2021, 10, 14)

df_model = df.set_index(['ticker', 'datetime']).copy(deep = True)

test_days = []

# modelling.check_test_training_indeces(cv_bayesian_search, df_model)

for train_idx, test_idx in cv_bayesian_search.split(df_model):

    print("Testing for", f"{min(train_idx)}-{max(train_idx)}:{min(test_idx)}-{max(test_idx)}")
    
    dfs = feature_selection(df, 
                            case, 
                            train_start_date ,
                            max_test_date,
                            train_idx , 
                            test_idx , 
                            forecast_variable)

    

    print(" Running Model...")

    # # Optimize Model and test
    model, class_accuracy, X_columns, classes_weights = bayesian_gridcv_xgb_model(dfs,
                                                              forecast_variable ,
                                                              param_grid,
                                                              n_jobs,
                                                              n_iter,
                                                              random_state,
                                                              test_period_length_cv,                        
                                                              n_splits_cv,
                                                              n_points_w,
                                                                )

    print(class_accuracy)

    Y_train = dfs[3]
    
    add_params = { 'feature_config' : feature_config,
                    'forecast_variable': forecast_variable,
                    'train_period_length':train_period_length,
                    'test_period_length':test_period_length,
                    'counts_0': Y_train[Y_train[forecast_variable] == 0].shape[0],
                    'counts_1': Y_train[Y_train[forecast_variable] == 1].shape[0],
                    'test_period_length_cv':test_period_length_cv,
                    'n_splits_cv': n_splits_cv,
                    'case':case,
                    'n_last_values_weight_class': n_points_w

                    }

    # Store Experiment in MLFlow
    print("Storing Model Results...")
    modelling.log_results(gridsearch =  model, 
                    class_accuracy = class_accuracy, 
                    add_params = add_params,
                    training_columns = X_columns,
                    experiment_name = experiment_name, 
                    experiment_data_folder = experiment_data_folder, 
                    model_name = model_name, 
                    tags={}, 
                    log_only_best=True) 
    
             

Testing for 32371-37370:37371-37430
Index(['datetime', 'Vol', 'ticker', 'entry_market', 'target', 'stop', 'SMA_15',
       'SMA_60', 'entry_type_lag_15', 'return_1m', 'return_15m', 'return_30m',
       'return_60m', 'Vol_sma_5', 'Vol_sma_60', 'Vol_std_5', 'Vol_std_60',
       'month_sin', 'month_cos', 'hour_sin', 'hour_cos', 'minute_sin',
       'minute_cos', 'entry_type_sma_2', 'entry_type_sma_6',
       'entry_type_std_2', 'entry_type_std_6', 'entry_type_cv_2',
       'risk_type_sma_2', 'risk_type_sma_6', 'risk_type_std_2',
       'risk_type_std_6', 'risk_type_cv_2'],
      dtype='object')
 Running Model...
Target variable is:  entry_type
0.0    2940
1.0    2060
Name: entry_type, dtype: int64
Cross validation on: 
Testing for  211-4969:4970-4999
Testing for  181-4939:4940-4969
Testing for  151-4909:4910-4939
Testing for  121-4879:4880-4909
{'class_0': 98.21, 'class_1': 0.0}
{'class_0': 98.21, 'class_1': 0.0}
Storing Model Results...
INFO: 'EntryType-BayesianCV-Feat14-Classweight2-60-

In [10]:
# # Model for Sep 22-2021
# logged_model = 'runs:/34776846208744728b0bb34eda7260e6/Risk-Profit Trading Classification'

# # Load model as a PyFuncModel.
# loaded_model = mlflow.pyfunc.load_model(logged_model)

# # Predict on a Pandas DataFrame.
# Y_pred = loaded_model.predict(X_test)

In [11]:
Y_train = dfs[2]

In [12]:
X_train.shape[0]

NameError: name 'X_train' is not defined

In [None]:
df.groupby(['Date'])['entry_type'].value_counts()

Date        entry_type
2021-06-01  0.0           229
            1.0            48
2021-06-02  0.0           313
            1.0            78
2021-06-03  0.0           268
                         ... 
2021-10-12  1.0           145
2021-10-13  0.0           233
            1.0           158
2021-10-14  0.0           321
            1.0            69
Name: entry_type, Length: 192, dtype: int64

In [None]:
df[df['Date'] == '2021-09-22']['entry_type_fh_2'].value_counts()

KeyError: 'entry_type_fh_2'

In [None]:
df[df['Date'] == '2021-09-23']['entry_type_fh_2'].value_counts()

0.0    268
1.0    106
Name: entry_type_fh_2, dtype: int64

In [None]:
df[df['Date'] == '2021-09-24']['entry_type_fh_2'].value_counts()

0.0    259
1.0    115
Name: entry_type_fh_2, dtype: int64

In [None]:
# Importance = model.best_estimator_.feature_importances_

# df_importance = pd.DataFrame({'Variable': X_train.columns, 'Importance':Importance})

# df_importance.sort_values(by = ['Importance'], ascending = False, inplace = True)

In [None]:
# Y_test_pred = Y_test.copy(deep = True)
# Y_test_pred['pred'] = Y_pred