In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import random
import datetime
import numpy as np
import pandas as pd  
import seaborn as sns
from math import floor, ceil
import seaborn as sns  # for prettier plots
import matplotlib.style as style
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import mplfinance as fplt
from matplotlib.dates import date2num
from joblib import Parallel, delayed

from xgboost import XGBClassifier
from lightgbm import  LGBMClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE


from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import plot_confusion_matrix, classification_report, plot_precision_recall_curve, roc_auc_score

from tsfresh import extract_relevant_features
import sktime

In [3]:
# Load the raw candle series. `header=0` discards the file's own header row
# and `names` supplies the column labels used throughout the notebook.
df = pd.read_csv(
    '../data/WIN$NM5_DF.csv',
    header=0,
    names=['Date', 'Open', 'High', 'Low', 'Close'],
    parse_dates=['Date'],
)

In [5]:
class Bot:
    """Turn an OHLC candle series into a labelled, lag-feature dataset.

    Each processed row holds the current candle, the ``candles`` previous
    candles as ``<var>_lag_<k>`` columns, and a binary ``takeprofit`` label.

    Attributes:
        raw_ts: The candle frame passed in (needs a datetime ``Date`` column
            plus the price columns used by ``process_ts``).
        candles: Number of lagged candles used as features.
        expiration: Number of candles (starting at the current one) inspected
            when labelling a row.
        stoploss: Fractional loss relative to the entry open that ends a trade.
        takeprofit: Fractional gain relative to the entry open that ends a trade.
        processed_ts: Output of ``process_ts`` (empty until it is called).
        train_ts / test_ts: Output of ``train_test_split_ts`` (empty until then).
    """

    def __init__(self, data, candles, expiration, stoploss, takeprofit):
        self.raw_ts = data
        self.stoploss = stoploss
        self.takeprofit = takeprofit
        self.candles = candles
        self.expiration = expiration
        self.processed_ts = pd.DataFrame()
        # Defined up front so that reading them before a successful split
        # yields an empty frame instead of an AttributeError.
        self.train_ts = pd.DataFrame()
        self.test_ts = pd.DataFrame()

    def check_tkp_stp(self, idx, ts_length, closes, first_open, unique_dates, stoploss, takeprofit):
        """Label one candle by scanning the closes of its look-ahead window.

        Args:
            idx: Position of the candle in the series.
            ts_length: Total number of rows in the series.
            closes: Close prices of the window (array-like supporting
                arithmetic, e.g. a Series), in chronological order.
            first_open: Open price of the candle at ``idx`` (the entry price).
            unique_dates: Number of distinct calendar days in the window.
            stoploss: Fractional loss threshold.
            takeprofit: Fractional gain threshold.

        Returns:
            ``np.nan`` when the window would run past the end of the series or
            spans more than one day; ``1`` when a close reaches the take-profit
            before any close reaches the stop-loss; ``0`` otherwise (stop-loss
            reached first, or neither threshold reached).
        """
        # Not enough candles left to observe a full expiration window.
        if idx + self.expiration - 1 >= ts_length:
            return np.nan
        # The window crosses a day boundary (overnight gap).
        if unique_dates > 1:
            return np.nan

        # Returns of each close relative to the entry open.
        returns = (closes - first_open) / first_open

        for ret in returns:
            if ret < 0 and abs(ret) >= stoploss:
                return 0
            if ret > 0 and ret >= takeprofit:
                return 1

        return 0

    def create_output(self, ts, window, stoploss, takeprofit):
        """Compute the label of every row of ``ts`` in parallel.

        ``ts`` is indexed both by label (``.loc``) and by position (``.iloc``)
        with the same value, so it must carry a default 0..n-1 index;
        ``process_ts`` resets the index before calling this method.

        Args:
            ts: Frame with ``Date``, ``Open`` and ``Close`` columns.
            window: Number of candles in each look-ahead window.
            stoploss: Fractional loss threshold.
            takeprofit: Fractional gain threshold.

        Returns:
            A list with one ``check_tkp_stp`` result per row of ``ts``.
        """
        # Normalise once instead of formatting the dates of every window.
        days = ts['Date'].dt.normalize()
        n_rows = ts.shape[0]

        executor = Parallel(n_jobs=os.cpu_count(), batch_size=128)
        tasks = (
            delayed(self.check_tkp_stp)(
                idx, n_rows,
                closes=ts.loc[idx:idx + window - 1, 'Close'],
                unique_dates=days.loc[idx:idx + window - 1].nunique(),
                first_open=ts['Open'].iloc[idx],
                stoploss=stoploss, takeprofit=takeprofit)
            for idx in ts.index
        )
        return executor(tasks)

    def treat_candle_gaps(self, ts):
        """Drop incomplete rows and rows whose lagged candles span several days.

        Rows containing any missing value are removed first. A row is then
        considered to contain a gap when its ``Day_lag_*`` columns do not all
        hold the same value. The ``Day_lag_*`` helper columns are removed from
        the result.

        NOTE(review): only the lagged candles are compared; the current candle's
        own day is not part of the check - confirm this is intended.
        """
        ts = ts.dropna(axis=0).reset_index(drop=True)
        is_day_col = ts.columns.str.contains('Day_lag')
        has_gap = ts.loc[:, is_day_col].nunique(axis=1) > 1
        return ts.loc[~has_gap, ~is_day_col]

    def process_ts(self, variables=None):
        """Build lag features and labels and store them in ``self.processed_ts``.

        Args:
            variables: Columns to lag. Defaults to
                ``['Open', 'High', 'Low', 'Close']``.

        NOTE(review): labels are derived from closes starting at the current
        candle, whose ``Close`` is also kept as a feature column - verify this
        does not leak the target.
        """
        if variables is None:
            variables = ['Open', 'High', 'Low', 'Close']

        print(f'Processing time series with init shape of {self.raw_ts.shape}')
        processed_ts = self.raw_ts.sort_values(by=['Date'], ascending=True).copy()

        # Full calendar date (midnight timestamp), not the day of month: two
        # candles on e.g. Jan 5 and Feb 5 must count as different days.
        calendar_day = processed_ts['Date'].dt.normalize()

        for lag in range(1, self.candles + 1):
            processed_ts[f'Date_lag_{lag}'] = processed_ts['Date'].shift(lag)
            processed_ts[f'Day_lag_{lag}'] = calendar_day.shift(lag)

            for var in variables:
                processed_ts[f'{var}_lag_{lag}'] = processed_ts[var].shift(lag)

        processed_ts = self.treat_candle_gaps(processed_ts)
        # create_output needs a default 0..n-1 index.
        processed_ts = processed_ts.reset_index(drop=True)
        processed_ts['takeprofit'] = self.create_output(
            processed_ts, window=self.expiration,
            stoploss=self.stoploss, takeprofit=self.takeprofit)

        # Rows labelled NaN (window too short or spanning days) are discarded.
        self.processed_ts = processed_ts.dropna(axis=0).reset_index(drop=True)
        self.processed_ts = self.processed_ts.set_index('Date')

        print(f'TS was processed and now has the final shape of {self.processed_ts.shape}')

    def train_test_split_ts(self, test_size=.2):
        """Split ``processed_ts`` chronologically into ``train_ts`` / ``test_ts``.

        The first ``1 - test_size`` fraction of rows becomes the training set
        and the remainder the test set. ``Date_lag_*`` columns are excluded
        from both. If ``process_ts`` has not produced any rows, a message is
        printed and nothing is changed.
        """
        if self.processed_ts.empty:
            print('TS was not processed please call process_ts first!!!')
            return

        cut_index = int(self.processed_ts.shape[0] * (1 - test_size))
        # Literal substring match; the lagged timestamps are not model features.
        is_feature = ~self.processed_ts.columns.str.contains('Date_lag_', regex=False)
        features = self.processed_ts.loc[:, is_feature]

        self.train_ts = features.iloc[:cut_index].copy()
        self.test_ts = features.iloc[cut_index:].copy()
            
# Use 5 lagged candles as features and a 5-candle look-ahead window; a row is
# labelled 1 when a close reaches +0.2% of the entry open before any close
# reaches -0.1%.
bot = Bot(df, candles=5, expiration=5, stoploss=0.001, takeprofit=0.002)
bot.process_ts()

Processing time series with init shape of (100028, 5)
TS was processed and now has the final shape of (92619, 30)


In [6]:
# Chronological split with the default test_size=.2; fills bot.train_ts and bot.test_ts.
bot.train_test_split_ts()

In [8]:
def plot_input(first, takepredict=None, variables=None):
    """Draw a candlestick chart of one labelled observation and its context.

    The chart shows the lagged candles stored in ``first``, the entry candle
    itself and the following ``bot.expiration - 1`` raw candles, with
    horizontal lines at the take-profit, stop-loss and entry open prices and a
    shaded span over the lagged candles.

    Reads the module-level ``bot`` for the raw series and its settings.

    Args:
        first: Single-row frame with a ``Date`` column, the ``variables``
            columns, their ``<var>_lag_<k>`` counterparts and ``takeprofit``.
        takepredict: Optional model prediction (1 = take profit) appended to
            the title when given.
        variables: Columns describing the entry candle. Defaults to
            ``['Date', 'Open', 'High', 'Low', 'Close']``.
    """
    if variables is None:
        variables = ['Date', 'Open', 'High', 'Low', 'Close']

    # Candles that follow the entry candle. The raw series is sorted first so
    # the label-based slice is chronological even if the file was not.
    entry_date = first.reset_index().Date.unique()[0]
    raw_by_date = bot.raw_ts.set_index('Date').sort_index()
    future = raw_by_date[entry_date:].iloc[1:bot.expiration].reset_index()

    # Turn each group of `<var>_lag_<k>` columns back into a `<var>` row.
    lag_frames = []
    for lag in range(1, bot.candles + 1):
        suffix = f'_lag_{lag}'
        lag_obs = first[first.columns[first.columns.str.endswith(suffix)]].copy()
        # Slice the suffix off: str.rstrip would strip a *set of characters*.
        lag_obs.columns = lag_obs.columns.str[:-len(suffix)]
        lag_frames.append(lag_obs)
    # One concat instead of DataFrame.append in a loop (removed in pandas 2.0).
    lagged = pd.concat(lag_frames, sort=False, ignore_index=True) if lag_frames else pd.DataFrame()

    first_candle = first.reset_index()[variables + ['takeprofit']]

    to_plot = pd.concat([first_candle, lagged, future], ignore_index=True, sort=False)
    to_plot = to_plot.sort_values(by='Date')
    to_plot.index = to_plot.Date
    price_cols = ['Open', 'High', 'Low', 'Close']
    to_plot[price_cols] = to_plot[price_cols].astype(float)

    # Only the entry candle carries a label; lagged/future rows have NaN there.
    labelled = to_plot[to_plot.takeprofit.notna()]
    buy_time = labelled.Date.values[0]
    takeprofit = labelled.takeprofit.values[0]
    open_price = labelled.Open.values[0]
    last_candle = to_plot.Date.min()
    tkp_stp = open_price*(1 + bot.takeprofit), open_price*(1 - bot.stoploss), open_price

    title = f'Should take profit? {"Yes" if takeprofit==1 else "No"}'

    if takepredict is not None:
        title = f'{title} : Predicted: {"Yes" if takepredict==1 else "No"}'

    fig, axes = fplt.plot(
                to_plot,
                type='candle',
                style='charles',
                title=title,
                ylabel='Price',
                figratio=(12,7),
                returnfig=True,
                show_nontrading=True,
                hlines=dict(hlines=tkp_stp, colors=['g','r', 'b'], linestyle='-.')
                )

    # Shade the stretch from the oldest lagged candle up to the entry candle.
    axes[0].axvspan(date2num(last_candle), date2num(buy_time),
              label=f"Profundidades de candles = {bot.candles} ",color="b", alpha=0.3)
    axes[0].legend()
    
# plot_ts = bot.raw_ts.merge(bot.processed_ts.reset_index(), how='left').set_index('Date').copy()

# x = random.randint(0, plot_ts.shape[0])
# if plot_ts.iloc[x, -1] != np.nan:
#     input_ = plot_ts.iloc[x:x+bot.expiration, :]
#     to_plot = plot_input(input_)
# Plot one randomly chosen test observation that the model predicted as
# "take profit".
# NOTE: this cell needs `test_ts` to carry a `takepredict` column; in this
# notebook that column is only added in the final prediction cells, so run
# those first.
predicted_buys = test_ts[test_ts.takepredict==1]
# randrange excludes its upper bound; randint(0, n) could return n and make
# the positional lookups below go out of bounds.
x = random.randrange(predicted_buys.shape[0])
inp = predicted_buys.reset_index().Date.iloc[x]
# Fetch the full processed row (including lag columns) for that timestamp and
# reshape it into a one-row frame with a `Date` column, as plot_input expects.
inp = bot.processed_ts.loc[inp]
inp = inp.to_frame().T
inp.index.name = 'Date'
inp = inp.reset_index()
takepredict = predicted_buys.takepredict.iloc[x]
plot_input(inp, takepredict=takepredict)

AttributeError: 'DataFrame' object has no attribute 'takepredict'

In [9]:
# Models and param grids to use on GridSearch
# One entry per candidate model: an identifier, a display label, the
# unfitted classifier and its hyper-parameter search space. Grid keys carry
# the `clf__` prefix because the classifier is searched as the pipeline step
# named 'clf' in model_selection.
models = [{'name': 'xgboost', 'label': 'XGBoost',
           'classifier': XGBClassifier(random_state=42, use_label_encoder=False),
           'grid':{
               'clf__min_child_weight': [1, 5, 10],
               'clf__gamma': [0.5, 1, 1.5, 2, 5],
               'clf__subsample': [0.6, 0.8, 1.0],
               'clf__colsample_bytree': [0.6, 0.8, 1.0],
               'clf__max_depth': [3, 4, 5],
               'clf__objective': ['binary:logistic']
           }}, ]

In [None]:
def model_selection(classifier, name, grid, train_ts, test_ts,
                    scoring, cv=TimeSeriesSplit(n_splits=5), n_jobs=-1,
                    n_iter=25, random_state=42):
    """Tune one classifier with a randomized search and score it on the test set.

    The classifier is wrapped in an imbalanced-learn pipeline that applies
    SMOTE oversampling to the training part of every CV fold only.

    Args:
        classifier: Unfitted estimator, searched as pipeline step ``'clf'``.
        name: Identifier stored in the result under ``'classifier_name'``.
        grid: Parameter distributions, keys prefixed with ``clf__``.
        train_ts: Training frame containing a ``takeprofit`` label column.
        test_ts: Hold-out frame with the same columns.
        scoring: Scoring passed to the search (e.g. ``'roc_auc'``).
        cv: Cross-validation splitter.
        n_jobs: Parallel jobs for the search.
        n_iter: Number of sampled parameter settings.
        random_state: Seed for SMOTE and for the parameter sampling.

    Returns:
        dict with ``classifier_name``, ``classifier`` (best fitted pipeline),
        ``best_params``, ``ROC_AUC_TRAIN`` (best mean CV score under
        ``scoring``) and ``ROC_AUC_TEST`` (ROC AUC on ``test_ts``).
    """
    # Imported locally under an alias: a later notebook cell rebinds the global
    # name `Pipeline` to sklearn's, whose pipeline cannot hold a sampler step.
    from imblearn.pipeline import Pipeline as ImbPipeline

    y_train = train_ts.takeprofit.values.astype(int)
    X_train = train_ts.drop(['takeprofit'], axis=1).copy()

    y_test = test_ts.takeprofit.values.astype(int)
    X_test = test_ts.drop(['takeprofit'], axis=1).copy()

    # Oversampling lives inside the pipeline so it is re-fitted on every fold.
    pipeline = ImbPipeline([('sampling', SMOTE(random_state=random_state)),
                            ('clf', classifier)])

    search = RandomizedSearchCV(pipeline,
                                grid,
                                cv=cv,
                                scoring=scoring,
                                n_jobs=n_jobs,
                                verbose=1, random_state=random_state, n_iter=n_iter)
    search.fit(X_train, y_train)

    best = search.best_estimator_
    results_dict = {
        'classifier_name': name,
        'classifier': best,
        'best_params': search.best_params_,
        'ROC_AUC_TRAIN': search.best_score_,
    }

    # ROC AUC ranks continuous scores. Feeding it hard 0/1 predictions would
    # collapse the curve to a single threshold and make the number
    # incomparable with the CV 'roc_auc' score above.
    if hasattr(best, 'predict_proba'):
        y_score = best.predict_proba(X_test)[:, 1]
    elif hasattr(best, 'decision_function'):
        y_score = best.decision_function(X_test)
    else:
        y_score = best.predict(X_test)
    results_dict['ROC_AUC_TEST'] = roc_auc_score(y_test, y_score)

    return results_dict

# Run the hyper-parameter search for every configured model and collect one
# result record per model.
results = []
for m in models:
    results.append(
        model_selection(
            classifier=m['classifier'],
            name=m['name'],
            grid=m['grid'],
            train_ts=bot.train_ts,
            test_ts=bot.test_ts,
            scoring='roc_auc',
        )
    )

# Tabulate the records (one row per model) and display them.
results = pd.DataFrame.from_dict(results)
results

In [10]:
from sktime.utils.data_processing import from_2d_array_to_nested
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sktime.forecasting.model_selection import temporal_train_test_split

from sktime.classification.compose import (
    ColumnEnsembleClassifier,
    TimeSeriesForestClassifier,
)
from sktime.classification.dictionary_based import BOSSEnsemble
from sktime.classification.shapelet_based import MrSEQLClassifier
from sktime.datasets import load_basic_motions
from sktime.transformations.panel.compose import ColumnConcatenator

In [11]:
from sktime.transformations.panel.tsfresh import TSFreshRelevantFeatureExtractor

In [12]:
# Convert the flat feature table into sktime's nested format, with each row
# treated as one series, and take the label column as the target.
# NOTE(review): `columns.difference` returns the remaining column names
# sorted, so the values inside each series follow alphabetical column order
# rather than time order - confirm this is intended. The test-set cell builds
# X the same way, so the two must be changed together.
X = from_2d_array_to_nested(bot.train_ts[bot.train_ts.columns.difference(['takeprofit'])])
y = bot.train_ts.takeprofit

In [13]:
# Order-preserving split of the training data into fit / validation parts
# (the printed shapes below show roughly 75% / 25%).
X_train, X_test, y_train, y_test = temporal_train_test_split(X, y)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# Replace the targets' index with a fresh 0..n-1 index.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

(55571, 1) (55571,) (18524, 1) (18524,)


In [None]:
# Baseline: concatenate the series columns and fit a time-series forest.
# `Pipeline` here is sklearn's (imported in the sktime import cell), not the
# imblearn one used by model_selection.
steps = [
    #("tsfresh", TSFreshRelevantFeatureExtractor(n_jobs=os.cpu_count())),
    ("concatenate", ColumnConcatenator()),
    ("classify", TimeSeriesForestClassifier(n_estimators=100, n_jobs=os.cpu_count())),
]
clf = Pipeline(steps)
clf.fit(X_train, y_train)
# Score on the validation part; displayed as the cell output.
clf.score(X_test, y_test)

In [14]:
# Alternative model: MrSEQL classifier. This rebinds `clf`, so the cells
# below use whichever classifier was assigned last.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so
# `clf` may be left unfitted if the run is interrupted again.
clf = MrSEQLClassifier(n_jobs=os.cpu_count())
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

KeyboardInterrupt: 

In [None]:
# Build the hold-out features/target exactly as for the training data.
# This overwrites the training `X` / `y` defined earlier.
# NOTE(review): `columns.difference` returns the column names sorted; keep
# this in sync with the training cell so both use the same column order.
X = from_2d_array_to_nested(bot.test_ts[bot.test_ts.columns.difference(['takeprofit'])])
y = bot.test_ts.takeprofit

In [None]:
# Score of the last-assigned classifier on the hold-out set.
clf.score(X, y)

In [None]:
# Hold-out predictions, one per row of X.
y_out = clf.predict(X)

In [None]:
# Copy of the hold-out frame with the model's predictions attached. The
# plotting cell further up reads `test_ts.takepredict`, so it only works after
# this cell has run.
test_ts = bot.test_ts.copy()
test_ts['takepredict'] = y_out