In [172]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import make_scorer 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

import random

random.seed(611)

def information_coefficient(y_true, y_pred):
    rho, pval = spearmanr(y_true,y_pred) #spearman's rank correlation
    # print (rho)
    return rho

def sharpe(y_true, y_pred):
    positions = np.where(y_pred> 0,1,-1 )
    dailyRet = pd.Series(positions).shift(1).fillna(0).values * y_true
    dailyRet = np.nan_to_num(dailyRet)
    ratio = (252.0 ** (1.0/2.0)) * np.mean(dailyRet) / np.std(dailyRet)
    return ratio

in this notebook

we will analyze the timothy_generated indicators and compare those with talib generated indicators

for the same model, we will see how much increase in the accuracy, we will also look for the impact of entropy



In [173]:
df = pd.read_csv('./data/OEX_full.csv')
df['Date'] = pd.to_datetime(df['Date'], format = '%Y%m%d')

df = df.set_index('Date')

for n in list(range(1,30)):
    name = 'ret' + str(n)
    df[name] = df['Open'].pct_change(periods=n)#for trading with open

df['retFut1'] = df['Open'].pct_change(1).shift(-1).fillna(0)

In [174]:
df.head(5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,RSI_20,RSI_25,DT_RSI_2_20,STO_20_1,MADIFF_10_100_0,...,ret21,ret22,ret23,ret24,ret25,ret26,ret27,ret28,ret29,retFut1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-11,528.26001,529.73999,526.299988,528.609985,4255780000,,,,,,...,,,,,,,,,,-0.002499
2010-01-12,526.940002,526.940002,522.289978,524.289978,4716160000,,,,,,...,,,,,,,,,,-0.004194
2010-01-13,524.72998,529.419983,522.900024,527.929993,4170360000,,,,,,...,,,,,,,,,,0.006098
2010-01-14,527.929993,530.73999,527.5,529.599976,3915200000,,,,,,...,,,,,,,,,,0.002766
2010-01-15,529.390015,529.400024,522.059998,524.109985,4758730000,,,,,,...,,,,,,,,,,-0.009974


In [175]:
import talib as ta
df['RSI_20_ta'] = ta.RSI(np.array(df['Open']), timeperiod = 20)
df['RSI_25_ta'] = ta.RSI(np.array(df['Open']), timeperiod = 25)

In [176]:
df_rsi = df[['RSI_20','RSI_20_ta','RSI_25','RSI_25_ta']]
df_rsi

Unnamed: 0_level_0,RSI_20,RSI_20_ta,RSI_25,RSI_25_ta
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-11,,,,
2010-01-12,,,,
2010-01-13,,,,
2010-01-14,,,,
2010-01-15,,,,
...,...,...,...,...
2022-01-03,59.75192,58.551837,59.18160,58.294115
2022-01-04,58.54899,61.280878,58.20707,60.546426
2022-01-05,51.42509,58.552140,52.33847,58.339634
2022-01-06,50.65286,50.460456,51.69226,51.633016


In [177]:
print(df.columns)
indicator_col = [
       'RSI_20','RSI_20_ta', 'RSI_25','RSI_25_ta',
       'DT_RSI_2_20', 'STO_20_1', 'MADIFF_10_100_0', 'MADIFF_10_100',
       'MACD_10_100_5', 'LINTRND_10', 'PR_INT_0', 'PR_INT_20', 'CMMA_10_252',
       'ENT_2_10', 'ENT_4_16', 'FTI_LP', 'FTI_BP', 'FTI_BF'
]
base_col = ['ret1', 'ret2', 'ret3', 'ret4', 'ret5', 'ret6', 'ret7', 'ret8', 'ret9',
       'ret10', 'ret11', 'ret12', 'ret13', 'ret14', 'ret15', 'ret16', 'ret17',
       'ret18', 'ret19', 'ret20', 'ret21', 'ret22', 'ret23', 'ret24', 'ret25',
       'ret26', 'ret27', 'ret28', 'ret29']
df_model = df[base_col + ['retFut1']]

X_train = df_model.drop(['retFut1'], axis=1)
y_train = df_model[['retFut1']]

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'RSI_20', 'RSI_25',
       'DT_RSI_2_20', 'STO_20_1', 'MADIFF_10_100_0', 'MADIFF_10_100',
       'MACD_10_100_5', 'LINTRND_10', 'PR_INT_0', 'PR_INT_20', 'CMMA_10_252',
       'ENT_2_10', 'ENT_4_16', 'FTI_LP', 'FTI_BP', 'FTI_BF', 'ret1', 'ret2',
       'ret3', 'ret4', 'ret5', 'ret6', 'ret7', 'ret8', 'ret9', 'ret10',
       'ret11', 'ret12', 'ret13', 'ret14', 'ret15', 'ret16', 'ret17', 'ret18',
       'ret19', 'ret20', 'ret21', 'ret22', 'ret23', 'ret24', 'ret25', 'ret26',
       'ret27', 'ret28', 'ret29', 'retFut1', 'RSI_20_ta', 'RSI_25_ta'],
      dtype='object')


In [178]:
import antropy as ant
import random
def timothy_entropy(x):
    x = [i for i in x if str(i) != 'nan']
    nbins = 10
    hist, edges = np.histogram(x, bins=10)
    ent_sum = 0
    for i in hist:
        if(i != 0):
            p = i/len(x)
            ent_sum -= p * np.log(p)
    return(ent_sum)

entropy_list = []

for i in indicator_col:
    tim = timothy_entropy(df[i])
    ent = ant.perm_entropy(df[i], normalize=True)
    res_dict = {'col':i, 'timothy':tim, 'antropy':ent}
    entropy_list.append(res_dict)

df_entropy = pd.DataFrame(entropy_list)
df_entropy['timothy_rank'] = df_entropy['timothy'].rank()
df_entropy['antropy_rank'] = df_entropy['antropy'].rank()
df_entropy

Unnamed: 0,col,timothy,antropy,timothy_rank,antropy_rank
0,RSI_20,1.769358,0.938986,5.0,12.0
1,RSI_20_ta,1.827064,0.951404,7.0,15.0
2,RSI_25,1.738932,0.93909,4.0,13.0
3,RSI_25_ta,1.779821,0.951016,6.0,14.0
4,DT_RSI_2_20,1.935188,0.93606,9.0,11.0
5,STO_20_1,2.101656,0.820284,16.0,7.0
6,MADIFF_10_100_0,2.048536,0.656678,14.0,3.0
7,MADIFF_10_100,2.03419,0.651896,11.0,2.0
8,MACD_10_100_5,1.402384,0.844895,1.0,9.0
9,LINTRND_10,1.662271,0.77494,2.0,6.0


In [179]:
sharpe_scorer = make_scorer(sharpe, greater_is_better=True)
spearmanr_scorer = make_scorer(information_coefficient, greater_is_better=True)
scoring = {"rmse": "neg_root_mean_squared_error", 'sharpe': sharpe_scorer, 'spearmanr': spearmanr_scorer}


In [180]:
split = TimeSeriesSplit(n_splits=5)


In [200]:
def tuning_model(X_train, y_train, pipeline, param_grid):

    grid_search = GridSearchCV(pipeline, param_grid, cv=split, scoring=scoring, refit='sharpe', return_train_score=True)
    grid_search.fit(X_train, y_train.values.ravel())
    best_parameters = grid_search.best_params_
    best_model = grid_search.best_estimator_
    results = pd.DataFrame(grid_search.cv_results_)
    return(grid_search, results, grid_search.best_score_*100)

def calculateMaxDD(cumret):
    highwatermark = np.zeros(len(cumret))
    drawdown      = np.zeros(len(cumret))
    drawdownduration = np.zeros(len(cumret))
    for t in range(1, len(cumret)):
        highwatermark[t] = np.max([highwatermark[t-1], cumret[t]])
        drawdown[t] = (1+cumret[t]) / (1 + highwatermark[t]) - 1
        if (drawdown[t]==0):
            drawdownduration[t] = 0
        else:
            drawdownduration[t] = drawdownduration[t-1] + 1
    return np.min(drawdown), np.max(drawdownduration)

In [197]:
def extra_model_eva(grid_search, X, y):
    positions = np.where(grid_search.predict(X)> 0,1,-1 ) #POSITIONS
    dailyRet = pd.Series(positions).fillna(0).values * y.retFut1 #for trading right after the open
    dailyRet = dailyRet.fillna(0)
    cumret = np.cumprod(dailyRet + 1) - 1
    cagr = (1 + cumret[-1]) ** (252 / len(cumret)) - 1
    maxDD, maxDDD = calculateMaxDD(cumret)
    ratio = (252.0 ** (1.0/2.0)) * np.mean(dailyRet) / np.std(dailyRet)
    ## return CAGR, Sharpe ratio, Calmar
    return(cagr, ratio, -cagr/maxDD)


In [202]:
def model_evaluation(df, model_name, pipe, param_grid):
    df_model = df[base_col + ['retFut1']]
    X_train = df_model.drop(['retFut1'], axis=1)
    y_train = df_model[['retFut1']]

    grid_search, res, eva = tuning_model(X_train, y_train, pipe, param_grid)

    base_sharpe = np.max(res['mean_test_sharpe'])
    base_rmse = np.max(res['mean_test_rmse'])
    base_spearmanr = np.max(res['mean_test_spearmanr'])
    CAGR, Sharpe_ratio, Calmar = extra_model_eva(grid_search, X_train, y_train)

    ind_result = [{'col': 'base', 
            'model_name':model_name,
            'test_sharpe': base_sharpe ,
            'test_rmse': base_rmse, 
            'test_spearmanr': base_spearmanr,
            'train_cagr':CAGR,
            'train_sharpe_ratio':Sharpe_ratio,
            'calmar':Calmar
            }]

    for ind in indicator_col:
        df_temp = df[base_col + ['retFut1'] + [ind]]
        X_train_temp = df_temp.drop(['retFut1'], axis=1)
        y_train_temp = df_temp[['retFut1']]
        grid_search, cv_res, cv_score = tuning_model(X_train_temp, y_train_temp, pipe, param_grid)
        CAGR, Sharpe_ratio, Calmar = extra_model_eva(grid_search, X_train_temp, y_train_temp)
        ind_dict = {'col': ind, 
            'model_name':model_name,
            'test_sharpe': np.max(cv_res['mean_test_sharpe']) , 
            'test_rmse': np.max(cv_res['mean_test_rmse']), 
            'test_spearmanr': np.max(cv_res['mean_test_spearmanr']),
            'train_cagr':CAGR,
            'train_sharpe_ratio':Sharpe_ratio,
            'calmar':Calmar
            }
        ind_result.append(ind_dict)

    return(ind_result)



In [203]:

numeric_sub_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value = 0)),
    ('scaler', StandardScaler())])
ridge = Ridge(max_iter=5000) 
a_rs = np.logspace(-40, 0, num=100, endpoint = True)

ridge_pipe = Pipeline(steps=[('preprocessor', numeric_sub_pipeline),('ridge', ridge)])
ridge_param_grid = [{ 'ridge__alpha': a_rs }]

ridge_model_res = model_evaluation(df, 'ridge', ridge_pipe, ridge_param_grid)
ridge_model_df = pd.DataFrame(ridge_model_res)

from sklearn.ensemble import RandomForestRegressor
rf_pipe = Pipeline(steps=[('preprocessor', numeric_sub_pipeline),('rf', RandomForestRegressor())])
rf_param_grid = [{ 'rf__n_estimators': [100] , 'rf__max_depth':[10,15,20]}]

rf_model_res = model_evaluation(df, 'rf', rf_pipe, rf_param_grid)
rf_model_df = pd.DataFrame(rf_model_res)

from sklearn.ensemble import GradientBoostingRegressor
gb_pipe = Pipeline(steps=[('preprocessor', numeric_sub_pipeline),('gb', GradientBoostingRegressor())])
gb_param_grid = [{ 'gb__n_estimators': [100] , 'gb__max_depth':[5,10,15,20]}]

gb_model_res = model_evaluation(df, 'gb', gb_pipe, gb_param_grid)
gb_model_df = pd.DataFrame(gb_model_res)

myres = pd.concat([ridge_model_df, rf_model_df, gb_model_df])

In [209]:
ridge_model_df.sort_values('train_cagr',ascending=False)



Unnamed: 0,col,model_name,test_sharpe,test_rmse,test_spearmanr,train_cagr,train_sharpe_ratio,calmar
0,base,ridge,0.699042,-0.008724,0.021218,0.292899,1.800394,1.937451
1,RSI_20,ridge,0.707285,-0.008646,0.069843,0.444851,2.563219,2.196329
2,RSI_20_ta,ridge,0.678488,-0.008738,0.010711,0.2834,1.750258,1.874617
3,RSI_25,ridge,0.7621,-0.008672,0.052537,0.3831,2.261584,1.801342
4,RSI_25_ta,ridge,0.682714,-0.00873,0.016375,0.296149,1.817471,1.958952
5,DT_RSI_2_20,ridge,0.137645,-0.007859,0.590651,1.829139,7.819101,8.007001
6,STO_20_1,ridge,0.877883,-0.008656,0.064578,0.395125,2.321216,1.79261
7,MADIFF_10_100_0,ridge,0.56874,-0.008726,0.020554,0.24068,1.52079,1.238898
8,MADIFF_10_100,ridge,0.516945,-0.008726,0.02051,0.226695,1.444163,1.236189
9,MACD_10_100_5,ridge,-0.401044,-0.008074,0.323621,1.031251,5.039504,3.798076


In [195]:
rf_model_df.sort_values('test_spearmanr',ascending=False)

Unnamed: 0,col,model_name,test_sharpe,test_rmse,test_spearmanr
11,PR_INT_0,rf,-0.295914,-0.005018,0.870618
13,CMMA_10_252,rf,-0.219324,-0.006039,0.725141
5,DT_RSI_2_20,rf,-0.044046,-0.007364,0.646322
1,RSI_20,rf,-0.270911,-0.007511,0.503777
12,PR_INT_20,rf,-0.440523,-0.008059,0.480013
3,RSI_25,rf,-0.133355,-0.007733,0.446594
9,MACD_10_100_5,rf,0.328637,-0.008081,0.389748
6,STO_20_1,rf,-0.139632,-0.008591,0.251403
10,LINTRND_10,rf,-0.02858,-0.008662,0.251159
18,FTI_BF,rf,0.37122,-0.008839,0.041577


In [193]:
gb_model_df.sort_values('test_sharpe',ascending=False)

Unnamed: 0,col,model_name,test_sharpe,test_rmse,test_spearmanr
0,base,gb,0.644097,-0.009301,0.043347
8,MADIFF_10_100,gb,0.590343,-0.009311,0.031459
17,FTI_BP,gb,0.56625,-0.009278,0.047142
9,MACD_10_100_5,gb,0.565618,-0.008216,0.387981
4,RSI_25_ta,gb,0.523419,-0.009312,0.042538
2,RSI_20_ta,gb,0.492486,-0.009332,0.051516
15,ENT_4_16,gb,0.415138,-0.009324,0.043653
6,STO_20_1,gb,0.363858,-0.008666,0.259963
18,FTI_BF,gb,0.33415,-0.009245,0.048774
14,ENT_2_10,gb,0.308742,-0.009262,0.045844


In [187]:
# result_df.sort_values('test_sharpe',ascending=False)