In [16]:
import datetime
import pickle
import random
from pathlib import Path
from sys import exit

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as sps
import sklearn.linear_model as skl_lm
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler

import fractionalDiff as fd
import utilities as utils

In [138]:
# Column names of the modelled series: one `_BL` and one `_P` entry per zone prefix.
tickers = [
    'CALA_BL', 'CALA_P',
    'CNOR_BL', 'CNOR_P',
    'CSUD_BL', 'CSUD_P',
    'NORD_BL', 'NORD_P',
    'SARD_BL', 'SARD_P',
    'SICI_BL', 'SICI_P',
    'SUD_BL', 'SUD_P',
]

# Price inputs and target ("logic") outputs live in two separate workbooks;
# index_col=False keeps every sheet column as a regular column.
inputs = pd.read_excel('data/cct_prices.xlsx', index_col=False)
logicOutputs = pd.read_excel('data/cct_logics.xlsx', index_col=False)

# Every column except the first one is treated as a model input
# (presumably the first column is `date` -- confirm against the workbook).
inputVariables = inputs.columns[1:]

# maWindow = 11


# inputs['weekDay'] = inputs['date'].apply(lambda x: datetime.datetime.weekday(x))
# inputs['yearWeek'] = pd.DatetimeIndex(inputs['date']).isocalendar().week[0] * 10 + inputs['date'].apply(lambda x: datetime.datetime.weekday(x))

# for rollWndw in range(4, maWindow):
#     inputs[[f'mah{rollWndw}', f'mal{rollWndw}', f'mac{rollWndw}', f'mav{rollWndw}', f'maubb{rollWndw}', f'malbb{rollWndw}']] = inputs[['high', 'low', 'close', 'volume', 'UpperBB', 'LowerBB']].rolling(maWindow).mean()

# inputs.dropna(inplace = True)
# inputs.reset_index(inplace = True)
# logicOutputs = logicOutputs.iloc[(maWindow-1):, :]
# inputs.shape, logicOutputs.shape

In [141]:
def TrendFollowOutput(data, date, ticker=None):
    """Return the trend-following signal for one series on one date.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``date`` column and one column per ticker.
    date : scalar
        Value matched (with ``==``) against ``data.date``.
    ticker : str, optional
        Column of ``data`` to read. When omitted, the notebook-level variable
        ``ticker`` is used, which is how the original two-argument form worked.

    Returns
    -------
    float
        ``1.0`` when the first matching value is ``>= 0``, otherwise ``-1.0``.

    Raises
    ------
    NameError
        If ``ticker`` is omitted and no global ``ticker`` is defined.
    IndexError
        If ``data`` has no row for ``date``.
    """
    if ticker is None:
        # Legacy behaviour: fall back to the loop variable leaked by another cell.
        ticker = globals().get('ticker')
        if ticker is None:
            raise NameError("name 'ticker' is not defined")

    matches = data.loc[data.date == date, ticker].values
    if len(matches) == 0:
        # Same exception type the bare `[0]` lookup raised, with a usable message.
        raise IndexError(f'no row found for date {date!r}')

    prediction = 2.*(matches[0] >= 0) - 1.
    return prediction

def CreatePca(X, XMeans, XStds, nComponents):
    """Project standardised data onto its leading principal components via SVD.

    Parameters
    ----------
    X : array-like, shape (nSamples, nFeatures)
        Observations in rows. DataFrames are accepted and converted to arrays.
    XMeans, XStds : array-like or scalar
        Values subtracted from / divided into ``X`` column-wise (broadcast).
        Zero entries of ``XStds`` are treated as 1 so that constant columns
        become all-zero instead of NaN.
    nComponents : int
        Number of leading components to keep in the projection.

    Returns
    -------
    principalComponents : numpy.ndarray, shape (nSamples, nComponents)
        Standardised data multiplied by the right singular vectors.
    explainedVariances : numpy.ndarray
        Share of total variance for *every* component (not only the kept ones);
        sums to 1, or is all zeros when the data carry no variance.
    """
    X = np.asarray(X, dtype=float)
    XMeans = np.asarray(XMeans, dtype=float)
    XStds = np.asarray(XStds, dtype=float)

    # A constant column has std 0; dividing by it would give 0/0 = NaN and
    # make the SVD below fail, so leave such columns at zero instead.
    safeStds = np.where(XStds == 0, 1.0, XStds)
    XZeroMeans = (X - XMeans) / safeStds

    _, s, Vh = np.linalg.svd(XZeroMeans, full_matrices=False)
    principalComponents = (XZeroMeans @ Vh.T)[:, :nComponents]

    # The degrees-of-freedom factor cancels in the ratio; the floor only
    # prevents a division by zero for a single-row input.
    degreesOfFreedom = max(XZeroMeans.shape[0] - 1, 1)
    explainedVariances = s**2 / degreesOfFreedom
    totalVariance = np.sum(explainedVariances)
    if totalVariance > 0:
        explainedVariances = explainedVariances / totalVariance
    return principalComponents, explainedVariances

In [142]:
def GetKernelColumns(X, Y):
    """Pick the kernel-PCA components whose per-class means lie furthest apart.

    A polynomial KernelPCA with as many components as ``X`` has rows is fitted
    on ``X``. Each component is averaged per value of ``Y`` (column ``dir``),
    the Euclidean distance between the class-mean vectors of every pair of
    components is tabulated, and the positions of the largest entry are returned.

    Returns
    -------
    tuple
        ``(rowIndices, Xkpca, maxDistance)``: row indices of the maximal
        entries of the pairwise distance matrix, the full kernel-PCA
        projection of ``X``, and the maximal distance value(s).
    """
    projector = KernelPCA(n_components=X.shape[0], kernel='poly', gamma=0.01)
    Xkpca = projector.fit(X).transform(X)

    componentNames = [f'pc_{k + 1}' for k in range(Xkpca.shape[1])]
    componentsDf = pd.DataFrame(Xkpca, columns=componentNames)
    labelsDf = pd.DataFrame(Y, columns=['dir'])

    # One row per class, one column per component.
    classMeans = (
        pd.concat([componentsDf, labelsDf], axis=1)
        .groupby(['dir'])
        .mean()
        .reset_index(drop=True)
    )
    names = classMeans.columns

    def _separation(first, second):
        # Euclidean distance between the class-mean vectors of two components.
        delta = classMeans[first].values - classMeans[second].values
        return np.sqrt(np.sum(delta * delta))

    pairwise = np.array(
        [_separation(first, second) for first in names for second in names]
    ).reshape((len(names), len(names)))

    # The matrix is symmetric, so a maximum at (i, j) also shows up at (j, i).
    peakLocations = np.where(pairwise == np.max(pairwise))

    return peakLocations[0], Xkpca, pairwise[peakLocations]

In [170]:
# Walk-forward evaluation: for every fractional-differencing order and every
# recalibration window, refit a logistic regression per ticker on the trailing
# window and score it on the held-out part returned by utils.TestTrainSplit.
scoresByOrderLst = []
mlModels = []
window_0 = 2
deltaWindow = 22
cutoff = 1e-2
orders = np.linspace(0.1, 1.0, 10)

windows = range(window_0, window_0 + deltaWindow + 1)
# The cutoff-derived lag count is computed and then overridden by a fixed value;
# only the fixed value below is used from here on.
lags = np.array([fd.findCutoff(order, cutoff, 1) for order in orders]).max()
lags = 6

# Not read again in this cell; kept because other cells may rely on it.
firstOrderDifferences = fd.FractionallyDifferentiateInputs(inputs, inputVariables, order = 1.0, thresholdVal = lags, cutoff=False)[lags:].reset_index()

transformedInputsLst = []
for order in orders:
    print(f'Calculating differencing orders: {order}')
    transformedInputs = fd.FractionallyDifferentiateInputs(inputs, inputVariables, order = order, thresholdVal = lags, cutoff=False)[lags:].reset_index(drop=True)
    transformedInputsLst.append(transformedInputs)

# Not read again in this cell; kept because other cells may rely on it.
transformedInputsAllOrders = pd.concat(transformedInputsLst, axis = 1).drop(columns = 'date')

recapColumns = ['window', 'simulationDate', 'ticker', 'trainScore', 'testScore']

for order, transformedInputs in zip(orders, transformedInputsLst):
    print(f'Differencing order: {order}')
    simRecapDatesLst = []

    for window in windows:
        print(f'Recalibration window: {window}')
        simRecords = []  # one (window, date, ticker, trainScore, testScore) tuple per fitted model
        simulationDates = transformedInputs.date.unique()[(window):(transformedInputs.date.shape[0] - 1)]
        # Computed once per window instead of once per (date, ticker) pair.
        lastSimulationDate = max(simulationDates) if len(simulationDates) else None

        for simulationDate in simulationDates:
            simulationIdx = transformedInputs.index[transformedInputs.date == simulationDate][0]
            startDateIdx = simulationIdx - window
            stopDateIdx = simulationIdx + 1

            startDate = transformedInputs.date[startDateIdx]
            stopDate = transformedInputs.date[stopDateIdx]

            # Neither call receives the ticker, so the scaled feature block is
            # shared by all tickers of this date and built only once.
            inputsToScale = utils.FilterInputs(transformedInputs, startDate, stopDate)
            x = utils.SelectAndScaleInputs(inputsToScale, alreadyScaled = False)
            x = np.nan_to_num(x)

            for ticker in tickers:
                y = utils.SelectLogicOutputs(startDate, stopDate, ticker, logicOutputs[lags:])

                # NOTE(review): component selection receives the full `y`, i.e.
                # before utils.TestTrainSplit separates the test part. If `y`
                # contains the test label this is look-ahead -- confirm.
                kPCAComponents, xKPCA, _ = GetKernelColumns(x, y)
                xKPCACut = xKPCA[:, kPCAComponents]

                xTrain, xTest, yTrain, yTest = utils.TestTrainSplit(xKPCACut, y, window)

                # A classifier cannot be fitted on a single class; skip this ticker/date.
                if len(np.unique(yTrain)) < 2:
                    continue

                lrm = skl_lm.LogisticRegression(random_state=0, verbose = 0).fit(xTrain, yTrain)

                if simulationDate == lastSimulationDate:
                    # Keep the most recent fit; the ticker is stored so the
                    # models of one (order, window) pair can be told apart.
                    mlModels.append({'order': order, 'ticker': ticker, 'calibrationWindow': window, 'model': lrm})

                trainScore = lrm.score(xTrain, yTrain)
                testScore = lrm.score(xTest, yTest)
                simRecords.append((window, simulationDate, ticker, trainScore, testScore))

        simRecapDf = pd.DataFrame(simRecords, columns = recapColumns)
        simRecapDatesLst.append(simRecapDf)

    simRecapWithWindowsDf = pd.concat(simRecapDatesLst)
    simRecapWithWindowsDf['order'] = order
    scoresByOrderLst.append(simRecapWithWindowsDf)

overallScores = pd.concat(scoresByOrderLst)
#overallScores.loc[overallScores.ticker.str.contains('JPY'), 'strategyReturn'] = overallScores.loc[overallScores.ticker.str.contains('JPY'), 'strategyReturn']/100.

Calculating differencing orders: 0.1
Calculating differencing orders: 0.2
Calculating differencing orders: 0.30000000000000004
Calculating differencing orders: 0.4
Calculating differencing orders: 0.5
Calculating differencing orders: 0.6
Calculating differencing orders: 0.7000000000000001
Calculating differencing orders: 0.8
Calculating differencing orders: 0.9
Calculating differencing orders: 1.0
Differencing order: 0.1
Recalibration window: 2
Recalibration window: 3
Recalibration window: 4
Recalibration window: 5
Recalibration window: 6
Recalibration window: 7
Recalibration window: 8
Recalibration window: 9
Recalibration window: 10
Recalibration window: 11
Recalibration window: 12
Recalibration window: 13
Recalibration window: 14
Recalibration window: 15
Recalibration window: 16
Recalibration window: 17
Recalibration window: 18
Recalibration window: 19
Recalibration window: 20
Recalibration window: 21
Recalibration window: 22
Recalibration window: 23
Recalibration window: 24
Differen

In [171]:
# Test-score summary per (ticker, order, window).
# String aggregation names replace the NumPy callables: pandas already mapped
# np.mean/np.std to its own mean/std (sample std, ddof=1) and deprecates
# passing the NumPy functions, so the numbers are unchanged.
minObservations = 12  # drop groups scored on fewer simulation dates than this

aggScores = (
    overallScores
    .groupby(['ticker', 'order', 'window'])
    .agg({'testScore': ['size', 'mean', sps.skew, sps.kurtosis, 'std']})
    .droplevel(level=[0], axis = 1)
    .reset_index()
)
aggScores.columns = ['ticker', 'order', 'window', 'count', 'testScore', 'skew', 'kurt', 'testStd']

# Sort first, then filter with a mask built from the sorted frame itself.
aggScores = aggScores.sort_values(by = ['testScore'], ascending = False)
aggScores = aggScores.loc[aggScores['count'] >= minObservations]
aggScores

Unnamed: 0,ticker,order,window,count,testScore,skew,kurt,testStd
660,CNOR_P,0.2,11,12,0.833333,-1.788854,1.200000,0.389249
1731,SARD_BL,0.3,11,12,0.833333,-1.788854,1.200000,0.389249
211,CALA_P,0.1,3,16,0.812500,-1.601282,0.564103,0.403113
1707,SARD_BL,0.2,8,15,0.800000,-1.500000,0.250000,0.414039
1728,SARD_BL,0.3,8,15,0.800000,-1.500000,0.250000,0.414039
...,...,...,...,...,...,...,...,...
867,CSUD_BL,0.2,8,15,0.200000,1.500000,0.250000,0.414039
1033,CSUD_BL,1.0,6,17,0.176471,1.697337,0.880952,0.392953
1456,NORD_BL,1.0,9,14,0.142857,2.041241,2.166667,0.363137
2484,SICI_P,0.9,8,15,0.133333,2.157277,2.653846,0.351866


In [346]:
# Summary per (order, window) of the model and of the alternative model.
# NOTE(review): strategyReturn, alternativeMdlTestScore,
# alternativeMdlStrategyReturn and assetReturn are not written by the
# simulation cell shown in this notebook; this cell presumably belongs to an
# earlier version of the loop -- confirm before re-running.
# String aggregation names replace the deprecated NumPy callables
# (pandas already mapped np.mean/np.std/np.sum to its own implementations).
aggScores = (
    overallScores
    .groupby(['order', 'window'])
    .agg({
        'testScore': ['size', 'mean', sps.skew, sps.kurtosis, 'std'],
        'strategyReturn': ['sum'],
        'alternativeMdlTestScore': ['mean', sps.skew, sps.kurtosis, 'std'],
        'alternativeMdlStrategyReturn': ['sum'],
        'assetReturn': ['sum'],
    })
    .droplevel(level=[0], axis = 1)
    .reset_index()
)
aggScores.columns = ['order', 'window', 'count', 'testScore', 'skew', 'kurt', 'testStd',
    'strategyReturn', 'alternativeMdlTestScore', 'amSkew', 'amKurt', 'amtestStd', 'alternativeMdlStrategyReturn', 'assetReturn']

aggScores.sort_values(by = ['strategyReturn'], ascending = False)

Unnamed: 0,order,window,count,testScore,skew,kurt,testStd,strategyReturn,alternativeMdlTestScore,amSkew,amKurt,amtestStd,alternativeMdlStrategyReturn,assetReturn
96,0.5,6,231,0.597403,-0.397220,-1.842216,0.491486,0.219353,0.454545,0.182574,-1.966667,0.499011,-0.065420,-0.442274
105,0.5,15,230,0.621739,-0.502066,-1.747930,0.486011,0.204044,0.460870,0.157003,-1.975350,0.499554,-0.018996,-0.501293
52,0.3,8,234,0.572650,-0.293715,-1.913731,0.495754,0.192007,0.465812,0.137073,-1.981211,0.499899,-0.004341,-0.516664
127,0.6,14,231,0.627706,-0.528346,-1.720850,0.484466,0.191392,0.463203,0.147586,-1.978218,0.499727,-0.018689,-0.503600
173,0.8,14,231,0.614719,-0.471451,-1.777734,0.487719,0.186039,0.463203,0.147586,-1.978218,0.499727,-0.018689,-0.503600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,1.0,2,132,0.507576,-0.030307,-1.999082,0.501847,-0.163413,0.477273,0.091003,-1.991718,0.501386,-0.037308,-0.274630
161,0.8,2,132,0.477273,0.091003,-1.991718,0.501386,-0.170243,0.477273,0.091003,-1.991718,0.501386,-0.037308,-0.274630
3,0.1,5,225,0.462222,0.151544,-1.977034,0.499682,-0.206079,0.457778,0.169494,-1.971272,0.499325,-0.031636,-0.450936
26,0.2,5,225,0.457778,0.169494,-1.971272,0.499325,-0.207025,0.457778,0.169494,-1.971272,0.499325,-0.031636,-0.450936


In [249]:
# Summary per (order, window) of the model and of the alternative model.
# NOTE(review): strategyReturn, alternativeMdlTestScore,
# alternativeMdlStrategyReturn and assetReturn are not written by the
# simulation cell shown in this notebook; this cell presumably belongs to an
# earlier version of the loop -- confirm before re-running.
# String aggregation names replace the deprecated NumPy callables
# (pandas already mapped np.mean/np.std/np.sum to its own implementations).
aggScores = (
    overallScores
    .groupby(['order', 'window'])
    .agg({
        'testScore': ['size', 'mean', sps.skew, sps.kurtosis, 'std'],
        'strategyReturn': ['sum'],
        'alternativeMdlTestScore': ['mean', sps.skew, sps.kurtosis, 'std'],
        'alternativeMdlStrategyReturn': ['sum'],
        'assetReturn': ['sum'],
    })
    .droplevel(level=[0], axis = 1)
    .reset_index()
)
aggScores.columns = ['order', 'window', 'count', 'testScore', 'skew', 'kurt', 'testStd',
    'strategyReturn', 'alternativeMdlTestScore', 'amSkew', 'amKurt', 'amtestStd', 'alternativeMdlStrategyReturn', 'assetReturn']

aggScores.sort_values(by = ['strategyReturn'], ascending = False)

Unnamed: 0,order,window,count,testScore,skew,kurt,testStd,strategyReturn,alternativeMdlTestScore,amSkew,amKurt,amtestStd,alternativeMdlStrategyReturn,assetReturn
96,0.5,6,231,0.597403,-0.397220,-1.842216,0.491486,0.219353,0.454545,0.182574,-1.966667,0.499011,-0.065420,-0.442274
105,0.5,15,230,0.621739,-0.502066,-1.747930,0.486011,0.204044,0.460870,0.157003,-1.975350,0.499554,-0.018996,-0.501293
52,0.3,8,234,0.572650,-0.293715,-1.913731,0.495754,0.192007,0.465812,0.137073,-1.981211,0.499899,-0.004341,-0.516664
127,0.6,14,231,0.627706,-0.528346,-1.720850,0.484466,0.191392,0.463203,0.147586,-1.978218,0.499727,-0.018689,-0.503600
173,0.8,14,231,0.614719,-0.471451,-1.777734,0.487719,0.186039,0.463203,0.147586,-1.978218,0.499727,-0.018689,-0.503600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,1.0,2,132,0.507576,-0.030307,-1.999082,0.501847,-0.163413,0.477273,0.091003,-1.991718,0.501386,-0.037308,-0.274630
161,0.8,2,132,0.477273,0.091003,-1.991718,0.501386,-0.170243,0.477273,0.091003,-1.991718,0.501386,-0.037308,-0.274630
3,0.1,5,225,0.462222,0.151544,-1.977034,0.499682,-0.206079,0.457778,0.169494,-1.971272,0.499325,-0.031636,-0.450936
26,0.2,5,225,0.457778,0.169494,-1.971272,0.499325,-0.207025,0.457778,0.169494,-1.971272,0.499325,-0.031636,-0.450936


In [242]:
# Summary per (order, window) of the model and of the alternative model.
# NOTE(review): strategyReturn, alternativeMdlTestScore,
# alternativeMdlStrategyReturn and assetReturn are not written by the
# simulation cell shown in this notebook; this cell presumably belongs to an
# earlier version of the loop -- confirm before re-running.
# String aggregation names replace the deprecated NumPy callables
# (pandas already mapped np.mean/np.std/np.sum to its own implementations).
aggScores = (
    overallScores
    .groupby(['order', 'window'])
    .agg({
        'testScore': ['size', 'mean', sps.skew, sps.kurtosis, 'std'],
        'strategyReturn': ['sum'],
        'alternativeMdlTestScore': ['mean', sps.skew, sps.kurtosis, 'std'],
        'alternativeMdlStrategyReturn': ['sum'],
        'assetReturn': ['sum'],
    })
    .droplevel(level=[0], axis = 1)
    .reset_index()
)
aggScores.columns = ['order', 'window', 'count', 'testScore', 'skew', 'kurt', 'testStd',
    'strategyReturn', 'alternativeMdlTestScore', 'amSkew', 'amKurt', 'amtestStd', 'alternativeMdlStrategyReturn', 'assetReturn']

aggScores.sort_values(by = ['strategyReturn'], ascending = False)

Unnamed: 0,order,window,count,testScore,skew,kurt,testStd,strategyReturn,alternativeMdlTestScore,amSkew,amKurt,amtestStd,alternativeMdlStrategyReturn,assetReturn
13,0.8,11,234,0.619658,-0.492957,-1.756993,0.486512,0.179948,0.465812,0.137073,-1.981211,0.499899,-0.016425,-0.501337
1,0.6,9,235,0.591489,-0.372242,-1.861436,0.492608,0.168449,0.468085,0.12792,-1.983636,0.500045,-0.007502,-0.512261
11,0.8,9,235,0.591489,-0.372242,-1.861436,0.492608,0.15939,0.468085,0.12792,-1.983636,0.500045,-0.007502,-0.512261
10,0.8,8,234,0.57265,-0.293715,-1.913731,0.495754,0.148272,0.465812,0.137073,-1.981211,0.499899,-0.004341,-0.516664
5,0.7,8,234,0.581197,-0.329155,-1.891657,0.494421,0.135907,0.465812,0.137073,-1.981211,0.499899,-0.004341,-0.516664
19,0.9,12,233,0.587983,-0.35751,-1.872187,0.493258,0.124671,0.463519,0.146313,-1.978593,0.499741,-0.021548,-0.50646
0,0.6,8,234,0.564103,-0.258544,-1.933155,0.496937,0.117272,0.465812,0.137073,-1.981211,0.499899,-0.004341,-0.516664
18,0.9,11,234,0.581197,-0.329155,-1.891657,0.494421,0.100532,0.465812,0.137073,-1.981211,0.499899,-0.016425,-0.501337
14,0.8,12,233,0.553648,-0.215838,-1.953414,0.498184,0.088862,0.463519,0.146313,-1.978593,0.499741,-0.021548,-0.50646
15,0.9,8,234,0.551282,-0.206216,-1.957475,0.498429,0.088182,0.465812,0.137073,-1.981211,0.499899,-0.004341,-0.516664


In [12]:
# Summary per (order, window) of the model and of the alternative model.
# NOTE(review): strategyReturn, alternativeMdlTestScore,
# alternativeMdlStrategyReturn and assetReturn are not written by the
# simulation cell shown in this notebook; this cell presumably belongs to an
# earlier version of the loop -- confirm before re-running.
# String aggregation names replace the deprecated NumPy callables
# (pandas already mapped np.mean/np.std/np.sum to its own implementations).
aggScores = (
    overallScores
    .groupby(['order', 'window'])
    .agg({
        'testScore': ['size', 'mean', sps.skew, sps.kurtosis, 'std'],
        'strategyReturn': ['sum'],
        'alternativeMdlTestScore': ['mean', sps.skew, sps.kurtosis, 'std'],
        'alternativeMdlStrategyReturn': ['sum'],
        'assetReturn': ['sum'],
    })
    .droplevel(level=[0], axis = 1)
    .reset_index()
)
aggScores.columns = ['order', 'window', 'count', 'testScore', 'skew', 'kurt', 'testStd',
    'strategyReturn', 'alternativeMdlTestScore', 'amSkew', 'amKurt', 'amtestStd', 'alternativeMdlStrategyReturn', 'assetReturn']

aggScores.sort_values(by = ['strategyReturn'], ascending = False)

Unnamed: 0,order,window,count,testScore,skew,kurt,testStd,strategyReturn,alternativeMdlTestScore,amSkew,amKurt,amtestStd,alternativeMdlStrategyReturn,assetReturn
57,0.7,11,234,0.581197,-0.329155,-1.891657,0.494421,0.186966,0.465812,0.137073,-1.981211,0.499899,-0.016425,-0.501337
66,0.8,11,234,0.572650,-0.293715,-1.913731,0.495754,0.144149,0.465812,0.137073,-1.981211,0.499899,-0.016425,-0.501337
56,0.7,10,235,0.565957,-0.266156,-1.929161,0.496688,0.106755,0.463830,0.145061,-1.978957,0.499754,-0.019393,-0.500370
58,0.7,12,233,0.519313,-0.077311,-1.994023,0.500702,0.102914,0.463519,0.146313,-1.978593,0.499741,-0.021548,-0.506460
48,0.6,11,234,0.568376,-0.276098,-1.923770,0.496364,0.098438,0.465812,0.137073,-1.981211,0.499899,-0.016425,-0.501337
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,1.0,12,233,0.480687,0.077311,-1.994023,0.500702,-0.148029,0.463519,0.146313,-1.978593,0.499741,-0.021548,-0.506460
8,0.1,16,229,0.510917,-0.043679,-1.998092,0.500976,-0.150112,0.462882,0.148882,-1.977834,0.499713,-0.016106,-0.502183
76,0.9,12,233,0.489270,0.042928,-1.998157,0.500961,-0.161873,0.463519,0.146313,-1.978593,0.499741,-0.021548,-0.506460
78,0.9,14,231,0.480519,0.077981,-1.993919,0.500705,-0.210426,0.463203,0.147586,-1.978218,0.499727,-0.018689,-0.503600


In [172]:
# Persist the per-date scores. The output folder is created first because
# to_csv does not create missing directories and would fail on a fresh checkout.
scoresPath = Path('out_data/cct_2022-06.csv')
scoresPath.parent.mkdir(parents=True, exist_ok=True)
overallScores.to_csv(scoresPath, index = False)

In [None]:
# Two-component PCA scatter, coloured by target class.
# NOTE: `x` and `y` are not defined in this cell; they are whatever the
# simulation loop assigned last (final date / final ticker of the last run).
xMeans = np.mean(x, axis = 0)
xStds = np.std(x, axis = 0)
# Constant columns have std 0; divide them by 1 so they stay at zero instead of NaN.
xStds = np.where(xStds == 0, 1.0, xStds)

# CreatePca takes (X, XMeans, XStds, nComponents); the stds were previously
# omitted, which shifted `2` into the XStds slot and left nComponents unset.
principalComponents = CreatePca(x, xMeans, xStds, 2)[0]

finalDf = pd.concat(
    [
        pd.DataFrame(principalComponents, columns = ['Principal component 1', 'Principal component 2']),
        pd.DataFrame(y, columns = ['target']),
    ],
    axis = 1,
)

fig, ax = plt.subplots(figsize = (8, 8))
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)

targets = [1., -1.]
colors = ['r', 'b']
for target, color in zip(targets, colors):
    indicesToKeep = finalDf['target'] == target
    ax.scatter(
        finalDf.loc[indicesToKeep, 'Principal component 1'],
        finalDf.loc[indicesToKeep, 'Principal component 2'],
        c = color,
        s = 50,
    )
ax.legend(targets)
ax.grid()