# Transaction Cost analysis and Statistical Tests



# Libraries and Data

In [None]:
import numpy as np
import pandas as pd
import sys
import os
import pickle
import cvxpy as cp
from scipy.optimize import minimize
from project_lib.backtest import *
from project_lib.utils import *
from project_lib.performance import *
from project_lib.analysis import *
from project_lib.portfolio import Portfolio
from project_lib.backtest import *

# Root folder of the project; data files below are opened relative to it.
# NOTE(review): the project_lib imports above run before this path is appended,
# so they only succeed if project_lib is already importable — confirm.
HOME_DIRECTORY = 'C:/Users/Harol/OneDrive/Documents/master computational finance/thesis/thesis_UCL/Code/Transaction Costs'
# Guard against piling up duplicate entries when the cell is re-run.
if HOME_DIRECTORY not in sys.path:
    sys.path.append(HOME_DIRECTORY)

In [None]:
# import returns
# NOTE(review): pickle.load can execute code embedded in the file; only load trusted, locally produced data.
with open(HOME_DIRECTORY + '/data/processed_daily_data/ret_subset.pkl', 'rb') as f:
    ret = pickle.load(f)

In [None]:
# Keep the first `universe_size` columns and drop the first four rows.
universe_size = 50
ret = ret.iloc[:, :universe_size]  # subset the data
ret = ret.iloc[(4):] # burn


In [None]:
# Compound the returns into a cumulative price-like index.
prices = (1 + ret).cumprod()
# `ret` is already limited to `universe_size` columns, so this slice changes nothing.
prices = prices.iloc[:,:universe_size]

## preprocessing

In [None]:
# Two weight tables read from Excel: the "ls" sheet first, the "nls" sheet second.
# NOTE(review): these sheets are named "30assets_..." while universe_size is 50 — confirm the universes match.
lcca_ls = pd.read_excel("weights_l4_pfrevers_ls.xlsx",sheet_name="30assets_sig_pfrevers_ls")
lcca_nls = pd.read_excel("weights_l4_pfrevers_nls.xlsx",sheet_name="30assets_sig_pfrevers_nls")
lcca_weights = [lcca_ls, lcca_nls]

In [None]:
# Five further weight tables ("50assets_..." sheets), one workbook per signal.
l6mean_rev = pd.read_excel("weights_l6_2.xlsx",sheet_name="50assets_sig_mean_rev_sample")
l6prs = pd.read_excel("weights_l6_3.xlsx",sheet_name="50assets_sig_prs_sample")
l6mtm = pd.read_excel("weights_l6_4.xlsx",sheet_name="50assets_sig_mtm_sample")
l6pfrevers = pd.read_excel("weights_l6_5.xlsx",sheet_name="50assets_sig_pfrevers_sample")
l6rev = pd.read_excel("weights_l6_6.xlsx",sheet_name="50assets_sig_rev_sample")
# Order matters: later cells pick tables from this list by position.
ncca_weights = [l6mean_rev,l6prs,l6mtm,l6pfrevers,l6rev]

In [None]:
def preprocess(df,col,indent,datename,norm):
    """
        Turn a raw weight table into a date-indexed frame.

        Inputs:
                df       : raw table containing a date-like column
                col      : position of the column to use as the index
                indent   : number of leading rows to drop
                datename : name given to the resulting index
                norm     : if True, divide every row by its own standard deviation
        Output:
                date-indexed (and optionally row-normalised) data frame
    """
    indexed = df.set_index(df.columns[col])
    indexed.index = pd.to_datetime(indexed.index)
    indexed.index.names = [datename]
    trimmed = indexed.iloc[indent:, :]
    if not norm:
        return trimmed
    # scale each row by its cross-sectional standard deviation
    row_std = trimmed.std(axis=1)
    return trimmed.div(row_std, axis=0)

In [None]:
# Replace every raw table, in place, by its date-indexed and row-normalised version.
for weight_list in (lcca_weights, ncca_weights):
    for pos, raw_table in enumerate(weight_list):
        weight_list[pos] = preprocess(raw_table,0,0,"date",True)

# Transaction cost on asset level

Implementation of "Multiperiod portfolio optimization with multiple risky assets and general transaction costs" (Mei, DeMiguel and Nogales, 2016).

In [None]:
def rebalancing(X,X_prev, rho, gamma, kappa, mu,sigma, lag, target="Markowitz"):
    """
        Function to calculate optimal rebalancing on asset level with proportional transaction costs.

        Implementation equation (2) in Multiperiod portfolio optimization with multiple risky assets
        and general transaction costs.

        Inputs:
                X      : target weights                   [1 x m]
                X_prev : previous weights                 [1 x m]
                rho    : discount rate                    [1 x 1]
                gamma  : absolute risk-aversion parameter [1 x 1]
                kappa  : transaction cost parameter       [1 x 1]
                mu     : mean returns                     [1 x m]
                sigma  : covariance of returns            [m x m]
                lag    : rebalancing horizon              [1 x 1]
                target : "Markowitz" (mean-variance utility, solved with CVXPY),
                         "Target" (objective minimize_target, solved with SciPy) or
                         "Tradeoff" (objective minimize_tradeoff, solved with SciPy)
        Output:
                new_w : new weights [1 x m]
        Raises:
                ValueError : if target is not one of the three supported names
    """
    if target == "Markowitz":
        w = cp.Variable(len(X))
        # `@` gives the inner product and quad_form the quadratic risk term; writing both
        # with `*` produces a product of affine expressions that CVXPY rejects as non-DCP.
        utility = w @ np.asarray(mu) - gamma / 2 * cp.quad_form(w, np.asarray(sigma))
        objective = cp.Maximize((1 - rho) ** lag * utility - kappa * cp.norm(w - X_prev, 1))
        problem = cp.Problem(objective, [])
        problem.solve(verbose=False)
        return np.array(w.value)

    if target == "Target":
        # Calculated using SCIPY (CVXPY does not support formulation)
        res = minimize(minimize_target, x0=X, args=(X, kappa, X_prev))
        return res.x

    if target == "Tradeoff":
        # Calculated using SCIPY (CVXPY does not support formulation)
        res = minimize(minimize_tradeoff, x0=X, args=(X, X_prev, gamma, kappa, sigma))
        return res.x

    # Previously an unknown name fell through to an unbound local variable.
    raise ValueError(
        "target must be 'Markowitz', 'Target' or 'Tradeoff', found {!r}".format(target)
    )

def minimize_target(w, w_target, tcost, w_prev):
    """
        Objective for the "Target" rebalancing rule: distance to the target weights
        plus a cost on the move away from the previous weights.

        Equation : w_target - w + tcost * |w - w_prev|

        inputs:
                w        : candidate weights                [1 x m]
                w_target : target weights                   [1 x m]
                tcost    : transaction cost parameter       [1 x 1]
                w_prev   : previous weights                 [1 x m]
        outputs:
                norm1 of Equation
    """
    tracking_gap = w_target - w
    turnover_cost = tcost * np.abs(w - w_prev)
    return np.linalg.norm(tracking_gap + turnover_cost, 1)

def minimize_tradeoff(w,w_target,w_prev,gamma,tcost,covar):
    """
        Objective for the "Tradeoff" rebalancing rule: a covariance-weighted distance to
        the target weights plus a cost on the move away from the previous weights.

        Equation 1 in "Analytical solutions of optimal portfolio rebalancing", Ding Liu, 2019

        inputs:
                w        : candidate weights                [1 x m]
                w_target : target weights                   [1 x m]
                w_prev   : previous weights                 [1 x m]
                gamma    : absolute risk-aversion parameter [1 x 1]
                tcost    : transaction cost parameter       [1 x 1]
                covar    : covariance of returns            [m x m]
        outputs:
                norm1 of Equation 1

    """
    deviation = w - w_target
    # quadratic tracking term: 1/(2*gamma) * (w - w_T) @ covar @ (w - w_T)'
    scaled_deviation = (1 / (2*gamma)) * deviation
    tracking_term = scaled_deviation @ covar @ deviation.T
    turnover_cost = tcost * np.abs(w - w_prev)
    # NOTE(review): for 1-D weights the scalar tracking term is broadcast onto every
    # element of the cost vector before the norm is taken — confirm this is intended.
    return np.linalg.norm(tracking_term + turnover_cost, 1)

In [None]:
def constant_rebalancing(weights, rho, gamma, kappa, returns, lag, target):
    """
        Rebalance row by row, taking transaction costs into account.

        The first row of `weights` is kept as is. Every later row i is replaced by the
        output of `rebalancing`, called with row i of `weights` as the target and the
        already rebalanced row i-1 as the previous position.

        Inputs:
                weights : data frame of target weights, one row per rebalancing step
                rho     : discount rate passed to `rebalancing`
                gamma   : risk-aversion parameter passed to `rebalancing`
                kappa   : transaction cost parameter passed to `rebalancing`
                returns : data frame of asset returns used for the mean and covariance inputs
                lag     : rebalancing horizon passed to `rebalancing`
                target  : rule name passed to `rebalancing`
        Output:
                data frame shaped like `weights` holding the rebalanced weights
    """
    # 250-row rolling means; after the slice, row i is the mean of return rows i+1 .. i+250.
    means = returns.rolling(250).mean().iloc[250:, :]
    new_weights = weights.copy()

    for i in range(1, weights.shape[0]):
        if i % 50 == 0:
            print("iteration {}".format(i))
        target_w = np.array(weights.iloc[i, :])
        prev_w = np.array(new_weights.iloc[i-1, :])

        # NOTE(review): the covariance window covers return rows i .. i+249, i.e. rows at and
        # after position i of `returns`; confirm this is not a look-ahead relative to `weights`.
        covariances = get_cov(
            np.array(returns.iloc[i:(i+250), :]), method="nls", square_root=False)

        # write the whole row at once instead of one cell at a time
        new_weights.iloc[i, :] = rebalancing(target_w, prev_w, rho=rho,
                                             gamma=gamma, kappa=kappa, mu=means.iloc[i, :],
                                             sigma=covariances, lag=lag, target=target)

    return new_weights

In [None]:
def rebalancing_output_tcosts(naming,tcosts, cca_w, rho, gammas, returns, lag=1, target="Tradeoff"):
    """
        Run `constant_rebalancing` for every (transaction cost, gamma) pair and write
        each result to "<naming><tcost>_<gamma>.csv".
    """
    for kappa in tcosts:
        for risk_aversion in gammas:
            print("on tcost {} and gamma {}".format(kappa, risk_aversion))
            rebalanced = constant_rebalancing(
                cca_w,
                rho=rho,
                gamma=risk_aversion,
                kappa=kappa,
                returns=returns,
                lag=lag,
                target=target,
            )
            out_file = "".join([naming, str(kappa), "_", str(risk_aversion), ".csv"])
            rebalanced.to_csv(out_file)

## impact of different levels of transaction costs

In [None]:
# Grid of transaction-cost and risk-aversion values to evaluate.
tcosts = [0.0001,0.0002,0.0003]
gammas = [0.25, 1]

In [None]:
# name
# NOTE(review): the prefix says "rev" but ncca_weights[1] is l6prs (l6rev is at index 4) — confirm which is intended.
name_convention = "ncca_rev_"
rebalancing_output_tcosts(name_convention, tcosts,
                          cca_w=ncca_weights[1], rho=0, gammas=gammas, returns=ret, lag=1, target="Tradeoff")

### import previously extracted files

In [None]:
def extract_tcost_files(variable_list, base_name,sample_name,include_sample = True):
    """
        Read "<base_name><variable>.csv" for every entry of variable_list.

        When include_sample is True the file sample_name is read as well and placed
        first in the returned list. Any frame with an "Unnamed: 0" or "date" column
        gets that column as a datetime index named "date".
    """
    frames = [pd.read_csv(base_name+suffix+".csv") for suffix in variable_list]

    if include_sample:
        assert len(sample_name) != 0, "if you want to include sample cca name, include its file names" 
        frames.insert(0,pd.read_csv(sample_name))

    # promote the first matching date column to the index
    for frame in frames:
        date_col = next((c for c in ("Unnamed: 0", "date") if c in frame.columns), None)
        if date_col is None:
            continue
        frame.set_index(date_col, inplace=True)
        frame.index = pd.to_datetime(frame.index)
        frame.index.names = ['date']

    return frames

def list_to_dict(keys, lst):
    """Pair each key with the item at the same position in lst and return the mapping."""
    # Use the arguments; the previous body read unrelated global names instead.
    return dict(zip(keys, lst))

In [None]:
from itertools import product
# One "<tcost>_<gamma>" label per grid point, tcost-major order.
combinations = [str(i)+"_"+str(j) for i in tcosts for j in gammas]

In [None]:
# assign dataset names
name_convention = "ncca_mtm_"
results = extract_tcost_files(variable_list=combinations,
                                 base_name=name_convention,
                                 sample_name="sample_cca_weights.csv",
                                 include_sample=False)

# create dictionary from list
# `transaction_costs` (labels plus a leading "sample") is built but the dictionary is keyed by
# `combinations`; that matches `results` only because include_sample=False above.
transaction_costs = [str(i) for i in combinations]
transaction_costs.insert(0,"sample")
cca_dict = dict(zip(combinations, results))

### evaluate performance

dictionary to use is *cca_dict*

In [None]:
# Build one Portfolio per stored weight table and keep its profit and NAV series.
pnl_results = dict()
ptf_ret = dict()
for tc, positions in cca_dict.items():
    print(tc[:6])
    # Keys look like "<tcost>_<gamma>"; the key "sample" is evaluated without costs.
    # The builtin float replaces np.float, which recent NumPy releases no longer provide,
    # and splitting on "_" does not depend on the cost label being six characters long.
    cost = 0 if tc == "sample" else float(tc.split("_")[0])
    portfolio = Portfolio(prices=prices.loc[positions.index], position=positions, period=0, tcost=cost)
    ptf_ret[tc] = portfolio.adjusted_profit.to_frame(name="Profit")
    pnl_results[tc] = portfolio.adjusted_nav().to_frame(name="NAV")

In [None]:
# Project helpers; presumably they build summary tables and a NAV plot from the
# dictionaries filled above — verify against project_lib.
nls_summ = build_table2(combinations, ptf_ret)
nls_summ

In [None]:
build_table3(combinations, cca_dict, ptf_ret)

In [None]:
plotting(pnl_results, combinations)

# Statistical corrections

In [None]:
from arch.bootstrap import SPA, MCS
from sklearn.utils import resample

In [None]:
from scipy.stats import ttest_1samp, t

In [None]:
def adjust_sharpe(sharpes, T, method="bonferonni"):
    """
    Compute multiple-testing adjusted p-values, t-statistics and Sharpe ratios.

    Each Sharpe ratio is converted to a t-statistic (sharpe * sqrt(T)) and a one-sided
    p-value (Student-t survival function, T-1 degrees of freedom). The p-values are
    then adjusted and mapped back to an adjusted t-statistic and Sharpe ratio with the
    inverse survival function, so the sign of the statistic is preserved.

    input : sharpes, Sharpe ratios (1-D array-like)
            T, length of sample (int)
            method, one of
                "bonferonni" / "bonferroni" : min(M * p, 1), rows keep the input order
                "holm" : step-down, rows sorted by increasing p-value
                "BHY"  : step-up with c(M) = sum(1/i), rows sorted by decreasing p-value
    ouput : data frame of statistical significance tests of trading strategies,
            rounded to three decimals
    raises : ValueError if method is not one of the names above
    """
    sharpes = np.asarray(sharpes, dtype=float)
    n_tests = len(sharpes)
    t_stats = sharpes * np.sqrt(T)
    summary = pd.DataFrame({"sharpe": sharpes,
                            "adj_sharpe": np.zeros(n_tests),
                            "p_val": t.sf(t_stats, df=T-1),
                            "adj_p_val": np.zeros(n_tests),
                            "t_stat": t_stats,
                            "adj_t_stat": np.zeros(n_tests)})

    if method in ("bonferonni", "bonferroni"):
        adj_p = np.minimum(summary["p_val"].to_numpy() * n_tests, 1.0)

    elif method == "holm":
        summary = summary.sort_values(by="p_val", kind="stable").reset_index(drop=True)
        # the smallest p-value is multiplied by M, the next by M-1, ..., the largest by 1
        step_down = summary["p_val"].to_numpy() * np.arange(n_tests, 0, -1)
        # running maximum keeps the adjusted p-values non-decreasing, then cap at 1
        adj_p = np.minimum(np.maximum.accumulate(step_down), 1.0)

    elif method == "BHY":
        c = np.sum(1.0 / np.arange(1, n_tests + 1))
        summary = summary.sort_values(by="p_val", ascending=False, kind="stable").reset_index(drop=True)
        # rows are in decreasing p-value order, so row k has ascending rank M - k
        ranks = np.arange(n_tests, 0, -1)
        step_up = summary["p_val"].to_numpy() * n_tests * c / ranks
        if n_tests:
            # the largest p-value is left unadjusted
            step_up[0] = summary.loc[0, "p_val"]
        # running minimum walks from the largest p-value down to the smallest
        adj_p = np.minimum.accumulate(step_up)

    else:
        raise ValueError(
            "method must be 'bonferonni', 'holm' or 'BHY', found {!r}".format(method)
        )

    summary["adj_p_val"] = adj_p
    # isf inverts the sf used for p_val
    summary["adj_t_stat"] = t.isf(summary["adj_p_val"], df=T-1)
    summary["adj_sharpe"] = summary["adj_t_stat"] / np.sqrt(T)

    return summary.round(3)

## Holm

In [None]:
# Thirty hard-coded Sharpe ratios, adjusted with the "holm" method for a sample length of 750.
sharpes = np.array([0.8,0.81,0.11,0.14,-0.58,-0.49,0.42,0.43,-0.37,-0.32,-1.15,-1.01,0.67,0.68,0.38,
                                0.43,-0.53,-0.44,0.05,0.06,-0.93,-0.87,-1.9,-1.8,0.45,0.46,-0.51,-0.56,-1.39,-1.26])
adjust_sharpe(sharpes, T=750, method="holm")

## FDR

In [None]:
# Same Sharpe ratios and sample length, adjusted with the "BHY" method.
adjust_sharpe(sharpes, T=750, method="BHY")

## White's reality check

In [None]:
def extract_tcost_files(variable_list, base_name,sample_name,include_sample = True):
    """
        Read "<base_name><variable>.csv" for every entry of variable_list.

        When include_sample is True the file sample_name is read as well and placed
        first in the returned list. Any frame with an "Unnamed: 0" or "date" column
        gets that column as a datetime index named "date".

        This repeats the definition given earlier in the notebook.
    """
    frames = [pd.read_csv(base_name+suffix+".csv") for suffix in variable_list]

    if include_sample:
        assert len(sample_name) != 0, "if you want to include sample cca name, include its file names" 
        frames.insert(0,pd.read_csv(sample_name))

    # promote the first matching date column to the index
    for frame in frames:
        date_col = next((c for c in ("Unnamed: 0", "date") if c in frame.columns), None)
        if date_col is None:
            continue
        frame.set_index(date_col, inplace=True)
        frame.index = pd.to_datetime(frame.index)
        frame.index.names = ['date']

    return frames

def list_to_dict(keys, lst):
    """Pair each key with the item at the same position in lst and return the mapping."""
    # Use the arguments; the previous body read unrelated global names instead.
    return dict(zip(keys, lst))

In [None]:
# Load every stored weight table: one group of files per strategy prefix.
file_names = [ "ncca_meanrev_", "ncca_mtm_","ncca_prs_","lcca_ls_","lcca_nls_"]
lists = []
for f in file_names:
    # Use the loop prefix `f`; passing `name_convention` here loaded the same
    # strategy's files once per prefix.
    lists.extend(extract_tcost_files(variable_list=combinations,
                                     base_name=f,
                                     sample_name="sample_cca_weights.csv",
                                     include_sample=False))


In [None]:
from itertools import product
# "<tcost>_<gamma>" labels, tcost-major order (the same expression appears earlier in the notebook).
combinations = [str(i)+"_"+str(j) for i in tcosts for j in gammas]

In [None]:
# One file name per (prefix, label) pair, prefix-major order.
names = [i + j + ".csv" for i in file_names for j in combinations]

signals to portfolio returns

In [None]:
# Restrict the returns to the date range spanned by the first weight table.
ret=ret[ret.index>=lists[0].index[0]]
ret=ret[ret.index<=lists[0].index[-1]]

In [None]:
# Per table: weight times return, summed across columns, giving one value per row.
# The iloc slice trims `ret` to the table's shape; the product itself is a DataFrame multiplication.
lists = [np.array((lists[l] * ret.iloc[:lists[l].shape[0],:lists[l].shape[1]]).sum(axis=1).values) for l in range(len(lists))]

In [None]:
# Stack the per-table series into one 2-D array (one row per table).
toarr = np.array(lists)

In [None]:
# Transposed so that each column is one table; needs len(lists) == len(names)
# and every series to be as long as `ret`.
combined = pd.DataFrame(columns=names,data=toarr.T,index=ret.index)

In [None]:
# Pool all series into one vector and draw, with replacement, as many values as `combined` has rows.
resampled= resample(np.concatenate(lists), replace=True, n_samples=combined.shape[0])

Define the model losses as negative returns, since SPA compares models by their predictive losses.

In [None]:
# Gaussian draws: one matched to the mean/std of `resampled`, one centred at zero
# with the average per-column std of `ret`.
# NOTE(review): neither `benchmark` nor `bm_0` is used in the cells that follow here.
benchmark = np.random.normal(resampled.mean(),resampled.std(),combined.shape[0])
bm_0 = np.random.normal(0,ret.std().mean(),combined.shape[0])

In [None]:
# Set every positive entry to zero; negative entries are kept unchanged.
# NOTE(review): the values are not negated, so these "losses" are <= 0 — confirm this
# matches the loss convention SPA expects.
combined[combined>0]=0

In [None]:
# for stability reasons add jitter
combined += 0.0001*np.random.randn(combined.shape[0],combined.shape[1])

In [None]:
# `resampled` is passed first and `combined` second.
spa = SPA(resampled, combined,reps=10000)
print(spa.compute())
print(spa.pvalues)
print(spa.better_models())
print(spa.critical_values())
spa.reset()

# MCS

In [None]:
import numpy as np
from numpy.random import rand
from numpy import ix_
import pandas as pd


In [None]:
def bootstrap_sample(data, B, w):
    '''
    Draw B resampled index paths over the input data.

    data: input numpy data array
    B: number of bootstrap paths (columns of the result)
    w: block length; at each step a path jumps to a fresh random position
       with probability 1/w, otherwise it continues with the next observation
    '''
    n_obs = len(data)
    restart_prob = 1 / w
    idx = np.zeros((n_obs, B), dtype=int)
    idx[0, :] = np.ceil(n_obs * rand(1, B))
    restart = np.asfortranarray(rand(B, n_obs).T < restart_prob)
    n_restarts = int(restart.sum())
    fresh_starts = np.ceil(rand(1, n_restarts) * n_obs).astype(int)
    # fill the restart positions column by column
    flat = idx.ravel(order="F")
    flat[restart.ravel(order="F")] = fresh_starts.ravel()
    idx = flat.reshape([B, n_obs]).T
    # everywhere else continue from the previous row
    for row in range(1, n_obs):
        carry_on = ~restart[row, :]
        idx[row, carry_on] = idx[row - 1, carry_on] + 1
    # wrap around the end of the sample, then switch to 0-based positions
    overflow = idx > n_obs
    idx[overflow] -= n_obs
    idx -= 1
    return data[idx]


def compute_dij(losses, bsdata):
    '''
    Compute pairwise mean loss differences, their bootstrap replicates and a variance estimate.

    losses: 2-D array, one column per model
    bsdata: 2-D integer array of row positions, one column per bootstrap draw
    returns: (mean differences [M x M], bootstrap differences [B x M x M],
              variance estimate [M x M] with the identity added)
    '''
    n_obs, n_models = losses.shape
    n_boot = bsdata.shape[1]

    mean_diff = np.zeros((n_models, n_models))
    for ref in range(n_models):
        mean_diff[ref, :] = (losses - losses[:, [ref]]).mean(axis=0)

    boot_diff = np.zeros((n_boot, n_models, n_models))
    for draw in range(n_boot):
        boot_mean = losses[bsdata[:, draw], :].mean(axis=0)
        # entry [j, i] is boot_mean[i] - boot_mean[j]
        boot_diff[draw] = boot_mean[np.newaxis, :] - boot_mean[:, np.newaxis]

    spread = boot_diff - mean_diff[np.newaxis, ...]
    variance = np.mean(spread ** 2, axis=0) + np.eye(n_models)

    return mean_diff, boot_diff, variance


def calculate_PvalR(z, included, zdata0):
    '''
    p-value for the "R" algorithm: share of bootstrap draws whose largest absolute
    entry of z exceeds the largest entry of zdata0 among the included models.
    included holds 1-based model numbers.
    '''
    rows = included - 1
    observed = zdata0[ix_(rows, rows)].max()
    boot_stat = np.abs(z).max(axis=2).max(axis=1)
    return np.mean(boot_stat > observed)


def calculate_PvalSQ(z, included, zdata0):
    '''
    p-value for the "SQ" algorithm: share of bootstrap draws whose half sum of
    squared entries of z exceeds the half sum of squared entries of zdata0 among
    the included models. included holds 1-based model numbers.
    '''
    rows = included - 1
    observed = np.sum(zdata0[ix_(rows, rows)] ** 2) / 2
    boot_stat = np.square(z).sum(axis=1).sum(axis=1) / 2
    return np.mean(boot_stat > observed)


def iterate(dijbar, dijbarstar, vardijbar, alpha, algorithm="R"):
    '''
    Remove models one at a time and record a p-value at every step.

    dijbar: pairwise mean loss differences [M0 x M0]
    dijbarstar: bootstrap replicates of those differences [B x M0 x M0]
    vardijbar: variance estimate used to standardise the differences [M0 x M0]
    alpha: threshold applied to the recorded p-values
    algorithm: "R" uses calculate_PvalR, "SQ" uses calculate_PvalSQ

    returns: (included model positions, excluded model positions, p-values),
             positions being 0-based and listed in order of removal
    '''
    B, M0, _ = dijbarstar.shape
    # standardised bootstrap deviations and standardised sample differences
    z0 = (dijbarstar - np.expand_dims(dijbar, 0)) / np.sqrt(
        np.expand_dims(vardijbar, 0)
    )
    zdata0 = dijbar / np.sqrt(vardijbar)

    # models are numbered from 1 here, so a 0 in excludedR means "slot not used yet"
    excludedR = np.zeros([M0, 1], dtype=int)
    pvalsR = np.ones([M0, 1])

    # M0 - 1 removal steps; the last p-value slot keeps its initial value of one
    for i in range(M0 - 1):
        included = np.setdiff1d(np.arange(1, M0 + 1), excludedR)
        m = len(included)
        z = z0[ix_(range(B), included - 1, included - 1)]

        if algorithm == "R":
            pvalsR[i] = calculate_PvalR(z, included, zdata0)
        elif algorithm == "SQ":
            pvalsR[i] = calculate_PvalSQ(z, included, zdata0)

        # per-model statistic among the remaining models; the largest one is removed
        scale = m / (m - 1)
        dibar = np.mean(dijbar[ix_(included - 1, included - 1)], 0) * scale
        dibstar = np.mean(dijbarstar[ix_(range(B), included - 1, included - 1)], 1) * (
            m / (m - 1)
        )
        vardi = np.mean((dibstar - dibar) ** 2, axis=0)
        t = dibar / np.sqrt(vardi)
        modeltoremove = np.argmax(t)
        excludedR[i] = included[modeltoremove]

    # make the p-values non-decreasing along the removal order
    maxpval = pvalsR[0]
    for i in range(1, M0):
        if pvalsR[i] < maxpval:
            pvalsR[i] = maxpval
        else:
            maxpval = pvalsR[i]

    # the one model never removed goes into the last slot
    excludedR[-1] = np.setdiff1d(np.arange(1, M0 + 1), excludedR)
    # split at the first step whose p-value exceeds alpha
    pl = np.argmax(pvalsR > alpha)
    includedR = excludedR[pl:]
    excludedR = excludedR[:pl]
    # back to 0-based positions
    return includedR - 1, excludedR - 1, pvalsR


def MCS2(losses, alpha, B, w, algorithm):
    '''
    Run the full procedure: bootstrap row positions, compute the loss-difference
    statistics and iterate the removal steps.

    returns: (included, excluded, p-values) exactly as produced by iterate
    '''
    n_obs, _ = losses.shape
    boot_rows = bootstrap_sample(np.arange(n_obs), B, w)
    loss_stats = compute_dij(losses, boot_rows)
    kept, dropped, pvals = iterate(*loss_stats, alpha, algorithm=algorithm)
    return kept, dropped, pvals

In [None]:
class ModelConfidenceSet(object):
    def __init__(self, data, alpha, B, w, algorithm="SQ", names=None):
        """
        Implementation of Econometrica Paper:
        Hansen, Peter R., Asger Lunde, and James M. Nason. "The model confidence set." Econometrica 79.2 (2011): 453-497.

        Input:
            data->pandas.DataFrame or numpy.ndarray: input data, columns are the losses of each model 
            alpha->float: confidence level
            B->int: bootstrap size for computation covariance
            w->int: block size for bootstrap sampling
            algorithm->str: SQ or R, SQ is the first t-statistics in Hansen (2011) p.465, and R is the second t-statistics
            names->list: the name of each model (corresponding to each columns). 

        Method:
            run(self): compute the MCS procedure

        Attributes:
            included: models that are in the model confidence sets at confidence level of alpha
            excluded: models that are NOT in the model confidence sets at confidence level of alpha
            pvalues: the bootstrap p-values of each models

        Raises:
            TypeError: data is neither a DataFrame nor an ndarray, or algorithm is not "R"/"SQ"
            ValueError: alpha outside [0, 1], or B / w smaller than 1
            RuntimeError: B or w cannot be converted to int
        """

        if isinstance(data, pd.DataFrame):
            self.data = data.values
            default_names = data.columns.values
        elif isinstance(data, np.ndarray):
            self.data = data
            default_names = np.arange(data.shape[1])
        else:
            # fail here rather than later in run() with a missing attribute
            raise TypeError(
                f"data must be a pandas.DataFrame or numpy.ndarray, found {type(data).__name__}"
            )
        # run() indexes the names with integer arrays, which a plain list does not support
        self.names = np.asarray(default_names if names is None else names)

        if alpha < 0 or alpha > 1:
            raise ValueError(
                f"alpha must be larger than zero and less than 1, found {alpha}"
            )
        if not isinstance(B, int):
            try:
                B = int(B)
            except Exception as identifier:
                raise RuntimeError(
                    "Bootstrap size B must be a integer, fail to convert", identifier
                ) from identifier
        if B < 1:
            raise ValueError(f"Bootstrap size B must be larger than 1, found {B}")
        if not isinstance(w, int):
            try:
                w = int(w)
            except Exception as identifier:
                raise RuntimeError(
                    "Bootstrap block size w must be a integer, fail to convert",
                    identifier,
                ) from identifier
        if w < 1:
            raise ValueError(f"Bootstrap block size w must be larger than 1, found {w}")

        if algorithm not in ["R", "SQ"]:
            raise TypeError(f"Only R and SQ algorithm supported, found {algorithm}")

        self.alpha = alpha
        self.B = B
        self.w = w
        self.algorithm = algorithm

    def run(self):
        """Run the procedure, store included / excluded / pvalues on the instance and return self."""
        included, excluded, pvals = MCS2(
            self.data, self.alpha, self.B, self.w, self.algorithm
        )

        self.included = self.names[included].ravel().tolist()
        self.excluded = self.names[excluded].ravel().tolist()
        # p-values are listed in removal order: excluded models first, then the included ones
        self.pvalues = pd.Series(pvals.ravel(), index=self.excluded + self.included)
        return self

In [None]:
# Fix the seed so the bootstrap draws are reproducible.
np.random.seed(30)
# Positional arguments are (data, alpha, B, w): this call uses B=3 bootstrap draws and block length w=2000.
# NOTE(review): B and w look swapped — confirm the intended values.
mcs = ModelConfidenceSet(combined, 0.5,3, 2000).run()


In [None]:
mcs.included