In [11]:
import warnings
import numpy as np
import pandas as pd
from BuildLinearData import linear_data
from BacktestStrategy import backtest_strategy
import statsmodels.api as sm
from BuildLinearModel import build_linear_model
warnings.filterwarnings("ignore")

In [41]:
def backtest_strategy1(train_data, test_data, to_test='Pred', threshold=0.2, l=5, optimise=False):
    """
    Backtests the strategy and prints out the metrics
    :param train_data: Training Dataset
    :param test_data: Test data for prediction
    :param to_test: Backtest with real or predicted data
    :param l: the no. of LAGS for VOI and OIR determined by the ACF Plot
    :param threshold: trading threshold
    :return: dataframe with Price, Predicted MPC, and True MPC as columns
    """
    
    if to_test == 'Pred':
        
        # Retrieve trained model
        model = build_linear_model(train_data, l=l)
    
        # Build the explanatory variables
        df = pd.DataFrame({'MPB': test_data["MPB"] / test_data["Spread"], 'VOI': test_data["VOI_(t)"] / test_data["Spread"], 
                       f'OIR': test_data["OIR_(t)"] / test_data["Spread"], **{f'VOI{i}': test_data[f"VOI_(t-{i})"] / test_data["Spread"] 
                        for i in range(1,l+1)}, **{f'OIR{i}': test_data[f"OIR_(t-{i})"] / test_data["Spread"] for i in range(1,l+1)}})
        
        # Predicting MPC
        y_pred = model.predict(sm.add_constant(df))
        del df
        
        # Converting to multinomial classifier
        y_pred = np.where(y_pred > threshold, 1, np.where(y_pred < -threshold, -1, 0))
        test_data["MPC_pred"] = y_pred
        
        # Formatting Data
        data = test_data[["Price", "MPC_pred"]][(test_data['MPC_pred'] == 1) | (test_data['MPC_pred'] == -1)]
        data.loc[test_data.index[-1]] = test_data.loc[test_data.index[-1], ["Price", "MPC_pred"]]
        data.rename(columns = {"MPC_pred" : "Signal"}, inplace=True)
    
    elif to_test == 'Real':
        
        # Converting to multinomial classifier
        y_true = pd.Series(np.where(test_data["MPC"] > threshold, 1, np.where(test_data["MPC"] < -threshold, -1, 0)), index=test_data.index)
        test_data["MPC"] = y_true
        
        # Formatting Data
        data = test_data[["Price", "MPC"]][(test_data['MPC'] == 1) | (test_data['MPC'] == -1)]
        data.loc[test_data.index[-1]] = test_data.loc[test_data.index[-1], ["Price", "MPC"]]
        data.rename(columns = {"MPC" : "Signal"}, inplace=True)
    
    return_df = pd.DataFrame(columns=["Price", "Position", "Cost", "Profit", "Volume"])
    gen = generate_trade(data)
    return gen
    for index, trade, tc in gen:
        
        return_df.loc[index] = trade
            
    # Print Metrics
    if optimise == False:
        
        print("Profit before transaction cost = {} USD".format(sum(return_df["Cost"])))
        print("Transaction Cost = {} USD".format(tc))
        print("Total Profit = {} USD".format(return_df.iloc[-1,3]))
        print("Total Trade Volume = {} trades".format(return_df.iloc[-1,4]))
    
    return return_df

def generate_trade(data):
        # Define Constants
    own = False
    position = 0
    TC = 0.000207
    cost = 0
    t_cost = 0
    t_volume = 0
    
    for index in data.index:
        
        signal = data.loc[index, "Signal"]
        price = data.loc[index, "Price"]
        
        if own:
            if position == 1 and signal == -1:
                
                position = -1
                cost = 2 * price
                t_cost += 2 * TC * price
                t_volume += 2
                
                
            elif position == -1 and signal == 1:
                
                position = 1
                cost = -2 * price
                t_cost += 2 * TC * price
                t_volume += 2
                
                
            elif index == data.index[-1]:
                
                cost = price if position == 1 else (-1 * price)
                t_cost += TC * price
                t_volume += 1
                position = 0
                
                
        else:
            
            if signal == 1:
                
                cost = -price
                t_cost += TC * price
                t_volume += 1
                position = 1
                own = True
                
                
            elif signal == -1:
                cost = price
                t_cost += TC * price
                t_volume += 1
                position = 1
                yield index, [price, position, cost, (sum(cost) - t_cost), t_volume], t_cost

In [None]:
import numpy as np
import pandas as pd

def calculate_profit(data, df, optimise=False):
    own = False
    position = 0
    TC = 0.000207
    n_rows = len(data)
    cost = np.zeros(n_rows)
    t_cost = 0
    t_volume = 0
    return_df = pd.DataFrame(columns=["Price", "Position", "Profit"])
    
    def update_return_df(index, price, position, cost):
        return_df.loc[index] = [price, position, (sum(cost) - t_cost)]
    
    for i, index in enumerate(data.index):
        curr_data = data.loc[index]
        curr_price = df.loc[index, "Price"]
        
        if own:
            if position == 1 and curr_data == -1:
                cost[i] = curr_price
                cost[i-1] = curr_price
                t_cost += 2 * TC * curr_price
                t_volume += 2
                position = -1
                update_return_df(index, curr_price, position, cost)
            elif position == -1 and curr_data == 1:
                cost[i] = -curr_price
                cost[i-1] = -curr_price
                t_cost += 2 * TC * curr_price
                t_volume += 2
                position = 1
                update_return_df(index, curr_price, position, cost)
            elif i == n_rows-1:
                cost[i] = -curr_price if position == 1 else curr_price
                t_cost += TC * curr_price
                t_volume += 1
                position = 0
                update_return_df(index, curr_price, position, cost)
        else:
            if curr_data == 1:
                cost[i] = -curr_price
                t_cost += TC * curr_price
                t_volume += 1
                position = 1
                own = True
                update_return_df(index, curr_price, position, cost)
            elif curr_data == -1:
                cost[i] = curr_price
                t_cost += TC * curr_price
                t_volume += 1
                position =





            yield (index, price, position, (sum(cost) - t_cost))

        # BUY to TRADE    
        elif own and position == -1 and data.loc[index] == 1:
            own = True
            position = 1
            price = df.loc[index, "Price"]
            cost.append(-1*price)
            cost.append(-1*price)
            t_cost += 2*TC*price
            t_volume += 2
            yield (index, price, position, (sum(cost) - t_cost))

        # CLOSE at day end
        elif position == 1 and index == data.index[-1]:
            position = 0
            price = df.loc[index, "Price"]
            cost.append(price)
            t_cost += TC*price
            t_volume += 1
            yield (index, price, position, (sum(cost) - t_cost))

        # CLOSE at day end
        elif position == -1 and index == data.index[-1]:
            position = 0
            price = df.loc[index, "Price"]
            cost.append(-1*price)
            t_cost += TC*price
            t_volume += 1
            yield (index, price, position, (sum(cost) - t_cost))




In [4]:
df1 = pd.read_csv("OrderBook_10.csv")
df2 = pd.read_csv("KlineData_10.csv")
data1 = linear_data(df1, df2, d=20)

In [5]:
df = data1.copy()
train, test = np.split(df, [int(0.8*len(df))])

In [6]:
testb = backtest_strategy(train, test, threshold=0.2)

Profit before transaction cost = -747.4999999999636 USD
Transaction Cost = 22467.458673899975 USD
Total Profit = -23214.95867389994 USD
Total Trade Volume = 3606 trades


In [7]:
testb

Unnamed: 0_level_0,Price,Position,Profit,MPC,MPC_pred
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
19458 days 10:00:46.437000,30036.9,-1,30030.682362,-1,-1
19458 days 10:00:52.687000,30036.9,1,-30055.552915,-1,1
19458 days 10:00:59.187000,30036.8,-1,30005.611850,1,-1
19458 days 10:01:00.437000,30036.9,1,-30080.623427,1,1
19458 days 10:01:56.187000,30040.0,-1,29986.940013,1,-1
...,...,...,...,...,...
19458 days 15:20:04.687000,30180.0,1,-53352.829344,-1,1
19458 days 15:20:05.937000,30180.0,-1,6994.676136,1,-1
19458 days 15:20:14.937000,30178.5,1,-53374.817763,-1,1
19458 days 15:20:16.187000,30178.6,-1,6969.888296,-1,-1


In [42]:
testb = backtest_strategy1(train, test, threshold=0.2)

In [43]:
next(testb)

TypeError: 'numpy.float64' object is not iterable

In [4]:
cost, tc, vol = backtest_strategy(train, test, optimise=True)

In [19]:
testb = backtest_strategy(train, test, to_test='Real', threshold=24)

Profit before transaction cost = 414.19999999988795 USD
Transaction Cost = 51190.37735999993 USD
Total Profit = -50776.177360000045 USD
Total Trade Volume = 5666 trades


In [5]:
test

Unnamed: 0_level_0,Price,MPC_pred,MPC
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
19458 days 10:00:45.687000,30036.9,0,-1
19458 days 10:00:45.937000,30036.9,0,-1
19458 days 10:00:46.187000,30036.9,-1,-1
19458 days 10:00:46.437000,30036.9,0,1
19458 days 10:00:46.687000,30036.9,0,-1
...,...,...,...
19458 days 15:20:15.687000,30178.5,0,-1
19458 days 15:20:15.937000,30178.6,-1,-1
19458 days 15:20:16.187000,30178.6,-1,-1
19458 days 15:20:16.437000,30178.6,0,-1


In [3]:
df1 = pd.read_csv("OrderBook_11.csv")
df2 = pd.read_csv("KlineData_11.csv")
data2 = linear_data(df1, df2, d=20)

In [4]:
df1 = pd.read_csv("OrderBook_13_1.csv")
df2 = pd.read_csv("KlineData_13_1.csv")
data3 = linear_data(df1, df2, d=20)

In [5]:
df1 = pd.read_csv("OrderBook_15_2.csv")
df2 = pd.read_csv("KlineData_15_2.csv")
data4 = linear_data(df1, df2, d=20)

In [6]:
df1 = pd.read_csv("OrderBook_16_1.csv")
df2 = pd.read_csv("KlineData_16_1.csv")
data5 = linear_data(df1, df2, d=20)

In [7]:
df1 = pd.read_csv("OrderBook_17_1.csv")
df2 = pd.read_csv("KlineData_17_1.csv")
data6 = linear_data(df1, df2, d=20)

In [8]:
df1 = pd.read_csv("OrderBook_17_2.csv")
df2 = pd.read_csv("KlineData_17_2.csv")
data7 = linear_data(df1, df2, d=20)

In [9]:
df1 = pd.read_csv("OrderBook_18.csv")
df2 = pd.read_csv("KlineData_18.csv")
data8 = linear_data(df1, df2, d=20)

In [10]:
df1 = pd.read_csv("OrderBook_19_1.csv")
df2 = pd.read_csv("KlineData_19_1.csv")
data9 = linear_data(df1, df2, d=20)

In [11]:
df1 = pd.read_csv("OrderBook_20_1.csv")
df2 = pd.read_csv("KlineData_20_1.csv")
data10 = linear_data(df1, df2, d=20)

In [12]:
frames = [data1, data2, data3, data4, data5, data6, data7, data8, data9, data10]

result = pd.concat(frames)
df = result.copy()
train, test = np.split(df, [int(0.8*len(df))])

In [15]:
%%timeit
testb = backtest_strategy(train, test, threshold=0.1)

Profit before transaction cost = -2425.899999999998 USD
Transaction Cost = 175540.16354999828 USD
Total Profit = -177966.06354999827 USD
Total Trade Volume = 19570 trades
Profit before transaction cost = -2425.899999999998 USD
Transaction Cost = 175540.16354999828 USD
Total Profit = -177966.06354999827 USD
Total Trade Volume = 19570 trades
Profit before transaction cost = -2425.899999999998 USD
Transaction Cost = 175540.16354999828 USD
Total Profit = -177966.06354999827 USD
Total Trade Volume = 19570 trades
Profit before transaction cost = -2425.899999999998 USD
Transaction Cost = 175540.16354999828 USD
Total Profit = -177966.06354999827 USD
Total Trade Volume = 19570 trades
Profit before transaction cost = -2425.899999999998 USD
Transaction Cost = 175540.16354999828 USD
Total Profit = -177966.06354999827 USD
Total Trade Volume = 19570 trades
Profit before transaction cost = -2425.899999999998 USD
Transaction Cost = 175540.16354999828 USD
Total Profit = -177966.06354999827 USD
Total Tr

In [32]:
testb

Unnamed: 0_level_0,Price,MPC_pred,MPC
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
19460 days 19:50:40.003000,30320.6,0,1
19460 days 19:50:40.253000,30320.6,0,1
19460 days 19:50:40.503000,30320.7,0,1
19460 days 19:50:40.753000,30320.7,0,1
19460 days 19:50:41.003000,30320.7,0,1
...,...,...,...
19464 days 00:22:22.916000,30229.3,0,1
19464 days 00:22:23.166000,30229.3,0,-1
19464 days 00:22:23.416000,30229.2,0,1
19464 days 00:22:23.666000,30229.2,1,1


When calculating the average Trade Price:
$$\bar{TP}_{t}=\frac{1}{N} \cdot \frac{T_{t}-T_{t-1}}{V_{t}-V_{t-1}}$$


Since we calculate $T_{t}=P_{t} \cdot V_{t}$, say the Price does not change but the volume does then:

$$\bar{TP}_{t}=\frac{1}{N} \cdot \frac{P_{t} \cdot V_{t}-P_{t-1} \cdot V_{t-1}}{V_{t}-V_{t-1}}$$
and $P_{t}=P_{t-1}=P$, therefore,
$$\bar{TP}_{t}=\frac{1}{N} \cdot \frac{P \cdot (V_{t} - V_{t-1})}{V_{t}-V_{t-1}}=\frac{P}{N}$$
And hence,
$$MPB_{t}=\bar{TP}_{t}-\bar{MP}_{t}=\frac{P}{N}-\bar{MP}_{t} \approx \frac{P}{N}-\bar{P}$$
For $N=1$ we have:
$$MPB_{t}=P-\bar{P}$$

Now say both the volume and price change:
$$\bar{TP}_{t}=\frac{1}{N} \cdot [\frac{(P + \Delta P) \cdot V_{t}-P \cdot V_{t-1}}{V_{t}-V_{t-1}}]$$
$$\bar{TP}_{t}=\frac{1}{N} \cdot [\frac{P \cdot \Delta V - \Delta P \cdot V_{t}}{V_{t}-V_{t-1}}]$$
$$\bar{TP}_{t}=\frac{1}{N} \cdot [P - \frac{\Delta P \cdot V_{t}}{\Delta V}]$$
For $N=1$ we have:
$$\bar{TP}_{t}=P - \frac{\Delta P \cdot V_{t}}{\Delta V}$$
$$MPB_{t}=P - \frac{\Delta P \cdot V_{t}}{\Delta V}-\bar{P}$$

Now the problem with this is that it's time dependent as $V_{t}$ is the volume since day start. Therefore, the MPB for equal $\Delta P$ and $\Delta V$ will be different at different timestamps. We don't want this!

So I propose a diffrent formula:

For $\Delta V \neq 0$, we have:

If $\Delta P = 0$:

$$\bar{TP}_{t}=P$$

$$\bar{MPB}_{t}=P-\bar{P}$$

If $\Delta P \neq 0$:

$$\bar{TP}_{t}=P + n \cdot \frac{\Delta V}{\Delta P}$$

$$\bar{MPB}_{t}=P + n \cdot \frac{\Delta V}{\Delta P}-\bar{P}$$

The logic in the paper is:
A large positive (negative) quantity means the trades were, on average, closer to the ask (bid) price. Hence, large positive (negative) quantity indicates buyer (seller) initiated.
By removing the $V_{t}$ we remove the temporal component and thus, the MPB for equal $\Delta P$ and $\Delta V$ will be the same at different timestamps.

Now I have divided buyer (seller) initiated into two categories:
1. Weak Form
2. Strong Form

If the price does not change this implies that the change in volume is either zero or quite low. Therefore, the MPB here will be low in magnitude.

For higher changes in volume, the price will change. If the price changes is high but the volume is low, then although the buyer (seller) is somewhat interested but not too aggressive as not a lot of trades happened. In this case the formula in the paper will have a large postive or negative value. But we don't want that! We want a value higher in magnitude than the previous case but still low enough.

The form above describes the "Weak Form" of initiation.

If the price changes is low but the volume is high, then the buyer (seller) is quite aggressive as a lot of trades happened. In this case the formula in the paper will have a low postive or negative value. But we don't want that! We want a value higher in magnitude to represent the aggression.

Of course this doe not take into consideration, the case where both $\Delta P$ and $\Delta V$ are large. I presume that's a rare occurence, assuming people don't line up to suddenly buy a stock that jumped 10% in hopes that it'll jump again!

Note: The contant $n$ has been added in case the ratio $\frac{\Delta V}{\Delta P}$ is not large (small) enough to have a resonable effect. This constant can be deteremined by look at the average changes in volume in a day and consequently price, for example.