### TODO
Add VAR exogenous - OFI, other models 

In [None]:
# import libraries
import pandas as pd
pd.options.display.max_columns = None 

import numpy as np
import math
import datetime as dt
from datetime import timedelta

# charting libraries
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("seaborn-notebook")

# Import Statsmodels
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from statsmodels.tools.eval_measures import rmse, aic

import warnings
warnings.filterwarnings('ignore')

In [None]:
#################
# Input files and resampling grid
# NOTE(review): this header used to read "Load from S3", but every path below
# is a local Google Drive mount path — confirm which source is intended.
#################
# Bin width handed to DataFrame.resample() in the per-pair cells.
resample_period = "10ms"
# NOTE(review): this path uses "data/echo" while the other three use
# "Data/echo"; they only resolve to the same folder on a case-insensitive
# filesystem — confirm.
usdcad_file = "/Volumes/GoogleDrive/Shared drives/data/echo/think_usdcad_20200921_quotes.csv"
gbpusd_file = "/Volumes/GoogleDrive/Shared drives/Data/echo/think_gbpusd_20200921_quotes.csv"
usdjpy_file = "/Volumes/GoogleDrive/Shared drives/Data/echo/think_usdjpy_20200921_quotes.csv"
eurusd_file = "/Volumes/GoogleDrive/Shared drives/Data/echo/think_eurusd_20200921_quotes.csv"

In [None]:
# Build the USDCAD feature frame: level-0 bid/offer sampled onto a regular
# `resample_period` grid, mid returns, the sweep signal and the relative spread.
usdcad_df = pd.read_csv(usdcad_file, index_col='t', parse_dates=['t'])
usdcad = usdcad_df[["Bid0", "Offer0"]]
del usdcad_df  # only the two quote columns are needed; drop the full frame
usdcad = usdcad.rename(columns={"Bid0": "usdcad_bid0", "Offer0": "usdcad_offer0"})

# Put the PRICES on the regular grid first and only then compute returns.
# Computing pct_change() on raw ticks and forward-filling it afterwards copies
# the last tick return into every empty bin, which inflates the rolling sums
# (and any PnL built from the return column).  Forward-filled prices give a
# zero return in bins with no new quote, which is what we want.
usdcad = usdcad.resample(resample_period).last().ffill()
usdcad["usdcad_mid"] = (usdcad["usdcad_bid0"] + usdcad["usdcad_offer0"]) / 2
usdcad["usdcad_mid_change"] = usdcad["usdcad_mid"].pct_change()

# sweep model
# NOTE: the columns are named "3_sum" but the window is 10 bins; the names are
# kept because later cells look them up.
usdcad["usdcad_3_sum"] = usdcad["usdcad_mid_change"].rolling(10).sum()
usdcad["usdcad_3_sum_stdev"] = usdcad["usdcad_3_sum"].rolling(500).std()

# define the up and down sweeps: the windowed return sum is at least two
# rolling standard deviations above / below zero.  While the rolling windows
# are still warming up the comparisons are against NaN and yield 0.
usdcad["usdcad_sweep_up_signal"] = np.where(
    usdcad["usdcad_3_sum"] >= 2 * usdcad["usdcad_3_sum_stdev"], 1, 0)
usdcad["usdcad_sweep_down_signal"] = np.where(
    usdcad["usdcad_3_sum"] <= -2 * usdcad["usdcad_3_sum_stdev"], -1, 0)
usdcad["usdcad_sweep_signal"] = usdcad["usdcad_sweep_up_signal"] + usdcad["usdcad_sweep_down_signal"]

# lag the signal by one bin so row t carries the sweep detected at row t-1
usdcad["usdcad_sweep_signal"] = usdcad["usdcad_sweep_signal"].shift(1)
# generate spread, expressed as a fraction of the mid
usdcad["usdcad_spread"] = (usdcad["usdcad_offer0"] - usdcad["usdcad_bid0"]) / usdcad["usdcad_mid"]


In [None]:
# Build the EURUSD feature frame: level-0 bid/offer sampled onto a regular
# `resample_period` grid, mid returns, the sweep signal and the relative spread.
eurusd_df = pd.read_csv(eurusd_file, index_col='t', parse_dates=['t'])
eurusd = eurusd_df[["Bid0", "Offer0"]]
del eurusd_df  # only the two quote columns are needed; drop the full frame
eurusd = eurusd.rename(columns={"Bid0": "eurusd_bid0", "Offer0": "eurusd_offer0"})

# Put the PRICES on the regular grid first and only then compute returns.
# Computing pct_change() on raw ticks and forward-filling it afterwards copies
# the last tick return into every empty bin, which inflates the rolling sums
# (and any PnL built from the return column).  Forward-filled prices give a
# zero return in bins with no new quote, which is what we want.
eurusd = eurusd.resample(resample_period).last().ffill()
eurusd["eurusd_mid"] = (eurusd["eurusd_bid0"] + eurusd["eurusd_offer0"]) / 2
eurusd["eurusd_mid_change"] = eurusd["eurusd_mid"].pct_change()

# sweep model
# NOTE: the columns are named "3_sum" but the window is 10 bins; the names are
# kept because later cells look them up.
eurusd["eurusd_3_sum"] = eurusd["eurusd_mid_change"].rolling(10).sum()
eurusd["eurusd_3_sum_stdev"] = eurusd["eurusd_3_sum"].rolling(500).std()

# define the up and down sweeps: the windowed return sum is at least two
# rolling standard deviations above / below zero.  While the rolling windows
# are still warming up the comparisons are against NaN and yield 0.
eurusd["eurusd_sweep_up_signal"] = np.where(
    eurusd["eurusd_3_sum"] >= 2 * eurusd["eurusd_3_sum_stdev"], 1, 0)
eurusd["eurusd_sweep_down_signal"] = np.where(
    eurusd["eurusd_3_sum"] <= -2 * eurusd["eurusd_3_sum_stdev"], -1, 0)
eurusd["eurusd_sweep_signal"] = eurusd["eurusd_sweep_up_signal"] + eurusd["eurusd_sweep_down_signal"]

# lag the signal by one bin so row t carries the sweep detected at row t-1
eurusd["eurusd_sweep_signal"] = eurusd["eurusd_sweep_signal"].shift(1)
# generate spread, expressed as a fraction of the mid
eurusd["eurusd_spread"] = (eurusd["eurusd_offer0"] - eurusd["eurusd_bid0"]) / eurusd["eurusd_mid"]


In [None]:
# Build the USDJPY feature frame: level-0 bid/offer sampled onto a regular
# `resample_period` grid, mid returns, the sweep signal and the relative spread.
usdjpy_df = pd.read_csv(usdjpy_file, index_col='t', parse_dates=['t'])
usdjpy = usdjpy_df[["Bid0", "Offer0"]]
del usdjpy_df  # only the two quote columns are needed; drop the full frame
usdjpy = usdjpy.rename(columns={"Bid0": "usdjpy_bid0", "Offer0": "usdjpy_offer0"})

# Put the PRICES on the regular grid first and only then compute returns.
# Computing pct_change() on raw ticks and forward-filling it afterwards copies
# the last tick return into every empty bin, which inflates the rolling sums
# (and any PnL built from the return column).  Forward-filled prices give a
# zero return in bins with no new quote, which is what we want.
usdjpy = usdjpy.resample(resample_period).last().ffill()
usdjpy["usdjpy_mid"] = (usdjpy["usdjpy_bid0"] + usdjpy["usdjpy_offer0"]) / 2
usdjpy["usdjpy_mid_change"] = usdjpy["usdjpy_mid"].pct_change()

# sweep model
# NOTE: the columns are named "3_sum" but the window is 10 bins; the names are
# kept because later cells look them up.
usdjpy["usdjpy_3_sum"] = usdjpy["usdjpy_mid_change"].rolling(10).sum()
usdjpy["usdjpy_3_sum_stdev"] = usdjpy["usdjpy_3_sum"].rolling(500).std()

# define the up and down sweeps: the windowed return sum is at least two
# rolling standard deviations above / below zero.  While the rolling windows
# are still warming up the comparisons are against NaN and yield 0.
usdjpy["usdjpy_sweep_up_signal"] = np.where(
    usdjpy["usdjpy_3_sum"] >= 2 * usdjpy["usdjpy_3_sum_stdev"], 1, 0)
usdjpy["usdjpy_sweep_down_signal"] = np.where(
    usdjpy["usdjpy_3_sum"] <= -2 * usdjpy["usdjpy_3_sum_stdev"], -1, 0)
usdjpy["usdjpy_sweep_signal"] = usdjpy["usdjpy_sweep_up_signal"] + usdjpy["usdjpy_sweep_down_signal"]

# lag the signal by one bin so row t carries the sweep detected at row t-1
usdjpy["usdjpy_sweep_signal"] = usdjpy["usdjpy_sweep_signal"].shift(1)
# generate spread, expressed as a fraction of the mid
usdjpy["usdjpy_spread"] = (usdjpy["usdjpy_offer0"] - usdjpy["usdjpy_bid0"]) / usdjpy["usdjpy_mid"]


In [None]:
# Build the GBPUSD feature frame: level-0 bid/offer sampled onto a regular
# `resample_period` grid, mid returns, the sweep signal and the relative spread.
gbpusd_df = pd.read_csv(gbpusd_file, index_col='t', parse_dates=['t'])
gbpusd = gbpusd_df[["Bid0", "Offer0"]]
del gbpusd_df  # only the two quote columns are needed; drop the full frame
gbpusd = gbpusd.rename(columns={"Bid0": "gbpusd_bid0", "Offer0": "gbpusd_offer0"})

# Put the PRICES on the regular grid first and only then compute returns.
# Computing pct_change() on raw ticks and forward-filling it afterwards copies
# the last tick return into every empty bin, which inflates the rolling sums
# (and any PnL built from the return column).  Forward-filled prices give a
# zero return in bins with no new quote, which is what we want.
gbpusd = gbpusd.resample(resample_period).last().ffill()
gbpusd["gbpusd_mid"] = (gbpusd["gbpusd_bid0"] + gbpusd["gbpusd_offer0"]) / 2
gbpusd["gbpusd_mid_change"] = gbpusd["gbpusd_mid"].pct_change()

# sweep model
# NOTE: the columns are named "3_sum" but the window is 10 bins; the names are
# kept because later cells look them up.
gbpusd["gbpusd_3_sum"] = gbpusd["gbpusd_mid_change"].rolling(10).sum()
gbpusd["gbpusd_3_sum_stdev"] = gbpusd["gbpusd_3_sum"].rolling(500).std()

# define the up and down sweeps: the windowed return sum is at least two
# rolling standard deviations above / below zero.  While the rolling windows
# are still warming up the comparisons are against NaN and yield 0.
gbpusd["gbpusd_sweep_up_signal"] = np.where(
    gbpusd["gbpusd_3_sum"] >= 2 * gbpusd["gbpusd_3_sum_stdev"], 1, 0)
gbpusd["gbpusd_sweep_down_signal"] = np.where(
    gbpusd["gbpusd_3_sum"] <= -2 * gbpusd["gbpusd_3_sum_stdev"], -1, 0)
gbpusd["gbpusd_sweep_signal"] = gbpusd["gbpusd_sweep_up_signal"] + gbpusd["gbpusd_sweep_down_signal"]

# lag the signal by one bin so row t carries the sweep detected at row t-1
gbpusd["gbpusd_sweep_signal"] = gbpusd["gbpusd_sweep_signal"].shift(1)
# generate spread, expressed as a fraction of the mid
gbpusd["gbpusd_spread"] = (gbpusd["gbpusd_offer0"] - gbpusd["gbpusd_bid0"]) / gbpusd["gbpusd_mid"]


In [None]:
# Join the four per-pair frames side by side, aligned on their index.
pair_frames = [usdcad, usdjpy, eurusd, gbpusd]
df_all = pd.concat(pair_frames, axis="columns")

In [None]:
# Columns fed to the VAR: the four mid-return series followed by the four
# lagged sweep signals (column order matters for the fitted model's layout).
model_columns = [
    "usdcad_mid_change",
    "usdjpy_mid_change",
    "gbpusd_mid_change",
    "eurusd_mid_change",
    "usdcad_sweep_signal",
    "usdjpy_sweep_signal",
    "gbpusd_sweep_signal",
    "eurusd_sweep_signal",
]

# Missing values (warm-up rows, non-overlapping timestamps) become 0.
df_mid_changes = df_all[model_columns].fillna(0)

# Hold out the final n_obs rows as the test window; everything before trains.
n_obs = 3500000
df_train = df_mid_changes.iloc[:-n_obs]
df_test = df_mid_changes.iloc[-n_obs:]


In [None]:
# define the model
# statsmodels VAR over every column of the training frame.
model = VAR(df_train)

In [None]:
# look at lags method 1
# Fit lag orders 0..4 and print each fit's information criteria; the order
# with the lowest AIC is the candidate.
for i in range(5):
    result = model.fit(i)
    print('Lag Order =', i)
    # Criteria are read lazily, one per print, in AIC / BIC / FPE order.
    for label, criterion in (('AIC : ', 'aic'), ('BIC : ', 'bic'), ('FPE : ', 'fpe')):
        print(label, getattr(result, criterion))
    print('HQIC: ', result.hqic, '\n')

In [None]:
# look at lags method 2
# Ask statsmodels to evaluate lag orders up to maxlags=10; the summary table is
# shown as the cell output.
x = model.select_order(maxlags=10)
x.summary()

In [None]:
# fit model
# Fit with 10 lags (the maximum order examined by select_order above) and show
# the fit summary as the cell output.
model_fitted = model.fit(10)
model_fitted.summary()

In [None]:
# Copy the fitted model's parameter covariance (cov_params()) to the system
# clipboard, e.g. for pasting into a spreadsheet.
model_fitted.cov_params().to_clipboard()

In [None]:
# Get the lag order
lag_order = model_fitted.k_ar
# NOTE(review): this line used to be annotated "#> 4"; the model is fitted with
# model.fit(10), so 10 is presumably what prints now — confirm.
print(lag_order)



In [None]:
# Input data for forecasting: the last `lag_order` observations of the TRAINING
# sample.  The forecast below is indexed on the held-out window
# (df_mid_changes.index[-n_obs:]), so it must be seeded from the rows that
# immediately precede that window.  Seeding from the tail of the full data set
# would use the final rows of the test period itself.
forecast_input = df_train.values[-lag_order:]


In [None]:
# Forecast
# One multi-step forecast of n_obs steps from forecast_input, i.e. a single
# path covering the whole held-out window rather than a rolling one-step forecast.
fc = model_fitted.forecast(y=forecast_input, steps=n_obs)
# Label the forecast with the last n_obs timestamps and suffix every column
# name with "_f" so it can sit next to the realised series.
df_forecast = pd.DataFrame(fc, index=df_mid_changes.index[-n_obs:], columns=df_mid_changes.columns + '_f')


In [None]:
# Forecast columns ("*_f") next to the realised columns.  df_forecast only
# covers the last n_obs timestamps, so its columns are NaN on earlier rows.
df_results = pd.concat([df_forecast, df_mid_changes], axis=1)

In [None]:
# Eyeball the last five rows of forecast vs realised values.
df_results.tail(5)

In [None]:
# Per-pair PnL: trade in the direction (sign) of the forecast return and earn
# the realised return of the same row.
for pair in ("usdcad", "usdjpy", "eurusd", "gbpusd"):
    forecast_direction = np.sign(df_results[f"{pair}_mid_change_f"])
    df_results[f"{pair}_pnl"] = forecast_direction * df_results[f"{pair}_mid_change"]

# Keep the USDCAD forecast direction as its own column.
df_results["usdcad_signal"] = np.sign(df_results["usdcad_mid_change_f"])

In [None]:
# Cumulative EURUSD PnL over rows [-n_obs:-1500000], thinned to the last value
# of each "1T" bin before plotting.
df_results["eurusd_pnl"][-n_obs:-1500000].cumsum().resample("1T").last().plot()
plt.legend();

In [None]:
# Cumulative USDCAD PnL over rows [-n_obs:-1500000], thinned to the last value
# of each "1T" bin before plotting.
df_results["usdcad_pnl"][-n_obs:-1500000].cumsum().resample("1T").last().plot()
plt.legend();

In [None]:
# NOTE(review): df_mid_changes is built from the *_mid_change and
# *_sweep_signal columns only, while "eurusd_sweep_up_pnl" is created on df_all
# in the sweep-model cell near the end of the notebook.  This lookup presumably
# relies on out-of-order execution or was meant to read df_all — confirm.
df_mid_changes["eurusd_sweep_up_pnl"]

In [None]:
# Signal and PnL columns used by the trade statistics below.  Despite the
# variable name, both columns are EURUSD.
# NOTE(review): neither column is among those selected into df_mid_changes;
# they are created on df_all (the PnL only in the sweep-model cell near the
# end of the notebook) — confirm the intended source frame and cell order.
usdcad_results = df_mid_changes[["eurusd_sweep_up_signal", "eurusd_sweep_up_pnl"]]

In [None]:
###############
# Counts the number of ticks between changes in the signal
# Calculates pnl per trade 
###############

# these are the df and column for the signals 
# NOTE(review): signal_df is not referenced again in the visible cells; the
# code below uses usdcad_results directly.
signal_df = usdcad_results
signal_column = 'eurusd_sweep_up_signal'

# the df and column for the pnl
# (pnl_column holds the PnL Series itself, not a column name)
pnl_column = usdcad_results['eurusd_sweep_up_pnl']


def SignalPersisenceFast(df, column_name):
    """Return the length of every run of consecutive identical non-zero signals.

    A run starts when the signal takes a non-zero value that differs from the
    previous row and ends when the value changes (to zero, to NaN, or to a
    different non-zero value).  Zero and NaN rows mean "no signal" and are
    never counted.

    Fixes relative to the earlier loop: a finished run is reported exactly
    once (it used to be re-appended for every following zero row), a run that
    is still open on the last row is included, and leading zero rows no longer
    contribute spurious 0 entries.

    Args:
        df: frame holding the signal column.
        column_name: name of a numeric signal column in ``df``.

    Returns:
        list[int]: run lengths in rows, in order of occurrence; empty when the
        column is empty or never leaves zero.
    """
    values = np.asarray(df[column_name].values, dtype=float)
    if values.size == 0:
        return []

    # NaN (e.g. the first row after a shift) is "no signal", same as 0.
    values = np.where(np.isnan(values), 0.0, values)

    # Positions where the value differs from the previous row mark run starts.
    change_points = np.flatnonzero(values[1:] != values[:-1]) + 1
    run_starts = np.concatenate(([0], change_points))
    run_lengths = np.diff(np.concatenate((run_starts, [values.size])))

    # Keep only the runs whose value is a real (non-zero) signal.
    return run_lengths[values[run_starts] != 0].tolist()

# Run lengths of the signal, then summary statistics derived from them.
Times = SignalPersisenceFast(usdcad_results, signal_column)
mean_time_in_trade = np.mean(Times)
total_pnl = pnl_column.sum()

# NOTE(review): trades are estimated as (non-null rows) / (mean entry of Times);
# that equals an actual trade count only if the signal is never flat — confirm.
number_trades = usdcad_results["eurusd_sweep_up_signal"].count() / mean_time_in_trade
pnl_per_trade = total_pnl / number_trades

print("**time in signal/trade**")
print(f"Mean units of time in trade {mean_time_in_trade:.2f}")
print(f"25 percentile time in trade {np.percentile(Times, 25, axis=0)!s}")
print(f"Median time in trade {np.median(Times)!s}")
print(f"75 percentile time in trade {np.percentile(Times, 75, axis=0)!s}")
print()

print("**performance stats**")
print(f"Number of signals/trades: {number_trades:.0f}")
print(f"Cumulative PnL %: {total_pnl * 100:.2f}")
print(f"Average Trade PnL $ per million {pnl_per_trade * 1000000:.2f}")

In [None]:
# Ad-hoc ratio of 2,000,000 to the estimated trade count.
# NOTE(review): what the 2000000 constant represents is not evident from this
# notebook — confirm.
2000000/number_trades

In [None]:
# Non-null row count for each column of the signal/PnL frame.
usdcad_results.count()

In [None]:
# Last five rows of the model input frame.
df_mid_changes.tail()

In [None]:
# EURUSD sweep model recomputed on the combined frame, keeping the up and down
# legs separate so that each one gets its own PnL column.
df_all["eurusd_3_sum"] = df_all["eurusd_mid_change"].rolling(10).sum()
df_all["eurusd_3_sum_stdev"] = df_all["eurusd_3_sum"].rolling(500).std()

# Sweep band: two rolling standard deviations of the windowed return sum.
sweep_band = 2 * df_all["eurusd_3_sum_stdev"]
swept_up = df_all["eurusd_3_sum"] >= sweep_band
swept_down = df_all["eurusd_3_sum"] <= -sweep_band

# Raw flags: +1 on an up sweep, -1 on a down sweep, 0 otherwise.
df_all["eurusd_sweep_up_signal"] = np.where(swept_up, 1, 0)
df_all["eurusd_sweep_down_signal"] = np.where(swept_down, -1, 0)

# For each leg: lag the flag by one row, then PnL = lagged flag * return of
# the same row (up leg first, then down leg).
for leg in ("up", "down"):
    signal_col = f"eurusd_sweep_{leg}_signal"
    df_all[signal_col] = df_all[signal_col].shift(1)
    df_all[f"eurusd_sweep_{leg}_pnl"] = df_all[signal_col] * df_all["eurusd_mid_change"]

In [None]:
# Cumulative up-sweep PnL over all rows except the last 2,499,000, thinned to
# the last value of each "1T" bin before plotting.
df_all["eurusd_sweep_up_pnl"][0:-2499000].cumsum().resample("1T").last().plot()

In [None]:
# Per-row up-sweep PnL over rows [-n_obs:-3499000]; with n_obs = 3500000 that
# is a 1,000-row window.
df_all["eurusd_sweep_up_pnl"][-n_obs:-3499000].plot()

In [None]:
# GBPUSD relative spread over rows [-n_obs:-3499000]; with n_obs = 3500000
# that is a 1,000-row window.
df_all["gbpusd_spread"][-n_obs:-3499000].plot()

In [None]:
# Show `series` with zeros replaced by NaN (the result is displayed, not stored).
# NOTE(review): `series` is not defined in any visible cell — confirm where it
# comes from.
series.replace(0, np.nan)

In [None]:
# Mean of `series` scaled by 10,000,000.
# NOTE(review): `series` is not defined in any visible cell, and since the
# replace() result above is not assigned, zeros are still included in this
# mean — confirm this is intended.
series.mean()*10000000

In [None]:
# Mean down-sweep PnL over rows with non-zero PnL only (zeros become NaN and
# are skipped by mean()), excluding the last 2,499,000 rows.
df_all["eurusd_sweep_down_pnl"][0:-2499000].replace(0, np.nan).mean()

In [None]:
# Mean up-sweep PnL over rows with non-zero PnL only (zeros become NaN and are
# skipped by mean()), across the full sample.
df_all["eurusd_sweep_up_pnl"].replace(0, np.nan).mean()

### NOTES
It looks like you should widen the price with the spike and then things mean revert 0.5 seconds-plus further out.

Putting the sweep signal in a VAR doesn't work because of all the zeros.