# Imports

Importing all required modules.

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm

import core.information_bars.bars as cib

# TODO(vr): Use below when Tick data will be in PQ on S3
# import vendors_amp.kibot.data.load as vkdl
import vendors_amp.kibot.data.load.file_path_generator as vkdlf
import vendors_amp.kibot.data.types as vkdt

# Apply two matplotlib style sheets, one after the other, to all figures in this notebook.
plt.style.use("seaborn-talk")
plt.style.use("bmh")

# Constants

Define the constants used throughout this notebook.

In [None]:
# Figure size passed as `figsize` to every `plt.subplots` call in this notebook.
PLOT_FIGURE_SIZE = (10, 7)

# Thresholds passed as `threshold` to the corresponding `cib.get_*_bars` builders.
DOLLAR_VOLUME_M = 100
VOLUME_M = 100
TICK_M = 100

# Download

In [None]:
# TODO(vr): Use below when Tick data will be in PQ on S3
# downloader = vkdl.KibotDataLoader()
# source_df = downloader.read_data(
#     symbol='TT',
#     asset_class=vkdt.AssetClass.Futures,
#     frequency=vkdt.Frequency.Tick,
#     contract_type=vkdt.ContractType.Continuous,
#     nrows=1000
# )
# source_df

# Build the path of the Kibot CSV holding tick data for the continuous "TT"
# futures contract.
path_generator = vkdlf.FilePathGenerator()
remote_path = path_generator.generate_file_path(
    symbol="TT",
    asset_class=vkdt.AssetClass.Futures,
    frequency=vkdt.Frequency.Tick,
    contract_type=vkdt.ContractType.Continuous,
    ext=vkdt.Extension.CSV,
)
# Load only the first 1000 rows to keep the notebook fast. The column names are
# supplied here via `names`, and the "date" and "time" columns are parsed
# together into the "date_time" column that later cells use as the index.
source_df = pd.read_csv(
    remote_path,
    header=0,
    nrows=1000,
    parse_dates=[["date", "time"]],
    names=["date", "time", "price", "volume"],
)
source_df

In [None]:
# Time-indexed copy of the raw ticks; `source_df` itself is left untouched.
df = source_df.copy().set_index("date_time")

# Bars

## Tick Bars

In [None]:
# Build tick bars from the raw ticks and index them by "date_time".
tick_df = cib.get_tick_bars(source_df, threshold=TICK_M).set_index("date_time")
n_ticks = len(tick_df)
# Average cumulative buy volume / dollar value per bar row, rounded.
volume_ratio = (tick_df["cum_buy_volume"].sum() / n_ticks).round()
dollar_ratio = (tick_df["cum_dollar_value"].sum() / n_ticks).round()
print(f"num ticks: {n_ticks:,}")
print(f"volume ratio: {volume_ratio}")
print(f"dollar ratio: {dollar_ratio}")
tick_df

## Volume Bars

In [None]:
# Build volume bars from the raw ticks and index them by "date_time".
v_bar_df = cib.get_volume_bars(source_df, threshold=VOLUME_M).set_index(
    "date_time"
)
v_bar_df

## Dollar Bars

In [None]:
# Build dollar bars from the raw ticks and index them by "date_time".
dv_bar_df = cib.get_dollar_bars(
    source_df, threshold=DOLLAR_VOLUME_M
).set_index("date_time")
dv_bar_df

# Analyzing the Bars

## Count Quantity of Bars By Each Bar Type (Per Second)

Compare series. Scale them so that we compare "apples" to "apples".

In [None]:
def count_bars(df, price_col="cum_dollar_value", freq="s"):
    """
    Count the non-null observations of a column per resampling period.

    :param df: frame with a datetime-like index
    :param price_col: column whose non-null values are counted
    :param freq: resampling frequency passed to `DataFrame.resample`; the
        default "s" keeps the original one-second buckets
    :return: series of counts, one entry per period
    """
    return df.resample(freq)[price_col].count()


def scale(s):
    """
    Min-max scale a series so that its values lie in [0, 1].

    :param s: numeric series to scale
    :return: `(s - min) / (max - min)`; for a constant series, whose range is
        zero, a series of zeros is returned instead of NaNs from 0 / 0
    """
    lowest = s.min()
    value_range = s.max() - lowest
    if value_range == 0:
        # Every value equals the minimum, so the shifted series is all zeros.
        return (s - lowest).astype(float)
    return (s - lowest) / value_range

In [None]:
# Scaled bar counts for the tick, volume and dollar bars, in that order.
tc, vc, dc = (
    scale(count_bars(bars)) for bars in (tick_df, v_bar_df, dv_bar_df)
)
# The raw ticks are counted on their "price" column.
dfc = scale(count_bars(df, price_col="price"))

In [None]:
f, ax = plt.subplots(figsize=PLOT_FIGURE_SIZE)
# Draw the three scaled count series on the same axes, one line style each.
for _counts, _ls, _label in (
    (tc, "-", "tick count"),
    (vc, "--", "volume count"),
    (dc, "-.", "dollar count"),
):
    _counts.plot(ax=ax, ls=_ls, label=_label)
ax.set_title("scaled bar counts")
ax.legend()

## Which Bar Type Has Most Stable Counts?

In [None]:
bar_types = ["tick", "volume", "dollar", "df"]
# Standard deviation of each scaled count series, in the order of `bar_types`.
bar_std = [scaled.std() for scaled in (tc, vc, dc, dfc)]
counts = pd.Series(data=bar_std, index=bar_types)
counts.sort_values()

## Which Bar Type Has the Lowest Serial Correlation?

In [None]:
def returns(s):
    """
    Compute log returns, i.e. the first difference of the log of the series.

    :param s: series of positive values
    :return: unnamed series with one element fewer than `s`, indexed by
        `s.index[1:]`
    """
    log_values = np.asarray(np.log(s))
    log_diffs = log_values[1:] - log_values[:-1]
    return pd.Series(log_diffs, index=s.index[1:])

In [None]:
# Log returns of "cum_dollar_value" for the tick, volume and dollar bars.
tr, vr, dr = (
    returns(bars["cum_dollar_value"]) for bars in (tick_df, v_bar_df, dv_bar_df)
)
# Log returns of the raw tick prices.
df_ret = returns(df["price"])

# Same order as `bar_types`.
bar_returns = [tr, vr, dr, df_ret]

In [None]:
def get_test_stats(bar_types, bar_returns, test_func, *args, **kwargs):
    """
    Apply a statistic to every return series and tabulate the results.

    :param bar_types: labels of the series, used as the index of the result
    :param bar_returns: series to evaluate, aligned with `bar_types`
    :param test_func: callable invoked as `test_func(series, *args, **kwargs)`
    :return: frame with one row per bar type and two columns: "sample_size"
        and "<test_func.__name__>_stat"
    """
    stat_label = f"{test_func.__name__}_stat"
    per_type = {}
    for bar_type, ret in zip(bar_types, bar_returns):
        per_type[bar_type] = (
            int(ret.shape[0]),
            test_func(ret, *args, **kwargs),
        )
    # `from_dict` puts the bar types in columns; label the two rows and flip.
    table = pd.DataFrame.from_dict(per_type)
    table = table.rename(index={0: "sample_size", 1: stat_label})
    return table.T


# Autocorrelation of each return series, using `pd.Series.autocorr` with its
# default arguments.
autocorrs = get_test_stats(bar_types, bar_returns, pd.Series.autocorr)

In [None]:
# Bar types ordered by their signed autocorrelation statistic.
autocorrs.sort_values("autocorr_stat")

In [None]:
# Same ordering by magnitude. `abs()` is applied to the whole frame, which
# leaves the non-negative "sample_size" column unchanged.
autocorrs.abs().sort_values("autocorr_stat")

In [None]:
def plot_autocorr(bar_types, bar_returns):
    """
    Plot the autocorrelation function of each return series, one subplot per
    bar type.

    Every series is plotted over the same lag range, which is limited by the
    shortest series in `bar_returns`.

    :param bar_types: labels used in the subplot titles, aligned with
        `bar_returns`
    :param bar_returns: return series to plot, one per bar type
    """
    # `squeeze=False` always returns a 2D array of axes, so the indexing below
    # also works when a single bar type is passed (a lone `Axes` object is not
    # subscriptable).
    _, axes = plt.subplots(
        len(bar_types), figsize=PLOT_FIGURE_SIZE, squeeze=False
    )
    min_lags = min(map(len, bar_returns))
    for ax, ret, typ in zip(axes.ravel(), bar_returns, bar_types):
        # NOTE(review): newer statsmodels releases rename `unbiased` to
        # `adjusted` - confirm against the pinned version before upgrading.
        sm.graphics.tsa.plot_acf(
            ret,
            lags=min_lags - 1,
            ax=ax,
            alpha=0.05,
            unbiased=True,
            fft=True,
            zero=False,
            title=f"{typ} AutoCorr",
        )
    plt.tight_layout()


def plot_hist(bar_types, bar_returns):
    """
    Plot a histogram of each return series on a log-scaled y axis, one subplot
    per bar type.

    :param bar_types: labels used in the legends, aligned with `bar_returns`
    :param bar_returns: return series to plot, one per bar type
    """
    # `squeeze=False` always returns a 2D array of axes, so the indexing below
    # also works when a single bar type is passed (a lone `Axes` object is not
    # subscriptable).
    _, axes = plt.subplots(
        len(bar_types), figsize=PLOT_FIGURE_SIZE, squeeze=False
    )
    for ax, ret, typ in zip(axes.ravel(), bar_returns, bar_types):
        # NOTE(review): `sns.distplot` is deprecated in recent seaborn releases
        # in favour of `histplot` - confirm the pinned version before migrating.
        g = sns.distplot(ret, ax=ax, kde=False, label=typ)
        g.set(yscale="log")
        ax.legend()
    plt.tight_layout()

In [None]:
# Autocorrelation plots for every series in `bar_returns`.
plot_autocorr(bar_types, bar_returns)

In [None]:
# Log-scale histograms for every series in `bar_returns`.
plot_hist(bar_types, bar_returns)

## Partition Bar Series into Monthly Buckets, Compute Variance of Returns, and Variance of Variance

In [None]:
def partition_monthly(s):
    """
    Compute the variance of a series within each calendar month.

    :param s: series with a datetime-like index
    :return: series of per-month variances, labelled by month end
    """
    # An explicit month-end offset is the same rule as the "1M" alias, but it
    # does not depend on the alias spelling, which newer pandas releases
    # deprecate in favour of "ME".
    return s.resample(pd.offsets.MonthEnd()).var()

In [None]:
# Monthly return variances for each series, in the order of `bar_types`.
tr_rs, vr_rs, dr_rs, df_ret_rs = map(partition_monthly, (tr, vr, dr, df_ret))
monthly_vars = [tr_rs, vr_rs, dr_rs, df_ret_rs]

In [None]:
# Variance of the monthly variances for each bar type, ordered by that value.
get_test_stats(bar_types, monthly_vars, np.var).sort_values("var_stat")

## Compute Jarque-Bera Test, Which Has The Lowest Test Statistic?

In [None]:
def jb(x, test=True):
    """
    Run the Jarque-Bera test on `x` and return one element of its result.

    :param x: sample to test
    :param test: if truthy, return the first element of the result (the test
        statistic); otherwise return the second element (the p-value)
    """
    # Re-seeds NumPy's global RNG on every call.
    # NOTE(review): the test itself appears deterministic - confirm whether
    # this seeding is still needed.
    np.random.seed(12345678)
    result = stats.jarque_bera(x)
    return result[0] if test else result[1]


# Jarque-Bera statistic of each return series, ordered by that statistic.
get_test_stats(bar_types, bar_returns, jb).sort_values("jb_stat")

## Compute Shapiro-Wilk Test

For the Shapiro-Wilk test statistic, larger is better (closer to normality).

In [None]:
def shapiro(x, test=True):
    """
    Run the Shapiro-Wilk test on `x` and return one element of its result.

    :param x: sample to test
    :param test: if truthy, return the first element of the result (the test
        statistic); otherwise return the second element (the p-value)
    """
    # Re-seeds NumPy's global RNG on every call.
    # NOTE(review): the test itself appears deterministic - confirm whether
    # this seeding is still needed.
    np.random.seed(12345678)
    result = stats.shapiro(x)
    return result[0] if test else result[1]


# Shapiro-Wilk statistic of each return series; the sorted frame is reversed
# so that the largest statistic comes first.
get_test_stats(bar_types, bar_returns, shapiro).sort_values("shapiro_stat")[::-1]