In [1]:
import os
import sys
sys.path.append("..")
import pytz
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display, clear_output
from tqdm import tqdm
from scipy.stats import median_abs_deviation

from src.universe import Universe

In [2]:
# Folder holding the raw 1-minute parquet files (only referenced by the
# commented-out Universe loading cell in this notebook).
DATA_FOLDER = "../data/parquet/1min"
# Only symbols whose last "-"-separated component is in this list are kept
# when the data is split per symbol (e.g. "AAVE-USDT" is kept).
ACCEPTED_CURRENCY_BASE = ["USDT", "USDC"]
# Multiplier applied to the median absolute deviation (MAD) when removing
# outlier returns. Common choices are 3 (conservative), 2.5 (moderately
# conservative) and 2 (poorly conservative); a larger value removes fewer points.
MAD_THRESHOLD = 3 # commonly 3, 2.5 or 2, where 3 is conservative and 2 is poorly conservative

Load all data

In [3]:
# Disabled on purpose: loading every symbol from the raw 1-minute parquet files
# is slow (the captured run below took ~34 minutes for 1254 symbols).
# Uncomment only when the aggregate parquet file needs to be rebuilt.
# kc_universe = Universe(
#     name="Kucoin-1min",
#     data_source_type="parquet",
#     data_source_url=DATA_FOLDER,
#     start_datetime="2018-01-01 00:00:00",
#     end_datetime="2022-06-19 00:00:00",
#     raw_interval="1min",
#     resample_interval="30min",
#     datetime_format="datetime"
# )
# kc_universe.load()

Loading parquet data for ZRX-ETH: 100%|████████████████████████████████████████████| 1254/1254 [34:02<00:00,  1.63s/it]


In [4]:
# Disabled on purpose: one-off step that stacks every symbol's frame into a
# single long table (tagged with a "symbol" column) and writes the aggregate
# parquet file that the next cell reads. Requires kc_universe from the
# (also disabled) loading cell above.
# NOTE(review): if this is ever re-enabled, collect the frames in a list and
# call pd.concat once; concatenating inside the loop copies the growing frame
# on every iteration. "rearragned_columns" is a misspelling of "rearranged".
# merged = pd.DataFrame()
# for symbol in tqdm(kc_universe.data):
#     current_symbol_data = kc_universe.data[symbol]
#     rearragned_columns = ["symbol", "o", "h", "l", "c", "v", "log_o", "log_h", "log_l", "log_c", "p", "r", "d", "s", "log_r"]
#     current_symbol_data["symbol"] = symbol
#     current_symbol_data = current_symbol_data[rearragned_columns]
#     merged = pd.concat([merged, current_symbol_data])
# merged.to_parquet("../data/parquet/aggregate/universe_30min_2018-01-01_2022-05-31.parquet")

In [3]:
# Read the pre-built aggregate table (all symbols stacked, one "symbol" column)
# instead of re-running the slow per-symbol load.
aggregate_parquet_path = "../data/parquet/aggregate/universe_30min_2018-01-01_2022-05-31.parquet"
kc_parquet = pd.read_parquet(aggregate_parquet_path)

In [4]:
# Per-symbol descriptive statistics; columns form a (field, statistic) MultiIndex.
summary_stats = kc_parquet.groupby("symbol").describe()

# Step 1: keep symbols with at least the median number of volume observations,
# so every remaining symbol has a reasonable amount of history.
volume_observations = summary_stats[("v", "count")]
enough_history = volume_observations >= volume_observations.median()
summary_stats = summary_stats.loc[enough_history]

# Step 2: keep symbols whose (median return * median volume) is at least the
# median of that product across the remaining symbols. Volume is used as a rough
# proxy for liquidity and the return as a rough proxy for how much the price moves.
return_volume_score = summary_stats[("r", "50%")] * summary_stats[("v", "50%")]
score_cutoff = return_volume_score.median()
summary_stats = summary_stats.loc[return_volume_score >= score_cutoff]

In [5]:
# Restrict the full table to the symbols that survived the summary-stat filters.
selected_symbols = summary_stats.index.tolist()
is_selected_symbol = kc_parquet["symbol"].isin(selected_symbols)
kc_parquet_filtered = kc_parquet.loc[is_selected_symbol]

In [6]:
# Split the filtered table into one index-sorted DataFrame per symbol, keeping
# only symbols whose last "-"-separated component is an accepted currency.
#
# A single groupby pass replaces the previous approach of building a full-frame
# boolean mask for every symbol, which rescanned the whole table once per symbol.
# sort=False keeps symbols in order of first appearance (same order as
# .unique()); observed=True avoids empty groups if "symbol" is categorical.
kc_parquet_filtered_separated = {}
symbol_groups = kc_parquet_filtered.groupby("symbol", sort=False, observed=True)
for symbol, symbol_df in tqdm(symbol_groups):
    # Check the currency first so rejected symbols cost no further work
    if symbol.split("-")[-1] not in ACCEPTED_CURRENCY_BASE:
        continue
    # The symbol is already the dict key, so the column is redundant
    kc_parquet_filtered_separated[symbol] = symbol_df.drop(columns=["symbol"]).sort_index()

100%|████████████████████████████████████████████████████████████████████████████████| 616/616 [05:10<00:00,  1.99it/s]


In [7]:
# --- Symbol pickers: both list every available symbol and start unselected ---
available_symbols = list(kc_parquet_filtered_separated)
symbol_1_dropdown = widgets.Dropdown(options=available_symbols, value=None, description='Symbol 1')
symbol_2_dropdown = widgets.Dropdown(options=list(available_symbols), value=None, description='Symbol 2')
symbols_dropdowns = widgets.HBox([symbol_1_dropdown, symbol_2_dropdown])

# --- Date range, entered as "YYYY-MM-DD HH:MM:SS" text ---
# The end of the range defaults to the moment this cell is executed.
default_end_text = datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S")
start_datetime_string = widgets.Text(
    value="2018-01-01 00:00:00",
    placeholder="Start datetime",
    description='Select start datetime',
    disabled=False,
)
end_datetime_string = widgets.Text(
    value=default_end_text,
    placeholder="End datetime",
    description='Select end datetime',
    disabled=False,
)
datetime_strings = widgets.HBox([start_datetime_string, end_datetime_string])

# --- Trigger button and the output area the figure is rendered into ---
plot_scatterplot_button = widgets.Button(description="Plot")
plot_output = widgets.Output()

def _drop_mad_outliers(returns, threshold):
    """Return a copy of ``returns`` without rows that contain an outlier.

    A value is an outlier when it lies more than ``threshold`` median absolute
    deviations (MAD) away from its column's median. The median and MAD of every
    column are computed on the unfiltered input, and a row is kept only if all
    of its columns are within bounds (bounds are inclusive).
    """
    keep = np.ones(len(returns), dtype=bool)
    for column in returns.columns:
        center = returns[column].median()
        spread = median_abs_deviation(returns[column]) * threshold
        keep &= returns[column].between(center - spread, center + spread).to_numpy()
    # .copy() so the caller can add columns without SettingWithCopyWarning
    return returns.loc[keep].copy()


def on_click(click):
    """Plot the log returns of the two selected symbols against each other.

    Reads the current symbol and datetime widget values, aligns both symbols'
    "log_r" series on their index, restricts them to the requested datetime
    range, removes MAD outliers (see ``MAD_THRESHOLD``) and draws two scatter
    plots into ``plot_output``: symbol 2 vs. symbol 1 log returns, and the
    ratio symbol 2 / symbol 1 by row position.

    Parameters
    ----------
    click : object
        Argument passed by the button's click callback; not used.
    """
    datetime_format = "%Y-%m-%d %H:%M:%S"
    # Extract symbol names
    symbol_1 = symbol_1_dropdown.value
    symbol_2 = symbol_2_dropdown.value

    # Everything runs inside the output widget so that messages (and any
    # unexpected traceback) are shown to the user next to the controls.
    with plot_output:
        plot_output.clear_output(wait=True)

        # The dropdowns start with no selection
        if symbol_1 is None or symbol_2 is None:
            print("Select both Symbol 1 and Symbol 2 before plotting.")
            return
        # Identical symbols would create two columns with the same name
        if symbol_1 == symbol_2:
            print("Select two different symbols.")
            return
        try:
            start_datetime = datetime.datetime.strptime(start_datetime_string.value, datetime_format)
            end_datetime = datetime.datetime.strptime(end_datetime_string.value, datetime_format)
        except ValueError as error:
            print(f"Invalid datetime (expected YYYY-MM-DD HH:MM:SS): {error}")
            return

        # Log returns of each symbol, skipping the first row of each series
        returns_1 = kc_parquet_filtered_separated[symbol_1]["log_r"].iloc[1:]
        returns_2 = kc_parquet_filtered_separated[symbol_2]["log_r"].iloc[1:]
        # Align on the index and drop rows where either symbol has no value
        joined_returns = pd.concat([returns_1, returns_2], axis=1).dropna()
        joined_returns.columns = [symbol_1, symbol_2]
        # Filter data by the indicated start and end datetime (both inclusive)
        in_range = (joined_returns.index >= start_datetime) & (joined_returns.index <= end_datetime)
        joined_returns = joined_returns.loc[in_range]
        if joined_returns.empty:
            print("No overlapping data for the selected symbols and datetime range.")
            return

        # Remove anomalies via the MAD statistic. Rows are filtered with a
        # boolean mask rather than chained assignment
        # (frame[col].loc[mask] = None), which may write to a temporary copy
        # and leave the frame unchanged.
        joined_returns = _drop_mad_outliers(joined_returns, MAD_THRESHOLD)

        # A zero return for symbol 1 gives an infinite ratio; mark it as
        # missing so it cannot distort the ratio plot.
        ratio = joined_returns[symbol_2] / joined_returns[symbol_1]
        joined_returns["ratio"] = ratio.replace([np.inf, -np.inf], np.nan)

        fig, ax = plt.subplots(2, 1, figsize=(20, 20))
        sns.scatterplot(x=joined_returns[symbol_1], y=joined_returns[symbol_2], ax=ax[0])
        ax[0].set_title("Log returns plot")
        sns.scatterplot(x=range(len(joined_returns)), y=joined_returns["ratio"], ax=ax[1])
        ax[1].set_title("Log returns ratio plot")
        plt.show()

# Hook the callback up to the button, then render the controls followed by the
# output area that the callback draws into.
plot_scatterplot_button.on_click(on_click)
display(symbols_dropdowns, datetime_strings, plot_scatterplot_button, plot_output)

HBox(children=(Dropdown(description='Symbol 1', options=('1INCH-USDT', 'AAVE-USDT', 'ABBC-USDT', 'ACE-USDT', '…

HBox(children=(Text(value='2018-01-01 00:00:00', description='Select start datetime', placeholder='Start datet…

Button(description='Plot', style=ButtonStyle())

Output()

# Archived

In [11]:
# Archived prototype of the interactive plot above, kept for reference only.
# NOTE: running this cell rebinds symbol_1_dropdown, symbol_2_dropdown,
# start_datetime_string, end_datetime_string and plot_output, which on_click
# reads at call time. The datetime widgets here are DatetimePicker instances,
# not the Text widgets on_click parses, so re-run the active widget cell
# afterwards to restore the working controls.

# Symbol selectors
symbol_1_dropdown = widgets.Dropdown(
    options=list(kc_parquet_filtered_separated.keys()),
    value=None,
    description='Symbol 1',
)
symbol_2_dropdown = widgets.Dropdown(
    options=list(kc_parquet_filtered_separated.keys()),
    value=None,
    description='Symbol 2',
)
symbols_dropdowns = widgets.HBox([symbol_1_dropdown, symbol_2_dropdown])
# Datetime selectors
start_datetime_string = widgets.DatetimePicker(
    description='Input start datetime',
    value=None,  # a stray leading "a" on this line previously made the cell a SyntaxError
    disabled=False
)
end_datetime_string = widgets.DatetimePicker(
    description='Input end datetime',
    value=None,
    disabled=False
)
datetime_strings = widgets.HBox([start_datetime_string, end_datetime_string])
# Plot buttons
plot_scatterplot_button = widgets.Button(description="Plot")
plot_output = widgets.Output()

symbol_1 = symbol_1_dropdown.value
symbol_2 = symbol_2_dropdown.value
# The dropdowns are created with value=None, so indexing the per-symbol dict
# without this guard raises a KeyError.
if symbol_1 is None or symbol_2 is None:
    print("Select both symbols before plotting.")
else:
    # Log returns of each symbol, skipping the first row of each series
    returns_1 = kc_parquet_filtered_separated[symbol_1]["log_r"].iloc[1:]
    returns_2 = kc_parquet_filtered_separated[symbol_2]["log_r"].iloc[1:]
    # Align on the index and drop rows where either symbol has no value
    joined_returns = pd.concat([returns_1, returns_2], axis=1).dropna()
    joined_returns.columns = [symbol_1, symbol_2]
    plt.figure(figsize=(12, 10))
    plt.title("Log Returns Plot")
    plot_output.clear_output()
    sns.scatterplot(x=joined_returns[symbol_1], y=joined_returns[symbol_2])
    plt.show()

HBox(children=(Dropdown(description='Symbol 1', options=('1INCH-USDT', 'AAVE-KCS', 'AAVE-USDT', 'ABBC-USDT', '…

HBox(children=(Text(value='', description='Select start datetime', placeholder='Start datetime'), Text(value='…

Button(description='Plot', style=ButtonStyle())

Output()