In [1]:
import os
import pandas as pd
import numpy as np
import datetime
import pytz
import tqdm
from matplotlib import pyplot as plt
from statsmodels.tsa.stattools import coint
import statsmodels.api as sm
import multiprocessing

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import ipydatetime
from IPython.display import display

# Global Variables

In [2]:
# Directory containing the per-instrument parquet files; extract_universe reads
# "<symbol>_1min.parquet" from here and get_all_instruments lists its contents.
DATA_DIR = "../data/parquet"

# Helper functions

In [3]:
def get_all_instruments(data_dir):
    """Return the instrument symbols that have a 1-minute parquet file in ``data_dir``.

    Only files named ``<symbol>_1min.parquet`` are considered, which is the
    exact name ``extract_universe`` later opens. The symbol is everything in
    front of that suffix, so symbols that themselves contain an underscore are
    kept intact and unrelated files in the directory are ignored.

    Parameters
    ----------
    data_dir : str
        Directory to scan (not recursive).

    Returns
    -------
    list of str
        Unique symbols in sorted order, so the result does not depend on the
        order in which the filesystem lists the directory.
    """
    suffix = "_1min.parquet"
    symbols = {
        file_name[:-len(suffix)]
        for file_name in os.listdir(data_dir)
        # The length check skips a file that consists of the suffix alone.
        if file_name.endswith(suffix) and len(file_name) > len(suffix)
    }
    return sorted(symbols)

In [4]:
def utc_to_datetime_index_conversion(df):
    """Replace the epoch-timestamp index of ``df`` with datetimes, in place.

    Every index value is passed through ``datetime.datetime.fromtimestamp``
    without a ``tz`` argument, so the resulting datetimes are naive and
    expressed in the local timezone of the machine running the notebook.

    The frame passed in is mutated and the same object is returned.
    """
    converted_index = df.index.map(datetime.datetime.fromtimestamp)
    df.index = converted_index
    return df

In [5]:
def extract_universe(instrument_list, start_time, end_time, period, data_dir=None, max_instruments=100):
    """Load, window, resample and enrich the 1-minute bars of several instruments.

    For every symbol the file ``<data_dir>/<symbol>_1min.parquet`` is read, its
    index is converted with ``utc_to_datetime_index_conversion``, rows outside
    ``[start_time, end_time]`` (both ends inclusive) are dropped and the rest
    is resampled to ``period``.

    Parameters
    ----------
    instrument_list : iterable of str
        Symbols to load, in order.
    start_time, end_time
        Inclusive bounds compared against the datetime index.
    period : str
        Resampling frequency handed to ``DataFrame.resample``.
    data_dir : str, optional
        Directory holding the parquet files. Defaults to the module-level
        ``DATA_DIR``.
    max_instruments : int or None, optional
        Load at most this many symbols from the front of ``instrument_list``.
        Defaults to 100; pass ``None`` to load every symbol.

    Returns
    -------
    dict
        Maps each loaded symbol to its resampled DataFrame with the aggregated
        columns ``o, h, l, c, v, a`` plus the derived columns ``p, r, d, s``.
    """
    rule = {
        "o": "first",
        "h": "max",
        "l": "min",
        "c": "last",
        "v": "sum",
        "a": "sum"
    }
    if data_dir is None:
        data_dir = DATA_DIR
    selected_symbols = list(instrument_list)
    if max_instruments is not None:
        selected_symbols = selected_symbols[:max_instruments]

    frames_by_symbol = {}
    for symbol in tqdm.tqdm(selected_symbols):
        df = pd.read_parquet(os.path.join(data_dir, symbol + "_1min.parquet"))
        df = utc_to_datetime_index_conversion(df)
        df = df.loc[(df.index >= start_time) & (df.index <= end_time)]
        df = df.resample(period).agg(rule)
        # Custom values derived from raw
        # p is the p&l: current closing price minus the previous period's closing price
        df["p"] = df["c"].diff()
        # r is the return as a decimal: (current close - previous close) / previous close
        df["r"] = df["c"].pct_change()
        # d is the spread between the highest and the lowest price of the period
        df["d"] = df["h"] - df["l"]
        # s is the spread between the closing and the opening price of the period
        df["s"] = df["c"] - df["o"]
        frames_by_symbol[symbol] = df
    return frames_by_symbol

# Viz functions

In [6]:
def visualize_single_candlestick(symbol_name, start_datetime, end_datetime):
    """Show an OHLC candlestick chart with a volume bar chart beneath it.

    The data comes from the module-level ``universe`` dict. ``start_datetime``
    and ``end_datetime`` are inclusive bounds; each is formatted with
    ``strftime`` before being compared with the index, and a bound that is
    ``None`` is simply not applied.

    Raises
    ------
    ValueError
        If ``symbol_name`` is not a key of ``universe``.
    """
    if symbol_name not in universe:
        raise ValueError("Invalid symbol! Please ensure requested symbol exist in universe!")

    timestamp_format = "%Y-%m-%d %H:%M:%S"
    window = universe[symbol_name]
    if start_datetime is not None:
        window = window.loc[window.index >= start_datetime.strftime(timestamp_format)]
    if end_datetime is not None:
        window = window.loc[window.index <= end_datetime.strftime(timestamp_format)]

    candles = go.Candlestick(
        x=window.index,
        open=window["o"],
        high=window["h"],
        low=window["l"],
        close=window["c"],
        name="OHLC"
    )
    volume_bars = go.Bar(
        x=window.index,
        y=window["v"],
        showlegend=False,
        name="Volume"
    )

    # Two stacked panels sharing the time axis: prices on top, volume below.
    fig = make_subplots(
        rows=2,
        cols=1,
        shared_xaxes=True,
        vertical_spacing=0.03,
        subplot_titles=('OHLC', 'Volume'),
        row_width=[0.2, 0.7]
    )
    fig.add_trace(candles, row=1, col=1)
    fig.add_trace(volume_bars, row=2, col=1)
    fig.update(layout_xaxis_rangeslider_visible=False)
    fig.show()

In [7]:
def visualize_compare_two_symbols(symbol_1_name, symbol_2_name, start_datetime, end_datetime, attribute_to_viz, mode):
    """Compare one attribute of two symbols as time series and as a scatter plot.

    Both symbols are looked up in the module-level ``universe`` dict, windowed
    to the requested datetime range and joined on their index (rows with any
    missing value are dropped). An ordinary-least-squares fit of symbol 2 on
    symbol 1 is drawn as a dashed trendline over the scatter plot.

    Parameters
    ----------
    symbol_1_name, symbol_2_name : str
        Keys of ``universe``.
    start_datetime, end_datetime : datetime.datetime or None
        Inclusive bounds, formatted with ``strftime`` before the comparison
        with the index. ``None`` leaves that side of the window open.
    attribute_to_viz : str
        Column to compare; it must exist in both frames.
    mode : {"individual", "combined"}
        ``"individual"`` draws each symbol's series in its own panel with the
        scatter underneath; ``"combined"`` overlays both series in one panel
        with the scatter beside it.

    Raises
    ------
    ValueError
        If a symbol is unknown, ``mode`` is not allowed, or the two symbols
        have no common rows inside the requested window.
    """
    ALLOWED_MODES = ("individual", "combined")
    if symbol_1_name not in universe:
        raise ValueError("Invalid symbol_1_name! Please ensure requested symbol exist in universe!")
    if symbol_2_name not in universe:
        raise ValueError("Invalid symbol_2_name! Please ensure requested symbol exist in universe!")
    if mode not in ALLOWED_MODES:
        # The placeholder is required, otherwise the allowed modes never reach the message.
        raise ValueError("Allowed modes are: {}".format(ALLOWED_MODES))

    timestamp_format = "%Y-%m-%d %H:%M:%S"
    df_symbol_1 = universe[symbol_1_name]
    df_symbol_2 = universe[symbol_2_name]
    if start_datetime is not None:
        start_label = start_datetime.strftime(timestamp_format)
        df_symbol_1 = df_symbol_1.loc[df_symbol_1.index >= start_label]
        df_symbol_2 = df_symbol_2.loc[df_symbol_2.index >= start_label]
    if end_datetime is not None:
        end_label = end_datetime.strftime(timestamp_format)
        df_symbol_1 = df_symbol_1.loc[df_symbol_1.index <= end_label]
        df_symbol_2 = df_symbol_2.loc[df_symbol_2.index <= end_label]

    df_joined = df_symbol_1.join(df_symbol_2, lsuffix="_symbol-1", rsuffix="_symbol-2").dropna()
    if df_joined.empty:
        # Fail with a readable message instead of an error from inside the regression.
        raise ValueError("No overlapping data for the selected symbols and datetime range!")

    column_1 = attribute_to_viz + "_symbol-1"
    column_2 = attribute_to_viz + "_symbol-2"
    # Perform the regression manually because this way of plotting in plotly does
    # not use the api that generates a trendline automatically.
    best_fit = sm.OLS(df_joined[column_2], sm.add_constant(df_joined[column_1])).fit().fittedvalues

    pair_label = symbol_1_name + " & " + symbol_2_name
    comparison_label = pair_label + " " + "'" + attribute_to_viz + "'" + " comparison"

    if mode == "individual":
        fig = make_subplots(rows=2, cols=2, shared_yaxes=True,
                            horizontal_spacing=0.03, subplot_titles=(symbol_1_name, symbol_2_name))
        # Symbol 1 line plot
        fig.add_trace(
            go.Scatter(
                x=df_symbol_1.index,
                y=df_symbol_1[attribute_to_viz],
                name=symbol_1_name + " time series",
                line={"color": "#636EFA"}
            ),
            row=1,
            col=1
        )
        # Symbol 2 line plot
        fig.add_trace(
            go.Scatter(
                x=df_symbol_2.index,
                y=df_symbol_2[attribute_to_viz],
                name=symbol_2_name + " time series",
                line={"color": "#EF553B"}
            ),
            row=1,
            col=2
        )
        scatter_row, scatter_col = 2, 1
    else:
        fig = make_subplots(rows=1, cols=2, shared_yaxes=True,
                            horizontal_spacing=0.03, subplot_titles=(pair_label, "Comparison Scatter"))
        # Symbol 1 & 2 combined line plot
        fig.add_trace(
            go.Scatter(
                x=df_joined.index,
                y=df_joined[column_1],
                name=symbol_1_name + " time Series",
                line={"color": "#636EFA"}
            ),
            row=1,
            col=1
        )
        fig.add_trace(
            go.Scatter(
                x=df_joined.index,
                y=df_joined[column_2],
                name=symbol_2_name + " time Series Comparison",
                line={"color": "#EF553B"}
            ),
            row=1,
            col=1
        )
        scatter_row, scatter_col = 1, 2

    # Symbol 1 & 2 scatter plot with its trendline; only the target cell differs between modes.
    fig.add_trace(
        go.Scatter(
            x=df_joined[column_1],
            y=df_joined[column_2],
            mode='markers',
            name=comparison_label + " Scatter",
            line={'color': '#00CC96'}
        ),
        row=scatter_row,
        col=scatter_col
    )
    fig.add_trace(
        go.Scatter(
            x=df_joined[column_1],
            y=best_fit,
            mode='lines',
            name=comparison_label + " trendline",
            line={'dash': 'dash', 'color': '#990099'}
        ),
        row=scatter_row,
        col=scatter_col
    )

    # Update legend position given the size of it
    fig.update_layout(legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.1,
        xanchor="right",
        x=1
    ))
    # Minor config
    fig.update(layout_xaxis_rangeslider_visible=False)
    fig.show()

In [8]:
def visualize_pairs(symbol_1_name, symbol_2_name, start_datetime, end_datetime):
    """Draw a 5x2 grid of pair diagnostics for two symbols.

    Both symbols are looked up in the module-level ``universe`` dict, windowed
    to the requested datetime range and joined on their index (rows with any
    missing value are dropped). Rows 1-3 show closing price, volume and
    returns: both series on the left, a scatter with an OLS trendline on the
    right. Rows 4-5 show the price ratio and the closing price difference: the
    raw series on the left, its z-score with a dashed zero line on the right.

    Parameters
    ----------
    symbol_1_name, symbol_2_name : str
        Keys of ``universe``.
    start_datetime, end_datetime : datetime.datetime or None
        Inclusive bounds, formatted with ``strftime`` before the comparison
        with the index. ``None`` leaves that side of the window open.

    Raises
    ------
    ValueError
        If a symbol is unknown or the two symbols have no common rows inside
        the requested window.
    """
    if symbol_1_name not in universe:
        raise ValueError("Invalid symbol_1_name! Please ensure requested symbol exist in universe!")
    if symbol_2_name not in universe:
        raise ValueError("Invalid symbol_2_name! Please ensure requested symbol exist in universe!")

    timestamp_format = "%Y-%m-%d %H:%M:%S"
    df_symbol_1 = universe[symbol_1_name]
    df_symbol_2 = universe[symbol_2_name]
    if start_datetime is not None:
        start_label = start_datetime.strftime(timestamp_format)
        df_symbol_1 = df_symbol_1.loc[df_symbol_1.index >= start_label]
        df_symbol_2 = df_symbol_2.loc[df_symbol_2.index >= start_label]
    if end_datetime is not None:
        end_label = end_datetime.strftime(timestamp_format)
        df_symbol_1 = df_symbol_1.loc[df_symbol_1.index <= end_label]
        df_symbol_2 = df_symbol_2.loc[df_symbol_2.index <= end_label]

    df_joined = df_symbol_1.join(df_symbol_2, lsuffix="_symbol-1", rsuffix="_symbol-2").dropna()
    if df_joined.empty:
        # Fail with a readable message instead of an error from inside the regression.
        raise ValueError("No overlapping data for the selected symbols and datetime range!")

    # Additional feature engineering
    # pr is the price ratio: symbol 1's closing price / symbol 2's closing price
    df_joined["pr"] = df_joined["c_symbol-1"] / df_joined["c_symbol-2"]
    # npr is the normalized price ratio: (pr - mean of pr) / standard deviation of pr
    df_joined["npr"] = (df_joined["pr"] - df_joined["pr"].mean()) / df_joined["pr"].std()
    # cd is the difference between the closing prices of symbol 1 and symbol 2
    df_joined["cd"] = df_joined["c_symbol-1"] - df_joined["c_symbol-2"]
    # ncd is the normalized closing price difference: (cd - mean of cd) / standard deviation of cd
    df_joined["ncd"] = (df_joined["cd"] - df_joined["cd"].mean()) / df_joined["cd"].std()

    pair_label = symbol_1_name + " & " + symbol_2_name
    title_suffixes = (
        " closing price line",
        " closing price scatter",
        " volume line",
        " volume scatter",
        " returns line",
        " returns scatter",
        " price ratio line",
        " normalized price ratio line",
        " closing price difference line",
        " normalized closing price distance line",
    )
    fig = make_subplots(
        rows=5,
        cols=2,
        shared_yaxes=False,
        horizontal_spacing=0.03,
        subplot_titles=tuple(pair_label + suffix for suffix in title_suffixes)
    )

    # Rows 1-3: raw attributes present for both symbols.
    attribute_rows = (("c", "closing price"), ("v", "volume"), ("r", "returns"))
    for row, (column, label) in enumerate(attribute_rows, start=1):
        _pairs_add_attribute_row(fig, row, df_joined, column, label, symbol_1_name, symbol_2_name)

    # Rows 4-5: derived pair series and their normalized versions.
    derived_rows = (("pr", "npr", "price ratio"), ("cd", "ncd", "closing price difference"))
    for row, (raw_column, normalized_column, label) in enumerate(derived_rows, start=4):
        _pairs_add_derived_row(fig, row, df_joined, raw_column, normalized_column, label, pair_label)

    # Update legend position given the size of it
    fig.update_layout(
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.1,
            xanchor="right",
            x=1,
        ),
        height=1600
    )
    # Minor config
    fig.update(layout_xaxis_rangeslider_visible=False)
    fig.show()


def _pairs_add_attribute_row(fig, row, df_joined, column, label, symbol_1_name, symbol_2_name):
    """Fill one grid row: both symbols' series on the left, scatter plus OLS trendline on the right."""
    column_1 = column + "_symbol-1"
    column_2 = column + "_symbol-2"
    pair_label = symbol_1_name + " & " + symbol_2_name
    # Perform the regression manually because this way of plotting in plotly does
    # not use the api that generates a trendline automatically.
    best_fit = sm.OLS(df_joined[column_2], sm.add_constant(df_joined[column_1])).fit().fittedvalues

    fig.add_trace(
        go.Scatter(
            x=df_joined.index,
            y=df_joined[column_1],
            name=symbol_1_name + " " + label,
            line={"color": "#636EFA"}
        ),
        row=row,
        col=1
    )
    fig.add_trace(
        go.Scatter(
            x=df_joined.index,
            y=df_joined[column_2],
            name=symbol_2_name + " " + label,
            line={"color": "#EF553B"}
        ),
        row=row,
        col=1
    )
    fig.add_trace(
        go.Scatter(
            x=df_joined[column_1],
            y=df_joined[column_2],
            mode='markers',
            name=pair_label + " " + label + " scatter",
            line={'color': '#00CC96'}
        ),
        row=row,
        col=2
    )
    fig.add_trace(
        go.Scatter(
            x=df_joined[column_1],
            y=best_fit,
            mode='lines',
            name=pair_label + " " + label + " trendline",
            line={'dash': 'dash', 'color': '#990099'}
        ),
        row=row,
        col=2
    )


def _pairs_add_derived_row(fig, row, df_joined, raw_column, normalized_column, label, pair_label):
    """Fill one grid row: the raw pair series on the left, its normalized form with a zero line on the right."""
    normalized_name = pair_label + " normalized " + label
    fig.add_trace(
        go.Scatter(
            x=df_joined.index,
            y=df_joined[raw_column],
            name=pair_label + " " + label,
            line={"color": "#00CC96"}
        ),
        row=row,
        col=1
    )
    fig.add_trace(
        go.Scatter(
            x=df_joined.index,
            y=df_joined[normalized_column],
            name=normalized_name,
            line={"color": "#00CC96"}
        ),
        row=row,
        col=2
    )
    # Dashed reference at zero, i.e. at the mean of the normalized series.
    fig.add_trace(
        go.Scatter(
            x=df_joined.index,
            y=[0] * len(df_joined.index),
            mode='lines',
            name=normalized_name + " zero line",
            line={'dash': 'dash', 'color': '#990099'}
        ),
        row=row,
        col=2
    )

# Get universe 

In [9]:
# Discover the available symbols, then load each one for the 2021-01-01 to
# 2021-11-15 window at one-minute resolution. "1min" is the same frequency as
# the deprecated pandas alias "1T".
instrument_list = get_all_instruments(DATA_DIR)
universe = extract_universe(instrument_list, "2021-01-01", "2021-11-15", "1min")

# Create widgets

visualize_single_candlestick widgets

In [70]:
# Controls for visualize_single_candlestick: one instrument dropdown plus a
# start and an end datetime picker.
universe_selection_candlestick = widgets.Dropdown(
    options=universe.keys(),
    description="Select instrument to view"
)
# pytz zones must be attached with localize(); passing them as tzinfo= to the
# datetime constructor silently uses the zone's LMT offset instead of +08:00.
# NOTE(review): both pickers default to 2019-01-01 while the universe is loaded
# for 2021-01-01 to 2021-11-15 - confirm these defaults are intended.
start_datetime_picker_candlestick = ipydatetime.DatetimePicker(
    value=pytz.timezone('Asia/Singapore').localize(
        datetime.datetime(2019, 1, 1, 0, 0, 0)
    ),
    description='Select start datetime',
    disabled=False
)
end_datetime_picker_candlestick = ipydatetime.DatetimePicker(
    value=pytz.timezone('Asia/Singapore').localize(
        datetime.datetime(2019, 1, 1, 0, 0, 0)
    ),
    description='Select end datetime',
    disabled=False
)
layout_candlestick = widgets.AppLayout(header=universe_selection_candlestick,
                                       left_sidebar=start_datetime_picker_candlestick,
                                       right_sidebar=end_datetime_picker_candlestick)

In [71]:
# Connect the candlestick controls to the plotting function. interact_manual
# adds a button (captioned through manual_name) instead of redrawing on every
# widget change.
ohlcv_interact = interact_manual.options(manual_name="Visualise")
ohlcv_interact(
    visualize_single_candlestick,
    **{
        "symbol_name": universe_selection_candlestick,
        "start_datetime": start_datetime_picker_candlestick,
        "end_datetime": end_datetime_picker_candlestick,
    }
)

interactive(children=(Dropdown(description='Select instrument to view', layout=Layout(grid_area='header'), opt…

<function __main__.visualize_single_candlestick(symbol_name, start_datetime, end_datetime)>

visualize_compare_two_symbols widgets

In [72]:
# Controls for visualize_compare_two_symbols: two instrument dropdowns, a
# datetime window, the attribute to compare and the visualisation mode.
universe_selection_compare_1 = widgets.Dropdown(
    options=universe.keys(),
    description="Select first instrument to compare"
)
universe_selection_compare_2 = widgets.Dropdown(
    options=universe.keys(),
    description="Select second instrument to compare"
)
# pytz zones must be attached with localize(); passing them as tzinfo= to the
# datetime constructor silently uses the zone's LMT offset instead of +08:00.
# NOTE(review): both pickers default to 2019-01-01 while the universe is loaded
# for 2021-01-01 to 2021-11-15 - confirm these defaults are intended.
start_datetime_picker_compare = ipydatetime.DatetimePicker(
    value=pytz.timezone('Asia/Singapore').localize(
        datetime.datetime(2019, 1, 1, 0, 0, 0)
    ),
    description='Select start datetime',
    disabled=False
)
end_datetime_picker_compare = ipydatetime.DatetimePicker(
    value=pytz.timezone('Asia/Singapore').localize(
        datetime.datetime(2019, 1, 1, 0, 0, 0)
    ),
    description='Select end datetime',
    disabled=False
)
# The selectable attributes are the columns of the first loaded instrument.
attribute_selection_compare = widgets.Dropdown(
    options=universe[list(universe.keys())[0]].columns.tolist(),
    description="Select attribute to compare"
)
mode_selection_compare = widgets.Dropdown(
    options=("individual", "combined"),
    description="Select visualisation mode"
)
layout_compare = widgets.GridspecLayout(3, 2)
layout_compare[0, 0] = universe_selection_compare_1
layout_compare[0, 1] = universe_selection_compare_2
layout_compare[1, 0] = start_datetime_picker_compare
layout_compare[1, 1] = end_datetime_picker_compare
layout_compare[2, 0] = attribute_selection_compare
layout_compare[2, 1] = mode_selection_compare

In [73]:
# Connect the comparison controls to the plotting function. interact_manual
# adds a button (captioned through manual_name) instead of redrawing on every
# widget change.
compare_interact = interact_manual.options(manual_name="Visualise")
compare_interact(
    visualize_compare_two_symbols,
    **{
        "symbol_1_name": universe_selection_compare_1,
        "symbol_2_name": universe_selection_compare_2,
        "start_datetime": start_datetime_picker_compare,
        "end_datetime": end_datetime_picker_compare,
        "attribute_to_viz": attribute_selection_compare,
        "mode": mode_selection_compare,
    }
)

interactive(children=(Dropdown(description='Select first instrument to compare', layout=Layout(grid_area='widg…

<function __main__.visualize_compare_two_symbols(symbol_1_name, symbol_2_name, start_datetime, end_datetime, attribute_to_viz, mode)>

visualize_pairs widgets

In [74]:
# Controls for visualize_pairs: two instrument dropdowns and a datetime window.
universe_selection_pair_1 = widgets.Dropdown(
    options=universe.keys(),
    description="Select first instrument to compare"
)
universe_selection_pair_2 = widgets.Dropdown(
    options=universe.keys(),
    description="Select second instrument to compare"
)
# pytz zones must be attached with localize(); passing them as tzinfo= to the
# datetime constructor silently uses the zone's LMT offset instead of +08:00.
# NOTE(review): both pickers default to 2019-01-01 while the universe is loaded
# for 2021-01-01 to 2021-11-15 - confirm these defaults are intended.
start_datetime_picker_pair = ipydatetime.DatetimePicker(
    value=pytz.timezone('Asia/Singapore').localize(
        datetime.datetime(2019, 1, 1, 0, 0, 0)
    ),
    description='Select start datetime',
    disabled=False
)
end_datetime_picker_pair = ipydatetime.DatetimePicker(
    value=pytz.timezone('Asia/Singapore').localize(
        datetime.datetime(2019, 1, 1, 0, 0, 0)
    ),
    description='Select end datetime',
    disabled=False
)
layout_pair = widgets.GridspecLayout(2, 2)
layout_pair[0, 0] = universe_selection_pair_1
layout_pair[0, 1] = universe_selection_pair_2
layout_pair[1, 0] = start_datetime_picker_pair
layout_pair[1, 1] = end_datetime_picker_pair

In [75]:
# Connect the pair controls to the plotting function. The call goes through
# pair_interact, which this cell defines, so the cell does not depend on the
# compare_interact object created by an earlier cell.
pair_interact = interact_manual.options(manual_name="Visualise")
pair_interact(
    visualize_pairs,
    symbol_1_name=universe_selection_pair_1,
    symbol_2_name=universe_selection_pair_2,
    start_datetime=start_datetime_picker_pair,
    end_datetime=end_datetime_picker_pair,
)

interactive(children=(Dropdown(description='Select first instrument to compare', layout=Layout(grid_area='widg…

<function __main__.visualize_pairs(symbol_1_name, symbol_2_name, start_datetime, end_datetime)>