## Imports

In [1]:
import logging

import pandas as pd

import helpers.hdbg as hdbg
import helpers.hprint as hprint
import helpers.hsql as hsql
import im_v2.ccxt.data.client.ccxt_clients as imvcdccccl
import im_v2.im_lib_tasks as imvimlita
import im_v2.talos.data.client.talos_clients as imvtdctacl

[0m[36mINFO[0m: > cmd='/venv/lib/python3.8/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-40360184-c027-40d6-92a3-60b2c55dbbac.json'


  from tqdm.autonotebook import tqdm


In [2]:
# Initialize logging at INFO verbosity through the project's `hdbg` helper.
hdbg.init_logger(verbosity=logging.INFO)

# Logger named after the current module.
_LOG = logging.getLogger(__name__)

# Run the project's notebook configuration helper.
hprint.config_notebook()

  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/venv/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/venv/lib/python3.8/site-packages/traitlets/config/application.py", line 846, in launch_instance
    app.start()
  File "/venv/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 677, in start
    self.io_loop.start()
  File "/venv/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
    self._run_once()
  File "/usr/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
    handle._run()
  File "/usr/lib/python3.8/asyncio/events.py", line 81, in _run
    self._context.run(self._callback, *self._args)
  File "/ve



## Functions

In [3]:
def convert_to_the_format_for_analysis(
    df,
    suffix,
    start_timestamp="2022-03-17 00:00:00+00:00",
    expected_step="0 days 00:01:00",
):
    """
    Extract the timestamps at which the data resumes after a gap.

    The function does the following:

    - Drop the rows that contain NaNs.
    - Add a column `diff_in_timestamps` which is the time difference from the
      timestamp in the previous remaining row.
    - Drop the columns that are not necessary for the analysis.
    - Filter the data, so all data starts from the same time.
    - Keep the rows where the step from the previous timestamp differs from
      `expected_step`.
    - Add a suffix to the column name to distinguish between vendors.

    :param df: data whose index is named `timestamp`; the input is not
        modified
    :param suffix: string appended to the output column name, e.g. `"_ccxt"`
    :param start_timestamp: only rows with a timestamp strictly greater than
        this value are kept
    :param expected_step: regular distance between consecutive timestamps;
        rows with exactly this step are dropped
    :return: data indexed by `timestamp` with the single column
        `diff_in_timestamps<suffix>`
    """
    # Move the index into a column and discard the rows with missing values.
    data = df.reset_index().dropna()
    # `assign` returns a new frame, so no column is set on the result of
    # `dropna()`, which can trigger `SettingWithCopyWarning`.
    data = data.assign(diff_in_timestamps=data["timestamp"].diff())
    gaps = data.set_index("timestamp")[["diff_in_timestamps"]]
    # The step is computed before this filter, so the first kept row still
    # carries the distance to the last row before `start_timestamp`.
    gaps = gaps[gaps.index > start_timestamp]
    # `!=` also keeps a row whose step is NaT (the very first row of the data).
    gaps = gaps[gaps["diff_in_timestamps"] != expected_step]
    return gaps.add_suffix(f"{suffix}")

# Load the data

In [4]:
# Specify the connection.
# Get the path of the env file for "dev" and read the connection info from it.
env_file = imvimlita.get_db_env_path("dev")
connection_params = hsql.get_connection_info_from_env_file(env_file)
# The connection info is unpacked into positional arguments.
# NOTE(review): the connection is not closed in any of the visible cells.
connection = hsql.get_connection(*connection_params)
# Specify param for both clients.
# This flag is passed to both the CCXT and the Talos client constructors.
resample_1min = True

In [5]:
# Arguments shared by the `read_data` calls of both clients.
full_symbol = ["binance::ADA_USDT"]
# Both time bounds are passed as `None`.
start_date = None
end_date = None

## Load CCXT data

In [6]:
# Initialize the client.
# NOTE(review): `vendor` and `universe_version` are not passed to the client
# constructor in this cell; `universe_version` is reassigned in the Talos cell.
vendor = "CCXT"
universe_version = "v3"
table_name = "ccxt_ohlcv"
ccxt_client = imvcdccccl.CcxtSqlRealTimeImClient(
    resample_1min, connection, table_name
)

In [7]:
# Read the CCXT data, then show its shape and its first rows.
ada_ccxt = ccxt_client.read_data(full_symbol, start_date, end_date)
display(ada_ccxt.shape, ada_ccxt.head(3))



(215045, 6)

Unnamed: 0_level_0,full_symbol,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-11-10 10:11:00+00:00,binance::ADA_USDT,2.227,2.228,2.225,2.225,71884.5
2021-11-10 10:12:00+00:00,binance::ADA_USDT,2.226,2.228,2.225,2.227,64687.0
2021-11-10 10:13:00+00:00,binance::ADA_USDT,2.228,2.232,2.227,2.23,59076.3


## Load Realtime Talos data

In [8]:
# Initialize the client.
# NOTE: `universe_version` and `table_name` overwrite the values assigned in
# the CCXT cell.
universe_version = "v1"
table_name = "talos_ohlcv"
mode = "market_data"
talos_client = imvtdctacl.TalosSqlRealTimeImClient(
    universe_version, resample_1min, connection, table_name, mode
)



In [9]:
# Read the Talos data, then show its shape and its first rows.
ada_talos = talos_client.read_data(full_symbol, start_date, end_date)
display(ada_talos.shape, ada_talos.head(3))

  data = self._apply_talos_normalization(


(32847, 8)

Unnamed: 0_level_0,open,high,low,close,volume,full_symbol,start_timestamp,asset_id
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-03-16 22:47:00+00:00,0.825,0.826,0.825,0.825,18427.9,binance::ADA_USDT,2022-03-16 22:46:00+00:00,3303714233
2022-03-16 22:48:00+00:00,0.825,0.826,0.825,0.825,52798.5,binance::ADA_USDT,2022-03-16 22:47:00+00:00,3303714233
2022-03-16 22:49:00+00:00,0.825,0.826,0.824,0.824,101067.8,binance::ADA_USDT,2022-03-16 22:48:00+00:00,3303714233


# Research of NaNs in timestamps

In [10]:
# Per-vendor tables of the timestamps where the data resumes after a gap.
diff_ccxt = convert_to_the_format_for_analysis(ada_ccxt, "_ccxt")
diff_talos = convert_to_the_format_for_analysis(ada_talos, "_talos")
# Align both vendors on the timestamp in a single DataFrame and add a column
# with the difference between the NaN sequences of the vendors (Talos - CCXT).
df = pd.concat([diff_ccxt, diff_talos], axis="columns").assign(
    diff=lambda gaps: gaps["diff_in_timestamps_talos"]
    - gaps["diff_in_timestamps_ccxt"]
)
df.head(3)

Unnamed: 0_level_0,diff_in_timestamps_ccxt,diff_in_timestamps_talos,diff
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-03-17 13:32:00+00:00,0 days 11:59:00,0 days 12:01:00,0 days 00:02:00
2022-03-18 01:56:00+00:00,0 days 11:58:00,0 days 12:01:00,0 days 00:03:00
2022-03-18 15:07:00+00:00,0 days 12:00:00,0 days 12:01:00,0 days 00:01:00


The description of the columns in the created DataFrame:
- `timestamp` - The timestamp of the first data point that appears after a NaN sequence.
- `diff_in_timestamps_ccxt` - The length of the NaN sequence in CCXT data, measured as the time elapsed since the previous non-NaN row.
- `diff_in_timestamps_talos` - Same as above but for Talos.
- `diff` - The difference between the NaN sequences of the two vendors (Talos minus CCXT).

In [11]:
# Rows where both vendors have a NaN sequence ending at the same timestamp.
both_have_gap = (
    df["diff_in_timestamps_ccxt"].notna() & df["diff_in_timestamps_talos"].notna()
)
df.loc[both_have_gap]

Unnamed: 0_level_0,diff_in_timestamps_ccxt,diff_in_timestamps_talos,diff
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-03-17 13:32:00+00:00,0 days 11:59:00,0 days 12:01:00,0 days 00:02:00
2022-03-18 01:56:00+00:00,0 days 11:58:00,0 days 12:01:00,0 days 00:03:00
2022-03-18 15:07:00+00:00,0 days 12:00:00,0 days 12:01:00,0 days 00:01:00
2022-03-22 22:33:00+00:00,4 days 04:01:00,4 days 04:04:00,0 days 00:03:00
2022-03-24 08:59:00+00:00,0 days 18:31:00,0 days 18:33:00,0 days 00:02:00
2022-03-25 16:11:00+00:00,0 days 11:59:00,0 days 12:01:00,0 days 00:02:00
2022-03-28 11:53:00+00:00,2 days 16:33:00,2 days 16:35:00,0 days 00:02:00
2022-03-30 13:24:00+00:00,0 days 11:59:00,0 days 12:02:00,0 days 00:03:00
2022-03-30 16:37:00+00:00,0 days 00:38:00,0 days 00:41:00,0 days 00:03:00
2022-04-01 14:17:00+00:00,0 days 08:04:00,0 days 08:07:00,0 days 00:03:00


An important observation is that most NaN sequences end at the same time for both vendors, which indicates that this data is missing on the data provider's side.

In [12]:
# The data is present in CCXT, but missing in Talos.
only_talos_gap = df["diff_in_timestamps_ccxt"].isna()
df.loc[only_talos_gap]

Unnamed: 0_level_0,diff_in_timestamps_ccxt,diff_in_timestamps_talos,diff
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-03-25 16:17:00+00:00,NaT,0 days 00:02:00,NaT
2022-03-30 13:31:00+00:00,NaT,0 days 00:03:00,NaT
2022-03-30 18:37:00+00:00,NaT,0 days 00:43:00,NaT
2022-03-31 21:03:00+00:00,NaT,0 days 04:51:00,NaT
2022-03-31 21:14:00+00:00,NaT,0 days 00:03:00,NaT
2022-03-31 21:23:00+00:00,NaT,0 days 00:05:00,NaT
2022-04-04 12:35:00+00:00,NaT,0 days 07:21:00,NaT


In [13]:
# The data is present in Talos, but missing in CCXT.
only_ccxt_gap = df["diff_in_timestamps_talos"].isna()
df.loc[only_ccxt_gap]

Unnamed: 0_level_0,diff_in_timestamps_ccxt,diff_in_timestamps_talos,diff
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-03-30 18:36:00+00:00,0 days 00:40:00,NaT,NaT
2022-03-31 21:07:00+00:00,0 days 04:52:00,NaT,NaT


In [14]:
# Masks marking the rows where each vendor has a NaN sequence.
has_gap_ccxt = df["diff_in_timestamps_ccxt"].notna()
has_gap_talos = df["diff_in_timestamps_talos"].notna()

# Number of NaN sequences shared by both vendors and unique to each of them.
num_both_seq = int((has_gap_ccxt & has_gap_talos).sum())
num_unique_seq_ccxt = int((~has_gap_talos).sum())
num_unique_seq_talos = int((~has_gap_ccxt).sum())

# Total time per vendor and the difference between the totals.
total_time_ccxt = df["diff_in_timestamps_ccxt"].sum()
total_time_talos = df["diff_in_timestamps_talos"].sum()
diff_in_total_time = total_time_talos - total_time_ccxt
mean_time_diff = df["diff"].mean()

# Report the statistics, one per line.
summary_lines = [
    f"Number of NaN sequences that are the same in both vendors: {num_both_seq}",
    f"Number of NaN sequences that are presented in CCXT, but not in Talos: {num_unique_seq_ccxt}",
    f"Number of NaN sequences that are presented in Talos, but not in CCXT: {num_unique_seq_talos}",
    f"Total time of NaN sequences in Talos - {total_time_talos}",
    f"Total time of NaN sequences in CCXT - {total_time_ccxt}",
    f"Talos NaN sequences are greater than CCXT by the amount of {diff_in_total_time}",
    f"Mean difference of NaN sequence between two vendors (Talos has greater sequences) - {mean_time_diff}",
]
print(*summary_lines, sep="\n")

Number of NaN sequences that are the same in both vendors: 12
Number of NaN sequences that are presented in CCXT, but not in Talos: 2
Number of NaN sequences that are presented in Talos, but not in CCXT: 7
Total time of NaN sequences in Talos - 11 days 13:36:00
Total time of NaN sequences in CCXT - 11 days 05:32:00
Talos NaN sequences are greater than CCXT by the amount of 0 days 08:04:00
Mean difference of NaN sequence between two vendors (Talos has greater sequences) - 0 days 00:02:20
