## Imports

In [None]:
import warnings
# Silence every `FutureWarning` raised from this point on.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import logging

import pandas as pd

import helpers.hdatetime as hdateti
import helpers.hdbg as hdbg
import helpers.hprint as hprint
import im_v2.ccxt.data.client.ccxt_clients as imvcdccccl
import im_v2.talos.data.client.talos_clients as imvtdctacl
import im_v2.talos.data.extract.exchange_class as imvtdeexcl

import im_v2.im_lib_tasks as imvimlita
import helpers.hsql as hsql

In [None]:
# Set up logging with INFO verbosity.
hdbg.init_logger(verbosity=logging.INFO)

# Logger for the messages emitted by this notebook.
_LOG = logging.getLogger(__name__)

# Apply the notebook configuration provided by `hprint`
# (presumably display / printing options -- confirm in the helper).
hprint.config_notebook()

## Functions

In [None]:
def convert_to_the_format_for_analysis(
    df, suffix, start_timestamp="2022-03-17 00:00:00+00:00"
):
    """
    Reduce vendor data to the rows whose step from the previous row is not
    exactly 1 minute.

    The function does the following:
    - Drop the rows that contain a NaN in any column.
    - Add a column `diff_in_timestamps`, which is the time difference from
      the timestamp in the previous remaining row.
    - Drop the columns that are not necessary for the analysis, i.e. keep
      only `diff_in_timestamps` and `volume`.
    - Filter the data, so all data starts strictly after `start_timestamp`.
    - Choose the rows where the step from the previous timestamp is not
      exactly 1 minute. The first row has no previous timestamp (its step is
      NaT), so it is kept as well if it passes the time filter.
    - Add `suffix` to the column names to distinguish between vendors.

    The input is not modified.

    :param df: data that exposes `timestamp` (as the index name or as a
        column once the index is reset) and a `volume` column
    :param suffix: string appended to every output column name, e.g. `_ccxt`
    :param start_timestamp: exclusive lower bound for the timestamps to keep;
        the default preserves the previously hard-coded value
    :return: data indexed by `timestamp` with the columns
        `diff_in_timestamps<suffix>` and `volume<suffix>`
    """
    df = df.reset_index().dropna()
    # The first row gets NaT since there is nothing to subtract from it.
    df["diff_in_timestamps"] = df["timestamp"].diff()
    df = df.set_index("timestamp")[["diff_in_timestamps", "volume"]]
    df = df[df.index > start_timestamp]
    # Compare against an explicit `Timedelta` instead of a string literal.
    # `!=` is kept on purpose: NaT never equals 1 minute, so it is retained.
    one_minute = pd.Timedelta(minutes=1)
    df = df[df["diff_in_timestamps"] != one_minute]
    return df.add_suffix(f"{suffix}")

# Load the data

In [None]:
# Specify the connection: resolve the env file for the "dev" stage, read the
# connection info from it and open the DB connection.
env_file = imvimlita.get_db_env_path("dev")
connection_params = hsql.get_connection_info_from_env_file(env_file)
connection = hsql.get_connection(*connection_params)
# Specify the param passed to both clients (presumably it turns on
# resampling to a 1-minute grid -- confirm in the client code).
resample_1min = True

In [None]:
# Arguments shared by the `read_data` calls of both clients.
full_symbol = ["binance::ADA_USDT"]
# Both ends of the time interval are left unset.
start_date = None
end_date = None

## Load CCXT data

In [None]:
# Initialize the CCXT client with the vendor name, the resampling flag and
# the DB connection.
vendor = "CCXT"
ccxt_client = imvcdccccl.CcxtCddDbClient(vendor, resample_1min, connection)

In [None]:
# Load the CCXT data for the requested symbol and time interval.
ada_ccxt = ccxt_client.read_data(full_symbol, start_date, end_date)
# Show the size of the loaded data and its first rows.
display(ada_ccxt.shape)
display(ada_ccxt.head(3))

## Load Realtime Talos data

In [None]:
# Initialize the real-time Talos client with the resampling flag, the DB
# connection, the table to read from and the client mode.
table_name = "talos_ohlcv"
mode = "market_data"
talos_client = imvtdctacl.RealTimeSqlTalosClient(resample_1min, connection, table_name, mode)

In [None]:
# Load the Talos data for the requested symbol and time interval.
ada_talos = talos_client.read_data(full_symbol, start_date, end_date)
# Show the size of the loaded data and its first rows.
display(ada_talos.shape)
display(ada_talos.head(3))

# Analysis of NaNs in timestamps

In [None]:
# Convert the data of each vendor to the analysis format, tagging the
# resulting columns with the vendor name.
diff_ccxt = convert_to_the_format_for_analysis(df=ada_ccxt, suffix="_ccxt")
diff_talos = convert_to_the_format_for_analysis(df=ada_talos, suffix="_talos")

In [None]:
# Put both vendors side by side on the union of their timestamps; a
# timestamp that is absent for one vendor gets missing values in its columns.
df = pd.concat((diff_ccxt, diff_talos), axis="columns")

In [None]:
# Display the combined data of both vendors.
df

In [None]:
# Rows where `diff_in_timestamps` is missing for at least one vendor.
df[df[["diff_in_timestamps_ccxt", "diff_in_timestamps_talos"]].isna().any(axis=1)]

In [None]:
# Rows where `diff_in_timestamps` is missing on the CCXT side.
df.loc[df["diff_in_timestamps_ccxt"].isna()]

In [None]:
# Rows where `diff_in_timestamps` is missing on the Talos side.
df.loc[df["diff_in_timestamps_talos"].isna()]

In [None]:
# Rows where `diff_in_timestamps` is present for both vendors.
df.dropna(subset=["diff_in_timestamps_ccxt", "diff_in_timestamps_talos"])

In [None]:
# Keep the rows where `diff_in_timestamps` is present for both vendors.
gg = df.dropna(subset=["diff_in_timestamps_ccxt", "diff_in_timestamps_talos"])
# Check row by row whether both vendors report the same volume.
gg["volume_ccxt"].eq(gg["volume_talos"])