# Imports

In [1]:
import logging

import numpy as np
import pandas as pd

import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hpandas as hpandas
import helpers.hprint as hprint
import helpers.hs3 as hs3
import im.common.data.types as imcodatyp
import im.kibot.data.load.kibot_s3_data_loader as imkdlksdlo
import im.kibot.metadata.load.s3_backend as imkmls3ba

In [2]:
hdbg.init_logger(verbosity=logging.INFO)

_LOG = logging.getLogger(__name__)

_LOG.info("%s", henv.get_system_signature()[0])

hprint.config_notebook()

[0m[36mINFO[0m: > cmd='/venv/lib/python3.8/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-e61bc576-464f-47bd-acd7-fbbee63963d9.json'
>>ENV<<: is_inside_container=True: code_version=1.0.7, container_version=1.0.7, is_inside_docker=True, is_inside_ci=False, CI_defined=True, CI=''
>>ENV<<: AM_AWS_PROFILE=True AM_ECR_BASE_PATH=True AM_S3_BUCKET=True AM_TELEGRAM_TOKEN=True AWS_ACCESS_KEY_ID=False AWS_DEFAULT_REGION=False AWS_SECRET_ACCESS_KEY=False GH_ACTION_ACCESS_TOKEN=True
INFO  # Git
    branch_name='CMTask1219_Explore_Kibot_data_in_S3'
    hash='c4de28364'
    # Last commits:
      *   c4de28364 max-rsrch Merge branch 'master' into CMTask1219_Explore_Kibot_data_in_S3    (30 minutes ago) Mon Feb 21 14:58:23 2022  (HEAD -> CMTask1219_Explore_Kibot_data_in_S3)
      |\  
      | * ecfade9cb Grigorii Pomazkin CmTask851: omit certain files in coverage report  (#1261)         (51 minutes ago) Mon Feb 21 14:37:36 2022  (origin/master, origin/HEAD, master)


In [3]:
# Disabling INFO messages from data downloads.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

# Functions

In [4]:
# TODO: Merge this function into `compute_start_end_table` in `research_amp/cc/statistics.py`
def calculate_datetime_statistics_for_kibot_data(
    list_of_symbols: list, contract_type: str, futures_frequency: str
) -> pd.DataFrame:
    """
    Load the data for each asset through the loop and proccess it to obtain
    datetime statistics:

    - start date
    - end date
    - data points count

    :param list_of_symbols: tickers for asset in desired universe
    :param contract_type: either 'Futures' or 'Stocks'
    :param futures_frequency: only for Futures; "daily" or "minutely"
    :return: datetime statistics for every asset in the given universe
    """
    # Create dictionaries that will store the datetime statistics.
    start_date_dict = {}
    end_date_dict = {}
    data_count_dict = {}
    # Create a loop that loads data for a single asset and proccess it to extract datetime statistics.
    for ticker in list_of_symbols:
        # The code below loads the data.
        if contract_type == "Futures":
            asset_df = kibot_loader.read_data(
                exchange="Any Exchange",
                symbol=ticker,
                asset_class=imcodatyp.AssetClass.Futures,
                contract_type=imcodatyp.ContractType.Continuous,
                frequency=futures_frequency,
            )
        elif contract_type == "Stocks":
            asset_df = kibot_loader.read_data(
                exchange="Any Exchange",
                symbol=ticker,
                asset_class=imcodatyp.AssetClass.Stocks,
                frequency=imcodatyp.Frequency.Minutely,
                unadjusted=False,
            )
        # Here is a condition that cuts out empty dataframes.
        # See section 'Example of an empty stock data' for reference.
        if asset_df.shape[0] in [1, 2]:
            # The logic here and below: mapping the value of start date to the
            # specific company ticker.
            start_ind[ticker] = np.nan
            # The logic here and below: add a particular ticket related data
            # to the dictionaries with data for all tickers.
            start_date_dict = start_date_dict | start_ind.items()
            end_ind[ticker] = np.nan
            end_date_dict = end_date_dict | end_ind.items()
            data_count_ind[ticker] = np.nan
            data_count_dict = data_count_dict | data_count_ind.items()
        # The non-empty dataframes are proccessed to extract datetime statistics.
        else:
            # Reseting index to unleash the column with datetime data.
            asset_df.reset_index(inplace=True)
            # Collecting datetime statistics.
            max_date = asset_df["datetime"].max()
            min_date = asset_df["datetime"].min()
            data_points = asset_df["datetime"].count()
            # Writing these values into the dictionaries.
            start_ind = {ticker: min_date}
            start_date_dict = start_date_dict | start_ind.items()
            end_ind = {ticker: max_date}
            end_date_dict = end_date_dict | end_ind.items()
            data_count_ind = {ticker: data_points}
            data_count_dict = data_count_dict | data_count_ind.items()
        # Once all the dictionaries are filled with data - turn them to dataframes.
        # The logic here and below: transform dictionary into `pd.DataFrame`.
        # It has two columns: tickers and statistics value. The code below sets tickers
        # as an index during this transformation.
        final_start_date = pd.DataFrame(
            start_date_dict, columns=["", "start_date"]
        ).set_index("")
        final_end_date = pd.DataFrame(
            end_date_dict, columns=["", "end_date"]
        ).set_index("")
        final_data_count = pd.DataFrame(
            data_count_dict, columns=["", "data_points_count"]
        ).set_index("")
        # Combine all statistics into a single table.
        result = pd.concat(
            [final_start_date, final_end_date, final_data_count], axis=1
        )
    return result.sort_index(ascending=True)


def calculate_general_datetime_stats(df: pd.DataFrame) -> pd.DataFrame:
    """
    Take the table with datetime stats for individual assets and compute
    generalized stats for all universe:

    - median start date
    - median end date
    - min start date
    - max start date
    - median data points

    :param df: Table with datetime statistics for every asset in the given universe
    :return: Table with general datetime statistics for all assets in given universe
    """
    median_start_date = df["start_date"].median()
    median_end_date = df["end_date"].median()
    min_start_date = df["start_date"].min()
    max_end_date = df["end_date"].max()
    median_data_points = df["data_points_count"].median()
    result = pd.DataFrame(
        [
            median_start_date,
            median_end_date,
            min_start_date,
            max_end_date,
            median_data_points,
        ],
        index=[
            "median_start_date",
            "median_end_date",
            "min_start_date",
            "max_end_date",
            "median_data_points",
        ],
        columns=["value"],
    )
    return result

# Explore the universe

In [5]:
s3_backend = imkmls3ba.S3Backend()

## Futures

In [6]:
one_min_contract_metadata = s3_backend.read_1min_contract_metadata()
print("Number of contracts:", one_min_contract_metadata.shape[0])
display(one_min_contract_metadata.head(3))

Number of contracts: 14962


Unnamed: 0_level_0,Symbol,Link,Description
All_Futures_Contracts_1min.csv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,JY,http://api.kibot.com/?action=download&link=151...,CONTINUOUS JAPANESE YEN CONTRACT
2,JYF18,http://api.kibot.com/?action=download&link=vrv...,JAPANESE YEN JANUARY 2018
3,JYF19,http://api.kibot.com/?action=download&link=8r8...,JAPANESE YEN JANUARY 2019


In [7]:
daily_contract_metadata = s3_backend.read_daily_contract_metadata()
print("Number of contracts:", daily_contract_metadata.shape[0])
display(daily_contract_metadata.head(3))

Number of contracts: 14962


Unnamed: 0_level_0,Symbol,Link,Description
All_Futures_Contracts_daily.csv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,JY,http://api.kibot.com/?action=download&link=151...,CONTINUOUS JAPANESE YEN CONTRACT
2,JYF18,http://api.kibot.com/?action=download&link=vrv...,JAPANESE YEN JANUARY 2018
3,JYF19,http://api.kibot.com/?action=download&link=8r8...,JAPANESE YEN JANUARY 2019


In [8]:
tickbidask_contract_metadata = s3_backend.read_tickbidask_contract_metadata()
print("Number of contracts:", tickbidask_contract_metadata.shape[0])
display(tickbidask_contract_metadata.head(3))

Number of contracts: 5749


Unnamed: 0,SymbolBase,Symbol,StartDate,Size(MB),Description,Exchange
1,ES,ES,2009-09-30,50610.0,CONTINUOUS E-MINI S&P 500 CONTRACT,Chicago Mercantile Exchange Mini Sized Contrac...
2,ES,ESH11,2010-04-06,891.0,E-MINI S&P 500 MARCH 2011,Chicago Mercantile Exchange Mini Sized Contrac...
3,ES,ESH12,2011-03-06,1060.0,E-MINI S&P 500 MARCH 2012,Chicago Mercantile Exchange Mini Sized Contrac...


In [9]:
continuous_contract_metadata = s3_backend.read_continuous_contract_metadata()
print("Number of contracts:", continuous_contract_metadata.shape[0])
display(continuous_contract_metadata.head(3))

Number of contracts: 87


Unnamed: 0,SymbolBase,Symbol,StartDate,Size(MB),Description,Exchange
1,JY,JY,2009-09-27,183.0,CONTINUOUS JAPANESE YEN CONTRACT,Chicago Mercantile Exchange (CME GLOBEX)
2,TY,TY,2009-09-27,180.0,CONTINUOUS 10 YR US TREASURY NOTE CONTRACT,Chicago Board Of Trade (CBOT GLOBEX)
3,FV,FV,2009-09-27,171.0,CONTINUOUS 5 YR US TREASURY NOTE CONTRACT,Chicago Board Of Trade (CBOT GLOBEX)


In [10]:
kibot_exchange_mapping = s3_backend.read_kibot_exchange_mapping()
print("Number of contracts:", kibot_exchange_mapping.shape[0])
display(kibot_exchange_mapping.head(3))

Number of contracts: 72


Unnamed: 0_level_0,Exchange_group,Exchange_abbreviation,Exchange_symbol
Kibot_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AC,CME,CBOT,EH
AE,CME,CBOT,AW
BGI,CME,CME,LE


## Stocks

In [11]:
stocks_symbols = s3_backend.get_symbols_for_dataset("all_stocks_1min")
stocks_symbols[:5]

['A', 'AA', 'AA.B', 'AAAP', 'AABA']

In [12]:
len(stocks_symbols)

11687

# Example for data loading

In [13]:
kibot_loader = imkdlksdlo.KibotS3DataLoader()

## Futures

In [14]:
# Example for CME Ethanol Daily Continuous Futures.
# Data is presented in OHLCV type.
ethanol_futures = kibot_loader.read_data(
    exchange="Unknown",
    symbol="AC",
    asset_class=imcodatyp.AssetClass.Futures,
    frequency=imcodatyp.Frequency.Daily,
    contract_type=imcodatyp.ContractType.Continuous,
)
ethanol_futures.head()

Unnamed: 0_level_0,open,high,low,close,vol
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2005-05-05,1.18,1.18,1.18,1.18,0
2005-05-06,1.18,1.18,1.18,1.18,0
2005-05-09,1.18,1.18,1.18,1.18,0
2005-05-10,1.18,1.18,1.18,1.18,0
2005-05-11,1.18,1.18,1.18,1.18,0


In [15]:
# Example for Minutely Expiry Futures (JAPANESE YEN JANUARY 2018).
japan_yen = kibot_loader.read_data(
    exchange="Unknown",
    symbol="JYF18",
    asset_class=imcodatyp.AssetClass.Futures,
    frequency=imcodatyp.Frequency.Minutely,
    contract_type=imcodatyp.ContractType.Expiry,
)
japan_yen.head()

Unnamed: 0_level_0,open,high,low,close,vol
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-18 20:54:00,0.009027,0.009027,0.009027,0.009027,5
2017-09-19 07:32:00,0.009019,0.009019,0.009019,0.009019,1
2017-09-19 08:23:00,0.009026,0.009026,0.009026,0.009026,5
2017-09-19 08:39:00,0.009031,0.009031,0.009031,0.009031,5
2017-09-19 09:16:00,0.009038,0.00904,0.009038,0.00904,10


## Stocks

In [16]:
# Example for Apple stock.
aapl = kibot_loader.read_data(
    exchange="Q",
    symbol="AAPL",
    asset_class=imcodatyp.AssetClass.Stocks,
    frequency=imcodatyp.Frequency.Minutely,
    unadjusted=False,
)
aapl.head()

Unnamed: 0_level_0,open,high,low,close,vol
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-09-29 04:00:00,103.9,103.96,103.9,103.96,974
2015-09-29 04:07:00,104.24,104.24,104.24,104.24,108
2015-09-29 04:13:00,104.29,104.29,104.29,104.29,216
2015-09-29 04:14:00,104.34,104.36,104.34,104.34,2272
2015-09-29 04:18:00,104.37,104.37,104.37,104.37,962


In [17]:
# Interesting note: the necessary param 'exchange' can be any value.
aapl_any_exchange = kibot_loader.read_data(
    exchange="Any Exchange",
    symbol="AAPL",
    asset_class=imcodatyp.AssetClass.Stocks,
    frequency=imcodatyp.Frequency.Minutely,
    unadjusted=False,
)
aapl_any_exchange.head()

Unnamed: 0_level_0,open,high,low,close,vol
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-09-29 04:00:00,103.9,103.96,103.9,103.96,974
2015-09-29 04:07:00,104.24,104.24,104.24,104.24,108
2015-09-29 04:13:00,104.29,104.29,104.29,104.29,216
2015-09-29 04:14:00,104.34,104.36,104.34,104.34,2272
2015-09-29 04:18:00,104.37,104.37,104.37,104.37,962


### Example of an empty stock data

In [18]:
# Some files in stock universe contain no data and consider empty.
# There are different types of these empty files.
empty_stock_file_1 = kibot_loader.read_data(
    exchange="Any Exchange",
    symbol="AACC",
    asset_class=imcodatyp.AssetClass.Stocks,
    frequency=imcodatyp.Frequency.Minutely,
    unadjusted=False,
)
empty_stock_file_2 = kibot_loader.read_data(
    exchange="Any Exchange",
    symbol="ACND.U",
    asset_class=imcodatyp.AssetClass.Stocks,
    frequency=imcodatyp.Frequency.Minutely,
    unadjusted=False,
)
display(empty_stock_file_1)
display(empty_stock_file_2)

Unnamed: 0,0
0,405 Data Not Found.
1,No data found for the specified period for AACC.


Unnamed: 0,0
0,404 Symbol Not Found405 Data Not Found.
1,No data found for the specified period for ACN...


# Period of time availability

## Stocks

In [135]:
final_stats_stocks = calculate_datetime_statistics_for_kibot_data(
    stocks_symbols, "Stocks", "stock_datasets"
)
display(final_stats_stocks.shape)
display(final_stats_stocks)

(11687, 3)

Unnamed: 0,start_date,end_date,data_points_count
A,2015-09-29 09:30:00,2020-08-20 14:01:00,477400.0
AA,2015-09-29 07:50:00,2020-08-20 14:01:00,516662.0
AA.B,2015-09-29 09:30:00,2016-10-31 16:00:00,17078.0
AAAP,2015-11-11 10:27:00,2018-02-09 16:00:00,66561.0
AABA,2015-09-29 04:41:00,2019-10-02 16:06:00,417064.0
...,...,...,...
ZYME,2017-04-28 09:49:00,2020-08-19 15:29:00,123359.0
ZYNE,2015-09-29 09:09:00,2020-08-19 15:29:00,312789.0
ZYXI,2015-09-30 09:40:00,2020-08-19 15:28:00,98710.0
ZZ,NaT,NaT,


In [107]:
general_stats_all_stocks = calculate_general_datetime_stats(final_stats_stocks)
general_stats_all_stocks

Unnamed: 0,value
median_start_date,2015-09-29 09:30:00
median_end_date,2020-08-18 16:00:00
min_start_date,2015-08-10 12:46:00
max_end_date,2020-08-21 16:38:00
median_data_points,76472.0


In [110]:
# Dataframe with empty stock data files.
empty_dataframes = final_stats_stocks[
    final_stats_stocks["data_points_count"].isna()
]
# Number of empty stock data files.
len(final_stats_stocks)

2059

In [139]:
print(
    hprint.perc(len(empty_dataframes), len(final_stats)),
    "of files in stock universe are empty.",
)

17.62 % of files in stock universe are empty.


## Futures

### Continuous contracts 1min

In [20]:
futures_continuous_contracts_1min_symbols = s3_backend.get_symbols_for_dataset(
    "all_futures_continuous_contracts_1min"
)
len(futures_continuous_contracts_1min_symbols)

252

In [21]:
# Getting a sample of 5 contracts.
futures_continuous_contracts_1min_symbols_sample = (
    futures_continuous_contracts_1min_symbols[:10]
)

In [22]:
continuous_contracts_minutely_stats = (
    calculate_datetime_statistics_for_kibot_data(
        futures_continuous_contracts_1min_symbols_sample,
        "Futures",
        imcodatyp.Frequency.Minutely,
    )
)
continuous_contracts_minutely_stats

Unnamed: 0,start_date,end_date,data_points_count
,,,
AC,2009-09-28 11:56:00,2019-07-19 14:42:00,65213.0
AD,2009-09-27 18:00:00,2019-07-19 16:59:00,3282325.0
AE,2007-05-16 10:35:00,2019-07-19 11:41:00,29928.0
AEX,2009-09-28 06:01:00,2019-07-19 15:59:00,1682164.0
AJY,2009-10-21 11:22:00,2019-07-19 15:21:00,43759.0
ALJ,2014-03-31 02:30:00,2019-07-19 11:29:00,687524.0
ALM,2014-03-31 02:30:00,2019-07-19 11:29:00,421324.0
BB,2009-09-27 19:45:00,2019-07-19 16:42:00,444242.0
BBN,2011-09-25 15:48:00,2019-07-19 05:28:00,28372.0


### Continuous contracts Daily

In [23]:
futures_continuous_contracts_daily_symbols = s3_backend.get_symbols_for_dataset(
    "all_futures_continuous_contracts_daily"
)
len(futures_continuous_contracts_daily_symbols)

252

In [24]:
continuous_contracts_daily_stats = calculate_datetime_statistics_for_kibot_data(
    futures_continuous_contracts_daily_symbols,
    "Futures",
    imcodatyp.Frequency.Daily,
)
continuous_contracts_daily_stats.head(3)

Unnamed: 0,start_date,end_date,data_points_count
,,,
AC,2005-05-05,2019-07-19,3487.0
AD,1995-09-13,2019-07-19,6030.0
AE,2006-10-02,2019-07-19,3228.0


In [25]:
general_stats_all_futures = calculate_general_datetime_stats(
    continuous_contracts_daily_stats
)
general_stats_all_futures

Unnamed: 0,value
median_start_date,2005-09-20 00:00:00
median_end_date,2019-07-19 00:00:00
min_start_date,1982-05-03 00:00:00
max_end_date,2019-07-19 00:00:00
median_data_points,3483.5


# Read raw data

In [26]:
s3fs = hs3.get_s3fs("am")

## Example of raw data for Stocks

In [27]:
file_path_stock = "s3://alphamatic-data/data/kibot/all_stocks_1min/AAPL.csv.gz"

In [28]:
stream, kwargs = hs3.get_local_or_s3_stream(file_path_stock, s3fs=s3fs)
aapl_raw = hpandas.read_csv_to_df(stream, **kwargs)
aapl_raw.head()

Unnamed: 0,09/29/2015,04:00,103.9,103.96,103.9.1,103.96.1,974
0,09/29/2015,04:07,104.24,104.24,104.24,104.24,108
1,09/29/2015,04:13,104.29,104.29,104.29,104.29,216
2,09/29/2015,04:14,104.34,104.36,104.34,104.34,2272
3,09/29/2015,04:18,104.37,104.37,104.37,104.37,962
4,09/29/2015,04:19,104.38,104.41,104.38,104.41,649


## Example of raw data for Futures

In [29]:
file_path_futures = "s3://alphamatic-data/data/kibot/all_futures_continuous_contracts_daily/AE.csv.gz"

In [30]:
stream, kwargs = hs3.get_local_or_s3_stream(file_path_futures, s3fs=s3fs)
ae_futures_raw = hpandas.read_csv_to_df(stream, **kwargs)
ae_futures_raw.head()

Unnamed: 0,10/02/2006,158.8,158.8.1,158.8.2,158.8.3,0
0,10/03/2006,155.7,155.7,155.7,155.7,0
1,10/04/2006,157.0,157.0,157.0,157.0,0
2,10/05/2006,159.0,159.0,159.0,159.0,50
3,10/06/2006,160.2,160.2,160.2,160.2,0
4,10/09/2006,163.6,163.6,163.6,163.5,100


## Difference of raw Parquet stock data vs. CSV stock data

### CSV example of QCOM

In [31]:
file_path_stock = "s3://alphamatic-data/data/kibot/all_stocks_1min/QCOM.csv.gz"

In [32]:
stream, kwargs = hs3.get_local_or_s3_stream(file_path_futures, s3fs=s3fs)
csv_qcom = hpandas.read_csv_to_df(stream, **kwargs)
csv_qcom.head()

Unnamed: 0,10/02/2006,158.8,158.8.1,158.8.2,158.8.3,0
0,10/03/2006,155.7,155.7,155.7,155.7,0
1,10/04/2006,157.0,157.0,157.0,157.0,0
2,10/05/2006,159.0,159.0,159.0,159.0,50
3,10/06/2006,160.2,160.2,160.2,160.2,0
4,10/09/2006,163.6,163.6,163.6,163.5,100


### PQ example of QCOM

In [33]:
file_path_stock_parquet = (
    "s3://alphamatic-data/data/kibot/pq/all_stocks_1min/QCOM.pq"
)

In [34]:
stream, kwargs = hs3.get_local_or_s3_stream(file_path_stock_parquet, s3fs=s3fs)
pq_qcom = hpandas.read_parquet_to_df(stream, **kwargs)
pq_qcom.head()

Unnamed: 0_level_0,open,high,low,close,vol
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-09-29 08:05:00,44.32,44.32,44.32,44.32,357
2015-09-29 08:09:00,44.32,44.32,44.32,44.32,119
2015-09-29 08:11:00,44.32,44.32,44.32,44.32,595
2015-09-29 08:13:00,44.32,44.32,44.32,44.32,119
2015-09-29 08:18:00,44.32,44.32,44.32,44.32,476


# Summary

- The Kibot universe that is extracted using general methods is not consistent with the actual downloaded data and most likely is being parsed from the website at some period of time.
   - In order to observe the actual universe that is available in the database one need to run get_symbols_for_dataset().
- Data is presented in OHLCV format.
- The necessary param 'exchange' from read_data() is not specific at all: it can be any value.
- The stocks data in the database is huge and consists of >11.000 tickers.
   - However, 17.62 % of files in stock universe consists no data.
- The average available time period for stocks is ~5 years, up to 2020.
- The futures availability in database is much less: 252 continuous contracts for both daily and minutely frequencies.
- The OHLCV data inside raw files is identical by values and time range.
- PQ data is already transformed to the desired format (unlike CSV data):
   - The heading is in place.
   - Datetime is converted to index and presented in a complete data-time format.