## Process input file

In this notebook we load sample trades data (https://drive.google.com/file/d/1up5otVlfw-RX1S6K8o4d2nNRPP-lKran/view), resample it into 1-minute OHLCV bars, and store the result on S3 in a tiled Parquet format.

This assumes that the tar archive is in the root of the repository (`/app` in the command below).

In [1]:
# `mkdir -p` keeps this cell re-runnable: a plain `mkdir` fails when ./data
# already exists, and the `&&` would then skip the extraction entirely.
! mkdir -p data && tar xf /app/msfttaqcsv202308.tar -C ./data

In [2]:
!ls ./data

 metadata
'uT1dPod8mR2s_MSFT US Equity_quotes_1_1.csv.gz'
'uT1dPod8mR2s_MSFT US Equity_trades_1_1.csv.gz'


## Imports

In [3]:
import logging

import pandas as pd

import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hparquet as hparque
import helpers.hprint as hprint

  from tqdm.autonotebook import tqdm


In [4]:
# Logging verbosity used for this notebook.
log_level = logging.INFO
hdbg.init_logger(verbosity=log_level)

# Logger shared by the cells below.
_LOG = logging.getLogger(__name__)

# Log the first element of the system signature so the run environment is
# recorded next to the results.
_LOG.info("%s", henv.get_system_signature()[0])

hprint.config_notebook()

[0m[36mINFO[0m: > cmd='/venv/lib/python3.8/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-1c85050e-7966-49b2-b438-8abbf7666c9b.json'
INFO  # Git
  branch_name='CmampTask5425_FGP_equities_data_sample_exploratory_analysis'
  hash='e980801bf'
  # Last commits:
    * e980801bf jsmerix  Checkpoint                                                        (    8 days ago) Mon Sep 18 17:07:01 2023  (HEAD -> CmampTask5425_FGP_equities_data_sample_exploratory_analysis, origin/CmampTask5425_FGP_equities_data_sample_exploratory_analysis)
    * 740085bf4 jsmerix  Update after resampling full dataset                              (    8 days ago) Mon Sep 18 17:02:59 2023           
    * bab78b2db jsmerix  Add example notebook to load equity data and store to s3 as parquet (   11 days ago) Fri Sep 15 18:52:04 2023           
# Machine info
  system=Linux
  node name=0fe0a862b0be
  release=5.15.0-1044-aws
  version=#49~20.04.1-Ubuntu SMP Mon Aug 21 17:09:32 UTC 2023


## Load data

In [5]:
# Load the raw trades file that was extracted into ./data; the file is a
# gzip-compressed CSV (`.csv.gz`).
data = pd.read_csv("data/uT1dPod8mR2s_MSFT US Equity_trades_1_1.csv.gz")

In [6]:
data.head()

Unnamed: 0,SECURITY,TICK_SEQUENCE_NUMBER,TICK_TYPE,EVT_TRADE_TIME,TRADE_REPORTED_TIME,EVT_TRADE_EXECUTION_TIME,EVT_TRADE_IDENTIFIER,EVENT_ORIGINAL_TRADE_ID,EVENT_ORIGINAL_TRADE_TIME,EVT_TRADE_PRICE,EVT_TRADE_SIZE,EVT_TRADE_LOCAL_EXCH_SOURCE,EVT_TRADE_CONDITION_CODE,EVT_TRADE_BUY_BROKER,EVT_TRADE_SELL_BROKER,TRACE_RPT_PARTY_SIDE_LAST_TRADE,EVT_TRADE_RPT_PARTY_TYP,EVT_TRADE_BIC,EVT_TRADE_MIC,EVT_TRADE_ESMA_TRADE_FLAGS,EVT_TRADE_AGGRESSOR,EVT_TRADE_RPT_CONTRA_TYP,EVT_TRADE_REMUNERATION,EVT_TRADE_ATS_INDICATOR
0,MSFT US Equity,4417360,NEW,2023-08-01T00:00:00.050Z,2023-08-01T00:00:00.050Z,,,,,336.0,0.0,UF,OC,,,,,,,,,,,
1,MSFT US Equity,4417361,NEW,2023-08-01T00:00:00.050Z,2023-08-01T00:00:00.050Z,,,,,335.95,0.0,VY,OC,,,,,,,,,,,
2,MSFT US Equity,4417362,NEW,2023-08-01T00:00:00.050Z,2023-08-01T00:00:00.050Z,,,,,335.95,0.0,UX,OC,,,,,,,,,,,
3,MSFT US Equity,4417363,NEW,2023-08-01T00:00:00.050Z,2023-08-01T00:00:00.050Z,,,,,335.94,0.0,VF,OC,,,,,,,,,,,
4,MSFT US Equity,4417364,NEW,2023-08-01T00:00:00.050Z,2023-08-01T00:00:00.050Z,,,,,336.0,0.0,VG,OC,,,,,,,,,,,


Drop columns with all NaNs

In [7]:
# Discard every column that holds no data at all (all values are NaN).
data = data.dropna(axis="columns", how="all")

In [8]:
data.head()

Unnamed: 0,SECURITY,TICK_SEQUENCE_NUMBER,TICK_TYPE,EVT_TRADE_TIME,TRADE_REPORTED_TIME,EVT_TRADE_IDENTIFIER,EVENT_ORIGINAL_TRADE_ID,EVT_TRADE_PRICE,EVT_TRADE_SIZE,EVT_TRADE_LOCAL_EXCH_SOURCE,EVT_TRADE_CONDITION_CODE
0,MSFT US Equity,4417360,NEW,2023-08-01T00:00:00.050Z,2023-08-01T00:00:00.050Z,,,336.0,0.0,UF,OC
1,MSFT US Equity,4417361,NEW,2023-08-01T00:00:00.050Z,2023-08-01T00:00:00.050Z,,,335.95,0.0,VY,OC
2,MSFT US Equity,4417362,NEW,2023-08-01T00:00:00.050Z,2023-08-01T00:00:00.050Z,,,335.95,0.0,UX,OC
3,MSFT US Equity,4417363,NEW,2023-08-01T00:00:00.050Z,2023-08-01T00:00:00.050Z,,,335.94,0.0,VF,OC
4,MSFT US Equity,4417364,NEW,2023-08-01T00:00:00.050Z,2023-08-01T00:00:00.050Z,,,336.0,0.0,VG,OC


In [9]:
data["SECURITY"].value_counts()

MSFT US Equity    7629374
Name: SECURITY, dtype: int64

In [10]:
# Rows whose trade size is exactly zero (zero, not missing, despite the name).
null_size = data.loc[data["EVT_TRADE_SIZE"].eq(0)]

In [11]:
null_size.shape

(1222, 11)

Some trades have 0 size

In [12]:
null_size.head()

Unnamed: 0,SECURITY,TICK_SEQUENCE_NUMBER,TICK_TYPE,EVT_TRADE_TIME,TRADE_REPORTED_TIME,EVT_TRADE_IDENTIFIER,EVENT_ORIGINAL_TRADE_ID,EVT_TRADE_PRICE,EVT_TRADE_SIZE,EVT_TRADE_LOCAL_EXCH_SOURCE,EVT_TRADE_CONDITION_CODE
0,MSFT US Equity,4417360,NEW,2023-08-01T00:00:00.050Z,2023-08-01T00:00:00.050Z,,,336.0,0.0,UF,OC
1,MSFT US Equity,4417361,NEW,2023-08-01T00:00:00.050Z,2023-08-01T00:00:00.050Z,,,335.95,0.0,VY,OC
2,MSFT US Equity,4417362,NEW,2023-08-01T00:00:00.050Z,2023-08-01T00:00:00.050Z,,,335.95,0.0,UX,OC
3,MSFT US Equity,4417363,NEW,2023-08-01T00:00:00.050Z,2023-08-01T00:00:00.050Z,,,335.94,0.0,VF,OC
4,MSFT US Equity,4417364,NEW,2023-08-01T00:00:00.050Z,2023-08-01T00:00:00.050Z,,,336.0,0.0,VG,OC


In [13]:
data.dtypes

SECURITY                        object
TICK_SEQUENCE_NUMBER             int64
TICK_TYPE                       object
EVT_TRADE_TIME                  object
TRADE_REPORTED_TIME             object
EVT_TRADE_IDENTIFIER            object
EVENT_ORIGINAL_TRADE_ID        float64
EVT_TRADE_PRICE                float64
EVT_TRADE_SIZE                 float64
EVT_TRADE_LOCAL_EXCH_SOURCE     object
EVT_TRADE_CONDITION_CODE        object
dtype: object

Set datetime index

In [14]:
# Parse the reported trade time into datetimes and use it as the row index.
data = data.assign(
    timestamp=pd.to_datetime(data["TRADE_REPORTED_TIME"])
).set_index("timestamp")

Keep only the columns needed for resampling: trade price and trade size

In [15]:
# Only price and size are needed for the OHLCV computation.
data = data.loc[:, ["EVT_TRADE_PRICE", "EVT_TRADE_SIZE"]]

## Compute OHLCV

Our time interval labelling convention is that the time interval [a, b) is labelled as b.

E.g., the interval [06:40:00, 06:41:00) is labelled with the timestamp 06:41:00.

In [16]:
# Aggregate trade prices into 1-minute open/high/low/close bars. Each bar
# covers the half-open interval [a, b) and is labelled with its right edge b.
# The "min" alias is used instead of "T", which pandas >= 2.2 deprecates; both
# denote the same minutely frequency.
data_ohlcv = (
    data["EVT_TRADE_PRICE"]
    .resample("1min", closed="left", label="right")
    .ohlc()
)

In [17]:
# Sum the traded size per 1-minute bar using the same [a, b) -> b labelling as
# the price bars. The "min" alias is used instead of "T", which pandas >= 2.2
# deprecates; both denote the same minutely frequency.
data_volume = (
    data["EVT_TRADE_SIZE"]
    .resample("1min", closed="left", label="right")
    .sum()
    .rename("volume")
)

In [18]:
# Put the price bars and the volume side by side, aligned on the bar timestamp.
# NOTE: this rebinds `data` from the raw trades to the resampled bars.
data = pd.concat([data_ohlcv, data_volume], axis="columns")

In [19]:
data.head()

Unnamed: 0_level_0,open,high,low,close,volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-08-01 00:01:00+00:00,336.0,336.0,335.6,335.92,0.0
2023-08-01 00:02:00+00:00,,,,,0.0
2023-08-01 00:03:00+00:00,,,,,0.0
2023-08-01 00:04:00+00:00,,,,,0.0
2023-08-01 00:05:00+00:00,,,,,0.0


In [20]:
# Tag every bar with the instrument. NOTE(review): the column is named
# `currency_pair` although the value is an equity ticker; presumably this
# matches the schema expected by downstream readers; confirm.
data["currency_pair"] = "MSFT"
# Record when this dataset was produced, as a tz-aware UTC timestamp.
# `Timestamp.now(tz="UTC")` is the equivalent of `Timestamp.utcnow()`, which
# recent pandas releases deprecate.
data["knowledge_timestamp"] = pd.Timestamp.now(tz="UTC")

In [21]:
data.head()

Unnamed: 0_level_0,open,high,low,close,volume,currency_pair,knowledge_timestamp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-08-01 00:01:00+00:00,336.0,336.0,335.6,335.92,0.0,MSFT,2023-09-26 16:16:04.623076+00:00
2023-08-01 00:02:00+00:00,,,,,0.0,MSFT,2023-09-26 16:16:04.623076+00:00
2023-08-01 00:03:00+00:00,,,,,0.0,MSFT,2023-09-26 16:16:04.623076+00:00
2023-08-01 00:04:00+00:00,,,,,0.0,MSFT,2023-09-26 16:16:04.623076+00:00
2023-08-01 00:05:00+00:00,,,,,0.0,MSFT,2023-09-26 16:16:04.623076+00:00


## Save as parquet

In [22]:
# Partitioning scheme passed to `hparque.add_date_partition_columns` when saving.
partition_mode = "by_year_month"
# Destination prefix on S3 for the partitioned Parquet dataset.
# TODO(Juraj): FGP doesn't have access to this bucket
s3_path = "s3://cryptokaizen-data-test/v3/bulk/manual/resampled_1min/parquet/ohlcv/spot/v1/bloomberg/us_market/v1_0_0/"
# AWS profile name handed to the parquet helpers as `aws_profile`.
aws_profile = "ck"

In [23]:
# Add the date-derived partition columns, then write the frame to S3,
# partitioned by `currency_pair` first and by the date columns after it.
data, partition_cols = hparque.add_date_partition_columns(data, partition_mode)
hparque.to_partitioned_parquet(
    data, ["currency_pair", *partition_cols], s3_path, aws_profile=aws_profile
)

## Load back from parquet

In [24]:
# Read the dataset back from the same S3 location so the written data can be
# inspected below.
ohlcv_data = hparque.from_parquet(s3_path, aws_profile=aws_profile)

In [25]:
ohlcv_data.head()

Unnamed: 0_level_0,open,high,low,close,volume,knowledge_timestamp,currency_pair,year,month
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-08-01 00:01:00+00:00,336.0,336.0,335.6,335.92,0.0,2023-09-26 16:16:04.623076+00:00,MSFT,2023,8
2023-08-01 00:02:00+00:00,,,,,0.0,2023-09-26 16:16:04.623076+00:00,MSFT,2023,8
2023-08-01 00:03:00+00:00,,,,,0.0,2023-09-26 16:16:04.623076+00:00,MSFT,2023,8
2023-08-01 00:04:00+00:00,,,,,0.0,2023-09-26 16:16:04.623076+00:00,MSFT,2023,8
2023-08-01 00:05:00+00:00,,,,,0.0,2023-09-26 16:16:04.623076+00:00,MSFT,2023,8


In [26]:
ohlcv_data.index.min()

Timestamp('2023-08-01 00:01:00+0000', tz='UTC')

In [27]:
ohlcv_data.index.max()

Timestamp('2023-08-31 23:30:00+0000', tz='UTC')