# Binance native downloader

## Clone repo

```bash
git clone https://github.com/binance/binance-public-data.git
cd binance-public-data
```

## Download one month data

```bash
python3 download-trade.py -t um -s "BTCUSDT" -startDate 2023-03-01 -endDate 2023-03-01 -folder "/app/im_v2/binance/data/download"
```

```bash
python3 download-trade.py -t spot -s "BTCUSDT" -startDate 2020-01-01 -endDate 2020-02-01 -folder "/app/im_v2/binance/data/download"
```

# Imports

In [4]:
import logging

import numpy as np
import pandas as pd

import helpers.hdatetime as hdateti
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hpandas as hpandas
import helpers.hprint as hprint
import im_v2.ccxt.data.extract.extractor as ivcdexex
import im_v2.common.data.client.im_raw_data_client as imvcdcimrdc
import im_v2.common.data.qa.dataset_validator as imvcdqdava
import im_v2.common.data.qa.qa_check as imvcdqqach
import im_v2.common.universe as ivcu
import im_v2.common.universe.universe as imvcounun

%load_ext autoreload
%autoreload 2

  from tqdm.autonotebook import tqdm


In [5]:
hdbg.init_logger(verbosity=logging.INFO)

_LOG = logging.getLogger(__name__)

_LOG.info("%s", henv.get_system_signature()[0])

hprint.config_notebook()

[0m[36mINFO[0m: > cmd='/venv/lib/python3.8/site-packages/ipykernel_launcher.py -f /home/.local/share/jupyter/runtime/kernel-9ce134c5-c361-4152-866a-3fc617f88e8a.json'
[31m-----------------------------------------------------------------------------
This code is not in sync with the container:
code_version='1.4.3' != container_version='1.4.0'
-----------------------------------------------------------------------------
You need to:
- merge origin/master into your branch with `invoke git_merge_master`
- pull the latest container with `invoke docker_pull`[0m
INFO  # Git
  branch_name='CmTask4098_Download_binance_spotfutures_historical_trades_data'
  hash='bc6b60131'
  # Last commits:
    * bc6b60131 Grigorii Pomazkin checkpoint (#4230)                                                (    2 days ago) Fri Apr 14 20:18:37 2023  (HEAD -> CmTask4098_Download_binance_spotfutures_historical_trades_data, origin/master, origin/HEAD, origin/CmTask4098_Download_binance_spotfutures_historical_tra

# Download 1 day by CCXT extractor

In [18]:
%%bash
/app/im_v2/common/data/extract/download_bulk.py \
    --download_mode 'bulk' \
    --downloading_entity 'manual' \
    --action_tag 'downloaded_1min' \
    --vendor 'ccxt' \
    --start_timestamp '2023-03-01T00:00:00+00:00' \
    --end_timestamp '2023-03-01T00:10:00+00:00' \
    --exchange_id 'binance' \
    --universe 'v7.3' \
    --aws_profile 'ck' \
    --data_type 'trades' \
    --data_format 'csv' \
    --contract_type 'futures' \
    --s3_path 's3://cryptokaizen-data-test/'

[0m[36mINFO[0m: > cmd='/app/im_v2/common/data/extract/download_bulk.py --download_mode bulk --downloading_entity manual --action_tag downloaded_1min --vendor ccxt --start_timestamp 2023-03-01T00:00:00+00:00 --end_timestamp 2023-03-01T00:10:00+00:00 --exchange_id binance --universe v7.3 --aws_profile ck --data_type trades --data_format csv --contract_type futures --s3_path s3://cryptokaizen-data-test/'
report_memory_usage=False report_cpu_usage=False
[36mINFO[0m: Saving log to file '/app/im_v2/common/data/extract/download_bulk.py.log'
11:11:56 - [36mINFO [0m dataset_schema_utils.py _get_dataset_schema_file_path:39 Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
11:11:56 - [36mINFO [0m dataset_schema_utils.py get_dataset_schema:74          Loaded dataset schema version v3
11:12:00 - [36mINFO [0m extract_utils.py download_historical_data:734          Saving the dataset into s3://cryptokaizen-data-test/v3/bulk/manual/downloaded_1min/c

# Read CCXT extractor data

In [59]:
signature = (
    "bulk.manual.downloaded_1min.csv.trades.futures.v7_3.ccxt.binance.v1_0_0"
)
reader = imvcdcimrdc.RawDataReader(signature, stage="test")
start_timestamp = pd.Timestamp("2023-03-01T00:00:00+00:00")
end_timestamp = pd.Timestamp("2023-03-02T00:00:00+00:00")
currency_pair = "BTC_USDT"
ccxt_trades = reader.load_csv(
    currency_pair, start_timestamp=start_timestamp, end_timestamp=end_timestamp
)
ccxt_trades.head()

s3://cryptokaizen-data-test
s3://cryptokaizen-data-test/v3/bulk/manual/downloaded_1min/csv/trades/futures/v7_3/ccxt/binance/v1_0_0


Unnamed: 0,id,timestamp,symbol,side,price,amount,end_download_timestamp,currency_pair,exchange_id,knowledge_timestamp
0,1618988283,1677628800053,BTC/USDT,buy,23129.7,2.451,2023-03-15 15:12:07.394774+00:00,BTC_USDT,binance,2023-03-15 15:12:07.397800+00:00
1,1618988284,1677628800085,BTC/USDT,sell,23129.6,0.321,2023-03-15 15:12:07.394774+00:00,BTC_USDT,binance,2023-03-15 15:12:07.397800+00:00
2,1618988285,1677628803913,BTC/USDT,buy,23129.7,0.002,2023-03-15 15:12:07.394774+00:00,BTC_USDT,binance,2023-03-15 15:12:07.397800+00:00
3,1618988286,1677628804251,BTC/USDT,buy,23129.7,8.72,2023-03-15 15:12:07.394774+00:00,BTC_USDT,binance,2023-03-15 15:12:07.397800+00:00
4,1618988287,1677628804255,BTC/USDT,buy,23129.8,0.059,2023-03-15 15:12:07.394774+00:00,BTC_USDT,binance,2023-03-15 15:12:07.397800+00:00


In [41]:
ccxt_trades

Unnamed: 0,id,timestamp,symbol,side,price,amount,end_download_timestamp,currency_pair,exchange_id,knowledge_timestamp
0,1618988283,1677628800053,BTC/USDT,buy,23129.7,2.451,2023-03-15 15:12:07.394774+00:00,BTC_USDT,binance,2023-03-15 15:12:07.397800+00:00
1,1618988284,1677628800085,BTC/USDT,sell,23129.6,0.321,2023-03-15 15:12:07.394774+00:00,BTC_USDT,binance,2023-03-15 15:12:07.397800+00:00
2,1618988285,1677628803913,BTC/USDT,buy,23129.7,0.002,2023-03-15 15:12:07.394774+00:00,BTC_USDT,binance,2023-03-15 15:12:07.397800+00:00
3,1618988286,1677628804251,BTC/USDT,buy,23129.7,8.720,2023-03-15 15:12:07.394774+00:00,BTC_USDT,binance,2023-03-15 15:12:07.397800+00:00
4,1618988287,1677628804255,BTC/USDT,buy,23129.8,0.059,2023-03-15 15:12:07.394774+00:00,BTC_USDT,binance,2023-03-15 15:12:07.397800+00:00
...,...,...,...,...,...,...,...,...,...,...
6671,1618994954,1677629399445,BTC/USDT,sell,23108.7,0.002,2023-03-15 15:12:07.394774+00:00,BTC_USDT,binance,2023-03-15 15:12:07.397800+00:00
6672,1618994955,1677629399637,BTC/USDT,sell,23108.7,0.015,2023-03-15 15:12:07.394774+00:00,BTC_USDT,binance,2023-03-15 15:12:07.397800+00:00
6673,1618994956,1677629399807,BTC/USDT,sell,23108.7,0.001,2023-03-15 15:12:07.394774+00:00,BTC_USDT,binance,2023-03-15 15:12:07.397800+00:00
6674,1618994957,1677629399807,BTC/USDT,sell,23108.6,0.005,2023-03-15 15:12:07.394774+00:00,BTC_USDT,binance,2023-03-15 15:12:07.397800+00:00


# Read native Binance data

In [121]:
file_path = "/app/im_v2/binance/data/download/data/futures/um/daily/trades/BTCUSDT/2023-03-01_2023-03-01/BTCUSDT-trades-2023-03-01.zip"
# columns = ["trade_id", "price", "qty", "quoteQty", "time", "isBuyerMaker", "isBestMatch"]
binance_data = pd.read_csv(file_path)

In [122]:
start_timestamp = pd.Timestamp("2023-03-01T00:00:00+00:00")
end_timestamp = pd.Timestamp("2023-03-01T00:10:00+00:00")
start_timestamp = hdateti.convert_timestamp_to_unix_epoch(start_timestamp, "ms")
end_timestamp = hdateti.convert_timestamp_to_unix_epoch(end_timestamp, "ms")

In [123]:
binance_data = binance_data[
    binance_data.time.between(start_timestamp, end_timestamp)
]

In [124]:
binance_data

Unnamed: 0,id,price,qty,quote_qty,time,is_buyer_maker
0,3348676218,23129.7,0.224,5181.05,1677628800053,False
1,3348676219,23129.7,0.004,92.51,1677628800084,False
2,3348676220,23129.7,0.023,531.98,1677628800084,False
3,3348676221,23129.7,0.048,1110.22,1677628800084,False
4,3348676222,23129.7,0.007,161.90,1677628800084,False
...,...,...,...,...,...,...
22045,3348698263,23108.7,0.001,23.10,1677629399807,True
22046,3348698264,23108.6,0.001,23.10,1677629399807,True
22047,3348698265,23108.6,0.001,23.10,1677629399807,True
22048,3348698266,23108.6,0.003,69.32,1677629399807,True


## Compare min-max timestamps

In [116]:
binance_data.time.agg(["min", "max"]).apply(pd.Timestamp, unit="ms")

min   2023-03-01 00:00:00.053
max   2023-03-01 00:09:59.880
Name: time, dtype: datetime64[ns]

In [40]:
ccxt_trades.timestamp.agg(["min", "max"]).apply(pd.Timestamp, unit="ms")

min   2023-03-01 00:00:00.053
max   2023-03-01 00:09:59.880
Name: timestamp, dtype: datetime64[ns]

## Compare IDs

In [47]:
ccxt_trades[ccxt_trades.id.isin(binance_data.id)]

Unnamed: 0,id,timestamp,symbol,side,price,amount,end_download_timestamp,currency_pair,exchange_id,knowledge_timestamp


In [58]:
ccxt_trades[ccxt_trades.timestamp.isin(binance_data.time)].head(10)["timestamp"]

0    1677628800053
1    1677628800085
2    1677628803913
3    1677628804251
4    1677628804255
5    1677628804298
6    1677628804323
7    1677628804358
8    1677628804414
9    1677628804417
Name: timestamp, dtype: int64

In [80]:
ccxt_trades[ccxt_trades.timestamp == 1677628804255]

Unnamed: 0,id,timestamp,symbol,side,price,amount,end_download_timestamp,currency_pair,exchange_id,knowledge_timestamp
4,1618988287,1677628804255,BTC/USDT,buy,23129.8,0.059,2023-03-15 15:12:07.394774+00:00,BTC_USDT,binance,2023-03-15 15:12:07.397800+00:00


In [86]:
binance_data[binance_data.time == 1677628804255]

Unnamed: 0,id,price,qty,quote_qty,time,is_buyer_maker
49,3348676267,23129.7,1.448,33491.8,1677628804255,False
50,3348676268,23129.7,0.08,1850.37,1677628804255,False
51,3348676269,23129.7,0.344,7956.61,1677628804255,False
52,3348676270,23129.7,0.158,3654.49,1677628804255,False
53,3348676271,23129.8,0.059,1364.65,1677628804255,False


In [84]:
binance_data[(binance_data.time == 1677628804255) & (binance_data.qty == 0.059)]

Unnamed: 0,id,price,qty,quote_qty,time,is_buyer_maker
53,3348676271,23129.8,0.059,1364.65,1677628804255,False


In [79]:
binance_data[binance_data.time == 1677628804255].agg("sum")["qty"]

2.089

## Adjust CCXT data

In [3]:
ccxt_trades = ccxt_trades.reset_index(drop=True)

NameError: name 'ccxt_trades' is not defined

In [160]:
btc_ccxt_trades = ccxt_trades[ccxt_trades.currency_pair == "BTC_USDT"]

In [161]:
btc_ccxt_trades.drop(
    columns=[
        "year",
        "month",
        "day",
        "currency_pair",
        "exchange_id",
        "knowledge_timestamp",
        "symbol",
    ],
    inplace=True,
)

In [162]:
btc_ccxt_trades = btc_ccxt_trades.reset_index(drop=True)

In [163]:
btc_ccxt_trades[~btc_ccxt_trades.timestamp.isin(binance_data.time.unique())]

Unnamed: 0,timestamp,side,price,amount


## Adjust binance native data

In [170]:
binance_data["side"] = binance_data.is_buyer_maker.map(
    {False: "sell", True: "buy"}
)

In [168]:
binance_data.rename(columns={"time": "timestamp", "qty": "amount"}, inplace=True)

In [171]:
binance_data = binance_data[["timestamp", "side", "price", "amount"]]

In [172]:
btc_ccxt_trades.sort_values(by=["timestamp"], ignore_index=True, inplace=True)

In [173]:
len(binance_data) - len(btc_ccxt_trades)

2969174

In [174]:
binance_data = binance_data[
    binance_data.timestamp.isin(btc_ccxt_trades.timestamp)
]

In [178]:
len(binance_data) - len(btc_ccxt_trades)

2000805

# Bare CCXT client data

In [201]:
import ccxt

extractor = ccxt.binance({"options": {"defaultType": "future"}})

In [202]:
start_timestamp = pd.Timestamp("2023-03-01T00:00:00+00:00")
start_timestamp = hdateti.convert_timestamp_to_unix_epoch(start_timestamp)
currency_pair = "BTC/USDT"
limit = 1000
data = extractor.fetch_trades(
    currency_pair,
    since=start_timestamp,
    limit=limit,
)

In [203]:
data = pd.DataFrame(data)

In [204]:
data[data.timestamp.isin(binance_data.time)].head()

Unnamed: 0,info,timestamp,datetime,symbol,id,order,type,side,takerOrMaker,price,amount,cost,fee,fees
0,"{'a': '1618988283', 'p': '23129.70', 'q': '2.4...",1677628800053,2023-03-01T00:00:00.053Z,BTC/USDT,1618988283,,,buy,taker,23129.7,2.451,56690.8947,,[]
1,"{'a': '1618988284', 'p': '23129.60', 'q': '0.3...",1677628800085,2023-03-01T00:00:00.085Z,BTC/USDT,1618988284,,,sell,taker,23129.6,0.321,7424.6016,,[]
2,"{'a': '1618988285', 'p': '23129.70', 'q': '0.0...",1677628803913,2023-03-01T00:00:03.913Z,BTC/USDT,1618988285,,,buy,taker,23129.7,0.002,46.2594,,[]
3,"{'a': '1618988286', 'p': '23129.70', 'q': '8.7...",1677628804251,2023-03-01T00:00:04.251Z,BTC/USDT,1618988286,,,buy,taker,23129.7,8.72,201690.984,,[]
4,"{'a': '1618988287', 'p': '23129.80', 'q': '0.0...",1677628804255,2023-03-01T00:00:04.255Z,BTC/USDT,1618988287,,,buy,taker,23129.8,0.059,1364.6582,,[]


In [133]:
import json

In [205]:
data[data.timestamp == 1677628800053]

Unnamed: 0,info,timestamp,datetime,symbol,id,order,type,side,takerOrMaker,price,amount,cost,fee,fees
0,"{'a': '1618988283', 'p': '23129.70', 'q': '2.4...",1677628800053,2023-03-01T00:00:00.053Z,BTC/USDT,1618988283,,,buy,taker,23129.7,2.451,56690.8947,,[]


In [206]:
data[data.timestamp == 1677628800053]["info"].to_json()

'{"0":{"a":"1618988283","p":"23129.70","q":"2.451","f":"3348676218","l":"3348676236","T":"1677628800053","m":false}}'

In [207]:
binance_data[binance_data.time == 1677628800053]

Unnamed: 0,id,price,qty,quote_qty,time,is_buyer_maker
0,3348676218,23129.7,0.224,5181.05,1677628800053,False


# BinanceExtractor download - futures

In [60]:
import im_v2.binance.data.extract.extractor as ivbdexex

contract_type = "futures"
binance_extractor = ivbdexex.BinanceExtractor(contract_type)

In [61]:
exchange_id = "binance"
currency_pair = "BTC_USDT"
start_timestamp = pd.Timestamp("2021-08-31 00:00:00")
end_timestamp = pd.Timestamp("2021-08-31 23:59:59")
binance_data = binance_extractor._download_trades(
    exchange_id,
    currency_pair,
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
)


File Download: https://data.binance.vision/data/futures/um/daily/trades/BTCUSDT/BTCUSDT-trades-2021-08-31.zip
[##################################################]INFO  Extracting ./tmp/binance/8fa3fe97-10c1-4107-b8b6-d592c8126a61/BTCUSDT-trades-2021-08-31.zip


In [62]:
binance_data.head()

Unnamed: 0,timestamp,price,amount,side,end_download_timestamp
0,1630368000073,47024.69,0.001,buy,2023-03-24 18:12:35.904601+00:00
1,1630368003229,47024.69,0.021,buy,2023-03-24 18:12:35.904601+00:00
2,1630368003229,47024.69,0.01,buy,2023-03-24 18:12:35.904601+00:00
3,1630368003230,47024.69,0.024,buy,2023-03-24 18:12:35.904601+00:00
4,1630368003231,47024.7,0.007,sell,2023-03-24 18:12:35.904601+00:00


In [53]:
binance_data.timestamp.agg(["min", "max"]).apply(pd.Timestamp, unit="ms")

min   2021-08-31 00:00:00.073
max   2021-08-31 23:59:58.985
Name: timestamp, dtype: datetime64[ns]

# BinanceExtractor download - trades - spot

In [10]:
import im_v2.binance.data.extract.extractor as ivbdexex

contract_type = "spot"
time_period = ivbdexex.BinanceNativeTimePeriod.DAILY
binance_extractor = ivbdexex.BinanceExtractor(contract_type, time_period)

In [11]:
exchange_id = "binance"
currency_pair = "BTC_USDT"
start_timestamp = pd.Timestamp("2021-04-01T00:00:00+00:00")
end_timestamp = pd.Timestamp("2021-04-30T23:59:59+00:00")
binance_data = binance_extractor._download_trades(
    exchange_id,
    currency_pair,
    start_timestamp=start_timestamp,
    end_timestamp=end_timestamp,
)


File Download: https://data.binance.vision/data/spot/daily/trades/BTCUSDT/BTCUSDT-trades-2021-04-01.zip
[##################################################]


File Download: https://data.binance.vision/data/spot/daily/trades/BTCUSDT/BTCUSDT-trades-2021-04-02.zip
[##################################################]


File Download: https://data.binance.vision/data/spot/daily/trades/BTCUSDT/BTCUSDT-trades-2021-04-03.zip
[##################################################]


File Download: https://data.binance.vision/data/spot/daily/trades/BTCUSDT/BTCUSDT-trades-2021-04-04.zip
[##################################################]


File Download: https://data.binance.vision/data/spot/daily/trades/BTCUSDT/BTCUSDT-trades-2021-04-05.zip
[##################################################]


File Download: https://data.binance.vision/data/spot/daily/trades/BTCUSDT/BTCUSDT-trades-2021-04-06.zip
[##################################################]


File Download: https://data.binance.vision/da

In [12]:
df1 = next(binance_data)

In [15]:
df1.timestamp.sort_values().apply(pd.Timestamp, unit="ms")

0         2021-04-09 00:00:00.007
1         2021-04-09 00:00:00.065
2         2021-04-09 00:00:00.129
3         2021-04-09 00:00:00.160
4         2021-04-09 00:00:00.166
                    ...          
1615957   2021-04-09 23:59:59.665
1615958   2021-04-09 23:59:59.747
1615959   2021-04-09 23:59:59.838
1615960   2021-04-09 23:59:59.876
1615961   2021-04-09 23:59:59.904
Name: timestamp, Length: 1615962, dtype: datetime64[ns]

# BinanceExtractor download - trades - spot - monthly

In [90]:
import im_v2.binance.data.extract.extractor as ivbdexex

contract_type = "spot"
time_period = ivbdexex.BinanceNativeTimePeriod.MONTHLY
binance_extractor = ivbdexex.BinanceExtractor(
    contract_type, time_period=time_period
)

In [91]:
exchange_id = "binance"
currency_pair = "BTCUSDT"
start_timestamp = pd.Timestamp("2020-01-01 00:00:00")
end_timestamp = pd.Timestamp("2020-02-01 00:00:00")
monthly_iterator = binance_extractor._get_trades_iterator(
    currency_pair, start_timestamp, end_timestamp
)


File Download: https://data.binance.vision/data/spot/monthly/trades/BTCUSDT/BTCUSDT-trades-2020-01.zip
[##################################################]



In [92]:
df = next(monthly_iterator)

In [93]:
del monthly_iterator

In [94]:
hasattr(binance_extractor, "tmp_dir_path")

True

In [95]:
binance_extractor.tmp_dir_path

'./tmp/binance/1a9df707-acaf-4f93-8650-27a681f6f21d'

In [96]:
del binance_extractor

In [56]:
import os

import helpers.hio as hio

In [97]:
os.path.exists("./tmp/binance/1a9df707-acaf-4f93-8650-27a681f6f21d")

False

In [37]:
df

Unnamed: 0,timestamp,price,amount,side,end_download_timestamp
0,1577836800594,7195.24,0.001500,buy,2023-04-04 17:54:30.113616+00:00
1,1577836800862,7196.25,0.022706,sell,2023-04-04 17:54:30.113616+00:00
2,1577836800862,7196.25,0.099650,sell,2023-04-04 17:54:30.113616+00:00
3,1577836800998,7196.25,0.001543,sell,2023-04-04 17:54:30.113616+00:00
4,1577836801503,7195.25,0.400000,buy,2023-04-04 17:54:30.113616+00:00
...,...,...,...,...,...
19995,1577845659218,7236.91,0.003329,sell,2023-04-04 17:54:30.113616+00:00
19996,1577845659401,7236.84,0.060016,sell,2023-04-04 17:54:30.113616+00:00
19997,1577845659768,7237.08,0.060192,sell,2023-04-04 17:54:30.113616+00:00
19998,1577845660008,7236.83,0.006467,sell,2023-04-04 17:54:30.113616+00:00


# Download by script - trades - futures

In [17]:
%%bash
/app/im_v2/common/data/extract/download_bulk.py \
    --download_mode 'bulk' \
    --downloading_entity 'airflow' \
    --action_tag 'downloaded_1min' \
    --vendor 'binance' \
    --start_timestamp '2021-08-31T00:00:00+00:00' \
    --end_timestamp '2021-08-31T23:59:00+00:00' \
    --exchange_id 'binance' \
    --universe 'v1' \
    --incremental \
    --aws_profile 'ck' \
    --data_type 'trades' \
    --data_format 'parquet' \
    --contract_type 'futures' \
    --s3_path 's3://cryptokaizen-data-test/'

[0m[36mINFO[0m: > cmd='/app/im_v2/common/data/extract/download_bulk.py --download_mode bulk --downloading_entity airflow --action_tag downloaded_1min --vendor binance --start_timestamp 2021-08-31T00:00:00+00:00 --end_timestamp 2021-08-31T23:59:00+00:00 --exchange_id binance --universe v1 --incremental --aws_profile ck --data_type trades --data_format parquet --contract_type futures --s3_path s3://cryptokaizen-data-test/'
report_memory_usage=False report_cpu_usage=False
[36mINFO[0m: Saving log to file '/app/im_v2/common/data/extract/download_bulk.py.log'
04:51:53 - [36mINFO [0m dataset_schema_utils.py _get_dataset_schema_file_path:39 Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
04:51:53 - [36mINFO [0m dataset_schema_utils.py get_dataset_schema:74          Loaded dataset schema version v3
04:51:53 - [36mINFO [0m dataset_schema_utils.py _get_dataset_schema_file_path:39 Loading dataset schema file: /app/data_schema/dataset_schema_v

04:54:50 - [36mINFO [0m extract_utils.py download_historical_data:734          Saving the dataset into s3://cryptokaizen-data-test/v3/bulk/airflow/downloaded_1min/parquet/trades/futures/v1/binance/binance/v1_0_0

File Download: ./tmp/binance/62019104-d1cd-4cdc-b487-0d2d6cca8ccb/RUNEUSDT-trades-2021-08-31.zip
[##################################################]04:54:54 - [36mINFO [0m extractor.py _extract_data_from_binance_files:215      Extracting ./tmp/binance/62019104-d1cd-4cdc-b487-0d2d6cca8ccb/RUNEUSDT-trades-2021-08-31.zip
04:54:54 - [36mINFO [0m extract_utils.py download_historical_data:734          Saving the dataset into s3://cryptokaizen-data-test/v3/bulk/airflow/downloaded_1min/parquet/trades/futures/v1/binance/binance/v1_0_0

File Download: ./tmp/binance/b6add9d8-e7e7-43fd-9dfc-63bbe8e631ad/BAKEUSDT-trades-2021-08-31.zip
[##################################################]04:55:00 - [36mINFO [0m extractor.py _extract_data_from_binance_files:215      Extracting ./tmp/

## Read data saved by script

In [134]:
signature = (
    "bulk.airflow.downloaded_1min.parquet.trades.spot.v1.binance.binance.v1_0_0"
)
reader = imvcdcimrdc.RawDataReader(signature, stage="test")
start_timestamp = pd.Timestamp("2020-02-10T00:00:00+00:00")
end_timestamp = pd.Timestamp("2020-02-10T23:59:00+00:00")
currency_pairs = ["BTC_USDT"]
binance_trades = reader.read_data(
    start_timestamp,
    end_timestamp,
    currency_pairs=currency_pairs,
)
binance_trades.head()

INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3


Unnamed: 0_level_0,timestamp,price,amount,side,exchange_id,knowledge_timestamp,currency_pair,year,month,day
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [135]:
binance_trades.isna().any().any()

False

In [136]:
binance_trades.shape

(0, 10)

In [137]:
binance_trades.timestamp.agg(["min", "max"]).apply(pd.Timestamp, unit="ms")

min   NaT
max   NaT
Name: timestamp, dtype: datetime64[ns]

# Download by script - trades - spot

In [146]:
%%bash
/app/im_v2/common/data/extract/download_bulk.py \
    --download_mode 'bulk' \
    --downloading_entity 'airflow' \
    --action_tag 'downloaded_1min' \
    --vendor 'binance' \
    --start_timestamp '2021-07-01T00:00:00+00:00' \
    --end_timestamp '2021-07-01T23:59:00+00:00' \
    --exchange_id 'binance' \
    --universe 'v1' \
    --incremental \
    --aws_profile 'ck' \
    --data_type 'trades' \
    --data_format 'parquet' \
    --contract_type 'spot' \
    --s3_path 's3://cryptokaizen-data-test/'

[0m[36mINFO[0m: > cmd='/app/im_v2/common/data/extract/download_bulk.py --download_mode bulk --downloading_entity airflow --action_tag downloaded_1min --vendor binance --start_timestamp 2021-07-01T00:00:00+00:00 --end_timestamp 2021-07-01T23:59:00+00:00 --exchange_id binance --universe v1 --incremental --aws_profile ck --data_type trades --data_format parquet --contract_type spot --s3_path s3://cryptokaizen-data-test/'
report_memory_usage=False report_cpu_usage=False
[36mINFO[0m: Saving log to file '/app/im_v2/common/data/extract/download_bulk.py.log'
05:40:15 - [36mINFO [0m dataset_schema_utils.py _get_dataset_schema_file_path:39 Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
05:40:15 - [36mINFO [0m dataset_schema_utils.py get_dataset_schema:74          Loaded dataset schema version v3
05:40:15 - [36mINFO [0m dataset_schema_utils.py _get_dataset_schema_file_path:39 Loading dataset schema file: /app/data_schema/dataset_schema_vers

[##################################################]05:41:32 - [36mINFO [0m extractor.py _extract_data_from_binance_files:226      Extracting ./tmp/binance/fdb63e21-2e5d-443f-a158-b1e2893868bc/CRVUSDT-trades-2021-07-01.zip
05:41:32 - [36mINFO [0m extract_utils.py download_historical_data:734          Saving the dataset into s3://cryptokaizen-data-test/v3/bulk/airflow/downloaded_1min/parquet/trades/spot/v1/binance/binance/v1_0_0

File Download: https://data.binance.vision/data/spot/daily/trades/RUNEUSDT/RUNEUSDT-trades-2021-07-01.zip
[##################################################]05:41:34 - [36mINFO [0m extractor.py _extract_data_from_binance_files:226      Extracting ./tmp/binance/a04834e2-da09-4733-bdc7-3cebd0991a06/RUNEUSDT-trades-2021-07-01.zip
05:41:34 - [36mINFO [0m extract_utils.py download_historical_data:734          Saving the dataset into s3://cryptokaizen-data-test/v3/bulk/airflow/downloaded_1min/parquet/trades/spot/v1/binance/binance/v1_0_0

File Download: http

# Download by script - trades - spot - monthly - 2020

In [77]:
%%bash
/app/im_v2/common/data/extract/download_bulk.py \
    --download_mode 'bulk' \
    --downloading_entity 'airflow' \
    --action_tag 'downloaded_1min' \
    --vendor 'binance' \
    --start_timestamp '2020-02-01T00:00:00+00:00' \
    --end_timestamp '2020-03-01T00:00:00+00:00' \
    --exchange_id 'binance' \
    --universe 'v1' \
    --aws_profile 'ck' \
    --data_type 'trades' \
    --data_format 'parquet' \
    --contract_type 'spot' \
    --s3_path 's3://cryptokaizen-data-test/'

[0m[36mINFO[0m: > cmd='/app/im_v2/common/data/extract/download_bulk.py --download_mode bulk --downloading_entity airflow --action_tag downloaded_1min --vendor binance --start_timestamp 2020-02-01T00:00:00+00:00 --end_timestamp 2020-03-01T00:00:00+00:00 --exchange_id binance --universe v1 --aws_profile ck --data_type trades --data_format parquet --contract_type spot --s3_path s3://cryptokaizen-data-test/'
report_memory_usage=False report_cpu_usage=False
[36mINFO[0m: Saving log to file '/app/im_v2/common/data/extract/download_bulk.py.log'
10:58:53 - [36mINFO [0m dataset_schema_utils.py _get_dataset_schema_file_path:39 Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
10:58:53 - [36mINFO [0m dataset_schema_utils.py get_dataset_schema:74          Loaded dataset schema version v3
10:58:53 - [36mINFO [0m dataset_schema_utils.py _get_dataset_schema_file_path:39 Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_s

Traceback (most recent call last):
  File "/app/im_v2/common/data/extract/download_bulk.py", line 83, in <module>
    _main(_parse())
  File "/app/im_v2/common/data/extract/download_bulk.py", line 80, in _main
    _run(args)
  File "/app/im_v2/common/data/extract/download_bulk.py", line 76, in _run
    imvcdeexut.download_historical_data(args, exchange)
  File "/app/im_v2/common/data/extract/extract_utils.py", line 745, in download_historical_data
    hs3.dassert_path_not_exists(path_to_dataset, args["aws_profile"])
  File "/app/helpers/hs3.py", line 135, in dassert_path_not_exists
    hdbg.dassert(not s3fs_.exists(path), f"S3 path '{path}' already exist!")
  File "/app/helpers/hdbg.py", line 159, in dassert
  File "/app/helpers/hdbg.py", line 142, in _dfatal
    dfatal(dfatal_txt)
  File "/app/helpers/hdbg.py", line 71, in dfatal
    raise assertion_type(ret)
AssertionError: 
################################################################################
* Failed assertion *
cond=Fal

10:58:53 - [36mINFO [0m hcache.py clear_global_cache:292                       Before clear_global_cache: 'global mem' cache: path='/mnt/tmpfs/tmp.cache.mem', size=32.0 KB
10:58:53 - [41mWARN [0m hcache.py clear_global_cache:293                       Resetting 'global mem' cache '/mnt/tmpfs/tmp.cache.mem'
10:58:53 - [41mWARN [0m hcache.py clear_global_cache:303                       Destroying '/mnt/tmpfs/tmp.cache.mem' ...
10:58:53 - [36mINFO [0m hcache.py clear_global_cache:319                       After clear_global_cache: 'global mem' cache: path='/mnt/tmpfs/tmp.cache.mem', size=nan


CalledProcessError: Command 'b"/app/im_v2/common/data/extract/download_bulk.py \\\n    --download_mode 'bulk' \\\n    --downloading_entity 'airflow' \\\n    --action_tag 'downloaded_1min' \\\n    --vendor 'binance' \\\n    --start_timestamp '2020-02-01T00:00:00+00:00' \\\n    --end_timestamp '2020-03-01T00:00:00+00:00' \\\n    --exchange_id 'binance' \\\n    --universe 'v1' \\\n    --aws_profile 'ck' \\\n    --data_type 'trades' \\\n    --data_format 'parquet' \\\n    --contract_type 'spot' \\\n    --s3_path 's3://cryptokaizen-data-test/'\n"' returned non-zero exit status 1.

## Read and check the data

In [22]:
signature = (
    "bulk.airflow.downloaded_1min.parquet.trades.spot.v1.binance.binance.v1_0_0"
)
reader = imvcdcimrdc.RawDataReader(signature, stage="test")
start_timestamp = pd.Timestamp("2020-12-01T00:00:00+00:00")
end_timestamp = pd.Timestamp("2021-01-01T00:00:00+00:00")
currency_pairs = ["BTC_USDT"]
binance_trades = reader.read_data(
    start_timestamp,
    end_timestamp,
    currency_pairs=currency_pairs,
)
binance_trades.head()

INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3
INFO  Loading dataset schema file: /app/data_schema/dataset_schema_versions/dataset_schema_v3.json
INFO  Loaded dataset schema version v3


Unnamed: 0_level_0,timestamp,price,amount,side,exchange_id,knowledge_timestamp,currency_pair,year,month,day
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-12-01 00:00:00.040000+00:00,1606780800040,19695.87,0.061389,sell,binance,2023-04-05 13:57:18.553026+00:00,BTC_USDT,2020,12,1
2020-12-01 00:00:00.258000+00:00,1606780800258,19695.86,0.25,buy,binance,2023-04-05 13:57:18.553026+00:00,BTC_USDT,2020,12,1
2020-12-01 00:00:00.258000+00:00,1606780800258,19695.86,0.43,buy,binance,2023-04-05 13:57:18.553026+00:00,BTC_USDT,2020,12,1
2020-12-01 00:00:00.273000+00:00,1606780800273,19695.86,0.17,buy,binance,2023-04-05 13:57:18.553026+00:00,BTC_USDT,2020,12,1
2020-12-01 00:00:00.273000+00:00,1606780800273,19695.17,0.059377,buy,binance,2023-04-05 13:57:18.553026+00:00,BTC_USDT,2020,12,1


In [23]:
binance_trades.isna().any().any()

False

In [24]:
binance_trades.shape

(45342701, 10)

In [25]:
binance_trades.timestamp.agg(["min", "max"]).apply(pd.Timestamp, unit="ms")

min   2020-12-01 00:00:00.040
max   2020-12-31 23:59:58.615
Name: timestamp, dtype: datetime64[ns]