In [3]:
!sudo /venv/bin/pip install pysftp



In [43]:
# Importing modules.
import gzip
import logging
import os
from io import BytesIO

import pandas as pd
import pysftp

import helpers.haws as haws
import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hprint as hprint
import helpers.hs3 as hs3

In [25]:
hdbg.init_logger(verbosity=logging.INFO)
log_level = logging.INFO

_LOG = logging.getLogger(__name__)

_LOG.info("%s", henv.get_system_signature()[0])

hprint.config_notebook()

# Configuration
config = {
    "stage": "test",
    "save_path_prefix": "sonaal/cryptocom/historical_bid_ask/",
    "hostname": "data.crypto.com",
    "username": "user005",
    "private_key_path": "/app/amp/cryptocom-privatekey.pem",
    # Download config.
    "currency_pair": "BTC_USDT",
    "date": "2023-10-01",
}

# Disable host key checking
cnopts = pysftp.CnOpts()
cnopts.hostkeys = None

INFO  # Git
  branch_name='CmampTask8618_Get_historical_bidask_data_from_cryptocom'
  hash='4d9345e9d'
  # Last commits:
    * 4d9345e9d sonaalKant checkpoint                                                        (  10 hours ago) Mon Jun 17 06:33:55 2024  (HEAD -> CmampTask8618_Get_historical_bidask_data_from_cryptocom, origin/CmampTask8618_Get_historical_bidask_data_from_cryptocom)
    * ccab990af Nina Lee CmTask8531_Trading_AirFlow_DAGs_clean_up (#8629)                  (    3 days ago) Fri Jun 14 19:54:13 2024  (origin/master, origin/HEAD)
    * c24fda767 Vedanshu Joshi CmTask8440 Update docs for shadow trading DAGs (#8582)            (    3 days ago) Fri Jun 14 18:38:32 2024           
# Machine info
  system=Linux
  node name=905f8b01ec55
  release=5.15.0-1056-aws
  version=#61~20.04.1-Ubuntu SMP Wed Mar 13 17:40:41 UTC 2024
  machine=x86_64
  processor=x86_64
  cpu count=8
  cpu freq=scpufreq(current=2499.998, min=0.0, max=0.0)
  memory=svmem(total=33280270336, available=2295760

In [27]:
def sftp_to_s3(sftp, remote_dir, s3_bucket, s3_prefix):
    """
    Download data from sftp server and upload to S3 bucket.
    """
    for item in sftp.listdir_attr(remote_dir):
        remote_path = f"{remote_dir}/{item.filename}"
        s3_key = f"{s3_prefix}/{item.filename}" if s3_prefix else item.filename
        with sftp.open(remote_path) as file_obj:
            file_data = file_obj.read()
            s3_client.upload_fileobj(BytesIO(file_data), s3_bucket, s3_key)
            _LOG.info(f"Uploaded: {remote_path} to s3://{s3_bucket}/{s3_key}")

In [77]:
def load_data(
    s3: boto3.client,
    s3_path: str,
    s3_bucket: str,
) -> list:
    """
    Loads and performs a QA check on files in a specified S3 bucket and path.

    This function lists all objects in a given S3 path, downloads each
    file, checks for empty files, reads and parses the content of
    gzipped JSON files, and collects the data into a list of pandas
    DataFrames.

    :param s3: an S3 client object from boto3.
    :param s3_path: S3 path (prefix) to list and load files from.
    :param s3_bucket: name of the S3 bucket.
    :return: list of pandas DataFrames containing the data from the
        files.
    """
    dataframes = []
    # List all objects in the specified S3 bucket and path
    files = haws.list_all_objects(s3, s3_bucket, s3_path)
    for file in files:
        # Check for empty files
        if file["Size"] == 0:
            _LOG.info("Found empty file %s", file["Key"])
            continue
        # Download the file from S3 to a local temporary path
        local_dst_path = "tmp.data.gz"
        s3_file_path = file["Key"]
        s3.download_file(s3_bucket, s3_file_path, local_dst_path)
        # Read and parse the gzipped JSON file
        with gzip.open(local_dst_path, "rt") as gz_file:
            file_content = gz_file.read()
            df = pd.read_json(file_content, lines=True)
        # Append the DataFrame to the list
        dataframes.append(df)
    return dataframes

In [72]:
hostname = config["hostname"]
username = config["username"]
private_key = config["private_key_path"]

s3_client = haws.get_service_client(aws_profile="ck", service_name="s3")
bucket_name = hs3.get_s3_bucket_from_stage(stage=config["stage"])
prefix = config["save_path_prefix"]

currency_pair = config["currency_pair"]
date = pd.to_datetime(config["date"])
year = date.year
month = date.month
day = date.day

sftp_data_path = (
    f"/exchange/book_l2_150_0010/{year}/{month}/{day}/cdc/{currency_pair}"
)

s3_save_path = os.path.join(prefix, currency_pair, config["date"])

# Establish the SFTP connection
with pysftp.Connection(
    hostname, username=username, private_key=private_key_path, cnopts=cnopts
) as sftp:
    print("Connection successfully established ...")
    # Start the recursive download from the remote root directory
    sftp_to_s3(sftp, sftp_data_path, bucket_name, s3_save_path)

print("All files have been downloaded successfully.")

data = load_data(s3, s3_save_path, bucket_name)

INFO  Connected (version 2.0, client AWS_SFTP_1.1)
INFO  Authentication (publickey) successful!
Connection successfully established ...


KeyboardInterrupt: 

In [78]:
data = load_data(s3_client, s3_save_path, bucket_name)

sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696118400003.gz


  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696118483737.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696118586375.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696118663726.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696118741974.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696118826888.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696118844019.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696118950529.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696119023792.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696119118424.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696119203723.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696119283662.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696119384096.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696119494142.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696119563747.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696119700300.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696119744166.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696119857380.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696119923721.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696120060049.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696120103918.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696120191609.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696120273205.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696120368288.gz


  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696120459602.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696120528767.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696120602168.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696120643734.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696120720255.gz


  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696120822723.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696120919301.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696120980749.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696121003955.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696121097399.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696121183847.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696121278247.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696121364031.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696121524313.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696121544149.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696121641396.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696121723722.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696121853842.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696121904090.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696122047791.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696122083883.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696122191703.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696122264011.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696122372123.gz


  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696122443702.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696122556043.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696122623743.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696122751024.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696122803865.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696122926900.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696122983747.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696123092041.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696123163761.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696123283960.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696123343777.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696123445031.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696123523795.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696123616561.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696123703610.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696123757443.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696123811168.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696123883757.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696123973147.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696124063911.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696124157379.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696124243918.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696124381250.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696124423734.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696124534244.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696124603771.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696124716749.gz


  df = pd.read_json(file_content, lines=True)
  df = pd.read_json(file_content, lines=True)


sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696124783724.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696124956540.gz
sonaal/cryptocom/historical_bid_ask/BTC_USDT/2023-10-01/1696125094131.gz


  df = pd.read_json(file_content, lines=True)


KeyboardInterrupt: 

In [76]:
data[0].head()

Unnamed: 0,S,s,t,p,a,b
0,BTC_USDT,1,1696118400003,1696118400007,"[[26962.18, 0.23306000000000002], [26964.73, 0...","[[26962.17, 0.00018], [26960.42, 0.04], [26960..."
1,BTC_USDT,1,1696118400034,1696118400036,"[[26962.18, 0.23306000000000002], [26964.73, 0...","[[26962.17, 0.00018], [26960.41, 0.08], [26960..."
2,BTC_USDT,1,1696118400098,1696118400100,"[[26962.18, 0.23901000000000003], [26964.73, 0...","[[26962.17, 0.03654], [26960.42, 0.04], [26960..."
3,BTC_USDT,1,1696118400199,1696118400201,"[[26962.18, 0.23306000000000002], [26964.73, 0...","[[26962.17, 0.03654], [26960.42, 0.04], [26960..."
4,BTC_USDT,1,1696118400255,1696118400256,"[[26962.18, 0.23306000000000002], [26962.82, 0...","[[26962.17, 0.03654], [26960.42, 0.04], [26960..."
