In [1]:
import pandas as pd
from dateutil import parser
from datetime import datetime as dt
import datetime
import time 

from binance.client import Client
import os
from dotenv import load_dotenv
load_dotenv("../../constants/.env")

True

In [2]:
# Binance credentials are read from the process environment; each value is None
# when the corresponding variable is not set.
api_key = os.getenv('BINANCE_KEY')
secret_key = os.getenv('BINANCE_SECRET')

In [3]:
# Shared API client used by every download helper in this notebook.
# NOTE(review): tld="com" presumably selects the binance.com endpoints -- confirm
# against the python-binance documentation.
client = Client(
    api_key=api_key,
    api_secret=secret_key,
    tld="com",
)

Valid Binance kline intervals: 1m, 3m, 5m, 15m, 30m, 1h, 2h, 4h, 6h, 8h, 12h, 1d, 3d, 1w, 1M (lowercase `m` = minutes, uppercase `M` = month).

In [10]:
# Number of candles each download window is sized to hold.
CANDLE_COUNT = 5000
# Minimum spacing, in seconds, that throttle() enforces between API requests.
THROTTLE_TIME = 0.1
# Pause, in seconds, used by collect_data() once its download loop has finished.
SLEEP = 0.1

# Width of one download window, in minutes, for each supported granularity:
# (minutes per candle) * CANDLE_COUNT.
INCREMENTS = {
    # Legacy upper-case aliases, kept so existing callers keep working.
    'M1': 1 * CANDLE_COUNT,
    'H1': 60 * CANDLE_COUNT,
    'H4': 240 * CANDLE_COUNT,
    'D1': 1440 * CANDLE_COUNT,
    # Binance interval strings. '1M' (one month) is intentionally absent because a
    # calendar month has no fixed length in minutes.
    '1m': 1 * CANDLE_COUNT,
    '3m': 3 * CANDLE_COUNT,
    '5m': 5 * CANDLE_COUNT,
    '15m': 15 * CANDLE_COUNT,
    '30m': 30 * CANDLE_COUNT,
    '1h': 60 * CANDLE_COUNT,
    '2h': 120 * CANDLE_COUNT,
    '4h': 240 * CANDLE_COUNT,
    '6h': 360 * CANDLE_COUNT,
    '8h': 480 * CANDLE_COUNT,
    '12h': 720 * CANDLE_COUNT,
    '1d': 1440 * CANDLE_COUNT,
    '3d': 3 * 1440 * CANDLE_COUNT,
    '1w': 7 * 1440 * CANDLE_COUNT,
}

In [11]:
# Wall-clock time of the most recent throttled request; updated by throttle().
last_req_time = dt.now()


def throttle():
    """Block until at least THROTTLE_TIME seconds have passed since the last call.

    Sleeps for whatever is left of the THROTTLE_TIME interval, then records the
    current time in the module-level ``last_req_time``.
    """
    global last_req_time
    elapsed = (dt.now() - last_req_time).total_seconds()
    remaining = THROTTLE_TIME - elapsed
    if remaining > 0:
        time.sleep(remaining)
    last_req_time = dt.now()

In [12]:
def get_history(symbol, interval, start, end, api):
    """Download klines for one symbol over one time window as an OHLCV frame.

    Args:
        symbol: Trading pair, forwarded to ``api.get_historical_klines``.
        interval: Kline interval string, forwarded unchanged.
        start: Window start, forwarded as ``start_str``.
        end: Window end, forwarded as ``end_str``.
        api: Object exposing ``get_historical_klines`` (the notebook passes a
            binance ``Client``).

    Returns:
        A DataFrame with columns ``Time, Open, High, Low, Close, Volume``.
        ``Time`` is the first kline field converted from milliseconds to a
        datetime; the other columns are numeric, with unparsable values coerced
        to NaN. When the API returns no klines the frame is empty but still has
        these columns.
    """
    ohlcv_columns = ["Time", "Open", "High", "Low", "Close", "Volume"]

    throttle()

    bars = api.get_historical_klines(symbol = symbol, interval = interval,
                                        start_str = start, end_str = end, limit = 1000)
    if not bars:
        # pd.DataFrame([]) has no columns, so the positional lookup and the column
        # rename below would raise. Return an empty, correctly shaped frame so
        # callers' `.empty` checks work as intended.
        return pd.DataFrame(columns=ohlcv_columns)

    df = pd.DataFrame(bars)
    # Column 0 is the kline open time in milliseconds; keep a datetime copy of it.
    df["Date"] = pd.to_datetime(df.iloc[:,0], unit = "ms")
    # Twelve raw kline fields plus the datetime column appended above (named "Time").
    df.columns = ["Open Time", "Open", "High", "Low", "Close", "Volume",
                  "Clos Time", "Quote Asset Volume", "Number of Trades",
                  "Taker Buy Base Asset Volume", "Taker Buy Quote Asset Volume", "Ignore", "Time"]
    df = df[ohlcv_columns].copy()
    for column in ohlcv_columns[1:]:
        df[column] = pd.to_numeric(df[column], errors = "coerce")

    return df

In [13]:
def save_file(final_df: pd.DataFrame, file_prefix, granularity, pair):
    """Deduplicate, sort and pickle a candle frame, then print a one-line summary.

    The frame is written to ``f"{file_prefix}{pair}_{granularity}.pkl"``. Rows are
    deduplicated on ``Time`` (first occurrence kept), sorted by ``Time`` and
    re-indexed from 0. The caller's DataFrame is left untouched.

    Args:
        final_df: Candle data containing at least a ``Time`` column.
        file_prefix: String prepended to the file name (for example a directory
            path ending in a separator).
        granularity: Interval label used in the file name and the summary line.
        pair: Symbol used in the file name and the summary line.
    """
    filename = f"{file_prefix}{pair}_{granularity}.pkl"

    # Rebind the local name to a cleaned copy instead of using inplace=True, so the
    # object passed in by the caller is not modified as a side effect.
    final_df = (
        final_df
        .drop_duplicates(subset=['Time'])
        .sort_values(by='Time')
        .reset_index(drop=True)
    )
    final_df.to_pickle(filename)

    print(f"**** {pair} {granularity}, {final_df.Time.min()} {final_df.Time.max()} --> {final_df.shape}")


def fetch_candles(pair, granularity, date_f: str, date_t: str, api: Client):
    """Fetch candles for one window via ``get_history``, retrying up to three times.

    An attempt is retried when ``get_history`` raises or returns ``None``. If the
    final attempt raises, that exception propagates to the caller, so persistent
    failures are not silently turned into "no data".

    Args:
        pair: Symbol forwarded to ``get_history``.
        granularity: Interval string forwarded to ``get_history``.
        date_f: Window start, forwarded as ``start``.
        date_t: Window end, forwarded as ``end``.
        api: Client object forwarded to ``get_history``.

    Returns:
        The candle DataFrame, or ``None`` when no attempt produced a non-empty frame.
    """
    max_attempts = 3
    candles_df = None

    for attempt in range(1, max_attempts + 1):
        try:
            candles_df = get_history(
                symbol=pair,
                interval=granularity,
                start=date_f,
                end=date_t,
                api=api
            )
        except Exception as exc:
            # get_history signals failure by raising, so without this handler the
            # retry budget would never be used.
            if attempt == max_attempts:
                raise
            print(f"{pair} {granularity}, {date_f} {date_t} --> attempt {attempt} failed ({exc!r}), retrying")
            continue

        if candles_df is not None:
            break

    if candles_df is None or candles_df.empty:
        return None
    return candles_df

In [14]:
def collect_data(pair, granularity, date_f, date_t, file_prefix, api: Client):
    """Download candles for one pair between two dates, window by window, and save them.

    The date range is walked in windows of ``INCREMENTS[granularity]`` minutes. Each
    window is requested through ``fetch_candles``; non-empty results are collected
    and, once the loop ends, concatenated and passed to ``save_file``. If no window
    produced candles, nothing is saved and a message is printed instead.

    Args:
        pair: Symbol forwarded to ``fetch_candles`` and ``save_file``.
        granularity: Interval string; must be a key of ``INCREMENTS``.
        date_f: Start of the range, parsed with ``dateutil.parser.parse``.
        date_t: End of the range, parsed with ``dateutil.parser.parse``.
        file_prefix: Forwarded to ``save_file``.
        api: Client object forwarded to ``fetch_candles``.
    """
    window = datetime.timedelta(minutes=INCREMENTS[granularity])

    from_date = parser.parse(date_f)
    end_date = parser.parse(date_t)

    collected = []
    stamp_format = "%Y-%m-%d %H:%M:%S"

    to_date = from_date
    while to_date < end_date:

        # End of this window, clipped so it never runs past the requested range.
        to_date = min(from_date + window, end_date)

        from_date_str = from_date.strftime(stamp_format)
        to_date_str = to_date.strftime(stamp_format)
        candles = fetch_candles(pair, granularity, from_date_str, to_date_str, api)

        if candles is None or candles.empty:
            print(f"{pair} {granularity}, {from_date} {to_date} --> NO CANDLES")
            from_date = to_date
            continue

        print(f"{pair} {granularity}, {from_date} {to_date_str}  | {candles.Time.min()} {candles.Time.max()} --> {candles.shape[0]} candles")
        collected.append(candles)

        # The next window starts at whichever is later: the newest candle received
        # or the end of the window just requested.
        newest = candles.Time.max()
        from_date = newest if newest > to_date else to_date

    # Single pause after the download loop has finished.
    time.sleep(SLEEP)

    if collected:
        save_file(pd.concat(collected), file_prefix, granularity, pair)
    else:
        print(f"{pair} {granularity}, {from_date} {to_date} --> NO DATA SAVED")



In [17]:
# Larger symbol universe, kept for reference; swap it in to download more pairs.
# pairs = ['DOTUSDT','DOGEUSDT','ADAUSDT','XRPUSDT','TRXUSDT','XLMUSDT','APTUSDT','VIDTUSDT','BTCUSDT',
#          'ETHUSDT','AVAXUSDT','BNBUSDT','LTCUSDT','LINKUSDT',
#          'BCHUSDT','EOSUSDT','ETCUSDT','DASHUSDT','ALGOUSDT','SANDUSDT',
#          'ARUSDT', 'INJUSDT', 'MASKUSDT', 'CELOUSDT', 'BANDUSDT', 'ALPHAUSDT', 'PERPUSDT',
#          'ACHUSDT', 'CHRUSDT', 'BELUSDT']

pairs = ['DOGEUSDT', 'XRPUSDT', 'ETHUSDT', 'BNBUSDT']

# Download hourly candles for each pair and write one pickle per pair into the
# current directory.
for p in pairs:
    print(p)
    collect_data(
        pair=p,
        granularity='1h',
        date_f="2020-01-01 00:00:00",
        date_t="2024-11-01 00:00:00",
        file_prefix="./",
        api=client,
    )

ETHUSDT
ETHUSDT 1h, 2020-01-01 00:00:00 2020-07-27 08:00:00  | 2020-01-01 00:00:00 2020-07-27 08:00:00 --> 4989 candles
ETHUSDT 1h, 2020-07-27 08:00:00 2021-02-20 16:00:00  | 2020-07-27 08:00:00 2021-02-20 16:00:00 --> 4994 candles
ETHUSDT 1h, 2021-02-20 16:00:00 2021-09-17 00:00:00  | 2021-02-20 16:00:00 2021-09-17 00:00:00 --> 4991 candles
ETHUSDT 1h, 2021-09-17 00:00:00 2022-04-13 08:00:00  | 2021-09-17 00:00:00 2022-04-13 08:00:00 --> 4999 candles
ETHUSDT 1h, 2022-04-13 08:00:00 2022-11-07 16:00:00  | 2022-04-13 08:00:00 2022-11-07 15:00:00 --> 5000 candles
ETHUSDT 1h, 2022-11-07 16:00:00 2023-06-04 00:00:00  | 2022-11-07 16:00:00 2023-06-04 00:00:00 --> 5000 candles
ETHUSDT 1h, 2023-06-04 00:00:00 2023-12-29 08:00:00  | 2023-06-04 00:00:00 2023-12-29 07:00:00 --> 5000 candles
ETHUSDT 1h, 2023-12-29 08:00:00 2024-07-24 16:00:00  | 2023-12-29 08:00:00 2024-07-24 15:00:00 --> 5000 candles
ETHUSDT 1h, 2024-07-24 16:00:00 2024-11-01 00:00:00  | 2024-07-24 16:00:00 2024-11-01 00:00:00 -