# Data Sources

Get data 

## import

In [1]:
import os
import sys
import time
import datetime
import pandas as pd
import numpy as np
import requests
#import yfinance as yf
import ccxt

## definitions

In [2]:
#yf_eth = yf.Ticker("ETH-USD")
#yf_btc = yf.Ticker("BTC-USD")

ETH_TICK = "ETH/USDT"
BTC_TICK = "BTC/USDT"
FIRST_DATE = "2022-01-01 00:00:00"
LAST_DATE =  "2023-01-01 00:00:00"
TIMEFRAME = "1h"

## Download data

In [3]:
# params
ticker = ETH_TICK
from_datetime = FIRST_DATE
to_datetime = LAST_DATE
exchange = ccxt.binance({
    'rateLimit': 1000,
    'enableRateLimit': True,
})


# def
def req_ohlcv(
        exchange,
        ticker,
        timeframe,
        from_datetime,  
        to_datetime=None
    ):
    
    # common constants
    msec = 1000
    minute = 60 * msec
    hour = 60 * minute
    hold = 30

    from_timestamp = exchange.parse8601(from_datetime)

    if to_datetime is None:
        to_timestamp = exchange.milliseconds()
    else:
        to_timestamp = exchange.parse8601(to_datetime)
        

    data = []

    while from_timestamp < to_timestamp:

        try:

            print(exchange.milliseconds(), 'Fetching candles starting from', exchange.iso8601(from_timestamp))
            ohlcvs = exchange.fetch_ohlcv(ticker, '1h', from_timestamp)
            print(exchange.milliseconds(), 'Fetched', len(ohlcvs), 'candles')
            first = ohlcvs[0][0]
            last = ohlcvs[-1][0]
            print('First candle epoch', first, exchange.iso8601(first))
            print('Last candle epoch', last, exchange.iso8601(last))
            from_timestamp += len(ohlcvs) * hour
            data += ohlcvs

        except (ccxt.ExchangeError, ccxt.AuthenticationError, ccxt.ExchangeNotAvailable, ccxt.RequestTimeout) as error:

            print('Got an error', type(error).__name__, error.args, ', retrying in', hold, 'seconds...')
            time.sleep(hold)
    
    df = pd.DataFrame(data)
    df.columns = (["Time", "Open", "High", "Low", "Close", "Volume"])

    df['date'] = pd.to_datetime(df['Time'], unit='ms')

    return df


df_eth = req_ohlcv(
        exchange,
        ETH_TICK,
        TIMEFRAME,
        from_datetime,  
        to_datetime,
    )
    


1675879112817 Fetching candles starting from 2022-01-01T00:00:00.000Z
1675879116213 Fetched 500 candles
First candle epoch 1640995200000 2022-01-01T00:00:00.000Z
Last candle epoch 1642791600000 2022-01-21T19:00:00.000Z
1675879116213 Fetching candles starting from 2022-01-21T20:00:00.000Z
1675879117442 Fetched 500 candles
First candle epoch 1642795200000 2022-01-21T20:00:00.000Z
Last candle epoch 1644591600000 2022-02-11T15:00:00.000Z
1675879117442 Fetching candles starting from 2022-02-11T16:00:00.000Z
1675879118435 Fetched 500 candles
First candle epoch 1644595200000 2022-02-11T16:00:00.000Z
Last candle epoch 1646391600000 2022-03-04T11:00:00.000Z
1675879118435 Fetching candles starting from 2022-03-04T12:00:00.000Z
1675879119233 Fetched 500 candles
First candle epoch 1646395200000 2022-03-04T12:00:00.000Z
Last candle epoch 1648191600000 2022-03-25T07:00:00.000Z
1675879119234 Fetching candles starting from 2022-03-25T08:00:00.000Z
1675879120444 Fetched 500 candles
First candle epoch 1

In [4]:
df_eth

Unnamed: 0,Time,Open,High,Low,Close,Volume,date
0,1640995200000,3676.22,3730.00,3676.22,3723.04,9023.3740,2022-01-01 00:00:00
1,1640998800000,3723.04,3748.45,3714.10,3724.89,8997.7569,2022-01-01 01:00:00
2,1641002400000,3724.88,3738.47,3722.23,3728.32,5028.4531,2022-01-01 02:00:00
3,1641006000000,3728.41,3737.00,3721.84,3723.96,3543.1853,2022-01-01 03:00:00
4,1641009600000,3723.96,3728.07,3707.24,3708.21,6117.2820,2022-01-01 04:00:00
...,...,...,...,...,...,...,...
8995,1673377200000,1341.08,1344.04,1338.50,1339.48,22563.0593,2023-01-10 19:00:00
8996,1673380800000,1339.48,1347.69,1335.08,1341.98,22371.3361,2023-01-10 20:00:00
8997,1673384400000,1341.97,1343.21,1338.76,1339.01,14307.3162,2023-01-10 21:00:00
8998,1673388000000,1339.00,1339.45,1334.45,1334.46,11972.0830,2023-01-10 22:00:00


## Tests

In [5]:
assert df_eth.shape[0] == df_eth.dropna().shape[0], "Some data are NaN !"
diff_time = np.diff(
    np.array( 
        (df_eth["date"] - pd.Timestamp("1970-01-01")) / pd.Timedelta('1h') 
    )
)
assert np.max(diff_time) == np.min(diff_time) 


In [6]:
np.argmin(diff_time) 

0

In [7]:
diff_time.shape

(8999,)

In [8]:
df_eth[7247:7254]

Unnamed: 0,Time,Open,High,Low,Close,Volume,date
7247,1667084400000,1611.0,1621.52,1607.06,1619.55,16677.1518,2022-10-29 23:00:00
7248,1667088000000,1619.55,1624.41,1606.17,1607.68,20302.3286,2022-10-30 00:00:00
7249,1667091600000,1607.68,1615.77,1605.74,1615.31,12404.448,2022-10-30 01:00:00
7250,1667095200000,1615.32,1619.67,1609.93,1616.67,10938.0959,2022-10-30 02:00:00
7251,1667098800000,1616.66,1624.4,1616.64,1622.18,13612.8197,2022-10-30 03:00:00
7252,1667102400000,1622.19,1624.85,1618.49,1618.5,9397.0836,2022-10-30 04:00:00
7253,1667106000000,1618.49,1627.45,1618.48,1625.39,15845.7737,2022-10-30 05:00:00


In [9]:
np.argmax(diff_time)

0

In [10]:
df_eth[2035:2045]

Unnamed: 0,Time,Open,High,Low,Close,Volume,date
2035,1648321200000,3125.44,3127.02,3116.11,3123.52,8190.3235,2022-03-26 19:00:00
2036,1648324800000,3123.53,3153.2,3121.45,3143.35,22509.0798,2022-03-26 20:00:00
2037,1648328400000,3143.35,3145.01,3125.97,3138.8,9843.4033,2022-03-26 21:00:00
2038,1648332000000,3138.8,3147.61,3134.28,3134.28,6189.2653,2022-03-26 22:00:00
2039,1648335600000,3134.28,3150.52,3133.58,3145.0,7100.3566,2022-03-26 23:00:00
2040,1648339200000,3145.01,3152.63,3137.29,3149.86,11856.3601,2022-03-27 00:00:00
2041,1648342800000,3149.86,3155.7,3142.09,3149.3,9366.3673,2022-03-27 01:00:00
2042,1648346400000,3149.29,3170.0,3143.67,3156.36,14360.1853,2022-03-27 02:00:00
2043,1648350000000,3156.35,3156.85,3138.4,3143.76,8296.0077,2022-03-27 03:00:00
2044,1648353600000,3143.77,3150.34,3133.82,3138.92,7559.899,2022-03-27 04:00:00
