In [1]:
import ccxt
import pandas as pd
from datetime import datetime, timezone
import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given, so this also hides deprecation notices from libraries).
warnings.filterwarnings('ignore')

In [7]:
# Binance client from ccxt, created without any API credentials
exchange = ccxt.binance()

# Download settings
symbol = 'BNB/USDT'
timeframe = '1h'  # hourly candles
start_date = '2017-01-01'  # Binance's data starts in 2017
end_date = '2024-12-31'

# Turn both calendar dates (taken at midnight UTC) into the timestamps
# that exchange.parse8601 produces
start_timestamp, end_timestamp = (
    exchange.parse8601(f'{day}T00:00:00Z') for day in (start_date, end_date)
)

# Fetch historical data page by page (at most 1000 candles per request),
# moving `since` to just after the last candle received each time.
all_data = []
while start_timestamp < end_timestamp:
    print(f"Fetching data from {datetime.fromtimestamp(start_timestamp / 1000, tz=timezone.utc)}...")
    ohlcv = exchange.fetch_ohlcv(symbol, timeframe, since=start_timestamp, limit=1000)
    if not ohlcv:
        # Nothing more available from the exchange
        break
    # end_timestamp is only checked between requests, so the last page can
    # run past it; keep only candles that open before the requested end.
    all_data += [candle for candle in ohlcv if candle[0] < end_timestamp]
    # Each candle's first field is its open time in milliseconds
    start_timestamp = ohlcv[-1][0] + 1

# Assemble the candles into a frame indexed by candle open time
columns = ['timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']
data = (
    pd.DataFrame(all_data, columns=columns)
    # millisecond epoch values -> pandas datetimes
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], unit='ms'))
    .set_index('timestamp')
)

Fetching data from 2017-01-01 00:00:00+00:00...
Fetching data from 2017-12-17 18:00:00.001000+00:00...
Fetching data from 2018-01-28 11:00:00.001000+00:00...
Fetching data from 2018-03-12 12:00:00.001000+00:00...
Fetching data from 2018-04-23 04:00:00.001000+00:00...
Fetching data from 2018-06-03 20:00:00.001000+00:00...
Fetching data from 2018-07-16 06:00:00.001000+00:00...
Fetching data from 2018-08-26 22:00:00.001000+00:00...
Fetching data from 2018-10-07 14:00:00.001000+00:00...
Fetching data from 2018-11-18 16:00:00.001000+00:00...
Fetching data from 2018-12-30 08:00:00.001000+00:00...
Fetching data from 2019-02-10 00:00:00.001000+00:00...
Fetching data from 2019-03-23 22:00:00.001000+00:00...
Fetching data from 2019-05-04 14:00:00.001000+00:00...
Fetching data from 2019-06-15 16:00:00.001000+00:00...
Fetching data from 2019-07-27 08:00:00.001000+00:00...
Fetching data from 2019-09-07 08:00:00.001000+00:00...
Fetching data from 2019-10-19 00:00:00.001000+00:00...
Fetching data fro

In [8]:
# Preview the first five rows of the downloaded candles
display(data.head(n=5))

Unnamed: 0_level_0,Open,High,Low,Close,Volume
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-11-06 03:00:00,1.5,1.799,0.5,1.7,649.12
2017-11-06 04:00:00,1.3,1.65,1.3,1.6479,8147.72
2017-11-06 05:00:00,1.5457,1.5525,1.5455,1.5458,6628.2
2017-11-06 06:00:00,1.5458,1.681,1.5387,1.681,22767.9
2017-11-06 07:00:00,1.6809,1.6809,1.6,1.625,14938.73


In [9]:
# Check for missing values and drop any incomplete rows
if data.isnull().values.any():
    print("There are missing values, baby. Cleaning them...")
    # Rebind to the cleaned frame rather than mutating the existing object in place
    data = data.dropna()

# Check data consistency.
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 62210 entries, 2017-11-06 03:00:00 to 2024-12-16 06:00:00
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    62210 non-null  float64
 1   High    62210 non-null  float64
 2   Low     62210 non-null  float64
 3   Close   62210 non-null  float64
 4   Volume  62210 non-null  float64
dtypes: float64(5)
memory usage: 2.8 MB
None


In [10]:
# Boundary dates for the in-sample / out-of-sample partition
in_sample_end = '2022-12-31'
out_of_sample_start = '2023-01-01'

# Label-based slices on the datetime index; a date-only string bound
# covers that whole day (see the 23:00 row in the in-sample tail)
in_sample_data = data.loc[:in_sample_end]
out_of_sample_data = data.loc[out_of_sample_start:]

# Show the first and last three rows of each partition
print(
    "In-sample data:",
    in_sample_data.head(3),
    in_sample_data.tail(3),
    '------------',
    "Out-of-sample data:",
    out_of_sample_data.head(3),
    out_of_sample_data.tail(3),
    sep="\n",
)

In-sample data:
                       Open    High     Low   Close   Volume
timestamp                                                   
2017-11-06 03:00:00  1.5000  1.7990  0.5000  1.7000   649.12
2017-11-06 04:00:00  1.3000  1.6500  1.3000  1.6479  8147.72
2017-11-06 05:00:00  1.5457  1.5525  1.5455  1.5458  6628.20
                      Open   High    Low  Close    Volume
timestamp                                                
2022-12-31 21:00:00  247.0  247.5  246.9  247.0  5611.203
2022-12-31 22:00:00  247.0  247.3  245.9  246.3  5626.283
2022-12-31 23:00:00  246.3  246.9  246.0  246.3  4340.382
------------
Out-of-sample data:
                      Open   High    Low  Close    Volume
timestamp                                                
2023-01-01 00:00:00  246.3  246.7  245.5  245.7  4233.967
2023-01-01 01:00:00  245.8  246.0  245.5  245.8  2424.087
2023-01-01 02:00:00  245.8  245.9  245.2  245.3  3971.279
                       Open    High     Low   Close    Volume
time

In [11]:
# Base asset is the part of the trading pair before the slash (e.g. 'BNB' from 'BNB/USDT')
base_symbol = symbol.partition('/')[0]

# Write each partition to its own CSV, named after the base asset
in_sample_data.to_csv(base_symbol + '_in_sample_data.csv')
out_of_sample_data.to_csv(base_symbol + '_out_of_sample_data.csv')