In [2]:
# Import standard libraries
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import time
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter, MonthLocator
import seaborn as sns
import warnings
import requests
import itertools

# Import data analysis and signal processing libraries
from scipy.stats import mstats
from scipy.signal import argrelextrema
from tqdm import tqdm, tnrange, notebook
import statsmodels.api as sm
from statsmodels.tsa.stattools import coint

# Import asynchronous libraries
import asyncio
import aiohttp

# Import machine learning libraries
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.metrics import (
    roc_auc_score, confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score
)
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, BatchNormalization
from keras import callbacks
import talib
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

# # Import Backtesting library
# from backtesting import Backtest, Strategy
# from backtesting.lib import crossover

# Apply seaborn's default theme to the figures drawn later in the notebook
sns.set()

# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings that may matter
# (e.g. deprecations) — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")




In [6]:
# Trading-pair symbols requested from Binance (every pair is against USDT).
# Each symbol becomes one column of the `prices` table built below.
coins = [
    "BTCUSDT",  # Bitcoin
    "XMRUSDT",  # Monero
    "AAVEUSDT", # Aave
    "DOTUSDT",  # Polkadot
    "CRVUSDT",  # Curve DAO Token
    "LTCUSDT",  # Litecoin
    "SOLUSDT",  # Solana
    "UNIUSDT",  # Uniswap
    "TRXUSDT",  # TRON
    "BNBUSDT",  # BNB
    "ADAUSDT",  # Cardano
    "EOSUSDT",  # EOS
    "DOGEUSDT", # Dogecoin
    "MATICUSDT", # Polygon
    "ETHUSDT"   # Ethereum
]

# Binance REST endpoint queried by fetch_binance_data for kline (candlestick) rows
base_url = "https://api.binance.com/api/v3/klines"
# Candle interval sent to the API as the `interval` parameter ('1d' = daily candles)
fix_interval = '1d'
# Length of the look-back window; the download loop subtracts it from "now"
time_lapse = relativedelta(years=1)

def fetch_binance_data(symbol, interval, start_time, end_time, limit=None, timeout=30):
    """Fetch one page of kline (candlestick) rows from the Binance REST API.

    Parameters
    ----------
    symbol : str
        Trading pair to query, e.g. "BTCUSDT".
    interval : str
        Kline interval, forwarded unchanged to the API (e.g. '1d', '1h').
    start_time, end_time : datetime-like
        Bounds of the requested window. Any object exposing ``.timestamp()``
        works; both are converted to epoch milliseconds.
    limit : int, optional
        Maximum number of rows to request. When None the parameter is not
        sent, so the API's own default page size applies.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    list
        The decoded JSON body of the response (one entry per kline).

    Raises
    ------
    requests.exceptions.HTTPError
        If the API answers with a 4xx/5xx status (e.g. unknown symbol or
        rate limiting) instead of silently returning the error payload.
    requests.exceptions.RequestException
        For connection problems and timeouts.
    """
    params = {
        'symbol': symbol,
        'interval': interval,
        'startTime': int(start_time.timestamp() * 1000),
        'endTime': int(end_time.timestamp() * 1000),
    }
    if limit is not None:
        params['limit'] = limit

    response = requests.get(base_url, params=params, timeout=timeout)
    # Fail loudly here: an error body would otherwise be mistaken for kline
    # rows and only surface later as an unrelated pandas error.
    response.raise_for_status()
    return response.json()

def binance_data_to_dataframe(data, symbol):
    """Build a DataFrame from a raw Binance klines payload.

    Parameters
    ----------
    data : list
        Kline rows as returned by the klines endpoint; each row holds the
        twelve fields named in ``kline_fields`` below, in that order.
    symbol : str
        Accepted for call-site symmetry; not used by the conversion.

    Returns
    -------
    pandas.DataFrame
        One row per kline. 'Open Time' and 'Close Time' are parsed into
        datetimes; every other field is left exactly as received.
    """
    kline_fields = [
        'Open Time', 'Open', 'High', 'Low', 'Close', 'Volume', 'Close Time',
        'Quote Asset Volume', 'Number of Trades', 'Taker Buy Base Asset Volume',
        'Taker Buy Quote Asset Volume', 'Ignore',
    ]
    frame = pd.DataFrame(data, columns=kline_fields)

    # Both timestamp fields are parsed as epoch milliseconds.
    for time_field in ('Open Time', 'Close Time'):
        frame[time_field] = pd.to_datetime(frame[time_field], unit='ms')

    return frame

# Download the close price of every coin and align the series on candle open time.
coin_series = []

for coin in coins:
    print(coin)
    interval = fix_interval
    # Use a timezone-aware UTC "now": the candle timestamps parsed from the API
    # are treated as UTC below, so the loop condition compares like with like.
    end_time = datetime.now(dt.timezone.utc)
    start_time = end_time - time_lapse
    pages = []
    skip_reason = None

    # Binance caps the number of candles returned per request (500 by default),
    # so page forward until the window is exhausted.
    while start_time < end_time:
        try:
            fetched_data = fetch_binance_data(coin, interval, start_time, end_time)
        except requests.exceptions.RequestException as exc:
            # Drop anything already fetched for this coin rather than keeping
            # a silently truncated series.
            skip_reason = f"request failed ({exc})"
            pages = []
            break

        df = binance_data_to_dataframe(fetched_data, coin)
        if df.empty:
            break

        pages.append(df)
        # Resume right after the last candle closed. Advancing by a whole hour
        # (as before) overshoots the next candle's open time and drops one
        # candle at every page boundary.
        last_close = df['Close Time'].iloc[-1]
        start_time = last_close.tz_localize('UTC') + timedelta(milliseconds=1)

    if not pages:
        # Without this guard an empty result crashes on set_index('Open Time').
        print(f"  skipping {coin}: {skip_reason or 'no data returned'}")
        continue

    data = pd.concat(pages).set_index('Open Time')
    coin_series.append(data['Close'].rename(coin))

# Concatenate once instead of growing `prices` inside the loop.
# rename_axis(None) keeps the index unnamed, as before, so the CSV layout is unchanged.
prices = pd.concat(coin_series, axis=1).rename_axis(None) if coin_series else pd.DataFrame()

prices.to_csv('./coin_data.csv', index=True)
prices = prices.astype(float)
prices.head()

BTCUSDT
XMRUSDT
AAVEUSDT
DOTUSDT
CRVUSDT
LTCUSDT
SOLUSDT
UNIUSDT
TRXUSDT
BNBUSDT
ADAUSDT
EOSUSDT
DOGEUSDT
MATICUSDT
ETHUSDT


Unnamed: 0,BTCUSDT,XMRUSDT,AAVEUSDT,DOTUSDT,CRVUSDT,LTCUSDT,SOLUSDT,UNIUSDT,TRXUSDT,BNBUSDT,ADAUSDT,EOSUSDT,DOGEUSDT,MATICUSDT,ETHUSDT
2022-12-03 00:00:00,16885.2,145.3,62.8,5.49,0.635,76.47,13.29,6.1,0.05358,290.4,0.3196,0.922,0.0997,0.9029,1240.51
2022-12-04 00:00:00,17105.7,142.2,64.4,5.59,0.67,77.07,13.71,6.27,0.05309,291.8,0.3221,0.93,0.10406,0.9212,1279.41
2022-12-05 00:00:00,16966.35,145.1,64.4,5.48,0.675,80.19,13.82,6.17,0.05334,288.4,0.3193,0.931,0.10149,0.9116,1259.41
2022-12-06 00:00:00,17088.96,143.4,64.0,5.5,0.679,79.88,14.3,6.18,0.05364,290.2,0.3183,0.958,0.10019,0.9176,1271.32
2022-12-07 00:00:00,16836.64,143.9,61.4,5.29,0.652,76.89,13.49,6.0,0.05312,284.2,0.3102,0.989,0.09575,0.8973,1231.18


# EDA and data cleaning

In [4]:
# Summarise, per coin, the first and last dates that have a price and the span between them.
coverage_rows = []

for coin in notebook.tqdm(prices.columns):
    # Dates on which this coin actually has a (non-missing) price
    observed_dates = prices[coin].dropna().index
    first_date, last_date = observed_dates.min(), observed_dates.max()
    coverage_rows.append({
        'coin': coin,
        'mindate': first_date,
        'maxdate': last_date,
        'days': last_date - first_date,
    })

data_info = pd.DataFrame(coverage_rows).sort_values('maxdate')
data_info

  0%|          | 0/15 [00:00<?, ?it/s]

Unnamed: 0,coin,mindate,maxdate,days
0,BTCUSDT,2022-12-03,2023-12-02,364 days
1,XMRUSDT,2022-12-03,2023-12-02,364 days
2,AAVEUSDT,2022-12-03,2023-12-02,364 days
3,DOTUSDT,2022-12-03,2023-12-02,364 days
4,CRVUSDT,2022-12-03,2023-12-02,364 days
5,LTCUSDT,2022-12-03,2023-12-02,364 days
6,SOLUSDT,2022-12-03,2023-12-02,364 days
7,UNIUSDT,2022-12-03,2023-12-02,364 days
8,TRXUSDT,2022-12-03,2023-12-02,364 days
9,BNBUSDT,2022-12-03,2023-12-02,364 days


In [5]:
# Sanity check: evaluates to True if any cell of `prices` is missing (NaN)
prices.isnull().any().any()

False