In [47]:
import os
import time
from datetime import datetime, timezone

import numpy as np
import pandas as pd
import requests


# The API key is read from the CYBOTRADE_API_KEY environment variable so the secret
# never lives in the notebook. The key that used to be hard-coded here has been
# exposed and should be rotated.
API_KEY = os.environ.get("CYBOTRADE_API_KEY", "")
if not API_KEY:
    print("Warning: CYBOTRADE_API_KEY is not set; API requests will be rejected.")

# Root URL of the datasource API; endpoint paths are appended to it per request.
BASE_URL = 'https://api.datasource.cybotrade.rs'

# Convert the input date string (YYYY-MM-DD) to a UNIX timestamp in milliseconds for request params
def date_to_unix_ms(date_str):
    """Return midnight UTC of ``date_str`` as a UNIX timestamp in milliseconds.

    The date is interpreted as UTC explicitly. A naive ``datetime`` would be
    converted using the local timezone of the machine running the notebook,
    which shifts the requested window from one machine to another, while the
    timestamps in the responses are decoded as UTC by ``collect_btc_data``.

    Args:
        date_str: Calendar date formatted as ``YYYY-MM-DD``.

    Returns:
        Integer number of milliseconds since 1970-01-01T00:00:00Z.

    Raises:
        ValueError: If ``date_str`` is not a valid ``YYYY-MM-DD`` date.
    """
    dt = datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
    return int(dt.timestamp() * 1000)


In [48]:
def fetch_cryptoquant_data(endpoint, params=None, max_retries=3, timeout=30):
    """GET one CryptoQuant endpoint through the datasource API, retrying on failure.

    Args:
        endpoint: Path appended to ``{BASE_URL}/cryptoquant/``.
        params: Optional dict of query parameters; defaults to no parameters.
        max_retries: Maximum number of attempts before giving up.
        timeout: Seconds to wait for the server on each attempt, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body on HTTP 200. ``None`` when the server answers
        with a non-retryable status (anything other than 200/429) or when
        every attempt failed.
    """
    if params is None:
        params = {}

    url = f"{BASE_URL}/cryptoquant/{endpoint}"
    headers = {"X-API-Key": API_KEY}
    print(f"Making API request to: {url}")

    backoff_time = 5  # base delay in seconds, doubled after every failed attempt

    for attempt in range(1, max_retries + 1):
        is_last_attempt = attempt == max_retries
        wait_time = backoff_time * (2 ** (attempt - 1))  # Exponential backoff

        try:
            print(f"Making API Request to :{endpoint}")
            response = requests.get(url, headers=headers, params=params, timeout=timeout)

            if response.status_code == 200:
                print(f"Request succesful: {response.status_code}")
                return response.json()

            if response.status_code != 429:
                # Any other status is treated as permanent: report it and stop.
                print(f"Error{response.status_code} : {response.text}")
                return None

            if is_last_attempt:
                print("Rate Limit exceeded.")
            else:
                print(f"Rate Limit exceeded. Waiting {wait_time} seconds before retry...")
        except (requests.RequestException, ValueError) as e:
            # RequestException covers network/timeout errors; ValueError covers a
            # 200 response whose body is not valid JSON.
            print(f"Exception during API request: {str(e)}")

        # Sleeping after the final attempt would only delay the failure report.
        if not is_last_attempt:
            time.sleep(wait_time)

    print(f"Failed after {max_retries} attempts")
    return None


In [49]:
def collect_btc_data(start_date, end_date):
    """Download every configured BTC dataset for one date window.

    Args:
        start_date: Window start as ``YYYY-MM-DD``.
        end_date: Window end as ``YYYY-MM-DD``.

    Returns:
        Dict mapping dataset name (e.g. ``"price"``) to a DataFrame built from
        the ``"data"`` field of the response. When the response has a
        ``timestamp`` column, a ``datetime`` column is added next to it.
        Datasets whose request failed are left out of the dict.
    """
    # Query parameters shared by every request in this window.
    params = {
        "start_time": date_to_unix_ms(start_date),
        "end_time": date_to_unix_ms(end_date),
    }

    data_dict = {}

    data_types = [
        # Price Data
        {"name": "price", "endpoint": "btc/market-data/price-ohlcv", "params": {"window": "hour"}},

        # Exchange flows
        {"name": "exchange_inflow", "endpoint": "btc/exchange-flows/inflow", "params": {"exchange": "all_exchange", "window": "hour"}},
        {"name": "exchange_outflow", "endpoint": "btc/exchange-flows/outflow", "params": {"exchange": "all_exchange", "window": "hour"}},
        {"name": "exchange_netflow", "endpoint": "btc/exchange-flows/netflow", "params": {"exchange": "all_exchange", "window": "hour"}},

        # Coinbase premium index
        {"name": "coinbase-premium-index", "endpoint": "btc/market-data/coinbase-premium-index", "params": {"window": "hour"}},

        # # Futures data
        # {"name": "funding_rate", "endpoint": "derivatives/funding-rate", "params": {"asset": "btc", "exchange": "all"}},
        # {"name": "open_interest", "endpoint": "derivatives/open-interest", "params": {"asset": "btc", "exchange": "all"}}
    ]

    for data_type in data_types:
        print(f"\nCollecting {data_type['name']} data...")

        # Per-dataset parameters extend (and may override) the shared window.
        req_params = params.copy()
        if "params" in data_type:
            req_params.update(data_type["params"])

        response = fetch_cryptoquant_data(data_type["endpoint"], req_params)

        if response and "data" in response:
            df = pd.DataFrame(response["data"])

            if "timestamp" in df.columns:
                # `datetime` stays a regular column. The previous
                # `df.set_index("datetime")` call discarded its result and never
                # took effect, and the CSV reload and sort steps in this
                # notebook look for `datetime` among the columns.
                df["datetime"] = pd.to_datetime(df["timestamp"], unit="ms")

            # Store in dictionary
            data_dict[data_type["name"]] = df
            print(f"Successfully collected {len(df)} records for {data_type['name']}")
        else:
            print(f"Failed to collect {data_type['name']} data")

        # Pause after every request, including the last one: callers invoke this
        # function back-to-back for consecutive windows.
        time.sleep(2)

    return data_dict
            

In [50]:
# Function to save data to CSV files
def save_data_to_csv(data_dict, output_dir="data"):
    """Write each DataFrame in ``data_dict`` to ``<output_dir>/btc_<name>.csv``.

    Args:
        data_dict: Mapping of dataset name to DataFrame.
        output_dir: Target directory; created (with parents) when missing.
    """
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test and matches how the rest of the notebook creates
    # directories.
    os.makedirs(output_dir, exist_ok=True)

    for name, df in data_dict.items():
        file_path = os.path.join(output_dir, f"btc_{name}.csv")
        df.to_csv(file_path)
        print(f"Saved {name} data to {file_path}")

In [51]:
def merge_datasets(data_dict):
    """Outer-join the known datasets on ``datetime`` into one wide DataFrame.

    Only the columns listed for each dataset are kept, and they are renamed
    with a ``btc_`` prefix. A dataset is skipped (with a message) when its name
    is unknown or when ``datetime`` or any listed column is missing; this
    applies uniformly to every dataset, including ``price``.

    Rows sharing the same ``datetime`` within one dataset are collapsed to the
    last occurrence before merging. Consecutive collection windows share their
    boundary timestamp, and duplicated keys would otherwise multiply rows in
    the outer join.

    Args:
        data_dict: Mapping of dataset name to DataFrame. ``datetime`` may be a
            column or the (named) index.

    Returns:
        The merged DataFrame sorted by ``datetime``, or ``None`` when no
        dataset could be merged.
    """
    # Dataset name -> {source column: merged column}.
    column_specs = {
        "price": {
            "open": "btc_open",
            "high": "btc_high",
            "low": "btc_low",
            "close": "btc_close",
            "volume": "btc_volume",
        },
        "exchange_inflow": {"inflow_total": "btc_inflow_total"},
        "exchange_outflow": {"outflow_total": "btc_outflow_total"},
        "exchange_netflow": {"netflow_total": "btc_netflow_total"},
        "coinbase-premium-index": {
            "coinbase_premium_index": "btc_coinbase_premium_index",
            "coinbase_premium_gap": "btc_coinbase_premium_gap",
        },
    }

    merged_df = None

    for name, df in data_dict.items():
        rename_dict = column_specs.get(name)
        # reset_index() turns a `datetime` index back into a column.
        temp_df = df.reset_index()

        selected_cols = ["datetime", *(rename_dict or {})]
        if rename_dict is None or any(col not in temp_df.columns for col in selected_cols):
            # Skip datasets with unexpected structure
            print(f"Skipping {name} - unexpected column structure")
            continue

        temp_df = temp_df[selected_cols].rename(columns=rename_dict)
        temp_df["datetime"] = pd.to_datetime(temp_df["datetime"])
        temp_df = temp_df.drop_duplicates(subset="datetime", keep="last")

        # Merge with existing data
        if merged_df is None:
            merged_df = temp_df
        else:
            merged_df = pd.merge(merged_df, temp_df, on="datetime", how="outer")

    # Sort by datetime
    if merged_df is not None:
        merged_df = merged_df.sort_values("datetime")

    return merged_df

In [52]:
# Function to prepare features for HMM model
def prepare_hmm_features(merged_df):
    """Derive price indicators, a flow ratio and z-scores from the merged data.

    Window lengths count rows (periods) of ``merged_df``. The ``*_7d`` /
    ``*_14d`` style column names are kept for compatibility with files written
    by earlier runs, even though a row is whatever sampling interval the input
    uses rather than necessarily one day.

    Args:
        merged_df: DataFrame as produced by ``merge_datasets``, ordered by
            time. It is not modified.

    Returns:
        A new DataFrame with the feature columns added, one ``<col>_zscore``
        column per numeric column, and every row containing a NaN removed
        (for example the warm-up rows of the rolling windows).
    """
    # Create a copy to avoid modifying the original
    df = merged_df.copy()

    # Calculate returns if price data is available
    if "btc_close" in df.columns:
        close = df["btc_close"]

        # Period-over-period simple and log returns
        df["returns"] = close.pct_change()
        df["log_returns"] = np.log(close / close.shift(1))

        # Volatility (rolling 14-period standard deviation of returns)
        df["volatility_14d"] = df["returns"].rolling(window=14).std()

        # Moving averages
        df["ma_7d"] = close.rolling(window=7).mean()
        df["ma_30d"] = close.rolling(window=30).mean()
        df["ma_90d"] = close.rolling(window=90).mean()

        # MACD
        df["ema_12d"] = close.ewm(span=12, adjust=False).mean()
        df["ema_26d"] = close.ewm(span=26, adjust=False).mean()
        df["macd"] = df["ema_12d"] - df["ema_26d"]
        df["macd_signal"] = df["macd"].ewm(span=9, adjust=False).mean()
        df["macd_hist"] = df["macd"] - df["macd_signal"]

        # RSI based on simple 14-period rolling means of gains and losses
        delta = close.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
        rs = gain / loss
        df["rsi_14d"] = 100 - (100 / (1 + rs))

    # Exchange flow ratio. `merge_datasets` names the flow columns
    # btc_inflow_total / btc_outflow_total; the older btc_exchange_* names are
    # still accepted so previously prepared frames keep working.
    flow_column_pairs = (
        ("btc_inflow_total", "btc_outflow_total"),
        ("btc_exchange_inflow", "btc_exchange_outflow"),
    )
    for inflow_col, outflow_col in flow_column_pairs:
        if inflow_col in df.columns and outflow_col in df.columns:
            # A zero outflow becomes NaN instead of producing an infinite ratio.
            df["exchange_flow_ratio"] = df[inflow_col] / df[outflow_col].replace(0, np.nan)
            break

    # Calculate z-scores for normalization (mean/std over the whole frame)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        z_col = f"{col}_zscore"
        centered = df[col] - df[col].mean()
        std = df[col].std()
        if std == 0:
            # Constant column: use 0 rather than 0/0 = NaN, which would make
            # the dropna() below discard every row.
            df[z_col] = centered * 0.0
        else:
            df[z_col] = centered / std

    # Drop rows with NaN values (from rolling calculations). dropna() returns a
    # new frame, so the result has to be kept.
    df = df.dropna()

    return df

In [37]:
# Monthly collection windows for 2020. Each window ends on the date the next one
# starts, so consecutive windows share their boundary timestamp.
training_periods = [
    # 2020
    ("2020-01-01", "2020-02-01"),
    ("2020-02-01", "2020-03-01"),
    ("2020-03-01", "2020-04-01"),
    ("2020-04-01", "2020-05-01"),
    ("2020-05-01", "2020-06-01"),
    ("2020-06-01", "2020-07-01"),
    ("2020-07-01", "2020-08-01"),
    ("2020-08-01", "2020-09-01"),
    ("2020-09-01", "2020-10-01"),
    ("2020-10-01", "2020-11-01"),
    ("2020-11-01", "2020-12-01"),
    ("2020-12-01", "2021-01-01"),
]

# testing_period = ("2023-01-01", "2025-04-07")

os.makedirs("data", exist_ok=True)
os.makedirs("figures", exist_ok=True)

# Gather the frames of every window first and concatenate once per dataset,
# instead of re-copying the accumulated frame on every iteration.
period_frames = {}

for start_date, end_date in training_periods:
    print(f"\n=== Collecting data for period {start_date} to {end_date} ===")
    period_data = collect_btc_data(start_date, end_date)

    for key, df in period_data.items():
        period_frames.setdefault(key, []).append(df)

training_data = {key: pd.concat(frames) for key, frames in period_frames.items()}

# Save training data
save_data_to_csv(training_data, "data/training")

# Merge training datasets
print("\n=== Merging training datasets ===")
merged_training = merge_datasets(training_data)

# merge_datasets returns None when nothing could be merged (e.g. every request
# failed); stop with a message instead of failing on None.to_csv.
if merged_training is None:
    print("No datasets could be merged - skipping feature preparation")
else:
    merged_training.to_csv("data/merged_training_data_year2.csv")
    print(f"Merged training data shape: {merged_training.shape}")

    # Prepare features for HMM
    print("\n=== Preparing features for HMM model ===")
    hmm_features = prepare_hmm_features(merged_training)
    hmm_features.to_csv("data/hmm_features_training_year2.csv")
    print(f"HMM features shape: {hmm_features.shape}")


=== Collecting data for period 2020-01-01 to 2020-02-01 ===

Collecting coinbase-premium-index data...
Making API request to: https://api.datasource.cybotrade.rs/cryptoquant/btc/market-data/coinbase-premium-index
Making API Request to :btc/market-data/coinbase-premium-index
Request succesful: 200
Successfully collected 743 records for coinbase-premium-index

=== Collecting data for period 2020-02-01 to 2020-03-01 ===

Collecting coinbase-premium-index data...
Making API request to: https://api.datasource.cybotrade.rs/cryptoquant/btc/market-data/coinbase-premium-index
Making API Request to :btc/market-data/coinbase-premium-index
Request succesful: 200
Successfully collected 690 records for coinbase-premium-index

=== Collecting data for period 2020-03-01 to 2020-04-01 ===

Collecting coinbase-premium-index data...
Making API request to: https://api.datasource.cybotrade.rs/cryptoquant/btc/market-data/coinbase-premium-index
Making API Request to :btc/market-data/coinbase-premium-index
Re

In [None]:
# Monthly collection windows for 2021.
# Each window ends on the first day of the following month, like the other
# period lists in this notebook. The previous end dates (last day of the month)
# left the final day of every month uncollected, and "2021-02-29" is not a
# valid date (2021 is not a leap year), so parsing it raised ValueError.
# NOTE: the "_2024" / "_2023" suffixes in the variable names and output paths
# below are historical; these variables hold the 2021 data and the data
# previously saved under data/training respectively.
training_periods_2024 = [
    ("2021-01-01", "2021-02-01"),
    ("2021-02-01", "2021-03-01"),
    ("2021-03-01", "2021-04-01"),
    ("2021-04-01", "2021-05-01"),
    ("2021-05-01", "2021-06-01"),
    ("2021-06-01", "2021-07-01"),
    ("2021-07-01", "2021-08-01"),
    ("2021-08-01", "2021-09-01"),
    ("2021-09-01", "2021-10-01"),
    ("2021-10-01", "2021-11-01"),
    ("2021-11-01", "2021-12-01"),
    ("2021-12-01", "2022-01-01"),
]

# Collect the 2021 data: gather every window, then concatenate once per dataset.
period_frames_2024 = {}

for start_date, end_date in training_periods_2024:
    print(f"\n=== Collecting data for period {start_date} to {end_date} ===")
    period_data = collect_btc_data(start_date, end_date)

    for key, df in period_data.items():
        period_frames_2024.setdefault(key, []).append(df)

training_data_2024 = {key: pd.concat(frames) for key, frames in period_frames_2024.items()}

# Save the newly collected 2021 data
save_data_to_csv(training_data_2024, "data/training_2024")

# Load the data previously saved under data/training
training_data_2023 = {}
# Includes coinbase-premium-index, which collect_btc_data also downloads; a
# missing file is reported and skipped below.
data_types = ["price", "exchange_inflow", "exchange_outflow", "exchange_netflow", "coinbase-premium-index"]

for data_type in data_types:
    file_path = f"data/training/btc_{data_type}.csv"
    try:
        df = pd.read_csv(file_path, index_col=0)
    except FileNotFoundError:
        print(f"Could not find {file_path}")
        continue

    # CSV stores datetimes as text; restore the dtype so both sources agree.
    if "datetime" in df.columns:
        df["datetime"] = pd.to_datetime(df["datetime"])
    training_data_2023[data_type] = df
    print(f"Loaded {data_type} data from {file_path}")

# Combine the previously saved data with the 2021 data, dataset by dataset.
# dict.fromkeys keeps a stable, first-seen key order (a set would not).
combined_data = {}
for key in dict.fromkeys([*training_data_2023, *training_data_2024]):
    frames = [source[key] for source in (training_data_2023, training_data_2024) if key in source]
    combined_data[key] = pd.concat(frames) if len(frames) > 1 else frames[0]

# Save combined data
save_data_to_csv(combined_data, "data/training_combined")

# Merge combined datasets
print("\n=== Merging combined datasets ===")
merged_combined = merge_datasets(combined_data)

# merge_datasets returns None when nothing could be merged.
if merged_combined is None:
    print("No datasets could be merged - skipping feature preparation")
else:
    merged_combined.to_csv("data/merged_training_data_2023_2024.csv")
    print(f"Merged combined data shape: {merged_combined.shape}")

    # Prepare features for HMM
    print("\n=== Preparing features for HMM model ===")
    hmm_features_combined = prepare_hmm_features(merged_combined)
    hmm_features_combined.to_csv("data/hmm_features_training_2023_2024.csv")
    print(f"HMM features shape: {hmm_features_combined.shape}")

In [53]:
# Monthly training windows covering 2020 through 2024: one (start, end) pair per
# calendar month, running from the first day of the month to the first day of
# the following month.
training_periods = [
    (f"{year}-{month:02d}-01", f"{year + month // 12}-{month % 12 + 1:02d}-01")
    for year in range(2020, 2025)
    for month in range(1, 13)
]

# The final window ends one day later than the monthly pattern.
# NOTE(review): presumably meant to include 2025-01-01 - confirm this is intended.
training_periods[-1] = ("2024-12-01", "2025-01-02")

def collect_and_save_data_by_year(year, periods):
    """Collect, save, merge and featurize all windows that start in ``year``.

    Raw per-dataset frames are written to ``data/<year>/btc_<name>.csv``, the
    merged frame to ``data/<year>/merged_data.csv`` and the features to
    ``data/<year>/hmm_features.csv``.

    Args:
        year: Year to process, e.g. ``"2020"`` (an int is accepted as well).
        periods: Iterable of ``(start_date, end_date)`` string pairs; only the
            pairs whose start date begins with ``year`` are collected.

    Returns:
        Dict mapping dataset name to the concatenated raw DataFrame for the
        year (empty when nothing could be collected).
    """
    year = str(year)  # startswith() below needs a string
    print(f"\n=== Collecting data for year {year} ===")

    # Filter periods for this year
    year_periods = [p for p in periods if p[0].startswith(year)]

    # Create directory for this year
    year_dir = f"data/{year}"
    os.makedirs(year_dir, exist_ok=True)

    # Gather the frames of every window, then concatenate once per dataset
    # instead of re-copying the accumulated frame after each window.
    frames_by_name = {}

    for start_date, end_date in year_periods:
        print(f"\n=== Collecting data for period {start_date} to {end_date} ===")
        period_data = collect_btc_data(start_date, end_date)

        for key, df in period_data.items():
            frames_by_name.setdefault(key, []).append(df)

        # Sleep to avoid API rate limits
        time.sleep(5)

    year_data = {key: pd.concat(frames) for key, frames in frames_by_name.items()}

    # Save year data
    save_data_to_csv(year_data, year_dir)

    # Create merged features for this year
    print(f"\n=== Merging datasets for year {year} ===")
    merged_year = merge_datasets(year_data)
    if merged_year is None:
        # merge_datasets returns None when nothing could be merged; the raw
        # data is still returned so the caller can inspect what was collected.
        print(f"No datasets could be merged for year {year} - skipping feature preparation")
        return year_data

    merged_year.to_csv(f"{year_dir}/merged_data.csv")
    print(f"Merged data shape: {merged_year.shape}")

    # Prepare features for HMM
    print(f"\n=== Preparing features for HMM model for year {year} ===")
    hmm_features_year = prepare_hmm_features(merged_year)
    hmm_features_year.to_csv(f"{year_dir}/hmm_features.csv")
    print(f"HMM features shape: {hmm_features_year.shape}")

    return year_data

In [54]:
def combine_all_years(years):
    """Load the per-year CSVs, combine them and build the overall feature set.

    Reads ``data/<year>/btc_<name>.csv`` for every year and dataset, writes the
    combined per-dataset frames to ``data/all_years_combined``, the merged
    frame to ``data/merged_data_2020_2024.csv`` and the features to
    ``data/hmm_features_2020_2024.csv``. Missing files are reported and
    skipped.

    Args:
        years: Iterable of year identifiers used as directory names.

    Returns:
        Tuple ``(merged_all, hmm_features_all)``; ``(None, None)`` when no
        dataset could be merged.
    """
    data_types = ["price", "exchange_inflow", "exchange_outflow", "exchange_netflow", "coinbase-premium-index"]

    # Gather the frames per dataset, then concatenate once per dataset.
    frames_by_type = {}

    for year in years:
        year_dir = f"data/{year}"

        for data_type in data_types:
            file_path = f"{year_dir}/btc_{data_type}.csv"
            try:
                df = pd.read_csv(file_path, index_col=0)
            except FileNotFoundError:
                print(f"Could not find {file_path}")
                continue

            # CSV stores datetimes as text; restore the dtype before sorting.
            if "datetime" in df.columns:
                df["datetime"] = pd.to_datetime(df["datetime"])

            frames_by_type.setdefault(data_type, []).append(df)
            print(f"Loaded {data_type} data from {file_path}")

    all_data = {}
    for data_type, frames in frames_by_type.items():
        combined = pd.concat(frames)
        if "datetime" in combined.columns:
            # Sort by time and drop repeated timestamps: consecutive collection
            # windows share their boundary, so it can appear twice.
            combined = combined.sort_values("datetime").drop_duplicates(subset="datetime", keep="last")
        all_data[data_type] = combined

    # Save combined data
    save_data_to_csv(all_data, "data/all_years_combined")

    # Merge combined datasets
    print("\n=== Merging all combined datasets ===")
    merged_all = merge_datasets(all_data)
    if merged_all is None:
        # merge_datasets returns None when nothing could be merged.
        print("No datasets could be merged - skipping feature preparation")
        return None, None

    merged_all.to_csv("data/merged_data_2020_2024.csv")
    print(f"Merged combined data shape: {merged_all.shape}")

    # Prepare features for HMM
    print("\n=== Preparing features for HMM model for all data ===")
    hmm_features_all = prepare_hmm_features(merged_all)
    hmm_features_all.to_csv("data/hmm_features_2020_2024.csv")
    print(f"HMM features shape: {hmm_features_all.shape}")

    return merged_all, hmm_features_all

In [55]:
# Make sure the output folders are present before anything is written
for output_folder in ("data", "figures"):
    os.makedirs(output_folder, exist_ok=True)

# Years handled by this run, as the strings used for directory names
years_to_process = [str(calendar_year) for calendar_year in range(2020, 2025)]

# Collect and save every year on its own, keeping the raw frames per year
year_data_dict = {}
for year in years_to_process:
    year_data_dict[year] = collect_and_save_data_by_year(year, training_periods)

# Build the combined dataset and features from the per-year files
merged_all, hmm_features_all = combine_all_years(years_to_process)

print("Complete data collection and processing finished!")


=== Collecting data for year 2020 ===

=== Collecting data for period 2020-01-01 to 2020-02-01 ===

Collecting price data...
Making API request to: https://api.datasource.cybotrade.rs/cryptoquant/btc/market-data/price-ohlcv
Making API Request to :btc/market-data/price-ohlcv
Request succesful: 200
Successfully collected 744 records for price

Collecting exchange_inflow data...
Making API request to: https://api.datasource.cybotrade.rs/cryptoquant/btc/exchange-flows/inflow
Making API Request to :btc/exchange-flows/inflow
Request succesful: 200
Successfully collected 744 records for exchange_inflow

Collecting exchange_outflow data...
Making API request to: https://api.datasource.cybotrade.rs/cryptoquant/btc/exchange-flows/outflow
Making API Request to :btc/exchange-flows/outflow
Request succesful: 200
Successfully collected 744 records for exchange_outflow

Collecting exchange_netflow data...
Making API request to: https://api.datasource.cybotrade.rs/cryptoquant/btc/exchange-flows/netfl