## Data Visualization of Raw Data

In [None]:
from utilsnumpy import load_data, data_processing
import matplotlib.pyplot as plt

# Input files: hourly BTC candles and the hourly Coinbase premium index.
data1 = "./data/bybit_candle_btc_1h.csv"
data2 = "./data/cryptoquant_btc_coinbase-premium-index_1h.csv"
factor = 'coinbase_premium_index'

# Load both files through the project helper and keep only the timestamp,
# the close price and the factor column.
unselected_df = load_data(data1, data2)
df = unselected_df.loc[:, ["start_time", "close", factor]].copy()
df.columns = ["start_time", "close", factor]

# Pre-process the factor with the "diff" mode of the project helper
# (presumably a first difference -- confirm in utilsnumpy).
df = data_processing(df, "diff", factor)
# df = data_processing(df, "cbrt", factor)

# Only the factor series is drawn; the close price is not needed for this plot.
fig, ax1 = plt.subplots(figsize=(15, 8))
ax1.plot(
    df['start_time'],
    df[factor],
    label=factor,
    color='green',
    linewidth=2,
)
ax1.set_xlabel("Date", fontsize=12)
ax1.set_ylabel(factor, fontsize=12, color='green')
ax1.tick_params(axis='y', labelcolor='green')

# Title and grid go on the single Axes of this figure.
ax1.set_title(f"Raw Data of {factor}", fontsize=16)
fig.tight_layout()  # Adjust layout to prevent overlap
ax1.grid(True)
plt.show()

## Split-Train Backtest + Heatmap of the Split-Train Backtest

In [None]:
from tqdm import tqdm
from itertools import product
from utilsnumpy import backtest , load_data, load_single_data, combine_factors
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import json

# Number of bars per (365-day) year for each supported candle interval.
annualizer_dict = {
    '1m': 365 * 24 * 60,        # 525600 one-minute bars
    '5m': 365 * 24 * 60 // 5,   # 105120 five-minute bars
    '15m': 365 * 24 * 60 // 15, # 35040 fifteen-minute bars
    '30m': 365 * 24 * 60 // 30, # 17520 thirty-minute bars
    '1h': 365 * 24,             # 8760 hourly bars
    '4h': 365 * 24 // 4,        # 2190 four-hour bars
    '1d': 365,                  # daily bars
    '1w': 52,                   # weekly bars
    '1M': 12,                   # monthly bars
}

# Accumulators filled by main(): one list of backtest reports per strategy,
# and one (model, entry, DataFrame) tuple per strategy for the heatmaps.
all_backtest_results, plot_data = [], []

def main(data1, data2, data3, factor, factor2, interval, operation, preprocess, model, entry, window_end, window_step, threshold_end, threshold_step):
    """Grid-search one (model, entry) strategy on the train split and record the results.

    The first ``3 * annualizer_dict[interval]`` rows of the merged data are
    used as the train period. Every (rolling_window, threshold) pair of the
    grid is passed to ``backtest``; the resulting reports are appended to the
    module-level ``all_backtest_results`` and, as a DataFrame, to ``plot_data``.

    Args:
        data1: Path of the candle CSV (first argument of ``load_data``).
        data2: Path of the factor CSV (second argument of ``load_data``).
        data3: Path of the second factor CSV; only read when ``operation``
            is not ``'none'``.
        factor: Column name of the primary factor.
        factor2: Column name of the second factor (used with ``data3``).
        interval: Key of ``annualizer_dict`` (e.g. ``'1h'``, ``'1d'``).
        operation: Operator handed to ``combine_factors``; ``'none'`` skips
            the second factor entirely.
        preprocess: Pre-processing mode forwarded to ``backtest``.
        model: Model name forwarded to ``backtest``.
        entry: Entry-logic name forwarded to ``backtest``.
        window_end: Exclusive upper bound of the rolling-window grid.
        window_step: Step of the rolling-window grid (grid starts at 5).
        threshold_end: Exclusive upper bound of the threshold grid.
        threshold_step: Step of the threshold grid (grid starts at 0.0).

    Raises:
        ValueError: If ``interval`` is not a key of ``annualizer_dict``.
    """
    # Fail fast with a clear message; previously an unknown interval surfaced
    # as "TypeError: unsupported operand" on `None * 3` after loading the data.
    if interval not in annualizer_dict:
        raise ValueError(
            f"Unsupported interval {interval!r}; expected one of {sorted(annualizer_dict)}"
        )
    annualizer = annualizer_dict[interval]
    train_split = annualizer * 3  # number of bars in three years

    # Load the merged data and keep only the columns the backtest needs.
    df = load_data(data1, data2)[["start_time", "close", factor]].copy()

    # Optionally combine the primary factor with a second one.
    if operation != 'none':
        df1 = load_single_data(data3, factor2)
        # merge_asof needs the right frame sorted on the key; match each row
        # to the nearest timestamp of the second factor.
        df = pd.merge_asof(df, df1.sort_values('start_time'), on="start_time", direction="nearest")
        # combine_factors returns the frame and the name of the new column,
        # which becomes the factor that is backtested.
        df, factor = combine_factors(df, factor, factor2, operation)

    # Grid origin.
    window_start = 5
    threshold_start = 0.0

    # Train set: the first three years; the remainder is left for forward testing.
    df_train = df[:train_split].reset_index(drop=True).copy()

    # NOTE(review): the trailing "sr" argument is forwarded as-is; its meaning
    # is defined by utilsnumpy.backtest -- confirm there.
    backtest_report = [
        backtest(df_train, rolling_window, threshold, preprocess, entry, annualizer, model, factor, interval, "sr")
        for rolling_window in range(window_start, window_end, window_step)
        for threshold in np.arange(threshold_start, threshold_end, threshold_step)
    ]
    all_backtest_results.append(backtest_report)

    # Keep a tabular copy so plot_heatmaps() can pivot the Sharpe ratios.
    plot_data.append((model, entry, pd.DataFrame(backtest_report)))

def plot_heatmaps(sr_threshold=1.5):
    """Plot a train-period Sharpe-ratio heatmap for every strategy in ``plot_data``.

    A heatmap is drawn only when at least one cell exceeds the threshold that
    applies to the strategy: 1.2 for entries whose name starts with ``'S'``,
    otherwise ``sr_threshold``.

    Args:
        sr_threshold: Minimum Sharpe ratio that at least one cell must exceed
            for entries that do not start with ``'S'``.
    """
    for model, entry, backtest_df in plot_data:
        # Rows: rolling_window, columns: threshold, values: mean SR.
        sr_pivot_data = backtest_df.groupby(['rolling_window', 'threshold'])['SR'].mean().unstack()

        # Use a per-entry threshold. The parameter itself must not be
        # overwritten, otherwise the first 'S...' entry would lower the
        # threshold to 1.2 for every entry processed after it.
        entry_threshold = 1.2 if entry.startswith('S') else sr_threshold

        # Nothing to show when every cell is NaN.
        if sr_pivot_data.isna().all().all():
            print(f"⚠️ Skipping {model}_{entry} heatmap: All SR values are NaN.")
            continue

        # Skip strategies where no parameter pair beats the threshold
        # (NaN cells compare as False).
        if not np.any(sr_pivot_data.to_numpy() > entry_threshold):
            print(f"⚠️ Skipping {model}_{entry} heatmap: No SR value exceeds {entry_threshold}.")
            continue

        plt.figure(figsize=(18, 14))
        sns.heatmap(sr_pivot_data, annot=True, fmt=".2f", cmap="RdYlGn", linewidths=0.3, cbar_kws={'label': 'Sharpe Ratio'})
        # NOTE(review): `preprocess` is read from the notebook's global scope at
        # plot time; it is not stored per strategy in plot_data.
        plt.title(f"{model}_{preprocess}_{entry} Train Period BackTest SR Heatmap", fontsize=14)
        plt.show()
        plt.close()  # free the figure's memory after each plot

# models = ['zscore', 'momentum', 'volatility', 'robust', 'sma_diff', 'ewm', 'minmax', 'percentile', 'maxabs', 'mean_norm', 'roc', 'rsi', 'psy', 'rvi', 'mad', 'ma_ratio']
# entrys = ['Trend', 'Trend_Reverse', 'MR', 'MR_Reverse', 'Trend_NoHold',  'Trend_emaFilter', 'Trend_NoHold_emaFilter', 
#            'L_Trend', 'L_Trend_Reverse', 'L_MR', 'L_MR_Reverse', 'L_Trend_NoHold', 'L_Trend_emaFilter', 'L_Trend_NoHold_emaFilter', 
#            'S_Trend', 'S_Trend_Reverse', 'S_MR', 'S_MR_Reverse' ,'S_Trend_NoHold', 'S_Trend_emaFilter', 'S_Trend_NoHold_emaFilter']

# Strategies swept in this run (the full menus are kept above for reference).
models = ['robust']
entrys = ['Trend_Reverse', 'MR_Reverse', 'L_Trend_Reverse', 'L_Trend_emaFilter']

# Factor configuration: inflow_total combined with netflow_total via '/'.
factor = 'inflow_total'
factor2 = 'netflow_total'
interval = '1d'
operation = '/'
preprocess = 'diff'

# One progress-bar tick per (model, entry) pair.
total_combinations = len(models) * len(entrys)
for model, entry in tqdm(
    product(models, entrys),
    total=total_combinations,
    desc="🔍 Backtesting Strategies",
    unit="strategy",
    leave=True,
):
    main(
        data1=f"./data/bybit_candle_btc_{interval}.csv",
        data2=f"./data/cryptoquant_btc_inflow_{interval}.csv",
        data3=f"./data/cryptoquant_btc_netflow_{interval}.csv",
        factor=factor,
        factor2=factor2,
        interval=interval,
        operation=operation,
        preprocess=preprocess,
        model=model,
        entry=entry,
        window_end=201,
        window_step=5,
        threshold_end=3.01,
        threshold_step=0.1,
    )

# After all backtests, plot every heatmap that has at least one SR above 1.5.
plot_heatmaps(1.5)
   
# Output backtest json file with all model and entry
# output_filename = f"{factor}_{interval}_split_backtest.json" 
# output_backtest_data = {"backtests": all_backtest_results}
# with open(output_filename, "w") as json_file:
#     json.dump(output_backtest_data, json_file, indent=4)

## Plot SR Heatmap by Input Model Name (from Backtest JSON File)

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load JSON file
def load_json(file_path):
    """Read the JSON file at ``file_path`` and return its ``"backtests"`` entry."""
    with open(file_path, "r") as handle:
        payload = json.load(handle)
    return payload["backtests"]

# Extract SR data grouped by backtest_mode
def extract_sr_data(backtests, model):
    """Group the Sharpe-ratio points of one model by backtest mode.

    Args:
        backtests: Iterable of backtest lists; each item is a mapping with the
            keys ``"model"``, ``"backtest_mode"``, ``"rolling_window"``,
            ``"threshold"`` and ``"SR"``.
        model: Only records whose ``"model"`` equals this value are kept.

    Returns:
        A dict mapping each backtest mode to a list of
        ``(rolling_window, threshold, SR)`` tuples, in encounter order.
    """
    grouped = {}
    for run in backtests:
        for record in run:
            if record["model"] != model:
                continue
            bucket = grouped.setdefault(record["backtest_mode"], [])
            bucket.append((record["rolling_window"], record["threshold"], record["SR"]))
    return grouped

# Create heatmaps for each backtest_mode
def plot_sr_heatmaps(sr_data_dict, model):
    """Draw one Sharpe-ratio heatmap per backtest mode.

    Args:
        sr_data_dict: Mapping of backtest mode to a list of
            ``(rolling_window, threshold, SR)`` tuples.
        model: Model name, used only in the plot title.

    Returns:
        None. Prints a message and returns early when ``sr_data_dict`` is empty.
    """
    if not sr_data_dict:
        print("No data found for the given model.")
        return

    for mode, sr_data in sr_data_dict.items():
        df = pd.DataFrame(sr_data, columns=["rolling_window", "threshold", "SR"])
        # Rows (y-axis) are rolling windows, columns (x-axis) are thresholds.
        pivot_table = df.pivot(index="rolling_window", columns="threshold", values="SR")

        plt.figure(figsize=(20,16))
        sns.heatmap(pivot_table, annot=True, fmt=".2f", cmap="RdYlGn", linewidths=0.5, cbar_kws={'label': 'Sharpe Ratio'})
        plt.title(f"Train Period BackTest Sharpe Ratio Heatmap - {mode} ({model})")
        # The labels follow the pivot orientation: heatmap columns are drawn
        # along x and the index along y (the original labels were swapped).
        plt.xlabel("Threshold")
        plt.ylabel("Rolling Window")
        plt.show()
        plt.close()  # release the figure once it has been displayed

# Main function
def main():
    """Load a saved split-backtest JSON and plot the SR heatmaps of one model."""
    model = "zscore"
    file_path = "mvrv_1d_split_backtest.json"  # Update with actual file path

    # Load -> filter by model and group by backtest mode -> plot.
    grouped_sr = extract_sr_data(load_json(file_path), model)
    plot_sr_heatmaps(grouped_sr, model)

# Run the JSON heatmap workflow only when this code is executed as the
# top-level script (not when it is imported as a module).
if __name__ == "__main__":
    main()


## Split Forward Testing Period

In [None]:
from utilsnumpy import backtest , load_data, load_single_data, combine_factors
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import talib
import math
import json

# Number of bars per (365-day) year for each supported candle interval.
annualizer_dict = {
    '1m': 365 * 24 * 60,        # 525600 one-minute bars
    '5m': 365 * 24 * 60 // 5,   # 105120 five-minute bars
    '15m': 365 * 24 * 60 // 15, # 35040 fifteen-minute bars
    '30m': 365 * 24 * 60 // 30, # 17520 thirty-minute bars
    '1h': 365 * 24,             # 8760 hourly bars
    '4h': 365 * 24 // 4,        # 2190 four-hour bars
    '1d': 365,                  # daily bars
    '1w': 52,                   # weekly bars
    '1M': 12,                   # monthly bars
}

def main(data1, data2, data3, factor, factor2, interval, operation, preprocess, model, entry, window, threshold):
    """Run one forward test on the data after the train split and plot the result.

    The first ``3 * annualizer_dict[interval]`` rows are treated as the train
    period and skipped; the remaining rows are backtested once with the given
    ``window`` and ``threshold``. The report is printed as JSON and the close
    price is plotted against the cumulative PnL.

    Args:
        data1: Path of the candle CSV (first argument of ``load_data``).
        data2: Path of the factor CSV (second argument of ``load_data``).
        data3: Path of the second factor CSV; only read when ``operation``
            is not ``'none'``.
        factor: Column name of the primary factor.
        factor2: Column name of the second factor (used with ``data3``).
        interval: Key of ``annualizer_dict`` (e.g. ``'1h'``, ``'1d'``).
        operation: Operator handed to ``combine_factors``; ``'none'`` skips
            the second factor entirely.
        preprocess: Pre-processing mode forwarded to ``backtest``.
        model: Model name forwarded to ``backtest``.
        entry: Entry-logic name forwarded to ``backtest``.
        window: Rolling window forwarded to ``backtest``.
        threshold: Signal threshold forwarded to ``backtest``.

    Raises:
        ValueError: If ``interval`` is not a key of ``annualizer_dict``.
    """
    # Fail fast with a clear message; previously an unknown interval surfaced
    # as "TypeError: unsupported operand" on `None * 3` after loading the data.
    if interval not in annualizer_dict:
        raise ValueError(
            f"Unsupported interval {interval!r}; expected one of {sorted(annualizer_dict)}"
        )
    annualizer = annualizer_dict[interval]
    train_split = annualizer * 3  # number of bars in three years

    # Load the merged data and keep only the columns the backtest needs.
    df = load_data(data1, data2)[["start_time", "close", factor]].copy()

    # Optionally combine the primary factor with a second one.
    if operation != 'none':
        df1 = load_single_data(data3, factor2)
        # merge_asof needs the right frame sorted on the key; match each row
        # to the nearest timestamp of the second factor.
        df = pd.merge_asof(df, df1.sort_values('start_time'), on="start_time", direction="nearest")
        # combine_factors returns the frame and the name of the new column,
        # which becomes the factor that is backtested.
        df, factor = combine_factors(df, factor, factor2, operation)

    # Test set: everything after the three-year train period.
    # df_train = df[:train_split].reset_index(drop=True).copy()
    df_test = df[train_split:].reset_index(drop=True).copy()

    forwardtest_report = [
        backtest(df_test, window, threshold, preprocess, entry, annualizer, model, factor, interval)
    ]
    print(json.dumps(forwardtest_report, indent=4))

    # Plot close price on the left y-axis
    fig, ax1 = plt.subplots(figsize=(15, 8))
    ax1.plot(df_test['start_time'], df_test['close'], label='Close Price', color='green', linewidth=2)
    ax1.set_xlabel("Date", fontsize=12)
    ax1.set_ylabel("Close Price", fontsize=12, color='green')
    ax1.tick_params(axis='y', labelcolor='green')
    # Plot cumulative PnL on the right y-axis.
    # NOTE(review): 'cumu_pnl' is not created in this function; presumably
    # backtest() adds it to df_test in place -- confirm in utilsnumpy.
    ax2 = ax1.twinx()
    ax2.plot(df_test['start_time'], df_test['cumu_pnl'], label='Cumulative PnL', color='blue', linewidth=2)
    ax2.set_ylabel("Cumulative PnL", fontsize=12, color='blue')
    ax2.tick_params(axis='y', labelcolor='blue')
    # Add title and grid
    plt.title("Close Price and Cumulative PnL Plot (Forward Test Period)", fontsize=16)
    fig.tight_layout()  # Adjust layout to prevent overlap
    plt.grid(True)
    plt.show()

    # output_forwardtest_data = {"forward_test": forwardtest_report}
    # with open(f"final_{factor}_{interval}_forward_test.json", "w") as json_file:
    #     json.dump(output_forwardtest_data, json_file, indent=4)
    
    # Export df to csv
    # df.to_csv("backtest_df.csv", index=False)

# Strategy configuration for the forward test.
factor = 'inflow_total'
factor2 = 'netflow_total'
interval = '1d'
operation = '/'
preprocess = 'diff'
model = 'zscore'
entry = 'L_Trend'
window=25
threshold=1.4

main(
    f"./data/bybit_candle_btc_{interval}.csv",
    # The primary factor is 'inflow_total', so the factor file must be the
    # inflow CSV (as in the train sweep); the open-interest CSV was passed
    # here before, which does not match the selected factor column.
    f"./data/cryptoquant_btc_inflow_{interval}.csv",
    f"./data/cryptoquant_btc_netflow_{interval}.csv",
    factor,
    factor2,
    interval,
    operation,
    preprocess,
    model,
    entry,
    window,
    threshold
)

## Split-Train Backtest (for Cumulative PnL Graph)

In [None]:
from utilsnumpy import backtest , load_data, load_single_data, combine_factors
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import talib
import math
import json

# Number of bars per (365-day) year for each supported candle interval.
annualizer_dict = {
    '1m': 365 * 24 * 60,        # 525600 one-minute bars
    '5m': 365 * 24 * 60 // 5,   # 105120 five-minute bars
    '15m': 365 * 24 * 60 // 15, # 35040 fifteen-minute bars
    '30m': 365 * 24 * 60 // 30, # 17520 thirty-minute bars
    '1h': 365 * 24,             # 8760 hourly bars
    '4h': 365 * 24 // 4,        # 2190 four-hour bars
    '1d': 365,                  # daily bars
    '1w': 52,                   # weekly bars
    '1M': 12,                   # monthly bars
}

def main(data1, data2, data3, factor, factor2, interval, operation, preprocess, model, entry, window, threshold):
    """Run one backtest on the train split and plot close price against cumulative PnL.

    The first ``3 * annualizer_dict[interval]`` rows of the merged data form
    the train period. They are backtested once with the given ``window`` and
    ``threshold``; the report is printed as JSON and plotted.

    Args:
        data1: Path of the candle CSV (first argument of ``load_data``).
        data2: Path of the factor CSV (second argument of ``load_data``).
        data3: Path of the second factor CSV; only read when ``operation``
            is not ``'none'``.
        factor: Column name of the primary factor.
        factor2: Column name of the second factor (used with ``data3``).
        interval: Key of ``annualizer_dict`` (e.g. ``'1h'``, ``'1d'``).
        operation: Operator handed to ``combine_factors``; ``'none'`` skips
            the second factor entirely.
        preprocess: Pre-processing mode forwarded to ``backtest``.
        model: Model name forwarded to ``backtest``.
        entry: Entry-logic name forwarded to ``backtest``.
        window: Rolling window forwarded to ``backtest``.
        threshold: Signal threshold forwarded to ``backtest``.

    Raises:
        ValueError: If ``interval`` is not a key of ``annualizer_dict``.
    """
    # Fail fast with a clear message; previously an unknown interval surfaced
    # as "TypeError: unsupported operand" on `None * 3` after loading the data.
    if interval not in annualizer_dict:
        raise ValueError(
            f"Unsupported interval {interval!r}; expected one of {sorted(annualizer_dict)}"
        )
    annualizer = annualizer_dict[interval]
    train_split = annualizer * 3  # number of bars in three years

    # Load the merged data and keep only the columns the backtest needs.
    df = load_data(data1, data2)[["start_time", "close", factor]].copy()

    # Optionally combine the primary factor with a second one.
    if operation != 'none':
        df1 = load_single_data(data3, factor2)
        # merge_asof needs the right frame sorted on the key; match each row
        # to the nearest timestamp of the second factor.
        df = pd.merge_asof(df, df1.sort_values('start_time'), on="start_time", direction="nearest")
        # combine_factors returns the frame and the name of the new column,
        # which becomes the factor that is backtested.
        df, factor = combine_factors(df, factor, factor2, operation)

    # Train set: the first three years; the remainder belongs to the forward test.
    df_train = df[:train_split].reset_index(drop=True).copy()
    # df_test = df[train_split:].reset_index(drop=True).copy()

    # This is a train-period backtest, so the report is named accordingly.
    backtest_report = [
        backtest(df_train, window, threshold, preprocess, entry, annualizer, model, factor, interval)
    ]
    print(json.dumps(backtest_report, indent=4))

    # Plot close price on the left y-axis
    fig, ax1 = plt.subplots(figsize=(15, 8))
    ax1.plot(df_train['start_time'], df_train['close'], label='Close Price', color='green', linewidth=2)
    ax1.set_xlabel("Date", fontsize=12)
    ax1.set_ylabel("Close Price", fontsize=12, color='green')
    ax1.tick_params(axis='y', labelcolor='green')
    # Plot cumulative PnL on the right y-axis.
    # NOTE(review): 'cumu_pnl' is not created in this function; presumably
    # backtest() adds it to df_train in place -- confirm in utilsnumpy.
    ax2 = ax1.twinx()
    ax2.plot(df_train['start_time'], df_train['cumu_pnl'], label='Cumulative PnL', color='blue', linewidth=2)
    ax2.set_ylabel("Cumulative PnL", fontsize=12, color='blue')
    ax2.tick_params(axis='y', labelcolor='blue')
    # Add title and grid
    plt.title("Close Price and Cumulative PnL Plot (Split Train Period)", fontsize=16)
    fig.tight_layout()  # Adjust layout to prevent overlap
    plt.grid(True)
    plt.show()

    # Export df to csv
    # df.to_csv("backtest_df.csv", index=False)

# Strategy configuration for the train-period backtest.
factor = 'inflow_total'
factor2 = 'netflow_total'
interval = '1d'
operation = '/'
preprocess = 'diff'
model = 'zscore'
entry = 'L_Trend'
window=25
threshold=1.4

main(
    f"./data/bybit_candle_btc_{interval}.csv",
    # The primary factor is 'inflow_total', so the factor file must be the
    # inflow CSV (as in the train sweep); the open-interest CSV was passed
    # here before, which does not match the selected factor column.
    f"./data/cryptoquant_btc_inflow_{interval}.csv",
    f"./data/cryptoquant_btc_netflow_{interval}.csv",
    factor,
    factor2,
    interval,
    operation,
    preprocess,
    model,
    entry,
    window,
    threshold
)

## Backtest (No Permutation) (Hand Test)

In [None]:
from utilsnumpy import backtest , load_data, load_single_data, combine_factors
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import talib
import math
import json
import modin

# Number of bars per (365-day) year for each supported candle interval.
annualizer_dict = {
    '1m': 365 * 24 * 60,        # 525600 one-minute bars
    '5m': 365 * 24 * 60 // 5,   # 105120 five-minute bars
    '15m': 365 * 24 * 60 // 15, # 35040 fifteen-minute bars
    '30m': 365 * 24 * 60 // 30, # 17520 thirty-minute bars
    '1h': 365 * 24,             # 8760 hourly bars
    '4h': 365 * 24 // 4,        # 2190 four-hour bars
    '1d': 365,                  # daily bars
    '1w': 52,                   # weekly bars
    '1M': 12,                   # monthly bars
}

def main(data1, data2, data3, factor, factor2, interval, operation, preprocess, model, entry, window, threshold):
    """Run one backtest over the full data length and plot close price against cumulative PnL.

    No train/test split is applied: the whole merged frame is backtested once
    with the given ``window`` and ``threshold``. The report is printed as JSON
    and plotted.

    Args:
        data1: Path of the candle CSV (first argument of ``load_data``).
        data2: Path of the factor CSV (second argument of ``load_data``).
        data3: Path of the second factor CSV; only read when ``operation``
            is not ``'none'``.
        factor: Column name of the primary factor.
        factor2: Column name of the second factor (used with ``data3``).
        interval: Key of ``annualizer_dict`` (e.g. ``'1h'``, ``'1d'``).
        operation: Operator handed to ``combine_factors``; ``'none'`` skips
            the second factor entirely.
        preprocess: Pre-processing mode forwarded to ``backtest``.
        model: Model name forwarded to ``backtest``.
        entry: Entry-logic name forwarded to ``backtest``.
        window: Rolling window forwarded to ``backtest``.
        threshold: Signal threshold forwarded to ``backtest``.

    Raises:
        ValueError: If ``interval`` is not a key of ``annualizer_dict``.
    """
    # Fail fast: previously an unknown interval silently passed
    # annualizer=None into backtest().
    if interval not in annualizer_dict:
        raise ValueError(
            f"Unsupported interval {interval!r}; expected one of {sorted(annualizer_dict)}"
        )
    annualizer = annualizer_dict[interval]

    # Load the merged data and keep only the columns the backtest needs.
    df = load_data(data1, data2)[["start_time", "close", factor]].copy()

    # Optionally combine the primary factor with a second one.
    if operation != 'none':
        df1 = load_single_data(data3, factor2)
        # merge_asof needs the right frame sorted on the key; match each row
        # to the nearest timestamp of the second factor.
        df = pd.merge_asof(df, df1.sort_values('start_time'), on="start_time", direction="nearest")
        # combine_factors returns the frame and the name of the new column,
        # which becomes the factor that is backtested.
        df, factor = combine_factors(df, factor, factor2, operation)

    backtest_report = [
        backtest(df, window, threshold, preprocess, entry, annualizer, model, factor, interval)
    ]
    print(json.dumps(backtest_report, indent=4))

    # Plot close price on the left y-axis
    fig, ax1 = plt.subplots(figsize=(15, 8))
    ax1.plot(df['start_time'], df['close'], label='Close Price', color='green', linewidth=2)
    ax1.set_xlabel("Date", fontsize=12)
    ax1.set_ylabel("Close Price", fontsize=12, color='green')
    ax1.tick_params(axis='y', labelcolor='green')
    # Plot cumulative PnL on the right y-axis.
    # NOTE(review): 'cumu_pnl' is not created in this function; presumably
    # backtest() adds it to df in place -- confirm in utilsnumpy.
    ax2 = ax1.twinx()
    ax2.plot(df['start_time'], df['cumu_pnl'], label='Cumulative PnL', color='blue', linewidth=2)
    ax2.set_ylabel("Cumulative PnL", fontsize=12, color='blue')
    ax2.tick_params(axis='y', labelcolor='blue')
    # Add title and grid
    plt.title("Close Price and Cumulative PnL Plot (Full Length)", fontsize=16)
    fig.tight_layout()  # Adjust layout to prevent overlap
    plt.grid(True)
    plt.show()

    # Export df to csv
    # df.to_csv("backtest_df.csv", index=False)
    # df.to_csv(f"./liveRunning/excel_for_each_backtest/{factor}_{preprocess}_{interval}_{model}_{entry}_{window}_{threshold}.csv", index=False)

# Strategy configuration for the full-length hand test.
factor = 'inflow_total'
factor2 = 'netflow_total'
interval = '1d'
operation = '/'
preprocess = 'diff'
model = 'zscore'
entry = 'L_Trend'
window=25
threshold=1.4

main(
    f"./data/bybit_candle_btc_{interval}.csv",
    # The primary factor is 'inflow_total', so the factor file must be the
    # inflow CSV (as in the train sweep); the open-interest CSV was passed
    # here before, which does not match the selected factor column.
    f"./data/cryptoquant_btc_inflow_{interval}.csv",
    f"./data/cryptoquant_btc_netflow_{interval}.csv",
    factor,
    factor2,
    interval,
    operation,
    preprocess,
    model,
    entry,
    window,
    threshold
)