In [None]:
# This is necessary to recognize the modules
import os
import sys
import warnings

# Silence every warning for the rest of the session so library chatter does
# not clutter the notebook output. Note that this also hides genuinely useful
# warnings (numerical issues, deprecations).
warnings.filterwarnings("ignore")

# The project root is taken to be two directory levels above the current
# working directory; it is put on sys.path so the project modules can be
# imported. The membership check keeps the cell idempotent: re-running it
# does not pile up duplicate sys.path entries.
root_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if root_path not in sys.path:
    sys.path.append(root_path)

In [None]:
from core.data_sources.clob import CLOBDataSource

# Create the project's CLOB data source and ask it to load its candles cache,
# passing the project root computed in the first cell.
# NOTE(review): presumably this populates `clob.candles_cache` (read in the
# next cell) from files under `root_path` — confirm against CLOBDataSource.
clob = CLOBDataSource()
clob.load_candles_cache(root_path)

In [None]:
# Flatten the cache mapping into a plain list of its candle objects; the cache
# keys are not needed by the analysis below.
candles = [*clob.candles_cache.values()]

In [None]:
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import coint
from tqdm.notebook import tqdm

# Collect one series per trading pair from every candle that carries data.
# `pair_returns` (simple returns) stays available for later use, but the
# cointegration test below runs on close-price LEVELS: statsmodels' `coint`
# (Engle-Granger) assumes both inputs are integrated of order 1, and returns
# are typically already stationary, which makes the test reject for almost
# every pair and the resulting p-values meaningless.
pair_returns = {}
pair_prices = {}
for candle in candles:
    df = candle.data
    if df is None or df.empty:
        continue
    pair_name = candle.trading_pair
    close = df["close"]
    pair_returns[pair_name] = close.pct_change().dropna()
    pair_prices[pair_name] = close.dropna()

# Ordered list of pairs; it fixes the row/column order of the matrix.
pairs = list(pair_returns.keys())
n_pairs = len(pairs)

# Smallest overlap (in rows) for which the test is attempted. This is a
# pragmatic lower bound, not a statistical recommendation; tune as needed.
MIN_OVERLAP = 30

# Start from 1.0 everywhere: the diagonal is 1 by convention, and a pair that
# cannot be tested must not look like a perfectly cointegrated one (p = 0).
# Building the values in a NumPy array also avoids writing through
# `DataFrame.values`, which is read-only under pandas Copy-on-Write.
p_values = np.ones((n_pairs, n_pairs))
untested_pairs = []

for i in tqdm(range(n_pairs)):
    for j in range(i + 1, n_pairs):
        pair1, pair2 = pairs[i], pairs[j]

        # Keep only the index labels both series share.
        # NOTE(review): this assumes the candle DataFrame index identifies the
        # observation time; with a plain positional index the alignment is by
        # row number instead — confirm.
        series1, series2 = pair_prices[pair1].align(pair_prices[pair2], join="inner")

        if len(series1) < MIN_OVERLAP:
            untested_pairs.append((pair1, pair2))
            continue

        try:
            p_value = coint(series1, series2)[1]
        except (ValueError, np.linalg.LinAlgError):
            # Degenerate input (e.g. too short for the lag selection).
            untested_pairs.append((pair1, pair2))
            continue

        if not np.isfinite(p_value):
            untested_pairs.append((pair1, pair2))
            continue

        # The test result is stored symmetrically.
        p_values[i, j] = p_value
        p_values[j, i] = p_value

cointegration_matrix = pd.DataFrame(p_values, index=pairs, columns=pairs)

# Display the results
print("Cointegration p-values matrix:")
print("(Lower p-values indicate stronger cointegration)")
if untested_pairs:
    print(f"({len(untested_pairs)} pair combinations could not be tested and were left at p = 1)")
cointegration_matrix

In [None]:
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform

# Turn the p-value matrix into a distance matrix. A LOW p-value means strong
# evidence of cointegration, so it has to map to a SMALL distance; the p-value
# itself is used as the distance. (A -log(p) mapping would do the opposite and
# place the most strongly cointegrated pairs farthest apart.)
# Missing entries are treated as "no evidence" (distance 1).
distance_values = (
    cointegration_matrix.astype(float).fillna(1.0).clip(lower=0.0, upper=1.0).to_numpy(copy=True)
)
# squareform requires a zero diagonal: every pair is at distance 0 from itself.
np.fill_diagonal(distance_values, 0.0)
distance_matrix = pd.DataFrame(
    distance_values, index=cointegration_matrix.index, columns=cointegration_matrix.columns
)
distances_condensed = squareform(distance_values)

# Perform hierarchical clustering.
# NOTE(review): SciPy documents Ward linkage as correctly defined only for
# Euclidean distances, which p-values are not; "average" or "complete" may be
# a better fit — confirm the intended method.
Z = linkage(distances_condensed, method="ward")

# Plot the dendrogram of THIS clustering. create_dendrogram expects the
# observations as its first argument and runs its own distance + linkage on
# them, so the precomputed condensed distances and linkage matrix are injected
# through `distfun` / `linkagefun`. Leaves are labelled with the pair names.
fig = ff.create_dendrogram(
    distance_values,
    orientation="left",
    labels=list(cointegration_matrix.index),
    distfun=lambda _: distances_condensed,
    linkagefun=lambda _: Z,
)
fig.update_layout(title="Hierarchical Clustering Dendrogram", width=1000, height=800, showlegend=False)
fig.show()

# Cut the linkage tree into at most `n_clusters` flat clusters; `clusters`
# holds one cluster id per entry of `pairs`, in the same order.
n_clusters = 5
clusters = fcluster(Z, t=n_clusters, criterion="maxclust")

# One row per trading pair with its cluster id; metric columns are added later.
cluster_df = pd.DataFrame({"trading_pair": pairs}).assign(cluster=clusters)

# Per-pair volume metrics:
#   avg_volume       - mean of the candle "volume" column
#   volume_stability - coefficient of variation (std / mean); lower is steadier
volume_metrics = {}
for candle in candles:
    if candle.data is None or candle.data.empty:
        continue
    pair_name = candle.trading_pair
    volume = candle.data["volume"]
    avg_volume = volume.mean()
    # A pair with zero average volume has no meaningful stability ratio;
    # report it as infinitely unstable instead of dividing by zero.
    volume_stability = volume.std() / avg_volume if avg_volume != 0 else float("inf")
    volume_metrics[pair_name] = {"avg_volume": avg_volume, "volume_stability": volume_stability}

# Attach the metrics to the cluster table. Pairs without metrics fall back to
# the worst possible values (no volume, infinite instability).
cluster_df["avg_volume"] = cluster_df["trading_pair"].map(lambda x: volume_metrics.get(x, {}).get("avg_volume", 0))
cluster_df["volume_stability"] = cluster_df["trading_pair"].map(
    lambda x: volume_metrics.get(x, {}).get("volume_stability", float("inf"))
)

# Scatter of the volume metrics, one point per trading pair, on log-log axes.
# Cluster ids are converted to "Cluster N" labels so Plotly Express treats
# them as discrete categories (one legend entry and colour per cluster)
# instead of drawing a continuous colour scale over integer ids.
# `category_orders` keeps the legend in numeric cluster order.
fig = px.scatter(
    cluster_df.assign(cluster=cluster_df["cluster"].map("Cluster {}".format)),
    x="avg_volume",
    y="volume_stability",
    color="cluster",
    category_orders={"cluster": [f"Cluster {c}" for c in sorted(cluster_df["cluster"].unique())]},
    hover_data=["trading_pair"],
    log_x=True,
    log_y=True,
    title="Clusters by Volume Metrics",
)

fig.update_layout(xaxis_title="Average Volume", yaxis_title="Volume Stability (lower is better)", width=1000, height=600)
fig.show()

# Heatmap of the pairwise p-values. The colour range is capped at 0.05, so
# every p-value at or above that level is drawn with the same colour.
fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        z=cointegration_matrix,
        x=pairs,
        y=pairs,
        colorscale="RdBu_r",
        zmin=0,
        zmax=0.05,
    )
)
fig.update_layout(
    title="Cointegration P-values Heatmap (Darker colors indicate stronger cointegration)",
    width=1000,
    height=1000,
    xaxis={"tickangle": -45},
)
fig.show()


# Select top pairs (same function as before)
def select_top_pairs(cluster_df, n_pairs_per_cluster=3):
    """
    Pick the highest-scoring trading pairs of every cluster.

    The score is ``avg_volume / volume_stability``, so pairs with high and
    steady volume rank first.

    Args:
        cluster_df: DataFrame with the columns "cluster", "avg_volume" and
            "volume_stability" (one row per trading pair). It is not modified.
        n_pairs_per_cluster: Maximum number of rows kept per cluster.

    Returns:
        DataFrame with the selected rows (original index labels preserved)
        plus a "volume_score" column. Clusters appear in order of first
        occurrence in `cluster_df`; rows within a cluster are sorted by
        descending score. An empty input yields an empty frame instead of
        raising.
    """
    # Score once on a copy so the caller's frame is left untouched.
    scored = cluster_df.copy()
    scored["volume_score"] = scored["avg_volume"] / scored["volume_stability"]

    # pd.concat raises on an empty list, so handle "no rows" explicitly.
    if scored.empty:
        return scored

    top_by_cluster = [
        scored[scored["cluster"] == cluster_num].nlargest(n_pairs_per_cluster, "volume_score")
        for cluster_num in scored["cluster"].unique()
    ]
    return pd.concat(top_by_cluster)


# Keep the three best-scoring pairs of each cluster, then print them one
# cluster at a time (clusters in order of first appearance).
top_pairs = select_top_pairs(cluster_df, n_pairs_per_cluster=3)

print("\nTop pairs by cluster:")
for cluster_num in top_pairs["cluster"].unique():
    print(f"\nCluster {cluster_num}:")
    cluster_result = top_pairs.loc[top_pairs["cluster"].eq(cluster_num)]
    print(cluster_result.loc[:, ["trading_pair", "avg_volume", "volume_stability", "volume_score"]])

In [None]:
# Spot-check a single row of the cluster table. The position is clamped to the
# last row so this cell cannot raise IndexError when fewer than 308 pairs were
# loaded; the row label shown in the output tells which row is displayed.
INSPECT_POSITION = 307
cluster_df.iloc[min(INSPECT_POSITION, len(cluster_df) - 1)] if len(cluster_df) else cluster_df

In [None]:
def plot_clusters_timeseries(candles, Z, pairs, cut_height=None, n_clusters=None):
    """
    Create a line plot of cumulative returns over time, colored by cluster.

    Args:
        candles: Iterable of candle objects exposing `.data` (a DataFrame with
            a "close" column; "timestamp" must be usable as the x-axis field)
            and `.trading_pair`.
        Z: Linkage matrix from hierarchical clustering.
        pairs: List of trading pair names, in the order used to build `Z`.
        cut_height: Height to cut the dendrogram (if None, n_clusters is used).
        n_clusters: Maximum number of clusters (used if cut_height is None).

    Returns:
        Array of flat cluster ids, one per entry of `pairs`.

    Raises:
        ValueError: If neither `cut_height` nor `n_clusters` is given, if
            `pairs` does not match the size of `Z`, or if no candle can be
            plotted.
    """
    if cut_height is None and n_clusters is None:
        raise ValueError("Provide either cut_height or n_clusters.")

    # Get clusters based on either cut_height or n_clusters. The title label
    # follows the same `is not None` rule, so cut_height=0 is reported as such.
    if cut_height is not None:
        clusters = fcluster(Z, cut_height, criterion="distance")
        cut_label = "Cut Height: " + str(cut_height)
    else:
        clusters = fcluster(Z, n_clusters, criterion="maxclust")
        cut_label = "Clusters: " + str(n_clusters)

    if len(pairs) != len(clusters):
        raise ValueError(f"Expected {len(clusters)} pairs to match the linkage matrix, got {len(pairs)}.")

    # Look clusters up by pair name. Candles are matched through their own
    # `trading_pair` rather than by position, because `pairs` may be shorter
    # than `candles` (candles without data never made it into `pairs`).
    cluster_by_pair = dict(zip(pairs, clusters))

    # Prepare data for plotting
    plot_data = []
    for candle in candles:
        if candle.data is None or candle.data.empty:
            continue
        pair = candle.trading_pair
        if pair not in cluster_by_pair:
            continue
        df = candle.data.copy()
        # Cumulative growth of 1 unit; the first row is NaN because
        # pct_change has no previous value there.
        df["cum_returns"] = (1 + df["close"].pct_change()).cumprod()
        df["trading_pair"] = pair
        df["cluster"] = f"Cluster {cluster_by_pair[pair]}"
        plot_data.append(df)

    if not plot_data:
        raise ValueError("No candle with data matches the given pairs; nothing to plot.")

    # Combine all data
    combined_df = pd.concat(plot_data)

    # Create line plot: one line per trading pair, one colour per cluster
    fig = px.line(
        combined_df,
        x="timestamp",
        y="cum_returns",
        color="cluster",
        line_group="trading_pair",
        hover_data=["trading_pair", "close"],
        title=f"Cumulative Returns Over Time by Cluster ({cut_label})",
    )

    fig.update_layout(
        xaxis_title="Time",
        yaxis_title="Cumulative Returns (1 = start)",
        width=1200,
        height=800,
    )

    fig.show()

    # Print cluster statistics
    print("\nCluster Statistics:")
    cluster_stats = (
        combined_df.groupby("cluster").agg({"cum_returns": ["mean", "std", "count"], "trading_pair": "nunique"}).round(4)
    )
    print(cluster_stats)

    return clusters


# Alternative version with normalized prices for easier comparison
def plot_clusters_timeseries_normalized(candles, Z, pairs, cut_height=None, n_clusters=None):
    """
    Create a line plot of normalized prices over time, colored by cluster.
    Every close series is divided by its first value, so all lines start at 1.

    Args:
        candles: Iterable of candle objects exposing `.data` (a DataFrame with
            a "close" column; "timestamp" must be usable as the x-axis field)
            and `.trading_pair`.
        Z: Linkage matrix from hierarchical clustering.
        pairs: List of trading pair names, in the order used to build `Z`.
        cut_height: Height to cut the dendrogram (if None, n_clusters is used).
        n_clusters: Maximum number of clusters (used if cut_height is None).

    Returns:
        Array of flat cluster ids, one per entry of `pairs`.

    Raises:
        ValueError: If neither `cut_height` nor `n_clusters` is given, if
            `pairs` does not match the size of `Z`, or if no candle can be
            plotted.
    """
    if cut_height is None and n_clusters is None:
        raise ValueError("Provide either cut_height or n_clusters.")

    # Get clusters based on either cut_height or n_clusters. The title label
    # follows the same `is not None` rule, so cut_height=0 is reported as such.
    if cut_height is not None:
        clusters = fcluster(Z, cut_height, criterion="distance")
        cut_label = "Cut Height: " + str(cut_height)
    else:
        clusters = fcluster(Z, n_clusters, criterion="maxclust")
        cut_label = "Clusters: " + str(n_clusters)

    if len(pairs) != len(clusters):
        raise ValueError(f"Expected {len(clusters)} pairs to match the linkage matrix, got {len(pairs)}.")

    # Look clusters up by pair name. Candles are matched through their own
    # `trading_pair` rather than by position, because `pairs` may be shorter
    # than `candles` (candles without data never made it into `pairs`).
    cluster_by_pair = dict(zip(pairs, clusters))

    # Prepare data for plotting
    plot_data = []
    for candle in candles:
        if candle.data is None or candle.data.empty:
            continue
        pair = candle.trading_pair
        if pair not in cluster_by_pair:
            continue
        df = candle.data.copy()
        # Normalize prices to start at 1
        df["normalized_price"] = df["close"] / df["close"].iloc[0]
        df["trading_pair"] = pair
        df["cluster"] = f"Cluster {cluster_by_pair[pair]}"
        plot_data.append(df)

    if not plot_data:
        raise ValueError("No candle with data matches the given pairs; nothing to plot.")

    # Combine all data
    combined_df = pd.concat(plot_data)

    # Create line plot: one line per trading pair, one colour per cluster
    fig = px.line(
        combined_df,
        x="timestamp",
        y="normalized_price",
        color="cluster",
        line_group="trading_pair",
        hover_data=["trading_pair", "close"],
        title=f"Normalized Price Movement by Cluster ({cut_label})",
    )

    fig.update_layout(xaxis_title="Time", yaxis_title="Normalized Price (1 = start)", width=1200, height=800, showlegend=True)

    fig.show()

    return clusters


# Draw both views for a 10-cluster cut of the tree. Each call returns the flat
# cluster ids; `clusters` ends up holding the result of the second call.

# View 1: cumulative returns
clusters = plot_clusters_timeseries(candles=candles, Z=Z, pairs=pairs, n_clusters=10)

# View 2: prices normalized to start at 1
clusters = plot_clusters_timeseries_normalized(candles=candles, Z=Z, pairs=pairs, n_clusters=10)

In [None]:
def _compute_market_metrics(candles):
    """Build one row of volume/volatility metrics per candle that has data."""
    columns = ["trading_pair", "avg_usd_volume", "volatility", "price_mean", "n_trades", "volume_stability"]
    records = []
    for candle in candles:
        data = candle.data
        if data is None or data.empty:
            continue
        # Quote-currency volume: base volume times the close price.
        usd_volume = data["volume"] * data["close"]
        avg_usd_volume = usd_volume.mean()
        records.append(
            {
                "trading_pair": candle.trading_pair,
                "avg_usd_volume": avg_usd_volume,
                "volatility": data["close"].pct_change().std(),
                "price_mean": data["close"].mean(),
                # Number of rows in the candle frame (the key name is kept
                # as-is because it is part of the returned columns).
                "n_trades": len(data),
                "volume_stability": usd_volume.std() / avg_usd_volume if avg_usd_volume != 0 else float("inf"),
            }
        )
    # Fixed columns keep the merge key present even when no candle has data.
    return pd.DataFrame(records, columns=columns)


def _min_max_normalize(values):
    """Scale a Series to [0, 1]; a Series without spread maps to 0 (NaN stays NaN)."""
    lowest = values.min()
    span = values.max() - lowest
    centered = values - lowest
    # `span > 0` is False for a zero or NaN span, which avoids the 0/0 division.
    return centered / span if span > 0 else centered


def select_representative_markets(candles, Z, pairs, n_clusters, top_n=1):
    """
    Select representative markets from each cluster based on USD volume and volatility.

    Every market gets a score of 0.6 * normalized average USD volume plus
    0.4 * normalized volatility (both min-max scaled over all markets); the
    `top_n` highest-scoring markets of each cluster are selected, plotted and
    printed.

    Args:
        candles: Iterable of candle objects exposing `.data` (a DataFrame with
            "volume" and "close" columns) and `.trading_pair`.
        Z: Linkage matrix from hierarchical clustering.
        pairs: List of trading pair names, in the order used to build `Z`.
        n_clusters: Maximum number of clusters to form.
        top_n: Number of markets per cluster.

    Returns:
        DataFrame of the selected markets with their cluster, metrics and
        score, sorted by cluster and then by descending score.
    """
    # Get cluster assignments
    clusters = fcluster(Z, n_clusters, criterion="maxclust")

    # Base table of cluster assignments, enriched with the per-market metrics.
    # A left merge keeps pairs without metrics (their metric columns are NaN).
    cluster_df = pd.DataFrame({"trading_pair": pairs, "cluster": clusters})
    cluster_df = cluster_df.merge(_compute_market_metrics(candles), on="trading_pair", how="left")

    # Normalize metrics
    for col in ["avg_usd_volume", "volatility"]:
        cluster_df[f"{col}_normalized"] = _min_max_normalize(cluster_df[col])

    # Calculate combined score (you can adjust weights here)
    cluster_df["score"] = (
        cluster_df["avg_usd_volume_normalized"] * 0.6  # Higher weight for volume
        + cluster_df["volatility_normalized"] * 0.4  # Lower weight for volatility
    )

    # Select top markets from each cluster. "maxclust" may produce fewer than
    # n_clusters clusters, so iterate over the ids that actually occur.
    cluster_ids = sorted(cluster_df["cluster"].unique())
    selected_markets = [
        cluster_df[cluster_df["cluster"] == cluster_num].nlargest(top_n, "score") for cluster_num in cluster_ids
    ]
    selected_df = pd.concat(selected_markets)

    # Sort by cluster and score
    selected_df = selected_df.sort_values(["cluster", "score"], ascending=[True, False])

    # Create visualization of the selected markets
    fig = px.scatter(
        cluster_df[cluster_df["trading_pair"].isin(selected_df["trading_pair"])],
        x="avg_usd_volume",
        y="volatility",
        color="cluster",
        text="trading_pair",
        title=f"Market Selection by USD Volume and Volatility (Top {top_n} per cluster)",
        hover_data=["trading_pair", "avg_usd_volume", "volatility", "volume_stability", "score"],
        log_x=True,  # Use log scale for volume
    )

    # Highlight selected markets with a star marker each
    for pair in selected_df["trading_pair"].tolist():
        market_data = cluster_df[cluster_df["trading_pair"] == pair]
        fig.add_trace(
            go.Scatter(
                x=[market_data["avg_usd_volume"].iloc[0]],
                y=[market_data["volatility"].iloc[0]],
                mode="markers",
                marker=dict(symbol="star", size=15, line=dict(width=2, color="black"), showscale=False),
                name=f"Selected: {pair}",
                showlegend=True,
            )
        )

    fig.update_layout(
        width=1000, height=600, showlegend=True, xaxis_title="Average USD Volume (log scale)", yaxis_title="Volatility"
    )
    fig.show()

    # Print detailed results
    print("\nSelected Markets by Cluster:")
    display_cols = ["trading_pair", "avg_usd_volume", "volatility", "volume_stability", "score"]
    for cluster_num in cluster_ids:
        print(f"\nCluster {cluster_num}:")
        cluster_results = selected_df[selected_df["cluster"] == cluster_num]

        # Format the results for better readability
        formatted_results = cluster_results[display_cols].copy()
        formatted_results["avg_usd_volume"] = formatted_results["avg_usd_volume"].apply(lambda x: f"${x:,.2f}")
        formatted_results["volatility"] = formatted_results["volatility"].apply(lambda x: f"{x:.4f}")
        formatted_results["volume_stability"] = formatted_results["volume_stability"].apply(lambda x: f"{x:.4f}")
        formatted_results["score"] = formatted_results["score"].apply(lambda x: f"{x:.4f}")

        print(formatted_results.to_string(index=False))

    return selected_df


# Pick the two best-scoring markets from each of (at most) ten clusters.
selected_markets = select_representative_markets(candles, Z, pairs, n_clusters=10, top_n=2)