In [None]:
# Auto-install fitkit if not available (e.g., in Colab)
try:
    import fitkit
except ImportError:
    !pip install -e ..  # Install from parent directory in dev mode
    import fitkit

In [None]:
from fitkit.data import WikipediaLoader, QueryConfig, create_small_fixture
from fitkit.algorithms import fitness_complexity, compute_eci_pci, sinkhorn_masked
from fitkit.algorithms import FitnessComplexity, ECI, SinkhornScaler

<a href="https://colab.research.google.com/github/lawrennd/fitkit/blob/main/wikipedia_editing_fitness_complexity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Wikipedia Editing Data: fitness / complexity analysis

This notebook:

- Downloads a sample of Wikipedia users from BigQuery and aggregates their edits into per-user text.
- Builds a `user` $\times$ `word` matrix and its *support* (analogous to `country` $\times$ `product`).


### BigQuery Setup Instructions

To run the BigQuery query cells in this notebook, you need to have a Google Cloud Project with the BigQuery API enabled and proper authentication setup.

Here's a general guide:

1.  **Google Cloud Account**: If you don't have one, sign up for a Google Cloud account. You might be eligible for a free trial.
    *   [Sign up for Google Cloud](https://cloud.google.com/free)

2.  **Create/Select a Project**: In the [Google Cloud Console](https://console.cloud.google.com/), create a new project or select an existing one.
    *   Ensure that **billing is enabled** for your project, as BigQuery usage incurs costs (though often minimal for small queries, especially with the free tier).

3.  **Enable the BigQuery API**: For your selected project, ensure the BigQuery API is enabled.
    *   Go to the [API Library](https://console.cloud.google.com/apis/library) in the Cloud Console.
    *   Search for "BigQuery API" and enable it if it's not already enabled.

4.  **Authentication**:
    
    **In Google Colab**: Authentication is automatic. The `WikipediaLoader` will detect the Colab environment and use `google.colab.auth.authenticate_user()` to prompt you to log in with your Google account.
    
    **In Local Jupyter**: You need to set up Application Default Credentials (ADC) using the `gcloud` CLI:
    ```bash
    gcloud auth application-default login
    ```
    This will authenticate you and allow the `WikipediaLoader` to access BigQuery using your credentials.

Once these steps are complete, you should be able to run the BigQuery cells successfully!

### 0) Setup

You’ll need BigQuery credentials configured locally (e.g. `gcloud auth application-default login`) and permission to access the public dataset `fh-bigquery.reddit_comments`.

If you don’t have BigQuery access, you can still run the later cells by loading a cached dataframe (see the caching cell below).


In [None]:
# Core
import os
import numpy as np
import pandas as pd

# Sparse matrices
import scipy.sparse as sp

# Plotting
import matplotlib.pyplot as plt


In [None]:
# Standard random sampling (no specific users)
cfg = QueryConfig()

CACHE_DIR = "data"
os.makedirs(CACHE_DIR, exist_ok=True)

# Updated cache path for Wikipedia data (v4 - random sample)
CACHE_PATH = os.path.join(
    CACHE_DIR,
    f"wikipedia_authors{cfg.max_authors}_v4.parquet",
)

print("Cache path:", CACHE_PATH)

In [None]:
# Data loading is now handled by WikipediaLoader (imported from fitkit.data)
# The loader handles BigQuery authentication, querying, caching, and matrix construction


In [None]:
# Data loading functions are now in fitkit.data.WikipediaLoader
# The loader handles BigQuery authentication, querying, caching, and matrix construction



In [None]:
# Load data using WikipediaLoader
print(f"Using cache path: {CACHE_PATH}")

loader = WikipediaLoader(cfg, CACHE_PATH)
bundle = loader.load()

# Extract components from bundle
X = bundle.matrix
user_ids = bundle.row_labels.tolist()
vocab = bundle.col_labels.tolist()

print(f"Loaded: {len(user_ids)} users, {len(vocab)} words")
print(f"Matrix shape: {X.shape}, dtype: {X.dtype}")
print(f"Matrix is sparse: {sp.issparse(X)}")

In [None]:
# 2) Extract support matrix and prepare for analysis
#
# In the paper's language, we will treat the *support* as M_{uw} = 1{X_{uw} > 0}.
# The matrix X from WikipediaLoader already has filtering applied (via QueryConfig).
# The loader uses binary=False by default (word counts), but we can work with either.

# Support mask (structural zeros off-support)
M = X.copy()
M.data = np.ones_like(M.data)

# Basic margins (analogues of diversification and ubiquity)
user_strength = np.asarray(X.sum(axis=1)).ravel()
word_strength = np.asarray(X.sum(axis=0)).ravel()

print("User strength:", pd.Series(user_strength).describe())
print("Word strength:", pd.Series(word_strength).describe())
print(f"Matrix -> Users: {X.shape[0]}, Vocab: {X.shape[1]}")

# Labeled view for plotting and downstream helpers
M_df = pd.DataFrame.sparse.from_spmatrix(M, index=user_ids, columns=vocab)

### 3) Baseline: 1D Pietronero Fitness–Complexity fixed point

This is the usual nonlinear rank-1 fixed point on the **support matrix** \(M\) (binary incidence). We’ll compute it as a scalar reference, then move to the rank-2 extension.


In [None]:
# Algorithm functions are now imported from fitkit.algorithms
# (fitness_complexity, compute_eci_pci, sinkhorn_masked)


### Fitness–Complexity ⇄ IPF/Sinkhorn equivalence (what the paper is using)

In the paper (`economic-fitness.tex`), the key point is that **Fitness–Complexity is a reparameterisation of masked IPF/Sinkhorn matrix scaling** on the support graph.

- We solve for a coupling/flow \(w_{uw}\ge 0\) supported on \(M\) such that \(\sum_w w_{uw}=r_u\) and \(\sum_u w_{uw}=c_w\).
- IPF/Sinkhorn gives a diagonal scaling solution \(w_{uw} = M_{uw} A_u B_w\).
- Setting \(A_u \equiv 1/F_u\) and \(B_w \equiv Q_w\) yields \(w_{uw} \propto M_{uw} Q_w/F_u\), and the FC fixed-point updates recover the scaling equations (up to the usual projective normalisation/gauge).

So the Sinkhorn/IPF object here is **not a different model**—it’s the same masked matrix-scaling problem, viewed in “flow” form. The only extra modelling choice is **which marginals \((r,c)\)** to impose (uniform is a common default in the support-only setting; data-marginals are natural for quantitative flows).


In [None]:
F, Q, fc_hist = fitness_complexity(M)
eci, pci = compute_eci_pci(M)

F_s = pd.Series(F, index=user_ids, name="Fitness")
Q_s = pd.Series(Q, index=vocab, name="Complexity")
eci_s = pd.Series(eci, index=user_ids, name="ECI")
pci_s = pd.Series(pci, index=vocab, name="PCI")

kc = pd.Series(np.asarray(M.sum(axis=1)).ravel(), index=user_ids, name="diversification_kc")
kp = pd.Series(np.asarray(M.sum(axis=0)).ravel(), index=vocab, name="ubiquity_kp")

# Sinkhorn/IPF scaling to build a flow W on the support.
# For the FC ⇄ Sinkhorn equivalence viewpoint, the natural default is *uniform* marginals.
# However, uniform marginals can be infeasible on some sparse masks; we fall back if needed.

# default: uniform marginals (same total mass, different per-node mass if rectangular)
r_uniform = np.ones(M.shape[0], dtype=float)
r_uniform = r_uniform / r_uniform.sum()
c_uniform = np.ones(M.shape[1], dtype=float)
c_uniform = c_uniform / c_uniform.sum()

u, v, W, sk_hist = sinkhorn_masked(M, r=r_uniform, c=c_uniform)

if not sk_hist.get("converged", False):
    print("Sinkhorn with uniform marginals did not converge; falling back to degree marginals.")
    r_deg = kc.to_numpy(dtype=float)
    r_deg = r_deg / r_deg.sum()
    c_deg = kp.to_numpy(dtype=float)
    c_deg = c_deg / c_deg.sum()
    u, v, W, sk_hist = sinkhorn_masked(M, r=r_deg, c=c_deg)

results_countries = pd.concat([F_s, eci_s, kc], axis=1).sort_values("Fitness", ascending=False)
results_products = pd.concat([Q_s, pci_s, kp], axis=1).sort_values("Complexity", ascending=False)

word_scores_1d = Q_s.sort_values(ascending=False)
user_scores_1d = F_s.sort_values(ascending=False)

print("Top 20 words by complexity:")
print(word_scores_1d.head(20))
print("Top 20 users by fitness:")
print(user_scores_1d.head(20))

In [None]:
user_scores_1d.head(15)

## Flow-native visualisations (Sinkhorn/OT coupling) + ranked barcodes

The objects we visualise here are:

- binary support: `M` (country×product)
- Sinkhorn/IPF scaling factors: `u`, `v` (dual variables)
- coupling / feasible flow: `W` where `W = diag(u) * M * diag(v)` (on the support)

To avoid “hairballs”, every flow plot below supports **top-k / top-edge filtering**.

In [None]:
# Diagnostics: convergence
fig, ax = plt.subplots(1, 2, figsize=(10, 3))
ax[0].plot(fc_hist["dF"], label="max |ΔF|")
ax[0].plot(fc_hist["dQ"], label="max |ΔQ|")
ax[0].set_yscale("log")
ax[0].set_title("FC convergence")
ax[0].legend()

ax[1].plot(sk_hist["dr"], label="max row marginal error")
ax[1].plot(sk_hist["dc"], label="max col marginal error")
ax[1].set_yscale("log")
ax[1].set_title("Sinkhorn/IPF convergence")
ax[1].legend()

plt.tight_layout()
plt.show()

# Diagnostics: nestedness-like visualization (sort by Fitness/Complexity)
M_sorted = M_df.loc[results_countries.index, results_products.index]
plt.figure(figsize=(10, 4))
plt.imshow(M_sorted.sparse.to_dense().to_numpy(), aspect="auto", interpolation="nearest", cmap="Greys")
plt.title("M sorted by Fitness (rows) and Complexity (cols)")
plt.xlabel("words")
plt.ylabel("users")
plt.tight_layout()
plt.show()

# Diagnostics: compare rankings
plt.figure(figsize=(5, 4))
plt.scatter(results_countries["ECI"], results_countries["Fitness"], s=15, alpha=0.7)
plt.xlabel("ECI (standardized)")
plt.ylabel("Fitness")
plt.title("Countries: Fitness vs ECI")
plt.tight_layout()
plt.show()


In [None]:
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display, Javascript

# Prepare data with URL
plot_df = results_countries.copy()
# Construct Wikipedia User URLs (replacing spaces with underscores)
plot_df["wiki_url"] = "https://en.wikipedia.org/wiki/User:" + plot_df.index.astype(str).str.replace(' ', '_')

# Interactive scatter plot with custom_data for the URL
fig = px.scatter(
    plot_df,
    x="ECI",
    y="Fitness",
    hover_name=plot_df.index,
    hover_data=["diversification_kc"],
    custom_data=["wiki_url"],
    title="Countries: Fitness vs ECI (Click dot to open User Page)",
    labels={"ECI": "ECI (standardized)", "Fitness": "Fitness"},
    template="plotly_white",
    opacity=0.7,
    log_y=True
)

fig.update_traces(marker=dict(size=8))
fig.update_layout(width=700, height=500)



In [None]:
import matplotlib as mpl
from matplotlib.patches import Polygon
from matplotlib.path import Path
from matplotlib.patches import PathPatch


def _to_flow_df(M: pd.DataFrame, W: sp.spmatrix | np.ndarray) -> pd.DataFrame:
    if sp.issparse(W):
        # sparse-safe: keep as sparse frame
        return pd.DataFrame.sparse.from_spmatrix(W, index=M.index, columns=M.columns)

    W_df = pd.DataFrame(W, index=M.index, columns=M.columns)
    # keep strictly on support (in case numerical noise fills zeros)
    return W_df.where(M.astype(bool), other=0.0)


def _top_subset(W_df: pd.DataFrame, top_c: int = 20, top_p: int = 30, by: str = "mass") -> pd.DataFrame:
    """Return a filtered W_df restricted to top rows/cols.

    by:
      - "mass": uses row/col sums of W_df
      - "fitness_complexity": uses global labeled Series F_s/Q_s if present
    """
    if by == "fitness_complexity" and "F_s" in globals() and "Q_s" in globals():
        c_idx = list(globals()["F_s"].sort_values(ascending=False).index[:top_c])
        p_idx = list(globals()["Q_s"].sort_values(ascending=False).index[:top_p])
    else:
        c_idx = list(W_df.sum(axis=1).sort_values(ascending=False).index[:top_c])
        p_idx = list(W_df.sum(axis=0).sort_values(ascending=False).index[:top_p])
    return W_df.loc[c_idx, p_idx]


def plot_circular_bipartite_flow(
    W_df: pd.DataFrame,
    max_edges: int = 350,
    min_edge_mass: float | None = None,
    color_by: str = "country",
    title: str = "Circular bipartite flow (line-weighted, filtered)",
):
    """Chord-style circular bipartite flow using Bezier curves.

    Notes:
    - This draws *curves* (not full ribbons) with linewidth ∝ w_cp.
    - Filter to top edges to avoid hairballs.
    """
    countries = list(W_df.index)
    products = list(W_df.columns)

    edges = (
        W_df.stack()
        .rename("w")
        .reset_index()
        .rename(columns={"level_0": "country", "level_1": "product"})
    )

    edges = edges[edges["w"] > 0].sort_values("w", ascending=False)
    if min_edge_mass is not None:
        edges = edges[edges["w"] >= float(min_edge_mass)]
    edges = edges.head(max_edges)

    if len(edges) == 0:
        print("No edges to plot after filtering.")
        return

    # angles: countries on left semicircle, products on right semicircle
    n_c, n_p = len(countries), len(products)
    theta_c = np.linspace(np.pi / 2, 3 * np.pi / 2, n_c, endpoint=False)
    theta_p = np.linspace(-np.pi / 2, np.pi / 2, n_p, endpoint=False)

    def pol2cart(theta, r=1.0):
        return np.array([r * np.cos(theta), r * np.sin(theta)])

    pos_c = {c: pol2cart(theta_c[i]) for i, c in enumerate(countries)}
    pos_p = {p: pol2cart(theta_p[j]) for j, p in enumerate(products)}

    # colors
    if color_by == "product":
        cmap = plt.get_cmap("tab20")
        colors = {p: cmap(i % 20) for i, p in enumerate(products)}
        edge_color = lambda row: colors[row["product"]]
    else:
        cmap = plt.get_cmap("tab20")
        colors = {c: cmap(i % 20) for i, c in enumerate(countries)}
        edge_color = lambda row: colors[row["country"]]

    w = edges["w"].to_numpy()
    wmax = float(w.max())
    # linewidth scaling (tuned to look OK for typical normalized W)
    lw = 0.2 + 6.0 * (w / (wmax + 1e-30)) ** 0.75

    fig, ax = plt.subplots(figsize=(9, 9))
    ax.set_aspect("equal")
    ax.axis("off")

    # node labels (lightweight)
    for c in countries:
        x, y = pos_c[c]
        ax.plot([x], [y], marker="o", ms=3, color="black")
    for p in products:
        x, y = pos_p[p]
        ax.plot([x], [y], marker="o", ms=3, color="black")

    # edges as cubic Beziers through center
    for i, row in enumerate(edges.itertuples(index=False)):
        c = row.country
        p = row.product
        x0, y0 = pos_c[c]
        x1, y1 = pos_p[p]
        # control points closer to center
        c0 = np.array([0.35 * x0, 0.35 * y0])
        c1 = np.array([0.35 * x1, 0.35 * y1])

        verts = [(x0, y0), (c0[0], c0[1]), (c1[0], c1[1]), (x1, y1)]
        codes = [Path.MOVETO, Path.CURVE4, Path.CURVE4, Path.CURVE4]
        path = Path(verts, codes)
        patch = PathPatch(path, facecolor="none", edgecolor=edge_color(row._asdict()), lw=lw[i], alpha=0.55)
        ax.add_patch(patch)

    ax.set_title(title + f"\n(top {len(edges)} edges)")
    plt.show()


def plot_alluvial_bipartite(
    W_df: pd.DataFrame,
    max_edges: int = 250,
    min_edge_mass: float | None = None,
    title: str = "Alluvial (Sankey-style) bipartite flow (filtered)",
):
    """Alluvial/Sankey-style plot in pure Matplotlib.

    Draws stacked nodes on left (countries) and right (products),
    with polygon bands for the largest flows.
    """
    edges = (
        W_df.stack()
        .rename("w")
        .reset_index()
        .rename(columns={"level_0": "country", "level_1": "product"})
    )
    edges = edges[edges["w"] > 0].sort_values("w", ascending=False)
    if min_edge_mass is not None:
        edges = edges[edges["w"] >= float(min_edge_mass)]
    edges = edges.head(max_edges)

    if len(edges) == 0:
        print("No edges to plot after filtering.")
        return

    countries = list(pd.Index(edges["country"]).unique())
    products = list(pd.Index(edges["product"]).unique())

    # total mass per node (restricted to displayed edges)
    out_mass = edges.groupby("country")["w"].sum().reindex(countries)
    in_mass = edges.groupby("product")["w"].sum().reindex(products)

    # normalize heights to 1
    out_mass = out_mass / out_mass.sum()
    in_mass = in_mass / in_mass.sum()

    # vertical packing with padding
    pad = 0.01

    def pack(masses: pd.Series):
        spans = {}
        y = 0.0
        for k, v in masses.items():
            y0 = y
            y1 = y + float(v)
            spans[k] = [y0, y1]
            y = y1 + pad
        # rescale to [0,1]
        total = y - pad
        for k in spans:
            spans[k][0] /= total
            spans[k][1] /= total
        return spans

    span_c = pack(out_mass)
    span_p = pack(in_mass)

    # allocate sub-spans per edge within each node
    c_cursor = {c: span_c[c][0] for c in countries}
    p_cursor = {p: span_p[p][0] for p in products}

    cmap = plt.get_cmap("tab20")
    c_color = {c: cmap(i % 20) for i, c in enumerate(countries)}

    fig, ax = plt.subplots(figsize=(11, 7))
    ax.axis("off")

    xL, xR = 0.1, 0.9
    node_w = 0.03

    # draw nodes
    for c in countries:
        y0, y1 = span_c[c]
        ax.add_patch(Polygon([[xL - node_w, y0], [xL, y0], [xL, y1], [xL - node_w, y1]], closed=True, color="black", alpha=0.15))
        ax.text(xL - node_w - 0.01, (y0 + y1) / 2, str(c), ha="right", va="center", fontsize=8)

    for p in products:
        y0, y1 = span_p[p]
        ax.add_patch(Polygon([[xR, y0], [xR + node_w, y0], [xR + node_w, y1], [xR, y1]], closed=True, color="black", alpha=0.15))
        ax.text(xR + node_w + 0.01, (y0 + y1) / 2, str(p), ha="left", va="center", fontsize=8)

    # bands
    for row in edges.itertuples(index=False):
        c = row.country
        p = row.product
        w = float(row.w)

        # band thickness within each stacked node span (relative to node mass)
        dc = w / float(edges[edges["country"] == c]["w"].sum()) * (span_c[c][1] - span_c[c][0])
        dp = w / float(edges[edges["product"] == p]["w"].sum()) * (span_p[p][1] - span_p[p][0])

        y0c, y1c = c_cursor[c], c_cursor[c] + dc
        y0p, y1p = p_cursor[p], p_cursor[p] + dp
        c_cursor[c] = y1c
        p_cursor[p] = y1p

        # simple 4-point polygon band (looks OK with alpha)
        poly = Polygon(
            [[xL, y0c], [xR, y0p], [xR, y1p], [xL, y1c]],
            closed=True,
            facecolor=c_color[c],
            edgecolor="none",
            alpha=0.45,
        )
        ax.add_patch(poly)

    ax.set_title(title + f"\n(top {len(edges)} edges)")
    plt.show()


def plot_dual_potential_bipartite(
    M: pd.DataFrame,
    W_df: pd.DataFrame,
    u: np.ndarray,
    v: np.ndarray,
    max_edges: int = 400,
    title: str = "Dual potentials (log u, log v) with flow edges",
):
    """Layered bipartite plot: node color = dual potentials, edge thickness = w_cp."""
    countries = list(M.index)
    products = list(M.columns)

    phi = pd.Series(np.log(u + 1e-30), index=countries)
    psi = pd.Series(np.log(v + 1e-30), index=products)

    # order by potential for a clean “landscape”
    c_order = list(phi.sort_values().index)
    p_order = list(psi.sort_values().index)

    # pick top edges globally
    edges = (
        W_df.loc[c_order, p_order]
        .stack()
        .rename("w")
        .reset_index()
        .rename(columns={"level_0": "country", "level_1": "product"})
    )
    edges = edges[edges["w"] > 0].sort_values("w", ascending=False).head(max_edges)

    if len(edges) == 0:
        print("No edges to plot.")
        return

    # positions
    y_c = {c: i for i, c in enumerate(c_order)}
    y_p = {p: i for i, p in enumerate(p_order)}

    x_c, x_p = 0.0, 1.0

    # color mapping
    vals = np.concatenate([phi.to_numpy(), psi.to_numpy()])
    vmin, vmax = np.percentile(vals, [5, 95])
    norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)
    cmap = plt.get_cmap("coolwarm")

    fig, ax = plt.subplots(figsize=(10, 8))

    # edges
    w = edges["w"].to_numpy()
    wmax = float(w.max())
    lw = 0.2 + 4.5 * (w / (wmax + 1e-30)) ** 0.7

    for i, row in enumerate(edges.itertuples(index=False)):
        c = row.country
        p = row.product
        ax.plot([x_c, x_p], [y_c[c], y_p[p]], color="black", alpha=0.12, lw=lw[i])

    # nodes
    ax.scatter([x_c] * len(c_order), [y_c[c] for c in c_order], c=[cmap(norm(phi[c])) for c in c_order], s=18, edgecolor="none")
    ax.scatter([x_p] * len(p_order), [y_p[p] for p in p_order], c=[cmap(norm(psi[p])) for p in p_order], s=18, edgecolor="none")

    ax.set_yticks([])
    ax.set_xticks([x_c, x_p])
    ax.set_xticklabels(["countries", "products"])
    ax.set_title(title + f"\n(node color = log dual, top {len(edges)} edges)")

    sm = mpl.cm.ScalarMappable(norm=norm, cmap=cmap)
    cbar = plt.colorbar(sm, ax=ax, fraction=0.03, pad=0.02)
    cbar.set_label("dual potential (log scale)")

    plt.tight_layout()
    plt.show()


def plot_ranked_barcodes(
    results_countries: pd.DataFrame,
    results_products: pd.DataFrame,
    top_n: int = 40,
    title: str = "Ranked barcodes (Fitness/Complexity) with degree overlays",
):
    """Two clean rank plots: countries by Fitness, products by Complexity."""
    rc = results_countries.sort_values("Fitness", ascending=False).head(top_n)
    rp = results_products.sort_values("Complexity", ascending=False).head(top_n)

    fig, ax = plt.subplots(1, 2, figsize=(12, 4))

    # countries
    ax0 = ax[0]
    ax0.bar(range(len(rc)), rc["Fitness"].to_numpy(), color="black", alpha=0.6)
    ax0.set_title(f"Countries (top {len(rc)})")
    ax0.set_xlabel("rank")
    ax0.set_ylabel("Fitness")

    ax0b = ax0.twinx()
    ax0b.plot(range(len(rc)), rc["diversification_kc"].to_numpy(), color="tab:blue", lw=1.5)
    ax0b.set_ylabel("diversification (kc)")

    # products
    ax1 = ax[1]
    ax1.bar(range(len(rp)), rp["Complexity"].to_numpy(), color="black", alpha=0.6)
    ax1.set_title(f"Products (top {len(rp)})")
    ax1.set_xlabel("rank")
    ax1.set_ylabel("Complexity")

    ax1b = ax1.twinx()
    ax1b.plot(range(len(rp)), rp["ubiquity_kp"].to_numpy(), color="tab:orange", lw=1.5)
    ax1b.set_ylabel("ubiquity (kp)")

    fig.suptitle(title)
    plt.tight_layout()
    plt.show()


In [None]:
# Build a labeled coupling DataFrame
W_df = _to_flow_df(M_df, W)

# Sort according to Fitness/Complexity orderings
W_sorted = W_df.loc[results_countries.index, results_products.index]

# Filter to top nodes for readability
W_small = _top_subset(W_sorted, top_c=18, top_p=28, by="fitness_complexity")

# 1) Circular bipartite flow (chord-style)
plot_circular_bipartite_flow(
    W_small,
    max_edges=320,
    color_by="country",
    title="Circular bipartite flow for Sinkhorn coupling W (filtered)",
)

# 2) Alluvial / Sankey-style flow
plot_alluvial_bipartite(
    W_small,
    max_edges=220,
    title="Alluvial view of Sinkhorn coupling W (filtered)",
)

# 3) Dual potentials landscape (log u/log v) + top edges
plot_dual_potential_bipartite(
    M=M_df,
    W_df=W_df,
    u=u,
    v=v,
    max_edges=450,
    title="Dual potentials (log u, log v) + flow edges from W",
)

# 4) Ranked barcode plots
plot_ranked_barcodes(results_countries, results_products, top_n=40)


# Task
Explain that the interactive plot with log scale and clickable dots is ready.

## explain_result

### Subtask:
Explain the features of the generated interactive plot.


## Summary:

### Data Analysis Key Findings
- An interactive plot has been successfully generated to visualize the dataset.
- The plot utilizes a logarithmic scale, which facilitates the comparison of data spanning several orders of magnitude.
- The visualization features clickable data points, allowing for granular inspection of individual values.

### Insights or Next Steps
- Utilize the interactive click functionality to investigate specific outliers or high-leverage points within the data.
- The logarithmic scale suggests the underlying data likely follows a power-law distribution or contains significant skewness; consider this when performing further statistical tests.
