<a href="https://colab.research.google.com/github/lawrennd/fitkit/blob/main/examples/nested_matrix_fitness_complexity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Nested Matrix: Fitness / Complexity Analysis

This notebook demonstrates fitness-complexity analysis on a **perfectly nested binary matrix**.

**Hypothesis**: For nested data (countries with more products export all products that countries with fewer products export, plus additional ones), we expect **ECI to correlate very highly with diversification** (r > 0.9) since this is the use case ECI was designed for. We also expect **moderate ECI-Fitness correlation** (r ≈ 0.6-0.7).

**What is a nested matrix?** In a perfectly nested structure:
- Row i includes all products from rows 0 to i-1, plus one additional product
- This creates a triangular/hierarchical pattern
- Common in trade data: sophisticated countries export everything simple countries export, plus more

The notebook:
- Generates a nested sparse binary matrix (users × words, standing in for countries × products)
- Applies Fitness-Complexity, ECI, and Sinkhorn scaling
- Visualizes the correlation between ECI and Fitness
- Compares convergence and diagnostic plots

### Setup

Install fitkit if needed.

In [None]:
import sys
import subprocess
from pathlib import Path


def _pip_install(args: list[str]) -> None:
    """Run ``python -m pip <args>`` using the interpreter of this kernel.

    The full command line is echoed before execution.
    ``subprocess.check_call`` raises ``CalledProcessError`` when pip exits
    with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip"] + list(args)
    print("Running:", " ".join(pip_command))
    subprocess.check_call(pip_command)


def ensure_fitkit_installed() -> None:
    """Make sure ``fitkit`` is importable, installing it on demand.

    Resolution order:

    1. If ``import fitkit`` already succeeds, nothing is done.
    2. Otherwise the current working directory, its parent and its
       grandparent are searched for a local checkout (a directory holding
       both ``pyproject.toml`` and a ``fitkit/`` directory); the first match
       is installed in editable mode (``pip install -e <root>``).
    3. If no checkout is found, the package is installed from GitHub
       (``pip install git+https://github.com/lawrennd/fitkit.git``).

    After an install the import system's finder caches are invalidated so
    that a following ``import fitkit`` in the same kernel can see the newly
    installed files.

    Raises:
        subprocess.CalledProcessError: if the pip invocation fails.
    """
    import importlib

    try:
        import fitkit  # noqa: F401

        return
    except ImportError:
        pass

    here = Path.cwd().resolve()
    candidates = [here, here.parent, here.parent.parent]

    for root in candidates:
        if (root / "pyproject.toml").exists() and (root / "fitkit").is_dir():
            _pip_install(["install", "-e", str(root)])
            break
    else:
        # No local checkout found anywhere: fall back to the GitHub source.
        _pip_install(["install", "git+https://github.com/lawrennd/fitkit.git"])

    # Files were added to the environment while the interpreter is running;
    # drop stale directory caches so the next import looks at disk again.
    # NOTE(review): some editable-install layouts are only activated at
    # interpreter start-up; a kernel restart may still be needed there.
    importlib.invalidate_caches()


# Install on demand first: the import below has to come after this call so
# that a fresh environment without fitkit does not fail at import time.
ensure_fitkit_installed()
import fitkit

# Falls back to "unknown" when the package does not expose __version__.
print("fitkit version:", getattr(fitkit, "__version__", "unknown"))

In [None]:
from fitkit.algorithms import FitnessComplexity, ECI, SinkhornScaler

# Core
import numpy as np
import pandas as pd

# Sparse matrices
import scipy.sparse as sp

# Plotting
import matplotlib.pyplot as plt

### Generate Nested Binary Matrix

We create a perfectly nested sparse binary matrix representing a user × word incidence matrix.
- `n_users`: number of users (rows)
- `n_words`: number of words (columns)
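- User `i` uses the first `min(i + base_words, n_words)` words (columns 0 through `i + base_words - 1`), so each row contains every earlier row plus one new word, giving perfect nesting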

In [None]:
# Seed NumPy's global RNG. The construction below is fully deterministic; the
# seed only matters if randomness (e.g. a shuffled ordering) is added later.
np.random.seed(42)

# Matrix dimensions
n_users = 200
n_words = 300
base_words = 5  # Number of words used by the first (least diversified) user

# Perfectly nested incidence matrix, built by broadcasting instead of a loop:
# user i uses the first min(i + base_words, n_words) words, i.e. columns
# 0 .. min(i + base_words, n_words) - 1, so every row contains the previous one.
# NOTE(review): with these settings the largest row covers 204 of the 300
# columns, so the remaining columns are all-zero; confirm this is intended
# and that the downstream algorithms tolerate empty columns.
words_per_user = np.minimum(np.arange(n_users) + base_words, n_words)
nested_mask = np.arange(n_words)[np.newaxis, :] < words_per_user[:, np.newaxis]
M = sp.csr_matrix(nested_mask.astype(float))

# Row / column labels
user_ids = [f"user_{idx:03d}" for idx in range(n_users)]
vocab = [f"word_{idx:03d}" for idx in range(n_words)]

# Quick sanity report on what was generated
row_totals = M.sum(axis=1)
print(f"Generated nested matrix: {n_users} users × {n_words} words")
print(f"Matrix shape: {M.shape}, dtype: {M.dtype}")
print(f"Density: {M.nnz / (M.shape[0] * M.shape[1]):.2%}")
print(f"Total non-zero entries: {M.nnz}")
print(f"Diversification range: [{row_totals.min():.0f}, {row_totals.max():.0f}]")

In [None]:
# Margins of the binary matrix: row sums give each user's diversification
# (number of words used), column sums give each word's ubiquity (number of
# users using it).
user_strength = np.asarray(M.sum(axis=1)).reshape(-1)
word_strength = np.asarray(M.sum(axis=0)).reshape(-1)

margin_summaries = (
    ("User diversification (# words per user):", user_strength),
    ("\nWord ubiquity (# users per word):", word_strength),
)
for heading, strengths in margin_summaries:
    print(heading)
    print(pd.Series(strengths).describe())

# Sparse-backed DataFrame carrying the user / word labels, used later for
# label-based reordering of rows and columns.
M_df = pd.DataFrame.sparse.from_spmatrix(M, index=user_ids, columns=vocab)

### Apply Algorithms

We compute:
1. **Fitness-Complexity** (FC): Nonlinear rank-1 fixed point
2. **ECI/PCI**: Economic Complexity Index via eigenvalue decomposition
3. **Sinkhorn scaling**: IPF/OT coupling on the support

In [None]:
# --- 1) Fitness-Complexity ------------------------------------------------
fc = FitnessComplexity()
F, Q = fc.fit_transform(M)
fc_hist = fc.history_

# --- 2) ECI / PCI -----------------------------------------------------------
eci_model = ECI()
eci, pci = eci_model.fit_transform(M)

# Attach labels: user-level scores are indexed by user id, word-level scores
# by vocabulary entry.
F_s = pd.Series(F, index=user_ids, name="Fitness")
eci_s = pd.Series(eci, index=user_ids, name="ECI")
Q_s = pd.Series(Q, index=vocab, name="Complexity")
pci_s = pd.Series(pci, index=vocab, name="PCI")

# Degree margins of the binary matrix (row sums / column sums).
kc = pd.Series(np.asarray(M.sum(axis=1)).ravel(), index=user_ids, name="diversification_kc")
kp = pd.Series(np.asarray(M.sum(axis=0)).ravel(), index=vocab, name="ubiquity_kp")

# --- 3) Sinkhorn scaling ----------------------------------------------------
# First attempt: uniform target marginals, each normalised to sum to one.
r_uniform = np.ones(M.shape[0], dtype=float)
r_uniform /= r_uniform.sum()
c_uniform = np.ones(M.shape[1], dtype=float)
c_uniform /= c_uniform.sum()

scaler = SinkhornScaler()
W = scaler.fit_transform(M, row_marginals=r_uniform, col_marginals=c_uniform)
u, v, sk_hist = scaler.u_, scaler.v_, scaler.history_

# Second attempt, only if the first did not report convergence: use the
# normalised degree margins of M as targets instead.
if not sk_hist.get("converged", False):
    print("Sinkhorn with uniform marginals did not converge; falling back to degree marginals.")
    r_deg = kc.to_numpy(dtype=float)
    r_deg = r_deg / r_deg.sum()
    c_deg = kp.to_numpy(dtype=float)
    c_deg = c_deg / c_deg.sum()
    scaler = SinkhornScaler()
    W = scaler.fit_transform(M, row_marginals=r_deg, col_marginals=c_deg)
    u, v, sk_hist = scaler.u_, scaler.v_, scaler.history_

# --- Result tables, best-scoring entries first ------------------------------
results_users = pd.concat(
    [F_s, eci_s, kc],
    axis=1,
).sort_values(by="Fitness", ascending=False)
results_words = pd.concat(
    [Q_s, pci_s, kp],
    axis=1,
).sort_values(by="Complexity", ascending=False)

print("\nTop 10 users by Fitness:")
print(results_users.head(10))
print("\nTop 10 words by Complexity:")
print(results_words.head(10))

### Convergence Diagnostics

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 3))

# One entry per panel: (axes, history mapping, [(history key, legend label)],
# y-axis label, title). Left panel: Fitness-Complexity; right panel: Sinkhorn.
convergence_panels = [
    (
        ax[0],
        fc.history_,
        [("dF", "max |ΔF|"), ("dQ", "max |ΔQ|")],
        "Change",
        "Fitness-Complexity Convergence",
    ),
    (
        ax[1],
        scaler.history_,
        [("dr", "max row marginal error"), ("dc", "max col marginal error")],
        "Error",
        "Sinkhorn/IPF Convergence",
    ),
]

for panel, history, traces, y_label, panel_title in convergence_panels:
    for history_key, legend_label in traces:
        panel.plot(history[history_key], label=legend_label)
    panel.set_yscale("log")
    panel.set_xlabel("Iteration")
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### Fitness vs ECI Correlation

**Key result**: For nested matrices, we expect:
- **Very high correlation** between ECI and diversification (r > 0.9) - this is ECI's design use case
- **Moderate correlation** between ECI and Fitness (r ≈ 0.6-0.7)
- ECI should work excellently on this structured data

In [None]:
# Correlation between the two user-level scores
correlation = results_users["Fitness"].corr(results_users["ECI"])
print(f"Pearson correlation between Fitness and ECI: {correlation:.4f}")

# The same scatter is drawn twice: first with a linear Fitness axis, then
# with a logarithmic one.
for use_log_axis in (False, True):
    scale_suffix = " (log scale)" if use_log_axis else ""

    plt.figure(figsize=(8, 6))
    plt.scatter(
        results_users["ECI"],
        results_users["Fitness"],
        s=30,
        alpha=0.6,
        edgecolors='k',
        linewidths=0.5,
    )
    plt.xlabel("ECI (standardized)", fontsize=12)
    plt.ylabel(f"Fitness{scale_suffix}", fontsize=12)
    plt.title(
        f"Nested Matrix: Fitness vs ECI{scale_suffix}\n(Correlation: {correlation:.4f})",
        fontsize=13,
    )
    if use_log_axis:
        plt.yscale('log')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

### Matrix Structure Visualization

Visualize the binary matrix sorted by Fitness (rows) and Complexity (columns). Because the data are perfectly nested, the sorted matrix should show a clear triangular (staircase) pattern rather than random or modular structure.

In [None]:
# Reorder rows by descending Fitness and columns by descending Complexity,
# reusing the ordering of the already-sorted result tables.
M_sorted = M_df.loc[results_users.index, results_words.index]
sorted_dense = M_sorted.sparse.to_dense().to_numpy()

matrix_fig, matrix_ax = plt.subplots(figsize=(12, 6))
matrix_image = matrix_ax.imshow(
    sorted_dense,
    aspect="auto",
    interpolation="nearest",
    cmap="Greys",
)
matrix_fig.colorbar(matrix_image, ax=matrix_ax, label="Presence (binary)")
matrix_ax.set_title("Nested Matrix sorted by Fitness (rows) and Complexity (cols)\nNote the triangular (nested) pattern")
matrix_ax.set_xlabel("Words (sorted by Complexity)")
matrix_ax.set_ylabel("Users (sorted by Fitness)")
matrix_fig.tight_layout()
plt.show()

### Distribution Comparisons

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# (score name, values, bar colour); None keeps matplotlib's default colour.
# Panels are filled row by row: Fitness, ECI on top; Complexity, PCI below.
histogram_specs = [
    ("Fitness", F, None),
    ("ECI", eci, 'orange'),
    ("Complexity", Q, 'green'),
    ("PCI", pci, 'red'),
]

for panel, (score_name, score_values, bar_color) in zip(axes.flat, histogram_specs):
    panel.hist(score_values, bins=30, edgecolor='black', alpha=0.7, color=bar_color)
    panel.set_xlabel(score_name)
    panel.set_ylabel("Count")
    panel.set_title(f"{score_name} Distribution")
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### Comparison with Diversification

Compare Fitness and ECI with simple diversification (number of words per user).

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (column of results_users to plot against diversification, extra marker
# styling). The Fitness panel keeps the default colour; ECI is drawn orange.
diversification_panels = [
    ("Fitness", {}),
    ("ECI", {"color": 'orange'}),
]

for panel, (score_name, extra_style) in zip(axes, diversification_panels):
    panel.scatter(
        results_users["diversification_kc"],
        results_users[score_name],
        s=30,
        alpha=0.6,
        edgecolors='k',
        linewidths=0.5,
        **extra_style,
    )
    panel.set_xlabel("Diversification (# words)")
    panel.set_ylabel(score_name)
    panel.set_title(f"{score_name} vs Diversification")
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### Summary Statistics

In [None]:
# Final report: matrix facts, solver convergence, and the correlations the
# notebook's hypothesis is about.

# All correlations are computed here, from the result tables, so the summary
# does not depend on a value left behind by an earlier plotting cell.
corr_eci_div = results_users["ECI"].corr(results_users["diversification_kc"])
corr_fit_div = results_users["Fitness"].corr(results_users["diversification_kc"])
corr_fit_eci = results_users["Fitness"].corr(results_users["ECI"])
corr_q_pci = results_words["Complexity"].corr(results_words["PCI"])

# Pad labels to a width that actually exceeds the longest label (48 chars);
# a narrower width leaves the value column ragged.
label_width = 50

# Only show a check mark when the measured value really meets the stated
# expectation (a NaN correlation compares False and is reported as a miss).
eci_div_mark = "✓" if corr_eci_div > 0.9 else "✗"

rule = "=" * 60
print(rule)
print("SUMMARY STATISTICS")
print(rule)
print(f"\nMatrix size: {n_users} users × {n_words} words")
print("Matrix type: PERFECTLY NESTED")
print(f"Density: {M.nnz / (M.shape[0] * M.shape[1]):.2%}")
print(f"\nFitness-Complexity converged: {fc_hist.get('converged', False)}")
print(f"FC iterations: {len(fc_hist['dF'])}")
print(f"\nSinkhorn converged: {sk_hist.get('converged', False)}")
print(f"Sinkhorn iterations: {len(sk_hist['dr'])}")
print(f"\n{'Correlation between ECI and Diversification:':<{label_width}} {corr_eci_div:.4f}")
print(f"{'Correlation between Fitness and Diversification:':<{label_width}} {corr_fit_div:.4f}")
print(f"{'Correlation between Fitness and ECI:':<{label_width}} {corr_fit_eci:.4f}")
print(f"{'Correlation between Complexity and PCI:':<{label_width}} {corr_q_pci:.4f}")
print("\n" + rule)
print("For NESTED matrices (ECI's design use case):")
print(f"- ECI ↔ Diversification should be VERY HIGH (r > 0.9) {eci_div_mark}")
print("- ECI ↔ Fitness should be MODERATE (r ≈ 0.6-0.7)")
print("- This structure is where ECI performs optimally!")
print(rule)