<a href="https://colab.research.google.com/github/lawrennd/economic-fitness/blob/main/reddit_user_word_complexity_2d.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Reddit user × word data: “fitness / complexity” analysis

This notebook:

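- Downloads a sample of authors from BigQuery (the query cell below currently reads Wikipedia edit comments from `bigquery-public-data.samples.wikipedia`) and aggregates their comments into per-user text.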
- Builds a `user` $\times$ `word` matrix and its *support* (analogous to `country` $\times$ `product`).


### BigQuery Setup Instructions

To run the BigQuery query cells in this notebook, you need to have a Google Cloud Project with the BigQuery API enabled and proper authentication setup.

Here's a general guide:

1.  **Google Cloud Account**: If you don't have one, sign up for a Google Cloud account. You might be eligible for a free trial.
    *   [Sign up for Google Cloud](https://cloud.google.com/free)

2.  **Create/Select a Project**: In the [Google Cloud Console](https://console.cloud.google.com/), create a new project or select an existing one.
    *   Ensure that **billing is enabled** for your project, as BigQuery usage incurs costs (though often minimal for small queries, especially with the free tier).

3.  **Enable the BigQuery API**: For your selected project, ensure the BigQuery API is enabled.
    *   Go to the [API Library](https://console.cloud.google.com/apis/library) in the Cloud Console.
    *   Search for "BigQuery API" and enable it if it's not already enabled.

4.  **Authenticate Colab**: In your Colab environment, the `google.colab.auth.authenticate_user()` function (called in Cell 3) will handle the authentication process by prompting you to log in with your Google account. This provides the necessary credentials for BigQuery access.

    Alternatively, if you are working locally or need specific application-default credentials, you might use the `gcloud` CLI:
    ```bash
    gcloud auth application-default login
    ```

Once these steps are complete, you should be able to run the BigQuery cells successfully!

### 0) Setup

You’ll need BigQuery credentials configured locally (e.g. `gcloud auth application-default login`) and permission to query the public table `bigquery-public-data.samples.wikipedia`, which is the table the query cell below reads.

If you don’t have BigQuery access, you can still run the later cells by loading a cached dataframe (see the caching cell below).


In [None]:
# Core
import os
from dataclasses import dataclass, field
from typing import Tuple

import numpy as np
import pandas as pd

# Sparse matrices
import scipy.sparse as sp

# Text features
from sklearn.feature_extraction.text import CountVectorizer

# Plotting
import matplotlib.pyplot as plt


In [None]:
@dataclass(frozen=True)
class QueryConfig:
    """Immutable settings for sampling authors and their comments.

    Instances are frozen, so a different configuration is obtained by
    constructing a new ``QueryConfig`` rather than mutating an existing one.
    """

    # Config for Wikipedia query
    # Upper bound on the number of sampled authors.
    max_authors: int = 1000
    # Authors with fewer comments than this are not sampled.
    min_comments_per_author: int = 20
    # Cap on how many comments are concatenated into one author's text.
    max_docs_per_author: int = 2000
    # Optional: specific users to prioritize/include if they meet criteria.
    # Variable-length tuple of names (``Tuple[str]`` would mean "exactly one").
    # NOTE(review): not referenced by the query visible in this notebook.
    specific_users: Tuple[str, ...] = field(default_factory=tuple)

    # Legacy/Unused for Wikipedia
    target_subreddit: str = "datascience"
    start_suffix: str = "2015_01"
    end_suffix: str = "2015_03"
    max_rows: int = 200_000


# Standard random sampling (no specific users)
# All defaults of QueryConfig are used; later cells read this `cfg` object.
cfg = QueryConfig()

# Directory for cached query results; created up front so saving cannot fail
# on a missing folder (exist_ok=True makes re-running this cell harmless).
CACHE_DIR = "data"
os.makedirs(CACHE_DIR, exist_ok=True)

# Updated cache path for Wikipedia data (v4 - random sample)
# The author cap is part of the file name, so changing cfg.max_authors
# points at a different cache file.
CACHE_PATH = os.path.join(
    CACHE_DIR,
    f"wikipedia_authors{cfg.max_authors}_v4.parquet",
)

print("Cache path:", CACHE_PATH)

In [None]:
import os
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.colab import auth
import subprocess


In [None]:
def generate_synthetic_data(cfg):
    """Build a reproducible stand-in dataframe for when BigQuery is unavailable.

    One row is produced per synthetic author ``user_0 .. user_{max_authors-1}``
    with columns ``author``, ``user_text`` (space-joined words drawn from a
    small fixed vocabulary) and ``n_comments``. The generator is seeded with a
    constant, so the same ``cfg`` always yields the same frame.

    ``cfg`` only needs ``max_authors`` and ``min_comments_per_author``.
    """
    print("Generating synthetic Wikipedia-like data...")
    generator = np.random.default_rng(24)

    # A small vocabulary
    word_pool = [
        "data", "science", "python", "learning", "machine", "big", "query",
        "analysis", "matrix", "complexity", "fitness", "network", "plot",
        "code", "algorithm", "statistics", "neural", "deep", "model",
        "optimization", "linear", "algebra", "visualization", "mining",
        "edit", "wiki", "page", "revision", "history", "link",
    ]

    def make_row(position):
        # Draw order matters for reproducibility: text length, then the
        # words, then the comment count.
        length = generator.integers(50, 500)
        text = " ".join(generator.choice(word_pool, size=length))
        comment_count = generator.integers(cfg.min_comments_per_author, 1000)
        return {
            "author": f"user_{position}",
            "user_text": text,
            "n_comments": comment_count,
        }

    return pd.DataFrame([make_row(i) for i in range(cfg.max_authors)])

def _find_bigquery_client():
    """Return a BigQuery client for the first listed project that can run a query.

    Project ids are read from ``gcloud projects list`` and each one is probed
    with ``SELECT 1``; projects whose probe fails are reported and skipped.

    Raises:
        RuntimeError: if no listed project could run the probe query.
    """
    proc = subprocess.run(
        ['gcloud', 'projects', 'list', '--format=value(projectId)'],
        capture_output=True,
        text=True,
    )

    for pid in proc.stdout.strip().split('\n'):
        if not pid:
            continue
        try:
            print(f"Trying project: {pid}...")
            candidate = bigquery.Client(project=pid)
            candidate.query("SELECT 1").result()
        except Exception as e:
            # Best-effort probing: any failure just means "try the next one".
            print(f"   Skipping {pid}: {e}")
        else:
            print(f"-> Success! Using project: {pid}")
            return candidate

    raise RuntimeError("Could not find any project with BigQuery enabled.")


def load_or_query_wikipedia(cfg: QueryConfig, cache_path: str) -> pd.DataFrame:
    """Return one row per author with aggregated comment text.

    Resolution order:
      1. If ``cache_path`` exists, the parquet file is loaded and returned.
      2. Otherwise BigQuery is queried; on success the result is written to
         ``cache_path`` and returned.
      3. If anything in step 2 fails, synthetic data is returned instead.

    The synthetic fallback is deliberately NOT written to ``cache_path``:
    caching it would make every later run load fake data as if it were the
    real query result. The synthetic generator uses a fixed seed, so
    regenerating it on each run yields the same frame.

    Args:
        cfg: sampling limits, bound as query parameters.
        cache_path: parquet file used to cache the real query result.

    Returns:
        DataFrame with columns ``author``, ``user_text`` and ``n_comments``.
    """
    if os.path.exists(cache_path):
        print("Loading cached dataframe…")
        return pd.read_parquet(cache_path)

    print("No cache found. Querying BigQuery (Wikipedia)…")

    try:
        auth.authenticate_user()

        # Automate project selection
        print("Searching for a valid BigQuery project...")
        try:
            client = _find_bigquery_client()
        except Exception as e:
            print(f"Automatic project setup failed: {e}")
            raise

        # Query Wikipedia samples
        # Clean comments: remove /* Section */ markers and HTML tags
        # Filter out bots: exclude usernames ending in "bot"
        QUERY = """
    WITH edits AS (
      SELECT
        contributor_username AS author,
        REGEXP_REPLACE(comment, r'/\\*.*?\\*/|<[^>]+>', '') AS body
      FROM `bigquery-public-data.samples.wikipedia`
      WHERE
        contributor_username IS NOT NULL
        AND comment IS NOT NULL
        AND NOT REGEXP_CONTAINS(LOWER(contributor_username), r'bot$')
    ),
    valid_edits AS (
      SELECT author, body
      FROM edits
      WHERE LENGTH(TRIM(body)) > 10  -- Filter out empty or very short comments after cleaning
    ),
    sampled_authors AS (
      SELECT author
      FROM valid_edits
      GROUP BY author
      HAVING COUNT(*) >= @min_comments_per_author
      ORDER BY RAND()
      LIMIT @max_authors
    )
    SELECT
      author,
      ARRAY_TO_STRING(ARRAY_AGG(body ORDER BY LENGTH(body) DESC LIMIT @max_docs_per_author), '\\n') AS user_text,
      COUNT(*) AS n_comments
    FROM valid_edits
    JOIN sampled_authors
    USING (author)
    GROUP BY author
    ORDER BY n_comments DESC
    """

        job_config = bigquery.QueryJobConfig(
            query_parameters=[
                bigquery.ScalarQueryParameter("max_authors", "INT64", cfg.max_authors),
                bigquery.ScalarQueryParameter("min_comments_per_author", "INT64", cfg.min_comments_per_author),
                bigquery.ScalarQueryParameter("max_docs_per_author", "INT64", cfg.max_docs_per_author),
            ]
        )

        df = client.query(QUERY, job_config=job_config).to_dataframe()

    except Exception as e:
        # Top-level boundary for the tutorial: report the failure and keep
        # the notebook runnable with stand-in data.
        print(f"\nBigQuery query failed: {e}")
        print("Falling back to synthetic data so you can continue the tutorial.")
        return generate_synthetic_data(cfg)

    # Only a real query result reaches this point, so only real data is cached.
    print("Saving cache…")
    df.to_parquet(cache_path, index=False)
    return df



In [None]:
# Load the per-author dataframe and print a quick summary.
# NOTE(review): nothing in this cell changes CACHE_PATH; if a file already
# exists at that path it is loaded as-is. To force a fresh query, delete the
# file or change the path where CACHE_PATH is defined.
print(f"Using cache path: {CACHE_PATH}")

df = load_or_query_wikipedia(cfg, CACHE_PATH)
print(df.head())
print("N users:", len(df))
print(df["n_comments"].describe())

In [None]:
# 2) Build user × word matrix
#
# In the paper's language, we will treat the *support* as M_{uw} = 1{X_{uw} > 0}.
# For the rank-2 extension to be identifiable, it's helpful to keep *counts* (not just binary).
# If you prefer pure support-only (presence/absence), set BINARY=True.
#
# Outputs of this cell used later: X (counts), M (0/1 support with the same
# sparsity pattern), user_ids (row labels), vocab (column labels), and the
# row/column sums user_strength / word_strength.

BINARY = False

# Each row of df is one user, so "document frequency" below means
# "number of users whose text contains the word".
vectorizer = CountVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=3,          # ignore words used by <3 users
    max_features=5000, # keep it demo-friendly
    binary=BINARY,
)

# df is loaded from the previous cell
# Missing texts are treated as empty strings so they vectorize to all-zero rows.
X = vectorizer.fit_transform(df["user_text"].fillna(""))
X = X.astype(np.float64)

vocab = vectorizer.get_feature_names_out()
user_ids = df["author"].astype(str).tolist()

print(f"Raw Matrix -> Users: {X.shape[0]}, Vocab: {X.shape[1]}, binary: {BINARY}")

# Support mask (structural zeros off-support)
# Copy X's sparsity pattern and overwrite every stored value with 1.
M = X.copy()
M.data = np.ones_like(M.data)

# Basic margins (analogues of diversification and ubiquity)
# Row/column sums of X, flattened to 1-D arrays.
user_strength = np.asarray(X.sum(axis=1)).ravel()
word_strength = np.asarray(X.sum(axis=0)).ravel()

print("User strength:", pd.Series(user_strength).describe())
print("Word strength:", pd.Series(word_strength).describe())

# Filter out degenerate rows/cols (helps numerics)
min_user_mass = 5
min_word_mass = 5

# Both masks are computed from the unfiltered sums and applied together, so
# after dropping words a kept user's recomputed strength can fall below
# min_user_mass (the filter is applied once, not iterated).
keep_users = user_strength >= min_user_mass
keep_words = word_strength >= min_word_mass

X = X[keep_users][:, keep_words]
M = M[keep_users][:, keep_words]

# Keep the labels aligned with the filtered rows and columns.
user_ids = [u for u, ok in zip(user_ids, keep_users) if ok]
vocab = vocab[keep_words]

# Recompute the margins on the filtered matrix.
user_strength = np.asarray(X.sum(axis=1)).ravel()
word_strength = np.asarray(X.sum(axis=0)).ravel()

print(f"After filtering -> Users: {X.shape[0]}, Vocab: {X.shape[1]}")

### 3) Baseline: 1D Pietronero Fitness–Complexity fixed point

This is the usual nonlinear rank-1 fixed point on the **support matrix** $M$ (binary incidence). We’ll compute it as a scalar reference, then move to the rank-2 extension.


In [None]:
def fitness_complexity(M_bin: sp.spmatrix, n_iter: int = 200, tol: float = 1e-10):
    """Compute Fitness–Complexity fixed point on binary incidence matrix M.

    Each iteration updates, in this order:
      F <- M @ Q, floored at 1e-12, then divided by its mean;
      Q <- 1 / (M.T @ (1 / F)) using the just-updated F, with the denominator
           floored at 1e-12, then divided by its mean.

    M_bin: scipy sparse matrix (n_users × n_words), entries in {0,1}
    n_iter: maximum number of iterations
    tol: stop once the largest absolute change in both F and Q is below this

    Returns:
      F (n_users,), Q (n_words,), history
      where history = {"dF": [...], "dQ": [...]} holds, per iteration, the
      largest absolute change of F and of Q relative to the previous iterate.
    """
    n_users, n_words = M_bin.shape
    F = np.ones(n_users, dtype=float)
    Q = np.ones(n_words, dtype=float)

    M_csr = M_bin.tocsr()
    M_t = M_bin.tocsc().T  # words × users, built once outside the loop

    history = {"dF": [], "dQ": []}

    for it in range(n_iter):
        F_new = M_csr @ Q
        F_new = np.maximum(F_new, 1e-12)
        F_new = F_new / F_new.mean()

        denom = M_t @ (1.0 / F_new)
        denom = np.maximum(denom, 1e-12)
        Q_new = 1.0 / denom
        Q_new = Q_new / Q_new.mean()

        # Measure the step BEFORE overwriting F and Q; computing it after the
        # assignment would compare each vector with itself and record zeros.
        dF = float(np.max(np.abs(F_new - F)))
        dQ = float(np.max(np.abs(Q_new - Q)))
        history["dF"].append(dF)
        history["dQ"].append(dQ)

        F, Q = F_new, Q_new

        if max(dF, dQ) < tol:
            print(f"Converged in {it+1} iterations")
            break

    return F, Q, history

def compute_eci_pci(M_bin: sp.spmatrix) -> tuple[np.ndarray, np.ndarray]:
    """Compute ECI/PCI from binary matrix using the standard spectral formulation.

    Rows play the role of "countries" and columns the role of "products".

    Returns:
      eci: 1-D array, one standardized score per row with non-zero degree
      pci: 1-D array, one standardized score per column with non-zero degree
      Zero-degree rows/columns are dropped, so the outputs can be shorter than
      the input dimensions; entries follow the original order of the kept
      rows/columns.

    Raises:
      ValueError: if fewer than two rows remain after dropping empty ones.

    Notes:
    - The row-row matrix is C = Dc^-1 M Dp^-1 M^T (kc, kp are the degrees).
      C is row-stochastic but NOT symmetric, so a symmetric eigensolver must
      not be applied to it directly. Instead we decompose the similar
      symmetric matrix S = Dc^-1/2 M Dp^-1 M^T Dc^-1/2 and map eigenvectors
      back with v = Dc^-1/2 y.
    - The trivial eigenvector corresponds to eigenvalue 1; we use the 2nd largest.
      If the bipartite graph is disconnected, eigenvalue 1 is repeated and the
      "second" eigenvector is not unique.
    """
    Mv = np.asarray(M_bin.toarray(), dtype=float)
    kc = Mv.sum(axis=1)
    kp = Mv.sum(axis=0)

    # avoid divide-by-zero: drop zero-degree nodes if any
    keep_c = kc > 0
    keep_p = kp > 0
    Mv = Mv[keep_c][:, keep_p]
    kc = kc[keep_c]
    kp = kp[keep_p]

    if kc.size < 2:
        raise ValueError("Not enough dimensions for ECI (need at least 2 eigenvectors).")

    # B = Dc^-1/2 M Dp^-1/2, so S = B B^T; broadcasting avoids dense diagonal
    # matrices.
    inv_sqrt_kc = 1.0 / np.sqrt(kc)
    B = (Mv * inv_sqrt_kc[:, None]) / np.sqrt(kp)[None, :]
    S = B @ B.T

    # eigen-decomposition (S is symmetric by construction)
    evals, evecs = np.linalg.eigh(S)
    order = np.argsort(evals)[::-1]
    evecs = evecs[:, order]

    # Map the second eigenvector of S back to an eigenvector of C.
    eci_vec = inv_sqrt_kc * evecs[:, 1]
    # sign is arbitrary; fix by correlating with diversification (positive)
    if np.corrcoef(eci_vec, kc)[0, 1] < 0:
        eci_vec = -eci_vec

    # PCI as projection back to products: Dp^-1 M^T eci
    pci_vec = (Mv.T @ eci_vec) / kp

    # standardize for convenience
    eci = (eci_vec - eci_vec.mean()) / (eci_vec.std(ddof=0) + 1e-12)
    pci = (pci_vec - pci_vec.mean()) / (pci_vec.std(ddof=0) + 1e-12)

    return eci, pci

def sinkhorn_masked(M_bin: sp.spmatrix, r: np.ndarray, c: np.ndarray, n_iter: int = 2000, tol: float = 1e-12) -> tuple[np.ndarray, np.ndarray, np.ndarray, dict]:
    """Sinkhorn scaling on a binary mask M with kernel K = M (uniform cost on support).

    Finds u,v such that W = diag(u) K diag(v) has row sums r and col sums c.

    Requires K to have support that makes (r,c) feasible; otherwise the loop
    simply runs for ``n_iter`` iterations without meeting ``tol``.

    Args:
        M_bin: mask, either a scipy sparse matrix or a dense 2-D array.
        r: target row sums, length = number of rows.
        c: target column sums, length = number of columns.
        n_iter: maximum number of scaling iterations.
        tol: stop once both the row and column marginal errors are below this.

    Returns:
        u, v: 1-D scaling vectors.
        W: dense ndarray ``diag(u) K diag(v)`` built from the final u, v.
        history: {"dr": [...], "dc": [...]}, the per-iteration maximum
            absolute row / column marginal error.
    """
    # Work on a float CSR copy so sparse and dense inputs share one code path.
    # Elementwise `u[:, None] * K` is not a broadcast for scipy sparse
    # matrices, so everything below is expressed as matrix-vector products.
    K = sp.csr_matrix(M_bin, dtype=float)
    K_t = K.T.tocsr()
    r = np.asarray(r, dtype=float).ravel()
    c = np.asarray(c, dtype=float).ravel()

    u = np.ones(K.shape[0])
    v = np.ones(K.shape[1])

    history = {"dr": [], "dc": []}

    for _ in range(n_iter):
        u_new = r / (K @ v + 1e-30)
        col_mass = K_t @ u_new
        v_new = c / (col_mass + 1e-30)

        # Marginals of W = diag(u) K diag(v) without materialising W:
        # row sums are u * (K v), column sums are v * (K^T u).
        dr = float(np.max(np.abs(u_new * (K @ v_new) - r)))
        dc = float(np.max(np.abs(v_new * col_mass - c)))
        history["dr"].append(dr)
        history["dc"].append(dc)

        u, v = u_new, v_new
        if max(dr, dc) < tol:
            break

    # Build the scaled matrix once, from the final scalings.
    W = (sp.diags(u) @ K @ sp.diags(v)).toarray()
    return u, v, W, history


In [None]:
# Run the three scalar baselines on the support matrix M.
F, Q, fc_hist = fitness_complexity(M)
eci, pci = compute_eci_pci(M)

# Sinkhorn scaling with simple marginals
# Uniform targets: every user gets the same row mass, every word the same
# column mass, each normalised to sum to 1.
r = np.ones(M.shape[0])
r = r / r.sum()
c = np.ones(M.shape[1])
c = c / c.sum()

# Pass a dense mask: elementwise broadcasting of a vector against a scipy
# sparse *matrix* is not supported, while a dense array is always accepted.
u, v, W, sk_hist = sinkhorn_masked(M.toarray(), r=r, c=c)

# Degrees as flat 1-D arrays (sparse .sum() returns a 2-D np.matrix).
kc = np.asarray(M.sum(axis=1)).ravel()
kp = np.asarray(M.sum(axis=0)).ravel()

# compute_eci_pci drops zero-degree rows/columns, so scatter its output back
# to full length (NaN where a score is undefined) to stay aligned with F and Q.
eci_full = np.full(M.shape[0], np.nan)
eci_full[kc > 0] = eci
pci_full = np.full(M.shape[1], np.nan)
pci_full[kp > 0] = pci

# F, Q, eci and pci are plain NumPy arrays, so rows are labelled explicitly
# with the user ids / vocabulary instead of relying on a pandas index.
results_countries = pd.DataFrame(
    {
        "Fitness": F,
        "ECI": eci_full,
        "diversification_kc": kc,
    },
    index=user_ids,
).sort_values("Fitness", ascending=False)

results_products = pd.DataFrame(
    {
        "Complexity": Q,
        "PCI": pci_full,
        "ubiquity_kp": kp,
    },
    index=vocab,
).sort_values("Complexity", ascending=False)

word_scores_1d = pd.Series(Q, index=vocab).sort_values(ascending=False)
user_scores_1d = pd.Series(F, index=user_ids).sort_values(ascending=False)

print("Top 20 words by complexity:")
print(word_scores_1d.head(20))
print("Top 20 users by fitness:")
print(user_scores_1d.head(20))

In [None]:
# Show the 15 highest-fitness users as the cell's rich output
# (user_scores_1d is already sorted in descending order).
user_scores_1d.head(15)

In [None]:
# Diagnostics: convergence
# Left: per-iteration step size of the Fitness–Complexity map.
# Right: per-iteration marginal errors of the Sinkhorn scaling.
fig, ax = plt.subplots(1, 2, figsize=(10, 3))
ax[0].plot(fc_hist["dF"], label="max |ΔF|")
ax[0].plot(fc_hist["dQ"], label="max |ΔQ|")
ax[0].set_yscale("log")
ax[0].set_title("FC convergence")
ax[0].legend()

ax[1].plot(sk_hist["dr"], label="max row marginal error")
ax[1].plot(sk_hist["dc"], label="max col marginal error")
ax[1].set_yscale("log")
ax[1].set_title("Sinkhorn/IPF convergence")
ax[1].legend()

plt.tight_layout()
plt.show()

# Diagnostics: nestedness-like visualization (sort by Fitness/Complexity)
# M is a scipy sparse matrix, so it has no label-based `.loc`; reorder it
# positionally by descending Fitness (rows) and Complexity (columns) and
# densify only for plotting.
row_order = np.argsort(-np.asarray(F), kind="stable")
col_order = np.argsort(-np.asarray(Q), kind="stable")
M_sorted = M[row_order][:, col_order].toarray()
plt.figure(figsize=(10, 4))
plt.imshow(M_sorted, aspect="auto", interpolation="nearest", cmap="Greys")
plt.title("M sorted by Fitness (rows) and Complexity (cols)")
plt.xlabel("words")
plt.ylabel("users")
plt.tight_layout()
plt.show()

# Diagnostics: compare rankings
plt.figure(figsize=(5, 4))
plt.scatter(results_countries["ECI"], results_countries["Fitness"], s=15, alpha=0.7)
plt.xlabel("ECI (standardized)")
plt.ylabel("Fitness")
plt.title("Users: Fitness vs ECI")
plt.tight_layout()
plt.show()
