## Page structure similarity

In [None]:
import math
import glob
from tqdm import tqdm
from typing import List, Tuple, Dict, Any
import json
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt


In [None]:
# Input and output locations used throughout the notebook (relative paths).
WIKIPEDIA_PATH = "../grokipedia_wikipedia_articles.ndjson"  # NDJSON file, read one JSON object per line
GROKIPEDIA_GLOB = "../scraped_data/batch_*.jsonl"  # pattern expanded with glob.glob
RESULT_DIR = "../results"  # directory the result CSV is written to and read from

### Helper functions

In [None]:
def _norm(s: str) -> str:
    return " ".join((s or "").strip().lower().split())

def _level_to_int(level: str) -> int:
    # 'h1' -> 1, 'h2' -> 2, ...
    level = (level or "").lower().strip()
    if level.startswith("h") and level[1:].isdigit():
        return int(level[1:])
    return 2  # default neutral

def extract_outline_from_wikipedia(wiki_obj):
    """Build the (level, title) outline of a Wikipedia article object.

    Reads ``wiki_obj["sections"]`` (a missing or empty value gives an empty
    outline) and normalizes every section's ``name`` with ``_norm``. Sections
    whose normalized name is empty are skipped. The section named "abstract"
    is assigned level 1; all other sections are assigned level 2.
    """
    sections = wiki_obj.get("sections") or []
    names = [_norm(section.get("name")) for section in sections]
    return [(1 if name == "abstract" else 2, name) for name in names if name]

def extract_outline_from_grokipedia(g_obj: Dict[str, Any]) -> List[Tuple[int, str]]:
    """
    Build the (level, title) outline of a Grokipedia article object.

    Sections are read from ``g_obj["data"]["sections"]``; a missing or empty
    object, ``data`` or ``sections`` yields an empty outline. Each section's
    ``level`` goes through ``_level_to_int`` and its ``title`` through
    ``_norm``. Sections whose normalized title is empty are dropped; all
    others are kept in their original order, repeated levels included.
    """
    data = (g_obj or {}).get("data") or {}
    result = []
    for section in data.get("sections") or []:
        depth = _level_to_int(section.get("level"))
        heading = _norm(section.get("title"))
        if not heading:
            continue
        result.append((depth, heading))
    return result

def _lcs_len(a: List[str], b: List[str]) -> int:
    # classic O(n*m) DP LCS length
    n, m = len(a), len(b)
    dp = [0]*(m+1)
    for i in range(1, n+1):
        prev = 0
        for j in range(1, m+1):
            tmp = dp[j]
            if a[i-1] == b[j-1]:
                dp[j] = prev + 1
            else:
                dp[j] = max(dp[j], dp[j-1])
            prev = tmp
    return dp[m]

def _jaccard(a: List[str], b: List[str]) -> float:
    A, B = set(a), set(b)
    if not A and not B:
        return 1.0
    return len(A & B) / max(1, len(A | B))

def _cosine(u: Dict[int, int], v: Dict[int, int]) -> float:
    keys = set(u) | set(v)
    num = sum(u.get(k,0)*v.get(k,0) for k in keys)
    du = math.sqrt(sum((u.get(k,0))**2 for k in keys))
    dv = math.sqrt(sum((v.get(k,0))**2 for k in keys))
    if du == 0 or dv == 0:
        return 1.0 if du == dv else 0.0
    return num / (du*dv)

def _depth_hist(outline: List[Tuple[int,str]]) -> Dict[int,int]:
    h = {}
    for lvl,_ in outline:
        h[lvl] = h.get(lvl,0)+1
    return h

def _as_indent_tree(outline: List[Tuple[int,str]]) -> List[Tuple[int,str]]:
    """
    Normalize to a tree-like preorder list using heading levels (indent structure).
    We don’t compute full edit distance; we’ll compare shape via a preorder signature.
    """
    # Ensure non-decreasing by at most +1 to avoid malformed jumps (optional clamp).
    norm = []
    last = 1
    for lvl,name in outline:
        lvl = max(1, lvl)
        if lvl > last+1:
            lvl = last+1
        norm.append((lvl,name))
        last = lvl
    return norm

def _tree_signature(t: List[Tuple[int,str]]) -> List[int]:
    # Encode just the shape: level sequence deltas
    if not t: return []
    sig = [t[0][0]]
    for i in range(1,len(t)):
        sig.append(t[i][0]-t[i-1][0])
    return sig

def _overlap_ratio(a: List[int], b: List[int]) -> float:
    """Return the LCS length of two int sequences divided by the longer length.

    The ints are rendered as strings so that ``_lcs_len`` can be reused. Two
    empty sequences score 0.0 because the denominator is floored at 1.
    """
    tokens_a = [str(value) for value in a]
    tokens_b = [str(value) for value in b]
    longest = max(len(tokens_a), len(tokens_b))
    return _lcs_len(tokens_a, tokens_b) / max(1, longest)

def compare_structures(
    wiki_outline: List[Tuple[int,str]],
    grok_outline: List[Tuple[int,str]]
) -> Dict[str, float]:
    """Compute structural similarity metrics between two (level, title) outlines.

    Returned keys, in this order:
      lcs_ratio                  LCS length of the title sequences over the longer length
      jaccard                    Jaccard index of the distinct titles
      depth_cosine               cosine similarity of the heading-level histograms
      shape_similarity           LCS ratio of the clamped level-delta signatures
      wikipedia_outline_length   number of entries in wiki_outline
      grokipedia_outline_length  number of entries in grok_outline
      outline_length_diff        Grokipedia length minus Wikipedia length
      outline_length_ratio       Grokipedia length / Wikipedia length, or None
                                 when the Wikipedia outline is empty
    """
    # Title sequences, levels dropped
    w_titles = [title for _level, title in wiki_outline]
    g_titles = [title for _level, title in grok_outline]
    longest_title_count = max(len(w_titles), len(g_titles))

    # Shape signatures derived from the clamped heading levels
    w_signature = _tree_signature(_as_indent_tree(wiki_outline))
    g_signature = _tree_signature(_as_indent_tree(grok_outline))

    w_len = len(wiki_outline)
    g_len = len(grok_outline)
    if w_len > 0:
        length_ratio = g_len / w_len
    else:
        length_ratio = None

    return {
        "lcs_ratio": _lcs_len(w_titles, g_titles) / max(1, longest_title_count),
        "jaccard": _jaccard(w_titles, g_titles),
        "depth_cosine": _cosine(_depth_hist(wiki_outline), _depth_hist(grok_outline)),
        "shape_similarity": _overlap_ratio(w_signature, g_signature),
        "wikipedia_outline_length": w_len,
        "grokipedia_outline_length": g_len,
        "outline_length_diff": g_len - w_len,
        "outline_length_ratio": length_ratio,
    }

In [None]:
def stream_ndjson(path):
    """Lazily yield one parsed JSON object per non-blank line of a UTF-8 .ndjson file."""
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            if not raw.strip():
                continue  # skip blank / whitespace-only lines
            yield json.loads(raw)

def stream_grokipedia_jsonls(pattern):
    """Lazily yield parsed JSON objects from every file matching *pattern*.

    Files are visited in sorted filename order; blank lines are skipped.
    """
    matching_files = sorted(glob.glob(pattern))
    for fname in matching_files:
        with open(fname, "r", encoding="utf-8") as handle:
            yield from (json.loads(raw) for raw in handle if raw.strip())

def build_title_pointer_index(path, title_field="name", norm_func=_norm):
    """
    Index the byte offset of each titled line of an NDJSON file for later recovery.

    Args:
        path: UTF-8 NDJSON file, one JSON object per line.
        title_field: key of the object that holds the title.
        norm_func: callable applied to the title to build the index key.

    Returns:
        {normalized_title: byte offset of the start of the line}. Blank lines
        and objects with a missing/empty title are skipped; when several
        lines normalize to the same title, the last one wins.
    """
    idx = {}
    # Read in binary so that offsets are true byte positions: text mode
    # translates "\r\n" to "\n", which would make re-encoded line lengths
    # (and therefore every following offset) too small on CRLF files.
    with open(path, "rb") as f:
        pos = 0
        for raw_line in f:
            text = raw_line.decode("utf-8")
            if text.strip():
                obj = json.loads(text)
                title = obj.get(title_field, '')
                if title:
                    idx[norm_func(title)] = pos
            # Advance by the exact number of bytes this line occupies on disk
            pos += len(raw_line)

    return idx

def build_grokipedia_file_map(pattern, norm_func=_norm):
    """
    Map each normalized Grokipedia title to the file and byte offset of its line.

    Args:
        pattern: glob pattern of UTF-8 JSONL files; files are scanned in
            sorted filename order.
        norm_func: callable applied to ``data.main_title`` to build the key.

    Returns:
        {normalized_title: (filename, byte offset of the start of the line)}.
        Blank lines and objects without a ``data.main_title`` are skipped;
        when several lines normalize to the same title, the last one wins.
    """
    idx = {}
    for fname in sorted(glob.glob(pattern)):
        # Binary mode keeps offsets exact even when lines end in "\r\n"
        # (text mode would translate the newline and shrink the byte count).
        with open(fname, "rb") as f:
            pos = 0
            for raw_line in f:
                text = raw_line.decode("utf-8")
                if text.strip():
                    obj = json.loads(text)
                    # `or {}` also covers an explicit null "data" value.
                    title = (obj.get('data') or {}).get('main_title', '')
                    if title:
                        idx[norm_func(title)] = (fname, pos)
                # Advance by the exact number of bytes this line occupies on disk
                pos += len(raw_line)

    return idx

def get_object_at_offset(path, offset):
    """Parse and return the JSON object on the line starting at byte *offset*.

    Args:
        path: UTF-8 encoded NDJSON/JSONL file.
        offset: byte position of the first character of the wanted line.

    Raises:
        json.JSONDecodeError: if the line at *offset* is not valid JSON
            (including an offset at or past the end of the file).
    """
    # Binary mode: seeking a text-mode file to anything other than a value
    # returned by tell() is undefined, and the offsets used here are byte counts.
    with open(path, "rb") as f:
        f.seek(offset)
        raw_line = f.readline()
    return json.loads(raw_line.decode("utf-8"))

In [None]:
def compare_structures_on_datasets_memlite(
    wikipedia_path=WIKIPEDIA_PATH,
    grokipedia_glob=GROKIPEDIA_GLOB
):
    """
    Lazily compare the outlines of every article present in both datasets.

    Both datasets are first indexed by normalized title (title -> position in
    file), so only the index is held in memory; the two article objects of a
    matched title are re-read from disk right before they are compared.
    Titles are processed in sorted order.

    Yields one dict per matched title: the metrics returned by
    compare_structures plus a 'title' entry holding the normalized title.
    """
    # Step 1: title -> offset for the Wikipedia file
    print("Indexing Wikipedia titles...")
    wiki_idx = build_title_pointer_index(wikipedia_path, norm_func=_norm)
    # Step 2: title -> (filename, offset) across the Grokipedia batches
    print("Indexing Grokipedia titles...")
    grok_idx = build_grokipedia_file_map(grokipedia_glob, norm_func=_norm)

    shared_titles = wiki_idx.keys() & grok_idx.keys()
    print(f"Matched {len(shared_titles)} titles.")

    for title in tqdm(sorted(shared_titles), desc="Comparing structures"):
        grok_fname, grok_offset = grok_idx[title]

        # Load just this pair of articles from disk
        wiki_obj = get_object_at_offset(wikipedia_path, wiki_idx[title])
        grok_obj = get_object_at_offset(grok_fname, grok_offset)

        metrics = compare_structures(
            extract_outline_from_wikipedia(wiki_obj),
            extract_outline_from_grokipedia(grok_obj),
        )
        metrics['title'] = title
        yield metrics

In [None]:
# Stream the comparison results to CSV in chunks so that the full result set
# never has to sit in memory at once.
chunk_size = 10000
current_chunk = []

output_path = f"{RESULT_DIR}/structural_comparison.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

header_written = False

for i, result in enumerate(compare_structures_on_datasets_memlite()):
    current_chunk.append(result)
    if len(current_chunk) == chunk_size:
        df_chunk = pd.DataFrame(current_chunk)
        # The first write truncates the file ('w') so that re-running this
        # cell does not append a second header and duplicate rows to the CSV
        # left by a previous run; later chunks are appended.
        df_chunk.to_csv(
            output_path,
            mode='a' if header_written else 'w',
            header=not header_written,
            index=False,
        )
        header_written = True
        print(f"Processed and wrote chunk ending at row {i}")
        current_chunk = []

# Process any remaining results
if current_chunk:
    df_chunk = pd.DataFrame(current_chunk)
    df_chunk.to_csv(
        output_path,
        mode='a' if header_written else 'w',
        header=not header_written,
        index=False,
    )
    print("Processed and wrote final chunk.")

In [None]:
# Load the structural comparison CSV from RESULT_DIR into a DataFrame.
df = pd.read_csv(f"{RESULT_DIR}/structural_comparison.csv")

In [None]:
# Fraction of rows whose outline_length_ratio is above 1, i.e. where the
# Grokipedia outline has more entries than the Wikipedia one. Rows with a
# missing ratio never satisfy the comparison but still count in the denominator.
len(df[df['outline_length_ratio'] > 1]) / len(df)

In [None]:
RESULT_DIR = '../results'

# The 'title' column of df holds titles normalized with _norm (lower-cased,
# whitespace collapsed), so the license lists are normalized the same way to
# make the joins below line up. Blank lines are skipped and duplicate titles
# dropped so that the inner merges cannot multiply rows of df.
with open(f"{RESULT_DIR}/grokipedia_wo_license.txt", encoding="utf-8") as f:
    grokipedia_wo_license_df = pd.DataFrame(
        {"title": [_norm(line) for line in f if line.strip()]}
    ).drop_duplicates(ignore_index=True)

with open(f"{RESULT_DIR}/grokipedia_w_license.txt", encoding="utf-8") as f:
    grokipedia_w_license_df = pd.DataFrame(
        {"title": [_norm(line) for line in f if line.strip()]}
    ).drop_duplicates(ignore_index=True)

# Inner joins: keep only the compared articles that appear in each list.
df_wo_license = df.merge(grokipedia_wo_license_df, on="title")
df_w_license = df.merge(grokipedia_w_license_df, on="title")

In [None]:
# Prepare data: keep only strictly positive ratios (the comparison also drops
# missing values), since the log-spaced bins and log x-axis need values > 0.
wo_ratios = df_wo_license["outline_length_ratio"][df_wo_license["outline_length_ratio"] > 0]
w_ratios = df_w_license["outline_length_ratio"][df_w_license["outline_length_ratio"] > 0]

# Choose number of bins
n_bins = 100

# Pool both groups so that they share a single set of bin edges
all_ratios = pd.concat([wo_ratios, w_ratios])

# Log-spaced bin edges: the buckets have equal width on the log-scaled x-axis
# (not in data space)
min_ratio = all_ratios.min()
max_ratio = all_ratios.max()
edges = np.logspace(np.log10(min_ratio), np.log10(max_ratio), n_bins + 1)

plt.figure(figsize=(10, 6))
# Overlaid histograms of both groups; log=True puts the counts on a log scale
n, bins, patches = plt.hist(
    [wo_ratios, w_ratios],
    bins=edges,
    color=["tab:orange", "tab:blue"],
    label=["Without CC License", "With CC License"],
    alpha=0.7,
    histtype="stepfilled",
    log=True,
)

# Calculate medians
wo_median = np.median(wo_ratios)
w_median = np.median(w_ratios)

# Draw each median as a vertical line; the label carries the median value so
# that it shows up in the legend
wo_line = plt.axvline(
    wo_median, color="black", linestyle="--", linewidth=2, 
    label=f"No CC license median = {wo_median:.2g}"
)
w_line = plt.axvline(
    w_median, color="black", linestyle=":", linewidth=2, 
    label=f"CC license median = {w_median:.2g}"
)

plt.xscale("log")
plt.xlabel("Outline Length Ratio (log scale)", fontsize=16)
plt.ylabel("Count (log scale)", fontsize=16)
plt.title("Outline Length Ratio Distribution (With vs Without CC License)", fontsize=18)

# The legend picks up both the histogram labels and the median-line labels
plt.legend()
plt.tight_layout()
# NOTE(review): assumes the ../graphics directory already exists
plt.savefig("../graphics/outline_length_ratio_distribution.pdf")
plt.show()

## Semantic similarity

The actual embedding and scoring were done on a GPU; see `../scripts/boomhauer/*.py`.

In [None]:
# Precomputed embedding-similarity tables, loaded from parquet.
# pairwise: the cells below read its n_w and n_g columns.
# top1: the cells below read its title, similarity and wiki_chunk_id columns.
pairwise = pd.read_parquet("../results/embeddings_similarities_pairwise_stats.parquet")
top1 = pd.read_parquet("../results/embeddings_similarities_pairwise_top1_alignments.parquet")

In [None]:
# Fraction (0-1, not a percentage) of rows in `pairwise` with n_w <= n_g.
# Presumably n_w / n_g are the Wikipedia / Grokipedia chunk counts, i.e. the
# share of articles where Grokipedia has as many or more chunks — TODO confirm.
len(pairwise[pairwise['n_w'] <= pairwise['n_g']]) / len(pairwise)

In [None]:
# Re-read the license lists with spaces turned into underscores, the form in
# which they are matched against the `title` column of top1.
with open(f"{RESULT_DIR}/grokipedia_wo_license.txt", encoding="utf-8") as f:
    wo_license_titles = [line.rstrip('\n').replace(" ", "_") for line in f]
grokipedia_wo_license_df = pd.DataFrame({"title": wo_license_titles})

with open(f"{RESULT_DIR}/grokipedia_w_license.txt", encoding="utf-8") as f:
    w_license_titles = [line.rstrip('\n').replace(" ", "_") for line in f]
grokipedia_w_license_df = pd.DataFrame({"title": w_license_titles})

# Inner joins: rows of top1 whose title is on each list.
top1_w_license = grokipedia_w_license_df.merge(top1, on="title")
top1_wo_license = grokipedia_wo_license_df.merge(top1, on="title")

In [None]:
# Summary statistics of the per-title mean similarity over all rows of top1.
top1.groupby('title').agg({'similarity': 'mean'}).describe()

In [None]:
# Same summary, restricted to titles from the grokipedia_w_license list.
top1_w_license.groupby('title').agg({'similarity': 'mean'}).describe()

In [None]:
# Same summary, restricted to titles from the grokipedia_wo_license list.
top1_wo_license.groupby('title').agg({'similarity': 'mean'}).describe()

In [None]:
# Per-title mean similarity for each license group. Computed in this cell so
# that the plot does not depend on names that are only assigned further down
# the notebook (running top to bottom would otherwise raise NameError).
psim_w_license = top1_w_license[['title', 'similarity']].groupby('title').mean()
psim_wo_license = top1_wo_license[['title', 'similarity']].groupby('title').mean()

# Overlaid histograms of the per-page mean similarity of both groups.
plt.figure(figsize=(10, 6))
plt.hist(
    [psim_w_license["similarity"], psim_wo_license["similarity"]],
    bins=500,
    color=["tab:blue", "tab:orange"],
    label=["With CC License", "Without CC License"],
    alpha=0.7,
    histtype="stepfilled",
)
plt.xlabel("Similarity", fontsize=18)
plt.ylabel("Count", fontsize=18)
plt.title("Average Page Embedding Similarity Distributions:\n With vs. Without CC License", fontsize=18)
plt.legend(fontsize=16)
plt.tick_params(labelsize=14)
plt.tight_layout()
plt.savefig("../graphics/embedding_similarity_distribution.pdf")
plt.show()

In [None]:
import scipy.stats as stats
import numpy as np

def map_chunk_id_to_bucket(chunk_id):
    """Return the label of the position bucket a chunk id falls into.

    Labels are 1-based chunk positions ("1" ... "5", then ranges such as
    "6-10", up to ">400"), while the id is treated as 0-based (id 0 is
    position "1"). The id is therefore shifted by one before it is compared
    with the bucket bounds, so that e.g. id 9 (position 10) is the last
    member of "6-10" and id 10 (position 11) the first of "11-20".

    Assumes a non-negative integer id; anything beyond position 400 (and any
    value that compares false against every bound, such as NaN) gives ">400".
    """
    position = chunk_id + 1
    # (inclusive upper bound of the bucket, label), in ascending order
    buckets = (
        (1, "1"), (2, "2"), (3, "3"), (4, "4"), (5, "5"),
        (10, "6-10"), (20, "11-20"), (30, "21-30"), (40, "31-40"), (50, "41-50"),
        (75, "51-75"), (100, "76-100"), (125, "101-125"), (150, "126-150"),
        (175, "151-175"), (200, "176-200"), (250, "201-250"), (300, "251-300"),
        (350, "301-350"), (400, "351-400"),
    )
    for upper_bound, label in buckets:
        if position <= upper_bound:
            return label
    return ">400"

# Order of buckets for plotting; must list exactly the labels returned by
# map_chunk_id_to_bucket, from the first position to the last
chunk_bucket_order = [
    "1", "2", "3", "4", "5", "6-10", "11-20", "21-30", "31-40", "41-50",
    "51-75", "76-100", "101-125", "126-150", "151-175", "176-200", "201-250", "251-300",
    "301-350", "351-400", ">400"
]

# Compute bucket labels column (adds a 'chunk_bucket' column to top1 in place)
top1['chunk_bucket'] = top1['wiki_chunk_id'].apply(map_chunk_id_to_bucket)

# Compute group statistics for each bucket
grouped = top1.groupby('chunk_bucket')["similarity"]
avg_similarity = grouped.mean()
count = grouped.count()
std = grouped.std()

# Calculate 95% confidence intervals using the normal approximation:
# mean +/- z * (std / sqrt(n))
alpha = 0.05
z_score = stats.norm.ppf(1 - alpha/2)
sem = std / np.sqrt(count)
ci_halfwidth = z_score * sem

ci_lower = avg_similarity - ci_halfwidth
ci_upper = avg_similarity + ci_halfwidth

# Prepare DataFrame for plotting and reindex for plotting order
# (buckets without any rows become NaN rows)
plot_df = pd.DataFrame({
    "mean": avg_similarity,
    "ci_lower": ci_lower,
    "ci_upper": ci_upper
}).reindex(chunk_bucket_order)

# Buckets are categorical, so plot against evenly spaced integer positions
x = np.arange(len(plot_df.index))

plt.figure(figsize=(10, 6))
plt.plot(x, plot_df["mean"], marker='o', label="Mean similarity")
# Shaded band between the lower and upper confidence bounds
plt.fill_between(
    x, 
    plot_df["ci_lower"], 
    plot_df["ci_upper"], 
    color="tab:blue", 
    alpha=0.2, 
    label="95% Confidence Interval"
)
plt.xlabel('Wikipedia Chunk Position', fontsize=18)
plt.ylabel('Average Similarity', fontsize=18)
plt.title('Average Similarity by Wikipedia Chunk Position\n(with 95% Confidence Intervals)', fontsize=18)
# Label the integer positions with the bucket names
plt.xticks(x, plot_df.index, rotation=45, fontsize=14)
plt.grid(True)
plt.legend(fontsize=16)
plt.tick_params(labelsize=14)
plt.tight_layout()
plt.savefig("../graphics/average_similarity_by_chunk_position.pdf")
plt.show()

In [None]:
# Titles of controversial pages, with spaces turned into underscores.
# NOTE(review): the lookup against this set lower-cases top1 titles first, so
# the entries of this file are presumably already lower-case — confirm.
with open('../results/controversial_pages_in_grokipedia.txt', encoding="utf-8") as f:
    controversial = {line.rstrip('\n').replace(" ", "_") for line in f}

# Names of US members of Congress and UK members of Parliament, taken from
# the personLabel column of each CSV and underscored the same way.
congress = pd.read_csv('../supplemental_data/wikidata_queries/us_members_of_congress.csv')
parliament = pd.read_csv('../supplemental_data/wikidata_queries/uk_members_of_parliament.csv')
congress_names = congress['personLabel'].str.replace(" ", "_")
parliament_names = parliament['personLabel'].str.replace(" ", "_")
mps = set(congress_names).union(parliament_names)


In [None]:
# Per-title mean similarity for each group of pages (index: title,
# single column: similarity).
psim_w_license = top1_w_license.groupby('title')[['similarity']].mean()
psim_wo_license = top1_wo_license.groupby('title')[['similarity']].mean()

# Controversial pages are matched on the lower-cased title, MPs on the exact title.
is_controversial = top1['title'].str.lower().isin(controversial)
psim_controversial = top1[is_controversial].groupby('title')[['similarity']].mean()

is_mp = top1['title'].isin(mps)
psim_mps = top1[is_mp].groupby('title')[['similarity']].mean()

In [None]:
# Summary statistics of the per-title mean similarity for the MP / Congress pages.
psim_mps.similarity.describe()

In [None]:
# The 50 controversial pages with the lowest per-title mean similarity.
psim_controversial.sort_values(by='similarity', ascending=True)[:50]

In [None]:
# w license high similarity: Sono_Sachiko, Aubrey_de_Sélincourt, Korba_Super_Thermal_Power_Station
# w license medium similarity: (Sittin'_On)_The_Dock_of_the_Bay, Lycorine, Stayman_convention
# w license low similarity: Duncan_Jones, MacGyver_in_popular_culture, Ellen_David

# wo license high similarity: Seliwanoff's_test, The_Sandlot_2, Mejia_Thermal_Power_Station
# wo license medium similarity: Capital_punishment_in_the_Soviet_Union, Grand_Junction,_Colorado, James_Jacobus_Roosevelt
# wo license low similarity: Henry_V, Potato_famine, Soylent

# controversial high similarity: Christopher_Paul_Neil, Historicity_of_Jesus, Number_of_the_Beast
# controversial medium similarity: Dylann_Roof, Criticism_of_Judaism, Presidency_of_George_W._Bush	
# controversial low similarity: Criticism_of_the_United_States_government, Racism_in_the_United_States, Media_bias_in_the_United_States

# politician (US Congress / UK Parliament) high similarity: David_Scott, Tom_Collins, Chris_Deluzio
# politician medium similarity: Chris_Murphy, Bill_Cassidy, Alistair_Strathern
# politician low similarity: Chris_Pappas, Adrian_Smith, Nigel_Farage, Keir_Starmer, Pramila_Jayapal