## pageview data

Data sources: Wikipedia pageview dumps and pageviews.wmcloud.org.

In [None]:
import pandas as pd
import bz2
import os
import json
import time
import requests
from typing import Dict, Optional
import random
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import pickle as pkl
import geopandas as gpd
import pycountry

In [None]:
# Load the cached Grokipedia title index. The file is opened in a context
# manager so the handle is closed as soon as the pickle has been read.
# NOTE: unpickling can execute arbitrary code; only load caches you produced yourself.
with open('../results/cached_grok_idx.pkl', "rb") as f:
    grok_idx = pkl.load(f)
# sep_2025_enwiki_view = 6_916_125_844 # from public API
oct_2025_enwiki_view = 7_153_203_240 # from public API

In [None]:
# Tokens that identify the optional access-type column of a pageview dump line.
_PAGEVIEW_ACCESS_TYPES = frozenset({'desktop', 'mobile-web', 'mobile-app', 'all-sites'})


def _grok_title_set(grok_idx):
    """Return the entries of ``grok_idx`` as a set, printing a short debug summary."""
    print(f"grok_idx type: {type(grok_idx)}")
    if isinstance(grok_idx, dict):
        grok_keys = set(grok_idx.keys())
        print(f"grok_idx is dict with {len(grok_keys)} keys")
        print(f"Sample grok_idx keys: {list(grok_keys)[:5]}")
    else:
        grok_keys = grok_idx if isinstance(grok_idx, set) else set(grok_idx)
        print(f"grok_idx is {type(grok_idx)} with {len(grok_keys)} items")
        print(f"Sample grok_idx items: {list(grok_keys)[:5]}")
    return grok_keys


def process_pageview_file(filepath, grok_idx):
    """Sum the daily view totals of en.wikipedia rows whose title is in ``grok_idx``.

    Each non-empty line of the bz2-compressed text file is split on single
    spaces and read in one of two layouts:

    1. ``project page_title page_id daily_total hourly_counts...``
    2. ``project page_title page_id access_type daily_total hourly_counts...``

    Titles are normalized by replacing underscores with spaces and lowercasing
    (``"Barack_Obama"`` -> ``"barack obama"``) before being looked up in
    ``grok_idx``. Rows from other projects, rows with fewer than four fields,
    rows titled ``-`` / ``null`` and rows whose view count is not an integer
    are skipped.

    Args:
        filepath: Path to the bz2-compressed pageview dump.
        grok_idx: Dict (its keys are used), set, or other iterable of
            normalized titles.

    Returns:
        ``(total_views, page_ids)`` where ``total_views`` is the summed daily
        total of all matching rows and ``page_ids`` is the set of page-id
        strings (as they appear in the file) of those rows.
    """
    grok_keys = _grok_title_set(grok_idx)

    total_views = 0
    match_count = 0
    page_ids = set()

    # Debug bookkeeping reported in the summary below.
    sample_pageview_titles = []
    sample_normalized = []
    sample_raw_lines = []
    lines_checked = 0
    en_wiki_count = 0
    other_projects = {}
    # Stays at -1 when the file has no lines, so the summary reports 0 lines
    # instead of failing on an unbound loop variable.
    line_no = -1

    with bz2.open(filepath, mode='rt', encoding='utf-8', errors='replace') as f:
        for line_no, line in enumerate(f):
            line = line.strip()
            if not line:
                continue

            # Collect first few raw lines for debugging (first 100 chars each)
            if len(sample_raw_lines) < 5:
                sample_raw_lines.append(line[:100])

            parts = line.split(' ')
            if len(parts) < 4:
                continue
            lines_checked += 1

            project = parts[0]
            if project != 'en.wikipedia':
                other_projects[project] = other_projects.get(project, 0) + 1
                continue
            en_wiki_count += 1

            original_title = parts[1]
            page_id = parts[2]
            try:
                if parts[3] in _PAGEVIEW_ACCESS_TYPES:
                    # Layout 2: the count follows the access type, if present.
                    daily_total = int(parts[4]) if len(parts) > 4 else 0
                else:
                    # Layout 1: the count is the fourth field.
                    daily_total = int(parts[3])
            except ValueError as e:
                if line_no < 100:  # Only print errors from the first lines
                    print(f"Error on line {line_no}: {e}, line content: {line[:100]}")
                continue

            # Skip entries with special titles like "-" or "null"
            if original_title in ('-', 'null') or not original_title:
                continue

            page_title = original_title.replace('_', ' ').lower()
            is_match = page_title in grok_keys

            # Collect samples for debugging
            if len(sample_pageview_titles) < 10:
                sample_pageview_titles.append(original_title)
                sample_normalized.append(page_title)
                if is_match:
                    print(f"MATCH FOUND: '{original_title}' -> '{page_title}'")

            if is_match:
                page_ids.add(page_id)
                match_count += 1
                # Add before reporting so the progress line includes this row.
                total_views += daily_total
                if match_count % 10000 == 0:
                    print(f"Matches so far: {match_count}, total views: {total_views:,}")

    # Print debugging summary
    print(f"\n=== Debugging Summary ===")
    print(f"Total lines read: {line_no + 1}")
    print(f"Lines with valid format: {lines_checked}")
    print(f"en.wikipedia lines found: {en_wiki_count}")
    if en_wiki_count == 0:
        print(f"\n⚠️  WARNING: No 'en.wikipedia' entries found in this file!")
        print(f"   The file appears to contain only: {list(other_projects.keys())[:10]}")
        print(f"   You may need to use a different pageview file that contains English Wikipedia data.")
    print(f"Other projects seen: {dict(list(other_projects.items())[:10])}")
    print(f"Total matches found: {match_count}")
    print(f"Total views: {total_views:,}")
    print(f"\nSample raw lines (first 100 chars): {sample_raw_lines[:3]}")
    print(f"\nSample pageview titles (original): {sample_pageview_titles[:10]}")
    print(f"Sample normalized titles: {sample_normalized[:10]}")

    return total_views, page_ids

# Run the matcher over the October 2025 dump (file name indicates it was pre-filtered
# to en.wikipedia). The September 2025 run is kept commented out for reference.
# grok_sep_2025_views, grok_sep_2025_page_ids = process_pageview_file(os.path.expanduser('~/Downloads/pageviews-202509-user.bz2'), grok_idx)
grok_oct_2025_views, grok_oct_2025_page_ids = process_pageview_file(os.path.expanduser('~/Downloads/pageviews-202510-user-en-wikipedia-only.bz2'), grok_idx)

# print("September 2025 matched grok_idx views:", grok_sep_2025_views)
print("October 2025 matched grok_idx views:", grok_oct_2025_views)

In [None]:
len(grok_oct_2025_page_ids)

In [None]:
# share (a fraction between 0 and 1, not a percentage) of October 2025 enwiki views that go to pages matched in grok_idx
grok_oct_2025_views / oct_2025_enwiki_view

In [None]:
# Convert the collected page ids to integers, dropping missing ids (None or "null").
# Presumably needed so they compare equal to an integer page_id column below — confirm.
grok_oct_2025_page_ids = set(
    map(
        int,
        (pid for pid in grok_oct_2025_page_ids if not (pid is None or pid == "null")),
    )
)

### Geographic pageview analysis

In [None]:
df0 = pd.read_csv('../supplemental_data/pageview_data/pageviews_0.csv')
df1 = pd.read_csv('../supplemental_data/pageview_data/pageviews_1.csv')
df = pd.concat([df0, df1])

In [None]:
df = df[df['project'] == 'en.wikipedia']

In [None]:
df.page_id.dtype

In [None]:
df[df['page_id'].isin(grok_oct_2025_page_ids)]

In [None]:
# World map visualization of filtered noisy_views by country

# Filter the dataframe
filtered_df = df[df['page_id'].isin(grok_oct_2025_page_ids)]

# Aggregate noisy_views by country_code
filtered_agg = filtered_df.groupby('country_code')['noisy_views'].sum().reset_index()
world = gpd.read_file("https://raw.githubusercontent.com/holtzy/D3-graph-gallery/master/DATA/world.geojson")

# Convert ISO 2 to ISO 3 for merging
def iso2_to_iso3(iso2_code):
    """Translate an ISO 3166-1 alpha-2 country code to alpha-3 using pycountry.

    Returns None when the code is unknown or is not a usable string (for
    example a NaN coming out of a pandas column).
    """
    try:
        country = pycountry.countries.get(alpha_2=iso2_code)
        return country.alpha_3 if country else None
    except (LookupError, TypeError, AttributeError):
        # Lookup failures only; unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None

# Add an ISO alpha-3 column so the aggregated views can be joined to the map geometries.
filtered_agg['country_code_iso3'] = filtered_agg['country_code'].apply(iso2_to_iso3)

# Merge with world map (left join: every geometry in `world` is kept).
# assumes world's 'id' column holds ISO alpha-3 codes — TODO confirm
filtered_world = world.merge(filtered_agg, left_on='id', right_on='country_code_iso3', how='left')

# Replace NaN with 0 for countries with no data
# NOTE(review): after this fill no NaN remains in 'noisy_views', so `missing_kwds`
# below presumably never applies, and zeros fall outside a LogNorm's domain —
# confirm how zero-view countries are meant to be drawn.
filtered_world['noisy_views'] = filtered_world['noisy_views'].fillna(0)

# Create figure
fig, ax = plt.subplots(figsize=(16, 10))

# Find min/max for color scale (excluding zeros)
non_zero_values = filtered_world[filtered_world['noisy_views'] > 0]['noisy_views'].values
if len(non_zero_values) > 0:
    vmin = non_zero_values.min()
    vmax = non_zero_values.max()
else:
    # No positive values at all: fall back to a degenerate 1..1 range.
    vmin = 1
    vmax = 1

# Plot filtered dataset (log colour scale; the lower bound is clamped to at least 1)
filtered_world.plot(
    column='noisy_views',
    ax=ax,
    cmap='YlOrRd',
    missing_kwds={'color': 'lightgray'},
    legend=True,
    norm=LogNorm(vmin=max(vmin, 1), vmax=vmax),
    legend_kwds={'shrink': 0.6, 'aspect': 20},
    edgecolor='black',
    linewidth=0.1
)
ax.set_title('Grokipedia Pageviews by Country (October 2025, Log Scale)', fontsize=16)
ax.axis('off')

plt.tight_layout()
plt.savefig('../graphics/pageview_world_map_grok.pdf', dpi=300, bbox_inches='tight')
plt.show()

# Print summary statistics (computed on the pre-merge aggregate, one row per country_code)
print(f"Filtered dataset: {len(filtered_agg)} countries, {filtered_agg['noisy_views'].sum():,} total views")

In [None]:
filtered_world['noisy_views'].sum()

In [None]:
filtered_world.noisy_views.sort_values()[:5].sum()

In [None]:
filtered_world.sort_values(by='noisy_views', ascending=False)[:10].noisy_views.sum()

In [None]:
# Print the ten rows with the most views as "country_code noisy_views".
# Iterating a DataFrame directly yields its column labels, not its rows,
# so the rows are pulled out with itertuples().
top_countries = filtered_world[['country_code', 'noisy_views']].sort_values(by='noisy_views', ascending=False)[:10]
for country_code, noisy_views in top_countries.itertuples(index=False, name=None):
    print(country_code, noisy_views)

## 30k sample for topic and quality modeling

In [None]:
def fetch_article_metadata(N: int, input_file: str = '../grokipedia_wikipedia_articles.ndjson'):
    """
    Fetch quality and topic metadata for a random sample of N articles.

    Lines are sampled uniformly without replacement from the ndjson file; each
    sampled record's ``name`` field is used as the article title (spaces are
    replaced by underscores). For every title, the quality endpoint and the
    topic-model endpoint are queried through ``fetch_with_retry``.

    Args:
        N: Number of lines to sample. Values larger than the number of lines
            in the file are clamped to it; negative values are treated as 0.
        input_file: Path to the ndjson file containing articles

    Returns:
        List of dictionaries, one per sampled article with a non-empty name,
        with keys ``title``, ``original_title``, ``quality_data``,
        ``topic_data`` (each None when the fetch failed) and ``errors``
        (list of failure tags).
    """
    # Local import: only needed here to escape titles for the query string.
    from urllib.parse import quote

    # Single pass: record byte offsets of each line start
    line_offsets = []
    with open(input_file, 'rb') as f:
        line_offsets.append(f.tell())  # First line starts at position 0
        while f.readline():
            line_offsets.append(f.tell())

    file_line_count = len(line_offsets) - 1  # Last offset is EOF
    N = max(0, min(N, file_line_count))

    # Select N unique random line numbers (0-indexed)
    chosen_line_numbers = sorted(random.sample(range(file_line_count), N))

    # Seek to the selected lines. The offsets are byte positions, so the file
    # is read in binary mode and each line decoded explicitly.
    articles = []
    with open(input_file, 'rb') as f:
        for line_num in chosen_line_numbers:
            f.seek(line_offsets[line_num])
            line = f.readline().decode('utf-8')
            try:
                article_data = json.loads(line.strip())
            except json.JSONDecodeError as e:
                print(f"Error parsing random-selected line {line_num+1}: {e}")
                continue
            title = article_data.get('name', '')
            if title:
                articles.append({
                    'original_title': title,
                    # Replace spaces with underscores
                    'title': title.replace(' ', '_'),
                })

    print(f"Randomly selected and loaded {len(articles)} articles")

    topic_url = "https://api.wikimedia.org/service/lw/inference/v1/models/outlink-topic-model:predict"
    results = []

    # The session is used as a context manager so its connections are released.
    with requests.Session() as session:
        session.headers.update({
            'User-Agent': 'htriedman grokipedia'
        })

        for idx, article in enumerate(articles):
            title = article['title']
            print(f"Processing {idx+1}/{len(articles)}: {title}")

            result = {
                'title': title,
                'original_title': article['original_title'],
                'quality_data': None,
                'topic_data': None,
                'errors': []
            }

            # Fetch quality data. The title is percent-encoded so characters
            # such as '&', '#', '+' or '?' cannot corrupt the query string.
            quality_url = (
                "https://misalignment.wmcloud.org/api/v1/quality-article-features"
                f"?lang=en&title={quote(title, safe='')}"
            )
            quality_data = fetch_with_retry(session, quality_url, method='GET')
            if quality_data:
                result['quality_data'] = quality_data
            else:
                result['errors'].append('quality_fetch_failed')

            # Small delay between requests
            time.sleep(0.05)

            # Fetch topic data
            topic_payload = {
                "page_title": title,
                "lang": "en",
                "threshold": 0.1
            }
            topic_data = fetch_with_retry(session, topic_url, method='POST', json_data=topic_payload)
            if topic_data:
                result['topic_data'] = topic_data
            else:
                result['errors'].append('topic_fetch_failed')

            # Delay between articles to avoid rate limiting
            time.sleep(0.05)

            results.append(result)

    return results


def _retry_after_seconds(header_value, fallback):
    """Return a Retry-After header as whole seconds, or ``fallback`` if absent or non-numeric."""
    try:
        return max(0, int(header_value))
    except (TypeError, ValueError):
        # Header missing (None) or given as an HTTP-date rather than a number of seconds.
        return fallback


def _pause_before_retry(reason, seconds, attempt, max_retries):
    """Report ``reason`` and sleep ``seconds``, unless this was the final attempt."""
    if attempt < max_retries - 1:
        print(f"{reason}. Waiting {seconds} seconds...")
        time.sleep(seconds)
    else:
        print(f"{reason}.")


# The ``session`` annotation is quoted so it is not evaluated at definition time.
def fetch_with_retry(session: "requests.Session", url: str, method: str = 'GET',
                     json_data: Optional[Dict] = None, max_retries: int = 3,
                     base_delay: float = 2.0) -> Optional[Dict]:
    """
    Fetch data from an API with retry logic and rate limit handling.

    Retried conditions are: HTTP 429 (waits for ``Retry-After`` when it is a
    number of seconds, otherwise the backoff delay), any 5xx status, request
    timeouts, other ``requests`` errors, and a 200 response whose body is not
    valid JSON. Any other status code is reported and gives up immediately.
    The backoff delay for attempt ``k`` (0-based) is ``base_delay * 2**k``; no
    sleep is performed after the final attempt.

    Args:
        session: Requests session object
        url: URL to fetch
        method: HTTP method ('GET' or 'POST')
        json_data: JSON payload for POST requests
        max_retries: Maximum number of attempts
        base_delay: Base delay in seconds for exponential backoff

    Returns:
        JSON response as dict, or None if all attempts failed

    Raises:
        ValueError: If ``method`` is neither 'GET' nor 'POST'.
    """
    # Validate once up front instead of inside the retry loop.
    if method not in ('GET', 'POST'):
        raise ValueError(f"Unsupported method: {method}")

    for attempt in range(max_retries):
        backoff = base_delay * (2 ** attempt)

        try:
            if method == 'GET':
                response = session.get(url, timeout=30)
            else:
                response = session.post(url, json=json_data, timeout=30)
        except requests.exceptions.Timeout:
            _pause_before_retry(f"Timeout on attempt {attempt+1}", backoff, attempt, max_retries)
            continue
        except requests.exceptions.RequestException as e:
            print(f"Request error on attempt {attempt+1}: {e}")
            if attempt < max_retries - 1:
                time.sleep(backoff)
                continue
            return None

        status = response.status_code

        # Success
        if status == 200:
            try:
                return response.json()
            except ValueError as e:
                _pause_before_retry(f"Invalid JSON in response ({e})", backoff, attempt, max_retries)
                continue

        # Rate limiting
        if status == 429:
            retry_after = _retry_after_seconds(response.headers.get('Retry-After'), backoff)
            _pause_before_retry("Rate limited", retry_after, attempt, max_retries)
            continue

        # Server-side errors
        if status == 503:
            _pause_before_retry("Service unavailable", backoff, attempt, max_retries)
            continue
        if status >= 500:
            _pause_before_retry(f"Server error {status}", backoff, attempt, max_retries)
            continue

        # Anything else is not retried.
        print(f"Unexpected status code {status}: {response.text[:200]}")
        return None

    print(f"Failed to fetch {url} after {max_retries} attempts")
    return None

In [None]:
results = fetch_article_metadata(N=30000)

In [None]:
# with open('../results/article_metadata_results1.jsonl', 'w') as f:
#     for item in results:
#         f.write(json.dumps(item) + '\n')

## Similarities by topic and article class

In [None]:
df = pd.read_json('../results/article_metadata_results.jsonl', lines=True)
similarities = pd.read_parquet('../results/embeddings_similarities_pairwise_top1_alignments.parquet')

In [None]:
RESULT_DIR = '../results'

with open(f"{RESULT_DIR}/grokipedia_wo_license.txt", encoding="utf-8") as f:
    grokipedia_wo_license_df = pd.DataFrame({"title": [line.rstrip('\n').lower() for line in f]})

with open(f"{RESULT_DIR}/grokipedia_w_license.txt", encoding="utf-8") as f:
    grokipedia_w_license_df = pd.DataFrame({"title": [line.rstrip('\n').lower() for line in f]})

In [None]:
df = pd.merge(df, similarities, on='title', how='left')

In [None]:
def get_topics_from_topic_data(topic_data):
    """Return the topic entries scoring above 0.5, or None when there are none.

    ``topic_data`` is expected to look like
    ``{'prediction': {'results': [{'topic': ..., 'score': ...}, ...]}}``.
    A missing/empty payload, prediction or result list yields None; entries
    without a ``score`` are treated as scoring 0.
    """
    if topic_data is None:
        return None

    prediction = topic_data.get('prediction')
    candidates = prediction.get('results', []) if prediction else None
    if not candidates:
        return None

    confident = []
    for entry in candidates:
        if entry.get('score', 0) > 0.5:
            confident.append(entry)
    return confident or None

def get_article_class(quality_data):
    """Return the ``'class'`` entry of ``quality_data``, or None if it is missing or empty."""
    if quality_data is None:
        return None
    # A missing key defaults to '', and any falsy value collapses to None.
    return quality_data.get('class', '') or None

def get_article_quality(quality_data):
    """Return the ``'quality'`` entry of ``quality_data``, or None if it is missing or falsy.

    Note that any falsy value — including a score of 0 or 0.0 — is reported as None.
    """
    if quality_data is None:
        return None
    score = quality_data.get('quality', None)
    return score if score else None

In [None]:
# Keep only the confident topics for each article, then explode so that every
# retained topic gets its own row (an article can therefore appear on several rows).
df['topics'] = df['topic_data'].apply(get_topics_from_topic_data)
df = df.explode('topics')
# Each exploded cell is either a topic dict or null; pull out the 'topic' string.
df['topic'] = df['topics'].apply(lambda x: x.get('topic') if isinstance(x, dict) else None)

# Extract first-level topic prefix before the first dot
# (values without a dot, including nulls, are passed through unchanged)
df['topic_prefix'] = df['topic'].apply(lambda t: t if not (isinstance(t, str) and '.' in t) else t.split('.', 1)[0])
# First two dot-separated components, e.g. "A.B.C" -> "A.B"
df['2nd_level_topic'] = df['topic'].apply(lambda t: t if not (isinstance(t, str) and '.' in t) else '.'.join(t.split('.')[:2]))
# Full topic string for topics containing the substring 'Region', else None
df['region_topic'] = df['topic'].apply(lambda t: t if (isinstance(t, str) and 'Region' in t) else None)

df['class'] = df['quality_data'].apply(get_article_class)
df['quality'] = df['quality_data'].apply(get_article_quality)
# Distinct values ordered by descending row count (value_counts order)
top_prefixes = df['topic_prefix'].value_counts().index.tolist()
second_level_topics = df['2nd_level_topic'].value_counts().index.tolist()

In [None]:
second_level_topics = [t for t in sorted(second_level_topics) if 'Region' not in t]

In [None]:
df['continent'] = df['region_topic'].apply(lambda t: t.split('.')[2] if isinstance(t, str) and '.' in t and len(t.split('.')) > 2 else None)

df['subcontinent'] = df['topic'].apply(
    lambda t: t.split('.')[3] if (isinstance(t, str) and 'Region' in t and len(t.split('.')) > 3) else None
)

In [None]:
df['region_topic'] = df['topic'].apply(lambda t: t if (isinstance(t, str) and 'Region' in t) else None)

In [None]:
import math

In [None]:
# Plot similarity distributions by license status for each top-level topic group.
# (This cell only covers the first-level prefixes in `top_prefixes`.)
# License lists are in grokipedia_w_license_df and grokipedia_wo_license_df (lowercase, with spaces)

# ---- PLOT: TOPIC PREFIX (1st level) ----
# One subplot per prefix, laid out row-major on a 2-column grid.
n_prefixes = len(top_prefixes)
ncols = 2
nrows = math.ceil(n_prefixes / ncols)
fig, axes = plt.subplots(nrows, ncols, figsize=(12, 5 * nrows), squeeze=False)

for i, prefix in enumerate(top_prefixes):
    row = i // ncols
    col = i % ncols
    ax = axes[row][col]

    # Select rows for this topic prefix
    prefix_df = df[df['topic_prefix'] == prefix].copy()
    # Prepare for join: lowercase & replace underscores with spaces to match license df
    prefix_df["title_lc"] = prefix_df["title"].str.replace('_', ' ').str.lower()

    # Join with license/wo_license dfs on normalized title
    # (default inner join: only titles present in both frames survive)
    prefix_w_license = pd.merge(
        grokipedia_w_license_df, prefix_df, left_on="title", right_on="title_lc"
    )
    prefix_wo_license = pd.merge(
        grokipedia_wo_license_df, prefix_df, left_on="title", right_on="title_lc"
    )

    # Both series are drawn on the same axis; rows without a similarity value are dropped.
    ax.hist(
        [prefix_w_license["similarity"].dropna(), prefix_wo_license["similarity"].dropna()],
        bins=100,
        color=["tab:blue", "tab:orange"],
        label=["With License", "Without License"],
        alpha=0.7,
        histtype="stepfilled",
    )
    ax.set_xlabel("Similarity")
    ax.set_ylabel("Count")
    ax.set_title(f"'{prefix}'")
    ax.legend()

# Remove empty subplots, if any
# (`i` is the last index used by the loop above)
for j in range(i + 1, nrows * ncols):
    fig.delaxes(axes[j // ncols][j % ncols])

plt.suptitle("Embedding Similarity Distributions by Topic Prefix and License Status", fontsize=16)
# Leave room at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.97])
plt.savefig(f"../graphics/embedding_similarity_by_topic_prefix_subplots_license_split.pdf")
plt.show()

In [None]:
# ---- PLOT: TOP 20 2ND LEVEL TOPICS BY CHUNK COUNT ----

# Count number of chunks per 2nd level topic
# (value_counts counts rows of df; presumably one row per chunk/topic pair — confirm)
top_20_2nd_level_topics = (
    df['2nd_level_topic']
    .value_counts()
    .head(20)
    .index
    .tolist()
)

# Panels are shown in alphabetical order rather than by frequency.
top_20_2nd_level_topics = sorted(top_20_2nd_level_topics)

n_second_levels = len(top_20_2nd_level_topics)
ncols2 = 5
nrows2 = math.ceil(n_second_levels / ncols2)
fig2, axes2 = plt.subplots(
    nrows2,
    ncols2,
    figsize=(4 * ncols2, 4 * nrows2),  # square & smaller subplots
    squeeze=False
)

for i, second_level in enumerate(top_20_2nd_level_topics):
    row = i // ncols2
    col = i % ncols2
    ax = axes2[row][col]

    # Select rows for this second-level topic
    second_level_df = df[df['2nd_level_topic'] == second_level].copy()
    # Prepare for join: normalize as before
    # (underscores -> spaces, lowercased, to match the license title lists)
    second_level_df["title_lc"] = second_level_df["title"].str.replace('_', ' ').str.lower()

    # Default inner joins: only titles present in both frames survive.
    second_level_w_license = pd.merge(
        grokipedia_w_license_df, second_level_df, left_on="title", right_on="title_lc"
    )
    second_level_wo_license = pd.merge(
        grokipedia_wo_license_df, second_level_df, left_on="title", right_on="title_lc"
    )

    # Both series are drawn on the same axis; `h` (the hist return value) is not used afterwards here.
    h = ax.hist(
        [second_level_w_license["similarity"].dropna(), second_level_wo_license["similarity"].dropna()],
        bins=100,
        color=["tab:blue", "tab:orange"],
        label=["With License", "Without License"],
        alpha=0.7,
        histtype="stepfilled",
    )
    ax.set_xlabel("Similarity")
    ax.set_ylabel("Count")
    ax.set_title(f"'{second_level}'")

# Add legend to only the first axis
axes2[0][0].legend()

# Remove empty subplots, if any
# (`i` is the last index used by the loop above)
for j in range(i + 1, nrows2 * ncols2):
    fig2.delaxes(axes2[j // ncols2][j % ncols2])

plt.suptitle("Embedding Similarity Distributions by Top 20 2nd-Level Topics (by chunk count) and License Status", fontsize=16)
# Leave room at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.97])
plt.savefig(f"../graphics/embedding_similarity_by_second_level_topic_subplots_license_split.pdf")
plt.show()

In [None]:
# Per-article citation counts by source-reliability category, for Wikipedia (wp_*)
# and Grokipedia (grok_*). Note: this rebinds `filtered_df`, replacing any earlier
# value of that name in the notebook.
filtered_df = pd.read_csv('../results/reliability_citation_diff.csv')

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch
import math

# --- Setup categories, labels, colors ---
# Stacking order of the categories within each bar (first entry is at the bottom).
column_order = [
    'reliable', 'unreliable', 'blacklist', 'no_consensus', 'deprecated', 'other'
]
# Legend label for each category key.
display_names = {
    'reliable': 'Generally reliable',
    'unreliable': 'Generally unreliable',
    'blacklist': 'Blacklisted',
    'no_consensus': 'No consensus',
    'deprecated': 'Deprecated',
    'other': 'Other'
}
# Bar colour for each category key.
color_map = {
    'reliable': 'green',
    'unreliable': 'red',
    'blacklist': 'black',
    'no_consensus': 'yellow',
    'deprecated': 'orange',
    'other': 'grey'
}

# Add missing columns if they don't exist (set to 0)
# Only these six are back-filled; the reliable/unreliable/other columns must already be in the CSV.
for col in ['wp_blacklist', 'wp_no_consensus', 'wp_deprecated', 
            'grok_blacklist', 'grok_no_consensus', 'grok_deprecated']:
    if col not in filtered_df.columns:
        filtered_df[col] = 0

# --- Join with topic data ---
# Take the (title, 2nd-level topic) pairs from df. No de-duplication is done here:
# df may have multiple rows per title due to explode, so we keep all topic assignments
df_with_topics = df[['title', '2nd_level_topic']].copy()

# Merge citation data with topic data - join directly on title (both use underscores)
# This will create multiple rows per article if an article has multiple topics
citation_with_topics = pd.merge(
    filtered_df, df_with_topics, on='title', how='inner'
)

# NOTE: the figures printed below are row counts; after the join an article can
# occupy several rows, and the wp_* sums count such an article once per row.
print(f"Total articles in filtered_df: {len(filtered_df)}")
print(f"Total articles after join: {len(citation_with_topics)}")
print(f"Articles with topics: {citation_with_topics['2nd_level_topic'].notna().sum()}")
print(f"Sample citation counts - reliable: {citation_with_topics['wp_reliable'].sum()}, unreliable: {citation_with_topics['wp_unreliable'].sum()}, other: {citation_with_topics['wp_other'].sum()}")

# Get the 20 most frequent 2nd level topics. value_counts() ranks topics by the
# number of joined rows, not by the number of citations.
top_20_2nd_level_topics = (
    citation_with_topics['2nd_level_topic']
    .value_counts()
    .head(20)
    .index
    .tolist()
)

# Panels are shown in alphabetical order rather than by frequency.
top_20_2nd_level_topics = sorted([t for t in top_20_2nd_level_topics if t is not None])

# --- Create subplots ---
# One panel per topic on a 5-column grid, filled row-major.
n_second_levels = len(top_20_2nd_level_topics)
ncols2 = 5
nrows2 = math.ceil(n_second_levels / ncols2)
fig2, axes2 = plt.subplots(
    nrows2,
    ncols2,
    figsize=(4 * ncols2, 4 * nrows2),
    squeeze=False
)

for i, second_level in enumerate(top_20_2nd_level_topics):
    row = i // ncols2
    col = i % ncols2
    ax = axes2[row][col]

    # Select rows for this second-level topic
    second_level_df = citation_with_topics[citation_with_topics['2nd_level_topic'] == second_level].copy()
    
    # Aggregate counts for this topic
    agg = {
        'wp_reliable': second_level_df['wp_reliable'].sum(),
        'wp_unreliable': second_level_df['wp_unreliable'].sum(),
        'wp_blacklist': second_level_df['wp_blacklist'].sum(),
        'wp_no_consensus': second_level_df['wp_no_consensus'].sum(),
        'wp_deprecated': second_level_df['wp_deprecated'].sum(),
        'wp_other': second_level_df['wp_other'].sum(),
        'grok_reliable': second_level_df['grok_reliable'].sum(),
        'grok_unreliable': second_level_df['grok_unreliable'].sum(),
        'grok_blacklist': second_level_df['grok_blacklist'].sum(),
        'grok_no_consensus': second_level_df['grok_no_consensus'].sum(),
        'grok_deprecated': second_level_df['grok_deprecated'].sum(),
        'grok_other': second_level_df['grok_other'].sum(),
    }

    # Make DF of shape: index=['Wikipedia', 'Grokipedia'], columns=column_order
    wp_row = [agg[f'wp_{k}'] for k in column_order]
    grok_row = [agg[f'grok_{k}'] for k in column_order]
    prop_df = pd.DataFrame(
        [wp_row, grok_row],
        columns=column_order,
        index=['Wikipedia', 'Grokipedia']
    )
    # Row-normalize to proportions; a row summing to 0 yields NaN, which is replaced by 0.
    prop_df_norm = prop_df.div(prop_df.sum(axis=1), axis=0).fillna(0)

    # --- Plotting stacked bars ---
    labels = ['Wikipedia', 'Grokipedia']
    x = np.arange(len(labels))
    bar_sep = 0.09
    width = 0.18

    # Set up stacking
    # Running top of the stack for [Wikipedia bar, Grokipedia bar].
    bottoms = [0, 0]

    # For synchronized stacking, process in column order:
    for j, col_name in enumerate(column_order):
        color = color_map.get(col_name, 'grey')
        # WP bar proportions
        wp_prop = prop_df_norm.loc['Wikipedia', col_name]
        grok_prop = prop_df_norm.loc['Grokipedia', col_name]
        # The Wikipedia bar is shifted left of x[0] and the Grokipedia bar right of x[1].
        ax.bar(x[0] - width/2 - bar_sep/2, wp_prop, width=width,
                bottom=bottoms[0], color=color, edgecolor='none', zorder=2, alpha=0.5)
        ax.bar(x[1] + width/2 + bar_sep/2, grok_prop, width=width,
                bottom=bottoms[1], color=color, edgecolor='none', zorder=2, alpha=0.5)

        # Diagonal change fill
        # A faint band between the two bars linking this category's segment on each side.
        wp_top = bottoms[0] + wp_prop
        grok_top = bottoms[1] + grok_prop
        ax.fill_between(
            [x[0] - width/2, x[1] + width/2],
            [wp_top, grok_top],
            [bottoms[0], bottoms[1]],
            color=color, alpha=0.14, zorder=1, linewidth=0
        )
        bottoms[0] += wp_prop
        bottoms[1] += grok_prop

    # Set axis ticks and labels
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.set_ylabel("Proportion of Citations")
    ax.set_title(f"'{second_level}'")
    
    # Make axis tight with bars
    ax.set_xlim(-0.23, 1.23)
    ax.set_ylim(bottom=0, top=1.01)
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)

# Add legend to only the first axis
legend_elements = [Patch(facecolor=color_map.get(col, 'grey'), label=display_names.get(col, col), alpha=0.55) for col in column_order]
axes2[0][0].legend(handles=legend_elements, title='Source Status', loc='upper center', framealpha=0.9)

# Remove empty subplots, if any
# (`i` is the last index used by the topic loop above)
for j in range(i + 1, nrows2 * ncols2):
    fig2.delaxes(axes2[j // ncols2][j % ncols2])

plt.suptitle("Citation Status Proportion by Top 20 2nd-Level Topics (by citation count): Wikipedia vs Grokipedia", fontsize=16)
# Leave room at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.97])
plt.savefig(f"../graphics/citation_composition_by_second_level_topic_subplots.pdf")
plt.show()

In [None]:
# ---- PLOT: REGION TOPIC on GEOSPATIAL 6x? GRID ----

# Define canonical region and continent values from arrays (ignore continent* and region None)
region_values = [
    'South_Asia', 'Asia*', 'Europe*', 'East_Asia', 'Western_Europe', 'North_America', 'Southern_Europe',
    'Western_Africa', 'Africa*', 'West_Asia', 'Eastern_Europe', 'North_Asia', 'Northern_Europe', 'Southeast_Asia',
    'Central_Asia', 'South_America', 'Central_America', 'Northern_Africa', 'Central_Africa', 'Southern_Africa',
    'Eastern_Africa'
]
continent_values = ['Asia', 'Europe', 'Americas', 'Africa', 'Oceania']

# Lay out regions approximately geospatially (None for blank grid cells)
# (Manual arrangement for visual resemblance and coverage)
region_grid_labels = [
#    0               1                2               3                 4                  5
    ['North_America',  None,        None,         'Northern_Europe',    None,           'North_Asia', None],
    ['Central_America',None, 'Western_Europe',  'Southern_Europe', 'Eastern_Europe',  'Central_Asia', 'East_Asia'],
    [None,             'South_America',        None,        'Northern_Africa',     None,            'West_Asia', None],
    [None,       None, 'Western_Africa', 'Central_Africa',  'Eastern_Africa', 'South_Asia', 'Southeast_Asia'],
    [None,        None,        None,       'Southern_Africa',     None,        None, 'Oceania'],
]

print("== GEOSPATIAL REGION 6x? GRID ==")
for row in region_grid_labels:
    print([r if r is not None else "---" for r in row])

# All region_topics in the data (ignore None)
input_region_topics = set(df['region_topic'].dropna().unique())

# Normalise a region name so grid labels and data values can be compared.
def standardize_region_key(r):
    """Return the lower-snake-case key for a region topic string.

    Only the text after the last '.' is kept, e.g.
    'Geography.Regions.Americas.North_America' -> 'north_america' and
    'Asia*' -> 'asia*'. Surrounding whitespace is stripped; spaces and
    hyphens become underscores.
    """
    leaf = r.rsplit('.', 1)[-1]
    return leaf.strip().lower().replace(' ', '_').replace('-', '_')

# Normalised key -> region_topic value exactly as it appears in the data.
input_region_map = {standardize_region_key(r): r for r in input_region_topics}


def _resolve_region_label(label):
    """Return the data's region_topic value for a grid label, or None.

    An exact match on the normalised key wins; otherwise the first data key
    that contains the label's key as a substring is used.
    """
    wanted = label.replace('-', '_').lower()
    found = input_region_map.get(wanted)
    if found is None:
        found = next(
            (value for key, value in input_region_map.items() if wanted in key),
            None,
        )
    return found


# Same shape as region_grid_labels: None for blank cells, otherwise a
# (label, matching region_topic or None) pair.
region_plot_grid = [
    [None if label is None else (label, _resolve_region_label(label))
     for label in grid_row]
    for grid_row in region_grid_labels
]

nrows = len(region_plot_grid)
ncols = max(map(len, region_plot_grid))

# One axis per grid cell; squeeze=False keeps `axes` two-dimensional.
fig, axes = plt.subplots(
    nrows,
    ncols,
    figsize=(4 * ncols, 3.5 * nrows),
    squeeze=False,
)

# Lower-cased whitelist of region names, built once instead of once per cell.
allowed_region_keys = {rv.lower() for rv in region_values}

# Axes that actually received a histogram, in row-major order.
plotted_axes = []

for row_idx, row in enumerate(region_plot_grid):
    for col_idx, cell in enumerate(row):
        ax = axes[row_idx][col_idx]
        # Blank grid cell, or a label with no matching region_topic in the data.
        if cell is None or cell[1] is None:
            ax.axis('off')
            continue
        label, region = cell
        # Only labels listed in region_values get a panel.
        if label.replace('-', '_').lower() not in allowed_region_keys:
            ax.axis('off')
            continue
        region_df = df[df['region_topic'] == region].copy()
        if region_df.empty:
            ax.axis('off')
            continue
        # Join key: title with underscores replaced by spaces, lower-cased.
        # NOTE(review): presumably the Grokipedia frames store titles in this
        # form; confirm against where they are built.
        region_df["title_lc"] = region_df["title"].str.replace('_', ' ').str.lower()

        region_w_license = pd.merge(
            grokipedia_w_license_df, region_df, left_on="title", right_on="title_lc"
        )
        region_wo_license = pd.merge(
            grokipedia_wo_license_df, region_df, left_on="title", right_on="title_lc"
        )

        ax.hist(
            [region_w_license["similarity"].dropna(), region_wo_license["similarity"].dropna()],
            bins=100,
            color=["tab:blue", "tab:orange"],
            label=["With License", "Without License"],
            alpha=0.7,
            histtype="stepfilled",
        )
        ax.set_xlabel("Similarity")
        ax.set_ylabel("Count")
        pretty_label = label.replace('_', ' ').replace('-', ' ').replace('*', ' (all)').title()
        ax.set_title(pretty_label)
        plotted_axes.append(ax)

# Put the legend on the first axis that really shows histograms. Choosing the
# first *mapped* cell instead could pick an axis that was switched off above
# (not whitelisted, or no rows), which has no labelled artists to list.
if plotted_axes:
    plotted_axes[0].legend()

plt.suptitle("Embedding Similarity Distributions by Geospatial Region and License Status", fontsize=16)
plt.tight_layout(rect=[0, 0.03, 1, 0.97])
plt.savefig("../graphics/embedding_similarity_by_spatial_region_6col_grid_license_split.pdf")
plt.show()


In [None]:
# ---- PLOT: CITATION COMPOSITION BY SUBCONTINENT on GEOSPATIAL GRID ----

# Citation counts per article, read fresh from disk.
filtered_df = pd.read_csv('../results/reliability_citation_diff.csv')

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch
import math

# --- Setup categories, labels, colors ---
# Stacking order of the reliability categories (bottom to top).
column_order = ['reliable', 'unreliable', 'blacklist', 'no_consensus', 'deprecated', 'other']

# Legend text for each category.
display_names = dict(
    reliable='Generally reliable',
    unreliable='Generally unreliable',
    blacklist='Blacklisted',
    no_consensus='No consensus',
    deprecated='Deprecated',
    other='Other',
)

# Fill colour for each category.
color_map = dict(
    reliable='green',
    unreliable='red',
    blacklist='black',
    no_consensus='yellow',
    deprecated='orange',
    other='grey',
)

# Some count columns may be missing from the CSV; create them as all-zero so
# the aggregation further down can index every category.
for prefix in ('wp', 'grok'):
    for status in ('blacklist', 'no_consensus', 'deprecated'):
        col = f'{prefix}_{status}'
        if col not in filtered_df.columns:
            filtered_df[col] = 0

# --- Join with subcontinent data ---
# One row per distinct (title, subcontinent) pair. drop_duplicates() matters
# when df carries several rows for the same pair: without it the inner merge
# below would repeat that article's citation counts once per duplicate row
# and inflate the per-subcontinent totals.
df_with_subcontinents = (
    df[['title', 'subcontinent']].dropna().drop_duplicates().copy()
)

# Attach the subcontinent to each citation row by exact title match; rows
# without a counterpart on the other side are discarded (inner join).
citation_with_subcontinents = pd.merge(
    filtered_df, df_with_subcontinents, on='title', how='inner'
)

# Hand-arranged 5-row x 7-column layout (same arrangement as the region
# grid); None marks a grid cell that stays blank.
subcontinent_grid_labels = [
    ['North_America',   None,            None,             'Northern_Europe', None,             'North_Asia',   None],
    ['Central_America', None,            'Western_Europe', 'Southern_Europe', 'Eastern_Europe', 'Central_Asia', 'East_Asia'],
    [None,              'South_America', None,             'Northern_Africa', None,             'West_Asia',    None],
    [None,              None,            'Western_Africa', 'Central_Africa',  'Eastern_Africa', 'South_Asia',   'Southeast_Asia'],
    [None,              None,            None,             'Southern_Africa', None,             None,           'Oceania'],
]

# Echo the layout, showing blank cells as "---".
print("== GEOSPATIAL SUBCONTINENT GRID ==")
for grid_row in subcontinent_grid_labels:
    print(["---" if name is None else name for name in grid_row])

# Distinct non-null subcontinent values that survived the merge.
input_subcontinents = set(citation_with_subcontinents['subcontinent'].dropna())

# Normalise a subcontinent name so grid labels and data values can be compared.
def standardize_subcontinent_key(s):
    """Return the lower-snake-case key for a subcontinent string.

    Keeps only the text after the last '.', e.g.
    'Geography.Regions.Americas.North_America' -> 'north_america'; strips
    surrounding whitespace and turns spaces and hyphens into underscores.
    """
    tail = s.rpartition('.')[2]
    tail = tail.strip()
    for separator in (' ', '-'):
        tail = tail.replace(separator, '_')
    return tail.lower()

# Normalised key -> subcontinent value exactly as it appears in the data.
input_subcontinent_map = {standardize_subcontinent_key(s): s for s in input_subcontinents}


def _resolve_subcontinent_label(label):
    """Return the data's subcontinent value for a grid label, or None.

    Tries an exact match on the normalised key first, then falls back to the
    first data key that contains the label's key as a substring.
    """
    wanted = label.replace('-', '_').lower()
    exact = input_subcontinent_map.get(wanted)
    if exact is not None:
        return exact
    for key, value in input_subcontinent_map.items():
        if wanted in key:
            return value
    return None


# Same shape as subcontinent_grid_labels: None for blank cells, otherwise a
# (label, matching subcontinent or None) pair.
subcontinent_plot_grid = []
for grid_row in subcontinent_grid_labels:
    subcontinent_plot_grid.append([
        None if label is None else (label, _resolve_subcontinent_label(label))
        for label in grid_row
    ])

nrows = len(subcontinent_plot_grid)
ncols = max(map(len, subcontinent_plot_grid))

# One axis per grid cell; squeeze=False keeps `axes` two-dimensional.
fig, axes = plt.subplots(
    nrows,
    ncols,
    figsize=(4 * ncols, 3.5 * nrows),
    squeeze=False,
)

# Bar geometry shared by every panel: one Wikipedia bar left of x[0], one
# Grokipedia bar right of x[1], and a faint band drawn between them.
labels = ['Wikipedia', 'Grokipedia']
x = np.arange(len(labels))
bar_sep = 0.09
width = 0.18
wp_bar_x = x[0] - width/2 - bar_sep/2
grok_bar_x = x[1] + width/2 + bar_sep/2
band_x = [x[0] - width/2, x[1] + width/2]

for row_idx, grid_row in enumerate(subcontinent_plot_grid):
    for col_idx, cell in enumerate(grid_row):
        ax = axes[row_idx][col_idx]

        # A panel needs a grid label that matched a subcontinent in the data.
        mapped = cell is not None and cell[1] is not None
        if mapped:
            label, subcontinent = cell
            in_subcontinent = citation_with_subcontinents['subcontinent'] == subcontinent
            subcontinent_df = citation_with_subcontinents[in_subcontinent].copy()
        if not mapped or subcontinent_df.empty:
            ax.axis('off')
            continue

        # Total citations per category: first row from the wp_* columns,
        # second row from the grok_* columns.
        count_rows = [
            [subcontinent_df[f'{prefix}_{status}'].sum() for status in column_order]
            for prefix in ('wp', 'grok')
        ]
        prop_df = pd.DataFrame(
            count_rows,
            columns=column_order,
            index=['Wikipedia', 'Grokipedia'],
        )
        # Convert each row to proportions; a row that sums to 0 becomes all 0.
        row_totals = prop_df.sum(axis=1)
        prop_df_norm = prop_df.div(row_totals, axis=0).fillna(0)

        # --- Plotting stacked bars ---
        # Running top of each stack; both stacks grow in column_order.
        wp_base, grok_base = 0, 0
        for col_name in column_order:
            color = color_map.get(col_name, 'grey')
            wp_prop = prop_df_norm.loc['Wikipedia', col_name]
            grok_prop = prop_df_norm.loc['Grokipedia', col_name]
            wp_top = wp_base + wp_prop
            grok_top = grok_base + grok_prop

            ax.bar(wp_bar_x, wp_prop, width=width,
                   bottom=wp_base, color=color, edgecolor='none', zorder=2, alpha=0.5)
            ax.bar(grok_bar_x, grok_prop, width=width,
                   bottom=grok_base, color=color, edgecolor='none', zorder=2, alpha=0.5)

            # Slanted band linking this category's segment in the two bars.
            ax.fill_between(
                band_x,
                [wp_top, grok_top],
                [wp_base, grok_base],
                color=color, alpha=0.14, zorder=1, linewidth=0
            )
            wp_base, grok_base = wp_top, grok_top

        # Ticks, labels and title.
        ax.set_xticks(x)
        ax.set_xticklabels(labels)
        ax.set_ylabel("Proportion of Citations")
        pretty_label = label.replace('_', ' ').replace('-', ' ').replace('*', ' (all)').title()
        ax.set_title(pretty_label)

        # Fit the view to the bars and drop the top/right frame lines.
        ax.set_xlim(-0.23, 1.23)
        ax.set_ylim(bottom=0, top=1.01)
        for side in ("right", "top"):
            ax.spines[side].set_visible(False)

# The legend goes on the first grid cell (row-major) whose label matched a
# subcontinent in the data.
legend_ax = next(
    (
        axes[r][c]
        for r, grid_row in enumerate(subcontinent_plot_grid)
        for c, cell in enumerate(grid_row)
        if cell is not None and cell[1] is not None
    ),
    None,
)
if legend_ax is not None:
    legend_elements = [
        Patch(facecolor=color_map.get(col, 'grey'), label=display_names.get(col, col), alpha=0.55)
        for col in column_order
    ]
    legend_ax.legend(handles=legend_elements, title='Source Status', loc='upper center', framealpha=0.9)

plt.suptitle("Citation Status Proportion by Subcontinent: Wikipedia vs Grokipedia", fontsize=16)
plt.tight_layout(rect=[0, 0.03, 1, 0.97])
plt.savefig("../graphics/citation_composition_by_subcontinent_grid.pdf")
plt.show()


In [None]:
# ---- PLOT: CONTINENT TOPIC on GEOSPATIAL GRID ----

# "Americas" was not showing up, so the continent is rebuilt from the
# region_topic codes, most of which look like
# "Geography.Regions.<Continent>[.<...>]". The continent is extracted for
# every row and used for plotting.

# Canonical continent names.
continent_values = ['Asia', 'Europe', 'Americas', 'Africa', 'Oceania']

# 2-row x 3-column layout; None marks a grid cell that stays blank.
region_grid_labels = [
    ['Americas', 'Europe', 'Asia'],
    [None,       'Africa', 'Oceania'],
]

# Echo the layout, showing blank cells as "---".
print("== GEOSPATIAL CONTINENTS GRID TO PLOT ==")
for grid_row in region_grid_labels:
    print(["---" if name is None else name for name in grid_row])

def extract_continent(region_topic, continents=None):
    """Return the canonical continent named in a region_topic string.

    The segment immediately after 'Regions' is compared case-insensitively,
    as a prefix match, against each canonical name, e.g.:
       'Geography.Regions.Americas.North_America' => 'Americas'
       'Geography.Regions.Europe.Europe*' => 'Europe'

    Args:
        region_topic: Dotted topic string. Any non-string value (None, NaN)
            yields None.
        continents: Optional iterable of canonical continent names. When
            omitted, the module-level ``continent_values`` list is used, so
            existing single-argument calls behave as before.

    Returns:
        The first matching canonical name, or None when the string has no
        'Regions' segment, nothing follows it, or no name matches.
    """
    if not isinstance(region_topic, str):
        return None
    if continents is None:
        continents = continent_values
    parts = region_topic.split('.')
    # Only list.index can raise here, so only that call is guarded.
    try:
        continent_pos = parts.index('Regions') + 1
    except ValueError:
        return None
    if continent_pos >= len(parts):
        return None
    segment = parts[continent_pos].lower()
    for name in continents:
        if segment.startswith(name.lower()):
            return name
    return None

# Work on a copy so the original df keeps its columns; the copy gains a
# 'continent' column derived from region_topic. Starred topics such as
# "Asia*" are folded into their continent by extract_continent's prefix match.
continent_by_row = df['region_topic'].apply(extract_continent)
df_cont = df.copy()
df_cont['continent'] = continent_by_row

nrows = len(region_grid_labels)
ncols = max(map(len, region_grid_labels))

# One axis per grid cell; squeeze=False keeps `axes` two-dimensional.
fig, axes = plt.subplots(
    nrows,
    ncols,
    figsize=(4 * ncols, 3.5 * nrows),
    squeeze=False,
)

# Axes that actually received a histogram, in row-major order.
plotted_axes = []

for row_idx, row in enumerate(region_grid_labels):
    for col_idx, label in enumerate(row):
        ax = axes[row_idx][col_idx]
        # Blank grid cell, or a label that is not a known continent.
        if label is None or label not in continent_values:
            ax.axis('off')
            continue

        region_df = df_cont[df_cont['continent'] == label].copy()
        if region_df.empty:
            ax.axis('off')
            continue
        # Join key: title with underscores replaced by spaces, lower-cased.
        # NOTE(review): presumably the Grokipedia frames store titles in this
        # form; confirm against where they are built.
        region_df["title_lc"] = region_df["title"].str.replace('_', ' ').str.lower()
        region_w_license = pd.merge(
            grokipedia_w_license_df, region_df, left_on="title", right_on="title_lc"
        )
        region_wo_license = pd.merge(
            grokipedia_wo_license_df, region_df, left_on="title", right_on="title_lc"
        )
        ax.hist(
            [region_w_license["similarity"].dropna(), region_wo_license["similarity"].dropna()],
            bins=100,
            color=["tab:blue", "tab:orange"],
            label=["With License", "Without License"],
            alpha=0.7,
            histtype="stepfilled",
        )
        ax.set_xlabel("Similarity")
        ax.set_ylabel("Count")
        pretty_label = label.replace('_', ' ').replace('-', ' ').replace('*', ' (all)').title()
        ax.set_title(pretty_label)
        plotted_axes.append(ax)

# Put the legend on the first axis that really shows histograms. Picking the
# first labelled grid cell instead can select an axis that was switched off
# above for lack of data, which has no labelled artists to list.
if plotted_axes:
    plotted_axes[0].legend()

plt.suptitle("Embedding Similarity Distributions by Continent and License Status", fontsize=16)
plt.tight_layout(rect=[0, 0.03, 1, 0.97])
plt.savefig("../graphics/embedding_similarity_by_continents_grid_license_split.pdf")
plt.show()



In [None]:
# ---- PLOT: CITATION COMPOSITION BY CONTINENT on GEOSPATIAL GRID ----

# Citation counts per article, read fresh from disk.
filtered_df = pd.read_csv('../results/reliability_citation_diff.csv')

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Patch
import math

# --- Setup categories, labels, colors ---
# Stacking order of the reliability categories (bottom to top).
column_order = ['reliable', 'unreliable', 'blacklist', 'no_consensus', 'deprecated', 'other']

# Legend text for each category.
display_names = dict(
    reliable='Generally reliable',
    unreliable='Generally unreliable',
    blacklist='Blacklisted',
    no_consensus='No consensus',
    deprecated='Deprecated',
    other='Other',
)

# Fill colour for each category.
color_map = dict(
    reliable='green',
    unreliable='red',
    blacklist='black',
    no_consensus='yellow',
    deprecated='orange',
    other='grey',
)

# Some count columns may be missing from the CSV; create them as all-zero so
# the aggregation further down can index every category.
for prefix in ('wp', 'grok'):
    for status in ('blacklist', 'no_consensus', 'deprecated'):
        col = f'{prefix}_{status}'
        if col not in filtered_df.columns:
            filtered_df[col] = 0

# --- Extract continent from region_topic ---
# Canonical continent names.
continent_values = ['Asia', 'Europe', 'Americas', 'Africa', 'Oceania']

def extract_continent(region_topic, continents=None):
    """Return the canonical continent named in a region_topic string.

    The segment immediately after 'Regions' is compared case-insensitively,
    as a prefix match, against each canonical name, e.g.:
       'Geography.Regions.Americas.North_America' => 'Americas'
       'Geography.Regions.Europe.Europe*' => 'Europe'

    Args:
        region_topic: Dotted topic string. Any non-string value (None, NaN)
            yields None.
        continents: Optional iterable of canonical continent names. When
            omitted, the module-level ``continent_values`` list is used, so
            existing single-argument calls behave as before.

    Returns:
        The first matching canonical name, or None when the string has no
        'Regions' segment, nothing follows it, or no name matches.
    """
    if not isinstance(region_topic, str):
        return None
    if continents is None:
        continents = continent_values
    parts = region_topic.split('.')
    # Only list.index can raise here, so only that call is guarded.
    try:
        continent_pos = parts.index('Regions') + 1
    except ValueError:
        return None
    if continent_pos >= len(parts):
        return None
    segment = parts[continent_pos].lower()
    for name in continents:
        if segment.startswith(name.lower()):
            return name
    return None

# Copy of df with a 'continent' column derived from region_topic.
df_cont = df.copy()
df_cont['continent'] = df_cont['region_topic'].apply(extract_continent)

# --- Join with continent data ---
# One row per distinct (title, continent) pair. Several region topics map to
# the same continent (e.g. a starred topic such as "Asia*" and a sub-region),
# so a title can yield the same pair more than once. Without
# drop_duplicates() the inner merge below would repeat that article's
# citation counts and inflate the continent totals.
df_with_continents = (
    df_cont[['title', 'continent']].dropna().drop_duplicates().copy()
)

# Attach the continent to each citation row by exact title match; rows
# without a counterpart on the other side are discarded (inner join).
citation_with_continents = pd.merge(
    filtered_df, df_with_continents, on='title', how='inner'
)

# 2-row x 3-column continent layout; None marks a grid cell that stays blank.
continent_grid_labels = [
    ['Americas', 'Europe', 'Asia'],
    [None,       'Africa', 'Oceania'],
]

# Echo the layout, showing blank cells as "---".
print("== GEOSPATIAL CONTINENTS GRID ==")
for grid_row in continent_grid_labels:
    print(["---" if name is None else name for name in grid_row])

nrows = len(continent_grid_labels)
ncols = max(map(len, continent_grid_labels))

# One axis per grid cell; squeeze=False keeps `axes` two-dimensional.
fig, axes = plt.subplots(
    nrows,
    ncols,
    figsize=(4 * ncols, 3.5 * nrows),
    squeeze=False,
)

# Bar geometry shared by every panel: one Wikipedia bar left of x[0], one
# Grokipedia bar right of x[1], and a faint band drawn between them.
labels = ['Wikipedia', 'Grokipedia']
x = np.arange(len(labels))
bar_sep = 0.09
width = 0.18

# Axes that actually received bars, in row-major order.
plotted_axes = []

for row_idx, row in enumerate(continent_grid_labels):
    for col_idx, label in enumerate(row):
        ax = axes[row_idx][col_idx]
        # Blank grid cell, or a label that is not a known continent.
        if label is None or label not in continent_values:
            ax.axis('off')
            continue

        # Select rows for this continent
        continent_df = citation_with_continents[citation_with_continents['continent'] == label].copy()
        if continent_df.empty:
            ax.axis('off')
            continue

        # Total citations per category: first row from the wp_* columns,
        # second row from the grok_* columns.
        wp_row = [continent_df[f'wp_{k}'].sum() for k in column_order]
        grok_row = [continent_df[f'grok_{k}'].sum() for k in column_order]
        prop_df = pd.DataFrame(
            [wp_row, grok_row],
            columns=column_order,
            index=['Wikipedia', 'Grokipedia']
        )
        # Convert each row to proportions; a row that sums to 0 becomes all 0.
        prop_df_norm = prop_df.div(prop_df.sum(axis=1), axis=0).fillna(0)

        # --- Plotting stacked bars ---
        # Running top of each stack; both stacks grow in column_order.
        bottoms = [0, 0]
        for col_name in column_order:
            color = color_map.get(col_name, 'grey')
            wp_prop = prop_df_norm.loc['Wikipedia', col_name]
            grok_prop = prop_df_norm.loc['Grokipedia', col_name]
            ax.bar(x[0] - width/2 - bar_sep/2, wp_prop, width=width,
                   bottom=bottoms[0], color=color, edgecolor='none', zorder=2, alpha=0.5)
            ax.bar(x[1] + width/2 + bar_sep/2, grok_prop, width=width,
                   bottom=bottoms[1], color=color, edgecolor='none', zorder=2, alpha=0.5)

            # Slanted band linking this category's segment in the two bars.
            wp_top = bottoms[0] + wp_prop
            grok_top = bottoms[1] + grok_prop
            ax.fill_between(
                [x[0] - width/2, x[1] + width/2],
                [wp_top, grok_top],
                [bottoms[0], bottoms[1]],
                color=color, alpha=0.14, zorder=1, linewidth=0
            )
            bottoms[0] = wp_top
            bottoms[1] = grok_top

        # Set axis ticks and labels
        ax.set_xticks(x)
        ax.set_xticklabels(labels)
        ax.set_ylabel("Proportion of Citations")
        pretty_label = label.replace('_', ' ').replace('-', ' ').replace('*', ' (all)').title()
        ax.set_title(pretty_label)

        # Fit the view to the bars and drop the top/right frame lines.
        ax.set_xlim(-0.23, 1.23)
        ax.set_ylim(bottom=0, top=1.01)
        ax.spines["right"].set_visible(False)
        ax.spines["top"].set_visible(False)
        plotted_axes.append(ax)

# Put the legend on the first axis that really shows bars. Picking the first
# labelled grid cell instead can select an axis that was switched off above
# because its continent had no rows.
if plotted_axes:
    legend_elements = [
        Patch(facecolor=color_map.get(col, 'grey'), label=display_names.get(col, col), alpha=0.55)
        for col in column_order
    ]
    plotted_axes[0].legend(handles=legend_elements, title='Source Status', loc='upper center', framealpha=0.9)

plt.suptitle("Citation Status Proportion by Continent: Wikipedia vs Grokipedia", fontsize=16)
plt.tight_layout(rect=[0, 0.03, 1, 0.97])
plt.savefig("../graphics/citation_composition_by_continent_grid.pdf")
plt.show()


In [None]:
# Plot embedding similarity distributions by article class (FA, GA, B, C, Start, Stub), split by license status

# Panel order and the title shown for each class.
class_order = ["FA", "GA", "B", "C", "Start", "Stub"]
pretty_class_labels = {
    "FA": "Featured Article",
    "GA": "Good Article",
    "B": "B-Class",
    "C": "C-Class",
    "Start": "Start-Class",
    "Stub": "Stub-Class",
}

# Restrict "class" to the known quality classes, ordered as in class_order.
# set_categories turns every value outside class_order (including None/NaN)
# into NaN, so such rows never match a class in the loop below.
df_class = df.copy()
df_class["class"] = (
    df_class["class"].astype("category").cat.set_categories(class_order, ordered=True)
)
# Join key: title with underscores replaced by spaces, lower-cased.
# NOTE(review): presumably the Grokipedia frames store titles in this form;
# confirm against where they are built.
df_class["title_lc"] = df_class["title"].str.replace('_', ' ').str.lower()

# Three panels per row; the row count follows from the number of classes
# (2 x 3 for the six classes above) instead of being hard-coded.
n_classes = len(class_order)
class_ncols = 3
class_nrows = -(-n_classes // class_ncols)  # ceiling division
fig, axes = plt.subplots(
    class_nrows, class_ncols,
    figsize=(16, 4 * class_nrows),
    squeeze=False
)

# Axes that actually received a histogram, in panel order.
plotted_axes = []

for idx, class_label in enumerate(class_order):
    ax = axes[idx // class_ncols][idx % class_ncols]
    class_df = df_class[df_class["class"] == class_label]
    if class_df.empty:
        ax.axis('off')
        continue
    class_w_license = pd.merge(
        grokipedia_w_license_df, class_df, left_on="title", right_on="title_lc"
    )
    class_wo_license = pd.merge(
        grokipedia_wo_license_df, class_df, left_on="title", right_on="title_lc"
    )
    ax.hist(
        [class_w_license["similarity"].dropna(), class_wo_license["similarity"].dropna()],
        bins=100,
        color=["tab:blue", "tab:orange"],
        label=["With License", "Without License"],
        alpha=0.7,
        histtype="stepfilled",
    )
    ax.set_xlabel("Similarity")
    ax.set_ylabel("Count")
    pretty_label = pretty_class_labels.get(class_label, class_label)
    ax.set_title(pretty_label)
    plotted_axes.append(ax)

# Hide grid cells beyond the last class (none when the grid is exactly full).
for idx in range(n_classes, class_nrows * class_ncols):
    axes[idx // class_ncols][idx % class_ncols].axis('off')

# Put the legend on the first axis that really shows histograms; axes[0][0]
# is switched off above when the first class has no rows.
if plotted_axes:
    plotted_axes[0].legend()
plt.suptitle("Embedding Similarity Distributions by Article Quality Class and License Status", fontsize=16)
plt.tight_layout(rect=[0, 0.03, 1, 0.97])
plt.savefig("../graphics/embedding_similarity_by_class_grid_license_split.pdf")
plt.show()