# Analysis

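This notebook runs the actual analysis: it compares how often each domain is cited on Wikipedia and on Grokipedia, examines sources whose reliability has been rated by the English Wikipedia community and by Lin et al., and plots how the citation composition differs between the two.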

In [1]:
import pandas as pd
import re
import json
import csv
from collections import Counter
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import numpy as np

In [2]:
# Directory that holds the result files this notebook reads and writes.
RESULT_DIR = "../results"
# Directory that holds supplemental inputs (domain lists, reliability ratings).
SUPP_DATA_DIR = "../supplemental_data"

## Absolute comparison of domain occurrences

In [None]:
# Each file is a list of {key: {domain: citation_count}} records
# (the key is presumably the article title -- TODO confirm against the producer).
# The encoding is given explicitly so the read does not depend on the platform default.
with open(f'{RESULT_DIR}/grok_domains.json', 'r', encoding='utf-8') as f:
    grok_domains = json.load(f)

with open(f'{RESULT_DIR}/wp_domains.json', 'r', encoding='utf-8') as f:
    wp_domains = json.load(f)

# Citation counts per domain, summed over all records.
grok_domains_counter = Counter()
wp_domains_counter = Counter()

for article_record in grok_domains:
    for domain_counts in article_record.values():
        grok_domains_counter.update(domain_counts)

for article_record in wp_domains:
    for domain_counts in article_record.values():
        wp_domains_counter.update(domain_counts)

# Total number of citations on each site (sum over every domain).
total_grok = sum(grok_domains_counter.values())
total_wp = sum(wp_domains_counter.values())

In [None]:
# Per-domain citation count, site-wide total and share for Wikipedia.
wiki_citation_count_df = (
    pd.DataFrame(list(wp_domains_counter.items()), columns=['domain', 'wiki_count'])
    .assign(
        wiki_total_cites=total_wp,
        wiki_share=lambda frame: frame['wiki_count'] / total_wp,
    )
)
# Same table for Grokipedia.
grok_citation_count_df = (
    pd.DataFrame(list(grok_domains_counter.items()), columns=['domain', 'grok_count'])
    .assign(
        grok_total_cites=total_grok,
        grok_share=lambda frame: frame['grok_count'] / total_grok,
    )
)

# Outer join keeps domains cited on only one site; their missing count/share become 0.
citation_count_df = pd.merge(wiki_citation_count_df, grok_citation_count_df, on='domain', how='outer').fillna(0)
# fillna(0) also zeroes the site-wide totals on rows that came from one side only;
# the totals are constants, so restore them for every row.
citation_count_df['wiki_total_cites'] = total_wp
citation_count_df['grok_total_cites'] = total_grok

# Positive differences mean the domain is cited more (absolutely / relatively) on Grokipedia.
citation_count_df['count_diff'] = citation_count_df['grok_count'] - citation_count_df['wiki_count']
citation_count_df['share_diff'] = citation_count_df['grok_share'] - citation_count_df['wiki_share']

In [None]:
# Inspect the merged per-domain citation counts, shares and differences.
citation_count_df

In [None]:
# Persist the per-domain comparison table next to the other result files.
citation_count_df.to_csv(f'{RESULT_DIR}/citation_count_df.csv', index=False)

In [None]:
def combine_domains(domain):
    """
    Map known aliases of the same outlet onto a single canonical label.

    Args:
        domain: Domain string as it appears in the citation data.

    Returns:
        The canonical label for BBC, X/Twitter, CNN and Indiatimes aliases;
        any other value is returned unchanged.
    """
    alias_groups = (
        (('bbc.co.uk', 'bbc.com', 'news.bbc.co.uk'), 'bbc.com'),
        (('twitter.com', 'x.com'), 'x/twitter.com'),
        (('edition.cnn.com', 'cnn.com'), 'cnn.com'),
        (('timesofindia.indiatimes.com', 'economictimes.indiatimes.com'), 'indiatimes.com'),
    )
    for aliases, canonical in alias_groups:
        if domain in aliases:
            return canonical
    return domain

# Add the canonical (alias-merged) domain label used for the top-100 tables.
citation_count_df['combined_domain'] = citation_count_df['domain'].map(combine_domains)

In [None]:
# Sum the citation shares of all aliases that map to the same combined domain.
share_by_combined_domain = (
    citation_count_df[['combined_domain', 'grok_share', 'wiki_share']]
    .groupby('combined_domain')
    .sum()
)

# Top 100 combined domains by share on each site.
t100_grok_corrected = share_by_combined_domain.sort_values('grok_share', ascending=False).head(100)
t100_wiki_corrected = share_by_combined_domain.sort_values('wiki_share', ascending=False).head(100)

In [None]:
# Turn the 'combined_domain' index back into a column and mirror it as 'domain',
# which is the column name used in the exported files.
t100_grok_corrected = t100_grok_corrected.reset_index().assign(
    domain=lambda frame: frame['combined_domain']
)
t100_wiki_corrected = t100_wiki_corrected.reset_index().assign(
    domain=lambda frame: frame['combined_domain']
)

t100_grok_corrected[['domain', 'grok_share']].to_csv('../results/t100_grok_corrected.csv', index=False)
t100_wiki_corrected[['domain', 'wiki_share']].to_csv('../results/t100_wiki_corrected.csv', index=False)


In [None]:
# Load the uncorrected top-100 tables.
# NOTE(review): the visible cells only write the *_corrected.csv variants; these two
# files are presumably produced elsewhere -- confirm they exist before Run All.
t100_grok = pd.read_csv('../results/t100_grok.csv')
t100_wiki = pd.read_csv('../results/t100_wiki.csv')

In [None]:
# Inspect the table loaded from ../results/t100_grok.csv.
t100_grok

In [None]:
# Outer-join the alias-corrected top-100 tables with the loaded top-100 tables,
# matching 'combined_domain' on the left to 'domain' on the right. Rows present on
# only one side are kept and carry NaN in the other side's columns.
t100_grok_combined = pd.merge(t100_grok_corrected, t100_grok, left_on='combined_domain', right_on='domain', how='outer')
t100_wiki_combined = pd.merge(t100_wiki_corrected, t100_wiki, left_on='combined_domain', right_on='domain', how='outer')

In [None]:
# Show only the rows that have no missing value in any column, i.e. rows matched on both sides of the outer join.
t100_wiki_combined[~t100_wiki_combined.isna().any(axis=1)]

In [None]:
# FIXME: this cell contained only a `.to_csv(...)` call with no DataFrame in front of it,
# which is a SyntaxError and stops "Restart Kernel -> Run All". The intended frame cannot
# be determined from the notebook, and the target file is already written by the export
# cell above, so the call is disabled until the receiver is restored:
# <frame>.to_csv('../results/t100_grok_corrected.csv', index=False)

In [None]:
# Domains at positions 100-103 (ranks 101-104) when sorted by Grokipedia citation share, descending.
citation_count_df.sort_values('grok_share', ascending=False)[100:104]

In [None]:
# The 50 domains with the largest Wikipedia citation share.
citation_count_df.sort_values('wiki_share', ascending=False)[:50]

In [None]:
# The 50 domains whose share grew most on Grokipedia (share_diff = grok_share - wiki_share).
citation_count_df.sort_values('share_diff', ascending=False)[:50]

## (Un)reliable source additions / removals

In [None]:
# Per-domain delta tables. The cells below use df, df_w_license and df_wo_license, so
# these loads must run for the notebook to work from a fresh kernel.
# NOTE(review): judging by the file names these are the overall, with-license and
# without-license variants -- confirm the files exist under RESULT_DIR.
df = pd.read_csv(f"{RESULT_DIR}/domain_deltas.csv")
df_wo_license = pd.read_csv(f'{RESULT_DIR}/domain_deltas_wo_license.csv')
df_w_license = pd.read_csv(f'{RESULT_DIR}/domain_deltas_w_license.csv')

# Domains treated as academic/journal sources: the OpenAlex journal-domain list
# plus a hand-picked set of repositories and publishers.
domain_set = (
    set(
        pd.read_csv(f'{SUPP_DATA_DIR}/domain_lists/openalex_journal_domains.csv')
        .Domain.tolist()
    ).union({
        'academia.edu',
        'arxiv.org',
        'cambridge.org',
        'ebsco.com',
        'journals.uchicago.edu',
        'jstor.org',
        'mdpi.com',
        'ncbi.nlm.nih.gov',
        'papers.ssrn.com',
        'researchgate.net',
        'sciencedirect.com',
        'tandfonline.com'
    })
)

# Reliability list: keep one row per source with its status, exposing the 'source'
# column under the name 'domain' so it can be joined with the delta tables.
# (An earlier version read perennial_sources_enwiki/perennial_sources.csv instead.)
reliability_df = pd.read_csv(f'{SUPP_DATA_DIR}/perennial_sources_enwiki/enwiki_perennial_list.csv')
reliability_df['domain'] = reliability_df['source']
reliability_df = reliability_df[['domain', 'status']]

In [None]:
# Attach the reliability status to each delta row. The default inner join keeps only
# domains that appear in reliability_df.
df_w_license = df_w_license.merge(reliability_df, on='domain')
df_wo_license = df_wo_license.merge(reliability_df, on='domain')

In [None]:
print(
    "=" * 100,
    "Grokipedia content adapted from Wikipedia (with license)",
    "=" * 100,
    "Summary of link changes from Wikipedia --> Grokipedia",
    "=" * 100,
    sep="\n",
)
# Total delta per reliability status. Only 'delta_sum' is aggregated; summing the
# string 'status' column as well would just build a long concatenated string.
df_w_license.groupby('status')['delta_sum'].sum().reset_index()

In [None]:
print(
    "=" * 100,
    "Grokipedia content adapted from Wikipedia (with license)",
    "=" * 100,
    "Sources deemed: 'generally_reliable' by the enwiki community",
    "Measuring: Total links with a domain removed from Wikipedia --> Grokipedia",
    "=" * 100,
    sep="\n",
)

# The 25 'generally_reliable' rows with the smallest delta_sum.
# NOTE(review): 'reliability_status' and 'source_name' are not columns of the
# reliability_df built in this notebook (it has 'domain' and 'status'); they must
# already be present in the loaded delta table -- confirm.
(
    df_w_license
    .loc[
        df_w_license['reliability_status'] == 'generally_reliable',
        ['source_name', 'domain', 'delta_sum'],
    ]
    .sort_values('delta_sum', ascending=True)
    .head(25)
)

In [None]:
# This cell used to be an exact copy of the previous one. It now reports the
# counterpart shown in the without-license section: unreliable sources that were added.
print(
    "=" * 100,
    "Grokipedia content adapted from Wikipedia (with license)",
    "=" * 100,
    "Sources deemed: 'generally_unreliable' by the enwiki community",
    "Measuring: Total links with a domain added from Wikipedia --> Grokipedia",
    "=" * 100,
    sep="\n",
)

# The 25 'generally_unreliable' rows with the largest delta_sum.
# NOTE(review): 'reliability_status' and 'source_name' are not columns of the
# reliability_df built in this notebook (it has 'domain' and 'status'); they must
# already be present in the loaded delta table -- confirm.
(
    df_w_license
    .loc[
        df_w_license['reliability_status'] == 'generally_unreliable',
        ['source_name', 'domain', 'delta_sum'],
    ]
    .sort_values('delta_sum', ascending=False)
    .head(25)
)

In [None]:
print(
    "=" * 100,
    "Grokipedia content NOT adapted from Wikipedia (without license)",
    "=" * 100,
    "Summary of link changes from Wikipedia --> Grokipedia",
    "=" * 100,
    sep="\n",
)
# Total delta per reliability status. Only 'delta_sum' is aggregated; summing the
# string status column as well would just build a long concatenated string.
# NOTE(review): the with-license summary groups by 'status' while this one groups by
# 'reliability_status' -- confirm both columns exist in the respective frames.
df_wo_license.groupby('reliability_status')['delta_sum'].sum().reset_index()

In [None]:
print(
    "=" * 100,
    "Grokipedia content NOT adapted from Wikipedia (without license)",
    "=" * 100,
    "Sources deemed: 'generally_reliable' by the enwiki community",
    "Measuring: Total links with a domain removed from Wikipedia --> Grokipedia",
    "=" * 100,
    sep="\n",
)

# The 25 'generally_reliable' rows with the smallest delta_sum.
# NOTE(review): 'reliability_status' and 'source_name' are not columns of the
# reliability_df built in this notebook -- confirm they come from the delta table.
(
    df_wo_license
    .loc[
        df_wo_license['reliability_status'] == 'generally_reliable',
        ['source_name', 'domain', 'delta_sum'],
    ]
    .sort_values('delta_sum', ascending=True)
    .head(25)
)

In [None]:
print(
    "=" * 100,
    "Grokipedia content NOT adapted from Wikipedia (without license)",
    "=" * 100,
    "Sources deemed: 'generally_unreliable' by the enwiki community",
    "Measuring: Total links with a domain added from Wikipedia --> Grokipedia",
    "=" * 100,
    sep="\n",
)

# The 25 'generally_unreliable' rows with the largest delta_sum.
# NOTE(review): 'reliability_status' and 'source_name' are not columns of the
# reliability_df built in this notebook -- confirm they come from the delta table.
(
    df_wo_license
    .loc[
        df_wo_license['reliability_status'] == 'generally_unreliable',
        ['source_name', 'domain', 'delta_sum'],
    ]
    .sort_values('delta_sum', ascending=False)
    .head(25)
)

## How much more cited are specific domain types?

This section looks specifically at academic journal domains (which Grokipedia appears to cite heavily) and at government and military domains (`.gov`, `.mil`, and country variants such as `.gov.uk`).

In [None]:
def is_government_and_mil_domain(domain):
    """
    Tell whether a domain is a government or military domain.

    A domain qualifies when it contains '.gov' or '.mil' either at the very end or
    directly followed by another dot (e.g. 'nasa.gov', 'army.mil', 'data.gov.uk').

    Args:
        domain: Domain to test; anything that is not a string yields False.

    Returns:
        bool: True for matching domains, False otherwise.
    """
    if isinstance(domain, str):
        return re.search(r'\.gov(\.|$)|\.mil(\.|$)', domain) is not None
    return False

def is_journal_domain(domain):
    """Return True when the domain is a member of the module-level domain_set."""
    return domain in domain_set

# Subsets of the delta table restricted to government/military and to journal domains.
gov_mil_domain_df = df.loc[df['domain'].map(is_government_and_mil_domain)]
journal_domain_df = df.loc[df['domain'].map(is_journal_domain)]


In [None]:
# The 100 journal and 100 gov/mil domains with the largest delta_sum.
top_journal_domains = journal_domain_df.sort_values('delta_sum', ascending=False).head(100)
top_gov_mil_domains = gov_mil_domain_df.sort_values('delta_sum', ascending=False).head(100)

top_100_academic_sources_net_cites = top_journal_domains['delta_sum'].sum()
top_100_gov_mil_sources_net_cites = top_gov_mil_domains['delta_sum'].sum()
# Union of both top-100 lists (a domain that is in both is counted once).
academic_gov_mil_domains = set(top_journal_domains['domain'].tolist()).union(
    set(top_gov_mil_domains['domain'].tolist())
)

total_net_cites = df['delta_sum'].sum()
total_domains = df['domain'].count()

print(f"Of a total net increase of {total_net_cites:,} citations:")
print(f"  - {top_100_academic_sources_net_cites:,} ({top_100_academic_sources_net_cites/total_net_cites:.1%}) are from the top 100 journal domains")
print(f"  - {top_100_gov_mil_sources_net_cites:,} ({top_100_gov_mil_sources_net_cites/total_net_cites:.1%}) are from the top 100 gov/mil domains")
print(f"These sources correspond with only {len(academic_gov_mil_domains) / total_domains:.5%} of the cited domains on Grokipedia")


## Shifts in article citation composition

#### Which articles had the largest shifts from enwiki-deemed reliable to unreliable sources in their composition?

In [None]:
def find_reliability_shift_maximal(reliability_df, result_dir='../results/overall/domains', fsuffix='_domains.json'):
    """
    Count citations per reliability status for every article on Wikipedia and Grokipedia
    and compute (WP_reliable - WP_unreliable) - (grok_unreliable - grok_reliable).

    NOTE(review): algebraically this value is the SUM of the net-reliable counts of both
    sites (wp_rel - wp_unrel + grok_rel - grok_unrel), not their difference -- confirm it
    is the intended measure of a reliable-to-unreliable shift.

    Args:
        reliability_df: DataFrame with 'domain' and 'status' columns.
        result_dir: Directory containing the 'wp{fsuffix}' and 'grok{fsuffix}' files.
        fsuffix: File-name suffix selecting the data set (e.g. '_domains.json',
            '_domains_w_license.json').

    Returns:
        List of 16-tuples
        (title,
         wp_reliable, wp_unreliable, wp_blacklist, wp_no_consensus, wp_deprecated, wp_other, wp_total,
         grok_reliable, grok_unreliable, grok_blacklist, grok_no_consensus, grok_deprecated, grok_other, grok_total,
         maximal),
        sorted by maximal in descending order (ties broken by title) so that the
        result is identical from run to run.
    """
    # Position of each status inside the per-article count list; everything else
    # (unknown domain, unrecognised or missing status) goes to the 'other' slot.
    status_slots = {
        'Generally reliable': 0,
        'Generally unreliable': 1,
        'Blacklisted': 2,
        'No consensus': 3,
        'Deprecated': 4,
    }
    other_slot = 5
    n_slots = 6

    # Build the domain -> status lookup once; it is shared by both files.
    # A missing 'status' column maps every domain to None (counted as 'other').
    statuses = reliability_df.get('status')
    if statuses is None:
        statuses = [None] * len(reliability_df)
    reliability_lookup = {
        domain: status
        for domain, status in zip(reliability_df['domain'], statuses)
        if domain
    }

    def get_article_reliability_counts(json_file):
        """Load one JSON file and count citations by reliability status per article."""
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        article_stats = []
        for article_dict in data:
            for article_title, domains in article_dict.items():
                if not isinstance(domains, dict):
                    continue
                counts = [0] * n_slots
                total_count = 0
                for domain, count in domains.items():
                    total_count += count
                    status = reliability_lookup.get(domain) if domain else None
                    counts[status_slots.get(status, other_slot)] += count
                article_stats.append((article_title, *counts, total_count))
        return article_stats

    # Get per-article reliability counts
    wp_articles = get_article_reliability_counts(f'{result_dir}/wp{fsuffix}')
    grok_articles = get_article_reliability_counts(f'{result_dir}/grok{fsuffix}')

    print(f"Loaded {len(wp_articles)} articles from Wikipedia")
    print(f"Loaded {len(grok_articles)} articles from Grokipedia\n")

    # Index by title (a repeated title keeps its last entry)
    wp_dict = {art[0]: art for art in wp_articles}
    grok_dict = {art[0]: art for art in grok_articles}

    # An article missing on one site contributes all-zero counts for that site.
    blank = (0,) * (n_slots + 1)
    article_maximals = []
    for article in set(wp_dict) | set(grok_dict):
        wp_counts = wp_dict.get(article, (article,) + blank)[1:]
        grok_counts = grok_dict.get(article, (article,) + blank)[1:]

        wp_rel, wp_unrel = wp_counts[0], wp_counts[1]
        grok_rel, grok_unrel = grok_counts[0], grok_counts[1]
        maximal_value = (wp_rel - wp_unrel) - (grok_unrel - grok_rel)

        article_maximals.append((article, *wp_counts, *grok_counts, maximal_value))

    # Set iteration order differs between runs; sort for a reproducible result.
    article_maximals.sort(key=lambda row: (-row[-1], row[0]))
    return article_maximals

# Column names for the tuples returned by find_reliability_shift_maximal:
# title, seven counts per site (Wikipedia first, then Grokipedia), and the 'maximal' value.
columns = (
    ['title']
    + [
        f'{site}_{field}'
        for site in ('wp', 'grok')
        for field in ('reliable', 'unreliable', 'blacklist', 'no_consensus', 'deprecated', 'other', 'total')
    ]
    + ['maximal']
)

In [None]:
# Build the per-article reliability tables for three data sets, selected by file suffix:
# the default '_domains.json', '_domains_w_license.json' and '_domains_wo_license.json'.
article_maximals = find_reliability_shift_maximal(reliability_df)
max_reliability_shift_df = pd.DataFrame(article_maximals, columns=columns)
article_maximals_w_license = find_reliability_shift_maximal(reliability_df, fsuffix='_domains_w_license.json')
max_reliability_shift_df_w_license = pd.DataFrame(article_maximals_w_license, columns=columns)
article_maximals_wo_license = find_reliability_shift_maximal(reliability_df, fsuffix='_domains_wo_license.json')
max_reliability_shift_df_wo_license = pd.DataFrame(article_maximals_wo_license, columns=columns)

In [None]:
# Persist the per-article reliability counts for the default data set.
max_reliability_shift_df.to_csv(f'{RESULT_DIR}/reliability_citation_diff.csv', index=False)

In [None]:
def plot_reliability_charts(filtered_df, fsuffix='', show=True, title=None):
    """
    Plot a two-panel comparison of citation reliability on Wikipedia vs Grokipedia.

    1. Stacked bars: proportion of citations in each reliability category for each site,
       with translucent bands connecting the matching segments to illustrate the change.
    2. Grouped bars: percentage of articles containing at least one source of each
       category (excluding 'other') for each site.

    Also prints the blacklist counts and the normalised proportions, and saves the
    figure to '../graphics/overall_grok_wp_cite_composition_{fsuffix}.pdf'.

    Parameters:
        filtered_df (pd.DataFrame): One row per article with 'wp_<category>' and
            'grok_<category>' count columns, typically filtered on articles of interest.
        fsuffix (str): Text inserted into the output file name.
        show (bool): If True, calls plt.show() at end.
        title (str): Optional title for the whole figure; a default is used when None.
    """

    import matplotlib.pyplot as plt
    import numpy as np
    from matplotlib.patches import Patch

    # --- Setup categories, labels, colors ---
    column_order = [
        'reliable', 'unreliable', 'blacklist', 'no_consensus', 'deprecated', 'other'
    ]
    display_names = {
        'reliable': 'Generally reliable',
        'unreliable': 'Generally unreliable',
        'blacklist': 'Blacklisted',
        'no_consensus': 'No consensus',
        'deprecated': 'Deprecated',
        'other': 'Other'
    }
    color_map = {
        'reliable': 'green',
        'unreliable': 'red',
        'blacklist': 'black',
        'no_consensus': 'yellow',
        'deprecated': 'orange',
        'other': 'grey'
    }

    # --- Total citations per site and category, keyed like the input columns ---
    agg = {
        f'{site}_{category}': filtered_df[f'{site}_{category}'].sum()
        for site in ('wp', 'grok')
        for category in column_order
    }

    # Print the number of blacklisted sources
    print(f"Number of blacklisted sources (Wikipedia): {agg['wp_blacklist']}")
    print(f"Number of blacklisted sources (Grokipedia): {agg['grok_blacklist']}")

    # Print the number of pages/articles with at least one blacklisted source
    wp_pages_with_blacklist = (filtered_df['wp_blacklist'] > 0).sum()
    grok_pages_with_blacklist = (filtered_df['grok_blacklist'] > 0).sum()
    print(f"Number of pages with at least one blacklisted source (Wikipedia): {wp_pages_with_blacklist}")
    print(f"Number of pages with at least one blacklisted source (Grokipedia): {grok_pages_with_blacklist}")

    # Table of shape: index=['Wikipedia', 'Grokipedia'], columns=column_order,
    # normalised per row; a site with zero citations yields all-zero proportions.
    prop_df = pd.DataFrame(
        [[agg[f'wp_{k}'] for k in column_order], [agg[f'grok_{k}'] for k in column_order]],
        columns=column_order,
        index=['Wikipedia', 'Grokipedia']
    )
    prop_df_norm = prop_df.div(prop_df.sum(axis=1), axis=0).fillna(0)

    # --- Stacked bars with diagonal overlays ---
    labels = ['Wikipedia', 'Grokipedia']
    x = np.arange(len(labels))
    bar_sep = 0.09  # extra offset pushing each bar away from its tick
    width = 0.18    # bar width

    fig, axs = plt.subplots(1, 2, figsize=(16, 7), gridspec_kw={'width_ratios': [1, 1.7]})

    # Set figure title (for whole figure)
    plot_title = title if title is not None else "Source Reliability Category Proportion: Wikipedia vs Grokipedia"
    fig.suptitle(plot_title, fontsize=18, y=0.98)
    ax = axs[0]

    print("Wikipedia proportions:")
    print(prop_df_norm.loc['Wikipedia'])
    print("\nGrokipedia proportions:")
    print(prop_df_norm.loc['Grokipedia'])

    # Bar centres: the Wikipedia bar sits left of tick 0, the Grokipedia bar right of tick 1.
    wp_center = x[0] - width/2 - bar_sep/2
    grok_center = x[1] + width/2 + bar_sep/2
    # Running top of each stack; both stacks are built in the same category order.
    bottoms = [0, 0]
    for col in column_order:
        color = color_map.get(col, 'grey')
        wp_prop = prop_df_norm.loc['Wikipedia', col]
        grok_prop = prop_df_norm.loc['Grokipedia', col]
        ax.bar(wp_center, wp_prop, width=width,
               bottom=bottoms[0], color=color, edgecolor='none', zorder=2, alpha=0.8)
        ax.bar(grok_center, grok_prop, width=width,
               bottom=bottoms[1], color=color, edgecolor='none', zorder=2, alpha=0.8)

        # Translucent band from this segment on the Wikipedia side to the same
        # segment on the Grokipedia side, drawn behind the bars (zorder=1).
        ax.fill_between(
            [x[0] - width/2, x[1] + width/2],
            [bottoms[0] + wp_prop, bottoms[1] + grok_prop],
            [bottoms[0], bottoms[1]],
            color=color, alpha=0.25, zorder=1, linewidth=0
        )
        bottoms[0] += wp_prop
        bottoms[1] += grok_prop

    # Set axis ticks and labels
    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=16)
    ax.set_ylabel("Proportion of Citations", fontsize=16)
    ax.set_title("Source Status Proportion: Wikipedia vs Grokipedia", fontsize=16)

    # With width=0.18 and bar_sep=0.09 the bars are centred at -0.135 and 1.135,
    # i.e. they span [-0.225, -0.045] and [1.045, 1.225]; these limits leave a
    # 0.005 margin on each side.
    ax.set_xlim(-0.23, 1.23)
    ax.set_ylim(bottom=0, top=1.01)
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)

    # Custom legend patch (color by status), placed inside the plot
    legend_elements = [Patch(facecolor=color_map.get(col, 'grey'), label=display_names.get(col, col), alpha=0.8) for col in column_order]
    ax.legend(handles=legend_elements, title='Source Status', loc='upper center', framealpha=0.9)

    # NOTE(review): fig.tight_layout() is called further down and recomputes the subplot
    # spacing, so these values may not be what ends up in the saved figure -- confirm.
    fig.subplots_adjust(wspace=0.15, left=0.05, right=0.97, top=0.92, bottom=0.1)

    # ---- Second panel: % of articles containing at least 1 source of each type (not 'other') ----

    ax2 = axs[1]
    main_types = ['reliable', 'unreliable', 'blacklist', 'no_consensus', 'deprecated']
    type_labels = [display_names[t] for t in main_types]
    bar_x = np.arange(len(main_types))
    bar_width = 0.36

    n_articles = len(filtered_df)
    # An empty frame has zero matching articles everywhere; divide by 1 so the
    # percentages are 0 instead of NaN.
    denominator = n_articles if n_articles > 0 else 1
    percentages = {'Wikipedia': [], 'Grokipedia': []}
    for source_type in main_types:
        wp_count = (filtered_df[f'wp_{source_type}'] > 0).sum()
        grok_count = (filtered_df[f'grok_{source_type}'] > 0).sum()
        percentages['Wikipedia'].append(wp_count / denominator * 100)
        percentages['Grokipedia'].append(grok_count / denominator * 100)

    # Plot as side-by-side bars
    ax2.bar(bar_x - bar_width/2, percentages['Wikipedia'], bar_width,
           label='Wikipedia', color="#4977bc", edgecolor='black', alpha=0.7)
    ax2.bar(bar_x + bar_width/2, percentages['Grokipedia'], bar_width,
           label='Grokipedia', color="#e86b54", edgecolor='black', alpha=0.7)

    # Value labels just above each bar
    for i, (wp, gk) in enumerate(zip(percentages['Wikipedia'], percentages['Grokipedia'])):
        ax2.text(i - bar_width/2, wp + 1, f"{wp:.1f}%", ha='center', va='bottom', fontsize=14, color="#10426b")
        ax2.text(i + bar_width/2, gk + 1, f"{gk:.1f}%", ha='center', va='bottom', fontsize=14, color="#7a230c")

    ax2.set_xticks(bar_x)
    ax2.set_xticklabels(type_labels, rotation=14, fontsize=14)
    ax2.set_ylim(0, 105)
    ax2.set_ylabel("Percent of Articles", fontsize=16)
    ax2.set_title("% of Articles Citing Any Source in Category", fontsize=16)
    ax2.legend(loc='upper right', fontsize=14)
    ax2.grid(axis='y', linestyle=':', alpha=0.4)

    fig.tight_layout()
    fig.savefig(f'../graphics/overall_grok_wp_cite_composition_{fsuffix}.pdf')

    if show:
        plt.show()


In [None]:
# One figure per data set. The file name is built as
# 'overall_grok_wp_cite_composition_{fsuffix}.pdf', so the first call (fsuffix='') saves
# '..._composition_.pdf' and the other two save '..._composition__w_license.pdf' and
# '..._composition__wo_license.pdf' (note the double underscore).
plot_reliability_charts(max_reliability_shift_df, title='Source Reliability Category Comparison (all articles)')
plot_reliability_charts(max_reliability_shift_df_w_license, fsuffix='_w_license', title='Source Reliability Category Comparison (all articles with CC-license)')
plot_reliability_charts(max_reliability_shift_df_wo_license, fsuffix='_wo_license', title='Source Reliability Category Comparison (all articles without CC-license)')

In [None]:
# Writes max_reliability_shift_df to reliability_citation_diff.csv under RESULT_DIR
# ('../results'), the same file an earlier cell already exports.
max_reliability_shift_df.to_csv(f'{RESULT_DIR}/reliability_citation_diff.csv', index=False)

### Using the Lin et al. reliability ratings

In [None]:
# Load the Lin et al. rating table from the supplemental data directory.
lin_reliability = pd.read_csv(f'{SUPP_DATA_DIR}/news_reliability/LinRating_Join.csv')

In [None]:
# Expose the 'pc1' column under the name 'reliability_score', the column
# find_reliability_shift_by_buckets looks for first.
lin_reliability['reliability_score'] = lin_reliability['pc1']

In [None]:
def find_reliability_shift_by_buckets(lin_reliability, result_dir='../results/overall/domains', fsuffix='_domains.json'):
    """
    Count citations per reliability-score bucket for every article on Wikipedia and
    Grokipedia and compute the per-bucket and weighted shifts between the two sites.

    Scores are binned into five buckets: [0, 0.2), [0.2, 0.4), [0.4, 0.6), [0.6, 0.8)
    and [0.8, 1.0]. Citations whose domain has no score, a NaN score, or a score
    outside [0, 1] are counted as 'other'.

    Args:
        lin_reliability: DataFrame with a 'domain' column and a 'reliability_score'
            column (or, if that is absent, a 'pc1' column).
        result_dir: Directory containing the 'wp{fsuffix}' and 'grok{fsuffix}' files.
        fsuffix: File-name suffix selecting the data set.

    Returns:
        List of 21-tuples
        (title,
         wp bucket counts 0..4, wp_other, wp_total,
         grok bucket counts 0..4, grok_other, grok_total,
         shift per bucket 0..4 (Wikipedia count minus Grokipedia count),
         weighted_shift),
        sorted by weighted_shift in descending order (ties broken by title) so that
        the result is identical from run to run.
    """
    # Literal edges: computing them as i * 0.2 yields 0.6000000000000001 for the
    # fourth edge, which would put a score of exactly 0.6 into the 0.4-0.6 bucket.
    bucket_edges = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
    n_buckets = len(bucket_edges) - 1

    def get_bucket(score):
        """Return the bucket index for a score, or None when it cannot be binned."""
        if pd.isna(score):
            return None
        for idx in range(n_buckets):
            if bucket_edges[idx] <= score < bucket_edges[idx + 1]:
                return idx
        # The last bucket is closed on the right so that a score of 1.0 is kept.
        if score == bucket_edges[-1]:
            return n_buckets - 1
        return None

    # Build the domain -> bucket lookup once; it is shared by both files.
    # Domains without a usable score are left out and therefore count as 'other'.
    score_col = 'reliability_score' if 'reliability_score' in lin_reliability.columns else 'pc1'
    scores = lin_reliability.get(score_col)
    bucket_lookup = {}
    if scores is not None:
        for domain, score in zip(lin_reliability['domain'], scores):
            if domain and pd.notna(score):
                bucket_lookup[domain] = get_bucket(score)

    def get_article_reliability_bucket_counts(json_file):
        """Load one JSON file and count citations by reliability bucket per article."""
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        article_stats = []
        for article_dict in data:
            for article_title, domains in article_dict.items():
                if not isinstance(domains, dict):
                    continue
                bucket_counts = [0] * n_buckets
                other_count = 0
                total_count = 0
                for domain, count in domains.items():
                    total_count += count
                    bucket_idx = bucket_lookup.get(domain) if domain else None
                    if bucket_idx is None:
                        other_count += count
                    else:
                        bucket_counts[bucket_idx] += count
                article_stats.append((article_title, *bucket_counts, other_count, total_count))
        return article_stats

    # Get per-article reliability bucket counts
    wp_articles = get_article_reliability_bucket_counts(f'{result_dir}/wp{fsuffix}')
    grok_articles = get_article_reliability_bucket_counts(f'{result_dir}/grok{fsuffix}')

    print(f"Loaded {len(wp_articles)} articles from Wikipedia")
    print(f"Loaded {len(grok_articles)} articles from Grokipedia\n")

    # Index by title (a repeated title keeps its last entry)
    wp_dict = {art[0]: art for art in wp_articles}
    grok_dict = {art[0]: art for art in grok_articles}

    # An article missing on one site contributes all-zero counts for that site.
    blank = (0,) * (n_buckets + 2)
    article_bucket_shifts = []
    for article in set(wp_dict) | set(grok_dict):
        wp_counts = wp_dict.get(article, (article,) + blank)[1:]
        grok_counts = grok_dict.get(article, (article,) + blank)[1:]

        # Shift per bucket: WP_count - Grok_count
        # (positive means WP has more, negative means Grok has more)
        shifts = [wp_counts[idx] - grok_counts[idx] for idx in range(n_buckets)]

        # Weighted sum with weights 1..5, so shifts in higher-score buckets count more
        weighted_shift = sum(shift * (idx + 1) for idx, shift in enumerate(shifts))

        article_bucket_shifts.append((article, *wp_counts, *grok_counts, *shifts, weighted_shift))

    # Set iteration order differs between runs; sort for a reproducible result.
    article_bucket_shifts.sort(key=lambda row: (-row[-1], row[0]))
    return article_bucket_shifts

# Column names for the tuples returned by find_reliability_shift_by_buckets:
# title, seven counts per site (Wikipedia first, then Grokipedia), the five
# per-bucket shifts, and the weighted shift.
bucket_columns = (
    ['title']
    + [
        f'{site}_{field}'
        for site in ('wp', 'grok')
        for field in (
            'bucket_0_0.2', 'bucket_0.2_0.4', 'bucket_0.4_0.6', 'bucket_0.6_0.8', 'bucket_0.8_1.0',
            'other', 'total',
        )
    ]
    + [
        f'shift_{field}'
        for field in ('bucket_0_0.2', 'bucket_0.2_0.4', 'bucket_0.4_0.6', 'bucket_0.6_0.8', 'bucket_0.8_1.0')
    ]
    + ['weighted_shift']
)


In [None]:
# Build the per-article score-bucket tables for three data sets, selected by file suffix:
# the default '_domains.json', '_domains_w_license.json' and '_domains_wo_license.json'.
article_maximals_lin = find_reliability_shift_by_buckets(lin_reliability)
max_reliability_shift_df_lin = pd.DataFrame(article_maximals_lin, columns=bucket_columns)
article_maximals_w_license_lin = find_reliability_shift_by_buckets(lin_reliability, fsuffix='_domains_w_license.json')
max_reliability_shift_df_w_license_lin = pd.DataFrame(article_maximals_w_license_lin, columns=bucket_columns)
article_maximals_wo_license_lin = find_reliability_shift_by_buckets(lin_reliability, fsuffix='_domains_wo_license.json')
max_reliability_shift_df_wo_license_lin = pd.DataFrame(article_maximals_wo_license_lin, columns=bucket_columns)

In [None]:
def plot_reliability_bucket_charts(filtered_df, fsuffix='', show=True, title=None):
    """
    Plot and save a two-panel Wikipedia-vs-Grokipedia reliability bucket figure.

    1. Left panel (stacked bars + diagonal overlay): proportion of citations in each
       reliability bucket for Wikipedia and Grokipedia, with translucent fills joining
       the two bars to illustrate the change.
    2. Right panel (grouped bars): percentage of articles containing at least one
       source in each scored bucket for Wikipedia and Grokipedia.

    Side effects: prints one summary line about the 0-0.2 bucket and writes the
    figure to ``../graphics/overall_grok_wp_cite_composition_lin_{fsuffix}.pdf``.

    Parameters:
        filtered_df (pd.DataFrame): DataFrame with bucket columns (from find_reliability_shift_by_buckets)
        fsuffix (str): Suffix for output filename
        show (bool): If True, calls plt.show() at end; otherwise the figure is
            closed after saving so repeated calls do not accumulate open figures.
        title (str): Optional custom title for the whole figure. If None, uses default title.

    Returns:
        None
    """
    # plt, np, pd and Patch come from the notebook's top import cell; only these
    # two helpers are not imported there.
    from matplotlib.colors import to_hex, LinearSegmentedColormap

    # --- Setup buckets, labels, colors ---
    bucket_labels = ['0.0-0.2', '0.2-0.4', '0.4-0.6', '0.6-0.8', '0.8-1.0', 'other']
    main_buckets = bucket_labels[:-1]  # scored buckets only ('other' = no score)
    display_names = {
        '0.0-0.2': '0.0-0.2',
        '0.2-0.4': '0.2-0.4',
        '0.4-0.6': '0.4-0.6',
        '0.6-0.8': '0.6-0.8',
        '0.8-1.0': '0.8-1.0',
        'other': 'No score'
    }

    # Red (0.0) -> yellow (0.5) -> green (1.0) colormap; unscored sources are grey.
    spect_cmap = LinearSegmentedColormap.from_list(
        "green_yellow_red", [(0.0, "#D73027"), (0.5, "#FEE08B"), (1.0, "#1A9850")]  # red, yellow, green
    )

    # Each scored bucket is coloured by the colormap value at its midpoint.
    bucket_midpoints = {
        '0.0-0.2': 0.1,  # Red end
        '0.2-0.4': 0.3,  # Red-yellow transition
        '0.4-0.6': 0.5,  # Yellow (middle)
        '0.6-0.8': 0.7,  # Yellow-green transition
        '0.8-1.0': 0.9,  # Green end
    }
    color_map = {label: to_hex(spect_cmap(mid)) for label, mid in bucket_midpoints.items()}
    color_map['other'] = 'grey'

    # Map bucket labels to the column-name suffixes used in filtered_df.
    bucket_to_col = {
        '0.0-0.2': '0_0.2',
        '0.2-0.4': '0.2_0.4',
        '0.4-0.6': '0.4_0.6',
        '0.6-0.8': '0.6_0.8',
        '0.8-1.0': '0.8_1.0'
    }

    def bucket_column(source, bucket_label):
        # 'other' has its own column; scored buckets use '<source>_bucket_<range>'.
        if bucket_label == 'other':
            return f'{source}_other'
        return f'{source}_bucket_{bucket_to_col[bucket_label]}'

    # --- Aggregate citation counts per bucket: one row per encyclopedia ---
    wp_row = [filtered_df[bucket_column('wp', label)].sum() for label in bucket_labels]
    grok_row = [filtered_df[bucket_column('grok', label)].sum() for label in bucket_labels]
    prop_df = pd.DataFrame(
        [wp_row, grok_row],
        columns=bucket_labels,
        index=['Wikipedia', 'Grokipedia']
    )
    # Row-normalise to proportions; a row with zero citations becomes all zeros.
    prop_df_norm = prop_df.div(prop_df.sum(axis=1), axis=0).fillna(0)

    # --- Plotting stacked bars with diagonal overlays ---
    labels = ['Wikipedia', 'Grokipedia']
    x = np.arange(len(labels))
    bar_sep = 0.09
    width = 0.18

    fig, axs = plt.subplots(1, 2, figsize=(16, 7), gridspec_kw={'width_ratios': [1, 1.7]})
    ax = axs[0]

    # Running top of each stack: [Wikipedia, Grokipedia].
    bottoms = [0, 0]
    for bucket_label in bucket_labels:
        color = color_map.get(bucket_label, 'grey')
        wp_prop = prop_df_norm.loc['Wikipedia', bucket_label]
        grok_prop = prop_df_norm.loc['Grokipedia', bucket_label]
        ax.bar(x[0] - width/2 - bar_sep/2, wp_prop, width=width,
               bottom=bottoms[0], color=color, edgecolor='none', zorder=2)
        ax.bar(x[1] + width/2 + bar_sep/2, grok_prop, width=width,
               bottom=bottoms[1], color=color, edgecolor='none', zorder=2)

        # Translucent band connecting this bucket's segment in the two bars.
        ax.fill_between(
            [x[0] - width/2, x[1] + width/2],
            [bottoms[0] + wp_prop, bottoms[1] + grok_prop],
            [bottoms[0], bottoms[1]],
            color=color, alpha=0.25, zorder=1, linewidth=0
        )
        bottoms[0] += wp_prop
        bottoms[1] += grok_prop

    # Set axis ticks and labels
    ax.set_xticks(x)
    ax.set_xticklabels(labels, fontsize=16)
    ax.set_ylabel("Proportion of Citations", fontsize=16)
    ax.set_title("Source Reliability Score Proportion: Wikipedia vs Grokipedia", fontsize=18)

    # Make axis tight with bars
    ax.set_xlim(-0.23, 1.23)
    ax.set_ylim(bottom=0, top=1.01)
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)

    # Legend lists buckets in reverse so it reads top-down like the stack.
    legend_elements = [
        Patch(facecolor=color_map.get(bucket_label, 'grey'),
              label=display_names.get(bucket_label, bucket_label),
              alpha=0.55)
        for bucket_label in reversed(bucket_labels)
    ]
    ax.legend(handles=legend_elements, title='Reliability Score', loc='upper center', framealpha=0.9, fontsize=16)

    # Set figure title (for whole figure)
    plot_title = title if title is not None else "Source Reliability Score Proportion: Wikipedia vs Grokipedia"
    fig.suptitle(plot_title, fontsize=16, y=0.98)

    # NOTE(review): fig.tight_layout() below recomputes the subplot spacing, so this
    # call may have no visible effect — confirm before removing it.
    fig.subplots_adjust(wspace=0.15, left=0.05, right=0.97, top=0.88, bottom=0.1)

    # ---- Second panel: % of articles containing at least 1 source in each bucket ----
    ax2 = axs[1]
    bucket_display_labels = [display_names[b] for b in main_buckets]
    bar_x = np.arange(len(main_buckets))
    bar_width = 0.36

    n_articles = len(filtered_df)

    def articles_with_bucket(column):
        # Number of rows (articles) with at least one citation in `column`.
        return (filtered_df[column] > 0).sum()

    def percent_of_articles(count):
        # Guard the empty-frame case so the bars are 0% instead of NaN.
        return count / n_articles * 100 if n_articles else 0.0

    wp_0_0_2_num = articles_with_bucket('wp_bucket_0_0.2')
    grok_0_0_2_num = articles_with_bucket('grok_bucket_0_0.2')
    print(f"Number of articles with at least one 0-0.2 source: Wikipedia: {wp_0_0_2_num}, Grokipedia: {grok_0_0_2_num}")

    percentages = {
        'Wikipedia': [percent_of_articles(articles_with_bucket(bucket_column('wp', b))) for b in main_buckets],
        'Grokipedia': [percent_of_articles(articles_with_bucket(bucket_column('grok', b))) for b in main_buckets],
    }

    # Side-by-side bars: blue for Wikipedia, red for Grokipedia.
    wp_color = "#4977bc"
    grok_color = "#e86b54"
    ax2.bar(bar_x - bar_width/2, percentages['Wikipedia'], bar_width,
            label='Wikipedia', color=wp_color, edgecolor='black', alpha=0.7)
    ax2.bar(bar_x + bar_width/2, percentages['Grokipedia'], bar_width,
            label='Grokipedia', color=grok_color, edgecolor='black', alpha=0.7)

    # Value labels just above each bar.
    for i, (wp, gk) in enumerate(zip(percentages['Wikipedia'], percentages['Grokipedia'])):
        ax2.text(i - bar_width/2, wp + 1, f"{wp:.1f}%", ha='center', va='bottom', fontsize=10, color="#10426b")
        ax2.text(i + bar_width/2, gk + 1, f"{gk:.1f}%", ha='center', va='bottom', fontsize=10, color="#7a230c")

    ax2.set_xticks(bar_x)
    ax2.set_xticklabels(bucket_display_labels, rotation=14, fontsize=16)
    ax2.set_ylim(0, 105)
    ax2.set_ylabel("Percent of Articles", fontsize=16)
    ax2.set_title("% of Articles Citing Any Source in Reliability Bucket", fontsize=18)
    ax2.legend(loc='upper right', fontsize=16)
    ax2.grid(axis='y', linestyle=':', alpha=0.4)

    fig.tight_layout()
    fig.savefig(f'../graphics/overall_grok_wp_cite_composition_lin_{fsuffix}.pdf')

    if show:
        plt.show()
    else:
        # Nothing else holds a reference to the figure; release it.
        plt.close(fig)


In [None]:
# Render the comparison figure for all articles, then for the with-license and
# without-license subsets. Each call also saves
# ../graphics/overall_grok_wp_cite_composition_lin_{fsuffix}.pdf, so the first call
# (default fsuffix='') writes "..._lin_.pdf" and the other two write
# "..._lin__w_license.pdf" and "..._lin__wo_license.pdf" (double underscore).
plot_reliability_bucket_charts(max_reliability_shift_df_lin, title='Source Reliability Score Comparison (all articles)')
plot_reliability_bucket_charts(max_reliability_shift_df_w_license_lin, fsuffix='_w_license', title='Source Reliability Score Comparison (all articles with CC-license)')
plot_reliability_bucket_charts(max_reliability_shift_df_wo_license_lin, fsuffix='_wo_license', title='Source Reliability Score Comparison (all articles without CC-license)')

## Examining Book Citation Patterns in Grokipedia Articles

Finding articles with book citations and examining their reference structure.


**TODO:** this section has not been written yet.

## Getting Twitter / X usernames that were cited

In [23]:
import pickle
import re
import json
from collections import Counter, defaultdict
from tqdm import tqdm

def get_object_at_offset(path, offset):
    """
    Read the single JSON record that starts at `offset` in a line-delimited JSON file.

    Parameters:
        path: Path of the newline-delimited JSON file.
        offset: Position of the first byte of the record's line.
            NOTE(review): assumes the cached index stores byte offsets — confirm
            against the code that builds the index.

    Returns:
        The decoded JSON value of that line.

    Raises:
        ValueError: If `offset` is None. A line that is not valid UTF-8 or not
            valid JSON raises UnicodeDecodeError / json.JSONDecodeError, which
            are both ValueError subclasses.
    """
    if offset is None:
        raise ValueError("Offset must not be None. Check the index lookup.")
    # Seek in binary mode: text-mode files only guarantee seek() for values that
    # tell() returned, whereas a byte offset is always well defined here.
    with open(path, "rb") as f:
        f.seek(offset)
        raw_line = f.readline()
    return json.loads(raw_line.decode("utf-8"))

# Ordered (site, compiled regex) pairs for pulling an account identifier out of a
# social-media URL. The identifier is capture group 1 for every site except
# facebook.com, where group 1 sits inside the negative lookahead that rejects
# reserved paths and the identifier is group 2.
ALL_PATTERNS = [
    # The twitter.com pattern accepts both the twitter.com and x.com hosts.
    ("twitter.com", re.compile(r"https?://(?:www\.)?(?:twitter\.com|x\.com)/([A-Za-z0-9_]{1,15})(?:[/?#]|$)")),
    ("x.com", re.compile(r"https?://(?:www\.)?x\.com/([A-Za-z0-9_]{1,15})(?:[/?#]|$)")),
    ("facebook.com", re.compile(r"https?://(?:www\.)?facebook\.com/(?!(pages|groups|events|marketplace|watch|gaming|live|photo\.php)(?:/|$))([A-Za-z0-9_.]{5,})(?:[/?#]|$)")),
    # Only /user/, /c/ and /channel/ URLs are matched for youtube.com.
    ("youtube.com", re.compile(r"https?://(?:www\.)?youtube\.com/(?:(?:user|c|channel)/([A-Za-z0-9._\-]{1,64}))(?:[/?#]|$)")),
    ("youtu.be", re.compile(r"https?://youtu\.be/([A-Za-z0-9_\-]{6,})(?:[/?#]|$)")),
    ("reddit.com", re.compile(r"https?://(?:www\.)?reddit\.com/user/([A-Za-z0-9_\-]{1,20})(?:[/?#]|$)")),
    ("instagram.com", re.compile(r"https?://(?:www\.)?instagram\.com/([A-Za-z0-9_.]{1,30})(?:[/?#]|$)")),
    ("tiktok.com", re.compile(r"https?://(?:www\.)?tiktok\.com/@([A-Za-z0-9_.]{1,30})(?:[/?#]|$)")),
    ("truthsocial.com", re.compile(r"https?://(?:www\.)?truthsocial\.com/@([A-Za-z0-9_.]{1,30})(?:[/?#]|$)")),
    ("bsky.app", re.compile(r"https?://(?:www\.)?bsky\.app/profile/([A-Za-z0-9_.\-]+)(?:[/?#]|$)")),
    ("pinterest.com", re.compile(r"https?://(?:www\.)?pinterest\.com/([A-Za-z0-9_/.\-]{3,})/?(?:[/?#]|$)")),
]

# Same patterns keyed by site name (insertion order matches ALL_PATTERNS).
SITE_PATTERNS = dict(ALL_PATTERNS)

# Ordered (site, compiled regex) pairs for shareable AI-conversation links.
# Capture group 1 is the share identifier; the scheme and "www." are optional.
ALL_SHARE_PATTERNS = [
    # ChatGPT links look like: https://chatgpt.com/share/<uuid>
    ("chatgpt.com", re.compile(r"(?:https?://)?(?:www\.)?chatgpt\.com/share/([0-9a-fA-F\-]{10,})")),
    # Claude links look like: https://claude.ai/share/<uuid>
    ("claude.ai", re.compile(r"(?:https?://)?(?:www\.)?claude\.ai/share/([0-9a-fA-F\-]{10,})")),
    # More generative-AI share links can be appended below.
    ("chat.deepseek.com", re.compile(r"(?:https?://)?(?:www\.)?chat\.deepseek\.com/share/([0-9a-fA-F\-]{10,})")),
    ("twitter.com", re.compile(r"(?:https?://)?(?:www\.)?twitter\.com/i/grok/share/([0-9a-fA-F\-]{10,})")),
    ("x.com", re.compile(r"(?:https?://)?(?:www\.)?x\.com/i/grok/share/([0-9a-fA-F\-]{10,})")),
    ("perplexity.ai", re.compile(r"(?:https?://)?(?:www\.)?perplexity\.ai/search/([0-9a-fA-F\-]{10,})")),
    ("copilot.microsoft.com", re.compile(r"(?:https?://)?(?:www\.)?copilot\.microsoft\.com/shares/([0-9a-fA-F\-]{10,})")),
]

# Same patterns keyed by site name (insertion order matches ALL_SHARE_PATTERNS).
SHAREABLE_AI_PATTERNS = dict(ALL_SHARE_PATTERNS)


def extract_ai_share_links(urls, print_debug=False):
    """
    Collect AI share links, plus Twitter/X links to the "grok" account, from URLs.

    For each non-empty URL the share patterns are tried in order and the first
    hit is recorded. Only when no share pattern matches is the URL checked for a
    twitter.com / x.com link whose username equals "grok" (case-insensitive).

    Args:
        urls: Iterable of URL strings; falsy entries are ignored.
        print_debug: If True, print a debug line for every recorded link.

    Returns:
        List of tuples: [(ai_site, share_id, url), ...]
        Grok account links are recorded as ("twitter.com/grok", username, url)
        or ("x.com/grok", username, url).
    """
    # Profile-style Twitter/X URL; group 1 is the username.
    profile_re = re.compile(r"https?://(?:www\.)?(?:twitter\.com|x\.com)/([A-Za-z0-9_]{1,15})(?:[/?#]|$)")
    collected = []

    for url in urls:
        if not url:
            continue

        # --- Share links take priority over profile links ---
        share_hit = None
        for ai_site, ai_pat in ALL_SHARE_PATTERNS:
            domain = ai_site.split('/')[0]
            if domain not in url:
                continue
            ai_match = ai_pat.search(url)
            if ai_match:
                share_hit = (ai_site, ai_match.group(1), url)
                break  # first matching pattern wins; avoids duplicate entries

        if share_hit is not None:
            collected.append(share_hit)
            if print_debug:
                print(f"    [DEBUG] Found AI share: {share_hit[0]}, id: {share_hit[1]}, url: {url}")
            continue

        # --- Otherwise look for a link to the "grok" account on Twitter/X ---
        if "twitter.com" not in url and "x.com" not in url:
            continue
        if "/i/grok/share/" in url or "/i/share/" in url:
            continue  # share-style path that the patterns above did not accept
        profile_match = profile_re.search(url)
        if not profile_match:
            continue
        username = profile_match.group(1)
        if username.lower() != "grok":
            continue

        site_key = "twitter.com/grok" if "twitter.com" in url else "x.com/grok"
        collected.append((site_key, username, url))
        if print_debug:
            print(f"    [DEBUG] Found Grok username link: {site_key}, username: {username}, url: {url}")

    return collected

def get_ai_share_links_only(domains_data, idx_path, data_path, test_mode=False, print_debug=False):
    """
    Find AI share links cited by articles, without extracting social usernames.

    Only articles with at least one cited domain that contains a
    SHAREABLE_AI_PATTERNS key as a substring are read from `data_path`; their
    reference URLs are passed to extract_ai_share_links.

    Args:
        domains_data: loaded from *_domains.json — either one {title: domains}
            dict or a list of such dicts.
        idx_path: path to the pickled index mapping titles to file offsets
        data_path: article data file (either grokipedia or wikipedia); the
            Wikipedia reference layout is used when the path contains 'wikipedia'.
        test_mode: if True, only process the first 1000 articles for a quick test.
        print_debug: if True, print debug messages

    Returns:
        dict mapping lower-cased, underscore-free titles to
        [(ai_site, share_id, url), ...]; titles without any link are omitted.
    """
    print(f"Opening index file: {idx_path}")
    with open(idx_path, "rb") as idx_file:
        # NOTE(review): pickle.load can execute code stored in the file — only
        # point idx_path at index caches you produced yourself.
        title_to_offset = pickle.load(idx_file)
    print(f"Index loaded. Number of entries: {len(title_to_offset)}")

    # Flatten the input into a list of (title, domains) pairs.
    if isinstance(domains_data, dict):
        articles = list(domains_data.items())
    else:
        articles = [pair for chunk in domains_data for pair in chunk.items()]
    print(f"items_list created with {len(articles)} articles")

    def _article_urls(i, title_orig, title_key):
        """Return the article's reference URLs, or None when it cannot be loaded."""
        offset = title_to_offset.get(title_orig)
        if offset is None:
            offset = title_to_offset.get(title_key)
        if offset is None:
            if print_debug:
                print(f"  WARNING: No index for title '{title_key}'")
            return None

        try:
            record = get_object_at_offset(data_path, offset)
        except Exception as e:
            print(f"  ERROR: Could not fetch article for '{title_key}' (offset: {offset}): {e}")
            return None

        # This follows the extraction logic in 05_preprocessing.ipynb
        if 'wikipedia' in data_path:
            references = record.get("references", [])
            if i < 5 and print_debug:
                print(f"  [DEBUG] First refs for '{title_key}': {references[:2]}")
            outer_key, inner_key = "metadata", "url"
        else:
            references = record.get("data", {}).get("references", [])
            if i < 5 and print_debug:
                print(f"  [DEBUG] First grok refs for '{title_key}': {references[:2]}")
            outer_key, inner_key = "link", "href"

        urls = []
        for entry in references:
            container = entry.get(outer_key) if isinstance(entry, dict) else None
            target = container.get(inner_key) if isinstance(container, dict) else None
            if target:
                urls.append(target)

        if i < 5 and print_debug:
            print(f"  [DEBUG] Extracted URLs for '{title_key}': {urls[:5]}")
        return urls

    shares_by_title = defaultdict(list)  # {title: [(ai_site, share_id, url), ...]}
    progress_total = min(1000, len(articles)) if test_mode else len(articles)

    with tqdm(total=progress_total, desc="Extracting AI share links", unit="article") as pbar:
        for i, (title_orig, domains) in enumerate(articles):
            if test_mode and i >= 1000:
                print("Test mode enabled. Breaking after 1000 records.")
                break

            title_key = title_orig.replace("_", " ").lower()

            # Only AI share domains make an article relevant here, not social domains.
            if any(ai_dom in dom for dom in domains for ai_dom in SHAREABLE_AI_PATTERNS.keys()):
                if i % 10000 == 0 and i != 0:
                    print(f"Processing index {i} / {len(articles)}: '{title_key}'...")

                urls = _article_urls(i, title_orig, title_key)
                if urls is not None:
                    for ai_site, share_id, url in extract_ai_share_links(urls, print_debug=print_debug):
                        shares_by_title[title_key].append((ai_site, share_id, url))

            # Every visited article advances the bar exactly once.
            pbar.update(1)

    print(f"Total articles with any AI share links: {len(shares_by_title)}")
    return dict(shares_by_title)

# --- Captured values that are not real usernames (all checks are case-sensitive) ---

# Spurious X/Twitter "usernames", e.g. produced by the X/Twitter link shortener.
SPURIOUS_X_USERNAMES = set(["PRODU", "search"])

# facebook.com path segments that are pages/endpoints rather than accounts.
SKIP_FACEBOOK_USERNAMES = {
    "photo.php", "story.php", "photo", "media", "notes", "business", "permalink.php", "search",
}
# Digits only (facebook.com/123...): very likely not a real username.
FACEBOOK_NUMERIC_RE = re.compile(r"^\d+$")

# instagram.com path segments that are content types rather than accounts.
SKIP_INSTAGRAM_USERNAMES = {"reel", "tv"}

# YouTube channel IDs such as "UCtWQDzuH1e84SebEyZN_aXw": "UC" followed by 22
# more characters (24 in total). These are IDs, not usernames.
YOUTUBE_CHANNEL_ID_RE = re.compile(r"^UC[\w-]{22}$")

# x.com "i" pseudo-username (used by i/grok/share/ and other internal paths):
# always skipped.
SKIP_X_USERNAMES = {"i"}

def get_social_usernames_with_counts(domains_data, idx_path, data_path, test_mode=False, print_debug=False):
    """
    Count the social-media usernames (and collect AI share links) cited per article.

    An article is read from `data_path` only when one of its cited domains is a
    SITE_PATTERNS key or contains a SHAREABLE_AI_PATTERNS key as a substring.

    Args:
        domains_data: loaded from *_domains.json — either one {title: domains}
            dict or a list of such dicts.
        idx_path: path to the pickled index mapping titles to file offsets
        data_path: article data file (either grokipedia or wikipedia); the
            Wikipedia reference layout is used when the path contains 'wikipedia'.
        test_mode: if True, only process the first 1000 articles for a quick test.
        print_debug: if True, print debug messages

    Returns:
        A 3-tuple ``(results, skipped_x_i, ai_shares_by_title)``:
          * results: {title: {site: {username: count}}} for articles with at
            least one accepted username,
          * skipped_x_i: [(username, url, title), ...] for Twitter/X URLs whose
            captured username is in SKIP_X_USERNAMES,
          * ai_shares_by_title: {title: [(ai_site, share_id, url), ...]}.
        Titles are lower-cased with underscores replaced by spaces.
    """
    print(f"Opening index file: {idx_path}")
    with open(idx_path, "rb") as idx_file:
        # NOTE(review): pickle.load can execute code stored in the file — only
        # point idx_path at index caches you produced yourself.
        title_to_offset = pickle.load(idx_file)
    print(f"Index loaded. Number of entries: {len(title_to_offset)}")

    # Flatten the input into a list of (title, domains) pairs.
    if isinstance(domains_data, dict):
        articles = list(domains_data.items())
    else:
        articles = [pair for chunk in domains_data for pair in chunk.items()]
    print(f"items_list created with {len(articles)} articles")

    results = {}
    skipped_x_i = []  # For optional debug reporting
    ai_shares_by_title = defaultdict(list)  # {title: [(ai_site, share_id, url), ...]}

    def _article_urls(i, title_orig, title_key):
        """Return the article's reference URLs, or None when it cannot be loaded."""
        offset = title_to_offset.get(title_orig)
        if offset is None:
            offset = title_to_offset.get(title_key)
        if offset is None:
            if print_debug:
                print(f"  WARNING: No index for title '{title_key}'")
            return None

        try:
            record = get_object_at_offset(data_path, offset)
        except Exception as e:
            print(f"  ERROR: Could not fetch article for '{title_key}' (offset: {offset}): {e}")
            return None

        # This follows the extraction logic in 05_preprocessing.ipynb
        if 'wikipedia' in data_path:
            references = record.get("references", [])
            if i < 5 and print_debug:
                print(f"  [DEBUG] First refs for '{title_key}': {references[:2]}")
            outer_key, inner_key = "metadata", "url"
        else:
            references = record.get("data", {}).get("references", [])
            if i < 5 and print_debug:
                print(f"  [DEBUG] First grok refs for '{title_key}': {references[:2]}")
            outer_key, inner_key = "link", "href"

        urls = []
        for entry in references:
            container = entry.get(outer_key) if isinstance(entry, dict) else None
            target = container.get(inner_key) if isinstance(container, dict) else None
            if target:
                urls.append(target)

        if i < 5 and print_debug:
            print(f"  [DEBUG] Extracted URLs for '{title_key}': {urls[:5]}")
        return urls

    def _vetted_username(site, match, url, title_key):
        """Return the username captured by `match`, or None when it must not be counted."""
        # Disallow youtube.com/watch etc. as username
        if site == "youtube.com" and "watch" in url and not ("/user/" in url or "/c/" in url or "/channel/" in url):
            return None
        # Disallow instagram.com/p/ as usernames
        if site == "instagram.com" and re.search(r"/p/[^/?#]+", url):
            return None

        # facebook.com pattern has group 2, rest have group 1
        uses_second_group = site == "facebook.com" and match.lastindex and match.lastindex > 1
        username = match.group(2) if uses_second_group else match.group(1)

        # Site-specific rejections
        if site == "facebook.com":
            if username in SKIP_FACEBOOK_USERNAMES:
                if print_debug:
                    print(f"    [DEBUG] Skipped facebook.com username '{username}' in URL: {url}")
                return None
            if FACEBOOK_NUMERIC_RE.match(username):
                if print_debug:
                    print(f"    [DEBUG] Skipped numeric facebook.com username '{username}' in URL: {url}")
                return None
        elif site == "instagram.com":
            if username in SKIP_INSTAGRAM_USERNAMES:
                if print_debug:
                    print(f"    [DEBUG] Skipped instagram.com username '{username}' in URL: {url}")
                return None
        elif site == "youtube.com":
            if YOUTUBE_CHANNEL_ID_RE.match(username):
                if print_debug:
                    print(f"    [DEBUG] Skipped youtube.com channel ID '{username}' in URL: {url}")
                return None
        elif site in {"x.com", "twitter.com"}:
            if username in SKIP_X_USERNAMES:
                # Kept for the caller: these are the x.com/i/... links.
                skipped_x_i.append((username, url, title_key))
                if print_debug:
                    print(f"    [DEBUG] Skipped x.com username '{username}' in URL: {url}")
                return None
            if username in SPURIOUS_X_USERNAMES:
                if print_debug:
                    print(f"    [DEBUG] Skipped spurious {site} username '{username}' from URL: {url}")
                return None

        return username

    def _count_usernames(urls, title_key):
        """Tally accepted usernames per site over all of an article's URLs."""
        tally = defaultdict(Counter)
        for url in urls:
            if not url:
                continue
            for site, pat in ALL_PATTERNS:
                if site not in url:
                    continue
                match = pat.search(url)
                if not match:
                    continue
                username = _vetted_username(site, match, url, title_key)
                if username:
                    tally[site][username] += 1
                    if print_debug:
                        print(f"    [DEBUG] Matched {site} username '{username}' in URL: {url}")
        return tally

    progress_total = min(1000, len(articles)) if test_mode else len(articles)
    with tqdm(total=progress_total, desc="Extracting social usernames", unit="article") as pbar:
        for i, (title_orig, domains) in enumerate(articles):
            if test_mode and i >= 1000:
                print("Test mode enabled. Breaking after 1000 records.")
                break

            title_key = title_orig.replace("_", " ").lower()

            is_relevant = any(
                dom in SITE_PATTERNS or any(ai_dom in dom for ai_dom in SHAREABLE_AI_PATTERNS.keys())
                for dom in domains
            )
            if is_relevant:
                if i % 10000 == 0 and i != 0:
                    print(f"Processing index {i} / {len(articles)}: '{title_key}'...")

                urls = _article_urls(i, title_orig, title_key)
                if urls is not None:
                    usernames_by_site = _count_usernames(urls, title_key)

                    for ai_site, share_id, url in extract_ai_share_links(urls, print_debug=print_debug):
                        ai_shares_by_title[title_key].append((ai_site, share_id, url))

                    # Only articles with at least one accepted username are stored.
                    if usernames_by_site:
                        if print_debug:
                            found = [(site, dict(counter)) for site, counter in usernames_by_site.items() if counter]
                            print(f"  Found usernames for {title_key}: {found}")
                        results[title_key] = {
                            site: dict(counter) for site, counter in usernames_by_site.items() if counter
                        }

            # Every visited article advances the bar exactly once.
            pbar.update(1)

    print(f"Total articles with any matching usernames: {len(results)}")
    print(f"Total articles with any AI share links: {len(ai_shares_by_title)}")
    return results, skipped_x_i, dict(ai_shares_by_title)

In [4]:
# Load the per-article cited-domain counts for Wikipedia and Grokipedia from the
# "overall" results folder.
# NOTE(review): this rebinds wp_domains / grok_domains, which the first analysis
# cell loaded from ../results/*.json — confirm both locations hold the same data.
with open('../results/overall/domains/wp_domains.json', 'r') as f:
    wp_domains = json.load(f)

with open('../results/overall/domains/grok_domains.json', 'r') as f:
    grok_domains = json.load(f)

In [None]:
# Extract cited social usernames for both corpora. The function returns
# (results, skipped_x_i, ai_shares_by_title); the middle element — bound here to
# *_grok_convo_links — is the list of (username, url, title) tuples for Twitter/X
# URLs whose captured username is the "i" pseudo-user (x.com/i/... links).
wp_usernames_by_title, wp_grok_convo_links, wp_ai_shares_by_title = get_social_usernames_with_counts(wp_domains, '../results/overall/cached_wiki_idx.pkl', '../grokipedia_wikipedia_articles.ndjson')
grok_usernames_by_title, grok_grok_convo_links, grok_ai_shares_by_title = get_social_usernames_with_counts(grok_domains, '../results/overall/cached_grok_idx.pkl', '../grokipedia_scrape.ndjson')

In [24]:
# Collect only the AI share links (and links to the @grok account) cited by
# Grokipedia articles: {title: [(ai_site, share_id, url), ...]}.
grok_ai_share_links = get_ai_share_links_only(grok_domains, '../results/overall/cached_grok_idx.pkl', '../grokipedia_scrape.ndjson')

Opening index file: ../results/overall/cached_grok_idx.pkl
Index loaded. Number of entries: 880623
items_list created with 883673 articles


Extracting AI share links:  14%|█▎        | 120371/883673 [00:36<03:44, 3399.83article/s]

Processing index 120000 / 883673: 'korean canadians'...


Extracting AI share links:  20%|██        | 180572/883673 [00:55<03:10, 3699.57article/s]

Processing index 180000 / 883673: 'thomas sadoski'...


Extracting AI share links:  22%|██▏       | 190573/883673 [00:58<03:26, 3357.35article/s]

Processing index 190000 / 883673: 'clan macfarlane'...


Extracting AI share links:  26%|██▌       | 230521/883673 [01:12<03:38, 2992.46article/s]

Processing index 230000 / 883673: 'mirpur district'...


Extracting AI share links:  28%|██▊       | 250636/883673 [01:19<03:07, 3368.46article/s]

Processing index 250000 / 883673: 'idiotest'...


Extracting AI share links:  29%|██▉       | 260516/883673 [01:22<03:02, 3411.03article/s]

Processing index 260000 / 883673: 'she (tyler, the creator song)'...


Extracting AI share links:  34%|███▍      | 300477/883673 [01:34<04:04, 2385.58article/s]

Processing index 300000 / 883673: 'k. m. sachin dev'...


Extracting AI share links:  36%|███▋      | 320539/883673 [01:40<02:33, 3668.73article/s]

Processing index 320000 / 883673: 'roscoe dash'...


Extracting AI share links:  37%|███▋      | 330435/883673 [01:44<03:26, 2676.70article/s]

Processing index 330000 / 883673: 'megan davis'...


Extracting AI share links:  39%|███▊      | 340446/883673 [01:47<02:36, 3476.04article/s]

Processing index 340000 / 883673: 'incidents at six flags parks'...


Extracting AI share links:  51%|█████     | 450557/883673 [02:19<02:09, 3346.55article/s]

Processing index 450000 / 883673: 'don't call me angel'...


Extracting AI share links:  58%|█████▊    | 510706/883673 [02:37<01:43, 3601.49article/s]

Processing index 510000 / 883673: 'faky'...


Extracting AI share links:  63%|██████▎   | 560314/883673 [02:51<02:00, 2672.39article/s]

Processing index 560000 / 883673: 'santiago creel'...


Extracting AI share links:  75%|███████▍  | 660304/883673 [03:19<01:07, 3296.31article/s]

Processing index 660000 / 883673: 'groups rally'...


Extracting AI share links:  95%|█████████▌| 840341/883673 [04:12<00:12, 3346.21article/s]

Processing index 840000 / 883673: 'baby driver'...


Extracting AI share links: 100%|██████████| 883673/883673 [04:24<00:00, 3336.40article/s]


Total articles with any AI share links: 220


In [28]:
# Display the collected share links: a dict keyed by article title whose values
# are lists of tuples; the last element of each tuple is the link URL.
grok_ai_share_links

{'ganesh chaturthi': [('x.com/grok',
   'grok',
   'https://x.com/grok/status/1959516819200659871')],
 'grok (chatbot)': [('x.com/grok',
   'grok',
   'https://x.com/grok/status/1982383601292558751'),
  ('x.com/grok', 'grok', 'https://x.com/grok/status/1978557968548581830'),
  ('x.com/grok', 'grok', 'https://x.com/grok/status/1982454130246652232')],
 'kamal haasan': [('x.com/grok',
   'grok',
   'https://x.com/grok/status/1934957329478566040')],
 'barlas': [('twitter.com/grok',
   'grok',
   'https://twitter.com/grok/status/1971732953303847168')],
 'the twelfth': [('x.com/grok',
   'grok',
   'https://x.com/grok/status/1939746368580637071')],
 'eastern orthodoxy by country': [('x.com/grok',
   'grok',
   'https://x.com/grok/status/1978462021470552229')],
 'nick fuentes': [('x.com/grok',
   'grok',
   'https://x.com/grok/status/1969964316276449649')],
 'persecution of christians in the eastern bloc': [('x.com/grok',
   'grok',
   'https://x.com/grok/status/1951197167718461852')],
 'mia 

In [30]:
# Write one "title,link" row per share link found for each article.
# csv.writer quotes a field whenever it contains a comma, a double quote or a
# newline (and escapes embedded quotes), which the previous hand-rolled
# formatting only did for commas. lineterminator='\n' keeps the original
# line endings; newline='' is what the csv module expects of the file object.
with open(
    '../results/overall/usernames/grok_twitter_convo_links_w_article.csv',
    'w',
    newline='',
    encoding='utf-8',
) as f:
    writer = csv.writer(f, lineterminator='\n')
    for title, links in grok_ai_share_links.items():
        for link_info in links:
            # The URL is the last element of each tuple.
            writer.writerow([title, link_info[-1]])

In [None]:
def _write_usernames_csv(path, usernames_by_title):
    """Flatten a nested username-count mapping into a CSV file.

    Parameters
    ----------
    path : str
        Destination CSV path; the file is overwritten.
    usernames_by_title : dict
        Mapping of ``title -> {site -> {username -> count}}``.

    The file gets a ``title,site,username,count`` header followed by one row
    per (title, site, username) combination.
    """
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["title", "site", "username", "count"])
        writer.writerows(
            (title, site, username, count)
            for title, site_dict in usernames_by_title.items()
            for site, usernames in site_dict.items()
            for username, count in usernames.items()
        )


# Same dump for both corpora; previously two copy-pasted loops.
_write_usernames_csv('../results/wp_usernames_by_title.csv', wp_usernames_by_title)
_write_usernames_csv('../results/grok_usernames_by_title.csv', grok_usernames_by_title)

In [None]:
# Group 'i/grok/share' links by article title, keeping at most one link per
# share id (the final path segment of the stripped link).
pages = {}
# Share ids already recorded for each title. Tracking them separately avoids
# rebuilding the id set from pages[title] on every iteration.
seen_ids_by_title = {}

# The first tuple element is unused here; it is not bound to `_` so that
# IPython's last-output variable is not shadowed at notebook top level.
for (_unused, link, title) in grok_grok_convo_links:
    if 'i/grok/share' not in link:
        continue

    title_links = pages.setdefault(title, set())
    title_ids = seen_ids_by_title.setdefault(title, set())

    link_id = link.strip().split('/')[-1]
    # Add the link only if no stored link for this title ends with the same id.
    if link_id not in title_ids:
        title_ids.add(link_id)
        title_links.add(link)

In [None]:
# Write one "page,link" row per de-duplicated share link.
# Article titles can contain commas, so rows go through csv.writer, which
# quotes such fields instead of emitting an ambiguous line. Links are sorted
# because `pages` stores them in sets, whose iteration order is not stable
# between runs.
# NOTE(review): a later cell opens this same path in 'w' mode and writes only
# the links, so this file's content is replaced when both cells run; confirm
# which output is intended or give one of them a distinct filename.
with open('../results/grok_grok_convo_links.txt', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, lineterminator='\n')
    for page, links in pages.items():
        for link in sorted(links):
            writer.writerow([page, link])

In [None]:
# Reload the per-title username counts written by the CSV export cells.
# keep_default_na=False stops pandas from turning literal strings such as
# "nan", "null", "NA" or "None" (all possible titles/usernames) into NaN, and
# the explicit str dtypes keep purely numeric usernames from being parsed as
# numbers. The 'count' column is still inferred as numeric.
USERNAME_CSV_READ_OPTIONS = {
    'dtype': {'title': str, 'site': str, 'username': str},
    'keep_default_na': False,
}

wp_username_df = pd.read_csv('../results/wp_usernames_by_title.csv', **USERNAME_CSV_READ_OPTIONS)
grok_username_df = pd.read_csv('../results/grok_usernames_by_title.csv', **USERNAME_CSV_READ_OPTIONS)

In [None]:
# Links (element 1 of each record) in wp_grok_convo_links that contain 'i/grok/share'.
[
    record[1]
    for record in wp_grok_convo_links
    if 'i/grok/share' in record[1]
]

In [None]:
# Links (element 1 of each record) in grok_grok_convo_links that contain
# 'i/communitynotes/share'.
[
    record[1]
    for record in grok_grok_convo_links
    if 'i/communitynotes/share' in record[1]
]

In [None]:
# Write every 'i/grok/share' link (element 1 of each record), one per line.
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform default, matching the CSV export cell that passes encoding='utf-8'.
# NOTE(review): an earlier cell writes "page,link" rows to this same path;
# opening it in 'w' mode here replaces that content. Confirm which output is
# intended or give one of them a distinct filename.
with open('../results/grok_grok_convo_links.txt', 'w', encoding='utf-8') as f:
    f.writelines(
        f'{record[1]}\n'
        for record in grok_grok_convo_links
        if 'i/grok/share' in record[1]
    )

In [None]:
# Outer-join the two username tables on (title, site, username); the count
# columns get the '_wp' / '_grok' suffixes and missing values are filled with 0.
combined_username_df = (
    wp_username_df
    .merge(
        grok_username_df,
        how='outer',
        on=['title', 'site', 'username'],
        suffixes=('_wp', '_grok'),
    )
    .fillna(0)
)

In [None]:
# diff = count_grok - count_wp, per (title, site, username) row.
combined_username_df['diff'] = combined_username_df['count_grok'].sub(
    combined_username_df['count_wp']
)

In [None]:
# Rows whose title is exactly 'elon musk'.
combined_username_df.loc[combined_username_df['title'].eq('elon musk')]

In [None]:
# First 50 rows after sorting by 'diff' in descending order.
combined_username_df.sort_values('diff', ascending=False).head(50)

In [None]:
# Sum the counts over all titles for each (site, username) pair, then order
# the result by count_grok, largest first.
usernames_by_site_df = (
    combined_username_df[['site', 'username', 'count_wp', 'count_grok', 'diff']]
    .groupby(['site', 'username'])
    .sum()
    .reset_index()
    .sort_values(by='count_grok', ascending=False)
)

In [None]:
# First 50 rows of the per-site username totals.
usernames_by_site_df.head(50)

In [None]:
# Restrict to rows whose site is 'twitter.com' or 'x.com', sum the counts per
# username across both, and show the first 50 rows by descending 'diff'.
(
    usernames_by_site_df
    .loc[
        usernames_by_site_df['site'].isin(['twitter.com', 'x.com']),
        ['username', 'count_wp', 'count_grok', 'diff'],
    ]
    .groupby('username')
    .sum()
    .sort_values(by='diff', ascending=False)
    .head(50)
)

### Analysis of fringe sites

In [None]:
# Domains this analysis treats as fringe sites.
fringe_sites = {
    'unz.com',
    'vdare.com',
    'frontpagemag.com',
    'jihadwatch.org',
    'lifesitenews.com',
    'thegatewaypundit.com',
    'globalresearch.ca',
    'voltairenet.org',
    'infowars.com',
    'stormfront.org',
}

# Collect one row per (article title, fringe site) with that article's
# citation count for the site. Each entry of grok_domains is a dict of
# title -> {domain -> count}; all of its items are read, matching how the
# domain counters are aggregated earlier in the notebook.
fringe_rows = []
for page in grok_domains:
    for title, domains in page.items():
        # Sorted so the row order does not depend on set iteration order.
        for site in sorted(fringe_sites):
            # Exact key lookup: only domains spelled exactly like the entry match.
            if site in domains:
                fringe_rows.append({
                    'title': title,
                    'site': site,
                    'citations': domains[site]
                })

# Explicit columns keep the frame usable (e.g. for sort_values on 'citations')
# even when no fringe citation was found.
fringe_df = pd.DataFrame(fringe_rows, columns=['title', 'site', 'citations'])


In [None]:
# Save the fringe citation rows, most-cited first, without the index column.
(
    fringe_df
    .sort_values('citations', ascending=False)
    .to_csv('../results/fringe_citations.csv', index=False)
)

In [None]:
# Registrable domains of the wiki projects to look for (duplicates removed).
wikipedia_sites = {
    'wikipedia.org',
    'wiktionary.org',
    'wikiquote.org',
    'wikisource.org',
    'wikiversity.org',
    'wikivoyage.org',
    'wikidata.org',
    'wikibooks.org',
    'wikinews.org',
    'wikispecies.org',
}

# A cited domain counts as a match when it is one of the sites itself or a
# subdomain of one (e.g. 'en.wikipedia.org'). The previous substring test
# would also have matched unrelated domains such as 'notwikipedia.org'.
wikipedia_subdomain_suffixes = tuple('.' + site for site in sorted(wikipedia_sites))

# One row per (article title, matching cited domain) with its citation count.
# Each entry of grok_domains is a dict of title -> {domain -> count}.
wikipedia_rows = []
for page in grok_domains:
    for title, domains in page.items():
        for domain, count in domains.items():
            if domain in wikipedia_sites or domain.endswith(wikipedia_subdomain_suffixes):
                wikipedia_rows.append({
                    'title': title,
                    'site': domain,
                    'citations': count
                })

# Explicit columns keep the frame usable (sorting on 'citations', counting
# 'site' values) even when nothing matched.
wikipedia_df = pd.DataFrame(wikipedia_rows, columns=['title', 'site', 'citations'])

In [None]:
# Save the wiki-project citation rows, most-cited first, without the index column.
(
    wikipedia_df
    .sort_values('citations', ascending=False)
    .to_csv('../results/wikipedia_citations.csv', index=False)
)

In [None]:
# Number of rows (i.e. citing articles) per matched domain.
wikipedia_df['site'].value_counts()

In [None]:
# Total citation count: for every page, add up the counts in the domain
# mapping stored as the first value of that page's dict.
total_citations = sum(
    sum(list(page.values())[0].values())
    for page in grok_domains
)
total_citations