In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Rectangle
import matplotlib.patches as mpatches
import squarify


In [None]:
# Read the two top-100 domain tables (Wikipedia and Grokipedia) from ../results.
# Later cells read the columns `domain`, `Type` and `wiki_share` / `grok_share`.
t100_wiki, t100_grok = (
    pd.read_csv(csv_path)
    for csv_path in ('../results/t100_wiki.csv', '../results/t100_grok.csv')
)


In [None]:
# ============================================================================
# PLOT 1: Treemap/Area Plot by Individual Domain, Grouped by Type (Wiki vs Grok)
# ============================================================================

# Every site type that occurs in either table, in alphabetical order.
all_types = sorted(
    set(t100_wiki['Type'].unique()).union(t100_grok['Type'].unique())
)

# Fixed type -> colour assignment shared by every plot in this notebook, so a
# given type looks the same on the Wikipedia and the Grokipedia side.
colors_map = dict([
    ('Academic', '#1f77b4'),       # blue
    ('Database', '#ff7f0e'),       # orange
    ('Government', '#2ca02c'),     # green
    ('Industry site', '#d62728'),  # red
    ('News', '#9467bd'),           # purple
    ('Niche media', '#8c564b'),    # brown
    ('Other', '#e377c2'),          # pink
    ('Portal', '#7f7f7f'),         # gray
    ('Reference', '#bcbd22'),      # yellow-green
    ('UGC', '#17becf'),            # cyan
])

# Shared ordering for both tables: by Type, then by share (largest first) within each type.
def group_and_sort_sites(df, share_col, type_order):
    """Return a copy of ``df`` ordered by site type and then by share.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``Type`` column and the column named by ``share_col``.
        The input frame is not modified.
    share_col : str
        Name of the column used for the descending sort within each type.
    type_order : list
        Category order for ``Type``; rows are grouped in this order.

    Returns
    -------
    pandas.DataFrame
        New frame whose ``Type`` column is an ordered categorical, sorted by
        ``Type`` ascending and ``share_col`` descending, with a fresh
        0..n-1 index.
    """
    ordered_type = pd.Categorical(df['Type'], categories=type_order, ordered=True)
    arranged = df.assign(Type=ordered_type).sort_values(
        by=['Type', share_col],
        ascending=[True, False],
    )
    return arranged.reset_index(drop=True)

# Order each table for plotting: grouped by type, largest share first within a type.
wiki_domains = group_and_sort_sites(t100_wiki, share_col='wiki_share', type_order=all_types)
grok_domains = group_and_sort_sites(t100_grok, share_col='grok_share', type_order=all_types)

# One wide figure with two side-by-side panels (ax1 and ax2), one per source.
fig1, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 9))

def site_treemap_grouped(ax, df, share_col, title, colors_map, label_thresh=0.0025):
    """Draw a treemap on ``ax`` with one rectangle per row of ``df``.

    Rectangle sizes come from ``df[share_col]`` (passed to squarify in the row
    order of ``df``), and each rectangle is filled with the colour that
    ``colors_map`` assigns to the row's ``Type`` (``'#cccccc'`` when the type
    has no entry). Rows whose share is at least ``label_thresh`` are annotated
    with a shortened form of their ``domain``.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on; its limits are set to 0..100 on both axes, the aspect
        is made equal and the axis decorations are switched off.
    df : pandas.DataFrame
        Must provide the columns ``Type``, ``domain`` and ``share_col``.
    share_col : str
        Column holding the size of each rectangle.
    title : str
        Axes title.
    colors_map : dict
        Mapping from type name to fill colour.
    label_thresh : float, optional
        Minimum share for a rectangle to receive a text label.
    """
    # Ordered (old, new) substitutions that shorten a domain for display.
    # They are applied one after another, so the order matters: e.g. '.com'
    # is stripped before 'books.google' is looked for.
    label_rewrites = (
        ('.com', ''),
        ('.co.uk', ''),
        ('washingtonpost', 'washington\npost'),
        ('pro-football-reference', 'pro\nfootball\nref'),
        ('hollywoodreporter', 'hollywood\nreporter'),
        ('animenewsnetwork', 'anime\nnews\nnetwork'),
        ('books.google', 'google\nbooks'),
        ('news.google', 'google\nnews'),
        ('pmc.ncbi.nlm.nih.gov', 'pubmed'),
        ('rollingstone', 'rolling\nstone'),
    )

    site_types = df['Type']
    shares = df[share_col].values
    domains = df['domain']

    # Compute the rectangle geometry inside a 100 x 100 canvas; drawing is
    # done by hand below so each cell gets its own colour, border and label.
    scaled_shares = squarify.normalize_sizes(shares, 100, 100)
    cells = squarify.squarify(scaled_shares, 0, 0, 100, 100)

    for cell, site_type, domain, share in zip(cells, site_types, domains, shares):
        box = Rectangle(
            (cell['x'], cell['y']),
            cell['dx'],
            cell['dy'],
            facecolor=colors_map.get(site_type, '#cccccc'),
            alpha=0.8,
            edgecolor='black',
            linewidth=0.8,
        )
        ax.add_patch(box)

        # Only cells at or above the threshold are labelled.
        caption = domain if share >= label_thresh else ""
        if not caption:
            continue
        for old, new in label_rewrites:
            caption = caption.replace(old, new)

        # Centre the label inside its rectangle.
        center_x = cell['x'] + cell['dx']/2
        center_y = cell['y'] + cell['dy']/2
        ax.text(center_x, center_y, caption, fontsize=10, ha='center', va='center')

    ax.set_xlim(0, 100)
    ax.set_ylim(0, 100)
    ax.set_aspect('equal')
    ax.set_title(title, fontsize=18)
    ax.axis('off')

# Left panel: Wikipedia; right panel: Grokipedia. Both use the same colour map.
site_treemap_grouped(ax1, wiki_domains, 'wiki_share',
                     title='Wikipedia Top 100 Individual Sites (Grouped by Type)',
                     colors_map=colors_map)
site_treemap_grouped(ax2, grok_domains, 'grok_share',
                     title='Grokipedia Top 100 Individual Sites (Grouped by Type)',
                     colors_map=colors_map)

# Shared legend: one swatch per type that occurs in at least one of the panels.
legend_elements = [
    mpatches.Patch(label=site_type, facecolor=colors_map.get(site_type, '#cccccc'))
    for site_type in all_types
    if any(site_type in frame['Type'].unique() for frame in (wiki_domains, grok_domains))
]
fig1.legend(
    handles=legend_elements,
    loc='center',
    bbox_to_anchor=(0.5, 0.025),
    ncol=min(len(legend_elements), 6),
)

# Keep the bottom 5% of the figure free for the legend, then export and display.
plt.tight_layout(rect=[0, 0.05, 1, 1])
plt.savefig(
    '../graphics/domain_treemap_individual_comparison_grouped.pdf',
    dpi=300,
    bbox_inches='tight',
)
plt.show()

In [None]:
# ============================================================================
# PLOT 2: Side-by-side Top 100 Lists with Position Changes
# ============================================================================

# Rank each table by its own share column: position 1 is the largest share.
# `position` and the source-specific `<source>_position` hold the same rank.
wiki_ranked = (
    t100_wiki
    .sort_values('wiki_share', ascending=False)
    .reset_index(drop=True)
    .assign(
        position=lambda ranked: ranked.index + 1,
        wiki_position=lambda ranked: ranked['position'],
    )
)
grok_ranked = (
    t100_grok
    .sort_values('grok_share', ascending=False)
    .reset_index(drop=True)
    .assign(
        position=lambda ranked: ranked.index + 1,
        grok_position=lambda ranked: ranked['position'],
    )
)

# domain -> rank lookups for each source
wiki_positions = dict(zip(wiki_ranked['domain'], wiki_ranked['wiki_position']))
grok_positions = dict(zip(grok_ranked['domain'], grok_ranked['grok_position']))

# Domains present in both rankings
common_domains = set(wiki_ranked['domain']) & set(grok_ranked['domain'])

# A single tall axes; everything on it is positioned manually.
fig2, ax = plt.subplots(figsize=(16, 20))
fig2.suptitle('Top 100 Domain Rankings: Wikipedia vs Grokipedia', fontsize=20, y=0.995)

# x runs 0..1; y holds the rank, padded above and below the list, and is
# inverted so that rank 1 is drawn at the top.
ax.set(xlim=(0, 1), ylim=(-3, 103.5))
ax.invert_yaxis()

# Horizontal position of each column of markers:
# Wikipedia on the left at x=0.3, Grokipedia on the right at x=0.7.
wiki_x, grok_x = 0.3, 0.7

# Draw both ranked columns with the same recipe. Per side only four things
# differ: the table, its rank column, the marker x position, and the side on
# which the label sits (left of the Wikipedia markers, right of the Grokipedia ones).
for ranked, position_col, marker_x, label_dx, label_ha in (
    (wiki_ranked, 'wiki_position', wiki_x, -0.01, 'right'),  # Wikipedia list (left)
    (grok_ranked, 'grok_position', grok_x, 0.01, 'left'),    # Grokipedia list (right)
):
    for _, row in ranked.iterrows():
        pos = row[position_col]
        domain = row['domain']
        color = colors_map.get(row['Type'], '#cccccc')

        # One marker per domain, filled with the colour of its type
        ax.scatter(marker_x, pos, s=200, c=color, edgecolors='black',
                   linewidths=0.5, alpha=0.8, zorder=3)

        # Shorten the label: drop common suffixes, then cap at 20 characters
        domain_label = domain
        for suffix in ('.com', '.co.uk', '.org'):
            domain_label = domain_label.replace(suffix, '')
        if len(domain_label) > 20:
            domain_label = domain_label[:17] + '...'

        # Bold marks domains that appear in both lists
        weight = 'bold' if domain in common_domains else 'normal'
        ax.text(marker_x + label_dx, pos, domain_label, ha=label_ha, va='center',
                fontsize=12, fontweight=weight)

# Draw connecting lines for domains in both lists
# NOTE(review): these two imports are kept in this cell so that the names
# `Path` and `patches` stay defined exactly where they were; consider moving
# them to the import cell at the top of the notebook.
from matplotlib.path import Path
import matplotlib.patches as patches

# Type of every domain as listed in the Wikipedia table, built once instead of
# filtering the frame for each domain. If a domain is listed more than once,
# the first (highest-ranked) row is used.
wiki_type_by_domain = (
    wiki_ranked
    .drop_duplicates(subset='domain')
    .set_index('domain')['Type']
    .to_dict()
)

# Iterate in Wikipedia-rank order rather than raw set order: set iteration
# order for strings can differ between interpreter runs, and because all
# curves share one zorder the draw order decides which translucent curve ends
# up on top. A fixed order makes the saved figure reproducible.
for domain in sorted(common_domains, key=wiki_positions.__getitem__):
    wiki_pos = wiki_positions[domain]
    grok_pos = grok_positions[domain]
    pos_change = grok_pos - wiki_pos

    # Colour by the domain's Wikipedia type (gray if the type has no colour)
    line_color = colors_map.get(wiki_type_by_domain[domain], '#888888')

    # Line width grows with the absolute position change, capped at 3
    linewidth = min(0.5 + abs(pos_change) * 0.02, 3.0)

    # Opacity also grows with the position change: 0.3 up to 0.7
    alpha = 0.3 + min(abs(pos_change) / 50, 0.4)

    # Curve end points, inset by 0.05 from each column of markers
    x1, y1 = wiki_x + 0.05, wiki_pos
    x2, y2 = grok_x - 0.05, grok_pos
    x_mid = (x1 + x2) / 2

    # Cubic Bezier: both control points sit at the horizontal midpoint, one at
    # each end's height, so the curve leaves and arrives horizontally.
    verts = [(x1, y1),
             (x_mid, y1),  # Control point 1
             (x_mid, y2),  # Control point 2
             (x2, y2)]
    codes = [Path.MOVETO, Path.CURVE4, Path.CURVE4, Path.CURVE4]
    path = Path(verts, codes)

    patch = patches.PathPatch(path, edgecolor=line_color, facecolor='none',
                              linewidth=linewidth, alpha=alpha, zorder=1)
    ax.add_patch(patch)

# Column headings, placed at y=-2 (above rank 1 on the inverted axis)
for heading_x, heading in ((wiki_x, 'Wikipedia Top 100'), (grok_x, 'Grokipedia Top 100')):
    ax.text(heading_x, -2, heading, ha='center', fontsize=18)

# Hide ticks, spines and axis labels
ax.axis('off')

# Legend: one swatch per type that occurs in at least one of the rankings
legend_elements = [
    mpatches.Patch(label=site_type, facecolor=colors_map.get(site_type, '#cccccc'),
                   edgecolor='black', alpha=0.8)
    for site_type in all_types
    if any(site_type in ranked['Type'].unique() for ranked in (wiki_ranked, grok_ranked))
]
fig2.legend(
    handles=legend_elements,
    loc='lower center',
    bbox_to_anchor=(0.5, 0.03),
    ncol=min(len(legend_elements), 6),
    fontsize=13,
    frameon=True,
    title='Domain Type',
)

# Caption explaining how to read the connecting lines
fig2.text(
    0.5, 0.02,
    'Lines connect domains appearing in both lists. Line thickness indicates magnitude of position change.',
    ha='center', fontsize=13, style='italic',
)

# Leave room at the bottom for legend and caption and at the top for the
# figure title, then export and display.
plt.tight_layout(rect=[0, 0.05, 1, 0.98])
plt.savefig("../graphics/domain_position_comparison_t100.png", dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Number of distinct domains that appear in both top-100 tables
len(set(t100_wiki['domain']) & set(t100_grok['domain']))

In [None]:
# Column totals for the Wikipedia table. numeric_only=True keeps the text
# columns (`domain`, `Type`) out of the sum; without it pandas concatenates
# every string in those columns into one long value.
t100_wiki.sum(numeric_only=True)


In [None]:
# Column totals for the Grokipedia table. numeric_only=True keeps the text
# columns (`domain`, `Type`) out of the sum; without it pandas concatenates
# every string in those columns into one long value.
t100_grok.sum(numeric_only=True)

In [None]:
# Per-type totals for the Wikipedia table. numeric_only=True restricts the sum
# to numeric columns so the `domain` strings of each group are not concatenated.
t100_wiki.groupby('Type').sum(numeric_only=True)


In [None]:
# Per-type totals for the Grokipedia table. numeric_only=True restricts the sum
# to numeric columns so the `domain` strings of each group are not concatenated.
t100_grok.groupby('Type').sum(numeric_only=True)
