In [1]:
import json
from tqdm import tqdm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Path to the measurement snapshot. The loops below treat it as a mapping
# url -> {bot name -> rule paths}.
url2botinfo_path = '../measurement_data/url2botinfo_20240411.json'
# JSON text is UTF-8 by specification; pin the encoding so that loading does not
# depend on the platform's locale default.
with open(url2botinfo_path, 'r', encoding='utf-8') as f:
    url2botinfo = json.load(f)

# Bot names grouped into the three categories compared in the cells below.
# Note that 'cohere-ai' is listed in both llm_bots and rag_bots.
llm_bots = [
    'GPTBot', 'Google-Extended', 'anthropic-ai', 'cohere-ai',
    "Amazonbot", "FacebookBot", "Bytespider", "Yeti",
    "YandexBot", "Baiduspider", "PetalBot",
]

databroker_bots = [
    "CCBot",
    "Omgilibot",
]

rag_bots = [
    "Bingbot", "ChatGPT-User", "cohere-ai",
    "PerplexityBot", "YouBot", "DuckDuckBot",
]

# All later lookups are done on lowercase names.
llm_bots = [bot.lower() for bot in llm_bots]
databroker_bots = [bot.lower() for bot in databroker_bots]
rag_bots = [bot.lower() for bot in rag_bots]

# Update the bot names in url2botinfo to lowercase.
# If a domain has two names that differ only by case, they collapse into one key
# and the entry that comes later in the dict wins.
new_url2botinfo = {}
for url, bot_info in url2botinfo.items():
    new_bot_info = {bot_name.lower(): paths for bot_name, paths in bot_info.items()}
    new_url2botinfo[url] = new_bot_info
url2botinfo = new_url2botinfo

In [2]:
# Initialize counters for each pair of categories.
# Each counter is the number of domains that list a bot from EITHER category
# (a union count). The "_and_" in the variable names is historical and is kept
# so that any other cell referring to these names keeps working.
llm_and_db_count = 0
llm_and_rag_count = 0
db_and_rag_count = 0

# Iterate through each domain in url2botinfo
for url, bot_info in tqdm(url2botinfo.items()):
    bots_in_domain = set(bot_info.keys())  # Get the set of bots for this domain

    # Check which categories have at least one bot listed for this domain
    llm_present = any(bot in bots_in_domain for bot in llm_bots)
    db_present = any(bot in bots_in_domain for bot in databroker_bots)
    rag_present = any(bot in bots_in_domain for bot in rag_bots)

    # Count the domain once per category pair when either category is present
    if llm_present or db_present:
        llm_and_db_count += 1
    if llm_present or rag_present:
        llm_and_rag_count += 1
    if db_present or rag_present:
        db_and_rag_count += 1

# Print the results. The labels say "or" because the counts above are unions;
# the previous "and" wording described an intersection that was never computed.
print(f"Domains with LLM or Data Broker bots: {llm_and_db_count}")
print(f"Domains with LLM or RAG bots: {llm_and_rag_count}")
print(f"Domains with Data Broker or RAG bots: {db_and_rag_count}")

100%|██████████| 582281/582281 [00:01<00:00, 378572.49it/s]

Domains with LLM and Data Broker bots: 35842
Domains with LLM and RAG bots: 41705
Domains with Data Broker and RAG bots: 23645





In [3]:
# Bots to be paired with CCBot (the same eleven names as llm_bots).
_ccbot_partners = [
    'GPTBot',
    'Google-Extended',
    'anthropic-ai',
    'cohere-ai',
    'Amazonbot',
    'FacebookBot',
    'Bytespider',
    'Yeti',
    'YandexBot',
    'Baiduspider',
    'PetalBot',
]
# Lowercased (partner, ccbot) pairs first, followed by the mirrored (ccbot, partner) pairs.
_forward_pairs = [(partner.lower(), 'CCBot'.lower()) for partner in _ccbot_partners]
_mirrored_pairs = [(second, first) for first, second in _forward_pairs]
target_pairs = _forward_pairs + _mirrored_pairs

In [4]:
# Every distinct bot name that occurs in any target pair.
target_bots = {name for pair in target_pairs for name in pair}
# Domains whose bot_info mentions at least one target bot (names compared in lowercase).
domain_with_target_bots = [
    domain
    for domain, bot_info in url2botinfo.items()
    if not target_bots.isdisjoint(name.lower() for name in bot_info.keys())
]

In [5]:
# Size check: (number of domains mentioning a target bot, number of distinct target bots).
(len(domain_with_target_bots), len(target_bots))

(35689, 12)

## Count full inter-category conflicts between bots

In [6]:
from tqdm import tqdm
from itertools import product, combinations

def is_full_conflict(bot1, bot2, bot_info):
    """Tell whether one bot is allowed everything while the other is blocked from everything.

    Args:
        bot1: Key of the first bot in ``bot_info``.
        bot2: Key of the second bot in ``bot_info``.
        bot_info: Mapping of bot name to a dict that may hold 'allow' and
            'disallow' collections of paths (a missing key counts as empty).

    Returns:
        True when one bot has no allow and no disallow entries at all and the
        other bot's disallow entries are exactly {'/'}; False otherwise.

    Raises:
        KeyError: If ``bot1`` or ``bot2`` is not a key of ``bot_info``.
    """
    def _stance(bot):
        # (allows everything, blocks everything) for a single bot.
        rules = bot_info[bot]
        allowed = set(rules.get('allow', []))
        blocked = set(rules.get('disallow', []))
        # "Blocks everything" looks only at the disallow set; allow entries are ignored there.
        return (not allowed and not blocked), blocked == {'/'}

    bot1_open, bot1_closed = _stance(bot1)
    bot2_open, bot2_closed = _stance(bot2)

    # A full conflict needs opposite stances, in either direction.
    return (bot1_open and bot2_closed) or (bot1_closed and bot2_open)

# One table per category combination: (bot1, bot2) -> list of URLs with a full conflict.
# Every pair starts with its own fresh, empty list.
full_inter_llm_databroker_url = {pair: [] for pair in product(llm_bots, databroker_bots)}
full_inter_llm_rag_url = {pair: [] for pair in product(llm_bots, rag_bots)}
full_inter_databroker_rag_url = {pair: [] for pair in product(databroker_bots, rag_bots)}

def count_inter_conflicts(url, bot_info):
    """Record ``url`` under every tracked bot pair whose rules fully conflict.

    A bot that is not listed in ``bot_info`` is represented by the '*' entry.
    Pairs where neither bot is listed are skipped.

    Side effects:
        * Inserts an empty '*' entry into ``bot_info`` when it has none, so the
          caller's dict is modified in place.
        * Appends ``url`` to the module-level ``full_inter_*_url`` tables.

    Args:
        url: Identifier of the domain being examined.
        bot_info: Mapping of bot name to its allow/disallow rules for this domain.
    """
    listed_bots = set(bot_info.keys())

    # Make sure a wildcard entry exists to stand in for unlisted bots.
    if '*' not in listed_bots:
        bot_info['*'] = {'allow': [], 'disallow': []}

    conflict_tables = (
        full_inter_llm_databroker_url,
        full_inter_llm_rag_url,
        full_inter_databroker_rag_url,
    )
    for conflicts_by_pair in conflict_tables:
        for pair in conflicts_by_pair:
            first_bot, second_bot = pair
            first_listed = first_bot in listed_bots
            second_listed = second_bot in listed_bots

            # Both bots are not listed --> no conflict.
            if not (first_listed or second_listed):
                continue

            # An unlisted bot is compared through the wildcard rules instead.
            lhs = first_bot if first_listed else '*'
            rhs = second_bot if second_listed else '*'
            if is_full_conflict(lhs, rhs, bot_info):
                conflicts_by_pair[pair].append(url)

# Populate the full_inter_*_url tables by visiting every domain once.
# Note: count_inter_conflicts may add a '*' entry to each bot_info it is given,
# so url2botinfo itself is modified by this loop.
for url, bot_info in tqdm(url2botinfo.items()):
    count_inter_conflicts(url, bot_info)

100%|██████████| 582281/582281 [00:06<00:00, 95812.78it/s] 


## Count inter-category conflicts between bots (any difference in rules)

In [7]:
from tqdm import tqdm
from itertools import product, combinations

def is_strict_conflict(bot1, bot2, bot_info):
    """Tell whether two bots are given different rules in any way.

    Args:
        bot1: Key of the first bot in ``bot_info``.
        bot2: Key of the second bot in ``bot_info``.
        bot_info: Mapping of bot name to a dict that may hold 'allow' and
            'disallow' collections of paths (a missing key counts as empty).

    Returns:
        True when the bots' allow sets differ or their disallow sets differ.
        Order and duplicates are ignored because the paths are compared as sets.

    Raises:
        KeyError: If ``bot1`` or ``bot2`` is not a key of ``bot_info``.
    """
    first_rules = bot_info[bot1]
    second_rules = bot_info[bot2]

    same_allow = set(first_rules.get('allow', [])) == set(second_rules.get('allow', []))
    same_disallow = set(first_rules.get('disallow', [])) == set(second_rules.get('disallow', []))

    # Any mismatch on either side counts as a conflict.
    return not (same_allow and same_disallow)


# One table per category combination: (bot1, bot2) -> list of URLs where the two bots' rules differ.
# Every pair starts with its own fresh, empty list.
partial_inter_llm_databroker_url = {pair: [] for pair in product(llm_bots, databroker_bots)}
partial_inter_llm_rag_url = {pair: [] for pair in product(llm_bots, rag_bots)}
partial_inter_databroker_rag_url = {pair: [] for pair in product(databroker_bots, rag_bots)}

def count_partial_inter_conflicts(url, bot_info):
    """Record ``url`` under every tracked bot pair whose rules differ on this domain.

    A bot that is not listed in ``bot_info`` is represented by the '*' entry.
    Pairs where neither bot is listed are skipped.

    Side effects:
        * Inserts an empty '*' entry into ``bot_info`` when it has none, so the
          caller's dict is modified in place.
        * Appends ``url`` to the module-level ``partial_inter_*_url`` tables.

    Args:
        url: Identifier of the domain being examined.
        bot_info: Mapping of bot name to its allow/disallow rules for this domain.
    """
    listed_bots = set(bot_info.keys())

    # Non-strict setting: a missing wildcard is treated as an empty rule set.
    if '*' not in listed_bots:
        bot_info['*'] = {'allow': [], 'disallow': []}

    conflict_tables = (
        partial_inter_llm_databroker_url,
        partial_inter_llm_rag_url,
        partial_inter_databroker_rag_url,
    )
    for conflicts_by_pair in conflict_tables:
        for pair in conflicts_by_pair:
            first_bot, second_bot = pair
            first_listed = first_bot in listed_bots
            second_listed = second_bot in listed_bots

            # Both bots are not listed --> no conflict.
            if not (first_listed or second_listed):
                continue

            # An unlisted bot is compared through the wildcard rules instead.
            lhs = first_bot if first_listed else '*'
            rhs = second_bot if second_listed else '*'
            if is_strict_conflict(lhs, rhs, bot_info):
                conflicts_by_pair[pair].append(url)

# Populate the partial_inter_*_url tables by visiting every domain once.
# Note: count_partial_inter_conflicts may add a '*' entry to each bot_info it is
# given, so url2botinfo itself is modified by this loop.
for url, bot_info in tqdm(url2botinfo.items()):
    count_partial_inter_conflicts(url, bot_info)

100%|██████████| 582281/582281 [00:05<00:00, 99192.70it/s] 


## Conflict statistics (Table 6)

In [8]:
from prettytable import PrettyTable
from tqdm import tqdm
def _format_share(count, total):
    """Render ``count`` with its percentage of ``total``, e.g. '12 (3.4%)'."""
    # An empty denominator has no meaningful percentage; avoid ZeroDivisionError.
    if total == 0:
        return f"{count} (n/a)"
    return f"{count} ({count/total*100:.1f}%)"


def _conflict_cell(pair, lookup_order, seen_domains, total):
    """Build one table cell for ``pair`` from the first table that contains it.

    Adds the pair's conflicted domains to ``seen_domains`` and returns the
    formatted share of ``total``. Returns '' when no table knows the pair.
    """
    for conflicts_by_pair in lookup_order:
        if pair in conflicts_by_pair:
            conflicted_domains = conflicts_by_pair[pair]
            seen_domains.update(conflicted_domains)
            return _format_share(len(conflicted_domains), total)
    return ""


table = PrettyTable()
table.field_names = ["Bot Pair", "Domains", "Conflicts", "Full Conflicts"]

# Track unique domains across all pairs
all_domains = set()
all_partial_conflicts = set()
all_full_conflicts = set()

for pair in tqdm(target_pairs):
    # Only the (bot, 'ccbot') orientation is tabulated; mirrored pairs are skipped.
    if pair[1] != 'ccbot':
        continue

    # Denominator: domains that mention at least one bot of the pair.
    pair_bots = set(pair)  # built once per pair rather than once per domain
    domain_with_bot_pair = [
        domain
        for domain, bot_info in url2botinfo.items()
        if not pair_bots.isdisjoint(bot.lower() for bot in bot_info.keys())
    ]
    all_domains.update(domain_with_bot_pair)

    # Same lookup order as before: LLM/data-broker table first, then LLM/RAG.
    partial_conflicts = _conflict_cell(
        pair,
        (partial_inter_llm_databroker_url, partial_inter_llm_rag_url),
        all_partial_conflicts,
        len(domain_with_bot_pair),
    )
    full_conflicts = _conflict_cell(
        pair,
        (full_inter_llm_databroker_url, full_inter_llm_rag_url),
        all_full_conflicts,
        len(domain_with_bot_pair),
    )

    table.add_row([pair, len(domain_with_bot_pair), partial_conflicts, full_conflicts])

# Add totals row with unique domain counts
table.add_row(["Total (Unique)", len(all_domains),
               _format_share(len(all_partial_conflicts), len(all_domains)),
               _format_share(len(all_full_conflicts), len(all_domains))])

print(table)


100%|██████████| 22/22 [00:07<00:00,  2.76it/s]

+------------------------------+---------+---------------+----------------+
|           Bot Pair           | Domains |   Conflicts   | Full Conflicts |
+------------------------------+---------+---------------+----------------+
|     ('gptbot', 'ccbot')      |  17124  |  9997 (58.4%) |  1583 (9.2%)   |
| ('google-extended', 'ccbot') |  11730  |  6562 (55.9%) |  1000 (8.5%)   |
|  ('anthropic-ai', 'ccbot')   |  10569  |  7917 (74.9%) |  1055 (10.0%)  |
|    ('cohere-ai', 'ccbot')    |  10395  |  8539 (82.1%) |  1228 (11.8%)  |
|    ('amazonbot', 'ccbot')    |  12508  |  9976 (79.8%) |  1454 (11.6%)  |
|   ('facebookbot', 'ccbot')   |  10782  |  8611 (79.9%) |  1218 (11.3%)  |
|   ('bytespider', 'ccbot')    |  11792  |  9582 (81.3%) |  1419 (12.0%)  |
|      ('yeti', 'ccbot')       |  12818  |  9878 (77.1%) |  1595 (12.4%)  |
|    ('yandexbot', 'ccbot')    |  14297  | 11764 (82.3%) |  1696 (11.9%)  |
|   ('baiduspider', 'ccbot')   |  20107  | 15106 (75.1%) |  2185 (10.9%)  |
|    ('petal


