In [1]:
import json
import pandas as pd
import csv
from datetime import datetime
from collections import defaultdict, Counter
from tqdm import tqdm

In [2]:
# load llm bots
with open("../llm_bots.json", 'r') as f:
    first_seen_date = json.load(f)

target_bots = set(first_seen_date.keys())

## Load data

In [3]:
## tranco data
tranco_file = "../measurement_data/tranco_top-1m_20250407.csv"

# read line and make url2rank dict
url2rank = {}
with open(tranco_file, 'r') as f:
    for line in f:
        rank, url = line.strip().split(',')
        url2rank[url] = int(rank)

In [4]:
start_date_str = "20230101000000"
mid_cutoff_date_str = "20240301000000"
cutoff_date_str = "20250401000000"

start_date = datetime.strptime(start_date_str, "%Y%m%d%H%M%S")
mid_cutoff = datetime.strptime(mid_cutoff_date_str, "%Y%m%d%H%M%S")
cutoff_date = datetime.strptime(cutoff_date_str, "%Y%m%d%H%M%S")
num_days_expected_historical_1 = (mid_cutoff - start_date).days+1
num_days_expected_historical_2 = (cutoff_date - mid_cutoff).days+1
num_days_expected_total = (cutoff_date - start_date).days+1

historical_robots_file = f"../measurement_data/historical_tranco_robots_data_20230101_20250407.json"
with open(historical_robots_file, 'r') as f:
    all_snapshots_data = json.load(f)

csv_file = f'../measurement_data/tranco_top-1m_20250407.csv'
with open(csv_file, 'r') as csvfile:
    reader = csv.reader(csvfile)
    rows = list(reader)
    total_rows = len(rows)
url2tranco_rank = {row[1]: row[0] for row in rows}

with open("../measurement_data/url2botinfo_20240411.json", 'r') as f:
    url2botinfo = json.load(f)

In [5]:
all_snapshots_data.keys()

dict_keys(['seesaa.net', 'live.com', 'popsugar.com', 'bolavip.com', 'theadvocate.com', 'gizmodo.com', 'prnewswire.com', 'otto.de', 'auctollo.com', 'girlsaskguys.com', 'samsung.com', 'elle.com', '1688.com', 'wyborcza.pl', 'tv2.dk', 'narod.ru', 'nordstromrack.com', 'usatoday.com', 'notion.so', 'masutabe.info', 'volkswagen.de', 'userapi.com', 'livejournal.com', 'the-star.co.ke', 'ntppool.org', 'pnas.org', 'uwants.com', 'hdtube.porn', 'popularmechanics.com', 'avito.ru', 'ebay.com', 'avclub.com', 'brainyquote.com', 'generatepress.com', 'meetup.com', 'rawstory.com', 'wikiwand.com', 'reuters.com', 'asahi.com', 'nydailynews.com', 'shaalaa.com', 'kompas.com', 'laprensagrafica.com', 'gq.com', 'leparisien.fr', 'flickr.com', 'dict.cn', '123rf.com', 'brainly.in', 'newsru.com', 'justjared.com', 'detik.com', 'zoominfo.com', 'boardgamegeek.com', 'lianjia.com', 'rusprofile.ru', 'ecuavisa.com', 'bbc.co.uk', 'mercadolibre.com.ve', 'shopee.tw', 'gartner.com', 'fortinet.com', 'go.com', 'consumerreports.org

In [6]:
# Helper function to extract bot names from robots.txt
def get_bot_list(robots_txt):
    bots = []
    for line in robots_txt.split('\n'):
        if line.strip().startswith('#'):
            continue
        parts = [part.strip() for part in line.split(':') if part.strip()]
        if len(parts) != 2:
            continue
        
        key, value = parts
        key = key.lower()
        value = value.split('#', 1)[0].strip() 
        
        if key == 'user-agent':
            bots.append(value)

    return bots

# Helper function to find first timestamp of a bot appearance
def find_first_timestamp(data, bot_name):
    for i, (timestamp, bot_list) in enumerate(data):
        if bot_name in bot_list:
            return timestamp

In [7]:
### LLM bots inclusion percentage change
filtered_snapshots_data = {}
total_coverage = 20

# Filter all snapshots data
for site, snapshots in all_snapshots_data.items():
    if site not in url2tranco_rank or int(url2tranco_rank[site]) > 10000:
        continue
    filtered_snapshots = [snapshot for snapshot in snapshots if snapshot['timestamp'] <= cutoff_date_str]
    coverage_percentage = round((len(filtered_snapshots) / num_days_expected_total) * 100) 
    if filtered_snapshots and coverage_percentage >= total_coverage:
        filtered_snapshots_data[site] = filtered_snapshots

print(
      f"All data - Number of filtered domains: {len(filtered_snapshots_data)}\n"
      f"All data coverage cut: {total_coverage}%\n"  
      f"Snapshots less than {0.01*total_coverage*num_days_expected_total} days are excluded in all data collection\n"
      f"Total snapshots in filtered data: {sum([len(data) for data in filtered_snapshots_data.values()])}")

All data - Number of filtered domains: 563
All data coverage cut: 20%
Snapshots less than 164.4 days are excluded in all data collection
Total snapshots in filtered data: 239811


## Filter by Coverage

In [8]:
url_with_llm_bots = set([url for url, bot_list in url2botinfo.items() if set([bot.lower() for bot in bot_list]).intersection(set(b.lower() for b in target_bots))])
crawled_urls = set(filtered_snapshots_data.keys())

In [9]:
len(url_with_llm_bots), len(crawled_urls), len(url_with_llm_bots.intersection(crawled_urls))

(41886, 563, 501)

## Find bot first appearance date

In [10]:
## to find first timestamp of a bot appearance
url2ts_bot_list = defaultdict(list)
for url, snapshot_data in filtered_snapshots_data.items():
    for snapshot in snapshot_data:
        url2ts_bot_list[url].append(
            (snapshot['timestamp'],
             get_bot_list(snapshot['html_content']))
             )

In [11]:
bots = []
for url in crawled_urls.intersection(url_with_llm_bots):
    bots += list(url2botinfo[url].keys())
bot_counter = Counter(bots)

In [12]:
data = {} 

for url in crawled_urls.intersection(url_with_llm_bots):
    data[url] = {}  # Initialize a nested dictionary for each URL
    for bot in target_bots:
        init_ts = find_first_timestamp(url2ts_bot_list[url], bot)
        data[url][bot] = init_ts  # Fill in the timestamp for each bot at this URL
    data[url]['tranco_rank'] = url2tranco_rank.get(url, None)

results_df = pd.DataFrame.from_dict(data, orient='index')

# Display the DataFrame
results_df["tranco_rank"] = pd.to_numeric(results_df["tranco_rank"])
results_df = results_df.sort_values(by='tranco_rank')
results_df.reset_index(inplace=True)
results_df.rename(columns={'index': 'URL'}, inplace=True)



In [13]:
results_df

Unnamed: 0,URL,Baiduspider,Google-Extended,GPTBot,anthropic-ai,Amazonbot,Omgilibot,CCBot,Bytespider,BingBot,ClaudeBot,YouBot,FacebookBot,PetalBot,Yeti,ChatGPT-User,DuckDuckbot,tranco_rank
0,facebook.com,,20240110001440,20240110001440,,20250331003653,,,,,20250331003653,,,20250331003653,20230101000020,,,4
1,dzen.ru,,,20240117003650,,,,,,,,,,,,,,6
2,apple.com,20230101000118,,,,,,,,,,,,,,,,8
3,instagram.com,,20240203000128,20240203000128,,20240726000218,,,,,20240718002918,,,20240726000218,20230101000007,,,13
4,twitter.com,,,,,,,,,,,,20231017000216,,,,,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,windowscentral.com,,,20230906030115,,,,,,,,,,,,,,8740
497,virgilio.it,20230704145107,,,,,,,,,,,,,,,,8934
498,extra.cz,,,20231206090612,,,,,,,,,,,,20231206090612,,8990
499,cc.com,,,,,,,,,,,,,20230428024417,,,,9308


In [14]:
# Replace entries with date 2024-01-01 with NaT 
# we don't know since when the bot is first listed in robots.txt in such cases
# as our initial data collection starts from 2023-01-01

entries_before = 0
for col in results_df.columns:
    if col not in ['URL', 'tranco_rank']:
        mask = results_df[col].astype(str).str.startswith('20230101')
        entries_before += mask.sum()
        results_df.loc[mask, col] = pd.NaT

print(f"Number of entries changed from 20230101* to NaT: {entries_before}")

Number of entries changed from 20230101* to NaT: 95


In [15]:
results_df

Unnamed: 0,URL,Baiduspider,Google-Extended,GPTBot,anthropic-ai,Amazonbot,Omgilibot,CCBot,Bytespider,BingBot,ClaudeBot,YouBot,FacebookBot,PetalBot,Yeti,ChatGPT-User,DuckDuckbot,tranco_rank
0,facebook.com,,20240110001440,20240110001440,,20250331003653,,,,,20250331003653,,,20250331003653,NaT,,,4
1,dzen.ru,,,20240117003650,,,,,,,,,,,,,,6
2,apple.com,NaT,,,,,,,,,,,,,,,,8
3,instagram.com,,20240203000128,20240203000128,,20240726000218,,,,,20240718002918,,,20240726000218,NaT,,,13
4,twitter.com,,,,,,,,,,,,20231017000216,,,,,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,windowscentral.com,,,20230906030115,,,,,,,,,,,,,,8740
497,virgilio.it,20230704145107,,,,,,,,,,,,,,,,8934
498,extra.cz,,,20231206090612,,,,,,,,,,,,20231206090612,,8990
499,cc.com,,,,,,,,,,,,,20230428024417,,,,9308


### Days passed since first bot appearance

In [16]:
# Convert first_seen_date to datetime if not already done
for bot in first_seen_date:
    if first_seen_date[bot] and not isinstance(first_seen_date[bot], datetime):
        first_seen_date[bot] = datetime.strptime(first_seen_date[bot], '%Y-%m-%d')

# Ensure all relevant bot columns are in datetime format for accurate calculation
for bot in target_bots:
    # Only convert if the column exists and not already in datetime format
    if bot in results_df.columns and not isinstance(results_df[bot].iloc[0], pd.Timestamp):
        results_df[bot] = pd.to_datetime(results_df[bot].astype(str), errors='coerce', format='%Y%m%d%H%M%S')

In [17]:
stats_final = {}
urls_with_days_passed_less_than_zero = {}

for bot, first_seen in tqdm(first_seen_date.items()):
    if not first_seen:
        print(f"{bot} not in first_seen_date")
        continue
    if bot not in results_df:
        print(f"{bot} not in result_df")
    bot_series = results_df[[bot, 'URL']].dropna(subset=[bot])  # Keep rows with valid bot dates
    
    
    bot_series['Days_Passed'] = (bot_series[bot] - first_seen).dt.days

    # These bots existed before our first historical data collection
    if bot in set(["Amazonbot", "FacebookBot", "Bytespider", "Yeti", "Baiduspider", "PetalBot", "BingBot", "DuckDuckbot"]):
        # Remove rows where Days_Passed is less than 0
        bot_series = bot_series[bot_series['Days_Passed'] >= 0]

    # Update stats and include URL with the minimum days passed
    if not bot_series['Days_Passed'].empty:
        # print(bot, len(bot_series['Days_Passed']))
        min_days = int(bot_series['Days_Passed'].min())
        max_days = int(bot_series['Days_Passed'].max())
        avg_days = float(bot_series['Days_Passed'].mean())
        median_days = int(bot_series['Days_Passed'].median())
        # Find the URL(s) corresponding to the minimum days passed
        min_days_urls = bot_series[bot_series['Days_Passed'] == min_days]['URL'].tolist()
        max_days_urls = bot_series[bot_series['Days_Passed'] == max_days]['URL'].tolist()
        stats_final[bot] = {
            'max': max_days,
            'min': min_days,
            'average': f"{avg_days:.2f}",
            'median': f"{median_days:.2f}",
            'URL_with_min_days': min_days_urls,
            'URL_with_max_days': max_days_urls,
        }
    else:
        print(bot_series)
        print(f"{bot} days passed is empty")

    # Find and store URLs with days passed less than 0
    negative_days_passed_urls = bot_series[bot_series['Days_Passed'] < 0]['URL'].tolist()
    if negative_days_passed_urls:
        urls_with_days_passed_less_than_zero[bot] = negative_days_passed_urls


100%|██████████| 16/16 [00:00<00:00, 769.61it/s]

Empty DataFrame
Columns: [DuckDuckbot, URL, Days_Passed]
Index: []
DuckDuckbot days passed is empty





# Print the table (Table 3 in paper)

In [18]:
# Sort stats_final with the order shown in Table 3 of the paper

ordered_bots = [
    # LLM Vendor
    "GPTBot", "Google-Extended", "anthropic-ai", "ClaudeBot",
    "Amazonbot", "FacebookBot", "Bytespider", "Yeti", "Baiduspider", "PetalBot",
    # Data Broker
    "CCBot", "Omgilibot",
    # RAG Vendor
    "ChatGPT-User", "YouBot", "BingBot", "DuckDuckbot"
]

# Create a new ordered dictionary for pretty printing and table output
from collections import OrderedDict
stats_final = OrderedDict((bot, stats_final[bot]) for bot in ordered_bots if bot in stats_final)

In [19]:
from prettytable import PrettyTable

table = PrettyTable()

# Define the table columns
table.field_names = ["Bot Name", "Median Days After", "Max. Days After", "Min. Days After"]

# Populate the table with data from stats_final
for bot, stats in stats_final.items():
    table.add_row([bot, stats['median'], stats['max'], stats['min']])

# Print the table
print("Days passed (Table 3)")
print(table)

Days passed (Table 3)
+-----------------+-------------------+-----------------+-----------------+
|     Bot Name    | Median Days After | Max. Days After | Min. Days After |
+-----------------+-------------------+-----------------+-----------------+
|      GPTBot     |       29.00       |       470       |        0        |
| Google-Extended |       40.00       |       512       |        1        |
|   anthropic-ai  |       209.00      |       551       |        84       |
|    ClaudeBot    |       80.00       |       346       |       -136      |
|    Amazonbot    |       160.00      |       550       |        6        |
|   FacebookBot   |       342.00      |       621       |       224       |
|    Bytespider   |       238.00      |       551       |        14       |
|       Yeti      |       86.00       |       334       |        9        |
|   Baiduspider   |       38.00       |       327       |        7        |
|     PetalBot    |       355.00      |       742       |        2

In [20]:
print("The following URLs with days passed less than 0:")
print("\nBot Name      | Domain              | First Appearance Date")
print("-" * 65)
for bot, domain_list in urls_with_days_passed_less_than_zero.items():
    for domain in domain_list:
        date = results_df[results_df.URL==domain][bot].tolist()[0]
        print(f"{bot:<12} | {domain:<20} | {date}")

The following URLs with days passed less than 0:

Bot Name      | Domain              | First Appearance Date
-----------------------------------------------------------------
ClaudeBot    | tumblr.com           | 2024-02-27 00:03:59
ClaudeBot    | nytimes.com          | 2024-02-15 00:33:25
ClaudeBot    | cnn.com              | 2024-04-17 00:00:40
ClaudeBot    | theverge.com         | 2024-04-12 00:00:41
ClaudeBot    | indiamart.com        | 2024-03-18 15:51:33
ClaudeBot    | venturebeat.com      | 2023-12-05 00:12:22
ClaudeBot    | coursehero.com       | 2024-03-04 23:03:00
ClaudeBot    | dpreview.com         | 2024-03-01 04:56:50
ChatGPT-User | lemonde.fr           | 2023-03-29 01:44:32
ChatGPT-User | geeksforgeeks.org    | 2023-05-02 13:12:51
ChatGPT-User | expedia.com          | 2023-03-09 10:05:46
ChatGPT-User | instacart.com        | 2023-03-12 00:50:29
ChatGPT-User | tf1.fr               | 2023-03-30 00:13:23
ChatGPT-User | kayak.com            | 2023-03-08 10:23:44
ChatGPT-User