In [1]:
import pandas as pd
import os, json, glob, subprocess

In [2]:
# Load the reference URL list (JSON Lines, one record per line) from the
# stefan-it/grokipedia-urls dataset.
# NOTE(review): the hf:// scheme presumably needs huggingface_hub installed so
# fsspec can resolve it -- confirm in the target environment.
df_urls = pd.read_json(
    "hf://datasets/stefan-it/grokipedia-urls/urls.jsonl",
    lines=True,
)

# Collapse the 'url' column into a set for fast membership / set arithmetic.
url_set = set(df_urls["url"].to_list())

In [6]:
# List the JSONL batch files available in the GCS bucket.
# (This cell only enumerates remote objects; nothing is downloaded here.)

BUCKET = "enwiki-structured-contents-20251028"
LOCAL_DIR = os.path.abspath(os.path.join("..", "scraped_data"))
os.makedirs(LOCAL_DIR, exist_ok=True)

# Requires gsutil on PATH and working credentials:
# gcloud auth application-default login (or VM default creds).

# Enumerate every *.jsonl object directly under a shard_* prefix.
result = subprocess.run(
    ["gsutil", "ls", f"gs://{BUCKET}/shard_*/*.jsonl"],
    check=False,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
)

# A failed listing would otherwise look exactly like an empty bucket, and the
# later cells would silently work from whatever is already on disk.
if result.returncode != 0:
    print(
        f"WARNING: gsutil ls exited with code {result.returncode}: "
        f"{result.stderr.strip()}"
    )

# One gs:// path per non-blank output line.
remote_files = [line.strip() for line in result.stdout.splitlines() if line.strip()]

In [7]:
# Show the remote listing gathered by the `gsutil ls` call (one gs:// object path per entry).
remote_files

['gs://enwiki-structured-contents-20251028/shard_0/batch_480000_480326.jsonl',
 'gs://enwiki-structured-contents-20251028/shard_0/batch_480327_480662.jsonl',
 'gs://enwiki-structured-contents-20251028/shard_0/batch_480663_481000.jsonl',
 'gs://enwiki-structured-contents-20251028/shard_0/batch_481001_481333.jsonl',
 'gs://enwiki-structured-contents-20251028/shard_0/batch_481334_481672.jsonl',
 'gs://enwiki-structured-contents-20251028/shard_0/batch_481673_482007.jsonl',
 'gs://enwiki-structured-contents-20251028/shard_0/batch_482008_482335.jsonl',
 'gs://enwiki-structured-contents-20251028/shard_0/batch_482336_482672.jsonl',
 'gs://enwiki-structured-contents-20251028/shard_0/batch_482673_483011.jsonl',
 'gs://enwiki-structured-contents-20251028/shard_0/batch_483012_483347.jsonl',
 'gs://enwiki-structured-contents-20251028/shard_0/batch_483348_483680.jsonl',
 'gs://enwiki-structured-contents-20251028/shard_0/batch_483681_484021.jsonl',
 'gs://enwiki-structured-contents-20251028/shard_0/b

In [9]:
# Download every listed batch file that is not already present in LOCAL_DIR.
# NOTE(review): files are stored flat under their basename, so two shards that
# hold a batch with the same filename would map to one local path and the
# second one would be skipped as "already downloaded" -- confirm names are unique.
failed_downloads = []
for remote_fp in remote_files:
    # remote_fp looks like: gs://bucket/shard_x/filename.jsonl
    filename = os.path.basename(remote_fp)
    local_fp = os.path.join(LOCAL_DIR, filename)
    if os.path.exists(local_fp):
        continue  # skip if already downloaded

    completed = subprocess.run(
        ["gsutil", "cp", remote_fp, local_fp],
        check=False,
    )
    if completed.returncode != 0:
        failed_downloads.append(remote_fp)
        # The path did not exist before this attempt, so anything there now is
        # a leftover from the failed copy; remove it so a re-run retries the
        # download instead of skipping it.
        if os.path.exists(local_fp):
            os.remove(local_fp)

if failed_downloads:
    print(
        f"WARNING: {len(failed_downloads)} of {len(remote_files)} downloads failed; "
        "re-run this cell to retry."
    )

Copying gs://enwiki-structured-contents-20251028/shard_0/batch_800020_800361.jsonl...
\ [1 files][ 39.7 MiB/ 39.7 MiB]                                                
Operation completed over 1 objects/39.7 MiB.                                     
Copying gs://enwiki-structured-contents-20251028/shard_0/batch_800362_800702.jsonl...
\ [1 files][ 35.5 MiB/ 35.5 MiB]                                                
Operation completed over 1 objects/35.5 MiB.                                     
Copying gs://enwiki-structured-contents-20251028/shard_0/batch_800703_801047.jsonl...
- [1 files][ 31.1 MiB/ 31.1 MiB]                                                
Operation completed over 1 objects/31.1 MiB.                                     
Copying gs://enwiki-structured-contents-20251028/shard_0/batch_801048_801391.jsonl...
- [1 files][ 28.3 MiB/ 28.3 MiB]                                                
Operation completed over 1 objects/28.3 MiB.                                     
Copy

In [None]:
# 2) Aggregate unique URLs from all JSONL files in LOCAL_DIR
urls = set()
files = glob.glob(os.path.join(LOCAL_DIR, "*.jsonl"))

i = 0        # lines seen so far, across all files (drives the progress message)
skipped = 0  # non-blank lines that yielded no URL (bad JSON, non-object, no url key)
for fp in files:
    try:
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(fp, "r", encoding="utf-8") as f:
            for line in f:
                if i % 40000 == 0:
                    print(f"Processing page {i}")
                # Count every line exactly once, whatever happens to it below,
                # so the progress message cannot repeat for consecutive lines.
                i += 1

                if not line.strip():
                    continue

                try:
                    rec = json.loads(line)
                except json.JSONDecodeError:
                    skipped += 1
                    continue

                # A line can be valid JSON without being an object (e.g. a bare
                # string, where `"url" in rec` would be a substring test).
                if not isinstance(rec, dict):
                    skipped += 1
                    continue

                # JSONL structure: {'title', 'url', 'data', 'scraped_at'} or similar
                if "url" in rec:
                    urls.add(rec["url"])
                elif isinstance(rec.get("data"), dict) and "url" in rec["data"]:
                    urls.add(rec["data"]["url"])
                else:
                    skipped += 1
    except FileNotFoundError:
        # File disappeared between the glob and the open; ignore it.
        continue

print(f"Downloaded {len(files)} JSONL files into: {LOCAL_DIR}")
print(f"Unique URLs collected: {len(urls):,}")
if skipped:
    print(f"Lines skipped without a usable URL: {skipped:,}")
# 'urls' now contains the set of all URLs

Processing page 0
Downloaded 1738 JSONL files into: /Users/haltriedman/code/wiki-grok-comparison/scraped_data
Unique URLs collected: 658,619


In [12]:
# Number of reference URLs that do not appear among the scraped URLs.
len(url_set - urls)

226660

In [13]:
# Write the reference URLs that have not been scraped yet, one per line.
# Sorting keeps the file identical from run to run: iteration order of a set of
# strings can differ between interpreter sessions because str hashing is salted.
missing_urls = sorted(url_set.difference(urls))
with open("../urls.txt", "w", encoding="utf-8") as f:
    f.writelines(url + "\n" for url in missing_urls)
