# Analysis of skipped + errored pages in Grokipedia scrape

In [None]:
import pandas as pd
import os, json, glob, subprocess

## Load all URLs from Hugging Face (HF)

In [None]:
# Read the reference URL list (JSON Lines, one record per line) from the
# hf:// dataset path and keep the distinct values of its "url" column.
df_urls = pd.read_json(
    "hf://datasets/stefan-it/grokipedia-urls/urls.jsonl",
    lines=True,
)
url_set = set(df_urls["url"])

## Download data from GCS

Files that already exist locally are skipped; only missing files are downloaded.

In [None]:
# Download every JSONL batch under the bucket's shard_* prefixes that is not
# already present in LOCAL_DIR.

BUCKET = "enwiki-structured-contents-20251028"
LOCAL_DIR = os.path.abspath(os.path.join("..", "scraped_data"))
os.makedirs(LOCAL_DIR, exist_ok=True)

# Requires: gcloud auth application-default login (or VM default creds) and gsutil installed

# List the remote batch files. The listing stays best-effort (check=False),
# but a non-zero exit status is reported so that an empty listing caused by a
# gsutil error is not mistaken for "nothing to download".
result = subprocess.run(
    ["gsutil", "ls", f"gs://{BUCKET}/shard_*/*.jsonl"],
    check=False,
    stdout=subprocess.PIPE,
    text=True,
)
if result.returncode != 0:
    print(
        f"WARNING: gsutil ls exited with status {result.returncode}; "
        "the remote file listing may be empty or incomplete"
    )
remote_files = [line.strip() for line in result.stdout.splitlines() if line.strip()]

# Files from all shards are stored flat in LOCAL_DIR under their base name, so
# two shards using the same file name would shadow each other. Track the names
# seen in this run to surface that instead of silently dropping a file.
seen_names = {}
failed_downloads = []

for remote_fp in remote_files:
    # remote_fp looks like: gs://bucket/shard_x/filename.jsonl
    filename = os.path.basename(remote_fp)
    local_fp = os.path.join(LOCAL_DIR, filename)

    first_source = seen_names.setdefault(filename, remote_fp)
    if first_source != remote_fp:
        print(
            f"WARNING: {remote_fp} has the same file name as {first_source}; "
            "only one of them is kept locally"
        )

    if os.path.exists(local_fp):
        continue  # skip if already downloaded

    copy_result = subprocess.run(
        ["gsutil", "cp", remote_fp, local_fp],
        check=False,
    )
    if copy_result.returncode != 0:
        # Keep going with the remaining files, but remember the failure.
        failed_downloads.append(remote_fp)

if failed_downloads:
    print(
        f"WARNING: {len(failed_downloads)} of {len(remote_files)} downloads failed, "
        f"e.g. {failed_downloads[0]}"
    )

## Get set of URLs from GCS + local

In [None]:
# Aggregate unique URLs from all JSONL files in LOCAL_DIR
urls = set()
files = glob.glob(os.path.join(LOCAL_DIR, "*.jsonl"))

# `i` counts every line read (blank, malformed or valid), so the progress
# message is printed exactly once per 40,000 lines and the counter never stalls.
i = 0
for fp in files:
    try:
        # Decode explicitly as UTF-8 rather than relying on the platform's
        # locale-dependent default encoding.
        with open(fp, "r", encoding="utf-8") as f:
            for line in f:
                if i % 40000 == 0:
                    print(f"Processing page {i}")
                i += 1

                if not line.strip():
                    continue
                try:
                    rec = json.loads(line)
                except json.JSONDecodeError:
                    continue  # skip malformed lines
                if not isinstance(rec, dict):
                    continue  # a bare JSON string/number/list carries no URL field

                # JSONL structure: {'title', 'url', 'data', 'scraped_at'} or similar;
                # the URL is read from the top level, falling back to rec["data"].
                if "url" in rec:
                    urls.add(rec["url"])
                elif isinstance(rec.get("data"), dict) and "url" in rec["data"]:
                    urls.add(rec["data"]["url"])
    except FileNotFoundError:
        # The file disappeared between the glob and the open; ignore it.
        continue

print(f"Considered {len(files)} JSONL files into: {LOCAL_DIR}")
print(f"Unique URLs collected: {len(urls):,}")

In [None]:
# Number of reference URLs that do not appear among the collected URLs
len(url_set - urls)

In [None]:
# Write the URLs that are in the reference list but not in the collected set
# to ../urls.txt (one per line) for a later retry.
# - Sorted, so the file is identical across runs (set iteration order is not).
# - Explicit UTF-8, so a non-ASCII URL cannot fail on a non-UTF-8 locale.
with open("../urls.txt", "w", encoding="utf-8") as f:
    f.writelines(url + "\n" for url in sorted(url_set.difference(urls)))
