# Dedupe Grokipedia content

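I realized that I accidentally mis-parsed `<h1>` blocks in Grokipedia, so each page's main (`<h1>`) section ended up containing a great deal of content that also appears in the page's other sections. This notebook dedupes that content by removing from every `<h1>` section any item that is repeated in another section of the same page.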

In [None]:
import json
import os
from dotenv import load_dotenv
import logging
from pathlib import Path
import tempfile
import shutil


# Set up logging: configure the root logger at INFO level and create this
# notebook's module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment variables from a .env file (python-dotenv).
# NOTE(review): none of the visible cells reads an environment variable —
# confirm this call is still needed.
load_dotenv()


In [None]:
def _content_key(item):
    """Return the ``(type, text)`` pair used to compare two content items."""
    return (item.get("type"), item.get("text"))


def _section_signature(section):
    """Return the set of content keys for the dict items of ``section``."""
    if not isinstance(section, dict):
        return set()
    content = section.get("content") or []
    return {_content_key(item) for item in content if isinstance(item, dict)}


def process_page(page):
    """Strip from every h1 section the content that other sections repeat.

    Content items are compared by their ``(type, text)`` pair. For each
    section whose ``level`` is ``"h1"``, any dict item whose pair also occurs
    in a different section of the same page is removed from the h1 section.
    Items that are not dicts cannot be compared and are kept untouched.

    Args:
        page: A decoded page record. Sections are read from
            ``page["data"]["sections"]``; each section is expected to be a
            dict with ``"level"`` and ``"content"`` keys.

    Returns:
        The same ``page`` object, mutated in place. Pages without usable
        sections are returned unchanged.

    Raises:
        TypeError: If an item's ``type`` or ``text`` value is unhashable
            (e.g. a list), since the pairs are stored in sets.
    """
    # Tolerate records that do not have the expected shape instead of
    # aborting a whole file on one odd line.
    if not isinstance(page, dict):
        return page
    data = page.get("data") or {}
    if not isinstance(data, dict):
        return page
    sections = data.get("sections") or []
    if not isinstance(sections, list) or not sections:
        return page

    # One signature (set of content keys) per section, index-aligned.
    signatures = [_section_signature(section) for section in sections]

    for idx, section in enumerate(sections):
        if not isinstance(section, dict) or section.get("level") != "h1":
            continue

        content = section.get("content") or []
        if not isinstance(content, list):
            # Unexpected shape: leave it alone rather than rewriting it.
            continue

        # Everything that appears in any *other* section of this page.
        other_keys = set().union(
            *(sig for j, sig in enumerate(signatures) if j != idx)
        )

        deduped = [
            item for item in content
            if not isinstance(item, dict) or _content_key(item) not in other_keys
        ]
        section["content"] = deduped

        # Refresh this section's signature so a later h1 is compared against
        # what is actually left here. This matters when two h1 sections share
        # an item: the first one drops it and the later one keeps it, instead
        # of both losing it.
        signatures[idx] = {
            _content_key(item) for item in deduped if isinstance(item, dict)
        }

    return page

def process_file(in_fp, out_fp=None):
    """Run ``process_page`` over every record of a JSONL file.

    Each non-blank line of ``in_fp`` is decoded as JSON, passed through
    ``process_page`` and written as one line of output. Lines that are not
    valid JSON are skipped (and therefore absent from the output); a warning
    naming the file and line number is logged for each of them.

    Args:
        in_fp: Path of the JSONL file to read.
        out_fp: Path of the file to write (truncated first). When ``None``
            the input file is rewritten in place: output goes to a temporary
            file in the same directory, which then replaces ``in_fp``.

    Returns:
        The number of pages written.

    Raises:
        OSError: If a file cannot be opened, written or replaced. In in-place
            mode the temporary file is removed before the error propagates,
            and ``in_fp`` is left as it was.
    """
    log = logging.getLogger(__name__)

    inplace = out_fp is None
    if inplace:
        # Same directory as the input so the final rename never crosses a
        # filesystem boundary.
        parent = Path(in_fp).parent
        tmp_fh, out_fp = tempfile.mkstemp(prefix="dedupe_", suffix=".jsonl", dir=str(parent))
        os.close(tmp_fh)

    pages_processed = 0
    try:
        with open(in_fp, "r", encoding="utf-8") as fin, open(out_fp, "w", encoding="utf-8") as fout:
            for line_no, line in enumerate(fin, start=1):
                line = line.strip()
                if not line:
                    continue
                try:
                    page = json.loads(line)
                except json.JSONDecodeError as exc:
                    log.warning("Skipping malformed JSON at %s:%d (%s)", in_fp, line_no, exc)
                    continue
                page = process_page(page)
                fout.write(json.dumps(page, ensure_ascii=False) + "\n")
                pages_processed += 1

        if inplace:
            # mkstemp creates the file with owner-only permissions; carry the
            # original mode over so the rewrite does not change access rights.
            shutil.copymode(in_fp, out_fp)
            # os.replace overwrites the destination in a single rename.
            os.replace(out_fp, in_fp)
    except BaseException:
        if inplace:
            # Do not leave a stray dedupe_*.jsonl next to the data files.
            Path(out_fp).unlink(missing_ok=True)
        raise

    return pages_processed

def _same_file(path_a, path_b):
    """Return True when both paths resolve to the same filesystem location."""
    return Path(path_a).resolve() == Path(path_b).resolve()


def _append_processed_pages(in_fp, outfile):
    """Dedupe every page of ``in_fp`` and append the results to ``outfile``.

    Blank lines are ignored; lines that are not valid JSON are skipped with a
    logged warning. Returns the number of pages appended.
    """
    log = logging.getLogger(__name__)
    appended = 0
    with open(in_fp, "r", encoding="utf-8") as fin, open(outfile, "a", encoding="utf-8") as fout:
        for line_no, line in enumerate(fin, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                page = json.loads(line)
            except json.JSONDecodeError as exc:
                log.warning("Skipping malformed JSON at %s:%d (%s)", in_fp, line_no, exc)
                continue
            page = process_page(page)
            fout.write(json.dumps(page, ensure_ascii=False) + "\n")
            appended += 1
    return appended


def dedupe_main_section(
    scraped_dir="../scraped_data",
    test_fp=None,
    outfile=None,
    glob_pattern="*.jsonl"  # direct children only; pass "**/*.jsonl" to recurse
):
    """
    For each page record, removes from the main (h1) section any content item (by (type, text))
    that also appears in any other section of the same page.

    Behavior:
      - If test_fp is provided: process only that file.
      - Else: process every ``.jsonl`` file matched by ``Path(scraped_dir).glob(glob_pattern)``.
        The default pattern matches direct children only; use ``"**/*.jsonl"`` to recurse.
      - If outfile is provided: append processed pages to outfile (do not modify inputs).
        The outfile is never truncated, so re-running accumulates records. Its parent
        directory is created if missing. An outfile that is itself matched by the glob
        is not treated as an input.
      - If outfile is None: rewrite input files in place via ``process_file``.

    Returns:
      - When processing a single file (test_fp): number of pages processed.
      - When processing multiple: tuple ``(total_files, total_pages)``.

    Raises:
      - ValueError: if test_fp and outfile refer to the same file (the file would be
        read while being appended to).
    """
    if outfile:
        Path(outfile).parent.mkdir(parents=True, exist_ok=True)

    # Single-file mode
    if test_fp:
        if outfile:
            if _same_file(test_fp, outfile):
                raise ValueError(
                    f"outfile must differ from test_fp: {test_fp}"
                )
            processed = _append_processed_pages(test_fp, outfile)
            print(f"Processed {processed} pages from {test_fp} → appended to {outfile}")
            return processed

        processed = process_file(test_fp, out_fp=None)
        print(f"Processed {processed} pages (in place): {test_fp}")
        return processed

    # Directory mode
    base = Path(scraped_dir)
    total_pages = 0
    total_files = 0

    # The file list is materialized up front, so temp files created by
    # in-place rewrites are not picked up during the same run.
    for fp in sorted(base.glob(glob_pattern)):
        if not fp.is_file() or fp.suffix.lower() != ".jsonl":
            continue
        if outfile and _same_file(fp, outfile):
            # Never feed the combined output back in as an input.
            continue
        total_files += 1
        if outfile:
            total_pages += _append_processed_pages(fp, outfile)
        else:
            total_pages += process_file(str(fp), out_fp=None)

    print(f"Processed {total_pages} pages across {total_files} files" + (f" → appended to {outfile}" if outfile else " (in place)"))
    return total_files, total_pages

In [None]:
# Smoke test on a single file: processed pages are appended to `outfile`
# and the input file itself is not modified. Because the output is opened
# in append mode, re-running this cell adds the same records again.
# NOTE(review): test_fp is an absolute path on the author's machine —
# adjust it before running elsewhere.
test_fp = '/Users/haltriedman/code/wiki-grok-comparison/scraped_data/clive_anderson.jsonl'
outfile = 'test_out.jsonl'

dedupe_main_section(test_fp=test_fp, outfile=outfile)

In [None]:
# Full run with the defaults: outfile is None, so every "*.jsonl" file
# matched in ../scraped_data is rewritten in place.
dedupe_main_section()