### 1. Preprocess and split all books into segments

In [7]:
import json
import sys
from pathlib import Path

sys.path.append(str(Path.cwd().parent))
from book_segmenting import TextSegmenter
from book_preprocessing import TxtBookPreprocessor
from utils import SEGMENT_DIR, BOOK_DIR

SEGMENT_CHARS_MIN = 150
SEGMENT_CHARS_MAX = 500
segment_dir = SEGMENT_DIR / "batch2"

preprocessor = TxtBookPreprocessor()
segmenter = TextSegmenter(segment_size=(SEGMENT_CHARS_MIN, SEGMENT_CHARS_MAX))

for f in segment_dir.glob("*.json"):
    f.unlink()
for book in BOOK_DIR.glob("*.txt"):
    with open(book, "r", encoding="utf-8-sig") as f:
        book_content = f.read()
    book_slug = book.stem
    chunks = segmenter.segment_text(preprocessor.clean_text(book_content))
    with open(segment_dir / f"{book_slug}.json", "w", encoding="utf-8") as f:
        json.dump(chunks, f, indent=2)

### 2. Sample N segments from all unused yet

In [8]:
import sys
from pathlib import Path

sys.path.append(str(Path.cwd().parent))
from utils import SEGMENT_DIR, TO_ANNOTATE_DIR, BOOK_META_DIR
from dataset_small.segment_sampling import SegmentSampler

N = 10000
BATCH_NAME = "batch_10k"
MAX_PER_BOOK = 3
SEED = 42
segment_dir = SEGMENT_DIR / "batch2"

sampler = SegmentSampler(segment_dir, BOOK_META_DIR, TO_ANNOTATE_DIR, SEED)
sample = sampler.sample_balanced(N, BATCH_NAME, max_per_book=MAX_PER_BOOK)
sampler.print_status()

Available: 610668 segments
  Science Fiction: only 90 available (wanted 1428)
  Fiction: only 603 available (wanted 1428)
  Western: only 66 available (wanted 1428)
  Fantasy: only 147 available (wanted 1428)
  Mystery: only 222 available (wanted 1428)
  History: only 57 available (wanted 1428)
  Travel: only 15 available (wanted 1428)

Saved 10000 segments to /home/terra/Projects/vis-desc/modules/lab/data/to-annotate/batch_10k.json
=== CORPUS STATUS ===
Books: 400
Total segments: 611140
Used segments: 500
Available: 610640

By genre:
  Fiction: 315528 (201 books)
  Mystery: 106695 (74 books)
  Fantasy: 60682 (49 books)
  Western: 45875 (22 books)
  Science Fiction: 43143 (30 books)
  History: 32717 (19 books)
  Travel: 6500 (5 books)


### Chart

In [None]:
import json
import collections
from pathlib import Path
import matplotlib.pyplot as plt
from utils import BOOK_META_DIR, DATA_DIR

genre_counts = collections.Counter()
for meta_file in BOOK_META_DIR.glob("*.json"):
    try:
        with open(meta_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        g = data.get("genre") or "Unknown"
        if isinstance(g, str):
            g = g.strip() or "Unknown"
        else:
            g = "Unknown"
        if g.lower() == "science fiction":
            g = "Sci-Fi"
        genre_counts[g] += 1
    except Exception:
        pass

if not genre_counts:
    print("No genre data found for Matplotlib chart.")
else:
    genres, counts = zip(*genre_counts.most_common())
    total = sum(counts)
    plt.rcParams.update({"font.size": 42, "axes.titlesize": 48, "legend.fontsize": 32})

    fig, ax = plt.subplots(figsize=(20, 14), dpi=120)

    def autopct_fmt(pct):
        return ("%0.1f%%" % pct) if pct >= 1 else ""

    wedges, texts, autotexts = ax.pie(
        counts,
        labels=None,  # legend instead of slice text
        autopct=autopct_fmt,
        startangle=90,
        counterclock=False,
        pctdistance=0.70,
        textprops={"color": "#111", "fontsize": 44},
        wedgeprops={"linewidth": 2.5, "edgecolor": "white"},
    )

    for t in autotexts:
        t.set_fontsize(44)
        t.set_fontweight("bold")

    from matplotlib.patches import Patch

    legend_patches = [
        Patch(facecolor=w.get_facecolor(), edgecolor="white", label=f"{g} ({c})")
        for w, g, c in zip(wedges, genres, counts)
    ]

    fig.subplots_adjust(left=0, right=0.80, top=1, bottom=0)

    leg = fig.legend(
        handles=legend_patches,
        loc="center left",
        bbox_to_anchor=(0.7, 0.5),
        frameon=True,
        title_fontsize=36,
        borderpad=0.4,
        labelspacing=0.9,
        handlelength=1.6,
        handleheight=1.1,
    )
    frame = leg.get_frame()
    frame.set_alpha(0.18)
    frame.set_edgecolor("#bcbcbc")
    frame.set_linewidth(1.2)
    frame.set_facecolor("#ffffff")

    ax.set_aspect("equal")

    pie_path = DATA_DIR / "genre_distribution.png"
    fig.savefig(pie_path, bbox_inches="tight", pad_inches=0, transparent=True)
    print("Saved pie to", pie_path)
    plt.show()
    print("Total books counted:", total)

print("Done.")

### Small part 2 (prefer new books)

sample from new books first, sample from the old ones only after the book/genre limit is reached

In [2]:
import sys
import json
from pathlib import Path
from collections import defaultdict

sys.path.append(str(Path.cwd().parent))
from utils import SEGMENT_DIR, TO_ANNOTATE_DIR, BOOK_META_DIR
from dataset_small.segment_sampling import SegmentSampler

NEW_DIR = SEGMENT_DIR / "batch2"
OLD_DIR = SEGMENT_DIR / "batch1"

only_new_books = set(f.stem for f in NEW_DIR.glob("*.json")) - set(
    f.stem for f in OLD_DIR.glob("*.json")
)
print(f"Only-new books: {len(only_new_books)}")

N_PREF = 500
BATCH_NAME_PREF = "batch_002"
MAX_PER_BOOK_PREF = 3
SEED = 42

new_sampler = SegmentSampler(NEW_DIR, BOOK_META_DIR, TO_ANNOTATE_DIR, SEED)
old_sampler = SegmentSampler(OLD_DIR, BOOK_META_DIR, TO_ANNOTATE_DIR, SEED)

used_segments = new_sampler._load_used_segments()


def available_segments(sampler):
    all_segments = sampler._load_all_segments_with_book_meta()
    return [s for s in all_segments if s["segment_id"] not in used_segments]


available_new = available_segments(new_sampler)
available_old = available_segments(old_sampler)
print(f"Available new segments: {len(available_new)}")
print(f"Available old segments: {len(available_old)}")

# Genres present in either set
genres = sorted({s["genre"] for s in available_new + available_old})
target_total = min(N_PREF, len(available_new) + len(available_old))
per_genre = max(1, target_total // len(genres))
print(
    f"Target total: {target_total} | Genres: {len(genres)} | Per-genre target: {per_genre}"
)

samples = []
chosen_ids = set()


def sample_from_pool(pool, needed, max_per_book):
    # Group by book, enforce per-book limit then randomly sample up to needed
    by_book = defaultdict(list)
    for seg in pool:
        by_book[seg["book_id"]].append(seg)
    candidate_segments = []
    for book_segs in by_book.values():
        take = min(max_per_book, len(book_segs))
        candidate_segments.extend(new_sampler._rng.sample(book_segs, take))
    if len(candidate_segments) <= needed:
        return candidate_segments
    return new_sampler._rng.sample(candidate_segments, needed)


for genre in genres:
    # New first
    new_pool = [
        s
        for s in available_new
        if s["genre"] == genre and s["segment_id"] not in chosen_ids
    ]
    take_new = min(per_genre, len(new_pool))
    picked_new = sample_from_pool(new_pool, take_new, MAX_PER_BOOK_PREF)
    for s in picked_new:
        chosen_ids.add(s["segment_id"])
    samples.extend(picked_new)

    remaining_needed = per_genre - len(picked_new)
    if remaining_needed > 0:
        old_pool = [
            s
            for s in available_old
            if s["genre"] == genre and s["segment_id"] not in chosen_ids
        ]
        take_old = min(remaining_needed, len(old_pool))
        picked_old = sample_from_pool(old_pool, take_old, MAX_PER_BOOK_PREF)
        for s in picked_old:
            chosen_ids.add(s["segment_id"])
        samples.extend(picked_old)
        if len(picked_old) < remaining_needed:
            print(
                f"  {genre}: shortage (needed {per_genre}, got {len(picked_new) + len(picked_old)})"
            )

# Fill remaining slots from any available (prefer new still)
if len(samples) < target_total:
    remaining_slots = target_total - len(samples)
    remaining_new = [s for s in available_new if s["segment_id"] not in chosen_ids]
    remaining_old = [s for s in available_old if s["segment_id"] not in chosen_ids]
    # Try new first
    add_new = min(remaining_slots, len(remaining_new))
    if add_new:
        samples.extend(new_sampler._rng.sample(remaining_new, add_new))
        chosen_ids.update(s["segment_id"] for s in samples[-add_new:])
    remaining_slots -= add_new
    if remaining_slots > 0 and remaining_old:
        add_old = min(remaining_slots, len(remaining_old))
        samples.extend(old_sampler._rng.sample(remaining_old, add_old))
        chosen_ids.update(s["segment_id"] for s in samples[-add_old:])

new_sampler._rng.shuffle(samples)

# Save
output_file = TO_ANNOTATE_DIR / f"{BATCH_NAME_PREF}.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(samples, f, indent=2)
print(f"Saved {len(samples)} segments to {output_file}")

# Stats
new_count = sum(1 for s in samples if s["book_slug"] in only_new_books)
old_count = len(samples) - new_count
print(f"Sample composition -> New: {new_count} | Old: {old_count}")
by_genre_sample = defaultdict(int)
for s in samples:
    by_genre_sample[s["genre"]] += 1
print("By genre (sampled):")
for g, c in sorted(by_genre_sample.items(), key=lambda x: x[1], reverse=True):
    print(f"  {g}: {c}")

print("Preferential sampling done.")

Only-new books: 110
Available new segments: 610668
Available old segments: 426190
Target total: 500 | Genres: 7 | Per-genre target: 71
Available new segments: 610668
Available old segments: 426190
Target total: 500 | Genres: 7 | Per-genre target: 71
  History: shortage (needed 71, got 60)
  Travel: shortage (needed 71, got 18)
  History: shortage (needed 71, got 60)
  Travel: shortage (needed 71, got 18)
Saved 500 segments to /home/terra/Projects/vis-desc/modules/lab/data/to-annotate/batch_002.json
Sample composition -> New: 150 | Old: 350
By genre (sampled):
  Fiction: 107
  Mystery: 80
  Fantasy: 79
  Science Fiction: 76
  Western: 75
  History: 65
  Travel: 18
Preferential sampling done.
Saved 500 segments to /home/terra/Projects/vis-desc/modules/lab/data/to-annotate/batch_002.json
Sample composition -> New: 150 | Old: 350
By genre (sampled):
  Fiction: 107
  Mystery: 80
  Fantasy: 79
  Science Fiction: 76
  Western: 75
  History: 65
  Travel: 18
Preferential sampling done.


In [3]:
# Split batch_002 into two halves for annotation
BATCH_FILE = TO_ANNOTATE_DIR / "batch_002.json"
with open(BATCH_FILE, "r", encoding="utf-8") as f:
    all_samples = json.load(f)

midpoint = len(all_samples) // 2
batch_002_a = all_samples[:midpoint]
batch_002_b = all_samples[midpoint:]
with open(TO_ANNOTATE_DIR / "batch_002_m.json", "w", encoding="utf-8") as f:
    json.dump(batch_002_a, f, indent=2)
with open(TO_ANNOTATE_DIR / "batch_002_v.json", "w", encoding="utf-8") as f:
    json.dump(batch_002_b, f, indent=2)