In [2]:
from knowledge_propagation.utils import vars, io, extractor, misc
import pandas as pd
from collections import defaultdict
import os

In [3]:
# Prompt template asking for document-specific learning strategies with no
# downstream task in mind. `{chunk}` is filled with the document text via
# str.format; the model is instructed to prefix each strategy with "##",
# which is the marker learning_strategy_extractor looks for.
task_agnostic_strategy_generation_prompt = """
Consider the following document. What are some strategies specific to this document that I can use to help me learn and remember all of the information contained? Use markdown and prefix each strategy with ##

<document>
{chunk}
</document>
"""
# Prompt template framed around a trivia competition: the model first lists
# questions, then emits the literal marker <start_strategies>, then one
# "##"-prefixed study strategy per question. `{chunk}` is the document text.
task_specific_strategy_generation_prompt = """
I need to study for a trivia competition. Generate a list of questions that covers every piece of information in this document. After generating all the questions, for each question, generate a general study strategy or prompt that would help me memorize that kind of information (without focusing too much on the particular question). The prompt should outline a detailed set of guidelines or step-by-step for how I should rehearse or exercise the information to most effectively internalize it.

Output all the questions, then <start_strategies>, then all the strategies. Prefix each strategy with a ##.

<document>
{chunk}
</document>
"""

# Prompt template that applies one extracted strategy to a document.
# `{strategy}` receives the strategy body and `{chunk}` the document text.
active_reading_w_strategy = """
Here's a learning strategy:
{strategy}

Apply this strategy to the following document:
<document>
{chunk}
</document>
"""

In [4]:
# Name of the dataset split; it is interpolated into every input/output path in this notebook.
test_split_name = "test_id_sample"

In [16]:
# Read the records of the selected split; each record is expected to carry a "text" field.
test_data = io.load_jsonlines(f"/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/{test_split_name}.jsonl")
# Positional index -> document text.
id2text = dict(enumerate(record["text"] for record in test_data))
# Reverse lookup (document text -> positional index); if a text occurs more
# than once, the highest index wins.
text2id = {passage: idx for idx, passage in id2text.items()}

# Strategy-generation prompts for both prompt types (task-agnostic and task-specific)

In [22]:
# One row per (document, prompt type): for every document, first the
# task-agnostic prompt and then the task-specific one.
active_reading_strategy_df_content = []

for i, example in enumerate(test_data):
    active_reading_strategy_df_content.extend(
        {
            "text": example["text"],
            "prompt": template.format(chunk=example["text"]),
            "prompt_type": prompt_type,
        }
        for prompt_type, template in (
            ("task_agnostic", task_agnostic_strategy_generation_prompt),
            ("task_specific", task_specific_strategy_generation_prompt),
        )
    )

In [None]:
# io.dump_jsonlines(active_reading_strategy_df_content, f"/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/{test_split_name}_active_reading_strategy_construction.jsonl")

In [24]:
# Load the strategy-generation rows after completion. Later cells read the
# "text", "prompt_type" and "completion" fields of each record.
# NOTE(review): presumably this file is the dumped prompt file run through gpt-5 (per its name) — confirm.
active_reading_strategy_completions = io.load_jsonlines(f"/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/{test_split_name}_active_reading_strategy_construction_generated(gpt-5).jsonl")

In [37]:
# print(active_reading_strategy_completions[0]["completion"])

In [35]:
def learning_strategy_extractor(text):
    """Split a markdown completion into its "##"-headed learning strategies.

    A strategy starts at a line whose first non-blank characters are exactly
    two hashes ("## Title") and runs until the next such line or the end of
    the text. Deeper headings ("### ...") and "##" occurring in the middle of
    a line are treated as part of the current strategy's content rather than
    as the start of a new strategy.

    Args:
        text: The raw completion text. ``None`` or an empty string yields an
            empty list.

    Returns:
        A list of ``{"title": str, "content": str}`` dicts in document order,
        both values stripped of surrounding whitespace. A heading with no body
        is kept with ``content == ""``; a heading with neither title nor body
        is skipped.
    """
    import re  # stdlib is sufficient; no third-party regex features are needed

    if not text:
        return []

    # Only a line-leading "##" that is not followed by a third "#" is a
    # strategy heading. The title is confined to the heading's own line.
    heading = re.compile(r"^[ \t]*##(?!#)[ \t]*(?P<title>[^\n]*)$", re.MULTILINE)
    headings = list(heading.finditer(text))

    learning_strategies = []
    for position, match in enumerate(headings):
        # The body extends to the next heading, or to the end of the text for
        # the final heading (which therefore needs no trailing newline).
        if position + 1 < len(headings):
            body_end = headings[position + 1].start()
        else:
            body_end = len(text)

        title = match.group("title").strip()
        content = text[match.end():body_end].strip()
        if not title and not content:
            continue
        learning_strategies.append({
            "title": title,
            "content": content
        })
    return learning_strategies

In [36]:
# Smoke-test the extractor on the first completion.
learning_strategies = learning_strategy_extractor(active_reading_strategy_completions[0]["completion"])


In [53]:
# Number of strategy-generation completions loaded.
len(active_reading_strategy_completions)

100

In [39]:
# Expand every completion into one augmentation prompt per extracted strategy.
active_reading_augmentation_df_content = []

for example in active_reading_strategy_completions:
    learning_strategies = learning_strategy_extractor(example["completion"])
    for learning_strategy in learning_strategies:
        # Only the strategy body goes into the prompt; the title is kept as metadata.
        active_reading_augmentation_df_content.append(
            dict(
                text=example["text"],
                prompt=active_reading_w_strategy.format(
                    chunk=example["text"],
                    strategy=learning_strategy["content"],
                ),
                learning_strategy_type=example["prompt_type"],
                learning_strategy_title=learning_strategy["title"],
            )
        )


In [46]:
# Eyeball the first fully formatted augmentation prompt.
print(active_reading_augmentation_df_content[0]["prompt"])


Here's a learning strategy:
- Draw three steps labeled: 1) 8th grade, 2) College, 3) After graduation. Place the matching names on each step: Wordsworth → Marie Antoinette → Franklin D. Roosevelt.

Apply this strategy to the following document:
<document>
Mia Lewis first wrote about William Wordsworth in an 8th-grade book report. In college, she focused her thesis on Marie Antoinette. After graduation, she curated museum exhibitions to honor Franklin D. Roosevelt.
</document>



In [None]:
# io.dump_jsonlines(active_reading_augmentation_df_content, f"/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/{test_split_name}_active_reading_augmentation.jsonl")

In [7]:
# Load the augmentation rows after completion. Later cells read the "text",
# "learning_strategy_type" and "completion" fields of each record.
# NOTE(review): presumably this file is the dumped augmentation prompts run through gpt-5 (per its name) — confirm.
active_reading_augmentation_generated = io.load_jsonlines(f"/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/{test_split_name}_active_reading_augmentation_generated(gpt-5).jsonl")

In [8]:
# Show the completion text of the first augmentation record.
print(active_reading_augmentation_generated[0]["completion"])

1) 8th grade: William Wordsworth
2) College: Marie Antoinette
3) After graduation: Franklin D. Roosevelt


In [9]:
# Total number of augmentation records loaded.
len(active_reading_augmentation_generated)

1016

In [10]:
# Inspect one complete record: source text, prompt, strategy metadata and completion.
active_reading_augmentation_generated[3]

{'text': 'Mia Lewis first wrote about William Wordsworth in an 8th-grade book report. In college, she focused her thesis on Marie Antoinette. After graduation, she curated museum exhibitions to honor Franklin D. Roosevelt.',
 'prompt': "\nHere's a learning strategy:\n- Sketch three icons in a row and label them: Book (book report) → Cap (thesis) → Frame (exhibition). Under each, write the name: Wordsworth → Marie Antoinette → FDR.\n\nApply this strategy to the following document:\n<document>\nMia Lewis first wrote about William Wordsworth in an 8th-grade book report. In college, she focused her thesis on Marie Antoinette. After graduation, she curated museum exhibitions to honor Franklin D. Roosevelt.\n</document>\n",
 'learning_strategy_type': 'task_agnostic',
 'learning_strategy_title': 'Dual-coding with simple icons',
 'completion': 'Book (book report) → Cap (thesis) → Frame (exhibition)\nWordsworth → Marie Antoinette → FDR'}

In [11]:
# Tabulate the augmentation records to count rows per strategy type.
df = pd.DataFrame(active_reading_augmentation_generated)
# len(df[df["learning_strategy_type"] == "task_agnostic"])
len(df.loc[df["learning_strategy_type"].eq("task_specific")])

483

In [26]:
# Which strategy types to export; the choice is also encoded in the output directory name.
# learning_strategy_types = ["task_agnostic",]
# learning_strategy_types = ["task_specific",]
learning_strategy_types = ["task_agnostic", "task_specific"]

# Per-document payload, keyed by the document's index in test_data:
#   [None, <original document text>, <completion 1>, <completion 2>, ...]
# NOTE(review): the leading None is preserved from the existing file layout;
# whatever consumes these files is not visible here — confirm it is still needed.
id2augmentation = {}

for augmentation in active_reading_augmentation_generated:
    assert augmentation["text"] in text2id, "augmentation text not found in the loaded test split"
    if augmentation["learning_strategy_type"] not in learning_strategy_types:
        continue
    # Named doc_id rather than `id`: a notebook-level loop variable called `id`
    # would shadow the builtin for every cell that runs afterwards.
    doc_id = text2id[augmentation["text"]]
    id2augmentation.setdefault(doc_id, [None, id2text[doc_id]]).append(augmentation["completion"])
# len(id2augmentation)

aug_dir = f"/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/{test_split_name}/active_reading-{'-'.join(learning_strategy_types)}"
os.makedirs(aug_dir, exist_ok=True)
# One JSON file per document, named after its index.
for doc_id, augmentations in id2augmentation.items():
    io.dump_json(augmentations, os.path.join(aug_dir, f"{doc_id}.json"))


In [27]:
# Shows the payload of the last document written; `augmentations` is the
# loop variable left over from the export loop in the previous cell.
augmentations

[None,
 'Ryan Kelly was born in India. He spent most of his adult life in Oman. After retirement, he lived in Norway and passed away.',
 '- Chronological order: ION (India → Oman → Norway)\n- Chant: “ION: birth in India, work in Oman, rest in Norway.”\n- Applied: Ryan Kelly = ION — born in India, spent most adult life in Oman, retired and passed away in Norway.',
 '- Born → India\n- Adult → Oman\n- Retired → Norway\n\nBAR over ION. BAR over ION.',
 'For Ryan Kelly:\n- Left: 👶 India\n- Middle: 💼 Oman\n- Right: 🕯️ Norway\n\n10-sec redraw: 👶 India | 💼 Oman | 🕯️ Norway',
 'Applied mnemonic\n\n- Hot → India: Born in India. Color cue: saffron and green.\n- Desert → Oman: Spent most of his adult life in Oman. Color cue: red, white, and green.\n- Cold → Norway: After retirement he lived in Norway and passed away. Color cue: red cross on blue.\n\nVisualization cue: Picture Ryan moving from a saffron‑green sunrise (India), across a red‑white‑green desert camp (Oman), to a red cross on blue ice (

In [28]:
# NOTE(review): scratch arithmetic. 2048 is also the block_size passed to
# _MemmapDataset later in this notebook; what 16 stands for is not shown — document or remove.
16 * 2048

32768

In [29]:
# NOTE(review): scratch arithmetic with unexplained constants — document their meaning or remove.
334_000 / 2000

167.0

In [30]:
from data.cptdata import _MemmapDataset

In [33]:
# Length of the `.ids` attribute of the memmapped dataset built from the given .bin file.
# NOTE(review): presumably `.ids` is the flat token-id array, making this a token count — confirm against _MemmapDataset.
len(_MemmapDataset(block_size=2048, bin_file="/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/bins/4K_controlled_RE-test_id_sample-meta_aug-one_stage-naive-Qwen2.5-1.5B-Instruct.bin", subsample_ratio=1).ids)

17376