# Is Child-Directed Speech Effective Training Data for Language Models?

## Предустановки

In [1]:
# Report the GPU (if any) attached to this runtime.
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [26]:
!pip install -q datasets transformers accelerate sentencepiece evaluate scipy scikit-learn

In [27]:
import os
import random
from datasets import load_dataset
import datasets

In [28]:
# Single seed for the notebook: it seeds Python's `random` module here and is
# passed to TrainingArguments further down.
SEED = 42
random.seed(SEED)

In [None]:
# Fetch the TinyDialogues repository and make it the working directory;
# later cells call its scripts/ and evaluation-pipeline/ by relative path.
!git clone https://github.com/styfeng/TinyDialogues.git
%cd TinyDialogues

In [29]:
# Mount Google Drive; the trained model is saved under /content/drive later on.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Датасет

In [7]:
# Official TinyDialogues dataset from the Hugging Face Hub
ds = load_dataset("styfeng/TinyDialogues")

print(ds)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

tinydialogue_train_ordered.txt:   0%|          | 0.00/141M [00:00<?, ?B/s]

tinydialogue_val_ordered.txt:   0%|          | 0.00/25.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/110024 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/19708 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 110024
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 19708
    })
})


In [30]:
# =================================================
# Convert dataset to paper‑compatible text format
# =================================================
# Paper requirements (Appendix B):
# - One conversation per example
# - Speaker labels surrounded by ** **
# - Double newlines between utterances
# - <|endoftext|> token at end

# Output directory for the converted text files.
os.makedirs("TD", exist_ok=True)

# Size budget for one written split. write_split counts "tokens" as
# whitespace-separated words (str.split), not tokenizer tokens.
MAX_TOKENS = 5_000_000  # subset for Colab feasibility

def write_split(split, out_path, dataset=None, max_tokens=None):
    """Write one dataset split to a text file, one conversation per line.

    Every example's "text" field is stripped, guaranteed to end with the
    ``<|endoftext|>`` marker, and written as a single line. Writing stops
    before the first example that would push the running word count past
    the budget.

    Args:
        split: Name of the split to export (key into the dataset mapping).
        out_path: Destination file; it is overwritten.
        dataset: Mapping from split name to an iterable of ``{"text": str}``
            examples. Defaults to the notebook-level ``ds``.
        max_tokens: Budget in whitespace-separated words. Defaults to the
            notebook-level ``MAX_TOKENS``.

    Returns:
        None. The number of words written is printed.
    """
    source = ds if dataset is None else dataset
    budget = MAX_TOKENS if max_tokens is None else max_tokens
    eos = "<|endoftext|>"

    token_count = 0
    with open(out_path, "w", encoding="utf-8") as f:
        for ex in source[split]:
            text = ex["text"].strip()
            if not text.endswith(eos):
                # Keep the marker on the same line as the conversation: the
                # file is read back line by line, so a marker on its own line
                # would turn into a separate, content-free example.
                text = f"{text} {eos}" if text else eos
            n_tokens = len(text.split())
            if token_count + n_tokens > budget:
                break
            f.write(text + "\n")
            token_count += n_tokens
    print(f"Wrote {token_count:,} tokens to {out_path}")

# Each call starts its own count from zero, so the MAX_TOKENS budget applies
# to the train and validation files separately.
write_split("train", "TD/train.txt")
write_split("validation", "TD/val.txt")

Wrote 4,999,850 tokens to TD/train.txt
Wrote 4,382,089 tokens to TD/val.txt


In [31]:
# Preview the first 500 characters of the first training conversation.
print(ds["train"][0]["text"][:500])

**Dad**: "Hey sweetie, do you want to paint with Daddy?" \n\n **Child**: "Paint!" \n\n **Dad**: "Yes, we'll use these brushes. But first, let's put on your apron so we don't get paint on your clothes." \n\n **Child**: "Apron!" \n\n **Mom**: "Breakfast is almost ready! Who wants pancakes?" \n\n **Child**: "Pancake!" \n\n **Dad**: "We'll eat first, then paint. Let's wash hands before we eat, okay?" \n\n **Child**: "Wash!" \n\n **Mom**: "Careful, the pancakes are hot. We'll let them cool a little b


In [32]:
# Train a tokenizer with the repository script. The arguments look like
# <train text> <validation text> <output dir> -- confirm against the script.
!python scripts/tokenizers/train_GPT2_tokenizer.py \
    TD/train.txt TD/val.txt TD_tokenizer

tokenizer_config.json: 100% 26.0/26.0 [00:00<00:00, 47.7kB/s]
config.json: 100% 665/665 [00:00<00:00, 3.16MB/s]
vocab.json: 100% 1.04M/1.04M [00:00<00:00, 14.2MB/s]
merges.txt: 100% 456k/456k [00:00<00:00, 10.0MB/s]
tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 21.7MB/s]
['def', 'Ġadd', '_', 'n', 'umbers', '(', 'a', ',', 'Ġb', '):', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ"""', 'Add', 'Ġthe', 'Ġtwo', 'Ġnumbers', 'Ġ`', 'a', '`', 'Ġand', 'Ġ`', 'b', '`', '."', '""', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġreturn', 'Ġa', 'Ġ+', 'Ġb']
50129
[2K[00:00:00] Tokenize words                 ██████████████████ 46911    /    46911
[2K[00:00:00] Count pairs                    ██████████████████ 46911    /    46911
[2K[00:00:02] Compute merges                 ██████████████████ 51742    /    51742
All special tokens: ['<|endoftext|>', '<UNK>']
BOS token: <|endoftext|>
EOS token: <|endoftext|>
PAD token: None
UNK token: <|endoftext|>
SEP token: None
CLS token: None
MASK token: None


In [49]:
# Run the repository's tokenizer check on the tokenizer in TD_tokenizer.
!python scripts/tokenizers/test_GPT2_tokenizer.py TD_tokenizer



 TD_tokenizer 

['do', 'Ġyou', 'Ġwant', 'Ġto', 'Ġlook', 'Ġat', 'Ġthat', 'Ġit', 'Ġsays', 'Ġlook', 'Ġ?']
do you want to look at that it says look ? 

['The', 'Ġyellow', '-', 'billed', 'Ġshri', 'ke', 'Ġ(', "'", 'C', 'or', 'vin', 'ella', 'Ġcor', 'v', 'ina', "')", 'Ġis', 'Ġa', 'Ġlarge', 'Ġpasser', 'ine', 'Ġbird', 'Ġin', 'Ġthe', 'Ġshri', 'ke', 'Ġfamily', '.', 'ĠIt', 'Ġis', 'Ġsometimes', 'Ġknown', 'Ġas', 'Ġthe', 'Ġlong', '-', 'tailed', 'Ġshri', 'ke', ',', 'Ġbut', 'Ġthis', 'Ġis', 'Ġto', 'Ġbe', 'Ġdiscouraged', ',', 'Ġsince', 'Ġit', 'Ġinvites', 'Ġconfusion', 'Ġwith', 'Ġthe', 'Ġlong', '-', 'tailed', 'Ġshri', 'ke', ',', "Ġ'", 'L', 'an', 'ius', 'Ġsch', 'ach', "',", 'Ġof', 'Ġtropical', 'Ġsouthern', 'ĠAsia', '.', 'ĠThe', 'Ġyellow', '-', 'billed', 'Ġshri', 'ke', 'Ġis', 'Ġa', 'Ġcommon', 'Ġresident', 'Ġbreeding', 'Ġbird', 'Ġin', 'Ġtropical', 'ĠAfrica', 'Ġfrom', 'ĠS', 'ene', 'gal', 'Ġeast', 'wards', 'Ġto', 'ĠU', 'g', 'anda', 'Ġand', 'Ġlocally', 'Ġin', 'Ġwesternmost', 'ĠKeny', 'a', '.', 'ĠIt', 'Ġfrequ',

Я посмотрела код скрипта проверки: в нём заданы тестовые предложения, которые он токенизирует, и на них наш токенайзер работает правильно.
Для сравнения запустим тот же скрипт на токенайзере авторов: он устроен так же, поэтому результат должен получиться примерно таким же.


In [11]:
# Same check on the tokenizer that ships with the repository, for comparison.
!python scripts/tokenizers/test_GPT2_tokenizer.py tokenizers/GPT2_tinydialogue



 tokenizers/GPT2_tinydialogue 

['do', 'Ġyou', 'Ġwant', 'Ġto', 'Ġlook', 'Ġat', 'Ġthat', 'Ġit', 'Ġsays', 'Ġlook', 'Ġ', '?']
do you want to look at that it says look ? 

['The', 'Ġyellow', '-', 'billed', 'Ġshri', 'ke', 'Ġ(', "'", 'Cor', 'vin', 'ella', 'Ġcor', 'v', 'ina', "')", 'Ġis', 'Ġa', 'Ġlarge', 'Ġpasser', 'ine', 'Ġbird', 'Ġin', 'Ġthe', 'Ġshri', 'ke', 'Ġfamily', '.', 'ĠIt', 'Ġis', 'Ġsometimes', 'Ġknown', 'Ġas', 'Ġthe', 'Ġlong', '-', 'tailed', 'Ġshri', 'ke', ',', 'Ġbut', 'Ġthis', 'Ġis', 'Ġto', 'Ġbe', 'Ġdiscouraged', ',', 'Ġsince', 'Ġit', 'Ġinvites', 'Ġconfusion', 'Ġwith', 'Ġthe', 'Ġlong', '-', 'tailed', 'Ġshri', 'ke', ',', "Ġ'", 'L', 'an', 'ius', 'Ġsc', 'ha', 'ch', "',", 'Ġof', 'Ġtropical', 'Ġsouthern', 'ĠAsia', '.', 'ĠThe', 'Ġyellow', '-', 'billed', 'Ġshri', 'ke', 'Ġis', 'Ġa', 'Ġcommon', 'Ġresident', 'Ġbreeding', 'Ġbird', 'Ġin', 'Ġtropical', 'ĠAfrica', 'Ġfrom', 'ĠSen', 'eg', 'al', 'Ġeast', 'ward', 's', 'Ġto', 'ĠU', 'g', 'anda', 'Ġand', 'Ġlocally', 'Ġin', 'Ġwesternmost', 'ĠK', 'eny'

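Почти: в целом токенизация совпадает, но отдельные фрагменты разбиты иначе (например, `'Ġ?'` против `'Ġ', '?'`, `'C', 'or'` против `'Cor'`, `'ĠS', 'ene', 'gal'` против `'ĠSen', 'eg', 'al'`). Вероятно, это связано с тем, что наш токенайзер обучен только на подвыборке данных.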

## Обучаем GPT-2 на коленке

In [12]:
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer
from datasets import load_dataset

In [13]:
# Load the tokenizer from TD_tokenizer. The training script reported
# "PAD token: None", so the EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained("TD_tokenizer")
tokenizer.pad_token = tokenizer.eos_token

In [14]:
# Read the text files written earlier back in through the generic "text"
# builder, which produces a single "text" column per split.
data_files = {
    "train": "TD/train.txt",
    "validation": "TD/val.txt"
}

raw_datasets = load_dataset("text", data_files=data_files)


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [15]:
def sample_dataset(dataset, sample_fraction=0.3, seed=None):
    """Return a DatasetDict holding a random subset of every split.

    Args:
        dataset: Mapping from split name to a split that supports ``len()``
            and ``.select(indices)``.
        sample_fraction: Share of each split to keep, in ``[0, 1]``. The
            number of rows kept is ``int(len(split) * sample_fraction)``.
        seed: Optional seed. When given, a private generator seeded with it
            is used, so the draws are reproducible and the module-level
            ``random`` state is left untouched. When ``None``, the
            module-level ``random`` state is used.

    Returns:
        A ``datasets.DatasetDict`` with the same split names, each reduced
        to the sampled rows (in sampled order).

    Raises:
        ValueError: If ``sample_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= sample_fraction <= 1:
        raise ValueError(
            f"sample_fraction must be between 0 and 1, got {sample_fraction!r}"
        )

    # random.Random(seed) produces the same sequence as random.seed(seed)
    # followed by module-level calls, without reseeding the shared generator.
    rng = random if seed is None else random.Random(seed)

    sampled_dataset = {}
    for split_name in dataset:
        split = dataset[split_name]
        total_size = len(split)
        sample_size = int(total_size * sample_fraction)

        # Draw distinct row indices (sampling without replacement).
        sampled_indices = rng.sample(range(total_size), sample_size)

        # Build the reduced split from the chosen rows.
        sampled_dataset[split_name] = split.select(sampled_indices)

    return datasets.DatasetDict(sampled_dataset)

# Keep 30% of every split. The seed is passed explicitly so that re-running
# this cell always selects the same rows, whatever the current RNG state is.
sampled_datasets = sample_dataset(raw_datasets, sample_fraction=0.3, seed=SEED)

Мы взяли 30%, потому что хотели успеть до 18, но всё равно не успеваем. Тем не менее оставим 30%, чтобы, возможно, успеть что-нибудь ещё.

In [16]:
sampled_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 9126
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 5912
    })
})

In [22]:
def tokenize_fn(examples):
    """Tokenize a batch of texts and attach language-modelling labels.

    Args:
        examples: Batch with a "text" entry holding the raw strings.

    Returns:
        The tokenizer output, truncated to at most 1024 tokens per sequence,
        with an extra "labels" entry that is a copy of "input_ids".
    """
    encoded = tokenizer(examples["text"], max_length=1024, truncation=True)
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

# Tokenize every split in batches and drop the raw "text" column so that
# only the fields returned by tokenize_fn remain.
tokenized_datasets = sampled_datasets.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"],
)

Map:   0%|          | 0/9126 [00:00<?, ? examples/s]

Map:   0%|          | 0/5912 [00:00<?, ? examples/s]

In [23]:
# GPT‑2 SMALL config (124M params, as in paper)
# Only the vocabulary size (taken from the tokenizer) and the 1024-token
# context length are set here; everything else is the GPT2Config default.
config = GPT2Config(
    vocab_size=len(tokenizer),
    n_positions=1024,
    n_ctx=1024,
)

In [24]:
# Build the model from the config alone -- no pretrained checkpoint is loaded.
model = GPT2LMHeadModel(config)

In [26]:
# Training setup: one sequence per device per step with 8 accumulation steps,
# checkpoints saved every epoch, training loss logged every 200 steps.
training_args = TrainingArguments(
    output_dir="gpt2_td",
    overwrite_output_dir=True,
    # No evaluation schedule is configured, so `eval_dataset` below is only
    # used if trainer.evaluate() is called explicitly.
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=5,  # paper uses 20; reduced for Colab
    weight_decay=0.0,
    logging_steps=200,
    save_strategy="epoch",
    report_to="none",
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `tokenizer=` is deprecated on Trainer in recent transformers releases;
    # `processing_class` is its replacement.
    processing_class=tokenizer,
)

trainer.train()

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 0, 'bos_token_id': 0, 'pad_token_id': 0}.


Step,Training Loss
200,3.5776
400,2.5305
600,2.2907
800,2.1481
1000,2.0518
1200,1.9511
1400,1.8571
1600,1.7857
1800,1.747
2000,1.6986


TrainOutput(global_step=5705, training_loss=1.6401150662474837, metrics={'train_runtime': 4779.9575, 'train_samples_per_second': 9.546, 'train_steps_per_second': 1.194, 'total_flos': 6961549512960000.0, 'train_loss': 1.6401150662474837, 'epoch': 5.0})

In [27]:
# =================================================
# Save model
# =================================================

# Model weights and tokenizer files are written to the same Drive folder.
final_model_dir = "/content/drive/My Drive/NLPproject/gpt2_td_final"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print("Training complete. Model saved.")

# =================================================
# NEXT STEPS (not run here):
# -------------------------------------------------
# - Zorro evaluation (BabyLM pipeline)
# - Word Similarity benchmarks
# - Dataset comparisons (Wikipedia, CHILDES)
# - Local / global ordering experiments
# =================================================


Training complete. Model saved.


## Оценка

In [37]:
# Shrink every Zorro file to its first 30% of lines and rewrite it as one
# JSON array ("[" + comma-joined lines + "]").
# NOTE: this overwrites the files in place and is not idempotent -- running
# the cell twice subsamples and wraps the already converted files again.
path = 'evaluation-pipeline/filter-data_zorro_dialogue-format-tinydialogue_Mom/'
for file_name in os.listdir(path):
    file_path = os.path.join(path, file_name)
    with open(file_path) as src:
        json_lines = src.readlines()

    # Slice bounds must be integers; compute the subset *before* reopening
    # the file for writing so a failure cannot leave it truncated.
    kept_lines = json_lines[:int(len(json_lines) * 0.3)]

    with open(file_path, 'w') as dst:
        dst.write('[')
        dst.write(','.join(kept_lines))
        dst.write(']')

In [42]:
!python evaluation-pipeline/babylm_eval_zorro2.py \
  "../drive/My Drive/NLPproject/gpt2_td_final" decoder zorro_dialogue-format-tinydialogue_Mom \
  "../drive/MyDrive/NLPproject/eval_results/results.txt" \
  "../drive/My Drive/NLPproject/eval_results/final_avg.csv" \
  "../drive/My Drive/NLPproject/eval_results/results.csv" \
  "../drive/My Drive/NLPproject/eval_results/results.jsonl"

False
`torch_dtype` is deprecated! Use `dtype` instead!
2025-12-28 20:01:41.566580: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-12-28 20:01:41.572330: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-12-28 20:01:41.589443: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766952101.610400   49097 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766952101.616024   49097 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766952101.649461   49097 computation_placer.cc:177] computation placer a