In [1]:
import copy
import random
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Sequence

import torch
import torch.distributed
import transformers
from datasets import load_dataset
from transformers import Trainer
import sys
import os

# Label value written over prompt positions (see `preprocess`) and used as the
# label padding value in the collator; -100 matches the default `ignore_index`
# of PyTorch's cross-entropy loss, so those positions do not contribute to it.
IGNORE_INDEX = -100
# Presumably an end-of-turn marker string; it is not referenced anywhere else
# in the visible notebook cells.
EOT_TOKEN = "<|EOT|>"


In [2]:
def build_instruction_prompt(instruction: str):
    """Wrap an instruction in the `<|im_start|>`/`<|im_end|>` chat layout.

    The result is a fixed system turn, a user turn holding the stripped
    instruction, and an opened assistant turn for the model to complete.

    Args:
        instruction: Raw instruction text; surrounding whitespace is removed.

    Returns:
        The prompt string ending with the assistant turn header.
    """
    system_turn = (
        "<|im_start|>system\n"
        "You are Qwen, created by Alibaba Cloud. You are a helpful assistant "
        "used for generating proofs in Isabelle to prove the provided natural "
        "language statements.<|im_end|>\n"
    )
    user_turn = "<|im_start|>user\n" + instruction.strip() + "<|im_end|>\n"
    assistant_header = "<|im_start|>assistant\n"
    return (system_turn + user_turn + assistant_header).lstrip()


@dataclass
class ModelArguments:
    """Command-line arguments that select the pretrained checkpoint."""

    # Hub identifier or local path handed to the `from_pretrained` calls in train().
    model_name_or_path: Optional[str] = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"


@dataclass
class DataArguments:
    """Command-line arguments describing the training data.

    `data_path` lists the files passed to `load_dataset`; the two field names
    say which columns of each record hold the instruction and the target output.
    """

    data_path: List[str] = field(
        default_factory=list,
        metadata=dict(help="Paths to the training data."),
    )
    instruction_field: str = field(
        default="instruction",
        metadata=dict(help="The field name for the instruction"),
    )
    output_field: str = field(
        default="output",
        metadata=dict(help="The field name for the output"),
    )


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """`transformers.TrainingArguments` extended with a few extra options.

    Adds a dataset cache directory, overrides the default optimizer name and
    declares a maximum sequence length option.
    """

    # Forwarded to `load_dataset(cache_dir=...)` in train().
    cache_dir: Optional[str] = None
    optim: str = "adamw_torch"
    # NOTE: ignore this -- the visible code never forwards the value to the
    # tokenizer, and truncation is commented out in `_tokenize_fn`, so setting
    # it does not shorten sequences.
    model_max_length: int = field(
        default=512,
        metadata=dict(
            help="Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        ),
    )


def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
    """Gather the model weights and write them to `output_dir`.

    The state dict is collected on every caller; only callers whose
    `trainer.args.should_save` is true copy it to CPU and save it.

    Args:
        trainer: Trainer whose model is saved.
        output_dir: Destination directory handed to `trainer._save`.
    """
    weights = trainer.model.state_dict()
    if not trainer.args.should_save:
        return

    # Move every tensor to host memory before saving.
    host_weights = {}
    for name, tensor in weights.items():
        host_weights[name] = tensor.cpu()
    # Drop the reference to the original tensors before writing.
    del weights
    trainer._save(output_dir, state_dict=host_weights)  # noqa


In [3]:
def _tokenize_fn(
    strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer
) -> Dict:
    """Tokenize a list of strings."""
    tokenized_list = [
        tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            # max_length=tokenizer.model_max_length,
            # truncation=True,
        )
        for text in strings
    ]

    input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
    input_ids_lens = labels_lens = [
        tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item()
        for tokenized in tokenized_list
    ]

    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
    )


def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Tokenize prompt/target pairs and mask the prompt part of the labels.

    Each source is concatenated with its target and tokenized; the sources are
    tokenized once more on their own to learn how many leading tokens belong
    to the prompt. Those leading label positions are set to IGNORE_INDEX.

    Args:
        sources: Prompt strings.
        targets: Completion strings, aligned with `sources`.
        tokenizer: Tokenizer forwarded to `_tokenize_fn`.

    Returns:
        A dict with `input_ids` (full sequences) and `labels` (a deep copy of
        them with the prompt prefix replaced by IGNORE_INDEX).
    """
    full_texts = [prompt + completion for prompt, completion in zip(sources, targets)]
    full_encoding = _tokenize_fn(full_texts, tokenizer)
    prompt_encoding = _tokenize_fn(sources, tokenizer)

    input_ids = full_encoding["input_ids"]
    # Deep copy so that masking the labels leaves the inputs untouched.
    labels = copy.deepcopy(input_ids)
    prompt_lengths = prompt_encoding["input_ids_lens"]
    for row, prompt_len in zip(labels, prompt_lengths):
        row[:prompt_len] = IGNORE_INDEX

    return dict(input_ids=input_ids, labels=labels)


@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning.

    Right-pads `input_ids` with the tokenizer's pad id and `labels` with
    IGNORE_INDEX, and returns a boolean attention mask that is True on every
    real (non-padding) position.
    """

    # Only `pad_token_id` is read from the tokenizer.
    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Build one padded batch.

        Args:
            instances: Examples, each with `input_ids` and `labels` given as
                1-D integer sequences or tensors.

        Returns:
            A dict with `input_ids`, `labels` and `attention_mask`, each of
            shape (batch, longest sequence in the batch).
        """
        # as_tensor avoids the copy (and warning) torch.tensor makes when the
        # item already is a tensor.
        input_ids = [torch.as_tensor(instance["input_ids"]) for instance in instances]
        labels = [torch.as_tensor(instance["labels"]) for instance in instances]
        lengths = [ids.shape[0] for ids in input_ids]

        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = torch.nn.utils.rnn.pad_sequence(
            labels, batch_first=True, padding_value=IGNORE_INDEX
        )

        # Build the mask from the true lengths instead of comparing against
        # pad_token_id: when the pad token is also a real token (e.g. pad ==
        # EOS) a value comparison would mask out genuine tokens as well.
        positions = torch.arange(input_ids.shape[1], device=input_ids.device)
        length_column = torch.tensor(lengths, device=input_ids.device).unsqueeze(1)
        attention_mask = positions.unsqueeze(0) < length_column

        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=attention_mask,
        )

In [4]:
def train():
    """Fine-tune a causal language model on instruction/output pairs.

    Parses the command line into `ModelArguments`, `DataArguments` and
    `TrainingArguments`, optionally initialises `torch.distributed`, loads the
    tokenizer and model, loads the JSON training data, tokenizes it with
    `preprocess`, runs the Hugging Face `Trainer`, then saves the trainer
    state and the model weights to `training_args.output_dir`.

    Note that `training_args.model_max_length` is not forwarded to the
    tokenizer here, so this function does not truncate sequences.
    """
    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments)
    )
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Initialize distributed training only if running in distributed mode
    if training_args.local_rank != -1 and os.getenv("RANK") is not None:
        if torch.distributed.is_available() and not torch.distributed.is_initialized():
            torch.distributed.init_process_group(backend="nccl", init_method="env://")

    if training_args.local_rank == 0:
        print("=" * 100)
        print(training_args)

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        padding_side="right",
        use_fast=True,
        trust_remote_code=True,
    )

    # Fall back to the EOS token when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def train_tokenize_function(examples, tokenizer):
        """Turn one batch of raw records into tokenized inputs and labels."""
        sources = [
            build_instruction_prompt(instruction)
            for instruction in examples[data_args.instruction_field]
        ]
        # Every target ends with EOS so the model learns where to stop.
        targets = [
            f"{output.strip()}{tokenizer.eos_token}"
            for output in examples[data_args.output_field]
        ]
        data_dict = preprocess(sources, targets, tokenizer)
        return data_dict

    print("PAD Token:", tokenizer.pad_token, tokenizer.pad_token_id)
    print("BOS Token", tokenizer.bos_token, tokenizer.bos_token_id)
    print("EOS Token", tokenizer.eos_token, tokenizer.eos_token_id)

    if training_args.local_rank == 0:
        print("Load tokenizer from {} over.".format(model_args.model_name_or_path))

    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path, torch_dtype=torch.bfloat16
    )

    if training_args.local_rank == 0:
        print("Load model from {} over.".format(model_args.model_name_or_path))

    raw_train_datasets = load_dataset(
        "json",
        data_files=data_args.data_path,
        split="train",
        cache_dir=training_args.cache_dir,
    )

    # Use a safe barrier: ranks other than local rank 0 wait here until rank 0
    # has finished the map below and reaches its own barrier call.
    if training_args.local_rank > 0 and torch.distributed.is_initialized():
        torch.distributed.barrier()

    train_dataset = raw_train_datasets.map(
        train_tokenize_function,
        batched=True,
        batch_size=3000,
        num_proc=32,
        remove_columns=raw_train_datasets.column_names,
        load_from_cache_file=True,
        desc="Running Encoding",
        fn_kwargs={"tokenizer": tokenizer},
    )

    # Local rank 0 releases the ranks that were held at the barrier above.
    if training_args.local_rank == 0 and torch.distributed.is_initialized():
        torch.distributed.barrier()

    if training_args.local_rank == 0:
        print("Training dataset samples:", len(train_dataset))
        # Preview at most 3 rows; random.sample raises ValueError when asked
        # for more items than the dataset contains.
        preview_count = min(3, len(train_dataset))
        for index in random.sample(range(len(train_dataset)), preview_count):
            print(
                f"Sample {index} of the training set: {train_dataset[index]['input_ids']}, {train_dataset[index]['labels']}."
            )
            print(
                f"Sample {index} of the training set: {tokenizer.decode(list(train_dataset[index]['input_ids']))}."
            )

    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    data_module = dict(
        train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator
    )

    trainer = Trainer(
        model=model, tokenizer=tokenizer, args=training_args, **data_module
    )

    trainer.train()
    trainer.save_state()
    safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)


In [5]:

# Checkpoint to fine-tune, training file, and directory for saved checkpoints.
critic_model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
data_path = "train.jsonl"
# NOTE(review): the directory name says 1.5B while the checkpoint above is the
# 7B model -- confirm which one is intended.
output_path = "Distill-Qwen-1.5B"

# Option name -> value, in the order they appear on the emulated command line.
_cli_options = {
    "model_name_or_path": critic_model,
    "data_path": data_path,
    "output_dir": output_path,
    "instruction_field": "natural_language_statement",
    "output_field": "formal_proof",
    "num_train_epochs": "1",
    "model_max_length": "1024",
    "per_device_train_batch_size": "4",
    "per_device_eval_batch_size": "1",
    "gradient_accumulation_steps": "8",
    "eval_strategy": "no",
    "save_strategy": "steps",
    "save_steps": "10",
    "save_total_limit": "5",
    "learning_rate": "1e-5",
    "warmup_steps": "0",
    "logging_steps": "1",
    "lr_scheduler_type": "cosine",
    "gradient_checkpointing": "True",
    "report_to": "wandb",
    "bf16": "True",
}

# train() parses its arguments from the command line, so fake one here:
# program name first, then "--option value" pairs.
sys.argv = ["notebook"] + [
    token
    for option, value in _cli_options.items()
    for token in ("--" + option, value)
]


In [6]:
# Start training when this cell/script is the entry point; train() takes its
# configuration from the command-line arguments rather than from parameters.
if __name__ == "__main__":
    train()

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
cache_dir=None,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=no,
eval_use_

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Load model from deepseek-ai/DeepSeek-Prover-V1.5-Base over.
Training dataset samples: 200
Sample 134 of the training set: [100000, 27, 91, 309, 62, 4789, 66325, 6713, 185, 2054, 418, 1551, 20881, 11, 4015, 457, 92632, 15895, 13, 1257, 418, 245, 9394, 20308, 1222, 327, 17209, 28489, 279, 98907, 276, 6650, 254, 4286, 3892, 4706, 12838, 17790, 91, 309, 62, 409, 66325, 185, 27, 91, 309, 62, 4789, 66325, 3631, 185, 549, 19407, 440, 29280, 17, 1, 481, 330, 23351, 881, 3892, 4706, 372, 4446, 25, 1273, 12837, 252, 40616, 317, 5929, 276, 12837, 244, 357, 654, 12837, 2644, 40616, 317, 5929, 276, 12837, 353, 357, 654, 285, 254, 31583, 12837, 375, 40616, 7432, 327, 12837, 252, 40616, 285, 12837, 2644, 357, 654, 937, 254, 31583, 12837, 375, 40616, 839, 7432, 327, 12837, 244, 40616, 285, 12837, 353, 357, 633, 27, 91, 309, 62, 409, 66325, 185, 27, 91, 309, 62, 4789, 66325, 81038, 185, 1898, 15255, 254, 24937, 3418, 881, 245, 26932, 98907, 5637, 11, 395, 543, 1181, 254, 5610, 28526, 279, 254, 24937, 5

  trainer = Trainer(


[2025-02-24 15:50:07,025] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -lcufile
collect2: error: ld returned 1 exit status
[34m[1mwandb[0m: Currently logged in as: [33mbalaji-vir1997[0m ([33mbalaji-vir1997-stevens-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 47.74 GiB of which 54.25 MiB is free. Including non-PyTorch memory, this process has 44.01 GiB memory in use. Of the allocated memory 39.42 GiB is allocated by PyTorch, and 3.12 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
!nvidia-smi