In [None]:
!pip install flash-linear-attention

Collecting flash-linear-attention
  Downloading flash_linear_attention-0.4.1-py3-none-any.whl.metadata (39 kB)
Collecting fla-core==0.4.1 (from flash-linear-attention)
  Downloading fla_core-0.4.1-py3-none-any.whl.metadata (38 kB)
Downloading flash_linear_attention-0.4.1-py3-none-any.whl (287 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.4/287.4 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fla_core-0.4.1-py3-none-any.whl (437 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m437.3/437.3 kB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fla-core, flash-linear-attention
Successfully installed fla-core-0.4.1 flash-linear-attention-0.4.1


In [None]:
import os

CACHE_DIR_BASE = "/content/my_hf_cache"

os.environ["HF_HOME"] = CACHE_DIR_BASE  # General Hugging Face home

from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers
from transformers import PreTrainedTokenizerFast

def train_eca_tokenizer():
    """Train a byte-level BPE tokenizer on ECA-Zero and save it to ``./eca_tokenizer``.

    Loads the ``train`` split of ``kreasof-ai/ECA-Zero``, trains a BPE model
    (``vocab_size=256``, ``min_frequency=100``) on the first 20000 rows by
    default, wraps it in a ``PreTrainedTokenizerFast``, saves it, and prints
    a sample encoding so the ``<think>`` / ``</think>`` handling can be
    checked by eye.

    Returns:
        None. The tokenizer is written to disk as a side effect.
    """
    print("Loading dataset kreasof-ai/ECA-Zero...")
    dataset = load_dataset("kreasof-ai/ECA-Zero", split="train")

    def batch_iterator(batch_size=1000, limit=20000):
        """Yield lists of training strings covering at most ``limit`` rows.

        Each string is one row's ``input``, ``cot`` and ``target`` fields
        joined by newlines.
        """
        row_count = min(len(dataset), limit)
        for start in range(0, row_count, batch_size):
            # Clamp the slice end: otherwise the final batch reads past
            # `limit` whenever `limit` is not a multiple of `batch_size`.
            batch = dataset[start : min(start + batch_size, row_count)]
            # The <think> tags are not injected into the training text;
            # they are registered through `special_tokens` below, so BPE
            # only needs the raw text content here.
            yield [
                f"{inp}\n{cot}\n{tgt}"
                for inp, cot, tgt in zip(batch["input"], batch["cot"], batch["target"])
            ]

    unk_token = "<|unknown|>"
    # Give the BPE model itself an unknown token. Registering it only on the
    # HF wrapper leaves the model with no id to emit for characters that are
    # missing from the learned vocabulary.
    tokenizer = Tokenizer(models.BPE(unk_token=unk_token))
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    # Reasoning tags are trained as special tokens so they stay atomic.
    # NOTE(review): the order is kept as-is because the training cell's model
    # config uses bos_token_id=0 / eos_token_id=1, which presumably relies on
    # special tokens receiving ids in list order - confirm.
    special_tokens = [
        "<|startoftext|>",
        "<|endoftext|>",
        unk_token,
        "<|pad|>",
        "<think>",   # Start of reasoning
        "</think>"   # End of reasoning
    ]

    trainer = trainers.BpeTrainer(
        vocab_size=256,
        special_tokens=special_tokens,
        min_frequency=100
    )

    print("Training BPE with <think> tokens...")
    tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)

    fast_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<|pad|>",
        unk_token=unk_token,
        # Register the tags so HF also treats them as special tokens.
        additional_special_tokens=["<think>", "</think>"]
    )

    save_path = "./eca_tokenizer"
    fast_tokenizer.save_pretrained(save_path)
    print(f"Tokenizer saved to {save_path}. Vocab size: {len(fast_tokenizer)}")

    # Validation: the printed tokens should show '<think>' and '</think>' as
    # single tokens rather than split into pieces.
    test_str = "Task: ...\n<think>\nStep 1...\n</think>\nResult"
    ids = fast_tokenizer.encode(test_str)
    print(f"Test Encoding: {fast_tokenizer.convert_ids_to_tokens(ids)}")

if __name__ == "__main__":
    train_eca_tokenizer()

Loading dataset kreasof-ai/ECA-Zero...


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/106M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/106M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/333333 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3333 [00:00<?, ? examples/s]

Training BPE with <think> tokens...
Tokenizer saved to ./eca_tokenizer. Vocab size: 256
Test Encoding: ['Task', ':', 'Ġ', '..', '.', 'Ċ', '<think>', 'Ċ', 'Step', 'Ġ1', '..', '.', 'Ċ', '</think>', 'Ċ', 'R', 'es', 'ul', 't']


In [None]:
import os

CACHE_DIR_BASE = "/content/my_hf_cache"

os.environ["HF_HOME"] = CACHE_DIR_BASE  # General Hugging Face home

import sys
import torch
import wandb

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from fla.models import NSAConfig

from huggingface_hub import HfApi, create_repo, upload_folder

# --- Configuration ---
DATASET_ID = "kreasof-ai/ECA-Zero"
TOKENIZER_PATH = "./eca_tokenizer"
OUTPUT_DIR = "./eca_zero_baseline"

# --- Hugging Face credentials (read from env; never hardcode tokens here) ---
# Either variable name is accepted. When neither is set the token is falsy and
# the upload section at the end of this cell skips itself with a warning.
HF_TOKEN = os.environ.get("HUGGINGFACE_HUB_TOKEN") or os.environ.get("HF_TOKEN")
# Optional target repo id. When unset, the upload section derives a repo id
# from the account that owns the token.
HF_REPO_ID = os.environ.get("HF_REPO_ID")

# --- W&B initialization ---
# Best-effort: a login/init failure is reported and the cell keeps running.
try:
    wandb.login()
    # Open a run up front so the Trainer's logged events attach to it.
    wandb.init(
        project="ECA-Zero",
        name="nsa-baseline-1-epoch",
        reinit=True,
    )
except Exception as wandb_error:
    print(f"[Warning] wandb init/login failed: {wandb_error}. Continuing without wandb run object.")
    # NOTE(review): TrainingArguments below still sets report_to="wandb";
    # that may need adjusting if this branch is hit.

# --- Model config ---
# Keyword arguments are grouped by concern; the values are unchanged.
config = NSAConfig(
    # Vocabulary / embeddings. 256 is the vocab size the tokenizer cell trains.
    vocab_size=256,
    tie_word_embeddings=True,
    max_position_embeddings=2048,
    # NOTE(review): presumably the ids of <|startoftext|> / <|endoftext|>
    # in the saved tokenizer - confirm against ./eca_tokenizer.
    bos_token_id=0,
    eos_token_id=1,
    # Network width and depth.
    hidden_size=256,
    intermediate_size=768,
    num_hidden_layers=31,
    # Attention settings.
    num_heads=16,
    num_kv_heads=1,
    head_dim=32,
    block_size=64,
    block_counts=8,
    window_size=128,
    rope_theta=10000.0,
    # Normalization.
    norm_eps=1e-5,
)

# --- Tokenizer load ---
# The tokenizer must already have been saved to TOKENIZER_PATH; a missing
# directory surfaces as OSError and stops the cell with exit status 1.
try:
    tokenizer = PreTrainedTokenizerFast.from_pretrained(TOKENIZER_PATH)
    # Register the pad token if the saved tokenizer lacks one, then select it.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
    tokenizer.pad_token = "<|pad|>"
except OSError:
    print("Error: Tokenizer not found. Run train_tokenizer.py first.")
    sys.exit(1)

# --- Dataset load ---
print(f"Loading {DATASET_ID}...")
dataset = load_dataset(DATASET_ID, split="train")

def tokenize_function(examples):
    """Format a batch of rows as training text and tokenize it.

    Each row becomes
    ``<bos>{input}`` / ``<think>`` / ``{cot}`` / ``</think>`` / ``{target}<eos>``
    (slashes stand for newlines), using the module-level ``tokenizer``'s
    BOS and EOS token strings.

    Args:
        examples: Batched mapping with parallel ``"input"``, ``"cot"`` and
            ``"target"`` sequences.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the formatted
        texts when called with ``truncation=True``, ``max_length=2048`` and
        ``padding="max_length"``.
    """
    # BOS and EOS are written into the text explicitly.
    bos, eos = tokenizer.bos_token, tokenizer.eos_token
    rows = zip(examples["input"], examples["cot"], examples["target"])
    texts = [
        f"{bos}{inp}\n<think>\n{cot}\n</think>\n{tgt}{eos}"
        for inp, cot, tgt in rows
    ]

    # NOTE(review): with truncation enabled, rows longer than max_length
    # presumably lose the trailing EOS - confirm against the data's lengths.
    return tokenizer(texts, truncation=True, max_length=2048, padding="max_length")

# --- Tokenize the dataset ---
print("Tokenizing with <BOS> and <think> formatting...")
# Batched map over 4 worker processes; the original columns are dropped so
# only the tokenizer's output fields remain.
tokenized_dataset = dataset.map(
    tokenize_function,
    remove_columns=dataset.column_names,
    batched=True,
    num_proc=4,
)

# --- Build the model from the config ---
print("Initializing Model...")
model = AutoModelForCausalLM.from_config(config)
# Size the embedding table to the tokenizer that was actually loaded.
model.resize_token_embeddings(len(tokenizer))

print(f"Params: {sum(param.numel() for param in model.parameters()) / 1e6:.2f}M")

# --- Training arguments ---
# Keyword arguments are grouped by concern; the values are unchanged.
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Duration: one pass over the training split.
    num_train_epochs=1,
    # max_steps=1,
    # Batching: 16 x 4 = 64 sequences per optimizer step on each device.
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    dataloader_num_workers=4,
    # Optimization.
    learning_rate=3e-4,
    warmup_steps=1000,
    weight_decay=0.01,
    bf16=True,
    # Checkpointing: every 2000 steps, keeping at most 2 checkpoints.
    save_strategy="steps",
    save_steps=2000,
    save_total_limit=2,
    # Logging: every 50 steps, reported to W&B.
    logging_steps=50,
    report_to="wandb",
)

# --- Trainer ---
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset,
    # mlm=False: the collator is used without masked-LM masking.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

print("Starting Training...")
trainer.train()

# Save the final model and the tokenizer side by side in OUTPUT_DIR.
print("Saving locally...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# --- Upload to Hugging Face Hub ---
# Best-effort step: skipped when no token is configured, and any failure is
# printed rather than raised, so the artifacts saved locally stay usable.
if not HF_TOKEN:
    print(
        "\n[Warning] HUGGINGFACE_HUB_TOKEN (or HF_TOKEN) not found in environment.\n"
        "Skipping upload to Hugging Face Hub. To enable upload, set HUGGINGFACE_HUB_TOKEN env var."
    )
else:
    try:
        api = HfApi()
        # Resolve the account that owns the token; it is needed for the
        # fallback repo id below.
        user_info = api.whoami(token=HF_TOKEN)
        hf_user = user_info.get("name") or user_info.get("username")
        if not hf_user:
            raise RuntimeError("Unable to determine HF username via token.")

        # Use the explicit repo id when given, otherwise a per-user default.
        if HF_REPO_ID:
            repo_id = HF_REPO_ID
        else:
            repo_id = f"{hf_user}/transformers-baseline"

        print(f"Creating or ensuring repo exists: {repo_id} ...")
        # exist_ok=True makes this a no-op when the repo already exists.
        create_repo(repo_id=repo_id, token=HF_TOKEN, exist_ok=True, private=False)

        print(f"Uploading {OUTPUT_DIR} to HF repo {repo_id} ...")
        # path_in_repo="" places the files at the repo root.
        # The Trainer's intermediate checkpoint-* directories also live in
        # OUTPUT_DIR and hold optimizer/scheduler/RNG state; exclude them so
        # only the final model and tokenizer files are published.
        upload_folder(
            folder_path=OUTPUT_DIR,
            path_in_repo="",
            repo_id=repo_id,
            token=HF_TOKEN,
            ignore_patterns=["checkpoint-*/*"],
        )

        print(f"Upload complete. Model & tokenizer uploaded to Hugging Face repo: {repo_id}")
    except Exception as e:
        print(f"[Error] Failed to upload to Hugging Face Hub: {e}")

# Finish the wandb run. This stays best-effort (a failure must not mask a
# completed training run), but the failure is reported instead of being
# silently discarded.
try:
    if "wandb" in sys.modules:
        wandb.finish()
except Exception as finish_error:
    print(f"[Warning] wandb.finish() failed: {finish_error}")

print("Done.")

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

[34m[1mwandb[0m: Currently logged in as: [33makbar2habibullah[0m ([33makbar2habibullah-kreasof-ai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Loading kreasof-ai/ECA-Zero...
Tokenizing with <BOS> and <think> formatting...


Map (num_proc=4):   0%|          | 0/333333 [00:00<?, ? examples/s]

Initializing Model...
Params: 27.38M
Starting Training...


Step,Training Loss
50,20.1074
100,17.7387
150,16.5072
200,14.1296
250,11.2997
300,8.863
350,6.7225
400,4.8925
450,3.3676
500,2.6096


Saving locally...
Creating or ensuring repo exists: ChavyvAkvar/nsa-baseline-1-epoch ...
Uploading ./eca_zero_baseline to HF repo ChavyvAkvar/nsa-baseline-1-epoch ...


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...kpoint-4000/rng_state.pth: 100%|##########| 14.6kB / 14.6kB            

  ...ckpoint-5209/scheduler.pt: 100%|##########| 1.47kB / 1.47kB            

  ...ckpoint-4000/scheduler.pt: 100%|##########| 1.47kB / 1.47kB            

  ...kpoint-5209/rng_state.pth: 100%|##########| 14.6kB / 14.6kB            

  ...ckpoint-4000/optimizer.pt:   0%|          |  579kB /  219MB            

  ...nt-4000/model.safetensors:   1%|          |  552kB /  110MB            

  ...nt-5209/model.safetensors:   1%|          |  553kB /  110MB            

  ...aseline/model.safetensors:   1%|          |  553kB /  110MB            

  ...ckpoint-5209/optimizer.pt:   0%|          |  575kB /  219MB            

  ...nt-4000/training_args.bin:   1%|1         |  75.0B / 5.84kB            

Upload complete. Model & tokenizer uploaded to Hugging Face repo: ChavyvAkvar/nsa-baseline-1-epoch


0,1
train/epoch,▁▁▁▁▁▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇█████
train/grad_norm,█▄▂▂▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▁▂▃▄▄▆▆▆▇▇███▇▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▃▃▃▃▂▂▁▁▁
train/loss,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
total_flos,1.1188609304179507e+17
train/epoch,1.0
train/global_step,5209.0
train/grad_norm,0.12922
train/learning_rate,0.0
train/loss,0.1002
train_loss,1.3124
train_runtime,21447.5195
train_samples_per_second,15.542
train_steps_per_second,0.243


Done.
