In [10]:
%pip -q install  "datasets>=2.20.0" "accelerate>=0.33.0" bitsandbytes sentencepiece

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [3]:
import os, torch, random
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

MODEL_ID = "HuggingFaceTB/SmolLM2-135M"
PROJECT = "smollm2-continued-pretrain"
seed=42; random.seed(seed); torch.manual_seed(seed)

<torch._C.Generator at 0x10ff42fb0>

In [4]:
samples = [
    "தமிழ் மொழியில் ஒரு சுருக்கமான உரை. இது மொழி ஏற்புக்காக மாதிரி வாக்கியம்.",
    "Another sentence in the target domain/language to help the model adapt.",
    "Short paragraphs are fine for a demo; use larger corpora for real gains.",
]
ds = Dataset.from_dict({"text": samples})

In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tok(ex): return tokenizer(ex["text"], truncation=True, padding="max_length", max_length=512)
tok_ds = ds.map(tok, batched=True, remove_columns=["text"])

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [1]:
# Disable HF progress bars to avoid traitlets/layout/contextvar errors
import os
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"  # keep the fast-downloader off on this kernel

from huggingface_hub.utils import disable_progress_bars
disable_progress_bars()

from huggingface_hub import snapshot_download

MODEL_ID  = "HuggingFaceTB/SmolLM2-135M"
CACHE_DIR = "./_hf_cache_colab4"

local_model_path = snapshot_download(
    repo_id=MODEL_ID,
    local_dir=CACHE_DIR,
    allow_patterns=["*.safetensors","*.bin","*.json","*.model","tokenizer*","*merges*"],
    resume_download=True,
    max_workers=8,
)
print("Downloaded to:", local_model_path)


Downloaded to: /Users/keerthana/Keerthana/workspace/unsloth/_hf_cache_colab4




In [6]:
model = AutoModelForCausalLM.from_pretrained(local_model_path, device_map="auto")
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [7]:
args = TrainingArguments(
    output_dir=f"./{PROJECT}",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    num_train_epochs=1,
    bf16=torch.cuda.is_available(),
    logging_steps=10,
    save_steps=100,
    report_to="none",
)

In [8]:
trainer = Trainer(model=model, args=args, train_dataset=tok_ds, data_collator=collator)
trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss


TrainOutput(global_step=1, training_loss=1.6758861541748047, metrics={'train_runtime': 4.8855, 'train_samples_per_second': 0.614, 'train_steps_per_second': 0.205, 'total_flos': 978771050496.0, 'train_loss': 1.6758861541748047, 'epoch': 1.0})

In [9]:
def gen(prompt):
    ids = tokenizer(prompt, return_tensors="pt").to(model.device)
    out = model.generate(**ids, max_new_tokens=80, do_sample=True, top_p=0.9, temperature=0.8)
    print(tokenizer.decode(out[0], skip_special_tokens=True))
gen("தமிழில் ஒரு சிறு விளக்கம் எழுதவும்:")

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


தமிழில் ஒரு சிறு விளக்கம் எழுதவும்:

- மொழி மிழ் மொழி மாக மொழி மாக மாக மாக மாக மாக
