# Inference Model

In [None]:
# Install the model stack used by this notebook: transformers, peft and accelerate are pinned;
# datasets and bitsandbytes only have lower bounds.
# NOTE(review): "trl" carries no version constraint, and the captured pip log shows the resolver
# backtracking through several trl releases — consider pinning the version that resolves.
!pip install -U \
  "transformers==4.52.4" \
  "trl" \
  "peft==0.15.2" \
  "accelerate==1.8.1" \
  "datasets>=2.21.0" \
  "bitsandbytes>=0.43.2"

Collecting transformers==4.52.4
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Collecting peft==0.15.2
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting accelerate==1.8.1
  Downloading accelerate-1.8.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes>=0.43.2
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers==4.52.4)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
INFO: pip is looking at multiple versions of trl to determine which version is compatible with other requirements. This could take a while.
Collecting trl
  Downloading trl-0.22.2-py3-none-any.whl.metadata (11 kB)
  Downloading trl-0.22.1-py3-none-any.whl.metadata (11 kB)
  Downloading trl-0.22.0-py3-none-any.whl.metadata (11 kB)
  Downloading trl-0.21.0-py3-

In [None]:
import torch, os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers.utils import is_bitsandbytes_available
from peft import PeftModel

# Base checkpoint on the Hub and the LoRA adapter directory to attach to it.
BASE_ID     = "scb10x/llama3.2-typhoon2-1b-instruct"
ADAPTER_DIR = "/content/drive/MyDrive/OutputLLM/llama3.2-typhoon2-1b-instruct/adapter_safety/final"

# Use bfloat16 only when a CUDA device reports support for it; otherwise float16.
USE_BF16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
DTYPE = torch.float16 if not USE_BF16 else torch.bfloat16

tokenizer = AutoTokenizer.from_pretrained(BASE_ID, use_fast=True, trust_remote_code=True)

# 4-bit NF4 quantization with double quantization when bitsandbytes is usable; None otherwise.
quant = (
    BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=DTYPE,
    )
    if is_bitsandbytes_available()
    else None
)

base = AutoModelForCausalLM.from_pretrained(
    BASE_ID,
    quantization_config=quant,
    attn_implementation="sdpa",
    trust_remote_code=True,
    torch_dtype=DTYPE,
    device_map="auto",
)

# Wrap the base model with the adapter weights and switch to inference mode.
model = PeftModel.from_pretrained(base, ADAPTER_DIR)
model.eval()
model.config.use_cache = True


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/302 [00:00<?, ?B/s]

In [None]:
# System prompt (Thai) sent as the "system" turn by answer_question_only.
# The adjacent string literals below are joined by implicit concatenation.
# NOTE(review): the third and fourth fragments end without a trailing space, so they run directly
# into the next sentence; the copy of this prompt in the "No Finetune Inference" cell ends them with
# a space. Confirm which exact text the adapter was trained with before changing either copy.
SYSTEM_MSG = (
    "คุณคือผู้ช่วย AI ของกองทุนเงินให้กู้ยืมเพื่อการศึกษา (กยศ.) "
    "ตอบคำถามด้วยความสุภาพ กระชับ และอ้างอิงจากข้อมูลหรือบริบทที่มีอยู่เท่านั้น "
    "พร้อมอธิบายเหตุผลประกอบว่าทำไมถึงตอบเช่นนั้น"
    "ตอบให้ตรงกับคำถาม แต่หากข้อมูลไม่เพียงพอ ให้แจ้งผู้ใช้ตามตรงว่าไม่สามารถให้คำตอบได้"
    "อธิบายคำตอบที่เกี่ยวข้องกับคำถามแบบละเอียดที่ถูกต้อง"
)

def answer_question_only(question: str, max_new_tokens=2048, temperature=0.2, top_p=0.95):
    """
    Answer a single question with the globally loaded `model`/`tokenizer`.

    Only the question is used; no retrieval context is supplied.

    Args:
        question: The user's question text.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. Values <= 0 select greedy decoding.
        top_p: Nucleus-sampling threshold; only forwarded when sampling.

    Returns:
        The generated answer as a stripped string. The prompt (system and
        user turns) is not included in the returned text.
    """
    used_chat_template = True
    try:
        messages = [
            {"role": "system", "content": SYSTEM_MSG},
            {"role": "user",   "content": question}
        ]
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    except Exception:
        # Best-effort fallback for tokenizers without a usable chat template.
        used_chat_template = False
        prompt = f"<|system|>\n{SYSTEM_MSG}\n<|user|>\n{question}\n<|assistant|>\n"

    # A rendered chat template already carries its own special tokens, so adding
    # them again at tokenization time would duplicate the BOS token. The plain
    # fallback prompt has none, so it still needs them.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=not used_chat_template,
    ).to(model.device)

    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        repetition_penalty=1.05,
        pad_token_id=tokenizer.eos_token_id,
    )
    if temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Greedy decoding: do not forward sampling-only knobs (a zero temperature
        # is not a valid sampling setting).
        gen_kwargs["do_sample"] = False

    with torch.no_grad():
        out = model.generate(**inputs, **gen_kwargs)

    # Decode only the newly generated tokens; out[0] starts with the prompt tokens.
    prompt_len = inputs["input_ids"].shape[-1]
    text = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
    if "<|assistant|>" in text:
        text = text.split("<|assistant|>")[-1].strip()
    return text

In [None]:
from tqdm import tqdm
# Register tqdm's `progress_apply` on pandas objects, then generate one answer per question
# with a progress bar. The function is passed directly; it is called with each value in turn.
tqdm.pandas()
df['ModelGen'] = df['question_text'].progress_apply(answer_question_only)

100%|██████████| 19/19 [01:18<00:00,  4.14s/it]


In [None]:
# Keep only the text after the last blank line ("\n\n") of each generated answer;
# non-string cells are passed through untouched.
# This is a whole-column operation and is idempotent after one pass, so it is applied
# once rather than once per row.
df['ModelGen'] = df['ModelGen'].apply(lambda x: x.split("\n\n")[-1] if isinstance(x, str) else x)

In [None]:
# Spot-check a single row: print the question, the stored 'Answer' column and the freshly
# generated 'ModelGen' text side by side.
# NOTE(review): the 'Question' and 'Answer' columns are not created in any visible cell —
# presumably `df` was loaded elsewhere; confirm before relying on Restart & Run All.
n = 5
print(f"Question = {df['Question'][n]}")
print(f"FineTuning 3B = {df['Answer'][n]}")
print(f"FineTuning 1B = {df['ModelGen'][n]}")


Question = ถ้าไม่มีรายชื่อสัมภาษณ์จะได้ กยศ อยู่ไหมคะ
FineTuning 3B = ไม่เป็นผู้ที่อยู่ระหว่างการถูกดำเนินคดีในศาลฎีกาแผนกคดีทรัพย์สินทางปัญญาและการค้าระหว่างประเทศ\n35. ไม่เป็นผู้ที่อยู่ระหว่างการถูกดำเนินคดีในศาลฎีกาแผนกคดีภาษีอากร\n36. ไม่เป็นผู้ที่อยู่ระหว่างการถูกดำเนินคดีในศาลฎีกาแผนกคดีแรงงาน\n37. ไม่เป็นผู้ที่อยู่ระหว่างการถูกดำเนินคดีในศาลฎีกาแผนกคดีทรัพย์สินทางปัญญาและการค้าระหว่างประเทศ\n38. ไม่เป็นผู้ที่อยู่ระหว่างการถูกดำเนินคดีในศาลฎีกาแผนกคดีภาษีอากร\n39. ไม่เป็นผู้ที่อยู่ระหว่างการถูกดำเนินคดีในศาลฎีกาแผนกคดีแรงงาน\n40. ไม่เป็นผู้ที่อยู่ระหว่างการถูกดำเนินคดีในศาลฎีกาแผนกคดีทรัพย์สินทางปัญญาและการค้าระหว่างประเทศ\n41. ไม่เป็นผู้ที่อยู่ระหว่างการถูกดำเนินคดีในศาลฎีกาแผนกคดีภาษีอากร\n42. ไม่เป็นผู้ที่อยู่ระหว่างการถูกดำเนินคดีในศาลฎีกาแผนกคดีแรงงาน\n43. ไม่เป็นผู้ที่อยู่ระหว่างการถูกดำเนินคดีในศาลฎีกาแผนกคดีทรัพย์สินทางปัญญาและการค้าระหว่างประเทศ\n44. ไม่เป็นผู้ที่อยู่ระหว่างการถูกดำเนินคดีในศาลฎีกาแผนกคดีภาษีอากร\n45. ไม่เป็นผู้ที่อยู่ระหว่างการถูกดำเนินคดีในศาลฎีกาแผนกค

## No Finetune Inference

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the base (not fine-tuned) checkpoint for a baseline comparison.
# NOTE(review): this cell rebinds the module-level `tokenizer`, `model` and `SYSTEM_MSG`, so
# answer_question_only() will use this base model if it is called after this cell runs.
model_id = "scb10x/llama3.2-typhoon2-t1-3b-research-preview"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

SYSTEM_MSG = (
    "คุณคือผู้ช่วย AI ของกองทุนเงินให้กู้ยืมเพื่อการศึกษา (กยศ.) "
    "ตอบคำถามด้วยความสุภาพ กระชับ และอ้างอิงจากข้อมูลหรือบริบทที่มีอยู่เท่านั้น "
    "พร้อมอธิบายเหตุผลประกอบว่าทำไมถึงตอบเช่นนั้น "
    "ตอบให้ตรงกับคำถาม แต่หากข้อมูลไม่เพียงพอ ให้แจ้งผู้ใช้ตามตรงว่าไม่สามารถให้คำตอบได้ "
    "อธิบายคำตอบที่เกี่ยวข้องกับคำถามแบบละเอียดที่ถูกต้อง"
)

# Stop tokens do not depend on the question, so they are resolved once outside the loop.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

responses = []

for q in df['question_text']:
    messages = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": q},
    ]

    # Render the chat template straight to token ids on the model's device.
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        input_ids,
        max_new_tokens=2000,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.2,
        top_p=0.95,
    )

    # Drop the prompt tokens and decode only the completion.
    response = outputs[0][input_ids.shape[-1]:]
    decoded = tokenizer.decode(response, skip_special_tokens=True)
    responses.append(decoded)

# NOTE(review): the column is named 'Model1B' although `model_id` above is a 3b checkpoint — confirm.
df['Model1B'] = responses


# FineTune LLM

In [None]:
# Install the fine-tuning stack (same package list as the install cell at the top of the notebook).
# NOTE(review): "trl" is unpinned here as well; the captured log shows pip backtracking across
# trl releases — consider pinning the resolved version.
!pip install -U \
  "transformers==4.52.4" \
  "trl" \
  "peft==0.15.2" \
  "accelerate==1.8.1" \
  "datasets>=2.21.0" \
  "bitsandbytes>=0.43.2"

Collecting transformers==4.52.4
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting trl
  Downloading trl-0.23.0-py3-none-any.whl.metadata (11 kB)
Collecting peft==0.15.2
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting accelerate==1.8.1
  Downloading accelerate-1.8.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes>=0.43.2
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers==4.52.4)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
INFO: pip is looking at multiple versions of trl to determine which version is compatible with other requirements. This could take a while.
Collecting trl
  Downloading trl-0.22.2-py3-none-any.whl.metadata (11 kB)
  Downloading trl-0.22.1-py3-none-any.whl.metadata (11 kB)
  Downloading trl-0.22.0-py3-none-any.whl.metadata (11 kB)
  Downloading trl-0.21.0-py3-

In [None]:
pip install deepspeed


Collecting deepspeed
  Downloading deepspeed-0.17.5.tar.gz (1.6 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hjson (from deepspeed)
  Downloading hjson-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting ninja (from deepspeed)
  Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)
Downloading hjson-3.1.0-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.0/54.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (180 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for c

In [None]:
# Install text-generation metric packages (ROUGE, NLTK, sacreBLEU).
# NOTE(review): no versions are pinned and none of these packages is imported in the visible cells.
!pip install rouge_score nltk sacrebleu


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score

In [None]:
import os, random, math, json, time, glob
import numpy as np
import pandas as pd
import torch

from datasets import Dataset, DatasetDict, load_from_disk
from sklearn.model_selection import train_test_split

from transformers import (
    AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
)
from transformers.utils import is_bitsandbytes_available
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel
from trl import SFTTrainer, SFTConfig
from transformers.trainer_callback import EarlyStoppingCallback, TrainerCallback

# --- Model, data and output locations -------------------------------------------------------
MODEL_ID   = "scb10x/llama3.2-typhoon2-t1-3b-research-preview"
CSV_PATH   = "/content/DataForFinetune.csv"
OUTPUT_DIR = "/content/drive/MyDrive/OutputLLM/llama3.2-typhoon2-t1-3b-research-preview"

# --- Core training hyperparameters ----------------------------------------------------------
MAX_SEQ_LEN = 4000         # tokenizer truncation length; also handed to SFTConfig
BATCH_SIZE = 2             # per-device train batch size
GRAD_ACCUM = 16            # gradient accumulation steps
EPOCHS = 2                 # number of training epochs
LORA_TARGET = ["q_proj","k_proj","v_proj","o_proj"]  # modules that receive LoRA adapters
LORA_R = 8
LORA_ALPHA = 16
GC_FREE_GB_TH = 6.0        # free-GPU-memory threshold (GiB) read by the gradient-checkpointing toggle

# Sub-directories under OUTPUT_DIR; both are created eagerly.
CKPT_DIR = os.path.join(OUTPUT_DIR, "checkpoints_hf")    
ADAPTER_SAFETY_DIR = os.path.join(OUTPUT_DIR, "adapter_safety")
os.makedirs(CKPT_DIR, exist_ok=True)
os.makedirs(ADAPTER_SAFETY_DIR, exist_ok=True)

# On-disk cache for the tokenized dataset.
TOK_CACHE_DIR = os.path.join(OUTPUT_DIR, "tokenized_ds")

# --- Optimisation settings -------------------------------------------------------------------
SEED = 42
LR = 1e-4
WARMUP_RATIO = 0.03
WEIGHT_DECAY = 0.0

# --- Logging / evaluation / checkpoint cadence (in optimizer steps) --------------------------
LOG_EVERY  = 25
EVAL_STRATEGY = "steps"      
SAVE_STRATEGY = "steps" 
EVAL_EVERY = 25    
SAVE_EVERY = 200          

MAX_GRAD_NORM = 1.0
USE_EARLY_STOPPING = False   # when True, an EarlyStoppingCallback with PATIENCE is attached
PATIENCE = 4

# Wall-clock budget and adapter snapshot interval consumed by TimeLimitCallback.
TIME_LIMIT_HOURS = 200
TIME_LIMIT_SAVE_EVERY_STEPS = 200  

# --- Runtime / backend setup -----------------------------------------------------------------
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  
torch.backends.cuda.matmul.allow_tf32 = True
try:
    torch.set_float32_matmul_precision("high")
except Exception:
    # Best-effort: the setting is skipped if this call raises.
    pass

# Seed every RNG in use and request deterministic cuDNN behaviour.
random.seed(SEED); np.random.seed(SEED)
torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# bfloat16 when a CUDA device supports it, float16 otherwise.
USE_BF16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
DTYPE = torch.bfloat16 if USE_BF16 else torch.float16

# DeepSpeed is switched on whenever the package can be imported.
# NOTE(review): this makes the training path depend on what happens to be installed — confirm
# that is intended rather than an explicit opt-in flag.
USE_DEEPSPEED = False
try:
    import deepspeed  
    USE_DEEPSPEED = True
except Exception:
    USE_DEEPSPEED = False

# Load the fine-tuning CSV and validate that the three required columns are present.
df = pd.read_csv(CSV_PATH)
need = {"cleaned_text","question_text","answer_text"}
missing = need - set(df.columns)
assert not missing, f"Missing columns: {missing}"

# Select columns from an ordered list (not the set) so the column order is the same on every run,
# then drop rows with any missing value.
required_cols = ["cleaned_text", "question_text", "answer_text"]
df = df[required_cols].dropna().reset_index(drop=True)

# Remove exact duplicate (context, question, answer) triples, keeping the first occurrence.
# Comparing the columns directly avoids building a "||"-joined key, which could collide when
# the text itself contains the delimiter.
df = df.drop_duplicates(subset=required_cols).reset_index(drop=True)

# System prompt (Thai) placed in the "system" turn of every training example.
# The adjacent string literals are joined by implicit concatenation.
# NOTE(review): the third and fourth fragments have no trailing space, and the last fragment is
# worded differently from the system prompts used by the inference cells in this notebook.
# Training and inference prompts presumably should match — confirm which wording is canonical.
SYSTEM_MSG = (
    "คุณคือผู้ช่วย AI ของกองทุนเงินให้กู้ยืมเพื่อการศึกษา (กยศ.) "
    "ตอบคำถามด้วยความสุภาพ กระชับ และอ้างอิงจากข้อมูลหรือบริบทที่มีอยู่เท่านั้น "
    "พร้อมอธิบายเหตุผลประกอบว่าทำไมถึงตอบเช่นนั้น"
    "ตอบให้ตรงกับคำถาม แต่หากข้อมูลไม่เพียงพอ ให้แจ้งผู้ใช้ตามตรงว่าไม่สามารถให้คำตอบได้"
    "อธิบายคำตอบกี่เกี่ยวข้องกับคำถามแบบละเอียด"
)


def row_to_messages(row):
    """Build the system/user/assistant message list for one dataset row.

    Reads `cleaned_text`, `question_text` and `answer_text` from `row`, coerces
    each to `str` (the answer is additionally stripped), and embeds the context
    and question in a fixed Thai instruction template for the user turn.

    Returns:
        A list of three `{"role", "content"}` dicts in the order
        system, user, assistant.
    """
    answer = str(row["answer_text"]).strip()
    user_parts = [
        "ต่อไปนี้คือบริบท (context):\n",
        str(row["cleaned_text"]),
        "\n\n",
        "คำถาม:\n",
        str(row["question_text"]),
        "\n\n",
        "โปรดตอบเป็นภาษาไทยและยึดตามบริบทด้านบนเท่านั้น",
    ]
    contents = (SYSTEM_MSG, "".join(user_parts), answer)
    roles = ("system", "user", "assistant")
    return [{"role": role, "content": content} for role, content in zip(roles, contents)]

def render_example(row):
    """Flatten one row into a single training string.

    The three turns from `row_to_messages` are written after literal
    `<|system|>`, `<|user|>` and `<|assistant|>` marker lines; nothing follows
    the assistant content.

    NOTE(review): the inference cells in this notebook build prompts with
    `tokenizer.apply_chat_template`, not these literal markers — confirm the
    training and inference prompt formats are meant to differ.
    """
    system_turn, user_turn, assistant_turn = row_to_messages(row)
    pieces = [
        "<|system|>\n", system_turn["content"], "\n",
        "<|user|>\n", user_turn["content"], "\n",
        "<|assistant|>\n", assistant_turn["content"],
    ]
    return "".join(pieces)

# Render every row into its single training string.
df["text"] = df.apply(render_example, axis=1)

# Shuffled 90/10 train/validation split over the rendered text column only, seeded with SEED.
train_df, val_df = train_test_split(
    df[["text"]],
    shuffle=True,
    random_state=SEED,
    test_size=0.1,
)

# Convert each split to a Hugging Face Dataset with a fresh 0..n-1 index.
train_ds_raw, val_ds_raw = (
    Dataset.from_pandas(split_df.reset_index(drop=True))
    for split_df in (train_df, val_df)
)
raw_data = DatasetDict({"train": train_ds_raw, "validation": val_ds_raw})

# Load the tokenizer for MODEL_ID. If it defines no pad token, reuse the EOS token for padding.
# Padding is applied on the right.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

def tokenize_batch(ex):
    """Tokenize a batch of rendered examples for causal-LM fine-tuning.

    Args:
        ex: A batch mapping whose "text" entry is a list of strings (the function
            is mapped with `batched=True`).

    Returns:
        The tokenizer output (`input_ids`, `attention_mask`) plus `labels`, a copy
        of `input_ids`. Sequences are truncated to MAX_SEQ_LEN and left unpadded.

    An EOS token is appended to every sequence that was not truncated, so the
    labels contain an explicit end-of-answer target. Sequences that already fill
    MAX_SEQ_LEN are left as-is, because their answer text was cut off.

    NOTE(review): if the data collator masks labels equal to the pad token id and
    the pad token is the EOS token, the appended EOS will not contribute to the
    loss — confirm against the installed trl/transformers collator.
    NOTE: a tokenized cache saved under TOK_CACHE_DIR before this change must be
    deleted for the new tokenization to take effect.
    """
    out = tokenizer(
        ex["text"],
        truncation=True,
        max_length=MAX_SEQ_LEN,
        padding=False,
        return_attention_mask=True
    )
    eos_id = tokenizer.eos_token_id
    if eos_id is not None:
        for ids, mask in zip(out["input_ids"], out["attention_mask"]):
            has_room = len(ids) < MAX_SEQ_LEN
            already_terminated = bool(ids) and ids[-1] == eos_id
            if has_room and not already_terminated:
                ids.append(eos_id)
                mask.append(1)
    out["labels"] = [list(ids) for ids in out["input_ids"]]
    return out

# Reuse the tokenized dataset saved under TOK_CACHE_DIR when that path exists; otherwise
# tokenize both splits, keep only the model-input columns and save the result there.
if os.path.exists(TOK_CACHE_DIR):
    data_tok = load_from_disk(TOK_CACHE_DIR)
else:
    keep_cols = ["input_ids","attention_mask","labels"]
    data_tok = raw_data.map(tokenize_batch, batched=True, num_proc=4, desc="Tokenizing")
    data_tok = DatasetDict({
        split_name: data_tok[split_name].remove_columns(
            [col for col in data_tok[split_name].column_names if col not in keep_cols]
        )
        for split_name in ("train", "validation")
    })
    data_tok.save_to_disk(TOK_CACHE_DIR)

# 4-bit NF4 quantization config when bitsandbytes is usable; None otherwise.
bnb_config = (
    BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=DTYPE,
    )
    if is_bitsandbytes_available()
    else None
)

# Try each attention implementation in order and keep the first model that loads.
# If every attempt raises, surface the last error.
attn_impl_candidates = ["sdpa", "eager"]  
last_err = None
for attn_impl in attn_impl_candidates:
    try:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            quantization_config=bnb_config,
            attn_implementation=attn_impl,
            trust_remote_code=True,
            device_map="auto",
            torch_dtype=DTYPE,
        )
    except Exception as e:
        last_err = e
        continue
    break
else:
    raise RuntimeError(f"Failed to load model. Last error: {last_err}")

def get_free_total_gb(device=0):
    """Return `(free, total)` memory of a CUDA device, each divided by 1024**3.

    Args:
        device: Device identifier forwarded to `torch.cuda.mem_get_info`.

    Returns:
        A 2-tuple of floats: free GiB and total GiB, in that order.
    """
    bytes_per_gib = 1024**3
    return tuple(n_bytes / bytes_per_gib for n_bytes in torch.cuda.mem_get_info(device))

# Decide whether to trade compute for memory with gradient checkpointing.
# With AUTO_TOGGLE_GC on and a CUDA device present, checkpointing is enabled only when the
# free GPU memory (measured here, after the model has been loaded) is below GC_FREE_GB_TH.
# The flag previously stayed False on every path, so the auto-toggle had no effect.
AUTO_TOGGLE_GC = True
USE_GRADIENT_CHECKPOINTING = False  
if torch.cuda.is_available() and AUTO_TOGGLE_GC:
    free_gb, total_gb = get_free_total_gb(0)
    USE_GRADIENT_CHECKPOINTING = free_gb < GC_FREE_GB_TH

# The KV cache is not used during training.
model.config.use_cache = False
if USE_GRADIENT_CHECKPOINTING:
    if hasattr(model, "gradient_checkpointing_enable"):
        model.gradient_checkpointing_enable()
else:
    if hasattr(model, "gradient_checkpointing_disable"):
        model.gradient_checkpointing_disable()

# NOTE(review): prepare_model_for_kbit_training() is later called with default arguments; per
# the PEFT documentation its `use_gradient_checkpointing` parameter defaults to True, which may
# re-enable checkpointing regardless of this flag — confirm and pass the flag through if so.

# When the model was loaded with a quantization config, run PEFT's k-bit training preparation on it.
if bnb_config is not None:
    model = prepare_model_for_kbit_training(model)

# LoRA adapter configuration for causal-LM fine-tuning: rank LORA_R, scaling LORA_ALPHA,
# 5% dropout, no bias terms trained, adapters attached to the modules named in LORA_TARGET.
peft_cfg = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=LORA_TARGET,  
)

# DeepSpeed ZeRO stage-2 configuration, built only when USE_DEEPSPEED is set; None otherwise.
# Exactly one of bf16/fp16 is enabled, mirroring USE_BF16.
ds_config = (
    {
        "train_batch_size": BATCH_SIZE * GRAD_ACCUM,
        "gradient_accumulation_steps": GRAD_ACCUM,
        "zero_optimization": {
            "stage": 2,
            "overlap_comm": True,
            "contiguous_gradients": True,
        },
        "bf16": {"enabled": bool(USE_BF16)},
        "fp16": {"enabled": not USE_BF16},
        "gradient_clipping": MAX_GRAD_NORM,
    }
    if USE_DEEPSPEED
    else None
)

class TimeLimitCallback(TrainerCallback):
    """Periodically snapshot the adapter and stop training after a wall-clock budget.

    Args:
        max_hours: Training time budget in hours, measured from `on_train_begin`.
        adapter_dir: Directory that receives the `step-<n>` snapshot folders.
        save_every_steps: Snapshot interval in optimizer steps; a falsy value
            disables periodic snapshots.

    The tokenizer is looked up under the `processing_class` callback kwarg first
    and the older `tokenizer` kwarg second. A missing tokenizer no longer
    disables the callback: the model is still saved and the time limit is still
    enforced.
    """

    def __init__(self, max_hours=11.75, adapter_dir=ADAPTER_SAFETY_DIR, save_every_steps=TIME_LIMIT_SAVE_EVERY_STEPS):
        self.max_secs = max_hours * 3600
        self.adapter_dir = adapter_dir
        self.save_every_steps = save_every_steps
        # Fallback start time in case on_train_begin is never delivered to this callback.
        self.t0 = time.time()
        os.makedirs(adapter_dir, exist_ok=True)

    def on_train_begin(self, args, state, control, **kwargs):
        """Start the wall-clock timer."""
        self.t0 = time.time()

    def _save_adapter(self, model, tokenizer, tag):
        """Save `model` (and `tokenizer`, when available) under `<adapter_dir>/<tag>`."""
        path = os.path.join(self.adapter_dir, tag)
        os.makedirs(path, exist_ok=True)
        model.save_pretrained(path, safe_serialization=True)
        if tokenizer is not None:
            tokenizer.save_pretrained(path)

    def on_step_end(self, args, state, control, **kwargs):
        """Write periodic snapshots and request a stop once the time budget is spent."""
        model = kwargs.get("model", None)
        tokenizer = kwargs.get("processing_class", None)
        if tokenizer is None:
            tokenizer = kwargs.get("tokenizer", None)

        out_of_time = time.time() - self.t0 >= self.max_secs

        if model is not None:
            step = state.global_step
            if self.save_every_steps and step and step % self.save_every_steps == 0:
                self._save_adapter(model, tokenizer, f"step-{step}")
            if out_of_time:
                self._save_adapter(model, tokenizer, f"step-{step}-final")

        # The stop request must not depend on whether anything could be saved.
        if out_of_time:
            control.should_training_stop = True
        return control

def latest_hf_checkpoint_dir(base=CKPT_DIR):
    """Return the `checkpoint-<step>` directory under `base` with the highest step, or None.

    Checkpoints are ordered by the integer step in their name rather than by
    modification time, which can be unreliable on synced or remounted storage.
    Entries that are not directories, or whose suffix is not an integer, are
    ignored.

    Args:
        base: Directory to search (non-recursively).

    Returns:
        The path of the newest checkpoint directory, or None when there is none.
    """
    best_step, best_path = -1, None
    for path in glob.glob(os.path.join(base, "checkpoint-*")):
        suffix = os.path.basename(path).rsplit("-", 1)[-1]
        if not (suffix.isdigit() and os.path.isdir(path)):
            continue
        step = int(suffix)
        if step > best_step:
            best_step, best_path = step, path
    return best_path

# Optimizer selection: fused AdamW under DeepSpeed, otherwise 8-bit paged AdamW when
# bitsandbytes is available, falling back to plain torch AdamW.
optim_choice = "adamw_torch_fused" if USE_DEEPSPEED else ("paged_adamw_8bit" if is_bitsandbytes_available() else "adamw_torch")

# Trainer configuration. The evaluation/save strategy and the evaluation interval come from the
# config constants (EVAL_STRATEGY, SAVE_STRATEGY, EVAL_EVERY) instead of repeated literals, so
# the early-stopping check on EVAL_STRATEGY and the real schedule cannot drift apart.
args = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    lr_scheduler_type="cosine",
    logging_steps=LOG_EVERY,

    eval_strategy=EVAL_STRATEGY,
    eval_steps=EVAL_EVERY,
    save_strategy=SAVE_STRATEGY,
    save_steps=SAVE_EVERY,
    save_total_limit=2,
    load_best_model_at_end=False,

    # Exactly one of bf16/fp16 is active, mirroring USE_BF16.
    bf16=USE_BF16,
    fp16=(not USE_BF16),
    gradient_checkpointing=USE_GRADIENT_CHECKPOINTING,
    max_seq_length=MAX_SEQ_LEN,

    # The dataset is already tokenized, so no text field is named and packing is off.
    dataset_text_field=None,
    packing=False,

    report_to="none",
    max_grad_norm=MAX_GRAD_NORM,
    ddp_find_unused_parameters=False,
    deepspeed=ds_config,
    optim=optim_choice,
    save_safetensors=True,
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
)

# Build the SFT trainer on the pre-tokenized splits; the LoRA config is applied by the trainer.
# NOTE(review): no tokenizer/processing class is passed here, so the module-level `tokenizer`
# configured above (pad token, padding side) is not handed to the trainer — confirm intended.
trainer = SFTTrainer(
    model=model,
    peft_config=peft_cfg,
    train_dataset=data_tok["train"],
    eval_dataset=data_tok["validation"],
    args=args,
)

# Optional early stopping, attached only when enabled and evaluation is not switched off.
if USE_EARLY_STOPPING and EVAL_STRATEGY != "no":
    trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=PATIENCE))

# Always attach the wall-clock limit / adapter snapshot callback.
trainer.add_callback(TimeLimitCallback(max_hours=TIME_LIMIT_HOURS))

# Pick a checkpoint to resume from: look under CKPT_DIR first, then prefer one found directly
# under OUTPUT_DIR (the trainer's output_dir) when it exists. None starts from scratch.
resume_dir = latest_hf_checkpoint_dir(CKPT_DIR)

resume_dir = latest_hf_checkpoint_dir(OUTPUT_DIR) or resume_dir

trainer.train(resume_from_checkpoint=resume_dir)

# Save the trained adapter and tokenizer under <ADAPTER_SAFETY_DIR>/final ...
trainer.model.save_pretrained(os.path.join(ADAPTER_SAFETY_DIR, "final"), safe_serialization=True)
tokenizer.save_pretrained(os.path.join(ADAPTER_SAFETY_DIR, "final"))

# ... and again at the top of OUTPUT_DIR via the trainer's own save routine.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Done. Saved to:", OUTPUT_DIR)
print("Latest HF checkpoint:", latest_hf_checkpoint_dir(OUTPUT_DIR))
print("Adapter safety path:", ADAPTER_SAFETY_DIR)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/769 [00:00<?, ?B/s]

Tokenizing (num_proc=4):   0%|          | 0/3978 [00:00<?, ? examples/s]

Tokenizing (num_proc=4):   0%|          | 0/442 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3978 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/442 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/899 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/302 [00:00<?, ?B/s]

average_tokens_across_devices is set to True but it is invalid when world size is1. Turn it to False automatically.


Truncating train dataset:   0%|          | 0/3978 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/442 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
25,0.9511,0.926597
50,0.8147,0.845289
75,0.7969,0.812686
100,0.7581,0.787353
125,0.7399,0.764746
150,0.7175,0.748594
175,0.6935,0.737447


Step,Training Loss,Validation Loss
25,0.9511,0.926597
50,0.8147,0.845289
75,0.7969,0.812686
100,0.7581,0.787353
125,0.7399,0.764746
150,0.7175,0.748594
175,0.6935,0.737447
200,0.691,0.730755
225,0.6848,0.728327
250,0.6773,0.727917


  return fn(*args, **kwargs)


Done. Saved to: /content/drive/MyDrive/OutputLLM/llama3.2-typhoon2-t1-3b-research-preview
Latest HF checkpoint: /content/drive/MyDrive/OutputLLM/llama3.2-typhoon2-t1-3b-research-preview/checkpoint-250
Adapter safety path: /content/drive/MyDrive/OutputLLM/llama3.2-typhoon2-t1-3b-research-preview/adapter_safety


In [None]:
pip install mpi4py

Collecting mpi4py
  Downloading mpi4py-4.1.0-cp312-cp312-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (16 kB)
Downloading mpi4py-4.1.0-cp312-cp312-manylinux1_x86_64.manylinux_2_5_x86_64.whl (1.5 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mpi4py
Successfully installed mpi4py-4.1.0


# GGUF

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Locations: base checkpoint on the Hub, trained LoRA adapter, and where the merged model goes.
BASE_ID = "scb10x/llama3.2-typhoon2-1b-instruct"
ADAPTER_DIR = "/content/drive/MyDrive/OutputLLM/llama3.2-typhoon2-1b-instruct/adapter_safety/final"
MERGED_OUT = "/content/drive/MyDrive/ModelMerged/merged-full"

# Load the base model in float16 on CPU together with its tokenizer.
tok = AutoTokenizer.from_pretrained(BASE_ID, trust_remote_code=True)
base = AutoModelForCausalLM.from_pretrained(
    BASE_ID,
    device_map="cpu",
    torch_dtype=torch.float16,
)

# Attach the adapter, then fold its weights into the base model and drop the PEFT wrappers.
model = PeftModel.from_pretrained(base, ADAPTER_DIR)
merged = model.merge_and_unload()

# Write the merged model (safetensors) and the tokenizer side by side.
merged.save_pretrained(MERGED_OUT, safe_serialization=True)
tok.save_pretrained(MERGED_OUT)
print("Merged HF saved at:", MERGED_OUT)

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/302 [00:00<?, ?B/s]

Merged HF saved at: /content/drive/MyDrive/ModelMerged/merged-full


In [None]:
# Fetch llama.cpp into /content/llama.cpp if it is not already there, so this cell also works on
# a fresh runtime, then install the Python requirements of its conversion scripts.
# NOTE(review): the captured log shows this requirements file pulling a transformers build from
# git, which replaces the transformers==4.52.4 pinned by the install cells — confirm that is acceptable.
!test -d /content/llama.cpp || git clone https://github.com/ggerganov/llama.cpp /content/llama.cpp
!pip install -r /content/llama.cpp/requirements/requirements-all.txt


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/nightly
Collecting git+https://github.com/huggingface/transformers@v4.56.0-Embedding-Gemma-preview (from -r /content/llama.cpp/requirements/../tools/mtmd/../../requirements/requirements-convert_legacy_llama.txt (line 8))
  Cloning https://github.com/huggingface/transformers (to revision v4.56.0-Embedding-Gemma-preview) to /tmp/pip-req-build-j5eu29jc
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-j5eu29jc
  Running command git checkout -q 60b68e304cf4b6569b0660a13b558b929d4b0e77
  Resolved https://github.com/huggingface/transformers to commit 60b68e304cf4b6569b0660a13b558b929d4b0e77
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Ignoring torch:

In [None]:
# Convert the merged Hugging Face checkpoint to a single float16 GGUF file on Drive.
# NOTE(review): presumably the /content/drive/MyDrive/GGUF directory must already exist — confirm.
!python /content/llama.cpp/convert_hf_to_gguf.py \
    /content/drive/MyDrive/ModelMerged/merged-full \
    --outfile /content/drive/MyDrive/GGUF/llama1bmergedf16.gguf \
    --outtype f16


INFO:hf-to-gguf:Loading model: merged-full
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {32}
INFO:hf-to-gguf:gguf: loading model part 'model.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {2048, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.float16 --> F32, shape = {2048}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float16 --> F16, shape = {8192, 2048}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float16 --> F16, shape = {2048, 8192}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.float16 --> F16, shape = {2048, 8192}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.float16 --> F32, shape = {2048}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.float16 --> F16, shape = {2048, 512}
INFO:hf-to-gguf:blk.0.attn_output.weight,    t

In [None]:
# Build llama.cpp with CMake using absolute paths: the relative `cd llama.cpp` failed with
# "No such file or directory", and the quantize cells run binaries from
# /content/llama.cpp/build/bin, which is where this build places them.
!cmake -S /content/llama.cpp -B /content/llama.cpp/build -DCMAKE_BUILD_TYPE=Release
!cmake --build /content/llama.cpp/build --config Release -j

/bin/bash: line 1: cd: llama.cpp: No such file or directory


In [None]:
# Refresh the apt package index and install the C/C++ toolchain (build-essential).
!apt-get update && apt-get install -y build-essential


0% [Working]            Get:1 https://cli.github.com/packages stable InRelease [3,917 B]
0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Connecting to security.0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Connecting to security.                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Connecting to security.0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Connecting to security.                                                                               Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:4 https://cli.github.com/packages stable/main amd64 Packages [346 B]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:7 https://developer.download.nvid

In [None]:
# Create the llama.cpp build directory. The absolute path matches the build/bin location used by
# the later cells, and -p makes the cell safe to re-run when the directory already exists.
!mkdir -p /content/llama.cpp/build

In [None]:
# List the built llama.cpp binaries and shared libraries.
!ls -la /content/llama.cpp/build/bin

total 88860
drwxr-xr-x  2 root root    4096 Sep 11 09:12 .
drwxr-xr-x 12 root root    4096 Sep 11 09:10 ..
-rwxr-xr-x  1 root root  728240 Sep 11 09:10 libggml-base.so
-rwxr-xr-x  1 root root  979304 Sep 11 09:10 libggml-cpu.so
-rwxr-xr-x  1 root root   55176 Sep 11 09:10 libggml.so
-rwxr-xr-x  1 root root 2535040 Sep 11 09:10 libllama.so
-rwxr-xr-x  1 root root  780880 Sep 11 09:11 libmtmd.so
-rwxr-xr-x  1 root root 2425288 Sep 11 09:11 llama-batched
-rwxr-xr-x  1 root root 2425384 Sep 11 09:11 llama-batched-bench
-rwxr-xr-x  1 root root  509512 Sep 11 09:11 llama-bench
-rwxr-xr-x  1 root root 2461008 Sep 11 09:11 llama-cli
-rwxr-xr-x  1 root root  356840 Sep 11 09:11 llama-convert-llama2c-to-ggml
-rwxr-xr-x  1 root root 2454488 Sep 11 09:11 llama-cvector-generator
-rwxr-xr-x  1 root root 2438528 Sep 11 09:11 llama-diffusion-cli
-rwxr-xr-x  1 root root 2434408 Sep 11 09:11 llama-embedding
-rwxr-xr-x  1 root root 2425600 Sep 11 09:11 llama-eval-callback
-rwxr-xr-x  1 root root 2455072 

In [None]:
# Quantize the float16 GGUF to Q3_K_L. Arguments: input file, output file, quantization type.
!/content/llama.cpp/build/bin/llama-quantize \
  /content/drive/MyDrive/GGUF/llama1bmergedf16.gguf \
  /content/drive/MyDrive/GGUF/llama1b-q3_K_L.gguf \
  q3_K_L


main: build = 6446 (c0389dba)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04.2) 11.4.0 for x86_64-linux-gnu
main: quantizing '/content/drive/MyDrive/GGUF/llama1bmergedf16.gguf' to '/content/drive/MyDrive/GGUF/llama1b-q3_K_L.gguf' as Q3_K_L
llama_model_loader: loaded meta data with 29 key-value pairs and 147 tensors from /content/drive/MyDrive/GGUF/llama1bmergedf16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Merged Full
llama_model_loader: - kv   3:                         general.size_label str              = 1.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 16
llama_mo

In [None]:
# Quantize the same float16 GGUF to Q4_K_M. Arguments: input file, output file, quantization type.
!/content/llama.cpp/build/bin/llama-quantize \
  /content/drive/MyDrive/GGUF/llama1bmergedf16.gguf \
  /content/drive/MyDrive/GGUF/llama1b-q4_K_M.gguf \
  q4_K_M


main: build = 6446 (c0389dba)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04.2) 11.4.0 for x86_64-linux-gnu
main: quantizing '/content/drive/MyDrive/GGUF/llama1bmergedf16.gguf' to '/content/drive/MyDrive/GGUF/llama1b-q4_K_M.gguf' as Q4_K_M
llama_model_loader: loaded meta data with 29 key-value pairs and 147 tensors from /content/drive/MyDrive/GGUF/llama1bmergedf16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Merged Full
llama_model_loader: - kv   3:                         general.size_label str              = 1.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 16
llama_mo