In [1]:
# Shell escape: print the GPU, driver and CUDA details of the machine running this notebook.
!nvidia-smi

Tue Jan 21 12:59:07 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.57.02              Driver Version: 566.03         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 Ti     On  |   00000000:01:00.0  On |                  N/A |
|  0%   39C    P8              4W /  165W |    3949MiB /  16380MiB |      7%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [61]:
import os
import torch
import transformers
import peft
import datasets

# The rest of the notebook is meant to run on a GPU; stop here if CUDA is unavailable.
assert torch.cuda.is_available(), "you need cuda for this part"
# After the assert this resolves to the first CUDA device; the 'cpu' fallback
# only matters if assertions are disabled.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [62]:
# Display the selected device to confirm which one was picked.
device

device(type='cuda', index=0)

In [63]:
# Root directory of the raw corpus; dataset_load() walks it recursively and
# reads every file beneath it as one document.
dataset_path = "/app/datasets/oberon/docs/bb_ru"
# Hub id of the base checkpoint to fine-tune. The commented-out line is an
# alternative base model kept for reference.
#model_name = "Qwen/Qwen2.5-Coder-7B"
model_name = 'MTSAIR/Cotype-Nano'


In [66]:
# Keep only the part after the last "/" of the hub id and of the dataset
# directory, then combine both into the directory the tuned model is saved to.
only_model_name = model_name.rpartition("/")[2]
dataset_name = dataset_path.rpartition("/")[2]
sft_model_path = f"/app/models/{dataset_name}_{only_model_name}"

In [78]:
# Choose ONE mixed-precision dtype and use it both as the 4-bit compute dtype
# and as the Trainer's autocast dtype, so the two settings cannot disagree.
# bf16 is used when the GPU supports it; otherwise both fall back to fp16.
use_bf16 = torch.cuda.is_bf16_supported()
compute_dtype = torch.bfloat16 if use_bf16 else torch.float16

# 4-bit NF4 quantization with double quantization for the frozen base weights.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# LoRA adapters (rank 16, alpha 32) on every attention and MLP projection.
peft_config = peft.LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "mlp.down_proj",
        "self_attn.k_proj",
        "self_attn.o_proj",
        "mlp.up_proj",
        "self_attn.v_proj",
        "mlp.gate_proj",
        "self_attn.q_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=peft.TaskType.CAUSAL_LM
)

# Effective train batch per optimizer step on one device:
# per_device_train_batch_size * gradient_accumulation_steps = 4 * 4 = 16 blocks.
training_args = transformers.TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases spell this `eval_strategy` --
    # confirm against the installed version before renaming.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Exactly one of bf16/fp16 is enabled, matching compute_dtype above.
    bf16=use_bf16,
    fp16=not use_bf16,
    gradient_accumulation_steps=4,
    num_train_epochs=30,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=1,
)



In [68]:
def subset_to_dataset(s):
    """Materialize an indexable collection of ``{'text': ...}`` records as a ``datasets.Dataset``.

    Args:
        s: Any object supporting ``len()`` and integer indexing whose items
            expose a ``'text'`` key (in this notebook: the subsets returned by
            ``torch.utils.data.random_split``).

    Returns:
        A ``datasets.Dataset`` with a single ``'text'`` column, in index order.
    """
    # Only ``import datasets`` is in scope in this notebook, so the class has to
    # be reached through the module; a bare ``Dataset`` raises NameError on a
    # fresh kernel.
    return datasets.Dataset.from_dict({'text': [s[i]['text'] for i in range(len(s))]})
        
def dataset_load(path=None, train_fraction=0.8, seed=None):
    """Read every file under a directory tree and split the texts into train/test.

    Args:
        path: Root directory to walk recursively. Defaults to the notebook-level
            ``dataset_path``.
        train_fraction: Share of documents put in the train split. The train
            size is ``int(n * train_fraction)``; the remainder is the test split.
        seed: Optional integer seed for the random split. ``None`` draws from
            torch's global random state, as before.

    Returns:
        A ``datasets.DatasetDict`` with ``"train"`` and ``"test"`` splits, each
        holding one ``"text"`` column whose rows are whole-file contents.

    Raises:
        FileNotFoundError: If no files are found under ``path`` (for example
            when the directory does not exist, since ``os.walk`` then yields
            nothing).
    """
    root = dataset_path if path is None else path

    # Sorted so document order does not depend on filesystem enumeration order.
    file_names = sorted(
        os.path.join(subdir, name)
        for subdir, _dirs, files in os.walk(root)
        for name in files
    )
    if not file_names:
        raise FileNotFoundError(f"no files found under {root!r}")

    texts = []
    for file_name in file_names:
        with open(file_name, 'r', encoding='utf-8') as handle:
            texts.append(handle.read())
    # Reach the class through the module: only ``import datasets`` is in scope.
    dataset = datasets.Dataset.from_dict({'text': texts})

    train_len = int(len(dataset) * train_fraction)
    lengths = [train_len, len(dataset) - train_len]
    # Only pass a generator when a seed was requested, so the unseeded call is
    # the same call the notebook made before.
    split_kwargs = {}
    if seed is not None:
        split_kwargs["generator"] = torch.Generator().manual_seed(seed)
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, lengths, **split_kwargs)

    return datasets.DatasetDict({
        "train": subset_to_dataset(train_dataset),
        "test": subset_to_dataset(test_dataset),
    })

In [69]:
# Build the train/test DatasetDict from the files under dataset_path.
dataset = dataset_load()

In [70]:
# Eyeball the first raw training document.
print(dataset["train"][0]['text'])

MODULE ObxViews1;
(**
	project	= "BlackBox"
	organization	= "www.oberon.ch"
	contributors	= "Oberon microsystems"
	version	= "System/Rsrc/About"
	copyright	= "System/Rsrc/About"
	license	= "Docu/BB-License"
	changes	= "
	- YYYYMMDD, nn, ...
	"
	issues	= "
	- ...
	"

**)

	IMPORT Views, Ports, Properties;

	TYPE View = POINTER TO RECORD (Views.View) END;

	PROCEDURE (v: View)  Restore (f: Views.Frame; l, t, r, b: INTEGER);
	BEGIN
		f.DrawRect(l, t, r, b, Ports.fill, Ports.red)
	END Restore;

	PROCEDURE (v: View) HandlePropMsg (VAR msg: Properties.Message);
	BEGIN
		WITH msg: Properties.SizePref DO
			IF (msg.w = Views.undefined) OR (msg.h = Views.undefined) THEN
				 msg.w := 20 * Ports.mm; msg.h := 10 * Ports.mm
			END
		ELSE	(* ignore other messages *)
		END
	END HandlePropMsg;

	PROCEDURE Deposit*;
		VAR v: View;
	BEGIN
		NEW(v); Views.Deposit(v)
	END Deposit;

END ObxViews1.



In [71]:

# Load the tokenizer that belongs to the base checkpoint named by model_name.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the ``'text'`` column of a batch with the notebook-level ``tokenizer``.

    No truncation or padding is requested here; the tokenized documents are cut
    into fixed-size blocks in a later step.
    """
    batch_texts = examples['text']
    return tokenizer(batch_texts)

# Tokenize both splits in batches across 4 worker processes, dropping the raw
# "text" column so only the tokenizer outputs remain.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

Token indices sequence length is longer than the specified maximum sequence length for this model (277343 > 131072). Running this sequence through the model will result in indexing errors
Map (num_proc=4): 100%|██████████| 856/856 [00:02<00:00, 426.55 examples/s]
Map (num_proc=4): 100%|██████████| 215/215 [00:00<00:00, 467.39 examples/s]


In [72]:
# Number of tokens per training example after chunking.
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized documents and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name (must include ``"input_ids"``) to a
            list of per-document token lists.

    Returns:
        A dict with the same columns, where every column is a list of chunks of
        exactly ``block_size`` items, plus a ``"labels"`` column that is a
        shallow copy of the ``"input_ids"`` chunk list. The tail of the batch
        that does not fill a whole block is dropped.
    """
    from itertools import chain

    # Flatten each column in linear time; summing lists with sum(..., []) copies
    # the growing result on every step and is quadratic in the batch size.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could add padding instead if the model
    # supported it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Re-chunk both splits into block_size-token examples, 1000 documents per
# batch, using 4 worker processes.
lm_datasets = tokenized_dataset.map(group_texts, batched=True, batch_size=1000, num_proc=4)

Map (num_proc=4): 100%|██████████| 856/856 [00:01<00:00, 602.63 examples/s]
Map (num_proc=4): 100%|██████████| 215/215 [00:00<00:00, 718.66 examples/s]


In [74]:
# Decode the second training block back to text to sanity-check the chunking.
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

' Restore (f: Views.Frame; l, t, r, b: INTEGER);\n\tBEGIN\n\t\tf.DrawRect(l, t, r, b, Ports.fill, Ports.red)\n\tEND Restore;\n\n\tPROCEDURE (v: View) HandlePropMsg (VAR msg: Properties.Message);\n\tBEGIN\n\t\tWITH msg: Properties.SizePref DO\n\t\t\tIF (msg.w = Views.undefined) OR (msg.h = Views.undefined) THEN\n\t\t\t\t msg.w := 20 * Ports.mm; msg.h := 10 * Ports.mm\n\t\t\tEND\n\t\tELSE\t(* ignore other messages *)\n\t'

In [75]:
# Load the base causal LM onto `device` using the quantization settings in bnb_config.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    quantization_config=bnb_config,
)
model._hf_peft_config_loaded = True  # silence a warning from HF trainer

In [76]:
# Wrap the base model with the LoRA adapters described by peft_config.
# NOTE: this rebinds `model`, so re-running this cell without reloading the
# base model first would wrap the already-wrapped model again.
model = peft.get_peft_model(model, peft_config)
# Report how many parameters are trainable versus the total.
model.print_trainable_parameters()

trainable params: 18,464,768 || all params: 1,562,179,072 || trainable%: 1.1820


In [79]:

# mlm=False: batches are collated for causal (next-token) language modeling
# rather than masked language modeling.
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Train on the chunked "train" split and evaluate on the chunked "test" split.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
)

In [None]:
# Run fine-tuning with the settings in training_args.
trainer.train()

Epoch,Training Loss,Validation Loss


In [14]:
# Persist the fine-tuned model and its tokenizer side by side under sft_model_path.
# NOTE(review): `model` is the PEFT-wrapped model here, so this presumably writes
# the adapter weights rather than a merged full model -- confirm before deploying.
model.save_pretrained(sft_model_path)
tokenizer.save_pretrained(sft_model_path)

('/app/models/bb_ru_cotype/tokenizer_config.json',
 '/app/models/bb_ru_cotype/special_tokens_map.json',
 '/app/models/bb_ru_cotype/vocab.json',
 '/app/models/bb_ru_cotype/merges.txt',
 '/app/models/bb_ru_cotype/added_tokens.json',
 '/app/models/bb_ru_cotype/tokenizer.json')