In [5]:
# Print the current GPU status (driver/CUDA versions, memory use, utilisation)
# via the nvidia-smi command-line tool before starting any GPU work.
!nvidia-smi

Tue Jan 21 11:06:50 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.57.02              Driver Version: 566.03         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 Ti     On  |   00000000:01:00.0  On |                  N/A |
| 47%   59C    P2            123W /  165W |   14025MiB /  16380MiB |     94%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [6]:
import os
import torch
import transformers
import peft
from datasets import load_dataset, Dataset
# This notebook loads a 4-bit quantized model, so a CUDA device is mandatory.
# Fail early with a clear message instead of silently falling back to CPU.
assert torch.cuda.is_available(), "you need cuda for this part"

# The assert above guarantees CUDA is present, so no CPU fallback is needed.
device = torch.device('cuda:0')

In [7]:
# Root directory that is walked recursively to collect the training documents.
dataset_path = '/app/datasets/oberon/docs/bb_ru'

# Hugging Face model id of the base model to fine-tune.
# Alternative that was tried: "Qwen/Qwen2.5-Coder-7B"
model_name = "MTSAIR/Cotype-Nano"


In [8]:
# Directory where the fine-tuned adapter weights and tokenizer are written.
# Plain string literal: there is nothing to interpolate, so no f-prefix.
sft_model_path = "/app/models/bb_ru_cotype"

In [9]:
# Single source of truth for the low-precision dtype used during training.
# The quantized matmuls and the Trainer's autocast must agree; previously the
# quantization used bfloat16 while the Trainer was asked for fp16.
compute_dtype = torch.bfloat16

# 4-bit NF4 quantization with double quantization for the frozen base weights.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# LoRA adapters on every attention and MLP projection of the decoder blocks.
peft_config = peft.LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "mlp.down_proj",
        "self_attn.k_proj",
        "self_attn.o_proj",
        "mlp.up_proj",
        "self_attn.v_proj",
        "mlp.gate_proj",
        "self_attn.q_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=peft.TaskType.CAUSAL_LM
)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = transformers.TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Enable exactly the autocast mode that matches `compute_dtype`.
    bf16=(compute_dtype == torch.bfloat16),
    fp16=(compute_dtype == torch.float16),
    gradient_accumulation_steps=4,  # effective batch size per device: 4 * 4 = 16
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
)



In [10]:
# Recursively collect every file under `dataset_path`.
# os.walk yields entries in filesystem order, which differs between machines
# and runs; sorting makes the resulting dataset row order reproducible.
file_names = sorted(
    os.path.join(subdir, name)
    for subdir, _dirs, names in os.walk(dataset_path)
    for name in names
)

In [11]:
def _read_text(path):
    """Return the full contents of ``path`` decoded as UTF-8."""
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read()


# One string per collected file, in the same order as `file_names`.
texts = [_read_text(path) for path in file_names]

In [12]:
# Wrap the raw documents in a single-column ("text") Hugging Face Dataset.
columns = {"text": texts}
dataset = Dataset.from_dict(columns)
dataset

Dataset({
    features: ['text'],
    num_rows: 1071
})

In [None]:

# Upper bound on tokens per example. None keeps the tokenizer's own
# model maximum (the previous truncation behaviour); set an integer
# (e.g. 2048) to cap sequence length and GPU memory use.
MAX_SEQ_LENGTH = None

tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

# The language-modeling collator pads batches with the tokenizer's pad token,
# so make sure one exists; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of documents.

    Args:
        examples: batch mapping with a 'text' key holding a list of strings.

    Returns:
        The tokenizer output for the batch, truncated to ``MAX_SEQ_LENGTH``
        (or the tokenizer's model maximum when it is None).

    No padding is applied here: padding every example to the model maximum
    wastes memory and time, and the data collator pads each batch to its
    longest sequence at training time.
    """
    return tokenizer(examples['text'], truncation=True, max_length=MAX_SEQ_LENGTH)


tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1071 [00:00<?, ? examples/s]

In [None]:
# Load the base causal LM onto `device`, quantized according to `bnb_config`.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device,
)
# Flag read by the HF Trainer; set here to silence a warning it would emit.
model._hf_peft_config_loaded = True

In [None]:
# Attach the LoRA adapters described by `peft_config` to the quantized model.
# The function is referenced through the `peft` module (it is not imported by
# name) and the config variable is `peft_config`, as defined in the config cell.
# The isinstance guard keeps the cell idempotent: re-running it does not wrap
# an already-wrapped model a second time.
if not isinstance(model, peft.PeftModel):
    model = peft.get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:

# Causal-LM collator (mlm=False): builds the batch and its labels from
# the tokenized inputs. Referenced through the `transformers` module because
# the class is not imported by name in this notebook.
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# NOTE(review): eval_dataset is the same data as train_dataset, so the reported
# eval loss measures fit to the training set, not generalisation. Consider
# holding out a validation split.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop configured by `training_args`; the call's return
# value is displayed as the cell output.
trainer.train()

In [None]:
# Persist the fine-tuned model and its tokenizer side by side so that the
# output directory is self-contained for later loading.
model.save_pretrained(save_directory=sft_model_path)
tokenizer.save_pretrained(save_directory=sft_model_path)