In [1]:
import os
import torch
import transformers
import peft
import datasets
import evaluate
import time
# Fail fast when no CUDA device is visible; the rest of the notebook loads the model onto `device`.
assert torch.cuda.is_available(), "you need cuda for this part"
# NOTE(review): the 'cpu' fallback below is never selected while the assert above is active.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def prompt_to_chat(prompt, system = None):
    """Render a user prompt (and optional system prompt) with the chat template.

    Relies on the notebook-level ``tokenizer`` global.

    Args:
        prompt: Text of the user turn.
        system: Optional text of the system turn. When given, it is placed
            before the user turn.

    Returns:
        The templated conversation as a string (``tokenize=False``), ending
        with the generation prompt so the model continues as the assistant.
    """
    messages = []
    if system is not None:
        # The system turn must carry the system text, not a copy of the user prompt.
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

def infer(model, prompt, l=100, use_chat = True, temperature=0.4, top_p = 0.8, system = None):
    """Generate a single sampled completion for ``prompt``.

    Relies on the notebook-level ``tokenizer`` global.

    Args:
        model: Object exposing ``.device`` and ``.generate(...)``.
        prompt: User text (or a fully formatted prompt when ``use_chat`` is False).
        l: Maximum number of new tokens to generate.
        use_chat: When True, wrap ``prompt`` (and ``system``) via ``prompt_to_chat``.
        temperature: Sampling temperature forwarded to ``generate``.
        top_p: Nucleus-sampling threshold forwarded to ``generate``.
        system: Optional system prompt; only used when ``use_chat`` is True.

    Returns:
        The decoded continuation only (prompt tokens are cut off). Special
        tokens are kept because decoding uses ``skip_special_tokens=False``.

    Note:
        ``temperature`` and ``top_p`` were previously ignored in favour of
        hard-coded 0.1 / 0.3; pass those values explicitly to reproduce the
        earlier sampling settings.
    """
    if use_chat:
        prompt = prompt_to_chat(prompt, system)
    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=l,
        do_sample=True,
        repetition_penalty=1.1,
        temperature=temperature,
        top_p=top_p,
        top_k=20,
    )
    # Each output sequence starts with its prompt tokens; keep only what was appended.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    return tokenizer.batch_decode(generated_ids, skip_special_tokens=False)[0]

In [7]:

# Directory name of the fine-tuned adapter checkpoint.
model_name = 'bb_ru_Qwen2.5-Coder-3B-Instruct_20250125-212602'
# Path the adapter is loaded from; NOTE(review): hard-coded absolute path, adjust per environment.
model_path = f'/app/models/{model_name}'
# Hub id used for both the tokenizer and the base model weights.
base_model_name = 'Qwen/Qwen2.5-Coder-3B-Instruct'
# True -> load the base model with the 4-bit bitsandbytes config; False -> load it in bfloat16.
USE_QUANT = True

In [8]:
# Notebook-level tokenizer; prompt_to_chat() and infer() read it as a global.
tokenizer = transformers.AutoTokenizer.from_pretrained(base_model_name)

In [9]:
# Load the base model either in bfloat16 or with the 4-bit NF4 bitsandbytes config.
if not USE_QUANT:
    base_model = transformers.AutoModelForCausalLM.from_pretrained(
        base_model_name,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        return_dict=True,
        low_cpu_mem_usage=True,
    )
else:
    print("LOADING Q MODEL")
    # 4-bit NF4 weights with double quantization; compute runs in bfloat16.
    bnb_config = transformers.BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        load_in_4bit=True,
    )
    base_model = transformers.AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=bnb_config,
        device_map=device,
    )

LOADING Q MODEL


Loading checkpoint shards: 100%|██████████| 2/2 [00:44<00:00, 22.39s/it]


In [10]:

# Wrap the loaded base model with the adapter stored at model_path.
model = peft.PeftModel.from_pretrained(base_model, model_path)

In [13]:
# Upload the adapter to the Hugging Face Hub (network side effect on every run of this cell).
# NOTE(review): the recorded output of this cell names the repo 'hodza/BBCopilot', not the id
# used here — presumably the cell was edited or the repo renamed after the run; confirm the target.
model.push_to_hub("hodza/BlackBox-Coder-3B")

adapter_model.safetensors: 100%|██████████| 120M/120M [00:07<00:00, 15.1MB/s] 


CommitInfo(commit_url='https://huggingface.co/hodza/BBCopilot/commit/c9ffca07547d98b71d93e16f681c60004fa9e951', commit_message='Upload model', commit_description='', oid='c9ffca07547d98b71d93e16f681c60004fa9e951', pr_url=None, repo_url=RepoUrl('https://huggingface.co/hodza/BBCopilot', endpoint='https://huggingface.co', repo_type='model', repo_id='hodza/BBCopilot'), pr_revision=None, pr_num=None)

In [11]:
# System prompt (in Russian) passed to infer(); it contains one worked question/answer example
# with a Component Pascal bubble sort in a fenced code block.
system = '''
Ты помощник для работы в системе BlackBox с использованием языка Component Pascal. Твоя задача информативно отвечать на вопросы. 
Ответ необходимо предоставить в формате markdown и выделять код символами ```. 

Пример: 

**Вопрос**: Как реализовать сортировку пузырьком?
**Ответ**: 

Cортировку пузырьком можно реализовать с помощью следующего кода:

```
	PROCEDURE BubbleSort*;  
		VAR i, j: INTEGER; x: Item;
	BEGIN
		FOR i := 1 TO n-1 DO
			FOR j := n-1 TO i BY -1 DO
				IF a[j-1] > a[j] THEN
					x := a[j-1];  a[j-1] := a[j];  a[j] := x
				END
			END
		END
	END BubbleSort;
```
'''

In [12]:
# Smoke test: ask one question through the chat template with the system prompt, up to 512 new tokens.
print(infer(model, 'Как мне вывести массив в Log?', 512, use_chat = True, system = system))

Чтобы вывести элементы массива, достаточно использовать цикл FOR. Например:

VAR a: ARRAY 10 OF INTEGER;
BEGIN
	FOR i := 0 TO LEN(a) - 1 DO
		Log.Int( a[i] )
	END;

Если нужно вывести несколько значений, то можно использовать процедуру Log.Ln (Ln = line number).<|im_end|>
