In [1]:
import torch
from tqdm import tqdm

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct').eval()
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')

In [5]:
# два промпта
input_text_hedgehog = '<|im_start|>system\nYou are a storyteller. Generate a story based on user message.<|im_end|>\n<|im_start|>user\nGenerate me a short story about a tiny hedgehog named Sonic.<|im_end|>\n<|im_start|>assistant\n'
input_text_json = '<|im_start|>system\nYou are a JSON machine. Generate a JSON with format {"contractor": string with normalized contractor name, "sum": decimal, "currency": string with uppercased 3-letter currency code} based on user message.<|im_end|>\n<|im_start|>user\nTransfer 100 rubles and 50 kopeck to Mike<|im_end|>\n<|im_start|>assistant\n'

In [6]:
# logits = model(input_ids=..., attention_mask=...).logits
# logits_for_first_batch_and_last_token = logits[0, -1]

In [11]:
def save_to_file(content, filepath):
    """Сохраняет текст в файл"""
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"Файл сохранен: {filepath}")

#Задача 1. Greedy Decoding

In [7]:
def greedy_decode(input_text, max_length=1000):

    # объект Encoding, в котором содержатся ids и attention_mask
    encoding = tokenizer(input_text).encodings[0]

    # токенизация encoding
    input_ids = torch.tensor([encoding.ids]).long()
    attention_mask = torch.tensor([encoding.attention_mask]).long()

    generated_ids = input_ids.clone()
    eos_token_id = 151645

    # для отображения прогресса
    progress_bar = tqdm(total=max_length, desc="Generating tokens", unit="token")

    # длина генерации превысила 1000 токенов
    for _ in range(max_length):
        with torch.no_grad():
            # модель возвращает логиты
            logits = model(input_ids=generated_ids, attention_mask=attention_mask).logits
        logits_for_first_batch_and_last_token = logits[0, -1, :]
        next_token_id = torch.argmax(logits_for_first_batch_and_last_token).item()

        # сгенерировался EOS-токен с ID = 151645
        if next_token_id == eos_token_id:
            progress_bar.update(1)
            progress_bar.close()
            break

        generated_ids = torch.cat([generated_ids, torch.tensor([[next_token_id]]).long()], dim=-1)
        attention_mask = torch.cat([attention_mask, torch.tensor([[1]]).long()], dim=-1)
        progress_bar.update(1)

    progress_bar.close()
    return generated_ids[0].tolist()


In [8]:
print("Generating hedgehog story...")
hedgehog_ids = greedy_decode(input_text_hedgehog)
hedgehog_text = tokenizer.decode(hedgehog_ids, skip_special_tokens=True)
print("\nGenerated hedgehog story:")
print(hedgehog_text)

Generating hedgehog story...


Generating tokens:  24%|██▎       | 236/1000 [11:56<38:39,  3.04s/token]  


Generated hedgehog story:
system
You are a storyteller. Generate a story based on user message.
user
Generate me a short story about a tiny hedgehog named Sonic.
assistant
Once upon a time, in a small, cozy village nestled in the heart of the forest, there lived a tiny hedgehog named Sonic. Sonic was a curious and adventurous creature, always eager to explore the world around him. One day, while wandering through the forest, Sonic stumbled upon a hidden cave.

Inside the cave, Sonic discovered a treasure chest filled with magical items. As he opened the chest, he was amazed to see that the items were not just ordinary, but enchanted. Sonic was thrilled to find that he could use the items to help others in need.

From that day on, Sonic became a hero in the village. He used his magical powers to help people in need, and soon, the village was filled with people who were grateful for the help they received from Sonic.

Sonic's story became a legend, and people from all over the village w




In [12]:
save_to_file(hedgehog_text, 'hedgehog_text_1.txt')

Файл сохранен: hedgehog_text_1.txt


In [9]:
print("\nGenerating JSON...")
json_ids = greedy_decode(input_text_json)
json_text = tokenizer.decode(json_ids, skip_special_tokens=True)
print("\nGenerated JSON:")
print(json_text)


Generating JSON...


Generating tokens:   2%|▏         | 23/1000 [00:45<32:05,  1.97s/token]


Generated JSON:
system
You are a JSON machine. Generate a JSON with format {"contractor": string with normalized contractor name, "sum": decimal, "currency": string with uppercased 3-letter currency code} based on user message.
user
Transfer 100 rubles and 50 kopeck to Mike
assistant
{"contractor": "Mike", "sum": 105, "currency": "rubles"}





In [13]:
save_to_file(json_text, 'json_text.json')

Файл сохранен: json_text.json


Результаты:
- Если запустить алгоритм несколько раз, то будут ли различаться генерации?

Результат генерации не будет различаться в зависимости от параметров модели (Temperature, Top_K and Top_P не применимы для greedy decoding), так как в greedy decoding всегда выбирается максимальный по вероятности токен
- Какие есть проблемы с таким подходом к генерации в случае с генерацией сказки и в случае с генерацией JSON?

В генерации текста - простота, тк выбирается наиболее подходящий вариант

В генерации JSON - может привести к ошибкам в структуре (тк они неочевидны и задаются через промт)
- Сгенерированный текст про ёжика и сгенерированный JSON.

In [None]:
hedgehog_text

# Задача 2. Sampling