In [1]:
import hashlib
import json
import math
import os
import random
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from typing import Literal
import peft

import openai
import pandas as pd
import torch  # noqa: F401
from dotenv import load_dotenv
from huggingface_hub.hf_api import HfFolder
import torchvision
import transformers.models.llama.modeling_llama
from transformers import AutoModelForCausalLM, AutoTokenizer

from dotenv import load_dotenv

load_dotenv()

if "HUGGINGFACE_TOKEN" in os.environ:
    HfFolder.save_token(os.environ["HUGGINGFACE_TOKEN"])

In [2]:
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
df_ = pd.read_csv("Arena_QS_updated_filtered.csv", nrows=100)

In [4]:
df = df_.sample(n=10, random_state=42)

def sample_questions(
    df: pd.DataFrame, question_types: list, sample_qs: int, random_seed: int = 42
):
    random.seed(random_seed)
    mapping = []
    prompts = []
    sampled_ids = []
    for idx, row in df.iterrows():
        context = row["text"][:12_000] # TODO
        available = [qt for qt in question_types if pd.notna(row.get(qt))]
        if not available:
            continue
        if sample_qs > 0 and len(available) > sample_qs:
            chosen = random.sample(available, sample_qs)
        else:
            chosen = available
        for qt in chosen:
            prompt = f"""
Context: {context}
Question ({qt}): {row[qt]}

Answer the question in Kazakh language, use information provided in the context. Be concise and clear, only answer the question asked, but answer it well.
"""
            prompts.append(prompt)
            mapping.append(
                {
                    "task_id": idx,
                    "question": qt,
                    "question_type": qt,
                    "context": context,
                    "prompt": prompt,
                }
            )
            sampled_ids.append(f"{idx}-{qt}")
    return mapping, prompts, sampled_ids


mapping, prompts, _ = sample_questions(df, {"WHY_QS", "WHAT_QS", "HOW_QS", "DESCRIBE_QS", "ANALYZE_QS"}, sample_qs=1, random_seed=42)

In [5]:
len(mapping), len(prompts)

(10, 10)

In [8]:
# model_id = "armanibadboy/llama3.2-kazllm-3b-by-arman" # +
# model_id = "meta-llama/Llama-3.2-1B-Instruct" # +
# model_id = "TilQazyna/llama-kaz-instruct-8B-1" # +
# model_id = "google/gemma-2-2b-it" # +
# model_id = "AmanMussa/llama2-kazakh-7b" # -
# model_id = "IrbisAI/Irbis-7b-v0.1" # +
# model_id = "armanibadboy/llama3.1-kazllm-8b-by-arman-ver2" # -
# model_id = "meta-llama/Llama-3.2-3B-Instruct" # +
# model_id = "Qwen/Qwen2.5-7B-Instruct" # +
# model_id = "meta-llama/Llama-3.1-8B-Instruct" # +
# model_id = "google/gemma-2-9b-it" # +
model_id = "issai/LLama-3.1-KazLLM-1.0-8B" # +


if model_id == "armanibadboy/llama3.2-kazllm-3b-by-arman":
    extra = {
        "gguf_file": "unsloth.Q8_0.gguf",
    }
else:
    extra = {}

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
if model_id == "armanibadboy/llama3.1-kazllm-8b-by-arman-ver2":
    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct", trust_remote_code=True, device_map="auto", torch_dtype=torch.bfloat16)
    model = peft.PeftModel.from_pretrained(model, model_id, safe_serialization=True, torch_dtype=torch.bfloat16)
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2", trust_remote_code=True, **extra
    )
model.generation_config.pad_token_id = tokenizer.pad_token_id
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:  96%|#########6| 4.74G/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_

In [9]:
generation_config = {
    "do_sample": True,
    "max_new_tokens": 256,
    "num_beams": 1,
    "repetition_penalty": 1.0,
    "remove_invalid_values": True,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.eos_token_id,
    "forced_eos_token_id": tokenizer.eos_token_id,
    "use_cache": True,
    "no_repeat_ngram_size": 0,
    "num_return_sequences": 1,
}

batch_size = 3
batch_prompts = prompts[0:batch_size]
chat_inputs = [
    [
        {"role": "user", "content": prompt},
    ]
    for prompt in batch_prompts
]

formatted_inputs = tokenizer.apply_chat_template(
    chat_inputs,
    tokenize=True,
    padding=True,
    truncation=True,
    return_tensors="pt",
    # chat_template="""{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"""
    # chat_template="""{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"""
#     chat_template="""{% for message in messages %}
# Сұрақ: {{ message['content'] | trim }}
# Жауап:
# {% endfor %}"""
)
formatted_inputs = formatted_inputs.to(device)
attention_masks = []
for input_ids in formatted_inputs:
    number_of_padding = 0
    for token_id in input_ids:
        if token_id == tokenizer.pad_token_id:
            number_of_padding += 1
        else:
            break
    attention_masks.append(
        [0] * number_of_padding + [1] * (len(input_ids) - number_of_padding)
    )
attention_masks = torch.tensor(attention_masks).to(device)

In [10]:
# torch._dynamo.config.capture_dynamic_output_shape_ops = True
# torch._dynamo.config.capture_scalar_outputs = True
# torch.set_float32_matmul_precision('high')

In [11]:
# model.__dict__

In [12]:
with torch.no_grad():
    out_ids = model.generate(
        **{
            "input_ids": formatted_inputs,
            "attention_mask": attention_masks,
        },
        **generation_config,
    )
    if out_ids.ndim == 1:
        out_ids = out_ids.unsqueeze(0)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [16]:
outputs = []
for j in range(len(batch_prompts)):
    input_length = formatted_inputs[j].shape[0]
    generated_tokens = out_ids[j][input_length:]
    out_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    out_text = out_text[len("assistant") :].strip()
    # out_text = out_text[len("user\n\n") :].strip()
    rec = mapping[j]
    rec["output"] = out_text
    # rec["generation_id"] = str(uuid.uuid4())
    outputs.append(rec)

In [19]:
outputs[1]['output']

'Жаңаөзен оқиғасының Қазақстандағы азаматтық қоғамның дамуына әсері теріс болды. Ереуіл және содан кейінгі қантөгіс жұртшылықтың билікке деген сенімін шайқаған. Бұл оқиға сонымен қатар азаматтық қоғамның өзалдық құрылымдарының жемісі емес, саяси биліктердің шаруасы екендігін көрсетті.'

In [18]:
len(outputs[0]['context'])

12000