In [1]:
import hashlib
import json
import math
import os
import random
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from typing import Literal

import openai
import pandas as pd
import torch  # noqa: F401
from dotenv import load_dotenv
from huggingface_hub.hf_api import HfFolder
import torchvision
import transformers.models.llama.modeling_llama
from transformers import AutoModelForCausalLM, AutoTokenizer

from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment.
load_dotenv()

# Hand the Hugging Face token to HfFolder whenever the environment defines one.
hf_token = os.environ.get("HUGGINGFACE_TOKEN")
if hf_token is not None:
    HfFolder.save_token(hf_token)

In [2]:
# Read at most the first 100 data rows of the question CSV.
df_ = pd.read_csv("Arena_QS_updated_filtered.csv", nrows=100)

In [3]:
# Draw 10 rows from df_ with a fixed random_state so the same subset is picked on every run.
df = df_.sample(n=10, random_state=42)

def sample_questions(
    df: pd.DataFrame, question_types: list, sample_qs: int, random_seed: int = 42
):
    """Build one Kazakh-answer prompt per sampled (row, question type) pair.

    Args:
        df: Frame with a ``text`` column (the context) and one column per
            question type holding the question text; missing values mean the
            row has no question of that type.
        question_types: Column names to consider, in priority/iteration order.
            A set is accepted too and is sorted first so results do not depend
            on hash ordering.
        sample_qs: Maximum number of question types to keep per row. Values
            ``<= 0`` keep every available type.
        random_seed: Seed for the sampling of question types.

    Returns:
        A tuple ``(mapping, prompts, sampled_ids)`` of equal-length lists:
        ``mapping`` holds one record per prompt (``task_id``, ``question``,
        ``question_type``, ``context``, ``prompt``), ``prompts`` the prompt
        strings, and ``sampled_ids`` the ``"<row index>-<question type>"`` ids.
    """
    # A private generator yields the same draws as seeding the module-level RNG
    # with this seed, without resetting global `random` state for other code.
    rng = random.Random(random_seed)

    # Sets of strings iterate in hash order, which changes between interpreter
    # runs; fix the order so the seeded sampling is actually reproducible.
    if isinstance(question_types, (set, frozenset)):
        question_types = sorted(question_types, key=str)

    mapping = []
    prompts = []
    sampled_ids = []
    for idx, row in df.iterrows():
        context = row["text"]
        available = [qt for qt in question_types if pd.notna(row.get(qt))]
        if not available:
            continue
        if sample_qs > 0 and len(available) > sample_qs:
            chosen = rng.sample(available, sample_qs)
        else:
            chosen = available
        for qt in chosen:
            question_text = row[qt]
            prompt = f"""
Context: {context}
Question ({qt}): {question_text}

Answer the question in Kazakh language, use information provided in the context. Be concise and clear, only answer the question asked, but answer it well.
"""
            prompts.append(prompt)
            mapping.append(
                {
                    "task_id": idx,
                    # The question itself; the column it came from is kept
                    # separately under "question_type".
                    "question": question_text,
                    "question_type": qt,
                    "context": context,
                    "prompt": prompt,
                }
            )
            sampled_ids.append(f"{idx}-{qt}")
    return mapping, prompts, sampled_ids


# Question-type columns to draw from. Kept as a list rather than a set: a set of
# strings iterates in hash order, which differs between interpreter runs, so the
# seeded sampling below would not pick the same questions on every run.
QUESTION_TYPES = ["WHY_QS", "WHAT_QS", "HOW_QS", "DESCRIBE_QS", "ANALYZE_QS"]
mapping, prompts, _ = sample_questions(df, QUESTION_TYPES, sample_qs=2, random_seed=42)

In [4]:
# Number of mapping records and prompts produced by sample_questions; the two counts should match.
len(mapping), len(prompts)

(20, 20)

In [None]:
# Checkpoint to load; uncomment a different line to switch candidates.
# model_id = "armanibadboy/llama3.2-kazllm-3b-by-arman" # +
# model_id = "meta-llama/Llama-3.2-1B-Instruct" # +
model_id = "TilQazyna/llama-kaz-instruct-8B-1"
# model_id = "google/gemma-2-2b-it"
# model_id = "AmanMussa/llama2-kazakh-7b"
# model_id = "IrbisAI/Irbis-7b-v0.1"
# model_id = "armanibadboy/llama3.1-kazllm-8b-by-arman-ver2"
# model_id = "meta-llama/Llama-3.2-3B-Instruct"
# model_id = "Qwen/Qwen2.5-7B-Instruct"
# model_id = "meta-llama/Llama-3.1-8B-Instruct"
# model_id = "google/gemma-2-9b-it"
# model_id = "issai/LLama-3.1-KazLLM-1.0-8B"


# This one repository is loaded from a named GGUF file; every other candidate
# needs no extra from_pretrained arguments.
if model_id == "armanibadboy/llama3.2-kazllm-3b-by-arman":
    extra = {
        "gguf_file": "unsloth.Q8_0.gguf",
    }
else:
    extra = {}

# Resolve the device before loading: FlashAttention-2 kernels only run on CUDA,
# so the CPU fallback must request a different attention backend or the load fails.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
attn_implementation = "flash_attention_2" if device.type == "cuda" else "sdpa"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# EOS doubles as the padding token, and padding goes on the left so every prompt
# ends at the last position of the batch and generation continues from there.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    attn_implementation=attn_implementation,
    trust_remote_code=True,
    **extra,
)
model.generation_config.pad_token_id = tokenizer.pad_token_id
model.eval()
model.to(device)

In [None]:
# Keyword arguments forwarded to model.generate: sampling at a low temperature,
# at most 256 new tokens per prompt, and EOS used as the padding token.
generation_config = {
    "do_sample": True,
    "max_new_tokens": 256,
    "num_beams": 1,
    "temperature": 0.1,
    "repetition_penalty": 1.0,
    "remove_invalid_values": True,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.eos_token_id,
    "forced_eos_token_id": tokenizer.eos_token_id,
    "use_cache": True,
    "no_repeat_ngram_size": 0,
    "num_return_sequences": 1,
}

outputs = []
batch_size = 5
# Work on the first `batch_size` prompts.
batch_prompts = prompts[0:batch_size]
# Each prompt becomes a single-turn conversation with one user message.
chat_inputs = [
    [
        {"role": "user", "content": prompt},
    ]
    for prompt in batch_prompts
]

# NOTE(review): add_generation_prompt is not passed, so the templated input ends
# after the user turn; the decode step removes a leading "assistant" from each
# generation, presumably the header the model then emits itself. Switching to
# add_generation_prompt=True would need that stripping revisited at the same time.
formatted_inputs = tokenizer.apply_chat_template(
    chat_inputs,
    tokenize=True,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
formatted_inputs = formatted_inputs.to(device)

# Attention mask for left-padded input: only the leading run of pad tokens is
# masked. Pad and EOS share an id, so a pad-id token appearing after the first
# real token is part of the prompt and must stay visible. The running count of
# non-pad tokens is 0 exactly on that leading run, which avoids comparing
# device tensor elements one at a time in a Python loop.
is_real_token = formatted_inputs != tokenizer.pad_token_id
attention_masks = (torch.cumsum(is_real_token.to(torch.long), dim=1) > 0).to(torch.long)

In [None]:
# Run generation for the whole batch with gradient tracking switched off.
with torch.no_grad():
    out_ids = model.generate(
        input_ids=formatted_inputs,
        attention_mask=attention_masks,
        **generation_config,
    )

# Guarantee a (batch, sequence) layout even if a single flat sequence comes back.
if out_ids.ndim == 1:
    out_ids = out_ids.unsqueeze(0)

In [None]:
# Decode only the newly generated tokens of each sequence and store the text on
# the matching mapping record.
ASSISTANT_HEADER = "assistant"
for j in range(len(batch_prompts)):
    # Inputs are padded to a common length, so everything past it is generated.
    input_length = formatted_inputs[j].shape[0]
    generated_tokens = out_ids[j][input_length:]
    out_text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    # Drop the role header only when the model actually produced it; cutting a
    # fixed number of characters would eat the start of an answer without one.
    if out_text.startswith(ASSISTANT_HEADER):
        out_text = out_text[len(ASSISTANT_HEADER) :].strip()
    rec = mapping[j]
    rec["output"] = out_text
    # rec["generation_id"] = str(uuid.uuid4())
    outputs.append(rec)

In [None]:
# Inspect the generated answer stored on the fifth record (index 4).
outputs[4]['output']