In [None]:
from unsloth import FastLanguageModel
import torch

# --- Base-model loading options ----------------------------------------------
max_seq_length = 2048  # also reused as SFTTrainer's max_seq_length in the training cell
dtype = None           # forwarded as-is; presumably lets Unsloth pick the dtype (confirm in docs)
load_in_4bit = True    # forwarded as-is to from_pretrained

# Load the base checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Llama-2-7b-hf",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Names of the projection modules that get LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Wrap the loaded model with LoRA adapters. Note that this rebinds `model`,
# so re-running only this statement would wrap an already-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, TrainerCallback
from datasets import Dataset
import torch

# Fixed Mongolian-language instructions used as probes when sampling generations
# from the model (handed to InferenceCallback further down in this cell).
test_prompts = [
    # "What is artificial intelligence?"
    "Хиймэл оюун гэж юу вэ?",
    # "Which city is the capital of Mongolia?"
    "Монголын нийслэл хот аль вэ?",
]

class InferenceCallback(TrainerCallback):
    """Sample generations for a fixed set of prompts at regular training steps.

    Whenever ``state.global_step`` is a multiple of ``inference_steps`` the
    callback switches the model to eval mode, generates one sampled response
    per prompt, prints it, appends the same text to ``log_file`` and then
    restores the model's previous train/eval mode.

    Args:
        model: Model exposing ``generate``, ``eval``, ``train``, ``training``
            and ``device``.
        tokenizer: Tokenizer used to encode the prompts and decode the output.
        test_prompts: Iterable of instruction strings to probe the model with.
        alpaca_prompt: Format string with two positional ``{}`` slots
            (instruction, response); the response slot is filled with ``""``.
        inference_steps: Run inference every this many steps. Must be positive.
        log_file: Path of a UTF-8 text file that receives a copy of everything
            printed. Pass ``None`` or ``""`` to disable file logging.

    Raises:
        ValueError: If ``inference_steps`` is not a positive number.
    """

    def __init__(self, model, tokenizer, test_prompts, alpaca_prompt, inference_steps=100, log_file="inference_log.txt"):
        # A zero interval would otherwise surface as ZeroDivisionError in the
        # middle of training; fail at construction time instead.
        if inference_steps <= 0:
            raise ValueError(
                f"inference_steps must be a positive integer, got {inference_steps!r}"
            )
        self.model = model
        self.tokenizer = tokenizer
        self.test_prompts = test_prompts
        self.alpaca_prompt = alpaca_prompt
        self.inference_steps = inference_steps
        self.log_file = log_file

    def _emit(self, message):
        """Print ``message`` and, when a log file is configured, append it there."""
        print(message)
        if self.log_file:
            # Explicit UTF-8: prompts and responses contain non-ASCII text.
            with open(self.log_file, "a", encoding="utf-8") as handle:
                handle.write(message + "\n")

    def _generate_response(self, prompt):
        """Return the sampled completion for ``prompt`` without the prompt text."""
        inputs = self.tokenizer(
            [self.alpaca_prompt.format(prompt, "")],
            return_tensors="pt",
        ).to(self.model.device)  # follow the model instead of hard-coding "cuda"

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=128,
                use_cache=True,
                temperature=0.7,
                do_sample=True,
            )

        # generate() returns prompt + continuation for this decoder-only setup;
        # slicing by prompt length is robust where searching the decoded text
        # for "### Response:" is not (marker missing -> find() == -1, or the
        # marker appearing inside the instruction itself).
        prompt_length = inputs["input_ids"].shape[-1]
        new_tokens = outputs[0][prompt_length:]
        return self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    def on_step_end(self, args, state, control, **kwargs):
        """Run the probe prompts when the current step hits the interval."""
        if state.global_step % self.inference_steps != 0:
            return control

        banner = "=" * 80
        self._emit(f"\n{banner}")
        self._emit(f"Running inference at step {state.global_step}")
        self._emit(f"{banner}\n")

        # Remember the mode so we restore exactly what the trainer had set,
        # and restore it even if generation raises.
        was_training = self.model.training
        self.model.eval()
        try:
            for prompt in self.test_prompts:
                response_text = self._generate_response(prompt)
                self._emit(f"Prompt: {prompt}")
                self._emit(f"Response: {response_text}\n")
                self._emit("-" * 80 + "\n")
        finally:
            if was_training:
                self.model.train()

        return control

# Location of the single Arrow shard holding the training data
# (the path looks like a Colab Google Drive mount; adjust for other setups).
DATASET_ARROW_PATH = "/content/drive/MyDrive/llm/llm/data-00000-of-00001.arrow"
dataset = Dataset.from_file(DATASET_ARROW_PATH)

# Alpaca-style template with two positional slots: the instruction first,
# then the response (left empty at inference time so the model completes it).
alpaca_prompt = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)

def formatting_prompts_func(examples):
    """Render a batch of instruction/output pairs into training texts.

    Args:
        examples: Batched mapping with parallel ``"instruction"`` and
            ``"output"`` sequences.

    Returns:
        ``{"text": [...]}`` with one string per pair: the module-level
        ``alpaca_prompt`` filled with the pair, followed by the module-level
        tokenizer's ``eos_token``. Pairing stops at the shorter sequence.
    """
    paired = zip(examples["instruction"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(instruction, output) + tokenizer.eos_token
            for instruction, output in paired
        ]
    }

# Add the rendered "text" column to every row. This rebinds `dataset`, so the
# name refers to the mapped dataset from here on.
dataset = dataset.map(function=formatting_prompts_func, batched=True)

# Callback that samples generations for `test_prompts` every 100 steps.
inference_callback = InferenceCallback(
    model,
    tokenizer,
    test_prompts,
    alpaca_prompt,
    inference_steps=100,
)

In [None]:
# Query bf16 support once so the two precision flags can never disagree:
# bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    # Register the periodic-generation callback. It was constructed in the
    # previous cell but never handed to the trainer, so it never ran.
    callbacks=[inference_callback],
    args=TrainingArguments(
        # 2 samples x 4 accumulation steps = 8 samples per optimizer step per device.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        num_train_epochs=6,
        learning_rate=2e-4,
        fp16=not use_bf16,
        bf16=use_bf16,
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        # Checkpoint every 100 steps, keeping only the 3 most recent.
        save_strategy="steps",
        save_steps=100,
        save_total_limit=3,
        load_best_model_at_end=False,
    ),
)

trainer.train()

In [None]:
# Put the fine-tuned model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Fill only the instruction slot; the empty response slot is what the model completes.
prompt_text = alpaca_prompt.format("Хиймэл оюун гэж юу вэ?", "")
inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

# Sampled decoding, so the generated text varies between runs.
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    temperature=0.7,
    do_sample=True,
)

# Decode the full sequence (prompt + completion, special tokens included).
decoded = tokenizer.batch_decode(outputs)
print(decoded[0])

<s> Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Хиймэл оюун гэж юу вэ?

### Response:
Хиймэл оюун бол компьютерийн шинжлэх ухааны салбар бөгөөд өгөгдлөөс суралцаж, түүнийг ашиглах, урвуулахын тулд машин сургалт, байгалийн хэл боловсруулалт, өгөгдөл шинжилгээ зэрэг арга хэмжээг ашигладаг. Энэ нь хиймэл оюун мод, хиймэл оюун эсийн зэрэглэлийг хурдан, нарийвчлалтай илэрхийлэхэд анхаардаг.</s>
