In [None]:
!pip install transformers==4.55.2 peft==0.17.0 trl==0.21.0 bitsandbytes==0.47.0 accelerate==1.10.0 vllm==0.10.1 gradio==5.43.0 pydantic==2.11.7
!pip install ipython>=8.0 jedi>=0.19

In [None]:
import os
from google.colab import userdata
# Fetch the Hugging Face token from Colab's user secrets and expose it to this
# process under the environment variable of the same name.
HF_TOKEN_NAME = 'HF_TOKEN'
hf_token = userdata.get(HF_TOKEN_NAME)
os.environ[HF_TOKEN_NAME] = hf_token

In [None]:
# Mount Google Drive; the checkpoint folders used later in this notebook live
# under /content/drive/MyDrive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Sanity check: list the contents of the fine_tune_output folder on the mounted Drive.
!ls /content/drive/MyDrive/fine_tune_output

In [None]:
# Sanity check: list dpo_output_1; the checkpoint-* folders referenced by
# lora_configs are expected to be here.
!ls /content/drive/MyDrive/dpo_output_1

In [None]:
# Sanity check: list dpo_output_2; the checkpoint-* folders referenced by
# lora_configs are expected to be here.
!ls /content/drive/MyDrive/dpo_output_2

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
from vllm import LLM, SamplingParams
import gradio as gr

from vllm import LLM, SamplingParams

# Hugging Face repo id used for both the model weights and the tokenizer.
BASE_MODEL_ID = "Evion/llama3-alpaca-tuned-and-merged"

# Engine arguments gathered in one place so they are easy to inspect and tweak.
engine_kwargs = dict(
    model=BASE_MODEL_ID,
    tokenizer=BASE_MODEL_ID,
    gpu_memory_utilization=0.85,
    enable_lora=True,  # generate_text passes LoRARequest adapters to this engine
)
vllm_model = LLM(**engine_kwargs)

# Module-level sampling settings (generate_text builds its own SamplingParams per call).
sampling_params = SamplingParams(temperature=0.05, top_p=0.95, max_tokens=256)

In [None]:
from vllm.lora.request import LoRARequest

# Drive folders that hold the checkpoints of the two DPO runs.
dpo_output_1_base_path = "/content/drive/MyDrive/dpo_output_1/"
dpo_output_2_base_path = "/content/drive/MyDrive/dpo_output_2/"

# (run label, checkpoint folder) pairs, in the order adapter ids are handed out.
DPO_RUNS = (
    ("dpo_output_1", dpo_output_1_base_path),
    ("dpo_output_2", dpo_output_2_base_path),
)
# Checkpoint numbers registered for every run.
CHECKPOINT_STEPS = (10, 20, 30, 40)

# Adapter name -> (integer adapter id, checkpoint path).
# Ids count up from 1 across all runs, run by run and checkpoint by checkpoint.
lora_configs = {
    f"{run_label}_cpk_{step}": (adapter_id, f"{base_path}checkpoint-{step}")
    for adapter_id, (run_label, base_path, step) in enumerate(
        (
            (run_label, base_path, step)
            for run_label, base_path in DPO_RUNS
            for step in CHECKPOINT_STEPS
        ),
        start=1,
    )
}

# Alpaca-style prompt with two positional slots: the instruction and the
# (initially empty) response. It is assembled from explicit pieces instead of
# an indented triple-quoted string so that source-code indentation can never
# leak into the prompt as leading spaces on each line.
ALPACA_PROMPT_TEMPLATE = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)


def generate_text(raw_input, temperature = 0.05, top_p = 0.95, max_tokens=256, lora_mode = "default"):
    """Generate one completion for ``raw_input`` with the vLLM engine.

    Args:
        raw_input: Instruction text inserted into the Alpaca prompt.
        temperature: Forwarded to ``SamplingParams``.
        top_p: Forwarded to ``SamplingParams``.
        max_tokens: Forwarded to ``SamplingParams``.
        lora_mode: ``"default"`` to generate without an adapter, otherwise a
            key of ``lora_configs`` selecting the LoRA checkpoint to apply.

    Returns:
        The text of the first completion returned for the single prompt.

    Raises:
        KeyError: If ``lora_mode`` is neither ``"default"`` nor a key of
            ``lora_configs``.
    """
    # Fail early, and with a useful message, on an unknown adapter name.
    if lora_mode != "default" and lora_mode not in lora_configs:
        raise KeyError(
            f"Unknown lora_mode {lora_mode!r}; expected 'default' or one of {sorted(lora_configs)}"
        )

    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens
    )

    # The response slot is left empty so the model continues after "### Response:".
    prompt = ALPACA_PROMPT_TEMPLATE.format(raw_input, "")

    # Extra keyword arguments for generate(); stays empty for the base model.
    generate_kwargs = {}
    if lora_mode != "default":
        lora_id, lora_path = lora_configs[lora_mode]
        generate_kwargs["lora_request"] = LoRARequest(lora_mode, lora_id, lora_path)

    outputs = vllm_model.generate(
        [prompt],
        sampling_params,
        **generate_kwargs,
    )
    return outputs[0].outputs[0].text

In [None]:
# Evaluation prompts: each one is sent to the base model and to every LoRA
# checkpoint in lora_configs so the responses can be compared side by side.
questions = [
    "List three ways to reduce plastic waste in daily life.",
    "Write a haiku about artificial intelligence",
    "Translate '안녕하세요, 오늘 날씨가 좋네요' into English.",
    "Explain what the term LLM means in the field of AI.",
    "What is a famous tall tower in Paris?",
    "What is Fine-Tuning?",
    "Find the least common multiple (LCM) of 15 and 25.",
    "What were the main causes of World War II?",
    "Briefly explain the process of photosynthesis.",
    "Analyze the personality of the protagonist in Shakespeare's 'Hamlet'."
]

In [None]:
# For every question, print the base model's answer followed by the answer of
# each LoRA checkpoint (in lora_configs order), with a blank line after each
# adapter's answer.
for question_idx, question_text in enumerate(questions):
    print(f"Question_{question_idx}: {question_text}")

    print("Default model response -----")
    base_answer = generate_text(raw_input=question_text, lora_mode="default")
    print(base_answer)

    for adapter_name in lora_configs:
        print(f"{adapter_name} model response ------")
        adapter_answer = generate_text(raw_input=question_text, lora_mode=adapter_name)
        print(adapter_answer)
        print()