In [None]:
!pip install transformers==4.55.2 peft==0.17.0 trl==0.21.0 bitsandbytes==0.47.0 accelerate==1.10.0 vllm==0.10.1 gradio==5.43.0 pydantic==2.11.7
!pip install ipython>=8.0 jedi>=0.19

In [None]:
from google.colab import drive
# Mount Google Drive so the adapter checkpoints under /content/drive are readable.
drive.mount(mountpoint='/content/drive')

In [None]:
import os
from google.colab import userdata
# Read the 'HF_TOKEN' entry from Colab userdata and publish it as an
# environment variable (presumably so Hugging Face downloads can authenticate).
hf_token = userdata.get('HF_TOKEN')
os.environ.update({'HF_TOKEN': hf_token})

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
from vllm import LLM, SamplingParams
import gradio as gr

# Model identifier, used for both the weights and the tokenizer.
MERGED_MODEL_ID = "jsgoodlife0511/llama3.1-tuned-and-merged"

# vLLM engine shared by every generation request in this notebook.
vllm_model = LLM(
    model=MERGED_MODEL_ID,
    tokenizer=MERGED_MODEL_ID,
    gpu_memory_utilization=0.85,  # GPU memory limit for the engine (e.g. KV cache)
    enable_lora=True,  # allow different LoRA adapters to be applied dynamically
)

# Module-level sampling settings.
sampling_params = SamplingParams(
    temperature=0.05,
    top_p=0.95,
    max_tokens=256,
)

In [None]:
from vllm.lora.request import LoRARequest

# Drive folders that hold the checkpoints of the two DPO runs.
dpo_output_1_base_path = "/content/drive/MyDrive/dpo_output_1/"
dpo_output_2_base_path = "/content/drive/MyDrive/dpo_output_2/"

# Maps an adapter name to (integer adapter id, checkpoint directory).
# Ids are assigned 1..8 in run-major order: run 1 steps 10-40, then run 2.
lora_configs = {
    f"dpo_output_{run}_cpk_{step}": (adapter_id, f"{base_path}checkpoint-{step}")
    for adapter_id, (run, base_path, step) in enumerate(
        (
            (run, base_path, step)
            for run, base_path in (
                (1, dpo_output_1_base_path),
                (2, dpo_output_2_base_path),
            )
            for step in (10, 20, 30, 40)
        ),
        start=1,
    )
}

# Gradio interface will call this function
def generate_text(raw_input, temperature = 0.05, top_p = 0.95, max_tokens=256, lora_mode = "default"):
    """Generate a completion for ``raw_input`` wrapped in the Alpaca prompt.

    Args:
        raw_input: Instruction text placed in the prompt's Instruction section.
        temperature: Sampling temperature passed to ``SamplingParams``.
        top_p: Top-p value passed to ``SamplingParams``.
        max_tokens: Generation length limit passed to ``SamplingParams``.
            Coerced to ``int`` because a UI slider may hand over a float;
            ``None`` is passed through unchanged.
        lora_mode: ``"default"`` to generate without an adapter, otherwise a
            key of ``lora_configs`` selecting the LoRA checkpoint to apply.

    Returns:
        The text of the first completion produced for the prompt.

    Raises:
        KeyError: If ``lora_mode`` is neither ``"default"`` nor a key of
            ``lora_configs``.
    """
    if max_tokens is not None:
        max_tokens = int(max_tokens)
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens
    )

    # Alpaca prompt, assembled line by line so that the function's own
    # indentation does not leak into the text sent to the model (a
    # triple-quoted literal inside the function would indent every section
    # header and the instruction by four spaces).
    alpaca_prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n"
        "{}\n\n"
        "### Response:\n"
        "{}"
    )
    prompt = alpaca_prompt.format(raw_input, "") # Response should be blank

    # Only pass `lora_request` when an adapter was selected.
    lora_config = {}
    if lora_mode != "default":
        if lora_mode not in lora_configs:
            raise KeyError(
                f"Unknown lora_mode {lora_mode!r}; expected 'default' or one of {sorted(lora_configs)}"
            )
        lora_id, lora_path = lora_configs[lora_mode]
        lora_config["lora_request"] = LoRARequest(lora_mode, lora_id, lora_path)

    outputs = vllm_model.generate(
        [prompt],
        sampling_params,
        **lora_config,
    )
    # One prompt was submitted, so take its first completion.
    return outputs[0].outputs[0].text

In [None]:
# Check a sample output with the run-1 checkpoint-30 adapter
# (calls generate_text; the previous name `generate_test` was undefined).
generate_text(raw_input = "What should I consider when building an AI service using an LLM?", lora_mode = "dpo_output_1_cpk_30")

In [None]:
# Check a sample output with the run-2 checkpoint-30 adapter
# (calls generate_text; the previous name `generate_test` was undefined).
generate_text(raw_input = "What should I consider when building an AI service using an LLM?", lora_mode = "dpo_output_2_cpk_30")

In [None]:
# Input widgets, in the same order as generate_text's parameters:
# raw_input, temperature, top_p, max_tokens, lora_mode.
demo_inputs = [
    gr.Textbox(label="Prompt"),
    gr.Slider(0.0, 2.0, value=0.0, label="Temperature"),
    gr.Slider(0.0, 1.0, value=0.95, label="Top P"),
    gr.Slider(1, 1000, value=256, step=1, label="Max Tokens"),
    # "default" runs without an adapter; every other choice is a lora_configs key.
    gr.Dropdown(
        choices=["default", *lora_configs.keys()],
        label="Model Type",
        value="default",
    ),
]

iface = gr.Interface(
    fn=generate_text,
    inputs=demo_inputs,
    outputs="text",
    title="Fine-tuned PEFT Model Demo",
    description="Enter a prompt to generate text using the fine-tuned model.",
    api_name="generate",
)

# share=True requests a public share link in addition to the local server.
iface.launch(share=True)

In [None]:
# Close the Gradio interface once the demo is no longer needed.
iface.close()