In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Where the base checkpoint and the trained LoRA adapter live
base_model_dir = "/home/iclab/minjun/Tiny-Mental-LLM/pretrained_models/meta-llama_Llama-3.2-3B-Instruct"
lora_dir = "Output/meta-llama-Llama-3.2-3B-Instruct-QLoRA/checkpoint"

# Toggle 4-bit (NF4, double-quantized) loading of the base weights
use_qlora = False

# bfloat16 is used whether or not the base model is quantized
torch_dtype = torch.bfloat16
bnb_config = (
    BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    if use_qlora
    else None
)

print("[INFO] Loading base model in 4bit if QLoRA is True...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_dir,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch_dtype,
)

print("[INFO] Attaching LoRA adapter from output dir...")
model = PeftModel.from_pretrained(base_model, lora_dir)
model.eval()

# Optional: fold the adapter weights into the base model
# model = model.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(base_model_dir, truncation_side='left')
# Fall back to EOS as the padding token when the tokenizer defines none
if tokenizer.pad_token_id is None:
    if tokenizer.eos_token_id is not None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

print("[INFO] Model and tokenizer loaded.")


def run_multi_turn_chat(
    model,
    tokenizer,
    max_new_tokens=128,
    temperature=0.7,
    top_p=0.9,
    num_beams=1,
    repetition_penalty=1.0
):
    """
    Run an interactive multi-turn chat loop on stdin/stdout.

    Each user message is appended to a plain-text transcript of the form
    "\\nUser: ...\\nAssistant: ", the whole transcript is fed to the model,
    and the newly generated tokens are printed and appended to the transcript.
    The loop ends when the user types "exit" or "quit" (case-insensitive,
    surrounding whitespace ignored), on end-of-input, or on Ctrl-C at the
    prompt. Blank lines are ignored instead of being sent to the model.

    Args:
        model: Object exposing ``eval()``, ``device`` and ``generate(**kwargs)``.
        tokenizer: Callable tokenizer exposing ``decode``, ``eos_token_id``
            and ``pad_token_id``.
        max_new_tokens: Maximum number of tokens generated per reply.
        temperature: Sampling temperature; only forwarded when sampling.
        top_p: Nucleus-sampling threshold; only forwarded when sampling.
        num_beams: 1 selects sampling, 2 or more selects beam search.
        repetition_penalty: Repetition penalty forwarded to ``generate``.

    Returns:
        None. All interaction happens through ``input`` and ``print``.
    """
    conversation_history = ""
    model.eval()

    # Single-beam decoding samples; multi-beam decoding uses beam search.
    use_sampling = num_beams == 1
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "num_beams": num_beams,
        "repetition_penalty": repetition_penalty,
        "do_sample": use_sampling,
        "eos_token_id": tokenizer.eos_token_id,
        "pad_token_id": tokenizer.pad_token_id,
    }
    if use_sampling:
        # temperature/top_p are sampling-only knobs, so beam search is not
        # handed settings it would ignore.
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = top_p

    while True:
        try:
            user_message = input("User: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt at the prompt ends the session
            # cleanly instead of surfacing a traceback.
            print("Exiting chat...")
            break

        command = user_message.strip()
        if command.lower() in ("exit", "quit"):
            print("Exiting chat...")
            break
        if not command:
            # Nothing was typed; do not spend a generation on an empty turn.
            continue

        # Update conversation history
        conversation_history += f"\nUser: {user_message}\nAssistant: "

        # Tokenize the full transcript and move it to the model's device
        inputs = tokenizer(
            conversation_history,
            return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():
            generated_outputs = model.generate(**inputs, **generation_kwargs)

        # The output contains the prompt followed by the continuation;
        # keep only the tokens after the prompt.
        prompt_length = inputs["input_ids"].shape[1]
        new_tokens = generated_outputs[0][prompt_length:]
        assistant_reply = tokenizer.decode(new_tokens, skip_special_tokens=True)

        # Add to conversation history
        conversation_history += assistant_reply
        print(f"Assistant: {assistant_reply}\n")



# Start the interactive session: sampling (single beam) with a lower
# temperature/top_p than the function defaults and a repetition penalty of 1.2.
run_multi_turn_chat(
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    temperature=0.5,
    top_p=0.8,
    num_beams=1,
    repetition_penalty=1.2,
)
```

        TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

  from .autonotebook import tqdm as notebook_tqdm


[INFO] Loading base model in 4bit if QLoRA is True...


Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.11it/s]


[INFO] Attaching LoRA adapter from output dir...
[INFO] Model and tokenizer loaded.
Assistant:  Based on the provided text, several indicators suggest that the individual may be experiencing anxiety rather than depression:

1. **Concern about Safety**: The mention of feeling "a little uncomfortable" when entering their personal financial details online indicates an awareness of potential risks or threats (in this case, identity theft). This concern suggests heightened vigilance, which is often associated with anxiety.

2. **Doubtfulness**: The phrase "Am I being too doubtful?" reflects self-reflection regarding one’s own emotional state. While uncertainty can accompany both anxiety and depressive states, excessive worry about safety and judgmental thoughts are more characteristic of anxiety disorders.

3.

Assistant:  To determine whether the described scenario indicates depression, we need to consider the context and specific behaviors presented by the patient. However, without explic