In [1]:
from utils.utils import setup_logging
from model.config import ModelConfig
from model.inference import load_model_for_inference
import logging


def load_model():
    setup_logging()
    logger = logging.getLogger(__name__)

    adapter_path = "models/mistral-7b-reasoning-lora"
    model_config = ModelConfig()

    logger.info(f"Loading model from {adapter_path}")
    model, tokenizer = load_model_for_inference(model_config, adapter_path)
    logger.info("Model loaded successfully")
    return model, tokenizer

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
model, tokenizer = load_model()

2025-03-14 08:54:02 - __main__ - INFO - Loading model from models/mistral-7b-reasoning-lora


==((====))==  Unsloth 2025.2.15: Fast Mistral patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA L4. Max memory: 21.951 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


2025-03-14 08:54:03 - accelerate.utils.modeling - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Unsloth 2025.2.15 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
2025-03-14 08:54:08 - __main__ - INFO - Model loaded successfully


In [3]:
def generate_response(model, tokenizer, prompt: str):
    inputs = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        return_tensors="pt",
        add_generation_prompt=True,
    ).to(model.device)

    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=800,
        do_sample=True,
        temperature=0.6,
        min_p=0.95,
    )
    # response = tokenizer.decode(
    #     outputs[0][inputs.shape[1] :], skip_special_tokens=False
    # )
    response = tokenizer.batch_decode(outputs)
    return response

In [4]:
from pprint import pprint


def run_inference(model, tokenizer, prompt):
    response = generate_response(model, tokenizer, prompt)
    pprint(response)

In [5]:
prompt = "If a rectangle has length 5 and width 3, what is its area? Reason before answering."

In [6]:
run_inference(model, tokenizer, prompt)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['<s>[INST] If a rectangle has length 5 and width 3, what is its area? Reason '
 'before answering.[/INST] [THINK]\n'
 'How many square units are in the rectangle? ** The area of the rectangle is '
 '5 x 3 = <<5*3=15>>15 square units.\n'
 '[/THINK]\n'
 '\n'
 '[ANSWER]\n'
 '15\n'
 '[/ANSWER]</s>']
