In [1]:
from utils.utils import setup_logging
from model.config import ModelConfig
from model.inference import load_model_for_inference
import logging


def load_model(adapter_path: str = "models/mistral-7b-reasoning-lora"):
    """Load the model and tokenizer used for inference in this notebook.

    Configures logging through ``setup_logging``, builds a default
    ``ModelConfig`` and delegates the actual loading to
    ``load_model_for_inference``.

    Args:
        adapter_path: Path handed to ``load_model_for_inference`` as its
            second argument. Defaults to the location that used to be
            hard-coded here, so existing ``load_model()`` calls behave
            exactly as before.

    Returns:
        The ``(model, tokenizer)`` pair produced by
        ``load_model_for_inference``.
    """
    setup_logging()
    logger = logging.getLogger(__name__)

    model_config = ModelConfig()

    # Lazy %-style formatting: the message is only built if INFO is enabled.
    logger.info("Loading model from %s", adapter_path)
    model, tokenizer = load_model_for_inference(model_config, adapter_path)
    logger.info("Model loaded successfully")
    return model, tokenizer

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Load the model and tokenizer once; the inference cells below reuse them.
model, tokenizer = load_model()

2025-03-03 17:42:02 - __main__ - INFO - Loading model from models/mistral-7b-reasoning-lora


==((====))==  Unsloth 2025.2.15: Fast Mistral patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA L4. Max memory: 21.951 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


2025-03-03 17:42:04 - accelerate.utils.modeling - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
2025-03-03 17:42:08 - __main__ - INFO - Model loaded successfully


In [22]:
def generate_response(
    model,
    tokenizer,
    prompt: str,
    max_new_tokens: int = 30000,
    temperature: float = 0.5,
) -> str:
    """Generate a sampled completion for a single user prompt.

    The prompt is wrapped as a one-message chat (role ``"user"``), rendered
    with the tokenizer's chat template, and passed to ``model.generate``
    with sampling enabled. Only the tokens after the prompt are decoded.

    Args:
        model: Object exposing ``.device`` and ``.generate(...)``.
        tokenizer: Object exposing ``.apply_chat_template(...)`` and
            ``.decode(...)``.
        prompt: Text of the user message.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``. Defaults to the previously hard-coded 30000.
        temperature: Sampling temperature, forwarded to ``model.generate``.
            Defaults to the previously hard-coded 0.5.

    Returns:
        The decoded continuation. Special tokens are kept in the text
        (``skip_special_tokens=False``), so an end-of-sequence marker may
        appear at the end.
    """
    messages = [{"role": "user", "content": prompt}]

    # add_generation_prompt asks the chat template to append whatever tokens
    # open an assistant turn; templates that define none ignore the flag.
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
    )

    # Drop the leading prompt positions so only the new tokens are decoded.
    prompt_len = input_ids.shape[1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=False)

In [23]:
from pprint import pprint


def run_inference(model, tokenizer, prompt):
    """Generate a response for ``prompt`` and pretty-print it.

    Delegates generation to ``generate_response`` and writes the result to
    standard output with ``pprint``. Nothing is returned.
    """
    pprint(generate_response(model, tokenizer, prompt))

In [24]:
# Question sent to the model by the inference call.
prompt = "Which number is greater, 9.11 or 9.01?"
# Alternative prompt, kept for manual experimentation:
# prompt = "How many positive integers less than 10,000 have at most two different digits?"

In [25]:
# Generate a response for the current prompt and pretty-print it.
run_inference(model=model, tokenizer=tokenizer, prompt=prompt)

('[THINK]\n'
 'How many tenths are in 9.11? ** 9.11 is 11/10 = <<11/10=1.1>>1.1 tenths '
 'larger than 9.01 How many tenths are in 9.01? ** 9.01 is 1/10 = '
 '<<1/10=0.1>>0.1 tenths smaller than 9.11 How many tenths are in 9.11 and '
 '9.01? ** The difference between 9.11 and 9.01 is 1.1 - 0.1 = <<1.1-0.1=1>>1 '
 'tenths\n'
 '[/THINK]\n'
 '\n'
 '[ANSWER]\n'
 '1\n'
 '[/ANSWER]</s>')
