# Qwen2.5-Math-1.5B Inference
This notebook demonstrates how to run inference with a fine-tuned Qwen2.5-Math-1.5B model using vLLM.
The model was fine-tuned with Unsloth on a Kaggle T4 GPU.
You can find the training notebook [here](https://www.kaggle.com/code/mehedi457/train-qwen2-5-math-1-5b-map-unsloth).

**References:**
* https://www.kaggle.com/code/aleaiest/lb-0-945-qwen2-5-32b-gptq
* https://www.kaggle.com/code/cdeotte/gemma2-9b-it-cv-0-945

In [1]:
import os

# Set in the first cell, ahead of the cell that imports vLLM.
# VLLM_USE_V1=0 selects the V0 engine (the engine log below reports "V0 LLM engine");
# CUDA_VISIBLE_DEVICES exposes GPU indices 0 and 1 to this process.
os.environ.update({
    "VLLM_USE_V1": "0",
    "CUDA_VISIBLE_DEVICES": "0,1",
})

In [2]:
import os
import pandas as pd
import torch
import vllm
from datasets import Dataset
from vllm.lora.request import LoRARequest
import argparse

# Directory of the base model; passed as the model argument to vllm.LLM.
MODEL_DIR = '/kaggle/input/unsloth-qwen25-math-1b-bnb-4bit'
# LoRA adapter checkpoint; passed to LoRARequest when generating.
LORA_DIR = "/kaggle/input/qwen2-5-math-1-5b-map-lora/ver_1/checkpoint-4587"

In [3]:
from sklearn.preprocessing import LabelEncoder

# Load the labelled training data and build one combined "Category:Misconception" target.
train = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/train.csv')
train['Misconception'] = train['Misconception'].fillna('NA')  # missing misconception -> 'NA'
train['target'] = train['Category'] + ":" + train['Misconception']

# Encode the combined target as integer labels. The fitted encoder is reused
# later to turn predicted class indices back into target strings.
le = LabelEncoder()
train['label'] = le.fit_transform(train['target'])
target_classes = le.classes_
n_classes = len(target_classes)

print(f"Train shape: {train.shape} with {n_classes} target classes")
train.head()

Train shape: (36696, 9) with 65 target classes


Unnamed: 0,row_id,QuestionId,QuestionText,MC_Answer,StudentExplanation,Category,Misconception,target,label
0,0,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),0ne third is equal to tree nineth,True_Correct,,True_Correct:NA,37
1,1,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 / 3 because 6 over 9 is 2 thirds and 1 third...,True_Correct,,True_Correct:NA,37
2,2,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),"1 3rd is half of 3 6th, so it is simplee to un...",True_Neither,,True_Neither:NA,64
3,3,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 goes into everything and 3 goes into nine,True_Neither,,True_Neither:NA,64
4,4,31772,What fraction of the shape is not shaded? Give...,\( \frac{1}{3} \),1 out of every 3 isn't coloured,True_Correct,,True_Correct:NA,37


In [4]:
# Derive an answer key (one MC_Answer per QuestionId) from the training labels.
# Rows whose Category prefix (the part before the first '_') is 'True' are the
# ones used as evidence for the correct answer.
is_true_category = train['Category'].str.split('_').str[0].eq('True')
correct = train.loc[is_true_category].copy()

# Count occurrences of each (QuestionId, MC_Answer) pair among those rows and
# keep the most frequent answer for every question.
correct['c'] = correct.groupby(['QuestionId','MC_Answer']).MC_Answer.transform('count')
correct = correct.sort_values('c',ascending=False)
correct = correct.drop_duplicates(['QuestionId'])
# .copy() so the column assignment below acts on an independent frame, not a slice.
correct = correct[['QuestionId','MC_Answer']].copy()
correct['is_correct'] = 1

# Drop any `is_correct` column left over from a previous run of this cell;
# otherwise the merge would produce `is_correct_x` / `is_correct_y` and the
# fillna below would fail. On a fresh kernel this is a no-op.
train = train.drop(columns=['is_correct'], errors='ignore')
train = train.merge(correct, on=['QuestionId','MC_Answer'], how='left')
# Rows that did not match the answer key get 0.
train['is_correct'] = train['is_correct'].fillna(0)

In [5]:
# Load the test set and attach the answer key derived from the training data.
test = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/test.csv')
test = test.merge(correct, on=['QuestionId','MC_Answer'], how='left')
# Rows that did not match the answer key get 0.
test['is_correct'] = test['is_correct'].fillna(0)
# The prompt uses a textual flag: 1 -> "yes", anything else -> "no".
# Vectorised comparison instead of a row-wise apply; it also yields a Series
# (not a DataFrame) when the frame is empty.
test['is_correct'] = test['is_correct'].eq(1).map({True: "yes", False: "no"})

In [6]:
# Engine settings, collected in one place so they are easy to scan and tune.
engine_options = dict(
    tensor_parallel_size=1,        # run on a single GPU
    gpu_memory_utilization=0.95,
    trust_remote_code=True,
    dtype="half",
    enforce_eager=True,
    max_model_len=2048,
    disable_log_stats=True,
    enable_prefix_caching=True,    # presumably useful because every prompt shares the same system prompt
    enable_lora=True,              # a LoRARequest is passed to generate() later on
)

llm = vllm.LLM(MODEL_DIR, **engine_options)
tokenizer = llm.get_tokenizer()

2025-08-11 09:33:20.958891: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754904801.168322      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754904801.227419      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


INFO 08-11 09:33:34 [__init__.py:235] Automatically detected platform cuda.
INFO 08-11 09:33:51 [config.py:1604] Using max model len 2048
INFO 08-11 09:33:52 [llm_engine.py:228] Initializing a V0 LLM engine (v0.10.0) with config: model='/kaggle/input/unsloth-qwen25-math-1b-bnb-4bit', speculative_config=None, tokenizer='/kaggle/input/unsloth-qwen25-math-1b-bnb-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='xgrammar', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics

[W811 09:34:04.801738989 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3


INFO 08-11 09:34:14 [parallel_state.py:1102] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 08-11 09:34:14 [model_runner.py:1083] Starting to load model /kaggle/input/unsloth-qwen25-math-1b-bnb-4bit...


[W811 09:34:14.812633618 socket.cpp:200] [c10d] The hostname of the client socket cannot be retrieved. err=-3


INFO 08-11 09:34:15 [bitsandbytes_loader.py:733] Loading weights with BitsAndBytes quantization. May take a while ...


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 08-11 09:34:23 [logger.py:65] Using PunicaWrapperGPU.
INFO 08-11 09:34:24 [model_runner.py:1115] Model loading took 1.1808 GiB and 8.293155 seconds
INFO 08-11 09:34:33 [worker.py:295] Memory profiling takes 8.82 seconds
INFO 08-11 09:34:33 [worker.py:295] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.95) = 14.00GiB
INFO 08-11 09:34:33 [worker.py:295] model weights take 1.18GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 1.39GiB; the rest of the memory reserved for KV Cache is 11.38GiB.
INFO 08-11 09:34:33 [executor_base.py:113] # cuda blocks: 26643, # CPU blocks: 9362
INFO 08-11 09:34:33 [executor_base.py:118] Maximum concurrency for 2048 tokens per request: 208.15x
INFO 08-11 09:34:40 [llm_engine.py:424] init engine (profile, create kv cache, warmup model) took 15.84 seconds


In [7]:
# Replace the tokenizer's chat template with a minimal one: every system / user /
# assistant / tool message is wrapped as "<|im_start|>{role}\n{content}<|im_end|>".
# Messages with any other role produce no output.
# NOTE(review): the template never references `add_generation_prompt`, so passing
# that flag to apply_chat_template does not appear to append an assistant header.
# Confirm this matches the prompt format the adapter was trained with before changing it.
tokenizer.chat_template = """{% for message in messages %}
{% if message['role'] == 'system' %}
<|im_start|>system
{{ message['content'] }}<|im_end|>
{% elif message['role'] == 'user' %}
<|im_start|>user
{{ message['content'] }}<|im_end|>
{% elif message['role'] == 'assistant' %}
<|im_start|>assistant
{{ message['content'] }}<|im_end|>
{% elif message['role'] == 'tool' %}
<|im_start|>tool
{{ message['content'] }}<|im_end|>
{% endif %}
{% endfor %}"""


In [8]:
# Single-character stand-ins for the target classes: the symbol at position i is
# paired with le.classes_[i] when the system prompt is built, and the model is
# constrained to answer with one of these symbols.
# The order matters, since predictions are mapped back to classes by position.
special_character_list = [
    '■', '□', '▲', '△', '▼', '▽', '◆', '◇', '○', '●', '★', '☆', '♦', '♥', '♠', '♣',
    '§', '†', '‡', '※', '∞', '±', '≠', '≈', '√', '∑', '∏', '∆', 'Ω', 'μ', '∂', '→',
    '←', '↑', '↓', '↔', '↕', '〈', '〉', '『', '』', '│', '─', '┌', '┐', '└', '┘', '┼',
    '█', '▓', '▒', '£', '¥', '€', '₩', '©', '®', '™', '♪', '♫', '☀', '☁', '☂', '☃', '☎'
]

In [9]:
from transformers import LogitsProcessor

class LabelOnlyLogitsProcessor(LogitsProcessor):
    """Restrict generation to a fixed set of token ids.

    Every position outside ``allowed_token_ids`` has ``-inf`` added to its
    score; the allowed positions keep their original scores.
    """

    def __init__(self, allowed_token_ids):
        """Remember the token ids that must stay selectable."""
        self.allowed_token_ids = allowed_token_ids

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
        """Return ``scores`` with every non-allowed position pushed to ``-inf``.

        ``scores`` must be 1-D or 2-D, with token ids indexing the last axis;
        any other rank raises ``ValueError``. ``input_ids`` is accepted but
        not used.
        """
        if scores.dim() not in (1, 2):
            raise ValueError("Unexpected score dimensions")

        # Start with everything blocked, then re-open the allowed ids on the last axis.
        penalty = torch.full_like(scores, float('-inf'))
        penalty[..., self.allowed_token_ids] = 0
        return scores + penalty


# Resolve every label symbol to the id of the first token it encodes to.
# NOTE(review): only the first token is kept, so a symbol that encodes to several
# tokens would be represented by a partial piece — confirm each symbol is a single
# token for this tokenizer.
allowed_token_ids = []
for symbol in special_character_list:
    symbol_token_ids = tokenizer.encode(symbol, add_special_tokens=False)
    allowed_token_ids.append(symbol_token_ids[0])

In [10]:
# Pair the i-th symbol with the i-th encoded class, e.g. "■: False_Correct:NA".
class_mappings = [
    f"{special_character_list[position]}: {class_name}"
    for position, class_name in enumerate(le.classes_)
]
classification_list = ', '.join(class_mappings)

# System prompt shared by every request; it lists all symbol -> class pairs.
SYS_PROMPT = f"""You are an expert at analyzing math student responses. Your task is to classify the student's explanation into one of the following Category:Misconception classes.

Respond with ONLY the single character corresponding to the correct classification.

Available classifications:
{classification_list}

Analyze the given input and provide your classification.
"""

In [11]:
def create_inference_prompt(row):
    """Render one test row as a chat-formatted prompt string.

    Reads the ``QuestionText``, ``MC_Answer``, ``is_correct`` and
    ``StudentExplanation`` fields of ``row`` and returns the untokenized text
    produced by the tokenizer's chat template for a system + user exchange.
    """
    user_lines = [
        f"Question: {row['QuestionText']}",
        f"Answer: {row['MC_Answer']}",
        f"Correct? {row['is_correct']}",
        f"Student Explanation: {row['StudentExplanation']}",
    ]
    # Only the system and user turns are supplied; the assistant turn is left
    # for the model to generate.
    conversation = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": "\n".join(user_lines)},
    ]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )


# Build one prompt string per test row.
print("Formatting prompts for inference...")
test_input = test.apply(create_inference_prompt, axis="columns")

Formatting prompts for inference...


In [12]:
# Show the prompt for index label 0 as a visual check of the formatting.
test_input[0]

"<|im_start|>system\nYou are an expert at analyzing math student responses. Your task is to classify the student's explanation into one of the following Category:Misconception classes.\n\nRespond with ONLY the single character corresponding to the correct classification.\n\nAvailable classifications:\n■: False_Correct:NA, □: False_Misconception:Adding_across, ▲: False_Misconception:Adding_terms, △: False_Misconception:Additive, ▼: False_Misconception:Base_rate, ▽: False_Misconception:Certainty, ◆: False_Misconception:Definition, ◇: False_Misconception:Denominator-only_change, ○: False_Misconception:Division, ●: False_Misconception:Duplication, ★: False_Misconception:Firstterm, ☆: False_Misconception:FlipChange, ♦: False_Misconception:Ignores_zeroes, ♥: False_Misconception:Incomplete, ♠: False_Misconception:Incorrect_equivalent_fraction_addition, ♣: False_Misconception:Interior, §: False_Misconception:Inverse_operation, †: False_Misconception:Inversion, ‡: False_Misconception:Irrelevant

In [13]:
from vllm import SamplingParams

# Greedy, single-token decoding restricted to the label symbols; the top
# log-probabilities of that one token are requested so several candidate
# classes can be read back per prompt.
label_only_processor = LabelOnlyLogitsProcessor(allowed_token_ids)
sampling_params = SamplingParams(
    temperature=0,
    max_tokens=1,
    logprobs=8,
    stop=["\n", "."],
    logits_processors=[label_only_processor],
)

# Generate with the fine-tuned LoRA adapter applied to the base model.
lora_adapter = LoRARequest("default", 1, LORA_DIR)
outputs = llm.generate(test_input, sampling_params, lora_request=lora_adapter)

Adding requests:   0%|          | 0/3 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/3 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

In [14]:
# Reverse lookup: label symbol -> its position in special_character_list.
special_to_idx = dict(zip(special_character_list, range(len(special_character_list))))

In [15]:
# Resolve predictions by token id instead of decoding tokens back to text and
# looking characters up: the i-th entry of allowed_token_ids belongs to class i.
token_id_to_class_idx = {tok_id: cls_idx for cls_idx, tok_id in enumerate(allowed_token_ids)}

TOP_K = 3  # number of candidate classes kept per row

sampled_pred_ids = []
for out in outputs:
    # Log-probabilities reported for the single generated position (token id -> Logprob).
    first_step_logprobs = out.outputs[0].logprobs[0]
    # Rank explicitly by log-probability rather than relying on dict insertion order.
    ranked = sorted(first_step_logprobs.items(), key=lambda kv: kv[1].logprob, reverse=True)
    # Keep only ids that correspond to a label symbol, then take the best TOP_K.
    class_ids = [token_id_to_class_idx[tok_id] for tok_id, _ in ranked if tok_id in token_id_to_class_idx]
    sampled_pred_ids.append(class_ids[:TOP_K])

# Symbol string per row, kept for inspection.
sampled_preds = ["".join(special_character_list[i] for i in ids) for ids in sampled_pred_ids]
# Class indices -> "Category:Misconception" strings.
last_sampled_pred = [le.inverse_transform(ids) for ids in sampled_pred_ids]

In [16]:
# Collapse each row's predicted labels into one space-separated string.
joined_preds = list(map(" ".join, last_sampled_pred))

# Write the submission file: one row per test row_id.
sub = pd.DataFrame({
    "row_id": test["row_id"].values,
    "Category:Misconception": joined_preds,
})
sub.to_csv("submission.csv", index=False)
sub.head()

Unnamed: 0,row_id,Category:Misconception
0,36696,True_Correct:NA True_Neither:NA True_Misconcep...
1,36697,False_Misconception:WNB False_Neither:NA False...
2,36698,True_Neither:NA True_Correct:NA True_Misconcep...
