In [1]:
import requests
import json
from typing import List, Dict, Any

def get_logprobs(prompt: str, vllm_url: str = "http://localhost:8000/v1/completions") -> Dict[str, Any]:
    """
    Get logprobs for an input prompt without generation.
    
    Args:
        prompt: The input text to get logprobs for
        vllm_url: URL of vLLM completions endpoint
        
    Returns:
        Dictionary containing logprobs information
    """
    payload = {
        "model": "Qwen/Qwen3-8b",
        "prompt": prompt,
        "max_tokens": 1,  # Minimal generation
        "temperature": 0.0,
        "logprobs": False,  # Request logprobs
        "echo": True,  # Echo the prompt tokens with their logprobs
        "stream": False
    }
    
    headers = {"Content-Type": "application/json"}
    
    try:
        response = requests.post(vllm_url, json=payload, headers=headers)
        response.raise_for_status()
        result = response.json()

        log_probs = []
        for token_dict in result['choices'][0]['prompt_logprobs'][1:]:
            for _, logprob_dict in token_dict.items():
                if (len(token_dict) == 1) or (logprob_dict['rank'] != 1):
                    log_probs.append(logprob_dict['logprob'])
        
        return log_probs
        
    except Exception as e:
        raise Exception(f"Failed to get logprobs: {e}")



In [4]:
# Example usage
prompt = "The capital of Spain is"
logprobs = get_logprobs(prompt)

# Print the logprobs information



In [5]:
logprobs

[-9.782947540283203,
 -0.5661647915840149,
 -6.395083904266357,
 -0.6938387155532837]

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the Skywork Reward model
model_name = "Skywork/Skywork-Reward-V2-Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

print(f"Loaded {model_name}")
print(f"Model device: {next(model.parameters()).device}")


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/500 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

Loaded Skywork/Skywork-Reward-V2-Qwen3-0.6B
Model device: cpu


In [7]:
model.to('cuda')

Qwen3ForSequenceClassification(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024, padding_idx=151654)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e

In [25]:
# Test the reward model with two different responses
prompt = "actually return the function in  C++"
response1 = """beginner looking for a simple implementation. Let me provide a clear function for computing factorial.

Here's a simple function to compute the factorial of a number:

```
int factorial(int n) {
    int result = 1;
    for (int i = 1; i <= n; i++) {
        result *= i;
    }
    return result;
}
"""
response2 = """ beginner, so I should explain the steps clearly. Let's start with the the code since they asked for it. I need to write a function that takes an integer and returns its factorial. I'll use a loop to multiply from 1 to n. Also, handle the case where n is 0, since 0! is 1. Make sure to include necessary headers and a main function for testing. Let me structure the code properly with comments. Alright, that should cover it."""

conv1 = [{"role": "user", "content": prompt}, {"role": "assistant", "content": response1}]
conv2 = [{"role": "user", "content": prompt}, {"role": "assistant", "content": response2}]

# Format and tokenize the conversations
conv1_formatted = tokenizer.apply_chat_template(conv1, tokenize=False)
conv2_formatted = tokenizer.apply_chat_template(conv2, tokenize=False)
# These two lines remove the potential duplicate bos token
if tokenizer.bos_token is not None and conv1_formatted.startswith(tokenizer.bos_token):
    conv1_formatted = conv1_formatted[len(tokenizer.bos_token):]
if tokenizer.bos_token is not None and conv2_formatted.startswith(tokenizer.bos_token):
    conv2_formatted = conv2_formatted[len(tokenizer.bos_token):]
conv1_tokenized = tokenizer(conv1_formatted, return_tensors="pt").to('cuda')
conv2_tokenized = tokenizer(conv2_formatted, return_tensors="pt").to('cuda')

# Get the reward scores
with torch.no_grad():
    score1 = model(**conv1_tokenized).logits[0][0].item()
    score2 = model(**conv2_tokenized).logits[0][0].item()
print(f"Score for response 1: {score1}")
print(f"Score for response 2: {score2}")


Score for response 1: -3.759751319885254
Score for response 2: -4.643280506134033


In [None]:
 model(**conv1_tokenized).

SequenceClassifierOutputWithPast(loss=None, logits=tensor([[-3.7598]], device='cuda:0', grad_fn=<IndexBackward0>), past_key_values=None, hidden_states=None, attentions=None)