In [3]:
# Requires transformers>=4.51.0
import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM

def format_instruction(instruction, query, doc):
    """Render one (instruction, query, document) triple as reranker prompt text.

    Args:
        instruction: Task description; when ``None`` a generic web-search
            retrieval instruction is substituted.
        query: The search query, interpolated into the ``<Query>`` field.
        doc: The candidate document, interpolated into the ``<Document>`` field.

    Returns:
        A three-line string with ``<Instruct>``, ``<Query>`` and ``<Document>`` fields.
    """
    task_text = (
        'Given a web search query, retrieve relevant passages that answer the query'
        if instruction is None
        else instruction
    )
    return f"<Instruct>: {task_text}\n<Query>: {query}\n<Document>: {doc}"

def process_inputs(pairs):
    """Tokenize formatted pairs, wrap them in the chat prefix/suffix, pad, and move to the model's device.

    Args:
        pairs: Texts to encode (the strings produced by ``format_instruction``).

    Returns:
        The padded encoding returned by ``tokenizer.pad`` (PyTorch tensors), with
        every entry moved to ``model.device``.
    """
    # Reserve room for the fixed prefix/suffix so the wrapped sequence stays within max_length.
    content_budget = max_length - len(prefix_tokens) - len(suffix_tokens)
    encoded = tokenizer(
        pairs,
        padding=False,
        truncation='longest_first',
        return_attention_mask=False,
        max_length=content_budget,
    )

    # Wrap each token list in place with the pre-tokenized prefix and suffix.
    id_lists = encoded['input_ids']
    id_lists[:] = [prefix_tokens + seq + suffix_tokens for seq in id_lists]

    batch = tokenizer.pad(encoded, padding=True, return_tensors="pt", max_length=max_length)
    for name in batch:
        batch[name] = batch[name].to(model.device)
    return batch

@torch.no_grad()
def compute_logits(inputs, **kwargs):
    """Score each sequence in the batch as the probability of "yes" versus "no".

    Runs the model on ``inputs``, takes the logits at the last position, keeps
    only the ``token_false_id`` / ``token_true_id`` entries, and normalizes over
    those two.

    Args:
        inputs: Keyword arguments for the model call (as built by ``process_inputs``).
        **kwargs: Accepted but not used.

    Returns:
        A list of floats, one per sequence: the probability mass on the "yes" token.
    """
    last_step_logits = model(**inputs).logits[:, -1, :]
    # Column 0 = "no", column 1 = "yes".
    no_yes_logits = torch.stack(
        [last_step_logits[:, token_false_id], last_step_logits[:, token_true_id]],
        dim=1,
    )
    log_probs = torch.nn.functional.log_softmax(no_yes_logits, dim=1)
    yes_probs = log_probs[:, 1].exp()
    return yes_probs.tolist()

# Load the reranker tokenizer with left padding: compute_logits reads the logits
# at the last position, so padding must not sit at the end of a sequence.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Reranker-0.6B", padding_side='left')
# Load the causal-LM checkpoint and switch it to eval mode.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-Reranker-0.6B").eval()

# We recommend enabling flash_attention_2 for better acceleration and memory saving.
# NOTE(review): the commented-out line below loads the 4B checkpoint, not the 0.6B one used above.
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-Reranker-4B", torch_dtype=torch.float16, attn_implementation="flash_attention_2").cuda().eval()

# Vocabulary ids of the "no" / "yes" tokens whose logits compute_logits compares.
token_false_id = tokenizer.convert_tokens_to_ids("no")
token_true_id = tokenizer.convert_tokens_to_ids("yes")
# Total token budget per sequence; process_inputs subtracts the prefix/suffix lengths from it.
max_length = 8192

# Chat-template text placed before and after every formatted pair. The suffix
# opens the assistant turn so the next token is the model's answer.
prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
# Tokenize once, without special tokens, so process_inputs can concatenate the ids directly.
prefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)
suffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)
        
# Retrieval instruction shared by every query/document pair below.
task = 'Given a web search query, retrieve relevant passages that answer the query'

queries = [
    "What is the capital of China?",
    "Explain gravity",
]

# documents[i] answers queries[i].
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
]
# documents_neg[i] is unrelated to queries[i].
documents_neg = [
    "The capital of France is Paris.",
    "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods from carbon dioxide and water.",
]

# Build the prompt text for the matching and the non-matching pairs.
pairs = [format_instruction(task, q, d) for q, d in zip(queries, documents)]
anti_pairs = [format_instruction(task, q, d) for q, d in zip(queries, documents_neg)]

# Tokenize and score the matching pairs, then the non-matching ones.
inputs = process_inputs(pairs)
scores = compute_logits(inputs)
anti_inputs = process_inputs(anti_pairs)
anti_scores = compute_logits(anti_inputs)

print("scores: ", scores)
print("anti_scores: ", anti_scores)

You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


scores:  [0.9994982481002808, 0.9993619322776794]
anti_scores:  [0.00014956131053622812, 0.00020937531371600926]


In [None]:
import os

import dspy
from llama_index.core.agent.workflow import FunctionAgent
from llama_index.llms.google_genai import GoogleGenAI


# Read the Gemini API key from the environment instead of embedding it in the
# notebook: a key stored in source is exposed to everyone who can read the file.
# NOTE(review): the key that was previously hardcoded here should be treated as
# compromised and revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Define it in the environment before running this cell."
    )

# Make Gemini the default language model for DSPy modules.
lm = dspy.LM('gemini/gemini-2.5-pro-preview-03-25', api_key=GOOGLE_API_KEY)
dspy.configure(lm=lm)

# Chain-of-thought predictor with a `question` input and an `answer` output.
qa = dspy.ChainOfThought('question -> answer')

# NOTE(review): this function is passed to the agent as a tool; its docstring is
# presumably surfaced to the LLM as the tool description, so keep the wording stable.
def answer_question(question: str) -> str:
    """Answers a question based on general knowledge."""
    # Delegate to the module-level predictor and return only its `answer` field.
    return qa(question=question).answer

def calculate(a: float, b: float, operator: str) -> float:
    """Simple calculus on two numbers a and b. Supports add, subtract, multiply and divide. The operator is only allowed to be one of the following: +, -, *, /. For instance, useful to estimate the development of a disease over time."""
    match operator:
        case "+":
            return f"The sum of {a} and {b} is {a+b}."
        case "-":
            return f"Subtracting {a} and {b} (a-b) is {a-b}."
        case "*":
            return f"Multiplying {a} and {b} is {a*b}."
        case "/":
            return f"The ratio between {a} and {b} is {a/b}."
        case _:
            return "Invalid operator. Please use one of the following: +, -, *, /."

# LLM client that drives the agent, authenticated with GOOGLE_API_KEY.
llm = GoogleGenAI(api_key=GOOGLE_API_KEY, model="gemini-2.5-pro-preview-03-25")

# Hand the plain Python functions to the agent as its tools.
agent = FunctionAgent(llm=llm, tools=[answer_question, calculate])
from llama_index.core.agent.workflow import ToolCallResult
async def run_agent_verbose(query: str):
    """Run the agent on ``query``, printing every tool call result as events stream in.

    Args:
        query: The user request passed to ``agent.run``.

    Returns:
        The value obtained by awaiting the run handler after its events have been consumed.
    """
    run = agent.run(query)
    async for evt in run.stream_events():
        # Only tool results are reported; every other event type is skipped.
        if not isinstance(evt, ToolCallResult):
            continue
        print(
            f"Called tool {evt.tool_name} with args {evt.tool_kwargs}\nGot result: {evt.tool_output}"
        )

    return await run


result = await run_agent_verbose("Calculate 10 + 5")
result2 = await run_agent_verbose("What is the capital of France?")
print(result)
print(result2)

Called tool calculate with args {'b': 5, 'operator': '+', 'a': 10}
Got result: The sum of 10 and 5 is 15.
Called tool answer_question with args {'question': 'What is the capital of France?'}
Got result: Paris
15

Paris

