# Better Way

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer,pipeline,LogitsProcessor
import torch
import torch
import torchaudio
import torchgen
import torchvision
# import bitsandbytes
import accelerate
from torch.nn import functional as F
from torch.nn.functional import softmax
import warnings

# To ignore all warnings:
warnings.filterwarnings("ignore")

# Notebook: Testing Ollama Functionality

**Objective.** Explore Llama 3.2B model functionality provided through HuggingFace
<!-- **Data.** <sources, date range, N obs, key filters>   -->
<!-- **Method.** <models/algorithms + key hyperparams>   -->


<!-- **Takeaways.** <what to do with the result>  
**Limitations.** <assumptions, data gaps, caveats>  
**Repro.** `python -m pip install -r requirements.txt`  
`papermill this.ipynb out.ipynb -p seed 42 -p start 2018-01-01 -p end 2025-08-01` -->




In [2]:
# # print(tokenizer.chat_template)

# def get_probability_distribution(prompt):
#     input_ids = tokenizer.encode(prompt, return_tensors="pt").to("mps")
#     with torch.no_grad():
#         outputs = model(input_ids)
#         logits = outputs.logits[:, -1, :]
#         probabilities = F.softmax(logits, dim=-1)
#     return {"probabilities": probabilities, "input_ids": input_ids}

# PipeLine Text generation, Forcing answers using chat templates

In [3]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

pipe = pipeline(
    "text-generation",
    tokenizer= tokenizer,
    model="meta-llama/Llama-3.2-3B-Instruct",
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="mps",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
messages = [
        {"role": "system", "content": "You are a probability expert who always gives estimate of events"},
        {"role": "user", "content": "What is the probability of having an economic recession in the United States this year (2025)?Just give me an estimate and nothing else with no additional text."}
    ]



In [6]:
prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False, # Return a string
        add_generation_prompt=True 
)

In [7]:
outputs = pipe(
        prompt,
        max_new_tokens=100,
        do_sample=False, # Need for temperature = 0.0
        temperature=0.0,
        # top_p=1.0,
    )

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [8]:
# 7. Print the generated text

if outputs and isinstance(outputs, list) and 'generated_text' in outputs[0]:

    full_text = outputs[0]['generated_text']
    

    # For Llama 3 Instruct, the response starts after "<|start_header_id|>assistant<|end_header_id|>\n\n"
    response_start_marker = "<|start_header_id|>assistant<|end_header_id|>\n\n"
    response_start_index = full_text.rfind(response_start_marker)

    if response_start_index != -1:
        response = full_text[response_start_index + len(response_start_marker):].strip()
        print("Generated Response:")
        print(response)
    else:
      
        if full_text.startswith(prompt):
             response = full_text[len(prompt):].strip()
             print("Generated Response (fallback extraction):")
             print(response)
        else:
             print("Generated Text (could not isolate response):")
             print(full_text) # Print the whole thing 
else:
    print("No output or unexpected output format:", outputs)

Generated Response:
20-30%


# Complement Question

In [9]:
messages = [
        {"role": "system", "content": "You are a probability expert who always gives estimate of events"},
        {"role": "user", "content": "What is the probability of not having an economic recession in the United States this year (2025)?Just give me an estimate and nothing else with no additional text."}
    ]

prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False, # Return a string
        add_generation_prompt=True 
)

outputs = pipe(
        prompt,
        max_new_tokens=100,
        do_sample=False, # Need for temperature = 0.0
        temperature=0.0,
        # top_p=1.0,
    )




Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [10]:

if outputs and isinstance(outputs, list) and 'generated_text' in outputs[0]:

    full_text = outputs[0]['generated_text']
    

    # For Llama 3 Instruct, the response starts after "<|start_header_id|>assistant<|end_header_id|>\n\n"
    response_start_marker = "<|start_header_id|>assistant<|end_header_id|>\n\n"
    response_start_index = full_text.rfind(response_start_marker)

    if response_start_index != -1:
        response = full_text[response_start_index + len(response_start_marker):].strip()
        print("Generated Response:")
        print(response)
    else:
      
        if full_text.startswith(prompt):
             response = full_text[len(prompt):].strip()
             print("Generated Response (fallback extraction):")
             print(response)
        else:
             print("Generated Text (could not isolate response):")
             print(full_text) # Print the whole thing 
else:
    print("No output or unexpected output format:", outputs)

Generated Response:
20-30%


# Prompt Sensitivity

In [11]:
messages = [
    # Added "probabilistic" to the system message
        {"role": "system", "content": "You are a probability expert who only answers in terms of probabilty estimates, with no additional text."},
        {"role": "user", "content": "What is the probability of an economic recession in the United States this year (2025)?"}
    ]

In [12]:

prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False, # Return a string
        add_generation_prompt=True 
)


outputs = pipe(
        prompt,
        max_new_tokens=100,
        do_sample=False, # Need for temperature = 0.0
        temperature=0.0,
        # top_p=1.0,
    )



# 7. Print the generated text

if outputs and isinstance(outputs, list) and 'generated_text' in outputs[0]:

    full_text = outputs[0]['generated_text']
    

    # For Llama 3 Instruct, the response starts after "<|start_header_id|>assistant<|end_header_id|>\n\n"
    response_start_marker = "<|start_header_id|>assistant<|end_header_id|>\n\n"
    response_start_index = full_text.rfind(response_start_marker)

    if response_start_index != -1:
        response = full_text[response_start_index + len(response_start_marker):].strip()
        print("Generated Response:")
        print(response)
    else:
      
        if full_text.startswith(prompt):
             response = full_text[len(prompt):].strip()
             print("Generated Response (fallback extraction):")
             print(response)
        else:
             print("Generated Text (could not isolate response):")
             print(full_text) # Print the whole thing if unsure
else:
    print("No output or unexpected output format:", outputs)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Response:
0.35%


# Testing different questions

In [13]:
questions = [
    "What is the probability of an economic recession in the United States this year (2025)?",
    "How likely is it that interest rates will be cut by the Federal Reserve in 2025?",
    "What are the chances of a major stock market correction in 2025?",
    "What is the probability of inflation falling below 2% in the US in 2025?",
    "How likely is it that unemployment will rise above 6% this year?"
]

# Format all prompts using chat template
prompts = []
for question in questions:
    messages = [
        {"role": "system", "content": "You are a probability expert who only answers in terms of probabilty estimates, with no additional text."},
        {"role": "user", "content": question}
    ]
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    prompts.append(formatted_prompt)




In [14]:
# Generate responses in batch
outputs = pipe(
    prompts,
    max_new_tokens=100,
    do_sample=False,
    temperature=0.0,
)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [15]:

# Extract and print responses
response_start_marker = "<|start_header_id|>assistant<|end_header_id|>\n\n"

print("\nGenerated Responses:\n")
for idx, output in enumerate(outputs):
    full_text = output[0]['generated_text']
    response_start_index = full_text.rfind(response_start_marker)

    print(f"Q{idx+1}: {questions[idx]}")
    if response_start_index != -1:
        response = full_text[response_start_index + len(response_start_marker):].strip()
        print(f"A{idx+1}: {response}\n")
    else:
        # Fallback
        if full_text.startswith(prompts[idx]):
            response = full_text[len(prompts[idx]):].strip()
            print(f"A{idx+1}: {response} (extracted using fallback)\n")
        else:
            print(f"A{idx+1}: Could not parse response. Raw output:\n{full_text}\n")


Generated Responses:

Q1: What is the probability of an economic recession in the United States this year (2025)?
A1: 0.35%

Q2: How likely is it that interest rates will be cut by the Federal Reserve in 2025?
A2: Approximately 34.7%

Q3: What are the chances of a major stock market correction in 2025?
A3: Approximately 34.7%

Q4: What is the probability of inflation falling below 2% in the US in 2025?
A4: I estimate the probability of inflation falling below 2% in the US in 2025 at 12.4%.

Q5: How likely is it that unemployment will rise above 6% this year?
A5: Approximately 34.6%



In [16]:
del pipe

# Use generation to get the Logits

In [17]:
# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "meta-llama/Llama-3.2-3B-Instruct"          # any Llama‑2/3 chat checkpoint works
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
model     = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to("mps")





Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
messages = [
    # Added "probabilistic" to the system message
        {"role": "system", "content": "You are a probability expert who only answers in terms of probabilty estimates, with no additional text."},
        {"role": "user", "content": "What is the probability of having an economic recession in the United States this year (2025)?"}
    ]

#  Format the conversation with the model‑specific chat template
prompt_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,          
    return_tensors="pt"
).to(model.device)

In [19]:
# Generate while asking for the raw scores (logits) at every step
out = model.generate(
    prompt_ids,
    max_new_tokens=40,
    return_dict_in_generate=True,
    output_scores=True,      # gives one logits tensor per generated step
    do_sample=False,
    temperature=0.0,                 
)


#  Convert those logits to token‑wise probabilities
#     compute_transition_scores aligns each score tensor with the token it chose
log_probs = model.compute_transition_scores(
    out.sequences, out.scores, normalize_logits=True         # True → log p_i, not raw logits
)     


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


In [20]:
   
gen_ids   = out.sequences[0, prompt_ids.shape[-1]:]          # only the new tokens
probs     = log_probs[0].exp()                               # p_i = e^{log p_i}
tokens    = tokenizer.convert_ids_to_tokens(gen_ids)

for tok, p in zip(tokens, probs.tolist()):
    print(f"{tok:>12s}  {p:.4f}")

           I  0.1900
   Ġestimate  0.2939
        Ġthe  0.5681
Ġprobability  0.9912
         Ġof  0.8675
         Ġan  0.4365
   Ġeconomic  0.9982
  Ġrecession  0.9999
         Ġin  0.9962
        Ġthe  0.9998
     ĠUnited  0.8475
     ĠStates  0.9996
       Ġthis  0.7722
       Ġyear  0.9999
          Ġ(  0.8431
         202  0.9984
           5  0.9834
           )  0.9786
         Ġto  0.3735
         Ġbe  0.9997
     Ġaround  0.4796
           Ġ  0.9979
          12  0.0896
           .  0.3700
           4  0.2632
          %.  0.8635
  <|eot_id|>  0.9793


In [21]:
tokenizer.decode(gen_ids)  # decode the generated tokens

'I estimate the probability of an economic recession in the United States this year (2025) to be around 12.4%.<|eot_id|>'

### above logits for each generation setp(5 steps above) is conditioned on previous input+ generated tokens

In [22]:
# one logit tensor per generated step
log_probs.shape

torch.Size([1, 27])

In [23]:
tokenizer.chat_template

'{{- bos_token }}\n{%- if custom_tools is defined %}\n    {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n    {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n    {%- if strftime_now is defined %}\n        {%- set date_string = strftime_now("%d %b %Y") %}\n    {%- else %}\n        {%- set date_string = "26 Jul 2024" %}\n    {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0][\'role\'] == \'system\' %}\n    {%- set system_message = messages[0][\'content\']|trim %}\n    {%- set messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = "" %}\n{%- endif %}\n\n{#- System message #}\n{{- "<|start_header_id|>system<|end_header_id|>\\n\\n" }}\n{%- if tools is not none %}\n    {{- "Environment: ipython\\n" }}\n{%- endif %}\n{{- "Cutting

In [25]:
out.scores

(tensor([[ 3.8594,  6.9062,  5.1875,  ..., -0.8398, -0.8398, -0.8398]],
        device='mps:0'),
 tensor([[ 3.5938,  4.3125,  2.8750,  ..., -2.8438, -2.8438, -2.8438]],
        device='mps:0'),
 tensor([[ 3.6875,  2.2344,  0.0199,  ..., -1.3906, -1.3906, -1.3906]],
        device='mps:0'),
 tensor([[ 0.1982,  0.2773,  0.1855,  ..., -1.1016, -1.1016, -1.1016]],
        device='mps:0'),
 tensor([[ 4.6875,  2.7500,  0.2695,  ..., -0.5742, -0.5742, -0.5781]],
        device='mps:0'),
 tensor([[2.1875, 2.2969, 0.8125,  ..., 1.0703, 1.0703, 1.0703]],
        device='mps:0'),
 tensor([[ 1.2344,  3.9375,  1.6016,  ..., -1.0625, -1.0625, -1.0625]],
        device='mps:0'),
 tensor([[ 2.6719,  4.5312,  1.3516,  ..., -2.4688, -2.4688, -2.4688]],
        device='mps:0'),
 tensor([[ 4.5000,  5.7500,  2.6875,  ..., -0.1484, -0.1484, -0.1484]],
        device='mps:0'),
 tensor([[1.7422, 2.0625, 2.2656,  ..., 1.3828, 1.3828, 1.3828]],
        device='mps:0'),
 tensor([[2.3906, 2.7031, 4.4688,  ..., 0.

In [28]:
top_k = 5
logits_per_step = out.scores  # list of tensors, logits over the whole vocab
# interested_setp  = 4

for step, logits in enumerate(logits_per_step):
    probs = softmax(logits[0], dim=-1)  # Convert logits to probabilities
    top_probs, top_ids = probs.topk(top_k)

    # if step+1 == interested_setp: print(f"\nStep {step+1}:")
    print(f"\nStep {step+1}:")
    for i in range(top_k):
        token = tokenizer.convert_ids_to_tokens(int(top_ids[i]))
        # if step+1 == interested_setp: print(f"  {token:>12s}  P={top_probs[i]:.4f}")
        print(f"  {token:>12s}  P={top_probs[i]:.4f}")


Step 1:
             I  P=0.1900
        Approx  P=0.1676
           Est  P=0.0897
             0  P=0.0792
     Estimated  P=0.0699

Step 2:
     Ġestimate  P=0.2939
          Ġcan  P=0.2593
       Ġcannot  P=0.2289
            'm  P=0.0842
            'd  P=0.0616

Step 3:
          Ġthe  P=0.5681
            Ġa  P=0.1436
           Ġit  P=0.0871
             Ġ  P=0.0528
         Ġthis  P=0.0466

Step 4:
  Ġprobability  P=0.9912
             Ġ  P=0.0041
       Ġannual  P=0.0017
   Ġlikelihood  P=0.0006
       Ġchance  P=0.0005

Step 5:
           Ġof  P=0.8675
           Ġat  P=0.0807
           Ġto  P=0.0381
           Ġas  P=0.0096
           Ġis  P=0.0017

Step 6:
           Ġan  P=0.4365
       Ġhaving  P=0.3400
            Ġa  P=0.2062
          Ġthe  P=0.0149
    Ġrecession  P=0.0007

Step 7:
     Ġeconomic  P=0.9982
     ĠAmerican  P=0.0006
           ĠUS  P=0.0005
      Ġeconomy  P=0.0002
      economic  P=0.0000

Step 8:
    Ġrecession  P=0.9999
       Ġrecess  P=0.0001
   

variance of estimate? what does it mean here?

# Complement Question

In [30]:
messages = [
    # Added "probabilistic" to the system message
        {"role": "system", "content": "You are a probability expert who only answers in terms of probabilty estimates, with no additional text."},
        {"role": "user", "content": "What is the probability of not having an economic recession in the United States this year (2025)??"}
    ]

#  Format the conversation with the model‑specific chat template
prompt_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,          
    return_tensors="pt"
).to(model.device)

In [31]:
out = model.generate(
    prompt_ids,
    max_new_tokens=40,
    return_dict_in_generate=True,
    output_scores=True,                  # gives one logits tensor per generated step
)


#  Convert those logits to token‑wise probabilities
#     compute_transition_scores aligns each score tensor with the token it chose
log_probs = model.compute_transition_scores(
    out.sequences, out.scores, normalize_logits=True         # True → log p_i, not raw logits
)     

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [32]:
gen_ids   = out.sequences[0, prompt_ids.shape[-1]:]          # only the new tokens
probs     = log_probs[0].exp()                               # p_i = e^{log p_i}
tokens    = tokenizer.convert_ids_to_tokens(gen_ids)

for tok, p in zip(tokens, probs.tolist()):
    print(f"{tok:>12s}  {p:.4f}")

           0  0.2824
           .  1.0000
           8  0.0361
           -  0.0753
           0  1.0000
           .  1.0000
           9  0.3486
  <|eot_id|>  0.5139


In [33]:
del model

# Zero SHOT classification

In [34]:

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

pipe = pipeline(
    "zero-shot-classification",
    tokenizer= tokenizer,
    model="meta-llama/Llama-3.2-3B-Instruct",
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="mps",
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-3B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


In [35]:
prompt = "Will the United States go into economic recession this year (2025)?"
candidate_labels = ["Yes", "No"]

result = pipe(prompt, candidate_labels)

print(result)

Tokenizer was not supporting padding necessary for zero-shot, attempting to use  `pad_token=eos_token`


{'sequence': 'Will the United States go into economic recession this year (2025)?', 'labels': ['No', 'Yes'], 'scores': [0.872347354888916, 0.1276526153087616]}


In [36]:
prompt = "Will the New York Mets win over Los Angeles Dodgers tomorrow?"
candidate_labels = ["New York Mets win", "Los Angeles Dodgers win"]

result = pipe(prompt, candidate_labels)

print(result)

{'sequence': 'Will the New York Mets win over Los Angeles Dodgers tomorrow?', 'labels': ['New York Mets win', 'Los Angeles Dodgers win'], 'scores': [0.5295066833496094, 0.4704933166503906]}


In [37]:
candidate_labels = ["Yes", "No"]

result = pipe(prompt, candidate_labels)

print(result)

{'sequence': 'Will the New York Mets win over Los Angeles Dodgers tomorrow?', 'labels': ['No', 'Yes'], 'scores': [0.6198165416717529, 0.38018345832824707]}


In [38]:
del pipe

# Generate reasoning but conclude with an answer

In [39]:
model_id = "meta-llama/Llama-3.2-3B-Instruct"          # any Llama‑2/3 chat checkpoint works
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
model     = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).to("mps")

messages = [
    # Added "probabilistic" to the system message
        {"role": "system", "content": "You are a probability expert who provides reasoning and certainly conclude with a final estimate of event probabilities."},
        {"role": "user", "content": "What is the probability of an economic recession in the United States this year (2025)?"}
    ]

#  Format the conversation with the model‑specific chat template
prompt_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,          
    return_tensors="pt"
).to(model.device)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [40]:
# Generate while asking for the raw scores (logits) at every step
out = model.generate(
    prompt_ids,
    max_new_tokens=40,
    return_dict_in_generate=True,
    output_scores=True,                  # gives one logits tensor per generated step
)

#  Convert those logits to token‑wise probabilities
#     compute_transition_scores aligns each score tensor with the token it chose
log_probs = model.compute_transition_scores(
    out.sequences, out.scores, normalize_logits=True         # True → log p_i, not raw logits
)     

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [41]:
gen_ids   = out.sequences[0, prompt_ids.shape[-1]:]          # only the new tokens
probs     = log_probs[0].exp()                               # p_i = e^{log p_i}
tokens    = tokenizer.convert_ids_to_tokens(gen_ids)

for tok, p in zip(tokens, probs.tolist()):
    print(f"{tok:>12s}  {p:.4f}")

          As  0.5300
          Ġa  1.0000
Ġprobability  1.0000
     Ġexpert  1.0000
           ,  1.0000
          ĠI  1.0000
       Ġmust  0.5519
  Ġemphasize  1.0000
       Ġthat  1.0000
 Ġpredicting  1.0000
   Ġeconomic  1.0000
     Ġrecess  1.0000
        ions  1.0000
         Ġis  0.8411
 Ġinherently  1.0000
  Ġuncertain  1.0000
        Ġand  1.0000
    Ġsubject  1.0000
         Ġto  1.0000
    Ġvarious  1.0000
    Ġfactors  1.0000
           ,  0.8670
  Ġincluding  1.0000
     Ġunfore  0.7311
        seen  1.0000
     Ġevents  1.0000
        Ġand  0.5519
    Ġchanges  0.7390
         Ġin  1.0000
     Ġglobal  0.6971
   Ġeconomic  1.0000
 Ġconditions  1.0000
         .ĊĊ  0.6514
        That  1.0000
      Ġbeing  1.0000
       Ġsaid  1.0000
           ,  1.0000
      Ġbased  0.3170
         Ġon  1.0000
 Ġhistorical  0.2608


###  Some issues above. It doesnt have to conclude in 40 tokens and can take a lot of time. Context is not strictly enforced


# Forcing generation to Yes/No by logits processing


In [42]:
tokenizer.decode(prompt_ids[0])

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 20 Aug 2025\n\nYou are a probability expert who provides reasoning and certainly conclude with a final estimate of event probabilities.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the probability of an economic recession in the United States this year (2025)?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

In [43]:
class YesNoLogitsProcessor(LogitsProcessor):
    """
    A LogitsProcessor that forces the model to predict only "yes" or "no".
    """

    def __init__(self, tokenizer, yes_token_ids, no_token_ids):
        """
        Initializes the YesNoLogitsProcessor.

        Args:
            tokenizer:  The tokenizer used by the model.
            yes_token_ids: The token IDs for "yes" (can be single or multiple tokens).
            no_token_ids: The token IDs for "no" (can be single or multiple tokens).
        """
        self.tokenizer = tokenizer
        self.yes_token_ids = yes_token_ids
        self.no_token_ids = no_token_ids
        self.allowed_token_ids = list(set(yes_token_ids + no_token_ids)) # Unique allowed

    def __call__(self, input_ids, scores):
        """
        Filters the logits to allow only "yes" or "no" tokens.

        Args:
            input_ids: torch.LongTensor. The batch of input token ids.
            scores: torch.FloatTensor. The model's output logits.

        Returns:
            torch.FloatTensor: The filtered logits.
        """
        if len(input_ids) == 0:
            return scores

        # Create a mask to filter out unwanted tokens
        mask = torch.ones(scores.shape[-1], dtype=torch.bool, device=scores.device)
        mask[self.allowed_token_ids] = False  # Set allowed tokens to False

        # Create a large negative number for masking
        very_negative_number = torch.finfo(scores.dtype).min
        scores[:, mask] = very_negative_number

        return scores

In [44]:
question = "Will the United States go into economic recession this year (2025)?"
question = "Will the sun rise in the east tomorrow?"

messages = [
    # Added "probabilistic" to the system message
        {"role": "system", "content": "You are an expert in everything, who answers only in Yes or No"},
        {"role": "user", "content": question}
    ]

#  Format the conversation with the model‑specific chat template
prompt_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,          
    return_tensors="pt"
).to(model.device)

In [45]:
yes_tokens = ["yes", "Yes", "YES"]
no_tokens = ["no", "No", "NO"]
yes_token_ids = []
no_token_ids = []

for y in yes_tokens:
    yes_token_ids.extend(tokenizer.encode(y, add_special_tokens=False))
for n in no_tokens:
    no_token_ids.extend(tokenizer.encode(n, add_special_tokens=False))
#Get unique ids
yes_token_ids = list(set(yes_token_ids))
no_token_ids = list(set(no_token_ids))
print(f"Yes token IDs: {yes_token_ids}")
print(f"No token IDs: {no_token_ids}")



Yes token IDs: [9642, 9891, 14331]
No token IDs: [2201, 9173, 2822]


In [46]:

# Create the LogitsProcessor
yes_no_processor = YesNoLogitsProcessor(tokenizer, yes_token_ids, no_token_ids)

In [47]:
out = model.generate(
    prompt_ids,
    max_new_tokens=1,
    return_dict_in_generate=True,
    logits_processor=[yes_no_processor],
    output_scores=True,                  # gives one logits tensor per generated step
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [48]:
log_probs = model.compute_transition_scores(
    out.sequences, out.scores, normalize_logits=True         # True → log p_i, not raw logits
)     

gen_ids   = out.sequences[0, prompt_ids.shape[-1]:]          # only the new tokens
probs     = log_probs[0].exp()                               # p_i = e^{log p_i}
tokens    = tokenizer.convert_ids_to_tokens(gen_ids)

for tok, p in zip(tokens, probs.tolist()):
    print(f"{tok:>12s}  {p:.4f}")

         Yes  1.0000
