# PRefLexOR Inference: Thinking, Reflection, and Agentic Reasoning

In [None]:
!pip install git+https://github.com/lamm-mit/PRefLexOR.git --quiet

In [None]:
import os

from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

import torch

from PRefLexOR import *

# Special tokens that delimit the thinking and reflection sections
think_start, think_end = '<|thinking|>', '<|/thinking|>'
reflect_start, reflect_end = "<|reflect|>", "<|/reflect|>"

import transformers

### Load model

In [None]:
# Identifier of the PRefLexOR checkpoint used as the reasoning model
model_name = 'lamm-mit/PRefLexOR_ORPO_DPO_EXO_REFLECT_10222024'

# Load the weights in bfloat16 and let device_map="auto" decide placement.
# NOTE(review): trust_remote_code=True permits custom code shipped with the
# checkpoint to run during loading -- only use with repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",  # left disabled
    device_map="auto",
    trust_remote_code=True,
)

# Tokenizer is loaded from the same checkpoint as the model
tokenizer = AutoTokenizer.from_pretrained(model_name)

### Inference: Conventional

In [None]:
# Prompt: the question followed by an instruction that names the thinking start token
txt = f'What is the relationship between materials and music? Brief answer. Use {think_start}.'

# Single sampled completion from the reasoning model: empty system prompt,
# no prior conversation (messages=[]), low temperature, no repetition penalty.
output_text, messages = generate_local_model(
    prompt=txt,
    system_prompt='',
    messages=[],
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=0.1,
    repetition_penalty=1.0,
    num_return_sequences=1,
    max_new_tokens=2024,
)

print(output_text)

#### Extract thinking or other sections from the output

In [None]:
# Split the raw generation into its sections.
# For the delimited sections the first result ([0]) is kept; presumably
# extract_text returns a sequence of matches there -- NOTE(review): this would
# fail if the model emitted no such section; confirm against extract_text.
thinking = extract_text(
    output_text,
    thinking_start=think_start,
    thinking_end=think_end,
)[0].strip()
reflection = extract_text(
    output_text,
    thinking_start=reflect_start,
    thinking_end=reflect_end,
)[0].strip()

# With thinking_end="NONE" the result is used directly as a string; here the
# start marker is the closing reflection token.
answer_only = extract_text(
    output_text,
    thinking_start=reflect_end,
    thinking_end="NONE",
).strip()

In [None]:
# Show the extracted thinking section under its label
print("THINKING:\n\n", thinking)

In [None]:
# Show the extracted reflection section (previously this printed `thinking` again)
print("REFLECTION:\n\n", reflection)

In [None]:
# Show the extracted answer text under its label
print("ANSWER:\n\n", answer_only)

### Inference: Recursive multi-agent reasoning with thinking and reflection tokens

In [None]:
from PRefLexOR import recursive_response_from_thinking

#### Load a second model to act as the critic agent

In [None]:
# Identifier of the instruct model that is loaded as the critic agent
model_name_base = "meta-llama/Llama-3.2-3B-Instruct"

# Same loading options as the reasoning model: bfloat16 weights, automatic
# device placement, remote code allowed.
critic_model = AutoModelForCausalLM.from_pretrained(
    model_name_base,
    trust_remote_code=True,
    device_map="auto",
    # attn_implementation="flash_attention_2",  # left disabled
    torch_dtype=torch.bfloat16,
)

In [None]:
# Multi-agent call: the reasoning model together with critic_model as the critic.
# NOTE(review): an earlier cell imports `recursive_response_from_thinking`, but
# the function called here is `recursive_response` (reachable only through the
# wildcard import, if the package exports it) -- confirm which one is intended.
output_text, output_list, output_text_integrated = recursive_response(
    question='How do biological materials fail gracefully? Brief answer.',
    # Reasoning agent
    model=model,
    tokenizer=tokenizer,
    system_prompt='You are a helpful assistant.',
    temperature=0.1,
    # Critic agent; it reuses the reasoning model's tokenizer, no separate one is loaded
    model_critic=critic_model,
    tokenizer_critic=tokenizer,
    system_prompt_critic='You carefully improve responses, with attention to detail, and following all directions.',
    temperature_improvement=0.1,
    # Presumably the number of refinement rounds -- confirm against the package
    N=2,
    verbatim=False,
    # Section delimiters defined at the top of the notebook
    thinking_start=think_start,
    thinking_end=think_end,
    reflect_start=reflect_start,
    reflect_end=reflect_end,
)

In [None]:
# For every entry of output_list, print a banner with its index followed by
# the text extracted after the closing reflection token.
for i, item in enumerate(output_list):
    answer_only = extract_text(
        item,
        thinking_start=reflect_end,
        thinking_end="NONE",
    )
    print(f"{'-' * 64}\n>>>i={i}<<<\n{'-' * 64}")
    print(answer_only)


In [None]:
# Label and integrated response, one per line
print("INTEGRATED RESPONSE:", output_text_integrated, sep="\n")