In [None]:
import os

# Directory used as the on-disk cache for Hugging Face downloads.
cache_dir = '/scratch/hakeem.at/Queryable-Shared-Reference-Repository/notebooks/pretrained_models'

# Point all three Hugging Face cache variables at that directory in one call.
# This happens before the torch / vllm / transformers imports further down the cell.
os.environ.update(
    dict.fromkeys(
        ('HF_HOME', 'TRANSFORMERS_CACHE', 'HUGGINGFACE_HUB_CACHE'),
        cache_dir,
    )
)

import json
import random
from tqdm.auto import tqdm
import pandas as pd

import torch
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer, AutoProcessor
# Seed Python's built-in `random` module so the dataset shuffle is repeatable.
# Only `random` is seeded here; no other RNG is touched in this cell.
seed = 42
random.seed(seed)


In [None]:
# Load the RAG evaluation set, shuffle it deterministically and keep a capped
# subset, attaching the joined top retrieved documents as the prompt context.
input_file = "final_rag_eval_dataset_2048.json"
with open(input_file, "r", encoding="utf-8") as f:
    eval_dataset = json.load(f)

# Re-seed right before shuffling so this cell yields the same order even when
# it is re-run on its own.
random.seed(42)
random.shuffle(eval_dataset)

max_data_points = 500  # cap on the number of questions kept for evaluation
max_context_docs = 5   # how many of the retrieved documents are joined into the context

eval_data = []
for item in eval_dataset:
    # Cap on the number of *kept* examples. Capping on the enumerate index
    # counted skipped (empty) items towards the limit, and an empty item sitting
    # exactly at the cap index skipped the break so the loop never stopped.
    if len(eval_data) >= max_data_points:
        break
    if not item:
        continue

    eval_data.append({
        'question': item['question'],
        'excerpt': item.get('excerpt', ''),
        'source': item.get('source', ''),
        'retrieved_docs': item['retrieved_docs'],
        'context': '\n\n'.join(item['retrieved_docs'][:max_context_docs]),
    })


print(f"Total evaluation questions: {len(eval_data)}")

In [None]:
# System instruction shared by every model: answers must come only from the
# supplied context, and the exact refusal string is "I don't know.".
# NOTE: the literal ends with a space before the closing quotes; that space is
# part of the prompt text sent to the models.
SYSTEM_PROMPT = """You are a helpful assistant. Answer questions using ONLY information from the provided context. If the context does not contain enough information to answer the question, respond with exactly: "I don't know." """

# User-turn template. The `{context}` and `{question}` placeholders are filled
# with str.format() by the prompt formatters defined in this notebook.
USER_TEMPLATE = """<context>
{context}
</context>

<question>
{question}
</question>

Provide your answer:"""

# Short model name -> Hugging Face model id. The short names are also the keys
# of PROMPT_FORMATTERS and the value stored in each result's 'model' field.
MODELS = dict(
    qwen3_8b='Qwen/Qwen3-8B',
    llama31_8b='meta-llama/Llama-3.1-8B-Instruct',
    qwen3_vl_8b='Qwen/Qwen3-VL-8B-Instruct',
)

# Engine settings passed unchanged to every vllm.LLM(...) built in this notebook.
gpu_memory_utilization = 0.95
max_model_len = 12288
max_num_seqs = 64
enforce_eager = True

# Sampling settings used by run_model_inference for the full evaluation run.
# NOTE(review): temperature=0 presumably selects greedy decoding and max_tokens
# presumably caps the generated length — confirm against the vLLM
# SamplingParams documentation.
sampling_params = SamplingParams(
    temperature=0,
    max_tokens=512,
)

In [None]:
def format_prompt_qwen3(context: str, question: str) -> str:
    """Render the hand-written ChatML prompt for Qwen3.

    The assistant turn is opened and immediately followed by an empty
    ``<think>`` block, so the returned prompt ends with
    ``<think>\\n\\n</think>\\n\\n``.

    Args:
        context: Text substituted for ``{context}`` in USER_TEMPLATE.
        question: Text substituted for ``{question}`` in USER_TEMPLATE.

    Returns:
        The full prompt string (system turn, user turn, opened assistant turn).
    """
    user_turn = USER_TEMPLATE.format(context=context, question=question)
    segments = [
        "<|im_start|>system\n" + SYSTEM_PROMPT + "<|im_end|>\n",
        "<|im_start|>user\n" + user_turn + "<|im_end|>\n",
        # Assistant turn pre-filled with an empty think block.
        "<|im_start|>assistant\n<think>\n\n</think>\n\n",
    ]
    return "".join(segments)

def format_prompt_qwen3_vl(context: str, question: str) -> str:
    """Render the hand-written ChatML prompt for Qwen3-VL (text-only input).

    Same layout as the Qwen3 prompt but without a pre-filled ``<think>``
    block: the prompt ends right after the assistant header.

    Args:
        context: Text substituted for ``{context}`` in USER_TEMPLATE.
        question: Text substituted for ``{question}`` in USER_TEMPLATE.

    Returns:
        The full prompt string (system turn, user turn, opened assistant turn).
    """
    filled_user_turn = USER_TEMPLATE.format(context=context, question=question)
    chatml_turns = (
        ("system", SYSTEM_PROMPT),
        ("user", filled_user_turn),
    )
    rendered_turns = "".join(
        f"<|im_start|>{role}\n{text}<|im_end|>\n" for role, text in chatml_turns
    )
    return rendered_turns + "<|im_start|>assistant\n"

In [None]:
def format_prompt_llama31(context: str, question: str) -> str:
    """Render the hand-written Llama 3.1 prompt with header / ``<|eot_id|>`` markers.

    Args:
        context: Text substituted for ``{context}`` in USER_TEMPLATE.
        question: Text substituted for ``{question}`` in USER_TEMPLATE.

    Returns:
        The full prompt string: ``<|begin_of_text|>``, a system turn, a user
        turn and an opened assistant header.
    """
    user_turn = USER_TEMPLATE.format(context=context, question=question)

    def role_header(role: str) -> str:
        # Every turn starts with the same header followed by a blank line.
        return f"<|start_header_id|>{role}<|end_header_id|>\n\n"

    # NOTE(review): the prompt carries an explicit <|begin_of_text|>; confirm the
    # inference engine does not prepend a second BOS token when tokenizing it.
    pieces = [
        "<|begin_of_text|>",
        role_header("system"), SYSTEM_PROMPT, "<|eot_id|>",
        role_header("user"), user_turn, "<|eot_id|>",
        role_header("assistant"),
    ]
    return "".join(pieces)

In [None]:
# Loaded tokenizers / processors, keyed by (model_id, uses_processor), so that
# repeated calls do not reload them from disk for every prompt.
_CHAT_TEMPLATER_CACHE = {}


def _get_chat_templater(model_id: str, model_name: str):
    """Return the cached tokenizer (or processor for 'qwen3_vl_8b'), loading it on first use."""
    uses_processor = model_name == 'qwen3_vl_8b'
    cache_key = (model_id, uses_processor)
    if cache_key not in _CHAT_TEMPLATER_CACHE:
        loader = AutoProcessor if uses_processor else AutoTokenizer
        _CHAT_TEMPLATER_CACHE[cache_key] = loader.from_pretrained(model_id)
    return _CHAT_TEMPLATER_CACHE[cache_key]


def format_with_tokenizer(model_id: str, model_name: str, context: str, question: str) -> str:
    """Render the prompt with the model's own chat template.

    Builds a system + user message pair from SYSTEM_PROMPT and USER_TEMPLATE
    and passes it to ``apply_chat_template`` with ``tokenize=False`` and
    ``add_generation_prompt=True``. For 'qwen3_vl_8b' an AutoProcessor is used,
    otherwise an AutoTokenizer; for 'qwen3_8b' ``enable_thinking=False`` is
    passed as well.

    The tokenizer / processor is loaded once per (model_id, loader kind) and
    reused on later calls, so calling this per example stays cheap.

    Args:
        model_id: Identifier handed to ``from_pretrained``.
        model_name: Short model name; selects the loader and the extra kwargs.
        context: Text substituted for ``{context}`` in USER_TEMPLATE.
        question: Text substituted for ``{question}`` in USER_TEMPLATE.

    Returns:
        Whatever ``apply_chat_template`` returns for the two messages (the
        rendered prompt, since ``tokenize=False`` is requested).
    """
    user_content = USER_TEMPLATE.format(context=context, question=question)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]

    kwargs = {"tokenize": False, "add_generation_prompt": True}
    if model_name == 'qwen3_8b':
        kwargs["enable_thinking"] = False

    templater = _get_chat_templater(model_id, model_name)
    return templater.apply_chat_template(messages, **kwargs)

In [None]:
# Short model name -> hand-written prompt formatter. Each formatter is called
# with keyword arguments `context` and `question` and returns the prompt string.
PROMPT_FORMATTERS = dict(
    qwen3_8b=format_prompt_qwen3,
    llama31_8b=format_prompt_llama31,
    qwen3_vl_8b=format_prompt_qwen3_vl,
)

In [None]:
# Small fixed example used by test_single_inference to compare the manual and
# tokenizer-generated prompt templates.
# NOTE: the first two lines of the context end with a trailing space inside the
# literal; it is part of the text sent to the model.
TEST_CONTEXT = """The Eiffel Tower is a wrought-iron lattice tower located in Paris, France. 
It was constructed from 1887 to 1889 as the entrance arch for the 1889 World's Fair. 
The tower is 330 meters tall and was the tallest man-made structure in the world until 1930."""

# NOTE(review): the question also asks who built the tower, which the context
# above does not state — presumably intentional to probe partial answers; confirm.
TEST_QUESTION = "How tall is the Eiffel Tower and who built it?"

def test_single_inference(model_name: str, model_id: str):
    print(f"\n{'='*70}")
    print(f"MODEL: {model_name} ({model_id})")
    print(f"{'='*70}")
    
    model = LLM(
        model=model_id,
        gpu_memory_utilization=gpu_memory_utilization,
        max_model_len=max_model_len,
        max_num_seqs=max_num_seqs,
        enforce_eager=enforce_eager,
        trust_remote_code=True,
    )
    
    sampling_params = SamplingParams(temperature=0, max_tokens=256)
    
    manual_formatter = PROMPT_FORMATTERS[model_name]
    manual_prompt = manual_formatter(context=TEST_CONTEXT, question=TEST_QUESTION)
    
    print(f"\n{'─'*40}")
    print("METHOD 1: MANUAL TEMPLATE")
    print(f"{'─'*40}")
    print(f"PROMPT:\n{manual_prompt}")
    
    manual_response = model.generate([manual_prompt], sampling_params)
    manual_output = manual_response[0].outputs[0].text.strip()
    
    print(f"\nRESPONSE:\n{manual_output}")
    
    user_content = USER_TEMPLATE.format(context=TEST_CONTEXT, question=TEST_QUESTION)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]
    
    if model_name == 'qwen3_vl_8b':
        tokenizer = AutoProcessor.from_pretrained(model_id)
        tokenizer_prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        kwargs = {"tokenize": False, "add_generation_prompt": True}
        if model_name == 'qwen3_8b':
            kwargs["enable_thinking"] = False
        tokenizer_prompt = tokenizer.apply_chat_template(messages, **kwargs)
    
    print(f"\n{'─'*40}")
    print("METHOD 2: TOKENIZER TEMPLATE")
    print(f"{'─'*40}")
    print(f"PROMPT:\n{tokenizer_prompt}")
    
    tokenizer_response = model.generate([tokenizer_prompt], sampling_params)
    tokenizer_output = tokenizer_response[0].outputs[0].text.strip()
    
    print(f"\nRESPONSE:\n{tokenizer_output}")
    
    print(f"\n{'─'*40}")
    print("COMPARISON")
    print(f"{'─'*40}")
    templates_match = manual_prompt.strip() == tokenizer_prompt.strip()
    responses_match = manual_output == tokenizer_output
    print(f"Templates identical: {templates_match}")
    print(f"Responses identical: {responses_match}")
    
    if not templates_match:
        print("\n⚠️  Template differences detected!")
    
    del model
    del tokenizer
    torch.cuda.empty_cache()
    
    return {
        'model': model_name,
        'manual_prompt': manual_prompt,
        'tokenizer_prompt': tokenizer_prompt,
        'manual_response': manual_output,
        'tokenizer_response': tokenizer_output,
        'templates_match': templates_match,
        'responses_match': responses_match,
    }


def run_all_template_tests():
    """Run test_single_inference for every entry in MODELS and print a summary.

    A model whose test raises is reported on stdout and left out of the
    results; the remaining models are still tested.

    Returns:
        list of the result dicts returned by test_single_inference, in MODELS
        order, for the models that completed.
    """
    collected = []

    for name, hf_id in MODELS.items():
        try:
            collected.append(test_single_inference(name, hf_id))
        except Exception as err:
            # Best effort: report the failure and move on to the next model.
            print(f"\n❌ Error testing {name}: {err}")

    divider = '=' * 70
    print(f"\n{divider}")
    print("SUMMARY")
    print(divider)
    for entry in collected:
        both_match = entry['templates_match'] and entry['responses_match']
        if both_match:
            status = "✅"
        else:
            status = "⚠️"
        print(f"{status} {entry['model']}: templates_match={entry['templates_match']}, responses_match={entry['responses_match']}")

    return collected


# Run the template comparison for every model in MODELS; each model is loaded
# in turn by test_single_inference, so this cell is expensive to execute.
results = run_all_template_tests()

In [None]:
def run_model_inference(
    model_name: str,
    model_id: str,
    eval_data: list,
    batch_size: int = 512,
    use_tokenizer_template: bool = False
):
    """Generate one response per evaluation item with a single vLLM model.

    Prompts are built either with the hand-written formatter registered in
    PROMPT_FORMATTERS (default) or with the model's own chat template
    (``use_tokenizer_template=True``), then generated in slices of
    ``batch_size`` using the module-level ``sampling_params``.

    A batch whose generation or result collection raises is reported on stdout
    and skipped, so the returned list can be shorter than ``eval_data``. The
    model reference is dropped and the CUDA cache emptied in a ``finally``
    block, so cleanup also runs when an error propagates.

    Args:
        model_name: Short model name; selects the formatter / template options
            and is stored in each result's 'model' field.
        model_id: Model identifier passed to ``LLM`` (and ``from_pretrained``).
        eval_data: List of dicts with the keys 'question', 'excerpt', 'source',
            'context' and 'retrieved_docs'.
        batch_size: Number of prompts handed to ``model.generate`` per call.
        use_tokenizer_template: Render prompts with ``apply_chat_template``
            instead of the manual formatter.

    Returns:
        list of dicts with keys 'question', 'excerpt', 'source', 'context',
        'retrieved_docs', 'response' and 'model'.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised before the model
            is loaded).
        KeyError: If the manual formatter is needed and ``model_name`` is not in
            PROMPT_FORMATTERS (raised before the model is loaded).
    """
    import gc  # local import: only needed for the cleanup below

    # Validate cheap inputs before the expensive model load.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    formatter = None if use_tokenizer_template else PROMPT_FORMATTERS[model_name]

    print(f"\n{'='*60}")
    print(f"Loading: {model_name} ({model_id})")
    print(f"{'='*60}\n")

    model = LLM(
        model=model_id,
        gpu_memory_utilization=gpu_memory_utilization,
        max_model_len=max_model_len,
        max_num_seqs=max_num_seqs,
        enforce_eager=enforce_eager,
        trust_remote_code=True,
    )

    tokenizer = None
    results = []
    failed_batches = 0

    try:
        if use_tokenizer_template:
            loader = AutoProcessor if model_name == 'qwen3_vl_8b' else AutoTokenizer
            tokenizer = loader.from_pretrained(model_id)
            # The template options do not depend on the item, so build them once.
            template_kwargs = {"tokenize": False, "add_generation_prompt": True}
            if model_name == 'qwen3_8b':
                template_kwargs["enable_thinking"] = False

        for start in tqdm(range(0, len(eval_data), batch_size), desc=f"{model_name}"):
            # Slicing already clamps at the end of the list.
            batch_data = eval_data[start:start + batch_size]

            if use_tokenizer_template:
                batch_prompts = [
                    tokenizer.apply_chat_template(
                        [
                            {"role": "system", "content": SYSTEM_PROMPT},
                            {
                                "role": "user",
                                "content": USER_TEMPLATE.format(
                                    context=item['context'], question=item['question']
                                ),
                            },
                        ],
                        **template_kwargs,
                    )
                    for item in batch_data
                ]
            else:
                batch_prompts = [
                    formatter(context=item['context'], question=item['question'])
                    for item in batch_data
                ]

            try:
                responses = model.generate(batch_prompts, sampling_params)

                for item, response in zip(batch_data, responses):
                    raw_response = response.outputs[0].text.strip()
                    results.append({
                        'question': item['question'],
                        'excerpt': item['excerpt'],
                        'source': item['source'],
                        'context': item['context'],
                        'retrieved_docs': item['retrieved_docs'],
                        'response': raw_response,
                        'model': model_name,
                    })

            except Exception as e:
                # Best effort: report the failed batch and keep going.
                failed_batches += 1
                print(f"Error in batch {start}: {e}")
                continue
    finally:
        # Always release the model (and tokenizer), even on failure, so the
        # next model load does not compete with a leaked one for GPU memory.
        del model
        tokenizer = None
        gc.collect()
        torch.cuda.empty_cache()

    if failed_batches:
        print(f"Warning: {failed_batches} batch(es) failed and were skipped")
    print(f"Completed: {len(results)} responses")
    return results

In [None]:
# Full evaluation run for Qwen3-8B using the manual prompt template, followed
# by a JSON dump of the collected responses.
results_qwen3 = run_model_inference(
    model_name='qwen3_8b',
    model_id=MODELS['qwen3_8b'],
    eval_data=eval_data,
)

with open("responses_qwen3_8b_1.json", mode="w", encoding="utf-8") as out_file:
    json.dump(results_qwen3, out_file, indent=2, ensure_ascii=False)
print("Saved: responses_qwen3_8b_1.json")


In [None]:
# Full evaluation run for Llama-3.1-8B-Instruct using the manual prompt
# template, followed by a JSON dump of the collected responses.
results_llama = run_model_inference(
    model_name='llama31_8b',
    model_id=MODELS['llama31_8b'],
    eval_data=eval_data,
)

with open("responses_llama31_8b_1.json", mode="w", encoding="utf-8") as out_file:
    json.dump(results_llama, out_file, indent=2, ensure_ascii=False)
print("Saved: responses_llama31_8b_1.json")

In [None]:
# Full evaluation run for Qwen3-VL-8B-Instruct using the manual prompt
# template, followed by a JSON dump of the collected responses.
results_qwen_vl = run_model_inference(
    model_name='qwen3_vl_8b',
    model_id=MODELS['qwen3_vl_8b'],
    eval_data=eval_data,
)

with open("responses_qwen3_vl_8b_1.json", mode="w", encoding="utf-8") as out_file:
    json.dump(results_qwen_vl, out_file, indent=2, ensure_ascii=False)
print("Saved: responses_qwen3_vl_8b_1.json")

In [None]:
# Merge the per-model response lists (Qwen3, Llama 3.1, Qwen3-VL, in that
# order) and export the combined set as JSON and CSV.
all_results = [*results_qwen3, *results_llama, *results_qwen_vl]

with open("all_responses_combined_1.json", mode="w", encoding="utf-8") as combined_file:
    json.dump(all_results, combined_file, indent=2, ensure_ascii=False)

# NOTE(review): the CSV name has no "_1" suffix while the JSON name does —
# confirm this is intended.
df = pd.DataFrame(all_results)
df.to_csv("all_responses_combined.csv", index=False)

print(f"\nTotal responses: {len(all_results)}")
responses_per_model = df['model'].value_counts()
print(responses_per_model)