<a href="https://colab.research.google.com/github/hirdeshkumar2407/NLP_Group_Assigment/blob/main/Training%20models/2_RAG_Retriever.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports and loading the dataset:

In [1]:
pip install hnswlib

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import os
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch
import hnswlib
from transformers import AutoModel

if os.path.isfile("rag_instruct.json"): 
    df = pd.read_json("rag_instruct.json")
else:
    df = pd.read_json("hf://datasets/FreedomIntelligence/RAG-Instruct/rag_instruct.json")

documents = df['documents']

2025-05-20 09:54:19.899134: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747734859.924055     175 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747734859.931812     175 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
print(documents[:3])

0    [decided to make the story more straightforwar...
1    [the world with 68.5% of Taiwanese high school...
2    [Sparrho Sparrho combines human and artificial...
Name: documents, dtype: object


## Our models for calculating the embeddings and using the CrossEncoder

In [4]:
semb_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
xenc_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
#semb_model.to('cuda')

## Calculating the embeddings for the corpus:

In [5]:
corpus_embeddings = semb_model.encode(documents, convert_to_tensor=True, show_progress_bar=True)


Batches:   0%|          | 0/1267 [00:00<?, ?it/s]

## Indexing for faster access:

In [6]:
index = hnswlib.Index(space='cosine', dim=corpus_embeddings.size(1))

In [7]:
# Define hnswlib index path
index_path = "./hnswlib.index"

# Load index if available
if os.path.exists(index_path):
    print("Loading index...")
    index.load_index(index_path)
# Else index data collection
else:
    # Initialise the index
    print("Start creating HNSWLIB index")
    index.init_index(max_elements=corpus_embeddings.size(0), ef_construction=400, M=64)
    #  Compute the HNSWLIB index (it may take a while)
    index.add_items(corpus_embeddings.cpu(), list(range(len(corpus_embeddings))))
    # Save the index to a file for future loading
    print("Saving index to:", index_path)
    index.save_index(index_path)

Start creating HNSWLIB index
Saving index to: ./hnswlib.index


In [8]:
# function to get the related docs
def get_related_docs(query, k=3):
    query_embedding = semb_model.encode(query, convert_to_tensor=True)
    corpus_ids, _ = index.knn_query(query_embedding.cpu(), k=k)

    model_inputs = [(query, str(documents[idx])) for idx in corpus_ids[0]]
    cross_scores = xenc_model.predict(model_inputs)
    send_to_LLM = ""
    positive_docs = [documents[corpus_ids[0][idx]] for idx in np.argsort(-cross_scores) if cross_scores[idx] > 0]

    if len(positive_docs) > 1:
        for i, doc in enumerate(positive_docs):
            send_to_LLM += f"Document {i+1}:\n\n"
            # Convert the list 'doc' to a string before concatenating
            send_to_LLM += str(doc) + "\n"
    elif len(positive_docs) == 1:
        # Convert the list to a string if there's only one document
        send_to_LLM = str(positive_docs[0])

    else:
        # If no positive scores, take the top 2 negative scores
        negative_docs = []
        for idx in np.argsort(-cross_scores)[:2]: # Take the top 2 indices based on sorted scores
            negative_docs.append(documents[corpus_ids[0][idx]])

        if len(negative_docs) > 1:
            for i, doc in enumerate(negative_docs):
                send_to_LLM += f"Document {i+1}:\n"
                send_to_LLM += str(doc) + "\n\n"
        elif len(negative_docs) == 1:
            send_to_LLM = str(negative_docs[0])

    return send_to_LLM



In [9]:
!pip install -U bitsandbytes accelerate transformers
print("Required libraries upgrade/installation attempted.")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Required libraries upgrade/installation attempted.


In [10]:
import bitsandbytes
print(f"bitsandbytes version: {bitsandbytes.__version__}")
import transformers
print(f"transformers version: {transformers.__version__}")
import torch
print(f"PyTorch version: {torch.__version__}")
# Check if GPU is available to transformers
if torch.cuda.is_available():
    print(f"CUDA is available. GPU: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA not available. Running on CPU.")

bitsandbytes version: 0.45.5
transformers version: 4.51.3
PyTorch version: 2.6.0+cu124
CUDA is available. GPU: Tesla T4


In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch # Already imported, but good practice if cell is standalone

# Define quantization configuration
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model_name = "AITeamVN/Vi-Qwen2-3B-RAG"

try:
    print(f"\nLoading tokenizer for {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Tokenizer loaded.")

    print(f"\nLoading model {model_name} with 4-bit quantization...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        device_map="auto" # Let accelerate handle device placement
    )
    print("Model loaded successfully with 4-bit quantization.")
    if hasattr(model, 'hf_device_map'):
        print(f"Model device map: {model.hf_device_map}")
    else:
        print(f"Model is on device: {model.device}")


except ImportError as e:
    print(f"ImportError during model loading: {e}")
    print("This usually means 'bitsandbytes' is not the correct version or not found.")
    print("Ensure Cell 1 (pip install -U ...) ran successfully in THIS session.")
    print("If you ran Cell 1 and then the KERNEL/SESSION fully restarted, you need to run Cell 1 again.")
except Exception as e:
    print(f"An error occurred during model loading: {e}")


Loading tokenizer for AITeamVN/Vi-Qwen2-3B-RAG...
Tokenizer loaded.

Loading model AITeamVN/Vi-Qwen2-3B-RAG with 4-bit quantization...


model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

Model loaded successfully with 4-bit quantization.
Model device map: {'model.embed_tokens': 0, 'lm_head': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 1, 'model.layers.8': 1, 'model.layers.9': 1, 'model.layers.10': 1, 'model.layers.11': 1, 'model.layers.12': 1, 'model.layers.13': 1, 'model.layers.14': 1, 'model.layers.15': 1, 'model.layers.16': 1, 'model.layers.17': 1, 'model.layers.18': 1, 'model.layers.19': 1, 'model.layers.20': 1, 'model.layers.21': 1, 'model.layers.22': 1, 'model.layers.23': 1, 'model.layers.24': 1, 'model.layers.25': 1, 'model.layers.26': 1, 'model.layers.27': 1, 'model.layers.28': 1, 'model.layers.29': 1, 'model.layers.30': 1, 'model.layers.31': 1, 'model.layers.32': 1, 'model.layers.33': 1, 'model.layers.34': 1, 'model.layers.35': 1, 'model.norm': 1, 'model.rotary_emb': 1}


In [23]:
# model.to('cuda')
# tokenizer.to('cuda')

query = "Do all plants do photosynthesis?"

context_docs = get_related_docs(query)

prompt = f"Given this context: \n{context_docs} \n\nPlease answer the question: {query}.\n\nAnswer:\n"

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

# Decode and print result
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\n=== Generated Answer ===\n")
print(answer.split("Answer:")[-1].strip())  # Optional: strip prompt parts

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


=== Generated Answer ===

Based on the information provided in the documents, it is clear that not all plants do photosynthesis. While most plants perform photosynthesis as part of their biological processes, some plants have lost the ability to produce chlorophyll or to perform photosynthesis altogether. For example, some parasitic and mycotrophic plants may lose the ability to produce chlorophyll or to photosynthesize. Additionally, some plants may experience a reduction in photosynthetic activity under certain environmental conditions, such as shading or low light levels. Therefore, it is not accurate to say that all plants do photosynthesis. However, the vast majority of plants do engage in photosynthesis to varying degrees. 

In conclusion, while most plants do perform photosynthesis, not all plants do so. The ability to perform photosynthesis is not universal among all plant species. Some plants may have lost the capacity to perform photosynthesis due to their lifestyle or envir

# Model Inspection

In [13]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM

model_name = "AITeamVN/Vi-Qwen2-3B-RAG" # Ensure this is the correct model name

print(f"\n---  Inspecting Configuration for {model_name} ---")

try:
    # Load the configuration
    config = AutoConfig.from_pretrained(model_name)
    print("\nModel Configuration Loaded Successfully.")

    # a) Maximum Context Window / Sequence Length
    # Common attribute names for max sequence length / context window:
    # 'max_position_embeddings', 'n_positions', 'n_ctx'
    # The exact name can vary between model architectures.
    max_len_attrs = ['max_position_embeddings', 'n_positions', 'n_ctx', 'sliding_window'] # Qwen2 uses 'sliding_window' or implies by rope_theta
    context_window = None
    print("\nPotential attributes for max context window:")
    for attr in max_len_attrs:
        if hasattr(config, attr):
            value = getattr(config, attr)
            print(f"  - Found '{attr}': {value}")
            if isinstance(value, int) and (context_window is None or value > context_window) and attr != 'sliding_window': # sliding_window is different
                context_window = value
            if attr == 'sliding_window' and value is not None:
                print(f"    Note: This model uses a sliding window of {value}. Effective context might be related but not strictly this value for all operations.")
                # For Qwen2, the context window is typically very large (e.g., 32k, 128k) but it uses a sliding window attention mechanism
                # The 'sliding_window' parameter itself in Qwen2 config is the size of the attention window.
                # The actual theoretical max context can be much larger, often found in model card or from rope_theta.
                # If 'sliding_window' is present, this often indicates the *attention* window size.
                # The actual max sequence length might be different (often larger for Qwen2 series).
                # Let's try to load the tokenizer too, as it sometimes has max_model_input_sizes
                try:
                    tokenizer_temp = AutoTokenizer.from_pretrained(model_name)
                    if hasattr(tokenizer_temp, 'model_max_length'):
                        print(f"  - Tokenizer 'model_max_length': {tokenizer_temp.model_max_length}")
                        if context_window is None or tokenizer_temp.model_max_length > context_window:
                             context_window = tokenizer_temp.model_max_length
                except Exception as e_tok:
                    print(f"    Could not load tokenizer to check its max length: {e_tok}")


    if context_window:
        print(f"\nEstimated Maximum Context Window / Sequence Length: {context_window} tokens")
    else:
        print("\nCould not automatically determine a clear maximum context window from common config attributes.")
        print("Please refer to the model card or documentation for the definitive context window.")

    # b) Model Type / Architecture
    if hasattr(config, 'model_type'):
        print(f"\nModel Type / Architecture: {config.model_type}")
    else:
        print("\nModel Type not explicitly found in config.")

  

except Exception as e:
    print(f"An error occurred while loading or inspecting the model configuration: {e}")
    print("Ensure the model name is correct and you have an internet connection.")


---  Inspecting Configuration for AITeamVN/Vi-Qwen2-3B-RAG ---

Model Configuration Loaded Successfully.

Potential attributes for max context window:
  - Found 'max_position_embeddings': 32768
  - Found 'sliding_window': 32768
    Note: This model uses a sliding window of 32768. Effective context might be related but not strictly this value for all operations.
  - Tokenizer 'model_max_length': 131072

Estimated Maximum Context Window / Sequence Length: 131072 tokens

Model Type / Architecture: qwen2


In [14]:
sample_queries = [ "What is a key reason OpenSSH is considered secure?", "What are the main components of Docker's service?", "In which film did Christopher Walken portray a character who gives a speech involving a gold watch related to his experiences in the Vietnam War?", "Which health benefits are associated with running?", "Which flag, the New Zealand Ensign or the Union Jack, had formal legislation passed for its use earlier?", "Did Jimmy Carter's high school activities have any influence on his professional pursuit?"]

print(f"Defined {len(sample_queries)} sample queries for evaluation.")

Defined 6 sample queries for evaluation.


# Helper Function for LLM Generation

In [15]:
# Cell A: Helper Function for LLM Generation

import torch # Ensure torch is imported

def generate_llm_answer_experimental(current_model, current_tokenizer, prompt_text,
                                     max_new_tokens=250, temperature=0.7, top_p=0.9, do_sample=True):
    """
    Generates an answer from the LLM given a prompt and generation parameters.
    Returns the extracted answer text.
    """
    answer_text = "Error during generation."
    # print(f"--- DEBUG: Prompt to LLM (first 500 chars) ---\n{prompt_text[:500]}...") # Uncomment for deep debugging of prompt
    try:
        inputs = current_tokenizer(prompt_text, return_tensors="pt", truncation=True, max_length=current_tokenizer.model_max_length - max_new_tokens - 10).to(current_model.device) # Added truncation & buffer
        
        with torch.no_grad():
            outputs = current_model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=do_sample,
                pad_token_id=current_tokenizer.eos_token_id
            )
        
        full_generation = current_tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Try to extract answer after "Answer:" or "answer:"
        answer_cue_index = -1
        if "Answer:" in full_generation:
            answer_cue_index = full_generation.rfind("Answer:") + len("Answer:")
        elif "answer:" in full_generation.lower():
            answer_cue_index = full_generation.lower().rfind("answer:") + len("answer:")
            
        if answer_cue_index != -1:
            answer_text = full_generation[answer_cue_index:].strip()
        else:
            # Fallback: take text after the prompt if cue not found
            # This might be noisy if the model doesn't follow the prompt structure.
            answer_text = full_generation[len(prompt_text):].strip() if len(full_generation) > len(prompt_text) else full_generation
            # print("Warning: 'Answer:' cue not found in LLM output. Using fallback extraction.")

    except Exception as e:
        print(f"  Error during LLM generation: {e}")
        answer_text = f"Error: {e}"
    return answer_text

print("Helper function 'generate_llm_answer_experimental' defined.")

Helper function 'generate_llm_answer_experimental' defined.


# Baseline & Retriever Output Check

### Prompt Engineering

In [31]:
# Cell C: Experiment 1 - Prompt Engineering

print("\n--- Experiment 1: Prompt Engineering ---")

# Choose one query index from your 'sample_queries' list (0 to N_SAMPLE_QUERIES-1)
query_index_for_prompt_exp = 0 # Example: use the first sample query
selected_query = sample_queries[query_index_for_prompt_exp]

# Get the context that was retrieved for this query in the baseline run
# Or re-retrieve if you prefer, to ensure it's fresh
print(f"  Retrieving context for: \"{selected_query}\"")
context_for_prompt_exp = get_related_docs(selected_query, k=3) # Using Mehdi's function
print(f"  Context (start): {context_for_prompt_exp[:300]}...")


print(f"\n--- Testing different prompts for query: \"{selected_query}\" ---")

# Prompt Style 1 (Baseline)
#prompt_style_1 = f"Given this context: \n{context_for_prompt_exp} \n\nPlease answer the question: {selected_query}.\n\nAnswer:\n"
print("\n--- Prompt Style 1 (Baseline) ---")
answer_1 = generate_llm_answer_experimental(model, tokenizer, prompt_style_1)
print(f"  Generated Answer:\n    {answer_1}")

# Prompt Style 2 (More direct, instruction first)
#prompt_style_2 = f"Based ONLY on the following context, answer the question. If the answer is not in the context, state that.\n\nContext:\n{context_for_prompt_exp}\n\nQuestion: {selected_query}\n\nAnswer:\n"
print("\n--- Prompt Style 2 (Direct Instruction) ---")
answer_2 = generate_llm_answer_experimental(model, tokenizer, prompt_style_2)
print(f"  Generated Answer:\n    {answer_2}")

# Prompt Style 3 (Role-playing)
prompt_style_3 = f"You are a helpful AI assistant. Your task is to answer the question using *only* the provided text. Question: {selected_query}\n\nProvided text:\n{context_for_prompt_exp}\n\nAnswer based on provided text:\n"
print("\n--- Prompt Style 3 (Role-Playing) ---")
answer_3 = generate_llm_answer_experimental(model, tokenizer, prompt_style_3)
print(f"  Generated Answer:\n    {answer_3}")

# Prompt Style 4 (Explicit "Do Not Use Outside Knowledge" + "If not found, state it")
#prompt_style_4 = f"Carefully review the following text. Based strictly and ONLY on this provided text, answer the question. Do not use any external knowledge. If the answer cannot be found in the text, explicitly state that the information is not present in the provided context.\n\nProvided Text:\n{context_for_prompt_exp}\n\nQuestion: {selected_query}\n\nAnswer:\n"
print("\n--- Prompt Style 4 (Strict Constraint & Explicit Not Found) ---")
answer_4 = generate_llm_answer_experimental(model, tokenizer, prompt_style_4)
print(f"  Generated Answer:\n    {answer_4}")

#Prompt Style 5 (Question First, then Context, with strict instruction)
prompt_style_5 = f"Question: {selected_query}\n\nConsider ONLY the following context to answer the question. Do not make assumptions or use outside information. If the context does not contain the answer, indicate that.\n\nContext:\n{context_for_prompt_exp}\n\nAnswer:\n"
print("\n--- Prompt Style 5 (Question First, Strict Constraint) ---")
answer_5 = generate_llm_answer_experimental(model, tokenizer, prompt_style_5)
print(f"  Generated Answer:\n    {answer_5}")

# Prompt Style 6 (Adding a step-by-step thought process cue - Chain of Thought like)
# This might encourage it to first evaluate if the context is useful.
prompt_style_6 = f"Context:\n{context_for_prompt_exp}\n\nQuestion: {selected_query}\n\nLet's think step by step. First, does the context contain information relevant to answering the question? Second, based only on that relevant information, what is the answer? If no relevant information is found, state that.\n\nAnswer:\n"
print("\n--- Prompt Style 6 (Step-by-Step / Chain of Thought Cue) ---")
answer_6 = generate_llm_answer_experimental(model, tokenizer, prompt_style_6, max_new_tokens=300) # Give more tokens for thought process
print(f"  Generated Answer:\n    {answer_6}")


# (Assuming 'selected_query' and 'context_for_prompt_exp' are defined from previous part of Cell C)

prompt_style_7 = f"""Your task is to answer the following question.

Follow these steps:
1. First, try to answer the question using ONLY the provided 'Context' below.
2. If the context contains relevant information but isn't specific enough to fully answer, state what you found and then ask the user to 'please provide more details or a more specific question related to the context.'
3. If the context is completely irrelevant or does not contain any information to answer the question, then try to answer using your general knowledge.
4. If you use your general knowledge and can answer, clearly state: "Based on my general knowledge, the answer is: ..."
5. If, after checking the context and your general knowledge, you still cannot answer the question, respond with: "I do not have enough information to answer this question from the provided context or my general knowledge."

Context:
{context_for_prompt_exp}

Question: {selected_query}

Answer:
"""
print("\n--- Prompt Style 7 (Multi-Step Conditional Logic Attempt) ---")
# Give it more tokens as it might generate a longer, more reasoned response
answer_7 = generate_llm_answer_experimental(model, tokenizer, prompt_style_7, max_new_tokens=350)
print(f"  Generated Answer:\n    {answer_7}")



--- Experiment 1: Prompt Engineering ---
  Retrieving context for: "What is a key reason OpenSSH is considered secure?"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Context (start): Document 1:

["codebase with the OpenSSH 7.6 release. SSH is a protocol that can be used for many applications across many platforms including most Unix variants (GNU/Linux, the BSDs including Apple's macOS, and Solaris), as well as Microsoft Windows. Some of the applications below may require featu...

--- Testing different prompts for query: "What is a key reason OpenSSH is considered secure?" ---

--- Prompt Style 1 (Baseline) ---
  Generated Answer:
    OpenSSH is considered secure due to several key reasons:

1. **Encryption of All Information**: OpenSSH encrypts all information, including usernames and passwords, ensuring that data transmitted between the client and server is protected.

2. **Authentication and Key Management**:
   - **Authentication Based on Public Keys**: SSH uses public key cryptography for authentication. This means that only the owner of the key pair (private key) can authenticate, making it highly secure.
   - **Private Key Security**: Pr

# Multi-Step RAG Logic 

In [None]:

# Programmatic Multi-Step RAG Logic for ALL Sample Queries

import time
import pandas as pd
from IPython.display import display
import gc # For garbage collection

# --- Assume all prerequisites are loaded from previous cells: ---
# model, tokenizer, get_related_docs, generate_llm_answer_experimental,
# sample_queries, documents_corpus_for_retriever (or global 'documents' correctly set)
# -------------------------------------------------------------

print(f"--- Programmatic Multi-Step RAG for ALL {len(sample_queries)} Sample Queries ---")

# Ensure the global 'documents' variable is correctly set if get_related_docs relies on it.
# This check can remain as it's crucial for the retriever.
if 'documents_corpus_for_retriever' in locals():
    if 'documents' not in globals() or globals()['documents'] is not documents_corpus_for_retriever:
        globals()['documents'] = documents_corpus_for_retriever
        print("Set global 'documents' variable from 'documents_corpus_for_retriever' for retriever access.")
elif 'documents' not in globals():
    print("CRITICAL ERROR: Neither 'documents_corpus_for_retriever' nor global 'documents' is defined for the retriever. Halting.")
    # exit() # Or raise an error

all_programmatic_results = []

for q_idx, selected_query in enumerate(sample_queries):
    print(f"\n\n<<<<< PROCESSING QUERY {q_idx+1}/{len(sample_queries)}: \"{selected_query}\" >>>>>")
    query_start_time = time.time()

    retrieved_context_str = "Error: Could not retrieve context."
    answer_from_context = "Error: Could not generate from context."
    answer_from_general_knowledge = "N/A (Context may have been used or other error)"

    try:
        # --- Stage 0: Retrieve Context ---
        print("\n  --- Stage 0: Retrieving Context ---")
        retrieved_context_str = get_related_docs(selected_query, k=3)
        print(f"  Retrieved Context (first 300 chars):\n    '{retrieved_context_str[:300].replace(chr(10), ' ')}...'")

        # --- Stage 1: Attempt to Answer Strictly from Context ---
        print("\n  --- Stage 1: Attempting Answer Strictly from Context ---")
        strict_context_prompt = f"Based ONLY on the following 'Provided Context', answer the 'Question'. If the answer is not present in the 'Provided Context', you MUST respond with only the exact phrase 'CONTEXT_INSUFFICIENT'. Do not use any other knowledge.\n\nProvided Context:\n{retrieved_context_str}\n\nQuestion: {selected_query}\n\nAnswer:\n"
        answer_from_context = generate_llm_answer_experimental(
            model, tokenizer, strict_context_prompt,
            max_new_tokens=200, temperature=0.1, top_p=0.7, do_sample=True
        )
        print(f"    LLM Response (Strict Context):\n      '{answer_from_context}'")

        # --- Stage 2: Python Logic to Decide Next Step ---
        final_answer_for_query = ""
        source_of_answer_for_query = ""
        used_strict_context_answer = False

        if "CONTEXT_INSUFFICIENT" in answer_from_context.upper() or \
           (len(answer_from_context.split()) < 5 and \
            any(phrase in answer_from_context.lower() for phrase in ["don't know", "cannot answer", "not sure", "no information", "context does not provide"])):
            print("\n  --- Stage 1 Result: Context was insufficient or LLM indicated so. ---")
            
            print("\n  --- Stage 3: Attempting Answer from General Knowledge ---")
            general_knowledge_prompt = f"Please answer the following question based on your general knowledge: {selected_query}\n\nAnswer:\n"
            answer_from_general_knowledge = generate_llm_answer_experimental(
                model, tokenizer, general_knowledge_prompt,
                max_new_tokens=250, temperature=0.7, top_p=0.9, do_sample=True
            )
            print(f"    LLM Response (General Knowledge):\n      '{answer_from_general_knowledge}'")

            is_informative_gk_answer = True
            non_informative_phrases_gk = ["don't know", "cannot answer", "not sure", "no information", "unable to provide", "no context", "lack the ability", "not capable"]
            if any(phrase in answer_from_general_knowledge.lower() for phrase in non_informative_phrases_gk) or \
               len(answer_from_general_knowledge.split()) < 6:
                is_informative_gk_answer = False
                
            if is_informative_gk_answer:
                final_answer_for_query = f"[Based on General Knowledge]:\n{answer_from_general_knowledge}"
                source_of_answer_for_query = "General Knowledge"
            else:
                final_answer_for_query = "I do not have enough information to answer this question from the provided context or my general knowledge."
                source_of_answer_for_query = "Insufficient Information (Context & GK)"
        else:
            print("\n  --- Stage 1 Result: LLM provided an answer from context. ---")
            final_answer_for_query = f"[Based on Provided Context]:\n{answer_from_context}"
            source_of_answer_for_query = "Provided Context"
            used_strict_context_answer = True

    except Exception as query_processing_error:
        print(f"!!!! ERROR processing query '{selected_query}': {query_processing_error} !!!!")
        final_answer_for_query = "Error during processing this query."
        source_of_answer_for_query = "Error"
        used_strict_context_answer = False # Ensure it's defined

    query_end_time = time.time()
    print("\n  -----------------------------------------")
    print(f"  FINAL DETERMINED ANSWER for Query {q_idx+1}")
    print(f"  Source of Answer: {source_of_answer_for_query}")
    print(f"  Answer:\n{final_answer_for_query}")
    print(f"  Time taken for this query: {query_end_time - query_start_time:.2f} seconds")
    print("  -----------------------------------------")

    all_programmatic_results.append({
        "query": selected_query,
        "retrieved_context_for_strict_rag": retrieved_context_str,
        "strict_rag_llm_response": answer_from_context,
        "used_strict_context_answer": used_strict_context_answer,
        "general_knowledge_llm_response": answer_from_general_knowledge,
        "final_answer_source": source_of_answer_for_query,
        "final_answer_text": final_answer_for_query
    })
    
    # Memory Management for long loops
    del retrieved_context_str
    del answer_from_context
    if 'strict_context_prompt' in locals(): del strict_context_prompt
    if 'answer_from_general_knowledge' in locals() and answer_from_general_knowledge != "N/A (Context may have been used or other error)":
        del answer_from_general_knowledge
    if 'general_knowledge_prompt' in locals(): del general_knowledge_prompt
    gc.collect() # Force Python's garbage collector to free up memory


print("\n\n===== Programmatic Multi-Step RAG for ALL Sample Queries COMPLETE =====")
print(">>> Review the 'FINAL DETERMINED ANSWER' and 'Source of Answer' for each query. <<<")

# Optional: Display results in a DataFrame
df_programmatic_results = pd.DataFrame(all_programmatic_results)
print("\n--- Summary of Programmatic RAG Results ---")
# To display long text in pandas without truncation:
# pd.set_option('display.max_colwidth', None)
display(df_programmatic_results[['query', 'final_answer_source', 'final_answer_text']])
# pd.reset_option('display.max_colwidth')

--- Programmatic Multi-Step RAG for ALL 6 Sample Queries ---


<<<<< PROCESSING QUERY 1/6: "What is a key reason OpenSSH is considered secure?" >>>>>

  --- Stage 0: Retrieving Context ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Retrieved Context (first 300 chars):
    'Document 1:  ["codebase with the OpenSSH 7.6 release. SSH is a protocol that can be used for many applications across many platforms including most Unix variants (GNU/Linux, the BSDs including Apple's macOS, and Solaris), as well as Microsoft Windows. Some of the applications below may require featu...'

  --- Stage 1: Attempting Answer Strictly from Context ---
    LLM Response (Strict Context):
      'OpenSSH is considered secure due to its use of the SSH-2 protocol, which includes several security features such as encryption, authentication, and key exchange mechanisms. Specifically, the SSH-2 protocol uses strong encryption algorithms, secure key exchange methods, and authentication mechanisms that prevent unauthorized access. Additionally, OpenSSH has a large community of developers and users who continuously monitor and update the software to address potential vulnerabilities. This ongoing maintenance and improvement contribute to the o

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Retrieved Context (first 300 chars):
    'Document 1: ["VMware vSphere Integrated Containers. The Cloud Foundry Diego project integrates Docker into the Cloud Foundry PaaS. Nanobox uses Docker (natively and with VirtualBox) containers as a core part of its software development platform. Red Hat's OpenShift PaaS integrates Docker with relate...'

  --- Stage 1: Attempting Answer Strictly from Context ---
    LLM Response (Strict Context):
      'The main difference between the original and'

  --- Stage 1 Result: LLM provided an answer from context. ---

  -----------------------------------------
  FINAL DETERMINED ANSWER for Query 2
  Source of Answer: Provided Context
  Answer:
[Based on Provided Context]:
The main difference between the original and
  Time taken for this query: 17.15 seconds
  -----------------------------------------


<<<<< PROCESSING QUERY 3/6: "In which film did Christopher Walken portray a character who gives a speech involving a gold watch related to his exp

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Retrieved Context (first 300 chars):
    '['and killers. Cimino countered that his film was not political, polemical, literally accurate, or posturing for any particular point of view. He further defended his position by saying that he had news clippings from Singapore that confirm Russian roulette was used during the war (without specifyin...'

  --- Stage 1: Attempting Answer Strictly from Context ---
    LLM Response (Strict Context):
      'In the film "The Deer Hunter," Christopher Walken portrays a character who gives a speech involving a gold watch related to his experiences in the Vietnam War. The character is a young Pennsylvania steelworker who is emotionally destroyed by the Vietnam War. Walken's performance in this film earned him an Academy Award for Best Supporting Actor. To prepare for the role, Walken consumed only bananas, water, and rice for a week to achieve the gaunt appearance required for the character. Walken explains in the film that he had hidden the gold wat

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Retrieved Context (first 300 chars):
    'Document 1:  ['exists the potential for injury while running (just as there is in any sport), there are many benefits. Some of these benefits include potential weight loss, improved cardiovascular and respiratory health (reducing the risk of cardiovascular and respiratory diseases), improved cardiov...'

  --- Stage 1: Attempting Answer Strictly from Context ---
    LLM Response (Strict Context):
      'Running, like all forms of regular exercise, offers several health benefits, including:

1. **Weight Loss**: Running can help in weight loss, which is beneficial for overall health.
2. **Improved Cardiovascular and Respiratory Health**: Running reduces the risk of cardiovascular and respiratory diseases.
3. **Improved Cardiovascular Fitness**: Running strengthens the heart and lungs, improving overall fitness.
4. **Reduced Total Blood Cholesterol**: Running can help lower total cholesterol levels.
5. **Strengthening of Bones**: Running can hel

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Retrieved Context (first 300 chars):
    'Document 1:  ['the Flags Act 1953, section 8 of that Act specified that "this Act does not affect the right or privilege of a person to fly the Union Jack." The Union Jack continued to be used for a period thereafter as a national flag. The current national flag of New Zealand was given official sta...'

  --- Stage 1: Attempting Answer Strictly from Context ---
    LLM Response (Strict Context):
      'The New Zealand Ensign had formal legislation passed for its use earlier. According to the provided context, the New Zealand Ensign Act was passed in 1902, while the Union Jack Act was passed in 1953. The 1953 Act specifically states that it does not affect the right or privilege of a person to'

  --- Stage 1 Result: LLM provided an answer from context. ---

  -----------------------------------------
  FINAL DETERMINED ANSWER for Query 5
  Source of Answer: Provided Context
  Answer:
[Based on Provided Context]:
The New Zealand Ensign had fo

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Retrieved Context (first 300 chars):
    '['The future president wrote that it was the last time he ever stole. Carter would be credited by his eldest son with being the person who most shaped his "work habits and ambitions". Carter was a conservative in his political views. However, his son Jimmy recollected that, "within our family we nev...'

  --- Stage 1: Attempting Answer Strictly from Context ---
    LLM Response (Strict Context):
      'Yes, Jimmy Carter's high school activities had a significant influence on his professional pursuit. According to the provided context, Carter was a diligent student with a fondness for reading. He also participated in various extracurricular activities such as playing on the Plains High School basketball team, joining the Future Farmers of America, and developing a lifelong interest in woodworking. These activities not only helped him develop a strong work ethic and academic skills but also exposed him to different fields of interest, which la

Unnamed: 0,query,final_answer_source,final_answer_text
0,What is a key reason OpenSSH is considered sec...,Provided Context,[Based on Provided Context]:\nOpenSSH is consi...
1,What are the main components of Docker's service?,Provided Context,[Based on Provided Context]:\nThe main differe...
2,In which film did Christopher Walken portray a...,Provided Context,"[Based on Provided Context]:\nIn the film ""The..."
3,Which health benefits are associated with runn...,Provided Context,"[Based on Provided Context]:\nRunning, like al..."
4,"Which flag, the New Zealand Ensign or the Unio...",Provided Context,[Based on Provided Context]:\nThe New Zealand ...
5,Did Jimmy Carter's high school activities have...,General Knowledge,[Based on General Knowledge]:\nBased on my gen...


# Programmatic Multi-Step RAG with Stage 1 VOTING

In [None]:


import time
import pandas as pd
from IPython.display import display, HTML # For better DataFrame display
import gc
import numpy as np # Just in case, though not heavily used here

print(f"--- Programmatic RAG with Stage 1 VOTING for ALL {len(sample_queries)} Sample Queries ---")

# Ensure the global 'documents' variable is correctly set for the retriever
# This assumes get_related_docs expects a global variable named 'documents'
# which should be your 'documents_corpus_for_retriever'
if 'documents_corpus_for_retriever' in locals():
    if 'documents' not in globals() or globals()['documents'] is not documents_corpus_for_retriever:
        globals()['documents'] = documents_corpus_for_retriever
        print("Set global 'documents' variable from 'documents_corpus_for_retriever' for retriever access.")
elif 'documents' not in globals():
    print("CRITICAL ERROR: Retriever corpus ('documents' or 'documents_corpus_for_retriever') not defined. Halting.")
    raise NameError("Retriever corpus not available. Please define 'documents_corpus_for_retriever' and ensure 'documents' global is set if needed.")

# --- Define the "Strict" Parameter Sets for Stage 1 Voting ---
# These should be conservative, aiming to get "CONTEXT_INSUFFICIENT" or very short answers for bad context
strict_param_sets_for_voting = [
    {"name": "S1_V_Greedy", "max_new_tokens": 100, "temperature": 0.7, "top_p": 0.9, "do_sample": False},
    {"name": "S1_V_VeryLowTemp", "max_new_tokens": 150, "temperature": 0.01, "top_p": 0.5, "do_sample": True},
    {"name": "S1_V_LowTemp", "max_new_tokens": 150, "temperature": 0.1, "top_p": 0.7, "do_sample": True}
]
# --- Voting Threshold ---
# If using 3 sets, 2 votes for "insufficient" seems reasonable.
MIN_VOTES_FOR_INSUFFICIENT = 2
if len(strict_param_sets_for_voting) == 2: MIN_VOTES_FOR_INSUFFICIENT = 1 # Adjust if you use 2 sets
if len(strict_param_sets_for_voting) == 1: MIN_VOTES_FOR_INSUFFICIENT = 1 # If only 1 set, it must indicate

all_voting_rag_results = []

for q_idx, selected_query in enumerate(sample_queries):
    print(f"\n\n<<<<< PROCESSING QUERY {q_idx+1}/{len(sample_queries)}: \"{selected_query}\" >>>>>")
    query_start_time = time.time()

    retrieved_context_str = "Error: Could not retrieve context."
    # Initialize logging variables for this query
    stage1_llm_responses_for_voting = {}
    answer_from_context_chosen_via_vote = "N/A (Voting process did not yield a context answer)"
    answer_from_general_knowledge = "N/A (Stage 1 might have succeeded or error in strict RAG)"
    proceed_to_general_knowledge_after_vote = False # Default to not needing GK

    try:
        # --- Stage 0: Retrieve Context ---
        print("\n  --- Stage 0: Retrieving Context ---")
        retrieved_context_str = get_related_docs(selected_query, k=3) # Ensure this k matches your retriever's design for LLM
        print(f"  Retrieved Context (first 300 chars):\n    '{retrieved_context_str[:300].replace(chr(10), ' ')}...'")

        # --- Stage 1: Attempt to Answer Strictly from Context (with VOTING) ---
        print("\n  --- Stage 1: Voting on Context Sufficiency ---")
        strict_context_prompt_template = "Based ONLY on the following 'Provided Context', answer the 'Question'. If the answer is not present in the 'Provided Context', you MUST respond with only the exact phrase 'CONTEXT_INSUFFICIENT'. Do not use any other knowledge.\n\nProvided Context:\n{context}\n\nQuestion: {query}\n\nAnswer:\n"
        current_strict_prompt = strict_context_prompt_template.format(context=retrieved_context_str, query=selected_query)

        insufficient_context_votes = 0
        valid_context_answers_from_vote = []

        for i_vote, params_vote in enumerate(strict_param_sets_for_voting):
            print(f"    Vote Attempt {i_vote+1} with params: {params_vote['name']}")
            stage1_answer_attempt = generate_llm_answer_experimental(
                model, tokenizer, current_strict_prompt,
                max_new_tokens=params_vote["max_new_tokens"],
                temperature=params_vote.get("temperature", 0.7), # .get for safety if a param is missing
                top_p=params_vote.get("top_p", 0.9),
                do_sample=params_vote["do_sample"]
            )
            stage1_llm_responses_for_voting[params_vote['name']] = stage1_answer_attempt
            print(f"      LLM Response ({params_vote['name']}): '{stage1_answer_attempt}'")

            # Your refined Python logic to check for insufficiency
            # Increased length check slightly for more robustness
            if "CONTEXT_INSUFFICIENT" in stage1_answer_attempt.upper() or \
               (len(stage1_answer_attempt.split()) < 7 and \
                any(phrase in stage1_answer_attempt.lower() for phrase in ["don't know", "cannot answer", "not sure", "no information", "context does not provide", "unable to determine", "not found in the context"])):
                insufficient_context_votes += 1
                print(f"      --> VOTE: INSUFFICIENT from {params_vote['name']}")
            else:
                valid_context_answers_from_vote.append({"param_set_name": params_vote['name'], "answer": stage1_answer_attempt})
                print(f"      --> VOTE: SUFFICIENT (attempted answer) from {params_vote['name']}")

        # --- Stage 2: Python Logic to Decide Next Step (Based on Votes) ---
        final_answer_for_query = ""
        source_of_answer_for_query = ""
        
        if insufficient_context_votes >= MIN_VOTES_FOR_INSUFFICIENT:
            print(f"\n  --- Stage 1 Overall Result (Voting): {insufficient_context_votes}/{len(strict_param_sets_for_voting)} votes indicate context was insufficient. ---")
            proceed_to_general_knowledge_after_vote = True
            answer_from_context_chosen_via_vote = f"CONTEXT_INSUFFICIENT ({insufficient_context_votes} votes)"
        elif valid_context_answers_from_vote:
            print("\n  --- Stage 1 Overall Result (Voting): At least one attempt suggests context is usable. ---")
            # Strategy: Pick the answer from the first valid attempt.
            # You could implement more sophisticated logic here (e.g., longest, specific param set preference).
            answer_from_context_chosen_via_vote = valid_context_answers_from_vote[0]["answer"]
            final_answer_for_query = f"[Based on Provided Context ({valid_context_answers_from_vote[0]['param_set_name']})]:\n{answer_from_context_chosen_via_vote}"
            source_of_answer_for_query = f"Provided Context ({valid_context_answers_from_vote[0]['param_set_name']})"
            # proceed_to_general_knowledge_after_vote remains False
        else: 
            print("\n  --- Stage 1 Overall Result (Voting): Undetermined (e.g., all attempts errored or were too short but not the cue). Defaulting to insufficient. ---")
            proceed_to_general_knowledge_after_vote = True
            answer_from_context_chosen_via_vote = "CONTEXT_INSUFFICIENT (All Stage 1 attempts failed or were uninformative)"

        if proceed_to_general_knowledge_after_vote:
            print("\n  --- Stage 3: Attempting Answer from General Knowledge ---")
            general_knowledge_prompt = f"Please answer the following question based on your general knowledge: {selected_query}\n\nAnswer:\n"
            answer_from_general_knowledge = generate_llm_answer_experimental(
                model, tokenizer, general_knowledge_prompt,
                max_new_tokens=250, temperature=0.7, top_p=0.9, do_sample=True
            )
            print(f"    LLM Response (General Knowledge):\n      '{answer_from_general_knowledge}'")

            is_informative_gk_answer = True
            non_informative_phrases_gk = ["don't know", "cannot answer", "not sure", "no information", "unable to provide", "no context", "lack the ability", "not capable"]
            if any(phrase in answer_from_general_knowledge.lower() for phrase in non_informative_phrases_gk) or \
               len(answer_from_general_knowledge.split()) < 6: # Reasonably informative answer check
                is_informative_gk_answer = False
                
            if is_informative_gk_answer:
                final_answer_for_query = f"[Based on General Knowledge]:\n{answer_from_general_knowledge}"
                source_of_answer_for_query = "General Knowledge"
            else:
                final_answer_for_query = "I do not have enough information to answer this question from the provided context or my general knowledge."
                source_of_answer_for_query = "Insufficient Information (Context & GK)"
        
    except Exception as query_processing_error:
        print(f"!!!! ERROR processing query '{selected_query}': {query_processing_error} !!!!")
        final_answer_for_query = "Error during processing this query."
        source_of_answer_for_query = "Error"

    query_end_time = time.time()
    print("\n  -----------------------------------------")
    print(f"  FINAL DETERMINED ANSWER for Query {q_idx+1}")
    print(f"  Source of Answer: {source_of_answer_for_query}")
    print(f"  Answer:\n{final_answer_for_query}") # This will print the final answer text
    print(f"  Time taken for this query: {query_end_time - query_start_time:.2f} seconds")
    print("  -----------------------------------------")

    all_voting_rag_results.append({
        "query": selected_query,
        "retrieved_context_snippet": retrieved_context_str[:500] + "..." if isinstance(retrieved_context_str, str) else "Error in context",
        "stage1_all_voting_responses": stage1_llm_responses_for_voting,
        "stage1_insufficient_votes": insufficient_context_votes,
        "stage1_chosen_context_answer_if_any": answer_from_context_chosen_via_vote,
        "general_knowledge_llm_response": answer_from_general_knowledge,
        "final_answer_source": source_of_answer_for_query,
        "final_answer_text": final_answer_for_query # Ensure this holds the actual text
    })
    
    # Memory Management
    del retrieved_context_str, stage1_llm_responses_for_voting, answer_from_context_chosen_via_vote
    if 'answer_from_general_knowledge' in locals() and answer_from_general_knowledge != "N/A (Stage 1 might have succeeded or error in strict RAG)": # Check before del
        del answer_from_general_knowledge
    if 'current_strict_prompt' in locals(): del current_strict_prompt
    if 'general_knowledge_prompt' in locals(): del general_knowledge_prompt
    gc.collect()


print("\n\n===== Programmatic Multi-Step RAG with Stage 1 Voting COMPLETE =====")

# Display results in a DataFrame
df_voting_rag_results = pd.DataFrame(all_voting_rag_results)
print("\n--- Summary of Programmatic RAG Results with Stage 1 Voting ---")
# For better display of long text and all rows/columns
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Select key columns for initial review
display(df_voting_rag_results[[
    'query',
    'final_answer_source',
    'final_answer_text', # Make sure this column has the text
    'stage1_insufficient_votes',
    # 'stage1_chosen_context_answer_if_any', # Can be verbose
    # 'stage1_all_voting_responses' # Can be very verbose
]])
pd.reset_option('display.max_colwidth')
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')


print(">>> To see individual Stage 1 voting responses, you can inspect the full 'df_voting_rag_results' DataFrame or 'all_voting_rag_results' list. <<<")

--- Programmatic RAG with Stage 1 VOTING for ALL 6 Sample Queries ---


<<<<< PROCESSING QUERY 1/6: "What is a key reason OpenSSH is considered secure?" >>>>>

  --- Stage 0: Retrieving Context ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Retrieved Context (first 300 chars):
    'Document 1:  ["codebase with the OpenSSH 7.6 release. SSH is a protocol that can be used for many applications across many platforms including most Unix variants (GNU/Linux, the BSDs including Apple's macOS, and Solaris), as well as Microsoft Windows. Some of the applications below may require featu...'

  --- Stage 1: Voting on Context Sufficiency ---
    Vote Attempt 1 with params: S1_V_Greedy




      LLM Response (S1_V_Greedy): 'OpenSSH is considered secure due to its use of the SSH-2 protocol, which includes several security features such as encryption, authentication, and key exchange mechanisms. Specifically, the SSH-2 protocol uses strong encryption algorithms, secure key exchange methods, and robust authentication protocols. Additionally, OpenSSH has been widely adopted and used in various environments, which helps to ensure its security through widespread testing and feedback. However, the provided context does not explicitly state these reasons, so the answer is based on general understanding'
      --> VOTE: SUFFICIENT (attempted answer) from S1_V_Greedy
    Vote Attempt 2 with params: S1_V_VeryLowTemp
      LLM Response (S1_V_VeryLowTemp): 'OpenSSH is considered secure due to its use of the SSH-2 protocol, which includes several security features such as encryption, authentication, and key exchange mechanisms. Specifically, the SSH-2 protocol uses strong encryption a

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Retrieved Context (first 300 chars):
    'Document 1: ["VMware vSphere Integrated Containers. The Cloud Foundry Diego project integrates Docker into the Cloud Foundry PaaS. Nanobox uses Docker (natively and with VirtualBox) containers as a core part of its software development platform. Red Hat's OpenShift PaaS integrates Docker with relate...'

  --- Stage 1: Voting on Context Sufficiency ---
    Vote Attempt 1 with params: S1_V_Greedy
      LLM Response (S1_V_Greedy): 'The main components of Docker's service are:

- Resource Isolation: Docker uses the resource isolation features of the Linux kernel such as cgroups and kernel namespaces.
- Union-Capable File System: Docker uses a union-capable file system such as OverlayFS and others to allow independent "containers" to run within a single Linux instance.
- Lightweight Container Engine: Docker is a lightweight container engine that allows developers to package their applications along with runtime and dependencies into a portable co

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Retrieved Context (first 300 chars):
    '['and killers. Cimino countered that his film was not political, polemical, literally accurate, or posturing for any particular point of view. He further defended his position by saying that he had news clippings from Singapore that confirm Russian roulette was used during the war (without specifyin...'

  --- Stage 1: Voting on Context Sufficiency ---
    Vote Attempt 1 with params: S1_V_Greedy
      LLM Response (S1_V_Greedy): 'In the film "The Deer Hunter," Christopher Walken portrays a character who gives a speech involving a gold watch related to his experiences in the Vietnam War. The character is a young Pennsylvania steelworker who is emotionally destroyed by the Vietnam War. Walken's performance in this film earned him an Academy Award for Best Supporting Actor. To prepare for the role, Walken consumed only bananas, water, and rice for a week to achieve the gaunt appearance required for the character. Walken explains in the'
      --

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Retrieved Context (first 300 chars):
    'Document 1:  ['exists the potential for injury while running (just as there is in any sport), there are many benefits. Some of these benefits include potential weight loss, improved cardiovascular and respiratory health (reducing the risk of cardiovascular and respiratory diseases), improved cardiov...'

  --- Stage 1: Voting on Context Sufficiency ---
    Vote Attempt 1 with params: S1_V_Greedy
      LLM Response (S1_V_Greedy): 'Running, like all forms of regular exercise, offers several health benefits, including:

1. **Weight Loss**: Running can help in burning calories, leading to weight loss.
2. **Improved Cardiovascular and Respiratory Health**: Running reduces the risk of cardiovascular and respiratory diseases.
3. **Improved Cardiovascular Fitness**: Running strengthens the heart and lungs.
4. **Reduced Total Blood Cholesterol**: Running can help lower total cholesterol levels.
5. **Strengthening of Bones**: Running can'
      --> VOT

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Retrieved Context (first 300 chars):
    'Document 1:  ['the Flags Act 1953, section 8 of that Act specified that "this Act does not affect the right or privilege of a person to fly the Union Jack." The Union Jack continued to be used for a period thereafter as a national flag. The current national flag of New Zealand was given official sta...'

  --- Stage 1: Voting on Context Sufficiency ---
    Vote Attempt 1 with params: S1_V_Greedy
      LLM Response (S1_V_Greedy): 'The New Zealand Ensign had formal legislation passed for its use earlier. According to the provided context, the New Zealand Ensign Act was passed in 1902, while the Union Jack Act was passed in 1953. The 1953 Act specifically states that it does not affect the right or privilege of a person to fly the Union Jack. Therefore, the New Zealand Ensign had formal legislation passed for its use earlier. CONTEXT_INSUFFICIENT

Note: The'
      --> VOTE: INSUFFICIENT from S1_V_Greedy
    Vote Attempt 2 with params: S1_V_VeryL

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Retrieved Context (first 300 chars):
    '['The future president wrote that it was the last time he ever stole. Carter would be credited by his eldest son with being the person who most shaped his "work habits and ambitions". Carter was a conservative in his political views. However, his son Jimmy recollected that, "within our family we nev...'

  --- Stage 1: Voting on Context Sufficiency ---
    Vote Attempt 1 with params: S1_V_Greedy
      LLM Response (S1_V_Greedy): 'Yes, Jimmy Carter's high school activities had a significant influence on his professional pursuit. According to the provided context, Carter was a diligent student with a fondness for reading. He also participated in various extracurricular activities, including sports and school plays. For example, he played on the Plains High School basketball team, joined the Future Farmers of America, and developed a lifelong interest in woodworking. These activities not only helped him build confidence and leadership skills but

Unnamed: 0,query,final_answer_source,final_answer_text,stage1_insufficient_votes
0,What is a key reason OpenSSH is considered secure?,Provided Context (S1_V_Greedy),"[Based on Provided Context (S1_V_Greedy)]:\nOpenSSH is considered secure due to its use of the SSH-2 protocol, which includes several security features such as encryption, authentication, and key ...",1
1,What are the main components of Docker's service?,General Knowledge,[Based on General Knowledge]:\nDocker's service is made up of three main components:\n\n1. Container Engine: This is the runtime environment that runs and manages the containers. It is responsible...,2
2,In which film did Christopher Walken portray a character who gives a speech involving a gold watch related to his experiences in the Vietnam War?,Provided Context (S1_V_Greedy),"[Based on Provided Context (S1_V_Greedy)]:\nIn the film ""The Deer Hunter,"" Christopher Walken portrays a character who gives a speech involving a gold watch related to his experiences in the Vietn...",0
3,Which health benefits are associated with running?,Provided Context (S1_V_Greedy),"[Based on Provided Context (S1_V_Greedy)]:\nRunning, like all forms of regular exercise, offers several health benefits, including:\n\n1. **Weight Loss**: Running can help in burning calories, lea...",0
4,"Which flag, the New Zealand Ensign or the Union Jack, had formal legislation passed for its use earlier?",Provided Context (S1_V_VeryLowTemp),"[Based on Provided Context (S1_V_VeryLowTemp)]:\nThe New Zealand Ensign had formal legislation passed for its use earlier. According to the provided context, the",1
5,Did Jimmy Carter's high school activities have any influence on his professional pursuit?,Provided Context (S1_V_Greedy),"[Based on Provided Context (S1_V_Greedy)]:\nYes, Jimmy Carter's high school activities had a significant influence on his professional pursuit. According to the provided context, Carter was a dili...",0


>>> To see individual Stage 1 voting responses, you can inspect the full 'df_voting_rag_results' DataFrame or 'all_voting_rag_results' list. <<<
