# **1. Library Installations**

In [None]:
!pip install datasets pydantic beautifulsoup4
!pip install sentence-transformers
!pip install scikit-learn
!pip install nltk
!pip install rouge_score
!pip install datasets
!pip install fuzzywuzzy

# **2. Dataset Preparation**

In [None]:
from datasets import load_dataset
from bs4 import BeautifulSoup
from random import choice

def clean_html(html_content: str) -> str:
    """Convert an HTML fragment to plain text.

    Tags are removed with BeautifulSoup's ``html.parser``. Text taken from
    adjacent elements is joined with a single space so that words on either
    side of a tag boundary (e.g. ``<p>Hello</p><p>World</p>``) do not run
    together.

    Parameters
    ----------
    html_content : str
        Raw HTML. ``None`` or an empty string is accepted.

    Returns
    -------
    str
        The stripped text content, or ``""`` when there is no input.
    """
    if not html_content:
        return ""
    soup = BeautifulSoup(html_content, "html.parser")
    # strip=True on its own concatenates the stripped fragments with no
    # delimiter; an explicit separator keeps neighbouring words apart.
    return soup.get_text(separator=" ", strip=True)

def clean_description(description: list) -> str:
    """Flatten a list of description snippets into one space-separated string.

    Parameters
    ----------
    description : list
        Snippets to join. A plain string is returned unchanged (joining it
        would insert a space between every character), ``None`` entries are
        skipped, and other non-string entries are converted with ``str()``.

    Returns
    -------
    str
        The joined text, or ``""`` when ``description`` is empty or ``None``.
    """
    if not description:
        return ""
    if isinstance(description, str):
        # str.join would iterate a bare string character by character.
        return description
    return " ".join(str(part) for part in description if part is not None)

def _unwrap_answer(value):
    """Reduce a list/dict answer container to its first usable text value."""
    if isinstance(value, (list, tuple)):
        value = value[0] if value else None
    if isinstance(value, dict):
        value = value.get("text", None) or value.get("value", None)
    return value


def _extract_response(sample: dict):
    """Return the answer for a sample, or None when no known field holds one.

    Candidates are tried in order: the ``answer`` field (a random entry of
    its ``aliases`` list when it is a dict that has one, otherwise the value
    itself), then ``long_answer_candidates``, then ``answers["text"]``.
    """
    candidates = []

    answer = sample.get("answer")
    if isinstance(answer, dict):
        aliases = answer.get("aliases")
        if aliases:
            # Several accepted spellings are available; pick one at random.
            candidates.append(choice(aliases))
        # Fall back to the dict's own 'text' / 'value' entry.
        candidates.append(answer)
    else:
        candidates.append(answer)

    candidates.append(sample.get("long_answer_candidates"))

    answers = sample.get("answers")
    if isinstance(answers, dict):
        candidates.append(answers.get("text"))

    for candidate in candidates:
        text = _unwrap_answer(candidate)
        if text:
            return text
    return None


def stream_clean_rag_samples(dataset_name: str, dataset_config: str, num_samples: int):
    """Stream the first ``num_samples`` rows of a dataset as cleaned QA records.

    The ``train`` split is opened with ``streaming=True`` and rows are taken
    in stream order (no shuffling is applied here). For each row the question,
    context and response are looked up under several alternative field names.

    Parameters
    ----------
    dataset_name : str
        Name passed to ``load_dataset``.
    dataset_config : str
        Configuration name passed to ``load_dataset`` (may be ``None``).
    num_samples : int
        Maximum number of rows to read; values below zero are treated as zero.

    Returns
    -------
    list of dict
        One ``{"Question": ..., "Context": ..., "Response": ...}`` dict per
        row. A value is ``None`` when the corresponding field was not found;
        in that case the row's available field names are printed.
    """
    from itertools import islice

    dataset = load_dataset(dataset_name, dataset_config, split="train", streaming=True)

    data = []
    for index, sample in enumerate(islice(dataset, max(num_samples, 0)), start=1):
        # The question may be a plain value or a dict carrying a 'text' subfield.
        question = sample.get("question", None)
        if isinstance(question, dict):
            question = question.get("text", None)

        # First non-empty field among the known context holders wins.
        context = (
            sample.get("document_title", None)
            or sample.get("context", None)
            or sample.get("document", None)
            or sample.get("search_results", None)
        )
        if isinstance(context, dict):
            if "html" in context:
                context = clean_html(context["html"])
            elif "description" in context:
                context = clean_description(context["description"])

        # Field lookup is done with explicit type checks instead of a blanket
        # try/except, so one odd row cannot abort the stream and interrupts
        # (e.g. KeyboardInterrupt) are never swallowed.
        response = _extract_response(sample)

        if question is None or context is None or response is None:
            # Show which fields the row does have, to help extend the lookups.
            print(f"Sample {index} - Present fields: {', '.join(sample.keys())}")

        data.append({"Question": question, "Context": context, "Response": response})

    return data

# Demo run: pull a handful of samples from SQuAD, TriviaQA and WikiQA.
# Each entry is a (dataset name, configuration) pair; TriviaQA needs a config.
datasets = [
    ("squad", None),
    ("trivia_qa", "rc"),
    ("wiki_qa", None),
]

# Maps dataset name -> list of cleaned sample dicts.
full_data = {}

for dataset_name, dataset_config in datasets:
    print(f"Streaming from dataset: {dataset_name}")
    full_data.update(
        {dataset_name: stream_clean_rag_samples(dataset_name, dataset_config, num_samples=5)}
    )
    print("=" * 100)

# **3. Ratio of Improvement Metric**

In [None]:
import numpy as np

def ratio_of_improvement(sim_beqa, sim_baseline):
    """
    Return the Ratio of Improvement (RI) of BEQA over a baseline.

    The ratio is the total BEQA similarity divided by the total baseline
    similarity, both summed over the same M queries:

      RI = sum_m Sim_BEQA(q_m) / sum_m Sim_Baseline(q_m)

    Parameters
    ----------
    sim_beqa : list or np.ndarray
        Per-query semantic similarity scores obtained with BEQA.
    sim_baseline : list or np.ndarray
        Per-query semantic similarity scores obtained with the baseline.

    Returns
    -------
    float
        Total BEQA similarity over total baseline similarity. Values above
        1.0 mean BEQA scored higher than the baseline in aggregate.

    Raises
    ------
    ValueError
        If either input is empty or the two inputs differ in length.
    ZeroDivisionError
        If the baseline similarities sum to zero.
    """
    beqa_scores = np.array(sim_beqa, dtype=float)
    baseline_scores = np.array(sim_baseline, dtype=float)

    # Validate shapes before doing any arithmetic.
    if len(beqa_scores) == 0 or len(baseline_scores) == 0:
        raise ValueError("Input lists cannot be empty.")
    if len(beqa_scores) != len(baseline_scores):
        raise ValueError("The two lists must have the same length.")

    beqa_total = beqa_scores.sum()
    baseline_total = baseline_scores.sum()

    # A zero denominator is reported explicitly rather than yielding inf/nan.
    if baseline_total == 0:
        raise ZeroDivisionError("The sum of baseline similarities is zero.")

    return beqa_total / baseline_total


# Example usage: compute RI for two hand-written score lists.
if __name__ == "__main__":
    # Similarity scores for the same 5 queries under each approach
    # (the two lists must be the same length).
    sim_beqa_example = [0.82, 0.91, 0.78, 0.88, 0.85]
    sim_baseline_example = [0.80, 0.85, 0.74, 0.89, 0.80]

    # Ratio of the two sums; printed to three decimal places.
    ri_value = ratio_of_improvement(sim_beqa_example, sim_baseline_example)
    print(f"Ratio of Improvement (RI) = {ri_value:.3f}")

# **5. TF-IDF with BEQA**

In [None]:
import time
import json
from typing import List, Dict
from pydantic import BaseModel, ValidationError, Field
import requests
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# -------------------------------------------------------------------
# 1. Existing Pydantic Model + BEQA Transform Code (Groq)
# -------------------------------------------------------------------
class QAItem(BaseModel):
    """Schema for one question-answer pair; used to validate parsed model output."""
    # Both fields are declared with Field(...), i.e. without a default value.
    Question: str = Field(..., description="The transformed question")
    Answer: str = Field(..., description="The transformed answer")

THROTTLE_LIMIT = 3  # Seconds to sleep after a failed attempt before the next API call
MAX_RETRIES = 5     # Maximum number of attempts made to obtain a valid response

GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"  # Chat-completions endpoint that requests are POSTed to

def parse_groq_output(raw_output: str) -> List[Dict]:
    """
    Parse and validate the JSON text returned by the Groq model.

    The text must decode to a JSON array whose elements each validate against
    ``QAItem``. A surrounding markdown code fence (```` ``` ```` or
    ```` ```json ````) is removed before decoding, since models sometimes add
    one even when asked not to.

    Parameters
    ----------
    raw_output : str
        The model's reply text.

    Returns
    -------
    List[Dict]
        One ``{"Question": ..., "Answer": ...}`` dict per validated item, or
        an empty list when the text is not valid JSON, is not a list, or any
        item fails validation (a message is printed in those cases).
    """
    text = raw_output
    if isinstance(text, str):
        text = text.strip()
        if text.startswith("```"):
            # Drop the opening fence line (with optional language tag) ...
            _, _, text = text.partition("\n")
            text = text.rstrip()
            # ... and the closing fence, if present.
            if text.endswith("```"):
                text = text[:-3]

    try:
        data = json.loads(text)
        if not isinstance(data, list):
            print("Parsed data is not a list.")
            return []
        items = []
        for item in data:
            qa_item = QAItem(**item)  # Validate via Pydantic
            # Pydantic v2 renamed .dict() to .model_dump(); support both.
            to_dict = getattr(qa_item, "model_dump", None) or qa_item.dict
            items.append(to_dict())
        return items

    except (ValidationError, json.JSONDecodeError, TypeError) as e:
        # TypeError covers non-string input and non-mapping list elements.
        print(f"Failed to parse or validate JSON output: {e}")
        return []

def transform_with_groq(context: str, original_query: str, model_id: str, api_key: str,
                        timeout: float = 30.0) -> List[Dict]:
    """
    Transform retrieved context into structured QA pairs using the Groq API with retries.

    Up to ``MAX_RETRIES`` attempts are made. An attempt fails when the HTTP
    request errors out, the response body does not have the expected
    ``choices[0].message.content`` shape, or the content does not parse into
    valid QA items. ``THROTTLE_LIMIT`` seconds are slept between attempts.

    Parameters
    ----------
    context : str
        Retrieved document text to convert into QA pairs.
    original_query : str
        The user query, included in the prompt.
    model_id : str
        Model identifier sent in the request body.
    api_key : str
        Bearer token for the ``Authorization`` header.
    timeout : float, optional
        Seconds to wait for each HTTP request before treating it as failed.

    Returns
    -------
    List[Dict]
        Validated ``{"Question": ..., "Answer": ...}`` dicts, or an empty list
        when no attempt produced a valid result.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    prompt_template = f"""
You are given:
- Context: "{context}"
- Original Query: "{original_query}"

Your task:
1. Generate one or more question-answer pairs that capture key information from the context.
2. Output your result as valid JSON (no markdown) and use the following schema (an array of objects):
   [
     {{
       "Question": "...",
       "Answer": "..."
     }},
     ...
   ]

Constraints:
- Make sure the JSON is valid.
- No additional keys beyond 'Question' and 'Answer'.
- Do not wrap the output in quotes or markdown.
- If you are unsure, keep it simple.
"""

    # The request body is identical on every attempt, so build it once.
    payload = {
        "model": model_id,
        "messages": [
            {
                "role": "user",
                "content": prompt_template.strip()
            }
        ],
        "temperature": 0.7,
        "max_tokens": 512,
        "top_p": 1.0,
        "frequency_penalty": 0,
        "presence_penalty": 0
    }

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.post(GROQ_API_URL, headers=headers, json=payload, timeout=timeout)
            response.raise_for_status()
            raw_output = response.json()['choices'][0]['message']['content'].strip()
        except requests.exceptions.RequestException as e:
            print(f"[Attempt {attempt}] Error: {e}. Retrying...")
        except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
            # Body was not JSON or lacked choices[0].message.content as text.
            print(f"[Attempt {attempt}] Unexpected response format: {e!r}. Retrying...")
        else:
            qa_items = parse_groq_output(raw_output)
            if qa_items:
                return qa_items
            print(f"[Attempt {attempt}] Model output was not valid JSON or did not match schema. Retrying...")

        # Throttle between attempts; no need to wait after the last one.
        if attempt < MAX_RETRIES:
            time.sleep(THROTTLE_LIMIT)

    print("Maximum retries reached. No valid response obtained.")
    return []

# -------------------------------------------------------------------
# 2. Sample Data Builders
# -------------------------------------------------------------------
def build_queries_and_corpus(cleaned_data: List[dict]):
    """
    Split a list of sample dicts into parallel query and document lists.

    For every item, the 'Question' value goes into the first list and the
    'Context' value into the second; a missing or empty value becomes "".

    Returns
    -------
    tuple
        (queries, corpus), two lists of the same length as cleaned_data.
    """
    # Read both fields in a single pass over the input.
    pairs = [
        (item.get("Question") or "", item.get("Context") or "")
        for item in cleaned_data
    ]
    queries = [question for question, _ in pairs]
    corpus = [context for _, context in pairs]
    return queries, corpus


def vanilla_tfidf_retrieval(queries: List[str], corpus: List[str]):
    """
    Retrieve the single best-matching corpus document for each query.

    A TF-IDF vectorizer is fitted on the corpus; each query is projected into
    the same space and scored against every document by the dot product of
    their TF-IDF vectors.

    Returns
    -------
    tuple
        (sim_scores_list, best_doc_indices): for each query, the highest
        score found and the corpus index of the document that achieved it.
    """
    tfidf = TfidfVectorizer()
    doc_matrix = tfidf.fit_transform(corpus)  # shape=(#docs, #features)

    top_scores = []
    top_doc_ids = []
    for query in queries:
        # One row vector per query, scored against all documents at once.
        per_doc_scores = tfidf.transform([query]).dot(doc_matrix.T).toarray().flatten()
        winner = int(np.argmax(per_doc_scores))
        top_scores.append(float(per_doc_scores[winner]))
        top_doc_ids.append(winner)

    return top_scores, top_doc_ids

# -------------------------------------------------------------------
# 3. Integration: Use TF-IDF, Then BEQA, Then Similarity
# -------------------------------------------------------------------
if __name__ == '__main__':
    import os  # only this script section needs it

    # 'full_data' maps dataset_name -> List[dict]; each dict has
    # { "Question": ..., "Context": ..., "Response": ... }.
    # A small hand-written example is used here.
    full_data = {
        "squad": [
            {"Question": "When was Python created?",
             "Context": "Python is a language created by Guido van Rossum, first released in 1991.",
             "Response": "1991"},
            {"Question": "What does GIL stand for?",
             "Context": "Python's GIL stands for Global Interpreter Lock.",
             "Response": "Global Interpreter Lock"}
        ],
        "wiki_qa": [
            {"Question": "Who developed Python?",
             "Context": "Guido van Rossum developed Python.",
             "Response": "Guido van Rossum"}
        ]
    }

    # Groq credentials: read the key from the environment so it is never
    # committed with the notebook. Falls back to "" when the variable is unset.
    API_KEY = os.environ.get("GROQ_API_KEY", "")
    MODEL_ID = "llama-3.3-70b-versatile"

    for dataset_name, dataset_content in full_data.items():
        print(f"\n=== Processing dataset: {dataset_name} ===")
        queries, corpus = build_queries_and_corpus(dataset_content)

        # Step A: Use TF-IDF to retrieve the top doc for each query.
        # Fitting fails with ValueError on an empty corpus / empty vocabulary;
        # skip that dataset instead of aborting the whole run.
        try:
            similarities, best_doc_indices = vanilla_tfidf_retrieval(queries, corpus)
        except ValueError as exc:
            print(f"Skipping dataset (TF-IDF retrieval failed): {exc}")
            continue
        print("Baseline TF-IDF Similarities:", similarities)

        # Step B: Apply the BEQA transform to the retrieved doc for each query,
        #         then score the query against the generated QA questions.
        second_stage_similarities = []
        for q, doc_idx in zip(queries, best_doc_indices):
            top_doc = corpus[doc_idx]

            # 1) Transform the doc into QA pairs
            beqa_result = transform_with_groq(
                context=top_doc,
                original_query=q,
                model_id=MODEL_ID,
                api_key=API_KEY
            )

            # 2) Build a 'mini-corpus' from the generated 'Question' fields
            beqa_questions = [qa["Question"] for qa in beqa_result]
            if not beqa_questions:
                # No QA pairs were produced; record a zero score.
                second_stage_similarities.append(0.0)
                continue

            # 3) Use TF-IDF again, now on the QA "questions" from the model
            mini_vectorizer = TfidfVectorizer()
            try:
                mini_corpus_tfidf = mini_vectorizer.fit_transform(beqa_questions)
            except ValueError:
                # Generated questions contained no usable tokens.
                second_stage_similarities.append(0.0)
                continue
            query_vec = mini_vectorizer.transform([q])
            scores = query_vec.dot(mini_corpus_tfidf.T).toarray().flatten()
            best_score = float(np.max(scores))
            second_stage_similarities.append(best_score)

        print("BEQA-based Similarities:", second_stage_similarities)

        # Similarity to the "Answer" fields (or a combination) could be
        # measured instead, depending on the use case.

        # RI is undefined for empty input or an all-zero baseline; report
        # that instead of letting the exception end the loop.
        try:
            ri_value = ratio_of_improvement(second_stage_similarities, similarities)
        except (ValueError, ZeroDivisionError) as exc:
            print(f"Ratio of Improvement: undefined ({exc})")
        else:
            print("Ratio of Improvement:", ri_value)

# **6. Extension to Custom Datasets and Retrievers**

Below is a concise set of instructions for **extending** the code to a custom dataset (instead of the simple `full_data` dictionary) and a custom retriever (instead of the current TF-IDF pipeline).

---

## 1. Extending to a Custom Dataset

### A. Load Your Dataset

1. **Data Format**: Suppose your dataset is stored as a CSV or JSON file, or is fetched from an API. You want to produce a list of dictionaries like:
   ```python
   [
     {"Question": "some question", "Context": "some context", "Response": "some response"},
     ...
   ]
   ```
   at the end of your loading/parsing process.

2. **Write a Loader/Parsing Function**:
   - If your dataset is in CSV:
     ```python
     import csv

     def load_my_dataset(csv_file_path: str):
         """Read a CSV file into a list of Question/Context/Response dicts.

         The columns "QuestionColumn", "ContextColumn" and "ResponseColumn"
         are read; a column absent from the header yields "".
         """
         # newline='' leaves newline handling to the csv module, so line
         # breaks inside quoted fields are preserved as written.
         with open(csv_file_path, 'r', encoding='utf-8', newline='') as f:
             return [
                 {
                     "Question": row.get("QuestionColumn", ""),
                     "Context": row.get("ContextColumn", ""),
                     "Response": row.get("ResponseColumn", ""),
                 }
                 for row in csv.DictReader(f)
             ]
     ```
   - If your dataset is in JSON, adapt similarly with `json.load(...)`.

3. **Feed the Loaded Data into the Existing Pipeline**:
   - You can then place the loaded data into your `full_data` dictionary, e.g.:
     ```python
     my_data = load_my_dataset("my_questions.csv")
     full_data = {"my_dataset_name": my_data}
     ```
   - The rest of your code will handle the new dataset in the same loop where it processes `full_data`.

### B. Integrate with `build_queries_and_corpus`

- Once you have the list of dictionaries from your loader, you can call:
  ```python
  queries, corpus = build_queries_and_corpus(my_data)
  ```
- Then proceed with your retrieval+BEQA pipeline exactly as shown in the `if __name__ == '__main__':` section.

---

## 2. Extending to a Custom Retriever

The current code uses a **vanilla TF-IDF** retriever, defined by:

```python
def vanilla_tfidf_retrieval(queries: List[str], corpus: List[str]):
    # ...
    return sim_scores_list, best_doc_indices
```

### A. Creating a New Retriever

1. **Implementation**: Suppose you want to replace TF-IDF with a custom approach (BM25, DPR, or a specialized neural model). You need a function with the same interface:

   ```python
   def my_custom_retriever(queries: List[str], corpus: List[str]):
       """
       Accepts queries and a corpus of documents.
       Returns:
         sim_scores_list: a list of floats representing, for each query,
                          the highest similarity to a corpus doc.
         best_doc_indices: a list of ints where best_doc_indices[i] is the index
                           of the best doc for queries[i].
       """
       # (1) Build your retrieval system on 'corpus'
       # (2) For each query, find the doc with the highest similarity
       # ...
       return sim_scores_list, best_doc_indices
   ```

2. **Example:**
   - If you’re using **BM25** from `rank_bm25`, you’d do something like:
     ```python
     from rank_bm25 import BM25Okapi

     def my_custom_retriever(queries, corpus):
         """BM25 retriever: best score and best document index per query."""
         # Documents and queries are tokenised by plain whitespace splitting.
         bm25 = BM25Okapi([doc.split() for doc in corpus])

         sim_scores_list, best_doc_indices = [], []
         for query in queries:
             doc_scores = bm25.get_scores(query.split())
             top = int(np.argmax(doc_scores))
             sim_scores_list.append(float(doc_scores[top]))
             best_doc_indices.append(top)

         return sim_scores_list, best_doc_indices
     ```
   - If you’re using a neural approach, e.g., **Sentence-BERT**:
     ```python
     from sentence_transformers import SentenceTransformer
     import numpy as np

     def my_custom_retriever(queries, corpus):
         """Embedding retriever: best score and best document index per query."""
         model = SentenceTransformer('all-MiniLM-L6-v2')
         corpus_embeddings = model.encode(corpus, convert_to_numpy=True)
         query_embeddings = model.encode(queries, convert_to_numpy=True)

         sim_scores_list = []
         best_doc_indices = []
         for q_emb in query_embeddings:
             # Dot product of the query against every document embedding
             scores = np.dot(corpus_embeddings, q_emb)  # shape=(#docs,)
             # np.argmax returns a NumPy integer; cast to a plain int so the
             # result matches the documented "list of ints" interface.
             max_idx = int(np.argmax(scores))
             max_score = float(scores[max_idx])
             sim_scores_list.append(max_score)
             best_doc_indices.append(max_idx)

         return sim_scores_list, best_doc_indices
     ```
   - Adjust the similarity measure to your preference. Note that a raw dot product equals cosine similarity only when the embeddings are L2-normalized.

### B. Plug the New Retriever into the Pipeline

In the main code, instead of:

```python
similarities, best_doc_indices = vanilla_tfidf_retrieval(queries, corpus)
```

…just call:

```python
similarities, best_doc_indices = my_custom_retriever(queries, corpus)
```

Everything else (the **BEQA transform** and second-stage similarity) will remain the same—because your new retriever function still provides:

- `similarities`: top similarity scores for each query,
- `best_doc_indices`: index of the top document in `corpus` for each query.

---

## 3. Final Example Putting It All Together

```python
if __name__ == '__main__':
    # 1) Load the dataset from a CSV
    my_data = load_my_dataset("my_questions.csv")   # implement your CSV loader
    full_data = {"my_dataset": my_data}

    # 2) Use custom retriever
    for dataset_name, dataset_content in full_data.items():
        queries, corpus = build_queries_and_corpus(dataset_content)
        
        # Replace with your own function:
        similarities, best_doc_indices = my_custom_retriever(queries, corpus)

        # Then do the same transform + second-stage similarity as you do now
        # ...
```

---

**Summary**:

1. **Custom Dataset**  
   - Write a loader that returns a list of dicts, e.g. `[{"Question": Q, "Context": C, "Response": R}, ...]`.  
   - Integrate that list into your pipeline by calling `build_queries_and_corpus`.

2. **Custom Retriever**  
   - Implement a function that returns `(sim_scores_list, best_doc_indices)` from your chosen retrieval logic.  
   - Replace `vanilla_tfidf_retrieval` calls with your custom function.

Everything else—BEQA transforms, second-phase similarity, ratio-of-improvement calculations—remains the same. This approach ensures a **pluggable** architecture where you can swap out or expand the dataset source and the retrieval method.