# Creating evaluation dataset

Here we simply take our DIY dataset and generate five questions for each content item. These questions will be used to test the retrieval capabilities of the rag.

In [2]:
import pandas as pd 



diy_dataset = pd.read_csv('../DIY_dataset/final_dataset.csv')


In [38]:
diy_dataset = diy_dataset.dropna(subset='content')

In [26]:
def build_prompt(content):

    user_message = "Generate 5 general questions for the given content. Output only the 5 questions as a valid Python list. Do not include any explanation, comments, or extra text—only the Python list." + '\n The content is:\n ' + content
    return user_message

In [28]:
def create_system_message():
    system_message = """ 
    Generate 5 general questions for the given content. Output only the 5 questions as a valid Python list. Do not include any 
    explanation, comments, or extra text—only the Python list.


    """
    #use only facts from the context. if the context doesnt contain the answer, output NONE
    return system_message

In [29]:
def llm(user_message,system_message):
    import requests

    url = "http://localhost:1234/v1/chat/completions"
    headers = {
        "Content-Type": "application/json"
    }
    data = {
        "model": "Llama-3.2-3B-Instruct-GGUF",
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ],
        "temperature": 0,
        "max_tokens": -1,
        "stream": False
    }
    
    response = requests.post(url, headers=headers, json=data)
    return response.json()['choices'][0]['message']['content']
    

In [44]:
import ast
def generate_questions(content):
    user_message = build_prompt(content)
    system_message = create_system_message()
    answer = llm(user_message,system_message)
    return ast.literal_eval(answer)

In [45]:
import ast
import json
import pandas as pd
import time
from tqdm import tqdm

def safe_parse(answer):
    """Try parsing LLM output into Python object, fallback to raw string list."""
    try:
        return ast.literal_eval(answer)
    except Exception:
        try:
            return json.loads(answer)
        except Exception:
            return [answer]  # fallback if nothing works

def generate_questions(content, retries=3):
    """Query LLM and parse the result safely, retrying on errors."""
    user_message = build_prompt(content)
    system_message = create_system_message()
    for attempt in range(retries):
        try:
            answer = llm(user_message, system_message)
            return safe_parse(answer)
        except Exception as e:
            print(f"⚠️ Error on attempt {attempt+1}: {e}")
            time.sleep(2)  # small delay before retry
    return []  # give up after retries

def process_with_progress(diy_dataset, save_path="diy_dataset_progress.csv", checkpoint_interval=50):
    """
    Process dataset row by row with progress bar and checkpoint saving.
    
    - save_path: file to save progress (CSV).
    - checkpoint_interval: save every N rows.
    """
    results = []
    start_idx = 0

    # 🔄 Resume from checkpoint if available
    try:
        checkpoint = pd.read_csv(save_path)
        results = checkpoint['questions'].tolist()
        start_idx = len(results)
        print(f"Resuming from checkpoint at row {start_idx}...")
    except FileNotFoundError:
        print("No checkpoint found. Starting fresh...")

    # 🚀 Process rows with progress bar
    for i, content in enumerate(tqdm(diy_dataset['content'][start_idx:], initial=start_idx, total=len(diy_dataset))):
        results.append(generate_questions(content))

        # 💾 Save checkpoint every N rows
        if (i + start_idx + 1) % checkpoint_interval == 0:
            temp_df = diy_dataset.copy()
            temp_df['questions'] = results + [None] * (len(diy_dataset) - len(results))
            temp_df.to_csv(save_path, index=False)
            print(f"Checkpoint saved at row {i + start_idx + 1}.")

    # ✅ Final save
    diy_dataset['questions'] = results
    diy_dataset.to_csv(save_path, index=False)
    print("Processing complete. Final dataset saved.")

    return diy_dataset


In [46]:
diy_dataset = process_with_progress(diy_dataset, save_path="diy_dataset_progress.csv", checkpoint_interval=50)


No checkpoint found. Starting fresh...


  5%|███▋                                                                          | 50/1063 [05:19<1:55:33,  6.84s/it]

Checkpoint saved at row 50.


  9%|███████▏                                                                     | 100/1063 [10:33<1:49:45,  6.84s/it]

Checkpoint saved at row 100.


 12%|█████████▏                                                                   | 126/1063 [13:55<2:58:57, 11.46s/it]

⚠️ Error on attempt 1: 'choices'
⚠️ Error on attempt 2: 'choices'
⚠️ Error on attempt 3: 'choices'


 14%|██████████▊                                                                  | 150/1063 [17:58<1:35:45,  6.29s/it]

Checkpoint saved at row 150.


 19%|██████████████▍                                                              | 200/1063 [23:43<1:18:38,  5.47s/it]

Checkpoint saved at row 200.


 24%|██████████████████                                                           | 250/1063 [29:36<1:39:28,  7.34s/it]

Checkpoint saved at row 250.


 28%|█████████████████████▋                                                       | 300/1063 [35:34<1:27:07,  6.85s/it]

Checkpoint saved at row 300.


 33%|█████████████████████████▎                                                   | 350/1063 [40:54<1:40:56,  8.49s/it]

Checkpoint saved at row 350.


 38%|████████████████████████████▉                                                | 400/1063 [47:07<1:07:02,  6.07s/it]

Checkpoint saved at row 400.


 42%|████████████████████████████████▌                                            | 450/1063 [53:14<1:11:21,  6.98s/it]

Checkpoint saved at row 450.


 47%|████████████████████████████████████▏                                        | 500/1063 [59:39<1:01:40,  6.57s/it]

Checkpoint saved at row 500.


 52%|██████████████████████████████████████▊                                    | 550/1063 [1:05:47<1:12:12,  8.45s/it]

Checkpoint saved at row 550.


 56%|███████████████████████████████████████████▍                                 | 600/1063 [1:12:39<53:33,  6.94s/it]

Checkpoint saved at row 600.


 61%|█████████████████████████████████████████████▊                             | 650/1063 [1:19:28<1:03:52,  9.28s/it]

Checkpoint saved at row 650.


 63%|███████████████████████████████████████████████▎                           | 670/1063 [1:22:36<1:39:24, 15.18s/it]

⚠️ Error on attempt 1: 'choices'
⚠️ Error on attempt 2: 'choices'
⚠️ Error on attempt 3: 'choices'


 63%|███████████████████████████████████████████████▎                           | 671/1063 [1:22:48<1:33:35, 14.33s/it]

⚠️ Error on attempt 1: 'choices'
⚠️ Error on attempt 2: 'choices'
⚠️ Error on attempt 3: 'choices'


 66%|██████████████████████████████████████████████████▋                          | 700/1063 [1:26:53<43:17,  7.16s/it]

Checkpoint saved at row 700.


 71%|██████████████████████████████████████████████████████▎                      | 750/1063 [1:34:56<34:34,  6.63s/it]

Checkpoint saved at row 750.


 75%|█████████████████████████████████████████████████████████▉                   | 800/1063 [1:40:53<37:50,  8.63s/it]

Checkpoint saved at row 800.


 80%|█████████████████████████████████████████████████████████████▌               | 850/1063 [1:46:10<21:24,  6.03s/it]

Checkpoint saved at row 850.


 85%|█████████████████████████████████████████████████████████████████▏           | 900/1063 [1:51:22<16:49,  6.19s/it]

Checkpoint saved at row 900.


 89%|████████████████████████████████████████████████████████████████████▊        | 950/1063 [1:57:41<13:27,  7.15s/it]

Checkpoint saved at row 950.


 94%|███████████████████████████████████████████████████████████████████████▍    | 1000/1063 [2:04:12<08:29,  8.08s/it]

Checkpoint saved at row 1000.


 99%|███████████████████████████████████████████████████████████████████████████ | 1050/1063 [2:09:39<01:20,  6.17s/it]

Checkpoint saved at row 1050.


100%|████████████████████████████████████████████████████████████████████████████| 1063/1063 [2:11:05<00:00,  7.40s/it]

Processing complete. Final dataset saved.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diy_dataset['questions'] = results


In [2]:
import pandas as pd
diy_dataset_evaluation = pd.read_csv('diy_dataset_progress.csv')

In [3]:
import ast

diy_dataset_evaluation['questions'] = diy_dataset_evaluation['questions'].apply(ast.literal_eval)


In [5]:
diy_dataset_evaluation.head(1)

Unnamed: 0,start_time,end_time,playlist_title,playlist_id,video_title,video_id,chapter_title,chapter_id,content,clip_link,questions
0,0.0,42.0,Electrical,PLemm4hCbY93PD9g4X5LQRwQlvgiLIjNvP,3 Way Switch,LDyvcM0gcVs,3 Way Switch: Introduction,9d4c4c4a-be25-0454-f575-143687f39194,[music] oh check it out three-way switches tha...,https://youtu.be/LDyvcM0gcVs?t=0,"[What are three-way switches used for?, How do..."


In [4]:
diy_dataset_evaluation['questions'].loc[3]

['What are the typical types of wires used for home wiring?',
 'How do three-way switches use four conductors?',
 'What is the purpose of a bare copper wire in electrical wiring?',
 'Why is an extra red wire useful when working with three-way switches?',
 'How do the white, black, and neutral wires work together to power lights?']

In [6]:
id_and_questions = []

for index, row in diy_dataset_evaluation.iterrows():
    for q in row['questions']:
        id_and_questions.append((row['chapter_id'],q))

In [8]:
id_and_questions_df = pd.DataFrame(id_and_questions,columns=['id','questions'])

In [9]:
id_and_questions_df.head(4)

Unnamed: 0,id,questions
0,9d4c4c4a-be25-0454-f575-143687f39194,What are three-way switches used for?
1,9d4c4c4a-be25-0454-f575-143687f39194,How do three-way switches work?
2,9d4c4c4a-be25-0454-f575-143687f39194,Can anyone install three-way switches?
3,9d4c4c4a-be25-0454-f575-143687f39194,Are three-way switches commonly found in homes?


In [10]:
id_and_questions_df.to_csv('ground_truth_retrieval.csv',index=False)

# Retrieval Evaluation

In [53]:
evaluation_dataset = pd.read_csv('diy_dataset_progress.csv')
evaluation_dataset = evaluation_dataset.drop(columns='questions')
evaluation_dataset = evaluation_dataset.to_dict(orient='records')

In [54]:
evaluation_dataset[0]

{'start_time': 0.0,
 'end_time': 42.0,
 'playlist_title': 'Electrical',
 'playlist_id': 'PLemm4hCbY93PD9g4X5LQRwQlvgiLIjNvP',
 'video_title': '3 Way Switch',
 'video_id': 'LDyvcM0gcVs',
 'chapter_title': '3 Way Switch: Introduction',
 'chapter_id': '9d4c4c4a-be25-0454-f575-143687f39194',
 'content': "[music] oh check it out three-way switches that's what we're talking about wiring three-way switches hi i'm tim carter. And first i want to let you know a little safety note we're working with high voltage electric the same thing that you have in your own home whenever you work with electricity make sure it's turned off i don't want you to get electrocuted three-way switches can be found in many homes. And they're very useful they allow you to control a light or maybe several lights from two different locations. But the problem is they look a lot different than a regular switch a three-way switch",
 'clip_link': 'https://youtu.be/LDyvcM0gcVs?t=0'}

In [55]:
import minsearch
index = minsearch.Index(
    text_fields = ['chapter_title','content'],
    keyword_fields = ["playlist_title"]
)
index.fit(evaluation_dataset)


<minsearch.minsearch.Index at 0x28909423830>

In [13]:
df_questions = pd.read_csv('ground_truth_retrieval.csv')

In [15]:
ground_truth = df_questions.to_dict(orient='records')

In [17]:
ground_truth[0]

{'id': '9d4c4c4a-be25-0454-f575-143687f39194',
 'questions': 'What are three-way switches used for?'}

In [14]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)
    
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [83]:
def minsearch_search(query):
    
    boost = {}
    results = index.search(
        query=query,
        #filter_dict={'course': course},
        boost_dict=boost,
        num_results=10
    )

    return results

In [84]:
from tqdm.auto import tqdm
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['chapter_id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [85]:
evaluate(ground_truth, lambda q: minsearch_search(q['questions']))


  0%|          | 0/5008 [00:00<?, ?it/s]

{'hit_rate': 0.5736821086261981, 'mrr': 0.347409145367411}

# Finding the best parameters for boosting

In [87]:
df_validation =  df_questions[:1000]
df_test = df_questions[1000:]

In [88]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') for maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)

        # Evaluate the objective function
        current_score = objective_function(current_params)

        # Update best score and parameters if this is better
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score





In [89]:
gt_val = df_validation.to_dict(orient='records')

In [90]:
def minsearch_search(query,boost=None):
    #boost = {'content': 50.0,'chapter_title':2.0}
    if boost is None:
        boost = {}
    results = index.search(
        query=query,
        #filter_dict={'course': course},
        boost_dict=boost,
        num_results=10
    )

    return results

In [95]:
param_ranges = {'playlist_title': (0.0,6.0),
 'video_title': (0.0,6.0),
 'chapter_title': (0.0,6.0),
 'content': (0.0,15.0)
               
               }

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['questions'], boost_params)

    results = evaluate(gt_val,search_function)
    return results['mrr']
        

In [99]:
simple_optimize(param_ranges, objective, n_iterations=100)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

({'playlist_title': 3.4311350452088907,
  'video_title': 3.6054325902612927,
  'chapter_title': 1.3418523035711323,
  'content': 12.3649113626517},
 0.5781781746031744)

# Minsearch improved (optimized retrieval)

In [100]:
def minsearch_improved(query):
    
    boost ={'playlist_title': 3.4311350452088907,
  'video_title': 3.6054325902612927,
  'chapter_title': 1.3418523035711323,
  'content': 12.3649113626517}
    
    results = index.search(
        query=query,
        #filter_dict={'course': course},
        boost_dict=boost,
        num_results=10
    )

    return results

In [101]:
evaluate(ground_truth, lambda q: minsearch_improved(q['questions']))


  0%|          | 0/5008 [00:00<?, ?it/s]

{'hit_rate': 0.805111821086262, 'mrr': 0.5459778639890454}