### Setting up Env

In [2]:
import os
from dotenv import load_dotenv
import pandas as pd
from datasets import Dataset
from datasets import load_dataset

load_dotenv(override=True)

  from .autonotebook import tqdm as notebook_tqdm


True

### Downloading dataset

In [3]:
# Load from hub
ds_vejledninger = load_dataset(
    "jealk/dk_retrieval_benchmark",
    "retsinformation",
    split="train",
    #download_mode="force_redownload",
)

In [4]:
# Create pandas dataframe from the dataset using the huggingface datasets library
df_vejledninger = ds_vejledninger.to_pandas()
df_vejledninger.head()

Unnamed: 0,url,title,html_content,text_content
0,https://www.retsinformation.dk/eli/retsinfo/20...,Vejledning om regulering af satser fra 1. janu...,"<div class=""document-content"" id=""restylingRoo...",Vejledning om regulering af satser fra 1. janu...
1,https://www.retsinformation.dk/eli/retsinfo/20...,Vejledning om satser i 2024 for betaling af ud...,"<div class=""document-content"" id=""restylingRoo...",Vejledning om satser i 2024 for betaling af ud...
2,https://www.retsinformation.dk/eli/retsinfo/20...,Vejledning om obligatorisk selvbooking af jobs...,"<div class=""document-content"" id=""restylingRoo...",Vejledning om obligatorisk selvbooking af jobs...
3,https://www.retsinformation.dk/eli/retsinfo/20...,Vejledning til bekendtgørelse om tilskud til s...,"<div class=""document-content"" id=""restylingRoo...",Vejledning til bekendtgørelse om tilskud til s...
4,https://www.retsinformation.dk/eli/retsinfo/20...,Vejledning om fleksløntilskud m.v.,"<div class=""document-content"" id=""restylingRoo...",Vejledning om fleksløntilskud m.v.\n1.Indledni...


### Chunking text data

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-base-v2")

def token_length_function(text_input):
  return len(tokenizer.encode(text_input, add_special_tokens=False))

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 512,
    chunk_overlap  = 0,
    length_function = token_length_function,
    separators = ["\n\n", "\n", ". ", "? ", "! "]
)

Token indices sequence length is longer than the specified maximum sequence length for this model (1350 > 512). Running this sequence through the model will result in indexing errors


In [6]:
#For some reason, Langchains text splitter is horribly slow (compared to llamaindex) takes 2+ minutes to run on my CPU
split_documents = text_splitter.create_documents(list(df_vejledninger["text_content"]), metadatas = [{"title": title} for title in df_vejledninger["title"]])

### Filtering contexts

#### Filtering using TextDescriptives

In [7]:
import textdescriptives as td
import spacy
from typing import List, Dict, Optional
import os

#add optional meta data, list of dicts
def filter_text_by_td(text_list: List[str], filter_type: bool=True) -> List[str]:
    """Filter nodes by the textdescriptives quality check

    Args:
    text_list> a list of stext strings
    fiter_type: A boolean defining whether to filter by texts that passed (True) or failed (False) the textdescriptives quality check

    Returns:
    A list of text chunks that passed the textdescriptives quality check
    """
    nlp = spacy.blank("da")
    nlp.add_pipe("sentencizer")
    quality_pipe = nlp.add_pipe("textdescriptives/quality")
    docs = list(nlp.pipe(text_list))
    filtered_texts = [doc.text for doc in docs if doc._.passed_quality_check==filter_type]
    
    return filtered_texts

In [8]:
#Sample 300 texts
texts_passed_td = filter_text_by_td([text.page_content for text in split_documents[0:300]])
docs_passed_td = [doc for doc in split_documents if doc.page_content in texts_passed_td]

#### Filtering using LLM

In [9]:
import json
import logging
from typing import Dict, Any
from tqdm import tqdm  # Import tqdm

from openai import OpenAI
client = OpenAI()

def q_eval_system_prompt():
    sys_prompt = """Din opgave er at evaluere et givet tekstuddrag for at bestemme, om det er egnet til at danne grundlag for et generelt spørgsmål, der er relevant for eksempelvis en eksamen eller en test. 
    For at vurdere dette, skal du fokusere på følgende tre nøglekriterier:

    1. Klarhed: Vurder, om teksten er formuleret klart og direkte, således at et spørgsmål til denne tekst, vil kunne besvares uden yderligere forklaringer. Teksten skal være læsbar og ikke usammenhængende i sin struktur.
    
    2. Konkret Information: Afgør, om uddraget indeholder specifikke, faktuelle informationer, der kan danne grundlag for et præcist og direkte spørgsmål. Teksten skal præsentere håndgribelige fakta eller data, som et spørgsmål kan baseres på.

    3. Kontekstuel Helhed: Bedøm, om teksten leverer tilstrækkelig kontekst for at et spørgsmål baseret på uddraget vil være meningsfuldt og forståeligt uden behov for yderligere information. Teksten skal være selvstændig og give en fuld forståelse af det emne, der behandles.

    Baseret på din evaluering:

    - Tildel scoren 1, hvis tekstuddraget opfylder alle tre kriterier, og der kan formuleres et naturligt, klart og kontekstuelt meningsfuldt spørgsmål baseret på teksten.

    - Tildel scoren 0, hvis tekstuddraget ikke opfylder et eller flere af de ovenstående kriterier, hvilket gør det uegnet til at danne grundlag for et generelt spørgsmål.
    """
    return sys_prompt

def q_eval_user_prompt(text: str) -> str:
    """Prepare the prompt for the API call."""
    
    qa_egnet_tmlp = """Du er en erfaren sagsbehandler. 
    Din Opgave:
    Vurder det følgende tekstuddrag og angiv, om det er egnet til at stille et generelt spørgsmål til.

    Uddrag:
    {chunk_text}
    
    Returner din vurdering i følgende JSON-format:

    {{
    "llm_score": [indsæt enten 0 eller 1 her]
    }}
    """
    return qa_egnet_tmlp.format(chunk_text=text)


def json_api_call(system_prompt: str, user_prompt: str, oai_model: str="gpt-3.5-turbo-0125") -> Dict[str, Any]:
    """Perform the API call to evaluate the text."""
    try:
        completion = client.chat.completions.create(
            model=oai_model,
            temperature=0,
            messages=[
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user", 
                    "content": user_prompt
                },
            ],
            response_format={"type": "json_object"}
        )
        return json.loads(completion.choices[0].message.content)
    except json.JSONDecodeError as e:
        logging.error(f'JSON parsing failed: {e}')
    except Exception as e:
        logging.error(f'API call failed: {e}')
    return {}

def filter_text_by_llm(text_list: List[str]) -> List[str]:
    """Filter text chunks by a LLM quality check
    
    Args: A list of text strings
    
    Returns: A list of text chunks that passed the LLM quality check
    """
    texts_passed_llm = []
    system_prompt = q_eval_system_prompt()
    for text in tqdm(text_list, desc="Evaluating texts"):
        user_prompt = q_eval_user_prompt(text)
        response = json_api_call(system_prompt, user_prompt)
        if response:
            if response['llm_score'] == 1:
                texts_passed_llm.append(text)
            else:
                continue
        else:
            logging.error(f'Failed to evaluate below text due to an earlier error. \n {text}')
    return texts_passed_llm

In [10]:
#Sample just 50 texts
texts_passed_llm = filter_text_by_llm([text.page_content for text in docs_passed_td[:50]])
docs_passed_llm = [doc for doc in split_documents if doc.page_content in texts_passed_llm]

Evaluating texts: 100%|██████████| 50/50 [00:47<00:00,  1.05it/s]


### Generating Questions

In [11]:
def generate_question_template(text: str, num_q: int=1) -> str:
    question_tmlp = """Nedenfor er et uddrag (kontekst) fra en længere tekst:
    ---------------------
    {context_str}
    ---------------------
    Givet ovenstående uddrag og ingen forudgående viden, er din opgave at generere præcis {num_questions_per_chunk} spørgsmål til teksten.
    En sætning skal kun indeholde 1 spørgsmål, og spørgsmålet skal være formuleret kort og præcist. 
    Svaret til spørgsmålet, skal kunne findes i ovenstående uddrag.
    Spørgsmålet skal indeholde specifik kontekst, således at spørgsmålet efterfølgende kan besvares entydigt og uden kendskab til uddraget. 
    Spørgsmålene skal stilles i et sprog som en borger uden juridisk ekspertise kan forstå.

    Eksempel på et spørgsmål der ikke har en specifik kontekst, og som fejlagtigt indeholder 2 spørgsmål i 1 sætning: 
    "Hvilket dokument har den nye vejledning erstattet, og hvornår blev den udsendt?" -Da det ikke angivet hvilket dokument der er tale om, og derfor er svaret til spørgsmålet ikke entyidgt, uden kendskab til uddraget. Sætningen indeholder desuden 2 spørgsmål i samme sætning. 

    Eksempel på et godt spørgsmål, som kan besvares entydigt uden kendskab til uddraget:
    "Hvilke to indbetalinger udgør det samlede medlemsbidrag til en a-kasse?" - Da det er klart hvad der spørges om, og der kun er 1 rigtigt svar i den givne lovtekst.
    """
    return question_tmlp.format(context_str=text, num_questions_per_chunk=num_q)

In [12]:
def question_api_call(user_prompt: str, oai_model: str="gpt-4-0125-preview") -> Dict[str, Any]:
    """Perform the API call to evaluate the text."""
    try:
        completion = client.chat.completions.create(
            model=oai_model,
            temperature=0,
            messages=[
                {
                    "role": "system",
                    "content": "Din opgave er at stille præcise spørgsmål til et givet tekstuddrag og returnere en JSON med en liste af spørgsmål i formatet {{Q: [spørgsmål1, spørsmål2, ...}}."
                },
                {
                    "role": "user", 
                    "content": user_prompt
                },
            ],
            response_format={"type": "json_object"}
        )
        return json.loads(completion.choices[0].message.content)
    except json.JSONDecodeError as e:
        logging.error(f'JSON parsing failed: {e}')
    except Exception as e:
        logging.error(f'API call failed: {e}')
    return {'Q': 'API error'}

In [13]:
from typing import Dict, List, Tuple
import uuid
from tqdm import tqdm
from langchain_core.documents import Document  # Import statement assumed; adjust based on actual import path

class QuestionContextManager:
    """
    Manages a collection of questions and their associated context chunks as Document objects.
    Allows for adding questions with contexts and displaying a specified number of these question-context pairs.
    """

    def __init__(self):
        self.questions: Dict[str, Document] = {}
        self.contexts: Dict[str, Document] = {}
        self.question_context_id_pairs: Dict[str, List[str]] = {}

    def add_question_context(self, question: Document, context: Document):
        """
        Adds a question and its associated context (both as Document objects) to the manager.
        Generates unique IDs for both the question and the context, storing them and their association.

        Parameters:
        - question (Document): The Document object containing the question.
        - context (Document): The Document object containing the context.
        """
        unique_question_id = str(uuid.uuid4())
        unique_context_id = str(uuid.uuid4())
        self.questions[unique_question_id] = question
        self.contexts[unique_context_id] = context
        self.question_context_id_pairs[unique_question_id] = [unique_context_id]

    @property
    def question_context_pairs(self) -> List[Tuple[Document, List[Document]]]:
        """
        Returns a list of tuples, each containing a question Document and a list of its associated context Documents.
        """
        return [(self.questions[qid], [self.contexts[cid] for cid in self.question_context_id_pairs[qid]]) for qid in self.questions]

    def display_question_context_pairs(self, num_pairs: int = None):
        """
        Displays a specified number of question-context pairs. If no number is specified, all pairs are displayed.

        Parameters:
        - num_pairs (int, optional): The number of question-context pairs to display. If None, all pairs are displayed. Defaults to None.
        """
        displayed_pairs = 0
        for q_id, context_ids in self.question_context_id_pairs.items():
            if num_pairs is not None and displayed_pairs >= num_pairs:
                break

            question = self.questions[q_id]
            print(f"Question: {question.page_content}")
            for c_id in context_ids:
                context = self.contexts[c_id]
                print(f"\nContext: {context.page_content}")
            print("-" * 40)  # Separator for readability
            displayed_pairs += 1

    def filter_questions_by_length(self, min_length: int = 20, max_length: int = 150):
        """
        Filters out questions that do not fall within the specified minimum and maximum character length.
        Updates the object by removing questions and their associated contexts that do not meet the criteria.

        Parameters:
        - min_length (int): The minimum character length for questions to be kept. Default to 20.
        - max_length (int): The maximum character length for questions to be kept. Default to 150.
        """
        questions_to_remove = [q_id for q_id, question in self.questions.items()
                               if not (min_length <= len(question.page_content) <= max_length)]

        # Remove the questions and question_context pairs
        for q_id in questions_to_remove:
            del self.questions[q_id]
            del self.question_context_id_pairs[q_id]

        # Identify contexts that are no longer linked to any questions
        contexts_to_remove = {context_id for context_id in self.contexts
                              if all(context_id not in contexts for contexts in self.question_context_id_pairs.values())}

        # Remove these contexts
        for context_id in contexts_to_remove:
            del self.contexts[context_id]

        print(f"Removed {len(questions_to_remove)} questions.")

    def __repr__(self):
        return f"<QuestionContextManager with {len(self.questions)} questions>"

In [14]:
def generate_questions(textContexts: List[Document], num_questions: int = 1, oai_model: str = "gpt-4-0125-preview", duplicate_metadata: bool = True) -> QuestionContextManager:
    """
    Generates questions from a list of context Documents and returns a QuestionContextManager
    containing the generated questions and their contexts.

    Parameters:
    - contexts (List[Document]): A list of Document objects to generate questions from.
    - num_questions (int): Number of questions to generate per context. Default is 1.
    - oai_model (str): The model to use for generating questions. Default is "gpt-4-0125-preview".
    - duplicate_metadata (bool): If True, duplicate the metadata from context to the generated questions.

    Returns:
    QuestionContextManager: An object containing the generated questions and their contexts.
    """
    result = QuestionContextManager()
    for context in tqdm(textContexts):
        question_prompt = generate_question_template(context.page_content, num_questions)
        response = question_api_call(question_prompt, oai_model)  
        try:
            questions = response['Q']
            for question_text in questions:
                question_document = Document(page_content=question_text.strip(), metadata=context.metadata if duplicate_metadata else {})
                result.add_question_context(question_document, context)
        except KeyError as e:
            print(f'Error parsing json response: {e}')
    return result

In [15]:
#Generate questions for a sub-sample of the passed documents
qc_meta = generate_questions(docs_passed_llm[:10])

100%|██████████| 10/10 [00:31<00:00,  3.11s/it]


### Question filtering

In [47]:
qc_meta.filter_questions_by_length()
qc_meta.display_question_context_pairs(3)

Removed 0 questions.
Question: Hvem skal regulere løbende erstatninger tilkendt før 1. januar 2024?

Context: De private arbejdsskadeforsikringsselskaber samt de arbejdsgivere, der er fritaget for at afgive risikoen efter loven, skal selv regulere løbende erstatninger, som er tilkendt før 1. januar 2024. Ved løbende erstatninger tilkendt i 2024 vil det fremgå af Arbejdsmarkedets Erhvervssikrings afgørelse, hvilke beløb, der skal udbetales i 2024.
----------------------------------------
Question: Hvordan beregnes grundlønnen for løbende erstatninger ifølge Arbejdstilsynets bilag fra den 5. januar 2024?

Context: Arbejdstilsynet, den 5. januar 2024
Sine Frederiksen
/ Helle Klostergaard Christensen
Bilag 1
Bilaget indeholder eksempler på beregninger af kapitalerstatninger, godtgørelsesbeløb og overgangsbeløb samt løbende erstatninger og godtgørelser, som tilskadekomne eller dennes efterladte har ret til efter lov om arbejdsskadesikring, lov om sikring mod følger af arbejdsskade, lov om a

### Updating the question-context pairs

In [92]:
import chromadb
from chromadb.utils import embedding_functions

chroma_client = chromadb.Client()

collection_name = "qc_collection"

# Check if the collection already exists
if chroma_client.get_collection(collection_name):
    # If it does, delete the existing collection
    chroma_client.delete_collection(collection_name)

db_collection = chroma_client.create_collection(
    name=collection_name,
    embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction('intfloat/multilingual-e5-base', normalize_embeddings=True),
    metadata={"hnsw:space": "cosine"}
) 

In [93]:
def add_qc_to_chroma(question_context_obj: QuestionContextManager, chroma_collection, question_prepend: str = "query:", context_prepend: str = "passage:"):
    # Extracting question documents and their IDs
    """
    question_documents = list(question_context_obj.questions.values())
    question_texts = [f'{question_prepend} {doc.page_content}' for doc in question_documents]
    question_ids = list(question_context_obj.questions.keys())
    
    # Assuming each Document can carry its own metadata, we can enrich the ChromaDB metadata with it
    question_metadatas = [{"type": "question", **doc.metadata} for doc in question_documents]
    
    chroma_collection.add(
        documents=question_texts,
        ids=question_ids,
        metadatas=question_metadatas
    )
    """
    # Extracting context documents and their IDs
    context_documents = list(question_context_obj.contexts.values())
    context_texts = [f'{context_prepend} {doc.page_content}' for doc in context_documents]
    context_ids = list(question_context_obj.contexts.keys())
    
    # Assuming each Document can carry its own metadata, we can enrich the ChromaDB metadata with it
    context_metadatas = [{"type": "context", **doc.metadata} for doc in context_documents]
    
    chroma_collection.add(
        documents=context_texts,
        ids=context_ids,
        metadatas=context_metadatas
    )

In [None]:
add_qc_to_chroma(qc_meta, db_collection)

In [166]:
def filter_context_candidates(chroma_db_collection, question_context_object: QuestionContextManager, top_k: int = 5, dist_threshold: float = 0, include_origin_context: bool = False) -> Dict[str, List[str]]:
    """
    Filters context candidates for each question based on similarity scores and optionally includes the original context.

    This function queries a database collection for context candidates related to a set of questions. It filters these
    candidates based on a distance threshold compared to the ground truth context's distance. The function can include
    the original context in the results if specified.

    Parameters:
    - chroma_db_collection: The database collection to query for context candidates.
    - question_context_object: An object containing question and context information.
    - top_k: The number of top results to consider from the query.
    - dist_threshold: The threshold for including additional contexts based on their distance from the ground truth context.
    - include_origin_context: A boolean to indicate whether the original context should be included in the results.

    Returns:
    - A dictionary mapping each question ID to a list of filtered context candidate IDs.
    """
    
    query_filtered = {}

    questions_list = [doc.page_content for doc in question_context_object.questions.values()]

    batch_query_result = chroma_db_collection.query(
        query_texts=questions_list,
        where={"type": "context"},
        n_results=top_k
    )

    for idx, (q_id, q_document) in enumerate(question_context_object.questions.items()):
        query_id_list = batch_query_result['ids'][idx]
        query_distances_list = batch_query_result.get('distances', [])[idx]

        ground_truth_id = question_context_object.question_context_id_pairs[q_id][0]  # Assuming one ground truth per question

        if ground_truth_id in query_id_list:
            gt_idx = query_id_list.index(ground_truth_id)
            ground_truth_distance = query_distances_list[gt_idx]
            # Adjust the starting point based on whether the original context is included
            start_idx = gt_idx if include_origin_context else gt_idx + 1
            context_ids = []

            # Include IDs that are within the distance threshold
            for id_, distance in zip(query_id_list[start_idx:], query_distances_list[start_idx:]):
                if abs(distance - ground_truth_distance) <= dist_threshold:
                    context_ids.append(id_)
            # Optionally include the ground truth ID if required
            if include_origin_context:
                context_ids.insert(0, ground_truth_id)
        else:
            context_ids = query_id_list if include_origin_context else []

        query_filtered[q_id] = context_ids

    return query_filtered


In [179]:
context_candidates_id = filter_context_candidates(db_collection, qc_meta, dist_threshold=0.05, include_origin_context=False)
context_candidates_id = {k: v for k, v in context_candidates_id.items() if v}
context_candidates_id

{'65e34c14-bf1b-4594-9ea7-21c8a67e5a86': ['ec1a738b-1435-4668-ac4b-1e179f2e2b92'],
 '3fa3d7dc-9349-44fd-b4fb-cc7574f0447b': ['6cbfa00d-3790-46de-8825-6d541c519b1e'],
 'd1ba2c32-f0a0-46b8-a307-eda8255e9afb': ['b6119300-09d2-498f-8d65-a15a0ff27eab',
  '29839634-7813-48b9-bfc1-235465dd8f05'],
 '9e05aa95-bce9-4c8d-a8ec-41a349d407aa': ['6cbfa00d-3790-46de-8825-6d541c519b1e',
  '29839634-7813-48b9-bfc1-235465dd8f05',
  '447af735-1c24-4bec-ae94-4b2f21156143'],
 'ba322d25-8737-4dea-ba40-080785cdd637': ['6cbfa00d-3790-46de-8825-6d541c519b1e',
  'b6119300-09d2-498f-8d65-a15a0ff27eab',
  '447af735-1c24-4bec-ae94-4b2f21156143'],
 '7e0964e1-49d9-4a4e-afb6-1f050c1388f5': ['b6119300-09d2-498f-8d65-a15a0ff27eab'],
 'c92d3b46-6ee6-452e-bbee-4da8f22c070d': ['b59d5231-9923-43d7-990c-9b9eb2005375',
  '447af735-1c24-4bec-ae94-4b2f21156143',
  '34ff745b-2f65-4947-8975-3b0340b255fa',
  '6cbfa00d-3790-46de-8825-6d541c519b1e']}

### Using LLM to assess context candidates

In [180]:
def c_eval_system_prompt():
    sys_prompt = """Din opgave er at evaluere hvorvidt et givent tekstuddrag indeholder svaret til et spørgsmål. Du skal alene vurdere om uddraget indeholder svaret, og ikke om svaret er korrekt.

    - Tildel scoren 1, hvis tekstuddraget indeholder svaret til spørgsmålet.

    - Tildel scoren 0, hvis tekstuddraget ikke kan bruges til at besvare spørgsmålet.
    """
    return sys_prompt

def c_eval_user_prompt(question: str, context: str) -> str:
    """Prepare the prompt for the API call."""
    
    qa_egnet_tmlp = """Din Opgave:
    
    Vurder om følgende spørgsmål kan besvares ud fra den givne kontekst i tekstuddraget:
    
    spørgsmål:
    {insert_question}
    
    tekstuddrag:
    {insert_context}
    
    Returner din vurdering i følgende JSON-format:

    {{
    "context_score": [indsæt enten 0 eller 1 her]
    }}
    """
    return qa_egnet_tmlp.format(insert_question=question, insert_context=context)


def context_question_assesment(context_candidates, question_context_object: QuestionContextManager) -> Dict[str, List[str]]:
    """
    Iterates over the context candidate texts and uses a LLM call to assess whether the context matches the corresponding question
    """
    question_context_matches = {}
    system_prompt = c_eval_system_prompt()
    
    for q_id, c_id_list in tqdm(context_candidates.items()):
        question_text = question_context_object.questions[q_id].page_content
        for c_id in c_id_list:
            context_text = question_context_object.contexts[c_id].page_content
            user_prompt = c_eval_user_prompt(question=question_text, context=context_text)
            response = json_api_call(system_prompt, user_prompt)
            if response:
                if response['context_score'] == 1:
                    if q_id not in question_context_matches:
                        question_context_matches[q_id] = [c_id]
                    else:
                        question_context_matches[q_id].append(c_id)
                else:
                    continue
            else:
                logging.error(f'Failed to evaluate below text due to an earlier error. \n {text}')
    return question_context_matches

In [181]:
question_context_matches = context_question_assesment(context_candidates_id, qc_meta)

100%|██████████| 7/7 [00:11<00:00,  1.69s/it]


In [186]:
# Function to append the filtered question-context matches to the existing qc_meta.question_context_id_pairs
def update_question_context_pairs(q_c_to_append, question_context_object: QuestionContextManager):
    for q_id, c_id_list in q_c_to_append.items():
        if q_id in question_context_object.question_context_id_pairs:
            # Create a set from the existing IDs for quick lookup
            existing_ids_set = set(question_context_object.question_context_id_pairs[q_id])
            # Filter out duplicates while preserving order
            filtered_c_id_list = [c_id for c_id in c_id_list if c_id not in existing_ids_set]
            # Extend the existing list with the filtered, non-duplicate IDs
            question_context_object.question_context_id_pairs[q_id].extend(filtered_c_id_list)
        else:
            # Directly assign the list if the q_id is not already present
            question_context_object.question_context_id_pairs[q_id] = c_id_list

In [190]:
update_question_context_pairs(question_context_matches, qc_meta)
qc_meta.question_context_id_pairs

{'235d1d09-2db0-47b0-9812-efef4d39b618': ['ec1a738b-1435-4668-ac4b-1e179f2e2b92'],
 '65e34c14-bf1b-4594-9ea7-21c8a67e5a86': ['cecc6833-2e9d-4275-821f-0f6d2103cf56'],
 '3fa3d7dc-9349-44fd-b4fb-cc7574f0447b': ['6cbfa00d-3790-46de-8825-6d541c519b1e',
  '29839634-7813-48b9-bfc1-235465dd8f05'],
 'd1ba2c32-f0a0-46b8-a307-eda8255e9afb': ['6cbfa00d-3790-46de-8825-6d541c519b1e'],
 '9e05aa95-bce9-4c8d-a8ec-41a349d407aa': ['b316eda5-1236-44ce-a3bd-108b8f861731',
  '6cbfa00d-3790-46de-8825-6d541c519b1e',
  '29839634-7813-48b9-bfc1-235465dd8f05'],
 '375558d5-28da-42c9-b5ba-abfa09a40e3b': ['b13a547c-7463-44b3-b15d-1c60a6e8ae02'],
 'ba322d25-8737-4dea-ba40-080785cdd637': ['447af735-1c24-4bec-ae94-4b2f21156143',
  '6cbfa00d-3790-46de-8825-6d541c519b1e',
  '34ff745b-2f65-4947-8975-3b0340b255fa'],
 '7e0964e1-49d9-4a4e-afb6-1f050c1388f5': ['b59d5231-9923-43d7-990c-9b9eb2005375',
  'b6119300-09d2-498f-8d65-a15a0ff27eab'],
 'dc0950b0-e95f-409b-b3bc-a135e2411ca5': ['447af735-1c24-4bec-ae94-4b2f21156143'],
 

## Leftovers

In [175]:
qc_meta.questions['9e05aa95-bce9-4c8d-a8ec-41a349d407aa']

Document(page_content='Hvordan kan dagpengemodtagere selvbooke jobsamtaler i arbejdsløshedskassen?', metadata={'title': 'Vejledning om obligatorisk selvbooking af jobsamtaler for forskellige målgrupper'})

In [176]:
qc_meta.contexts['29839634-7813-48b9-bfc1-235465dd8f05']

Document(page_content='Indledning\nDenne vejledning omfatter pligt til selvbooking for visse målgrupper i lov om en aktiv beskæftigelsesindsats, jf. lovbekendtgørelse nr. 701 af 22. maj 2022. Pligt til selvbooking for sygedagpengemodtagere er ikke omfattet af denne vejledning.\nPligt til selvbooking gælder for dagpengemodtagere, kontanthjælps- og uddannelseshjælpsmodtagere, overgangsydelsesmodtagere, som ikke er omfattet af introduktionsprogrammet efter integrationsloven, personer i jobafklaringsforløb, personer i ressourceforløb, fleksjobvisiterede, som modtager ledighedsydelse, og personer i revalideringsforløb.\nPligt til selvbooking omfatter jobsamtaler efter kapitel 7 i lov om en aktiv beskæftigelsesindsats. Dagpengemodtagere har derudover pligt til at selvbooke rådighedssamtaler i arbejdsløshedskassen, hvis arbejdsløshedskassen har fastsat, at der skal ske selvbooking af en rådighedssamtale, jf. rådighedsbekendtgørelsens § 10 (bekendtgørelse nr. 1210 af 28. september 2023).', met

In [150]:
qc_meta.contexts['6cbfa00d-3790-46de-8825-6d541c519b1e']

Document(page_content='Fra 1. januar 2024 får arbejdsløshedskasserne ansvaret for kontaktforløbet for dagpengemodtagere i de første 3 måneder, mens jobcenteret har ansvaret for dagpengemodtageres kontaktforløb efter de første 3 måneder. Vejledningen tager afsæt i denne arbejdsdeling, hvor dagpengemodtagere i målgruppen for uddannelsespålæg, jf. § 27, stk. 3, i lov om en aktiv beskæftigelsesindsats, imidlertid visiteres til et kontaktforløb i jobcenteret senest 2 uger efter personens tilmelding som jobsøgende. Jobcenteret har således ansvaret for kontaktforløbet for dagpengemodtagere i målgruppen for uddannelsespålæg. På den baggrund er dagpengemodtagere i målgruppe for uddannelsespålæg omfattet af pligten til selvbooking af jobsamtaler med jobcenteret.\nFra 1. januar 2024 kan fælles jobsamtaler med dagpengemodtagere selvbookes i det omfang den enkelte kommune understøtter dette – og den enkelte kommune og arbejdsløshedskasser har aftalt timeslots, hvor de fælles jobsamtaler kan afholde

In [56]:
qc_meta.question_context_id_pairs

{'235d1d09-2db0-47b0-9812-efef4d39b618': ['ec1a738b-1435-4668-ac4b-1e179f2e2b92'],
 '65e34c14-bf1b-4594-9ea7-21c8a67e5a86': ['cecc6833-2e9d-4275-821f-0f6d2103cf56'],
 '3fa3d7dc-9349-44fd-b4fb-cc7574f0447b': ['29839634-7813-48b9-bfc1-235465dd8f05'],
 'd1ba2c32-f0a0-46b8-a307-eda8255e9afb': ['6cbfa00d-3790-46de-8825-6d541c519b1e'],
 '9e05aa95-bce9-4c8d-a8ec-41a349d407aa': ['b316eda5-1236-44ce-a3bd-108b8f861731'],
 '375558d5-28da-42c9-b5ba-abfa09a40e3b': ['b13a547c-7463-44b3-b15d-1c60a6e8ae02'],
 'ba322d25-8737-4dea-ba40-080785cdd637': ['34ff745b-2f65-4947-8975-3b0340b255fa'],
 '7e0964e1-49d9-4a4e-afb6-1f050c1388f5': ['b59d5231-9923-43d7-990c-9b9eb2005375'],
 'dc0950b0-e95f-409b-b3bc-a135e2411ca5': ['447af735-1c24-4bec-ae94-4b2f21156143'],
 'c92d3b46-6ee6-452e-bbee-4da8f22c070d': ['b6119300-09d2-498f-8d65-a15a0ff27eab']}

In [104]:
qc_meta.questions

{'235d1d09-2db0-47b0-9812-efef4d39b618': Document(page_content='Hvem skal regulere løbende erstatninger tilkendt før 1. januar 2024?', metadata={'title': 'Vejledning om regulering af satser fra 1. januar 2024 efter lov om arbejdsskadesikring, lov om sikring mod følger af arbejdsskade, lov om arbejdsskadeforsikring og lov om forsikring mod følger af ulykkestilfælde'}),
 '65e34c14-bf1b-4594-9ea7-21c8a67e5a86': Document(page_content='Hvordan beregnes grundlønnen for løbende erstatninger ifølge Arbejdstilsynets bilag fra den 5. januar 2024?', metadata={'title': 'Vejledning om regulering af satser fra 1. januar 2024 efter lov om arbejdsskadesikring, lov om sikring mod følger af arbejdsskade, lov om arbejdsskadeforsikring og lov om forsikring mod følger af ulykkestilfælde'}),
 '3fa3d7dc-9349-44fd-b4fb-cc7574f0447b': Document(page_content='Hvilke målgrupper er omfattet af pligten til selvbooking ifølge loven om en aktiv beskæftigelsesindsats fra 22. maj 2022?', metadata={'title': 'Vejledning 

In [42]:
questions_list = [doc.page_content for doc in list(qc_meta.questions.values())]

In [43]:
questions_list[0]

'Hvem skal regulere løbende erstatninger tilkendt før 1. januar 2024?'

In [95]:
questions_list = [doc.page_content for doc in list(qc_meta.questions.values())]

db_search = db_collection.query(
    query_texts = questions_list,
    where={"type": "context"},
    n_results=5
)

[[0.097584068775177,
  0.15659433603286743,
  0.17552393674850464,
  0.18490701913833618,
  0.20026171207427979],
 [0.09000265598297119,
  0.11974334716796875,
  0.1689140796661377,
  0.1734117865562439,
  0.17344695329666138],
 [0.0862841010093689,
  0.12259435653686523,
  0.1447177529335022,
  0.14601296186447144,
  0.18048101663589478],
 [0.11279726028442383,
  0.15674149990081787,
  0.1585928201675415,
  0.1638789176940918,
  0.16871750354766846],
 [0.0791313648223877,
  0.09400498867034912,
  0.10000258684158325,
  0.1135631799697876,
  0.1328728199005127],
 [0.0892874002456665,
  0.1413293480873108,
  0.14738941192626953,
  0.1564478874206543,
  0.16016292572021484],
 [0.10472530126571655,
  0.13769900798797607,
  0.1410079002380371,
  0.14229196310043335,
  0.16628289222717285],
 [0.1070985198020935,
  0.13767129182815552,
  0.1634054183959961,
  0.18144559860229492,
  0.1822643280029297],
 [0.06129348278045654,
  0.11234098672866821,
  0.11622011661529541,
  0.12383377552032471

In [103]:
#Make a list comprehension that calculates 1 - dist for each element in the nested lists of db_search['distances'] 
similarity = [[1 - dist for dist in dist_list] for dist_list in db_search['distances']]
similarity


[[0.902415931224823,
  0.8434056639671326,
  0.8244760632514954,
  0.8150929808616638,
  0.7997382879257202],
 [0.9099973440170288,
  0.8802566528320312,
  0.8310859203338623,
  0.8265882134437561,
  0.8265530467033386],
 [0.9137158989906311,
  0.8774056434631348,
  0.8552822470664978,
  0.8539870381355286,
  0.8195189833641052],
 [0.8872027397155762,
  0.8432585000991821,
  0.8414071798324585,
  0.8361210823059082,
  0.8312824964523315],
 [0.9208686351776123,
  0.9059950113296509,
  0.8999974131584167,
  0.8864368200302124,
  0.8671271800994873],
 [0.9107125997543335,
  0.8586706519126892,
  0.8526105880737305,
  0.8435521125793457,
  0.8398370742797852],
 [0.8952746987342834,
  0.8623009920120239,
  0.8589920997619629,
  0.8577080368995667,
  0.8337171077728271],
 [0.8929014801979065,
  0.8623287081718445,
  0.8365945816040039,
  0.8185544013977051,
  0.8177356719970703],
 [0.9387065172195435,
  0.8876590132713318,
  0.8837798833847046,
  0.8761662244796753,
  0.862031102180481],
 [0

In [82]:
def compare_search_with_ground_truth(search_results: List[List[str]], question_context_id_pairs: dict) -> dict:
    # Initialize a dictionary to store the comparison results
    comparison_results = {}

    # Iterate over each question ID and its corresponding ground truth context ID(s)
    for idx, (question_id, ground_truth_context_ids) in enumerate(question_context_id_pairs.items()):
        # Assuming the order of search_results matches the order of question_context_id_pairs
        # Fetch the search result list for the current question
        current_search_ids = search_results[idx]

        # Initialize an empty list to store context IDs ranked higher than the ground truth
        higher_ranked_context_ids = []

        for ground_truth_id in ground_truth_context_ids:
            # Check if the ground truth context ID is in the current list of search IDs
            if ground_truth_id in current_search_ids:
                # Find the index of the ground truth context ID in the search results
                print(f'Match found for question {question_id}!')
                ground_truth_index = current_search_ids.index(ground_truth_id)

                # Add all context IDs ranked higher than (including) the ground truth context ID
                higher_ranked_context_ids = current_search_ids[:ground_truth_index + 1]
                break  # Assuming one ground truth context ID per question, we can break after finding it

        # Update the comparison results dictionary
        comparison_results[question_id] = higher_ranked_context_ids

    return comparison_results


In [111]:
def filter_potential_context_answers(chroma_db_collection, question_context_object):
    # Initialize a dictionary to store the filtered query results
    query_filtered = {}

    # Iterate over each question ID and its corresponding Document object
    for q_id, q_document in question_context_object.questions.items():
        # Perform a query using the text of the current question
        query_result = chroma_db_collection.query(
            query_texts=[q_document.page_content],
            where={"type": "context"},
            n_results=5
        )

        # Assuming query_result returns a list of lists of IDs, take the first list as we performed a single query
        query_id_list = query_result['ids'][0]  # Adjust this line based on the actual structure of query_result

        # Initialize a list to store context IDs ranked higher than the ground truth
        higher_ranked_context_ids = []

        # Iterate over ground truth context IDs for the current question
        for ground_truth_id in question_context_object.question_context_id_pairs[q_id]:
            # Check if the ground truth context ID is in the list of query IDs
            if ground_truth_id in query_id_list:
                # Find the index of the ground truth context ID in the query results
                idx = query_id_list.index(ground_truth_id)

                # Add all context IDs ranked higher than or equal to the ground truth context ID
                higher_ranked_context_ids = query_id_list[:idx + 1]
                break  # Assuming one ground truth context ID per question, we can break after finding it

        # Update the filtered query results dictionary
        query_filtered[q_id] = higher_ranked_context_ids

    return query_filtered

In [112]:
q_filtered = filter_potential_context_answers(db_collection, qc_meta)

In [113]:
def filter_potential_context_answers_batch(chroma_db_collection, question_context_object):
    # Initialize a dictionary to store the filtered query results
    query_filtered = {}

    # Extract all questions into a list
    questions_list = [doc.page_content for doc in question_context_object.questions.values()]

    # Perform a batch query using the list of question texts
    batch_query_result = chroma_db_collection.query(
        query_texts=questions_list,
        where={"type": "context"},
        n_results=5
    )

    # Iterate over the questions and their corresponding results
    for idx, (q_id, q_document) in enumerate(question_context_object.questions.items()):
        # Extract the list of IDs for the current question's results
        query_id_list = batch_query_result['ids'][idx]

        # Initialize a list to store context IDs ranked higher than the ground truth
        higher_ranked_context_ids = []

        # Iterate over ground truth context IDs for the current question
        for ground_truth_id in question_context_object.question_context_id_pairs[q_id]:
            # Check if the ground truth context ID is in the list of query IDs
            if ground_truth_id in query_id_list:
                # Find the index of the ground truth context ID in the query results
                idx = query_id_list.index(ground_truth_id)

                # Add all context IDs ranked higher than or equal to the ground truth context ID
                higher_ranked_context_ids = query_id_list[:idx + 1]
                break  # Assuming one ground truth context ID per question, we can break after finding it

        # Update the filtered query results dictionary
        query_filtered[q_id] = higher_ranked_context_ids

    return query_filtered

In [114]:
q_filtered = filter_potential_context_answers_batch(db_collection, qc_meta)

In [130]:
def filter_potential_context_answers_batch(chroma_db_collection, question_context_object, dist_threshold: float=0):
    # Initialize a dictionary to store the filtered query results
    query_filtered = {}

    # Extract all questions into a list
    questions_list = [doc.page_content for doc in question_context_object.questions.values()]

    # Perform a batch query using the list of question texts
    batch_query_result = chroma_db_collection.query(
        query_texts=questions_list,
        where={"type": "context"},
        n_results=5
    )

    for idx, (q_id, q_document) in enumerate(question_context_object.questions.items()):
        query_id_list = batch_query_result['ids'][idx]
        query_distances_list = batch_query_result['distances'][idx] if 'distances' in batch_query_result else []

        ground_truth_id = question_context_object.question_context_id_pairs[q_id][0]  # Assuming one ground truth per question

        if ground_truth_id in query_id_list:
            gt_idx = query_id_list.index(ground_truth_id)
            ground_truth_distance = query_distances_list[gt_idx]

            # Include all IDs up to and including the ground truth's position
            context_ids = query_id_list[:gt_idx+1]

            # Additionally, include IDs beyond the ground truth's position if they are within the distance threshold
            for id_, distance in zip(query_id_list[gt_idx+1:], query_distances_list[gt_idx+1:]):
                if abs(distance - ground_truth_distance) <= dist_threshold:
                    context_ids.append(id_)
        else:
            # If the ground truth is not in the top K, return the entire list
            context_ids = query_id_list

        # Update the filtered query results dictionary
        query_filtered[q_id] = context_ids

    return query_filtered

In [133]:
q_filtered = filter_potential_context_answers_batch(db_collection, qc_meta, dist_threshold=0.05)

In [134]:
q_filtered

{'235d1d09-2db0-47b0-9812-efef4d39b618': ['ec1a738b-1435-4668-ac4b-1e179f2e2b92'],
 '65e34c14-bf1b-4594-9ea7-21c8a67e5a86': ['cecc6833-2e9d-4275-821f-0f6d2103cf56',
  'ec1a738b-1435-4668-ac4b-1e179f2e2b92'],
 '3fa3d7dc-9349-44fd-b4fb-cc7574f0447b': ['29839634-7813-48b9-bfc1-235465dd8f05',
  '6cbfa00d-3790-46de-8825-6d541c519b1e'],
 'd1ba2c32-f0a0-46b8-a307-eda8255e9afb': ['6cbfa00d-3790-46de-8825-6d541c519b1e',
  'b6119300-09d2-498f-8d65-a15a0ff27eab',
  '29839634-7813-48b9-bfc1-235465dd8f05'],
 '9e05aa95-bce9-4c8d-a8ec-41a349d407aa': ['b316eda5-1236-44ce-a3bd-108b8f861731',
  '6cbfa00d-3790-46de-8825-6d541c519b1e',
  '29839634-7813-48b9-bfc1-235465dd8f05',
  '447af735-1c24-4bec-ae94-4b2f21156143'],
 '375558d5-28da-42c9-b5ba-abfa09a40e3b': ['b13a547c-7463-44b3-b15d-1c60a6e8ae02'],
 'ba322d25-8737-4dea-ba40-080785cdd637': ['34ff745b-2f65-4947-8975-3b0340b255fa',
  '6cbfa00d-3790-46de-8825-6d541c519b1e',
  'b6119300-09d2-498f-8d65-a15a0ff27eab',
  '447af735-1c24-4bec-ae94-4b2f21156143'],

In [85]:
comp_results = compare_search_with_ground_truth(db_search['ids'], qc_meta.question_context_id_pairs)

Match found for question 235d1d09-2db0-47b0-9812-efef4d39b618!
Match found for question 65e34c14-bf1b-4594-9ea7-21c8a67e5a86!
Match found for question 3fa3d7dc-9349-44fd-b4fb-cc7574f0447b!
Match found for question d1ba2c32-f0a0-46b8-a307-eda8255e9afb!
Match found for question 9e05aa95-bce9-4c8d-a8ec-41a349d407aa!
Match found for question 375558d5-28da-42c9-b5ba-abfa09a40e3b!
Match found for question ba322d25-8737-4dea-ba40-080785cdd637!
Match found for question 7e0964e1-49d9-4a4e-afb6-1f050c1388f5!
Match found for question dc0950b0-e95f-409b-b3bc-a135e2411ca5!
Match found for question c92d3b46-6ee6-452e-bbee-4da8f22c070d!


In [86]:
comp_results

{'235d1d09-2db0-47b0-9812-efef4d39b618': ['ec1a738b-1435-4668-ac4b-1e179f2e2b92'],
 '65e34c14-bf1b-4594-9ea7-21c8a67e5a86': ['cecc6833-2e9d-4275-821f-0f6d2103cf56'],
 '3fa3d7dc-9349-44fd-b4fb-cc7574f0447b': ['29839634-7813-48b9-bfc1-235465dd8f05'],
 'd1ba2c32-f0a0-46b8-a307-eda8255e9afb': ['6cbfa00d-3790-46de-8825-6d541c519b1e'],
 '9e05aa95-bce9-4c8d-a8ec-41a349d407aa': ['b316eda5-1236-44ce-a3bd-108b8f861731'],
 '375558d5-28da-42c9-b5ba-abfa09a40e3b': ['b13a547c-7463-44b3-b15d-1c60a6e8ae02'],
 'ba322d25-8737-4dea-ba40-080785cdd637': ['34ff745b-2f65-4947-8975-3b0340b255fa'],
 '7e0964e1-49d9-4a4e-afb6-1f050c1388f5': ['b59d5231-9923-43d7-990c-9b9eb2005375'],
 'dc0950b0-e95f-409b-b3bc-a135e2411ca5': ['447af735-1c24-4bec-ae94-4b2f21156143'],
 'c92d3b46-6ee6-452e-bbee-4da8f22c070d': ['b6119300-09d2-498f-8d65-a15a0ff27eab']}

In [None]:
#Function that takes the db_search result and 

In [33]:
qc_meta.questions

{'235d1d09-2db0-47b0-9812-efef4d39b618': Document(page_content='Hvem skal regulere løbende erstatninger tilkendt før 1. januar 2024?', metadata={'title': 'Vejledning om regulering af satser fra 1. januar 2024 efter lov om arbejdsskadesikring, lov om sikring mod følger af arbejdsskade, lov om arbejdsskadeforsikring og lov om forsikring mod følger af ulykkestilfælde'}),
 '65e34c14-bf1b-4594-9ea7-21c8a67e5a86': Document(page_content='Hvordan beregnes grundlønnen for løbende erstatninger ifølge Arbejdstilsynets bilag fra den 5. januar 2024?', metadata={'title': 'Vejledning om regulering af satser fra 1. januar 2024 efter lov om arbejdsskadesikring, lov om sikring mod følger af arbejdsskade, lov om arbejdsskadeforsikring og lov om forsikring mod følger af ulykkestilfælde'}),
 '3fa3d7dc-9349-44fd-b4fb-cc7574f0447b': Document(page_content='Hvilke målgrupper er omfattet af pligten til selvbooking ifølge loven om en aktiv beskæftigelsesindsats fra 22. maj 2022?', metadata={'title': 'Vejledning 