### Setting up Env

In [2]:
import os
from dotenv import load_dotenv
import pandas as pd
from datasets import Dataset
from datasets import load_dataset

# Load variables from a local .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)

  from .autonotebook import tqdm as notebook_tqdm


True

### Downloading dataset

In [3]:
# Load the "retsinformation" configuration (train split) of the benchmark from the Hugging Face Hub
ds_vejledninger = load_dataset(
    "jealk/dk_retrieval_benchmark",
    "retsinformation",
    split="train",
    # Uncomment to ignore the local cache and download the dataset again:
    #download_mode="force_redownload",
)

In [4]:
# Convert the Hugging Face dataset to a pandas DataFrame and preview the first rows
df_vejledninger = ds_vejledninger.to_pandas()
df_vejledninger.head()

Unnamed: 0,url,title,html_content,text_content
0,https://www.retsinformation.dk/eli/retsinfo/20...,Vejledning om regulering af satser fra 1. janu...,"<div class=""document-content"" id=""restylingRoo...",Vejledning om regulering af satser fra 1. janu...
1,https://www.retsinformation.dk/eli/retsinfo/20...,Vejledning om satser i 2024 for betaling af ud...,"<div class=""document-content"" id=""restylingRoo...",Vejledning om satser i 2024 for betaling af ud...
2,https://www.retsinformation.dk/eli/retsinfo/20...,Vejledning om obligatorisk selvbooking af jobs...,"<div class=""document-content"" id=""restylingRoo...",Vejledning om obligatorisk selvbooking af jobs...
3,https://www.retsinformation.dk/eli/retsinfo/20...,Vejledning til bekendtgørelse om tilskud til s...,"<div class=""document-content"" id=""restylingRoo...",Vejledning til bekendtgørelse om tilskud til s...
4,https://www.retsinformation.dk/eli/retsinfo/20...,Vejledning om fleksløntilskud m.v.,"<div class=""document-content"" id=""restylingRoo...",Vejledning om fleksløntilskud m.v.\n1.Indledni...


### Chunking text data

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

# NOTE(review): chunk lengths are measured with the "intfloat/e5-base-v2" tokenizer, while the
# Chroma collection later in this notebook embeds with 'intfloat/multilingual-e5-base'.
# Confirm that this mismatch is intended.
tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-base-v2")


def token_length_function(text_input):
    """Return the number of tokens in ``text_input``, not counting special tokens."""
    token_ids = tokenizer.encode(text_input, add_special_tokens=False)
    return len(token_ids)


# chunk_size is expressed in tokens (via token_length_function), not characters.
# Separators are tried in order: paragraph, line, then sentence boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", "? ", "! "],
    length_function=token_length_function,
    chunk_size=512,
    chunk_overlap=0,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (1350 > 512). Running this sequence through the model will result in indexing errors


In [6]:
# NOTE: LangChain's text splitter is very slow here compared to LlamaIndex
# (2+ minutes on CPU); the cause has not been investigated.
chunk_source_texts = list(df_vejledninger["text_content"])
chunk_source_metadatas = [{"title": title} for title in df_vejledninger["title"]]
split_documents = text_splitter.create_documents(chunk_source_texts, metadatas=chunk_source_metadatas)

### Filtering contexts

#### Filtering using TextDescriptives

In [7]:
import textdescriptives as td
import spacy
from typing import List, Dict, Optional
import os

# TODO: optionally accept metadata (a list of dicts) alongside the texts
def filter_text_by_td(text_list: List[str], filter_type: bool=True) -> List[str]:
    """Filter texts by the TextDescriptives quality check.

    Args:
        text_list: The text strings to evaluate.
        filter_type: Which outcome to keep. True returns the texts that passed the
            quality check, False returns the texts that failed it.

    Returns:
        The texts whose quality-check result equals ``filter_type``, in input order.
    """
    # Nothing to evaluate: skip building the spaCy pipeline altogether.
    if not text_list:
        return []

    # Blank Danish pipeline; the sentencizer is added before the quality component.
    nlp = spacy.blank("da")
    nlp.add_pipe("sentencizer")
    nlp.add_pipe("textdescriptives/quality")

    return [
        doc.text
        for doc in nlp.pipe(text_list)
        if doc._.passed_quality_check == filter_type
    ]

In [8]:
# Quality-check only a sample: the first 300 chunks.
docs_sample_td = split_documents[0:300]
texts_passed_td = filter_text_by_td([doc.page_content for doc in docs_sample_td])

# Select from the evaluated sample only, so chunks elsewhere in the corpus that merely share
# text with a passed chunk are not pulled in. A set keeps each membership test O(1).
texts_passed_td_lookup = set(texts_passed_td)
docs_passed_td = [doc for doc in docs_sample_td if doc.page_content in texts_passed_td_lookup]

#### Filtering using LLM

In [9]:
import json
import logging
from typing import Dict, Any
from tqdm import tqdm  # Import tqdm

from openai import OpenAI
client = OpenAI()

def q_eval_system_prompt():
    """Return the (Danish) system prompt that defines the 0/1 suitability scoring criteria."""
    prompt_lines = (
        "Din opgave er at evaluere et givet tekstuddrag for at bestemme, om det er egnet til at danne grundlag for et generelt spørgsmål, der er relevant for eksempelvis en eksamen eller en test. ",
        "    For at vurdere dette, skal du fokusere på følgende tre nøglekriterier:",
        "",
        "    1. Klarhed: Vurder, om teksten er formuleret klart og direkte, således at et spørgsmål til denne tekst, vil kunne besvares uden yderligere forklaringer. Teksten skal være læsbar og ikke usammenhængende i sin struktur.",
        "    ",
        "    2. Konkret Information: Afgør, om uddraget indeholder specifikke, faktuelle informationer, der kan danne grundlag for et præcist og direkte spørgsmål. Teksten skal præsentere håndgribelige fakta eller data, som et spørgsmål kan baseres på.",
        "",
        "    3. Kontekstuel Helhed: Bedøm, om teksten leverer tilstrækkelig kontekst for at et spørgsmål baseret på uddraget vil være meningsfuldt og forståeligt uden behov for yderligere information. Teksten skal være selvstændig og give en fuld forståelse af det emne, der behandles.",
        "",
        "    Baseret på din evaluering:",
        "",
        "    - Tildel scoren 1, hvis tekstuddraget opfylder alle tre kriterier, og der kan formuleres et naturligt, klart og kontekstuelt meningsfuldt spørgsmål baseret på teksten.",
        "",
        "    - Tildel scoren 0, hvis tekstuddraget ikke opfylder et eller flere af de ovenstående kriterier, hvilket gør det uegnet til at danne grundlag for et generelt spørgsmål.",
        "    ",
    )
    return "\n".join(prompt_lines)

def q_eval_user_prompt(text: str) -> str:
    """Build the (Danish) user prompt asking for a JSON suitability verdict on ``text``."""
    prompt_lines = [
        "Du er en erfaren sagsbehandler. ",
        "    Din Opgave:",
        "    Vurder det følgende tekstuddrag og angiv, om det er egnet til at stille et generelt spørgsmål til.",
        "",
        "    Uddrag:",
        # The chunk is inserted verbatim; braces inside it are not interpreted.
        f"    {text}",
        "    ",
        "    Returner din vurdering i følgende JSON-format:",
        "",
        "    {",
        '    "llm_score": [indsæt enten 0 eller 1 her]',
        "    }",
        "    ",
    ]
    return "\n".join(prompt_lines)


def json_api_call(system_prompt: str, user_prompt: str, oai_model: str="gpt-3.5-turbo-0125") -> Dict[str, Any]:
    """Send a system + user prompt to the chat completions API in JSON mode.

    Returns the decoded JSON object from the first choice. Any failure (API
    error or invalid JSON) is logged and an empty dict is returned instead.
    """
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    try:
        completion = client.chat.completions.create(
            model=oai_model,
            temperature=0,
            messages=chat_messages,
            response_format={"type": "json_object"},
        )
        raw_reply = completion.choices[0].message.content
        return json.loads(raw_reply)
    except json.JSONDecodeError as e:
        logging.error(f'JSON parsing failed: {e}')
    except Exception as e:
        logging.error(f'API call failed: {e}')
    # Reached only after one of the handlers above has logged the failure.
    return {}

def filter_text_by_llm(text_list: List[str]) -> List[str]:
    """Filter text chunks by an LLM quality check.

    Each text is scored by the model through ``json_api_call``. A text is kept
    only when the returned ``llm_score`` is 1.

    Args:
        text_list: The text strings to evaluate.

    Returns:
        The texts that received a score of 1, in input order. Texts whose
        evaluation failed or returned no usable score are logged and left out.
    """
    texts_passed_llm = []
    system_prompt = q_eval_system_prompt()
    for text in tqdm(text_list, desc="Evaluating texts"):
        response = json_api_call(system_prompt, q_eval_user_prompt(text))
        if not response:
            logging.error(f'Failed to evaluate below text due to an earlier error. \n {text}')
            continue

        # Use .get so one malformed reply cannot raise KeyError and discard
        # every result collected so far.
        score = response.get('llm_score') if isinstance(response, dict) else None

        # The prompt shows the score inside square brackets, so the model may
        # answer with [1] or "1" instead of a bare 1. Normalise those shapes.
        if isinstance(score, list) and len(score) == 1:
            score = score[0]
        if isinstance(score, str):
            score = score.strip()

        if score is None:
            logging.error(f'LLM response contained no usable llm_score for below text. \n {text}')
            continue

        if score == 1 or score == "1":
            texts_passed_llm.append(text)
    return texts_passed_llm

In [10]:
# Send only a small sample to the LLM check: the first 50 chunks that passed TextDescriptives.
docs_sample_llm = docs_passed_td[:50]
texts_passed_llm = filter_text_by_llm([doc.page_content for doc in docs_sample_llm])

# Select from the evaluated sample only, so chunks the LLM never scored are not pulled in
# just because they share text with a passed chunk. A set keeps each membership test O(1).
texts_passed_llm_lookup = set(texts_passed_llm)
docs_passed_llm = [doc for doc in docs_sample_llm if doc.page_content in texts_passed_llm_lookup]

Evaluating texts: 100%|██████████| 50/50 [00:47<00:00,  1.05it/s]


### Generating Questions

In [11]:
def generate_question_template(text: str, num_q: int=1) -> str:
    """Build the (Danish) prompt asking for exactly ``num_q`` questions about ``text``."""
    divider = "    ---------------------"
    prompt_lines = [
        "Nedenfor er et uddrag (kontekst) fra en længere tekst:",
        divider,
        # The context is inserted verbatim; braces inside it are not interpreted.
        f"    {text}",
        divider,
        f"    Givet ovenstående uddrag og ingen forudgående viden, er din opgave at generere præcis {num_q} spørgsmål til teksten.",
        "    En sætning skal kun indeholde 1 spørgsmål, og spørgsmålet skal være formuleret kort og præcist. ",
        "    Svaret til spørgsmålet, skal kunne findes i ovenstående uddrag.",
        "    Spørgsmålet skal indeholde specifik kontekst, således at spørgsmålet efterfølgende kan besvares entydigt og uden kendskab til uddraget. ",
        "    Spørgsmålene skal stilles i et sprog som en borger uden juridisk ekspertise kan forstå.",
        "",
        "    Eksempel på et spørgsmål der ikke har en specifik kontekst, og som fejlagtigt indeholder 2 spørgsmål i 1 sætning: ",
        '    "Hvilket dokument har den nye vejledning erstattet, og hvornår blev den udsendt?" -Da det ikke angivet hvilket dokument der er tale om, og derfor er svaret til spørgsmålet ikke entyidgt, uden kendskab til uddraget. Sætningen indeholder desuden 2 spørgsmål i samme sætning. ',
        "",
        "    Eksempel på et godt spørgsmål, som kan besvares entydigt uden kendskab til uddraget:",
        '    "Hvilke to indbetalinger udgør det samlede medlemsbidrag til en a-kasse?" - Da det er klart hvad der spørges om, og der kun er 1 rigtigt svar i den givne lovtekst.',
        "    ",
    ]
    return "\n".join(prompt_lines)

In [12]:
def question_api_call(user_prompt: str, oai_model: str="gpt-4-0125-preview") -> Dict[str, Any]:
    """Ask the chat completions API (JSON mode) to generate questions for a prompt.

    Args:
        user_prompt: The fully rendered question-generation prompt.
        oai_model: Name of the OpenAI chat model to use.

    Returns:
        The decoded JSON object from the first choice. On any failure (API
        error or invalid JSON) the error is logged and the sentinel
        ``{'Q': 'API error'}`` is returned. Note that this sentinel value is a
        string, not a list of questions.
    """
    # This is a plain string (never passed through str.format), so the JSON
    # example uses single braces and shows the exact shape expected back.
    system_prompt = (
        'Din opgave er at stille præcise spørgsmål til et givet tekstuddrag og returnere '
        'en JSON med en liste af spørgsmål i formatet {"Q": ["spørgsmål1", "spørgsmål2", ...]}.'
    )
    try:
        completion = client.chat.completions.create(
            model=oai_model,
            temperature=0,
            messages=[
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": user_prompt
                },
            ],
            response_format={"type": "json_object"}
        )
        return json.loads(completion.choices[0].message.content)
    except json.JSONDecodeError as e:
        logging.error(f'JSON parsing failed: {e}')
    except Exception as e:
        logging.error(f'API call failed: {e}')
    return {'Q': 'API error'}

In [13]:
from typing import Dict, List, Tuple
import uuid
from tqdm import tqdm
from langchain_core.documents import Document  # Import statement assumed; adjust based on actual import path

class QuestionContextManager:
    """
    Manages a collection of questions and their associated context chunks as Document objects.
    Allows for adding questions with contexts and displaying a specified number of these question-context pairs.

    Attributes:
    - questions: question ID -> question Document.
    - contexts: context ID -> context Document. A context object shared by several questions is stored once.
    - question_context_id_pairs: question ID -> list of the IDs of its contexts.
    """

    def __init__(self):
        self.questions: Dict[str, Document] = {}
        self.contexts: Dict[str, Document] = {}
        self.question_context_id_pairs: Dict[str, List[str]] = {}
        # id(context object) -> context ID, so a context object that is added again reuses its ID.
        # Entries stay valid because self.contexts keeps the object alive.
        self._context_ids_by_object: Dict[int, str] = {}

    def _context_id_for(self, context: Document) -> str:
        """Return the ID under which ``context`` is stored, registering it first if it is new."""
        known_id = self._context_ids_by_object.get(id(context))
        # Double-check against self.contexts in case that dict was edited directly.
        if known_id is not None and self.contexts.get(known_id) is context:
            return known_id
        new_id = str(uuid.uuid4())
        self.contexts[new_id] = context
        self._context_ids_by_object[id(context)] = new_id
        return new_id

    def add_question_context(self, question: Document, context: Document):
        """
        Adds a question and its associated context (both as Document objects) to the manager.
        Generates a unique ID for the question. The context gets a new unique ID unless the very same
        context object has been added before, in which case its existing ID is reused so that the
        context is stored only once.

        Parameters:
        - question (Document): The Document object containing the question.
        - context (Document): The Document object containing the context.
        """
        unique_question_id = str(uuid.uuid4())
        unique_context_id = self._context_id_for(context)
        self.questions[unique_question_id] = question
        self.question_context_id_pairs[unique_question_id] = [unique_context_id]

    def get_question_documents(self) -> Dict[str, Document]:
        """Returns a shallow copy of the mapping {question ID: question Document}."""
        return dict(self.questions)

    def get_context_documents(self) -> Dict[str, Document]:
        """Returns a shallow copy of the mapping {context ID: context Document}."""
        return dict(self.contexts)

    @property
    def question_context_pairs(self) -> List[Tuple[Document, List[Document]]]:
        """
        Returns a list of tuples, each containing a question Document and a list of its associated context Documents.
        """
        return [(self.questions[qid], [self.contexts[cid] for cid in self.question_context_id_pairs[qid]]) for qid in self.questions]

    def display_question_context_pairs(self, num_pairs: Optional[int] = None):
        """
        Displays a specified number of question-context pairs. If no number is specified, all pairs are displayed.

        Parameters:
        - num_pairs (int, optional): The number of question-context pairs to display. If None, all pairs are displayed. Defaults to None.
        """
        displayed_pairs = 0
        for q_id, context_ids in self.question_context_id_pairs.items():
            if num_pairs is not None and displayed_pairs >= num_pairs:
                break

            question = self.questions[q_id]
            print(f"Question: {question.page_content}")
            for c_id in context_ids:
                context = self.contexts[c_id]
                print(f"\nContext: {context.page_content}")
            print("-" * 40)  # Separator for readability
            displayed_pairs += 1

    def filter_questions_by_length(self, min_length: int = 20, max_length: int = 150):
        """
        Filters out questions that do not fall within the specified minimum and maximum character length.
        Updates the object in place by removing those questions, and then every context that is no
        longer linked to any remaining question.

        Parameters:
        - min_length (int): The minimum character length for questions to be kept. Default to 20.
        - max_length (int): The maximum character length for questions to be kept. Default to 150.
        """
        questions_to_remove = [q_id for q_id, question in self.questions.items()
                               if not (min_length <= len(question.page_content) <= max_length)]

        # Remove the questions and question_context pairs
        for q_id in questions_to_remove:
            del self.questions[q_id]
            del self.question_context_id_pairs[q_id]

        # Collect every context ID still referenced, then drop the rest in a single pass
        linked_context_ids = {c_id for c_ids in self.question_context_id_pairs.values() for c_id in c_ids}
        orphaned_context_ids = [c_id for c_id in self.contexts if c_id not in linked_context_ids]
        for c_id in orphaned_context_ids:
            orphan = self.contexts.pop(c_id)
            self._context_ids_by_object.pop(id(orphan), None)

        print(f"Removed {len(questions_to_remove)} questions.")

    def __repr__(self):
        return f"<QuestionContextManager with {len(self.questions)} questions>"

In [14]:
def generate_questions(textContexts: List[Document], num_questions: int = 1, oai_model: str = "gpt-4-0125-preview", duplicate_metadata: bool = True) -> QuestionContextManager:
    """
    Generates questions from a list of context Documents and returns a QuestionContextManager
    containing the generated questions and their contexts.

    Contexts for which the API call failed, or whose response holds no usable list of
    questions, are skipped instead of producing bogus entries.

    Parameters:
    - textContexts (List[Document]): A list of Document objects to generate questions from.
    - num_questions (int): Number of questions to generate per context. Default is 1.
    - oai_model (str): The model to use for generating questions. Default is "gpt-4-0125-preview".
    - duplicate_metadata (bool): If True, copy the metadata from the context to the generated questions.

    Returns:
    QuestionContextManager: An object containing the generated questions and their contexts.
    """
    result = QuestionContextManager()
    for context in tqdm(textContexts):
        question_prompt = generate_question_template(context.page_content, num_questions)
        response = question_api_call(question_prompt, oai_model)

        questions = response.get('Q') if isinstance(response, dict) else None
        if questions is None:
            print("Error parsing json response: missing key 'Q'")
            continue

        if isinstance(questions, str):
            # question_api_call signals failure with the string 'API error'. Iterating over a
            # string would add one "question" per character, so skip it (already logged there).
            if questions == 'API error':
                continue
            # A single question returned as a bare string instead of a list.
            questions = [questions]
        elif not isinstance(questions, (list, tuple)):
            print(f"Error parsing json response: unexpected type for 'Q': {type(questions).__name__}")
            continue

        for question_text in questions:
            # Ignore non-string or blank entries rather than crashing on .strip().
            if not isinstance(question_text, str) or not question_text.strip():
                continue
            # Copy the metadata so a later change to a question's metadata cannot leak into its context.
            question_metadata = dict(context.metadata) if duplicate_metadata else {}
            question_document = Document(page_content=question_text.strip(), metadata=question_metadata)
            result.add_question_context(question_document, context)
    return result

In [15]:
#Generate questions for a sub-sample of the passed documents
# Only the first 10 LLM-approved chunks are used; default arguments give one question per chunk.
qc_meta = generate_questions(docs_passed_llm[:10])

100%|██████████| 10/10 [00:31<00:00,  3.11s/it]


### Question filtering

In [None]:
# Drop questions outside the default length bounds (20-150 characters); this modifies qc_meta in place.
qc_meta.filter_questions_by_length()
# Print the first 3 remaining question-context pairs for a manual sanity check.
qc_meta.display_question_context_pairs(3)

### Updating the question-context pairs

In [17]:
import chromadb
from chromadb.utils import embedding_functions

chroma_client = chromadb.Client()

# get_or_create_collection keeps this cell re-runnable: create_collection raises when a
# collection named "qc_collection" already exists from an earlier run in the same kernel.
db_collection = chroma_client.get_or_create_collection(
    name="qc_collection",
    embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction('intfloat/multilingual-e5-base'),
    metadata={"hnsw:space": "cosine"}  # use cosine distance for the HNSW index
)

def _add_documents_to_chroma(chroma_collection, documents_by_id, doc_type: str, prepend: str) -> None:
    """Add one group of documents ({id: Document}) to a Chroma collection, tagged with ``doc_type``."""
    # Skip empty groups instead of passing empty lists to the collection.
    if not documents_by_id:
        return
    chroma_collection.add(
        documents=[f'{prepend} {doc.page_content}' for doc in documents_by_id.values()],
        ids=list(documents_by_id.keys()),
        # "type" is written last so a "type" key in the document's own metadata
        # cannot overwrite the question/context discriminator.
        metadatas=[{**(doc.metadata or {}), "type": doc_type} for doc in documents_by_id.values()],
    )


def add_qc_to_chroma(question_context_obj: QuestionContextManager, chroma_collection, question_prepend: str = "query:", context_prepend: str = "passage:"):
    """Store all questions and contexts of a QuestionContextManager in a Chroma collection.

    Documents are added under the IDs the manager assigned to them. Each text is
    prefixed with ``question_prepend`` / ``context_prepend`` followed by a space,
    and each entry's metadata is the document's own metadata plus a ``"type"``
    field set to ``"question"`` or ``"context"``.

    Parameters:
    - question_context_obj (QuestionContextManager): Source of the documents; its
      ``questions`` and ``contexts`` dicts ({id: Document}) are read.
    - chroma_collection: The collection to add to; only its ``add`` method is used.
    - question_prepend (str): Prefix for question texts. Default is "query:".
    - context_prepend (str): Prefix for context texts. Default is "passage:".
    """
    _add_documents_to_chroma(chroma_collection, question_context_obj.questions, "question", question_prepend)
    _add_documents_to_chroma(chroma_collection, question_context_obj.contexts, "context", context_prepend)
