##### Environment Setup

In [2]:
%%capture

%pip install -U langchain
%pip install -U langchain-community
%pip install -U langchain-huggingface
%pip install -U langchain_experimental
%pip install -U langchain_openai
%pip install -U langchain-chroma

%pip install -U chromadb

%pip install -U unstructured
%pip install -U sentence-transformers
%pip install -U nltk
%pip install -U spacy
%pip install -U --upgrade pymupdf
%pip install -U transformers torch

!python -m spacy download en_core_web_sm

# Warning control
import warnings
warnings.filterwarnings("ignore")

In [3]:
from google.colab import userdata

# Once you have obtained the Hugging Face API token with read permission, assign it to `HF_API_KEY`.
HF_API_KEY = userdata.get('HF_API_KEY')

# For Neo4j, update the URL and password using the credentials file you downloaded.
NEO4J_USERNAME = "neo4j"
NEO4J_URI = userdata.get('NEO4J_URI')
NEO4J_PASSWORD = userdata.get('NEO4J_PASSWORD')

In [4]:
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings

# define huggingface embedding endpoint
hf_embeddings = HuggingFaceEndpointEmbeddings(
    #model= "BAAI/bge-base-en-v1.5",             # model embedding name
    model="sentence-transformers/all-MiniLM-L6-v2",
    task="feature-extraction",                  # generate embeddings (dense vectors)
    huggingfacehub_api_token=HF_API_KEY         # ðŸ¤— huggingface API token
)

# get output vector length
vect_embed = hf_embeddings.embed_query("Hello world!")
EMBEDDING_VECTOR_LENGTH = len(vect_embed)
print("Embedding vector type: ", type(vect_embed))
print("Embedding vector length: ", EMBEDDING_VECTOR_LENGTH)

Embedding vector type:  <class 'list'>
Embedding vector length:  384


In [5]:
from langchain_huggingface import HuggingFaceEndpoint

# define huggingface generation endpoint
hf_llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3", # Model Name
    task="text-generation",                       # task as generating a text response
    max_new_tokens=150,                           # maximum numbers of generated tokens
    do_sample=False,                              # disables sampling
    huggingfacehub_api_token=HF_API_KEY           # ðŸ¤— huggingface API token
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


##### Indexing

In [22]:
import fitz  # PyMuPDF
import re
import spacy
from langchain.schema import Document
import logging
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# Initialize SpaCy's English model
nlp = spacy.load('en_core_web_sm')

def extract_text_with_metadata(pdf_path, page_limit=None):
    try:
        with fitz.open(pdf_path) as file:
            logging.info("PDF opened successfully.")

            filename = os.path.basename(pdf_path)
            title, _ = os.path.splitext(filename)

            #return "\n".join(page.get_text() for page in file)
            pages_text = []
            for page_num in range(len(file) if page_limit is None else min(len(file), page_limit)):
                page = file.load_page(page_num)
                text = page.get_text("text").strip()
                pages_text.append({
                    'page_number': page_num + 1,  # 1-indexed
                    'text': text
                })
            return title, pages_text

    except Exception as e:
        logging.error(f"Error extracting text from PDF: {e}")
        return "Untitled Document", []

def clean_extracted_text(raw_text):
    cleaned_pages = []

    for page in raw_text:
        text = page['text']
        page_number = page['page_number']
        lines = text.split('\n')
        cleaned_lines = []

        for line in lines:
            # Skip lines with DOIs
            if re.search(r'doi:\s*\d+\.\d+/\S+', line, re.IGNORECASE):
                continue
            # Skip lines starting with numbers followed by ':' or '.'
            if re.match(r'^\d+[:.]', line):
                continue
            # Skip lines containing email addresses
            if re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', line):
                continue
            # Skip lines containing specific keywords
            if re.search(r'(Received|revised|accepted|Keywords|Abstract)', line, re.IGNORECASE):
                continue
            # Skip lines that are standalone numbers
            if re.match(r'^\d+$', line.strip()):
                continue
            # Optionally skip very short lines
            # if len(line.strip()) < 20:
                # continue
            # Fix hyphenated line breaks (e.g., "retrieval-\n augmented")
            line = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', line)
            # Replace multiple spaces with a single space
            line = re.sub(r'\s+', ' ', line)
            cleaned_lines.append(line.strip())

        # Preserve paragraph breaks by joining with double newline
        cleaned_text = '\n'.join(cleaned_lines) # can give \n\n
        cleaned_pages.append({
            'page_number': page_number,
            'cleaned_text': cleaned_text
        })

    return cleaned_pages

def tokenize_sentences(cleaned_pages):
    tokenized_pages = []

    for page in cleaned_pages:
        text = page['cleaned_text']
        page_number = page['page_number']
        doc = nlp(text)
        sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 20]
        tokenized_pages.append({
            'page_number': page_number,
            'sentences': sentences
        })
    return tokenized_pages

def structure_sentences(tokenized_pages, title):
    documents = []
    for page in tokenized_pages:
        page_number = page['page_number']
        for sentence in page['sentences']:
            doc = Document(
                page_content=sentence,
                metadata={
                    'page_number': page_number,
                    'source': title
                }
            )
            documents.append(doc)
    return documents

def create_chunks(documents):
    # Initialize RecursiveCharacterTextSplitter with the desired parameters
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,                    # Define chunk size
        chunk_overlap=200,                  # Define chunk overlap
        add_start_index=True                # Optionally add start index to metadata
    )

    # Split documents into chunks
    chunks = text_splitter.split_documents(documents)
    return chunks

In [23]:
pdf_path = '20241015_MISSION_KI_Glossar_v1.0 en.pdf'

title, pages_raw_text = extract_text_with_metadata(pdf_path,5) #2nd argument is for page limit(optional)
# if not pages_raw_text:
#     print("No text extracted from the PDF.")
#     return

cleaned_pages = clean_extracted_text(pages_raw_text)

# Tokenize into sentences
tokenized_pages = tokenize_sentences(cleaned_pages)

# Structure sentences using LangChain's Document with metadata
documents = structure_sentences(tokenized_pages, title)

chunks = create_chunks(documents)
print(f"Total #chunks: {len(chunks)}")

print("\nFirst 5 and Last 3 Chunks with Metadata:")
for idx, chunk in enumerate(chunks[:5] + chunks[-3:], 1):
    print(f"{idx}: {chunk.page_content}")
    print(f"   Metadata: Title='{chunk.metadata['source']}', Page Number={chunk.metadata.get('page_number', 'N/A')}, Start Index={chunk.metadata.get('start_index', 'N/A')}\n")


Total #chunks: 38

First 5 and Last 3 Chunks with Metadata:
1: October 15, 2024
MISSION KI
Glossary v1.0
c /o acatech
German Academy of Science and Engineering
Office: Karolinenplatz 4,
80333 Munich Germany
www.acatech.de
   Metadata: Title='20241015_MISSION_KI_Glossar_v1.0 en', Page Number=1, Start Index=0

2: 1 / 11
TABLE OF CONTENTS
TABLE OF CONTENTS
00
   Metadata: Title='20241015_MISSION_KI_Glossar_v1.0 en', Page Number=2, Start Index=0

3: Preliminary remarks on v1.0_______________________________________________________1
01 Generic terms __________________________________________________________________1
02 Individual quality dimensions ______________________________________________________2
03 Horizontal concepts______________________________________________________________4
04 Further terms __________________________________________________________________5
   Metadata: Title='20241015_MISSION_KI_Glossar_v1.0 en', Page Number=2, Start Index=0

4: Glossary
v1.0
1 / 11
Glossary 

##### Embed

In [24]:
from tqdm import tqdm
from langchain_chroma import Chroma                    # This is the database we will use to store the embeddings

vectorstore = None
Chroma().delete_collection()
for split in tqdm(chunks, colour="green"):
    if vectorstore:
      vectorstore.add_documents([split])                # Add new split to the vectorstore
    else:
      vectorstore = Chroma.from_documents(              # Generate the vectorstore with first split
                documents=[split],
                embedding=hf_embeddings,
                collection_metadata={"hnsw:space": "cosine"}     # by default L2 distance measured
                )

100%|[32mâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ[0m| 38/38 [00:16<00:00,  2.27it/s]


##### Retrieval

In [25]:
retriever = vectorstore.as_retriever(
      search_type="similarity_score_threshold",           # similarity function
      search_kwargs={"score_threshold": 0.5}                     # number of retrieved relevant documents # "k": 3
    )

In [28]:
#user_query = "An attempt to determine certain characteristics, performance or comparable characteristics."
#user_query="Property of an â†’AI system,"
user_query="Human supervision and intervention refer to the ability of a skilled individual to monitor and modify the behavior or functioning of an AI system within its application context, including making necessary adjustments or halting its operations as needed."
query_embedding = hf_embeddings.embed_query(user_query)

similarity_search_by_vector_with_relevance_scores():
 It calculates the distance rather than similarity.
 Default distance is L2, which is updated to cosine


In [29]:
# using query
# results = retriever.get_relevant_documents(user_query)
# using user query embedding
results = vectorstore.similarity_search_by_vector_with_relevance_scores(
    query_embedding

)

# Convert cosine distance to cosine similarity
docs_with_similarity = [(doc, 1 - score) for doc, score in results]

# Display the relevant documents
print("Relevant Chunks for the Query (Using Embedding):")
for doc, score in docs_with_similarity:
  print(doc.page_content)
  print(doc.metadata)
  print(f"Similarity Score: {score} \n")

Relevant Chunks for the Query (Using Embedding):
Human Oversight and Control
Property of an â†’AI system, including its embedding in the application context, with regard to the
possibility for a - technically competent - human individual to adequately observe and change the
behavior and/or functioning of this â†’AI system in principle and during operation and, if necessary, to
terminate it.
{'page_number': 5, 'source': '20241015_MISSION_KI_Glossar_v1.0 en', 'start_index': 0}
Similarity Score: 0.5840766429901123 

Explainability
Characteristics of an â†’KI system with regard to the basic comprehensibility and comprehensibility of
functionality, behavior and output for human specialists, but also for affected persons
and users.
{'page_number': 4, 'source': '20241015_MISSION_KI_Glossar_v1.0 en', 'start_index': 0}
Similarity Score: 0.3151412010192871 

User information
Characteristics of an â†’AI system with regard to the quality of information, interaction and operation by a
user, includi

#### using context in prompt

In [None]:
from langchain.prompts import PromptTemplate

# prepare prompt
template = """Use the following 3 pieces of context to answer the question at the end.
Use three sentences maximum and keep the answer as concise as possible.
Say I don't know when you need to.
{context}
Question: {question}
Helpful Answer:"""

prompt = PromptTemplate.from_template(template)

# Function to format chunks into context
def format_docs(top_k_chunks):
    return "\n\n".join(chunk.page_content for chunk in top_k_chunks)

additional_context = "Requirement refers to concepts such as Fairness and Explainability."

def add_context(retrived_docs):
    additional_context = "Requirement refers to concepts such as Fairness and Explainability."

    return additional_context + '\n\n---------\n\n' + retrived_docs



In [None]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema import StrOutputParser

simple_rag_chain = (
 {  "context": add_context | retriever | format_docs | add_context,
    "question":RunnablePassthrough()}
 | prompt                                 # build the prompt
 | hf_llm                                 # llm for generation
 | StrOutputParser()                      # collect the response text
)

# Display response
print("Generated Response:")
print(simple_rag_chain.invoke(user_query))



Generated Response:
 Requirement refers to essential conditions or features needed to meet certain standards in a project or system, such as Fairness and Explainability.


In [None]:
from langchain.prompts import PromptTemplate

# prepare prompt
template = """Use the following 3 pieces of context to answer the question at the end.
Use three sentences maximum and keep the answer as concise as possible.
Say I don't know when you need to.
{context}
Question: {question}
new_context: {additional_context}
Helpful Answer:"""

prompt = PromptTemplate.from_template(template)

# Function to format chunks into context
def format_docs(top_k_chunks):
    return "\n\n".join(chunk.page_content for chunk in top_k_chunks)

additional_context = "Requirement refers to concepts such as Fairness and Explainability."

def add_context(additional_context):
    return "Requirement refers to concepts such as Fairness and Explainability."

from langchain.schema.runnable import RunnablePassthrough
from langchain.schema import StrOutputParser

simple_rag_chain = (
 {  "context": retriever | format_docs,
    "question":RunnablePassthrough()}
 | prompt                                 # build the prompt
 | hf_llm                                 # llm for generation
 | StrOutputParser()                      # collect the response text
)

# Display response
print("Generated Response:")
print(simple_rag_chain.invoke(user_query, additional_context))

Generated Response:


AttributeError: 'str' object has no attribute 'items'

#### Misc

In [None]:
from sentence_transformers import SentenceTransformer, util

# try 'all-Mpnet-base-v2' for better accuracy
model = SentenceTransformer('all-MiniLM-L6-v2')

# Example sentences from documents
document_sentences = [
    "Can you recommend a good Italian place to eat?",
    "What is the nearest Chinese restaurant?",
    "I'm looking for a place that serves authentic Italian cuisine.",
    "Where can I find good Italian food around here?",
    "Is there any Italian restaurant open nearby?",
    "I need directions to the nearest coffee shop.",
    "Tell me about the best pizza spots in town.",
    "Are there any Italian restaurants that offer delivery?",
    "What is the best Japanese sushi place in this area?",
    "I am interested in learning Italian cuisine recipes."
]

user_query = "I want to find the best Italian restaurant nearby."

# Generate embeddings
document_embeddings = model.encode(document_sentences, convert_to_tensor=True)
query_embedding = model.encode(user_query, convert_to_tensor=True)

# Compute cosine similarity
cosine_scores = util.pytorch_cos_sim(query_embedding, document_embeddings)

# Get the most relevant sentences (sorting in descending order of similarity)
relevant_indices = cosine_scores.argsort(descending=True).tolist()[0]
print("Top relevant sentences:")
for idx in relevant_indices[:5]:  # Adjust number of top results as needed
    print(f"Sentence: {document_sentences[idx]}, Score: {cosine_scores[0][idx].item():.4f}")


Top relevant sentences:
Sentence: Can you recommend a good Italian place to eat?, Score: 0.8437
Sentence: Is there any Italian restaurant open nearby?, Score: 0.8398
Sentence: I'm looking for a place that serves authentic Italian cuisine., Score: 0.8295
Sentence: Where can I find good Italian food around here?, Score: 0.7989
Sentence: Are there any Italian restaurants that offer delivery?, Score: 0.7651


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize the embeddings model from LangChain
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Example sentences from documents
document_sentences = [
    "Can you recommend a good Italian place to eat?",
    "What is the nearest Chinese restaurant?",
    "I'm looking for a place that serves authentic Italian cuisine.",
    "Where can I find good Italian food around here?",
    "Is there any Italian restaurant open nearby?",
    "I need directions to the nearest coffee shop.",
    "Tell me about the best pizza spots in town.",
    "Are there any Italian restaurants that offer delivery?",
    "What is the best Japanese sushi place in this area?",
    "I am interested in learning Italian cuisine recipes."
]

# User query
user_query = "I want to find the best Italian restaurant nearby."

# Generate embeddings for the query and documents
query_embedding = np.array(embeddings.embed_query(user_query)).reshape(1, -1)  # Convert to 2D array for similarity calculation
doc_embeddings = np.array(embeddings.embed_documents(document_sentences))

# Compute cosine similarity
cosine_scores = cosine_similarity(query_embedding, doc_embeddings)

# Get the most relevant sentences (sorting in descending order of similarity)
relevant_indices = cosine_scores.argsort(axis=1)[0][::-1]  # Sort indices by descending scores
print("Top relevant sentences:")
for idx in relevant_indices[:5]:  # Adjust number of top results as needed
    print(f"Sentence: {document_sentences[idx]}, Score: {cosine_scores[0][idx]:.4f}")


Top relevant sentences:
Sentence: Can you recommend a good Italian place to eat?, Score: 0.8437
Sentence: Is there any Italian restaurant open nearby?, Score: 0.8398
Sentence: I'm looking for a place that serves authentic Italian cuisine., Score: 0.8295
Sentence: Where can I find good Italian food around here?, Score: 0.7989
Sentence: Are there any Italian restaurants that offer delivery?, Score: 0.7651
