##### Environment Setup

In [None]:
%%capture

%pip install -U langchain huggingface_hub transformers sentence_transformers
%pip install -U langchain-community
%pip install -U langchain-huggingface
%pip install -U langchain_experimental
%pip install -U langchain_openai
%pip install -U langchain-chroma

%pip install -U chromadb

%pip install -U unstructured
%pip install -U sentence-transformers
%pip install -U nltk
%pip install -U spacy
%pip install -U --upgrade pymupdf
%pip install -U transformers torch

!python -m spacy download en_core_web_sm

# Warning control
import warnings
warnings.filterwarnings("ignore")

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model: which checkpoint to load and how to load it
embeddings_model_name = 'dunzhang/stella_en_1.5B_v5'
embeddings_model_kwargs = {
    "device": 'cpu',               # run the embedding model on CPU
    'trust_remote_code': True,     # allow the custom code from the model repository to run
}

## @Islam, please fix (To-do)
# Instantiate the embedding model locally through the HuggingFace embedding class
hf_embeddings = HuggingFaceEmbeddings(
    model_kwargs=embeddings_model_kwargs,
    model_name=embeddings_model_name,
)

# Smoke test: embed a short string and report the type and size of the vector
vect_embed = hf_embeddings.embed_query("Hello world!")
print("Embedding vector type: ", type(vect_embed))
print("Embedding vector length: ", len(vect_embed))


After creating a Hugging Face account, visit https://huggingface.co/settings/tokens/new?tokenType=read to generate an API token with read permission.

In [None]:
from google.colab import userdata

# Read the Hugging Face token from the Colab user data entry named 'HF_API_KEY'
# so the key itself is never written into the notebook.
HF_API_KEY = userdata.get('HF_API_KEY')

In [None]:
from langchain_huggingface import HuggingFaceEndpoint

# Hosted Hugging Face generation endpoint.
# The Mistral model works for now, but the model choice will need to be updated.
hf_llm = HuggingFaceEndpoint(
    huggingfacehub_api_token=HF_API_KEY,            # Hugging Face API token
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",   # model name
    task="text-generation",                         # produce a text response
    do_sample=False,                                # sampling disabled
    max_new_tokens=150,                             # upper bound on generated tokens
)

##### Indexing

In [None]:
import fitz  # PyMuPDF
import re
import spacy
from langchain.schema import Document
import logging
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Logging: show INFO and above, formatted as "LEVEL: message"
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

# spaCy English pipeline, shared by the sentence tokenizer defined below
nlp = spacy.load('en_core_web_sm')

def extract_text_with_metadata(pdf_path, page_limit=None):
    """Read a PDF page by page and return its title together with the page texts.

    Args:
        pdf_path: Path of the PDF file to open.
        page_limit: Optional maximum number of pages to read, counted from the
            first page. ``None`` reads every page.

    Returns:
        A ``(title, pages)`` tuple. ``title`` is the file name without its
        extension; ``pages`` is a list of dicts holding the 1-indexed
        ``'page_number'`` and the stripped plain ``'text'`` of each page.
        If any exception occurs it is logged and
        ``("Untitled Document", [])`` is returned instead.
    """
    try:
        with fitz.open(pdf_path) as pdf:
            logging.info("PDF opened successfully.")

            # Title = file name with the directory and extension removed
            stem, _extension = os.path.splitext(os.path.basename(pdf_path))

            # Number of pages to read: all of them, or at most `page_limit`
            total_pages = len(pdf)
            if page_limit is not None:
                total_pages = min(total_pages, page_limit)

            pages = [
                {
                    'page_number': index + 1,  # 1-indexed
                    'text': pdf.load_page(index).get_text("text").strip(),
                }
                for index in range(total_pages)
            ]
            return stem, pages

    except Exception as err:
        logging.error(f"Error extracting text from PDF: {err}")
        return "Untitled Document", []

def _is_boilerplate_line(line):
    """Return True when `line` matches one of the patterns that are dropped during cleaning."""
    # Lines with DOIs
    if re.search(r'doi:\s*\d+\.\d+/\S+', line, re.IGNORECASE):
        return True
    # Lines starting with numbers followed by ':' or '.'
    if re.match(r'^\d+[:.]', line):
        return True
    # Lines containing email addresses (the TLD class must not contain a literal '|')
    if re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', line):
        return True
    # Lines containing specific keywords.
    # NOTE(review): this is a case-insensitive substring match, so a line that
    # merely contains e.g. "accepted" is dropped too; confirm this is wanted.
    if re.search(r'(Received|revised|accepted|Keywords|Abstract)', line, re.IGNORECASE):
        return True
    # Lines that are standalone numbers
    return bool(re.match(r'^\d+$', line.strip()))


def clean_extracted_text(raw_text):
    """Remove boilerplate lines from extracted pages and normalise whitespace.

    Args:
        raw_text: Iterable of dicts with the keys ``'text'`` (raw page text)
            and ``'page_number'``.

    Returns:
        A list with one dict per input page, holding ``'page_number'`` and
        ``'cleaned_text'``. Words hyphenated across a line break are joined,
        lines matched by ``_is_boilerplate_line`` are dropped, runs of
        whitespace inside a line collapse to a single space, and the remaining
        lines are joined with a single newline.
    """
    cleaned_pages = []

    for page in raw_text:
        text = page['text']
        page_number = page['page_number']

        # Fix hyphenated line breaks (e.g., "retrieval-\n augmented"). This has
        # to run on the whole page: once the text is split on '\n' a single
        # line can no longer contain the newline the pattern looks for.
        text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)

        cleaned_lines = [
            # Replace multiple spaces with a single space
            re.sub(r'\s+', ' ', line).strip()
            for line in text.split('\n')
            if not _is_boilerplate_line(line)
        ]

        # One cleaned line per row; switch to '\n\n' to mark paragraph breaks
        cleaned_pages.append({
            'page_number': page_number,
            'cleaned_text': '\n'.join(cleaned_lines)
        })

    return cleaned_pages

def tokenize_sentences(cleaned_pages):
    """Split the cleaned text of every page into sentences.

    Args:
        cleaned_pages: Iterable of dicts with the keys ``'cleaned_text'`` and
            ``'page_number'``.

    Returns:
        A list with one dict per page, holding ``'page_number'`` and
        ``'sentences'``: the stripped sentence strings found by the
        module-level ``nlp`` pipeline, keeping only those longer than
        20 characters.
    """
    result = []
    for entry in cleaned_pages:
        page_text, number = entry['cleaned_text'], entry['page_number']
        # Strip each sentence once, then filter on the stripped length
        stripped = (span.text.strip() for span in nlp(page_text).sents)
        result.append({
            'page_number': number,
            'sentences': [sentence for sentence in stripped if len(sentence) > 20],
        })
    return result

def create_chunks(tokenized_pages, title):
    """Wrap every sentence in a ``Document`` that records where it came from.

    Args:
        tokenized_pages: Iterable of dicts with the keys ``'page_number'`` and
            ``'sentences'`` (a list of sentence strings).
        title: Value stored under the ``'source'`` metadata key of every chunk.

    Returns:
        A flat list of ``Document`` objects, one per sentence, in page order.
        Each carries the metadata ``{'page_number': ..., 'source': title}``.
    """
    chunks_out = []
    for entry in tokenized_pages:
        number = entry['page_number']
        chunks_out.extend(
            Document(
                page_content=text,
                metadata={'page_number': number, 'source': title},
            )
            for text in entry['sentences']
        )
    return chunks_out


In [None]:
# Input document (path is relative to the notebook's working directory)
pdf_path = '20241015_MISSION_KI_Glossar_v1.0 en.pdf'

# Extract the raw page texts; the optional 2nd argument limits the page count
title, pages_raw_text = extract_text_with_metadata(pdf_path)

# Drop boilerplate lines and normalise whitespace
cleaned_pages = clean_extracted_text(pages_raw_text)

# Tokenize into sentences
tokenized_pages = tokenize_sentences(cleaned_pages)

# Structure sentences using LangChain's Document with metadata
chunks = create_chunks(tokenized_pages, title)

print(f"Total #chunks: {len(chunks)}")

# Preview the first 5 and last 3 chunks. The positions go through a set so a
# document with fewer than 8 chunks is not shown twice, and the printed number
# is the chunk's real 1-based position in `chunks`.
head_count, tail_count = 5, 3
preview_positions = sorted(
    set(range(min(head_count, len(chunks))))
    | set(range(max(len(chunks) - tail_count, 0), len(chunks)))
)

print("\nFirst 5 and Last 3 Chunks with Metadata:")
for position in preview_positions:
    chunk = chunks[position]
    print(f"{position + 1}: ")
    print('===================')
    print(chunk.page_content)
    print('===================')
    print(f"   Metadata: Title='{chunk.metadata['source']}', Page Number={chunk.metadata.get('page_number', 'N/A')}, Start Index={chunk.metadata.get('start_index', 'N/A')}\n")


##### Embed

In [None]:
from tqdm import tqdm
from langchain_chroma import Chroma                    # This is the database we will use to store the embeddings

# Delete Chroma's default collection before filling it.
# NOTE(review): presumably this is here so that re-running the cell does not
# stack a second copy of the embeddings on top of an earlier run; confirm.
Chroma().delete_collection()

# Create the (empty) store up front instead of lazily on the first chunk. The
# loop then needs no special case, and `vectorstore` is a real object rather
# than None when `chunks` is empty.
vectorstore = Chroma(
    embedding_function=hf_embeddings,
    collection_metadata={"hnsw:space": "cosine"},    # by default L2 distance measured
)

# Add the chunks one at a time so the progress bar advances per chunk
for split in tqdm(chunks, colour="green"):
    vectorstore.add_documents([split])

##### Retrieval

In [None]:
# The retrieved text must not fill the model's whole context window, so the
# number of results is capped with k. Each chunk is a single sentence, which
# is why a fairly high k is still acceptable.
retriever = vectorstore.as_retriever(
    search_kwargs={
        'k': 10,                     # number of retrieved relevant documents
        "score_threshold": 0.5,      # threshold used by this search type
    },
    search_type="similarity_score_threshold",
)

#### using context in prompt

In [None]:
from langchain.prompts import PromptTemplate

# Example glossary entry and a draft answer. The texts span several lines, so
# they must be triple-quoted: a single-quoted literal cannot contain a raw
# line break, and the mismatched quotes stopped the whole cell from parsing.
# The "→" cross-reference marker was mis-encoded in the original text.
user_example_input = """User information
Characteristics of an →AI system with regard to the quality of information, interaction and operation by a
user, including knowledge of the involvement of AI, barriers, and the quality of the user experience"""
user_example_output = 'User Information is part AI system, '

# Second example glossary entry
context = """Interpretability
Property of an → AI model that its model parameters, weights or other (mathematical) properties are as
directly comprehensible as possible and directly understandable for specialist personnel."""

# prepare prompt
# The template has two placeholders, {context} and {question}; both have to be
# supplied by whatever feeds this prompt.
# NOTE(review): the text announces "3 pieces of context" while the retriever
# is configured with k=10; confirm which number is intended.
template = """Use the following 3 pieces of context to answer the question at the end.
Use three sentences maximum and keep the answer as concise as possible.
Say I don't know when you need to.
{context}
Question: {question}
Helpful Answer:"""

# Build the LangChain prompt object from the raw template string
prompt = PromptTemplate.from_template(template)

# Turn the retrieved chunks into one context string
def format_docs(top_k_chunks):
    """Join the ``page_content`` of every chunk, separated by a blank line."""
    contents = [doc.page_content for doc in top_k_chunks]
    return "\n\n".join(contents)

additional_context = "Requirement refers to concepts such as Fairness and Explainability."

def add_context(retrived_docs):
    """Prefix ``retrived_docs`` with the fixed background sentence.

    The result is the background sentence, a dashed separator surrounded by
    blank lines, and then ``retrived_docs``.
    """
    background = "Requirement refers to concepts such as Fairness and Explainability."
    header = background + '\n\n---------\n\n'
    return header + retrived_docs



In [None]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema import StrOutputParser

# RAG chain. The leading dict sends the incoming query to both prompt variables:
#   "context":  query -> add_context -> retriever -> format_docs -> add_context
#   "question": the query, passed through unchanged
# NOTE(review): the first add_context runs on the query itself, so the
# retriever is searched with the extra sentence prepended to the query;
# confirm this is intended and not only the final add_context.
simple_rag_chain = (
 {  "context": add_context | retriever | format_docs | add_context,
    "question":RunnablePassthrough()}
 | prompt                                 # build the prompt
 | hf_llm                                 # llm for generation
 | StrOutputParser()                      # collect the response text
)

# Display response
# NOTE(review): `user_query` is not assigned in any cell visible here; make
# sure it is defined before this cell runs.
print("Generated Response:")
print(simple_rag_chain.invoke(user_query))

In [None]:
from langchain.prompts import PromptTemplate

# prepare prompt
# This version of the template has three placeholders: {context}, {question}
# and {additional_context}. The chain that feeds the prompt has to provide a
# value for each of them.
template = """Use the following 3 pieces of context to answer the question at the end.
Use three sentences maximum and keep the answer as concise as possible.
Say I don't know when you need to.
{context}
Question: {question}
new_context: {additional_context}
Helpful Answer:"""

# Build the LangChain prompt object from the raw template string
prompt = PromptTemplate.from_template(template)

# Build the context string from the retrieved chunks
def format_docs(top_k_chunks):
    """Return the chunks' ``page_content`` values joined by a blank line."""
    separator = "\n\n"
    parts = []
    for doc in top_k_chunks:
        parts.append(doc.page_content)
    return separator.join(parts)

additional_context = "Requirement refers to concepts such as Fairness and Explainability."

def add_context(additional_context):
    """Return the fixed background sentence.

    The ``additional_context`` argument is accepted but not used: the same
    sentence is returned for every input.
    """
    fixed_sentence = "Requirement refers to concepts such as Fairness and Explainability."
    return fixed_sentence

from langchain.schema.runnable import RunnablePassthrough
from langchain.schema import StrOutputParser

# RAG chain for the three-variable prompt. Every placeholder of the template
# ({context}, {question}, {additional_context}) needs an entry in the dict;
# without the third one the prompt cannot be formatted.
simple_rag_chain = (
    {
        "context": retriever | format_docs,       # retrieve chunks and join them
        "question": RunnablePassthrough(),        # the query, unchanged
        # add_context ignores its input and returns the fixed background
        # sentence, so it can feed the extra prompt variable directly
        "additional_context": add_context,
    }
    | prompt                                 # build the prompt
    | hf_llm                                 # llm for generation
    | StrOutputParser()                      # collect the response text
)

# Display response
# invoke() takes the chain input as its only data argument; a second
# positional argument is treated as the run configuration, not as extra
# prompt input, so the additional context is supplied by the chain above.
# NOTE(review): `user_query` is not assigned in any cell visible here; make
# sure it is defined before this cell runs.
print("Generated Response:")
print(simple_rag_chain.invoke(user_query))