Environment Setup

In [None]:
!pip install -q langchain_huggingface
!pip install -q langchain-community
!pip install -q "unstructured[local-inference]"
!pip install -q pymupdf

Connect to the models

In [None]:
from google.colab import userdata

# Read the Hugging Face access token from the Colab secret named HF_API_KEY.
# A read-scoped token can be created at https://huggingface.co/settings/tokens/new?tokenType=read
HF_API_KEY = userdata.get("HF_API_KEY")

In [None]:
from langchain_huggingface import HuggingFaceEndpoint

# Hugging Face text-generation endpoint used as the LLM in this notebook.
# The Mistral model works for now; the model choice is expected to be updated later.
hf_llm = HuggingFaceEndpoint(
    huggingfacehub_api_token=HF_API_KEY,          # 🤗 Hugging Face API token
    repo_id="mistralai/Mistral-7B-Instruct-v0.3", # model repository to query
    task="text-generation",                       # request a generated text response
    do_sample=False,                              # sampling turned off
    max_new_tokens=200,                           # upper bound on generated tokens
)

In [None]:
# Alternative: use the LLaMA model hosted on the internal server
from openai import OpenAI
# Set the base URL to the local server.
# client = OpenAI(base_url="WILL PROVIDE THE URL",
                # api_key="") # the OpenAI client requires an API key to be set, but the server itself does not use it

#get a list of models hosted on the local server
# print(client.models.list())

# Call the ChatCompletion API
# completion = client.chat.completions.create(
#       model="meta-llama-3.1-70b-instruct-fp8",
#       messages=[
#           {"role": "system", "content": "You are a helpful assistant."},
#           {"role": "user", "content": "Hello, are you there?"}
#           ])

#print the response

# print(completion.choices[0].message.content)

Loading the data

In [None]:
import random

from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

sentence_data_file = 'sentence_data.txt'
pdf_path = '20241015_MISSION_KI_Glossar_v1.0 en.pdf'

# Part 1 (sentences): load the sentence data and split it for the first part.
###   UPDATE THIS part to load the sentences  ###

# Part 2 (chunks): load the PDF, split it into overlapping chunks and draw a
# reproducible random sample of `num_chunks_to_select` chunks.
loader = UnstructuredPDFLoader(pdf_path)
data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=70)
documents = text_splitter.split_documents(data)

random.seed(42)  # fixed seed so the same chunks are selected on every run
num_chunks_to_select = 15  # Number of random chunks you want
# random.sample raises ValueError when asked for more items than exist, so cap
# the sample size at the number of chunks the splitter actually produced.
random_chunks = random.sample(documents, min(num_chunks_to_select, len(documents)))

Few-shot examples

In [None]:
# Few-shot examples for the sentence-level task. Each entry pairs an input
# "sentence" with the expected "output", written as one statement per line.
# NOTE(review): continuation lines inside the "output" strings are indented in the
# source, so that leading whitespace is part of the string value — confirm intended.
# NOTE(review): the "â†’" sequences in the first example look like a mis-decoded
# arrow character — verify against the text extracted from the PDF.
sentences_examples = [
    # Example 1: glossary-style entry (term on the first line, definition below).
    {
        "sentence":"""Transparency
Property of an â†’KI system that is explainable and comprehensible. In the context of this quality
standard, "transparency" also includes documentation of the properties of the â†’KI system.""",
        "output":"""Transparency is a property of KI system
        Transparency is explainable
        Transparency is comprehensible
        Transparency includes documentation of properties of KI system"""
    },
    # Example 2: entry that lists an alternative term before the definition.
    {
        "sentence":"""opacity
opaqueness:
Property of a system that appropriate information about the system is unavailable to relevant stakeholders.""",
        "output":"""Opacity also called Opaqueness
        Opacity is a Property of a system
        Property of a system has characteristic Information unavailable to stakeholders"""
    },
    # Example 3: a single free-form sentence containing an enumeration.
    {
        "sentence":"""This requirement is closely linked with the principle of explicability and encompasses transparency of elements relevant to an AI system: the data, the system and the business models.""",
        "output":"""Requirement is linked with principle of explicability
        Requirement encompasses Transparency of elements
        Transparency is relevant to AI system
        Elements include Data
        Elements include System
        Elements include Business models"""
    },
]

# Few-shot examples for the chunk-level task. Each entry pairs a multi-line
# "chunk" of text with the expected "output", written as one statement per line.
# The chunks start and end mid-entry, so they contain partial sentences.
# NOTE(review): continuation lines inside the "output" strings are indented in the
# source, so that leading whitespace is part of the string value — confirm intended.
# NOTE(review): the "â†’" sequences look like a mis-decoded arrow character —
# verify against the text extracted from the PDF.
chunks_examples = [
    # Example 1.
    # NOTE(review): the annotation "(Optional, better to have)" sits inside the
    # "output" string, so it is part of the example text — confirm intended.
    {
        "chunk":"""behavior and/or functioning of this â†’AI system in principle and during operation and, if necessary, to
terminate it.
Monitoring
Procedure in which deviations between observable actual states and the desired target states are detected
during the operation of an â†’KI system.
Non-discrimination
Characteristic of an open process carried out by an â†’KI system if, in the course of this process, several
human individuals are treated in comparison with each other and this process is carried out in an open
process.
is legally free from the mistreatment of a human individual on the basis of a legally protected
characteristic.
User information""",
        "output":"""AI system has property functioning in principle and during operation
        Monitoring is deviations between observable actual states and desired target states
        Deviations occur during operation of an KI system
        Monitoring is a procedure for detecting deviations during operation (Optional, better to have)
        Non-discrimination is a characteristic of an open process by an KI system
        Open process treats several human individuals in comparison with each other
        Non-discrimination ensures no mistreatment of a human individual
        Mistreatment is based on legally protected characteristics"""
    },
    # Example 2: begins with the same two lines that end the previous chunk.
    {
        "chunk":"""characteristic.
User information
Characteristics of an â†’AI system with regard to the quality of information, interaction and operation by a
user, including knowledge of the involvement of AI, barriers, and the quality of the user experience.
freedom and with a view to preventing nudging.
Robustness
Ability of an â†’AI system to maintain its regular and usual behavior and functioning in the best possible
way even in the event of non-malicious, adverse, disruptive or faulty inputs or external influences.
to keep.
Traceability
Property of an â†’KI system with regard to the ability to record the consecutive sequence of all decisions""",
        "output":"""AI system has characteristic User information
        User information relates to quality of information, interaction and operation by a user
        AI system includes Knowledge of AI involvement
        AI system includes barriers
        AI system includes quality of the user experience
        AI system has ability Robustness
        Robustness allows regular and usual behavior in adverse conditions
        AI system maintains behavior even under faulty inputs or external influences
        AI system has property Traceability
        Traceability relates to recording consecutive sequence of all decisions"""
    },
    # Example 3: a leading fragment followed by the complete "Transparency" entry.
    {
        "chunk":"""that enter or have entered an â†’KI system along the entire life cycle.
Transparency
Property of an â†’KI system that is explainable and comprehensible. In the context of this quality
standard, "transparency" also includes documentation of the properties of the â†’KI system.""",
        "output":"""Transparency is a property of KI system
        Transparency is explainable
        Transparency is comprehensible
        Transparency includes documentation of properties of the KI system"""
    }
]

Output parsing into a list of strings

In [None]:
from typing import List
from langchain.schema import BaseOutputParser

class NewLineSeparatedOutputParser(BaseOutputParser):
    """Parse raw LLM text into a list of statements, one per line."""

    def parse(self, text: str) -> List[str]:
        """Split ``text`` on newlines into a clean list of strings.

        Each line is stripped of surrounding whitespace and blank lines are
        dropped, so indented or blank-line-separated model output does not
        leak padding or empty entries into the result.

        Args:
            text: Raw text produced by the model.

        Returns:
            The non-empty lines of ``text`` in their original order; an empty
            list when ``text`` is empty or contains only whitespace.
        """
        stripped_lines = (line.strip() for line in text.split('\n'))
        return [line for line in stripped_lines if line]

Prompt engineering

In [None]:
# please keep a record for all prompts you try
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import PromptTemplate

user_query = "HERE"

# prepare prompt
###   UPDATE THIS PROMPT  ###
# The template must expose a {user_query} placeholder: a PromptTemplate without
# input variables cannot accept the query, so invoking the chain with it fails
# before the model is ever called.
template = """BUILD YOUR PROMPT ADDING THE USER_QUERY AND EXAMPLE

{user_query}"""

prompt = PromptTemplate.from_template(template)

simple_rag_chain = (
    prompt                                   # fill the template with the inputs
    | hf_llm                                 # llm for generation
    | NewLineSeparatedOutputParser()         # collect the response into list
)

# Display response
print("Generated Response List:")
# Pass the inputs as a mapping keyed by the template's placeholder name.
print(simple_rag_chain.invoke({"user_query": user_query}))