In [1]:
!pip install langchain-community langchain-core
!pip install openai
!pip install pypdf
!pip install faiss-cpu
!pip install tiktoken
!pip install pyyaml

Collecting langchain-community
  Downloading langchain_community-0.2.16-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-core
  Downloading langchain_core-0.2.38-py3-none-any.whl.metadata (6.2 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain<0.3.0,>=0.2.16 (from langchain-community)
  Downloading langchain-0.2.16-py3-none-any.whl.metadata (7.1 kB)
Collecting langsmith<0.2.0,>=0.1.0 (from langchain-community)
  Downloading langsmith-0.1.116-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain-community)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.22.0-py3-none-any.whl.m

In [8]:
#Helps to access to files in drive. We will need a PDF file
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Pypdf is a library in Python that provides a Pythonic API for interacting with PDF documents.
from pypdf import PdfReader

# I have stored my key secretly in Yaml.
import yaml


# langchain_community is a module from the LangChain framework, which is designed for creating
# applications that use large language models (LLMs) in an efficient and effective manner.
# OpenAIEmbeddings is a class used to generate embeddings, which are dense vector representations
# of text, from OpenAI models. These embeddings are used for various NLP tasks like search, similarity
# comparison, and clustering.
from langchain_community.embeddings.openai import OpenAIEmbeddings

# CharacterTextSplitter is a class from LangChain that is used to split text into smaller chunks.
# This is particularly useful when dealing with large text data, making it easier to process, analyze,
# or generate embeddings from smaller, more manageable pieces of text.
from langchain.text_splitter import CharacterTextSplitter

# The following are various vector stores used in LangChain to store and search over embeddings:
# - ElasticVectorSearch: Uses Elasticsearch to store embeddings and perform vector search.
# - Pinecone: A managed vector database that allows for scalable and efficient storage and search
#   over embeddings.
# - Weaviate: An open-source vector search engine that provides various features, including storing
#   and searching embeddings.
# - FAISS: A library developed by Facebook AI Research for efficient similarity search and clustering
#   of dense vectors. It is commonly used for building large-scale vector search systems.
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS


Mounted at /content/drive


In [9]:
import os
import yaml

with open("/content/drive/MyDrive/DS_Portfolio/LangChain and OpenAI to chat with PDF files/OpenAIApi.yaml", 'r') as file:
    data = yaml.safe_load(file)

# Access the value of 'your_key_name'
key_value = data.get('your_key_name')

os.environ['OPENAI_API_KEY'] = key_value

In [59]:
reader = PdfReader('/content/drive/MyDrive/DS_Portfolio/LangChain and OpenAI to chat with PDF files/ai act de.pdf')

In [60]:
###### OLD CODE WHICH CONFUSES PAGES ####
# # It will go to each page and read text from each page, raw_text will contain all the text
# raw_text = ''
# for i, page in enumerate(reader.pages):
#   text = page.extract_text()
#   if text:
#     raw_text += text

In [76]:
raw_text = ''
for i, page in enumerate(reader.pages):
    text = page.extract_text()
    if text:
        raw_text += f'\n\n[Page {i+1}]\n{text}'

In [77]:
raw_text

'\n\n[Page 1]\nArtikel 4\nKI-Kompetenz\nDie Anbieter und Betreiber von KI-Systemen ergreif en Maßnahmen, um nach best en Kräf ten sicherzust ellen, dass ihr \nPersonal und andere Personen, die in ihrem Auftrag mit dem Betr ieb und der Nutzung von KI-Syste men befasst sind, über \nein ausreich endes Maß an KI-K ompet enz verfüge n, wobei ihre technisc hen Kenntnisse, ihre Erfahrung, ihre Ausbildung und \nSchulung und der Kontext, in dem die KI-Syste me eingesetzt werden sollen, sowie die Pers onen oder Pers onengr uppen, bei \ndenen die KI-Syste me einge setzt werden sollen, zu berücksichtig en sind.\nKAPITEL II\nVERBOTENE PRAKTIK EN IM KI-BEREICH\nArtikel 5\nVerbotene Praktiken im KI-Bereich\n(1) Folgend e Praktiken im KI-Bereich sind verbote n:\na)das Inverkehrbri ngen, die Inbetr iebnahme oder die Verwendung eines KI-Systems, das Technik en der unterschw elligen \nBeeinf lussung außerhalb des Bewusstseins einer Person oder absichtlic h manipulative oder täuschende Technik en mit \nde

In [78]:
# Create an instance of CharacterTextSplitter
textsplitter = CharacterTextSplitter(
    separator = "\n",               # The separator to split the text (newline in this case)
    chunk_size = 1000,              # Chunk size of 1000 characters
    chunk_overlap = 200,            # Overlap of 200 characters between consecutive chunks
    length_function = len           # The function used to measure the length (using len() here)
)



In [80]:
# Now, apply the splitter to your text
chunks = textsplitter.split_text(raw_text)  # Replace 'your_text' with the actual text you're splitting

In [81]:
for i, chunk in enumerate(chunks):
    print(f"Chunk {i + 1}: {chunk}")

Chunk 1: [Page 1]
Artikel 4
KI-Kompetenz
Die Anbieter und Betreiber von KI-Systemen ergreif en Maßnahmen, um nach best en Kräf ten sicherzust ellen, dass ihr 
Personal und andere Personen, die in ihrem Auftrag mit dem Betr ieb und der Nutzung von KI-Syste men befasst sind, über 
ein ausreich endes Maß an KI-K ompet enz verfüge n, wobei ihre technisc hen Kenntnisse, ihre Erfahrung, ihre Ausbildung und 
Schulung und der Kontext, in dem die KI-Syste me eingesetzt werden sollen, sowie die Pers onen oder Pers onengr uppen, bei 
denen die KI-Syste me einge setzt werden sollen, zu berücksichtig en sind.
KAPITEL II
VERBOTENE PRAKTIK EN IM KI-BEREICH
Artikel 5
Verbotene Praktiken im KI-Bereich
(1) Folgend e Praktiken im KI-Bereich sind verbote n:
a)das Inverkehrbri ngen, die Inbetr iebnahme oder die Verwendung eines KI-Systems, das Technik en der unterschw elligen 
Beeinf lussung außerhalb des Bewusstseins einer Person oder absichtlic h manipulative oder täuschende Technik en mit
Chunk 2: Beein

In [65]:
embeddings = OpenAIEmbeddings()

  embeddings = OpenAIEmbeddings()


In [82]:
docsearch = FAISS.from_texts(chunks, embeddings)

In [83]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [84]:
chain = load_qa_chain(OpenAI(), chain_type = 'stuff')

In [69]:
query = "What are the forbiden uses of AI?"
docs = docsearch.similarity_search(query)
answer = chain.run(input_documents=docs, question=query)

  chain.run(input_documents=docs, question=query)


' The forbidden practices in the AI sector include using AI systems for profiling individuals or evaluating their personal characteristics, using AI systems for facial recognition through indiscriminate scanning of photos or surveillance footage, using AI systems to derive emotions of individuals in the workplace or educational institutions, and using systems for biometric categorization based on race, political beliefs, union membership, religion, sexual life, or sexual orientation.'

In [70]:
query = "I have an AI product how can i make sure that I can place my product in market?"
docs = docsearch.similarity_search(query)
answer = chain.run(input_documents=docs, question=query)

In [71]:
answer

' You can ensure that your AI product meets the requirements set in Article 16 of the regulation, including having a quality management system and providing clear documentation to relevant authorities. You may also need to undergo a conformity assessment process and obtain a EU conformity declaration before placing your product on the market. Additionally, you may need to register your product and comply with registration obligations outlined in Article 49.'

In [85]:
query = "I have a high risk AI system how can i make sure that I can place my product in market? Tell me also the page of a document to look at"
docs = docsearch.similarity_search(query)
answer = chain.run(input_documents=docs, question=query)

In [75]:
answer

' To ensure that you can place your high risk AI system in the market, you will need to follow the obligations outlined in Article 16 of the document, specifically points (a) and (b). Additionally, you will need to have a quality management system in place that complies with Article 17. You can find this information on page 144 of the document.'

In [86]:
query = "I have a high risk AI system how can i make sure that I can place my product in market? Tell me also the page of a document to look at"
docs = docsearch.similarity_search(query)
answer = chain.run(input_documents=docs, question=query)

In [87]:
answer

' According to Article 16 of the document, as an Anbieter (provider) of a Hochrisiko-KI-System (high-risk AI system), you must ensure that your system meets the requirements set in Section 2. This can be found on page 12 of the document. Additionally, you must also have a qualitätsmanagementsystem (quality management system) in place, as stated in Article 16c. This information can be found on page 13 of the document.'