In [8]:
%pip install PyPDF2
import PyPDF2

"""
Extracts text from PDF
Parameters:
    pdf_path (string): file path to pdf in directory tree
Returns a string of the pdf's contents
"""
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page_num in range(len(reader.pages)):
            text += reader.pages[page_num].extract_text()
        return text

pdf_path = "/content/EU AI Act.pdf"
document_text = extract_text_from_pdf(pdf_path)
print(document_text)

BRIEFING  
EU Legislation in Progress  
 
EPRS | European Parliamentary Research  Service  
Author: Tambiama  Madiega  
Members' Research Service 
PE 698.792  –  March 2024  EN 
Artificial intelligence act  
OVERVIEW  
European Union lawmakers reached a political agreement on the draft artificial intelligence (AI) act 
in December 2023. Proposed by the European Commission in April  2021, t he draft AI act, the first 
binding  worldwide  horizontal regulation  on AI, 
sets a common framework for the use and supply of 
AI systems in the EU. It offers  a classification for AI sy stems with different requirements and 
obligations tailored on a ' risk-based approach '. Some AI systems presenting 'unacceptable ' risks are 
prohibited. A wide range of 'high -risk' AI systems that can have a detrimental impact on people' s 
health, safety or on their fundamental rights  are authorised, but subject to a set of requirements and 
obligations to gain access to the EU market. AI systems posing limi

In [9]:
!pip install llama-index
!pip install pymilvus
import os
import numpy as np
from openai import AzureOpenAI
import textwrap
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection



In [24]:
# Connect to Azure Openai
os.environ["AZURE_OPENAI_API_KEY"] = "4b3ea2d5074c4059ad670e016e83b853"
client = AzureOpenAI(
    api_key = os.getenv("AZURE_OPENAI_API_KEY"),
    api_version = "2024-02-01",
    azure_endpoint = "https://dxc-eu-ai-act-chatbot.openai.azure.com/"
)

# model: gpt 35 turbo
# model version: 0301
deployment_name = "DXC"

# Send a test completion call to generate an answer
prompt = "What is the EU AI Act?"
response = client.completions.create(
    model = deployment_name,
    prompt = prompt,
    temperature = 1,
    max_tokens = 100,
    top_p = 0.5,
    frequency_penalty = 0,
    presence_penalty = 0,
    stop = None
)

print("User prompt: " + prompt)
print("Response: " + response.choices[0].text)

User prompt: What is the EU AI Act?
Response:  A Q&A with DLA Piper’s Giulio Coraggio

The EU AI Act is a proposed set of regulations on the use of artificial intelligence in the European Union. The act is intended to establish a legal framework for AI in the EU and to ensure that AI is developed and used in a way that is safe, transparent, and ethical. Giulio Coraggio, a partner at DLA Piper, discusses the key provisions of the EU AI Act and what it means for businesses operating in the EU


In [39]:
# Connect to Milvus
ENDPOINT = "https://in03-d592609349d65df.serverless.gcp-us-west1.cloud.zilliz.com"
TOKEN = "c96536849f835f69648e8e1586f3e09794e9d9c63c2375d55bb62aa5a7b031bba566b53d6afd4410bb85f4aeea14f1406b947fe5"
connections.connect(
   uri = ENDPOINT,
   token = TOKEN)

# Define schema for Milvus collection
fields = [
    FieldSchema(name = "id", dtype = DataType.INT64, is_primary = True),
    FieldSchema(name = "embedding", dtype = DataType.FLOAT_VECTOR, dim = 1536),
    FieldSchema(name = "text", dtype = DataType.VARCHAR, max_length = 65535)
]

schema = CollectionSchema(fields, "EU AI Act Collection")
collection = Collection("eu_ai_act", schema)

"""
  Generates embeddings using Azure OpenAI
  Parameters:
      text (string): section of text to create embeddings for
  Returns embedding of the text, vector of floats
"""
def generate_embedding(text):
    response = client.embeddings.create(
        input = text,
        model = "DXC-embedding"
    )
    return response.data[0].embedding

# Split the text into smaller chunks (vector of strings with max length of 5000 characters)
chunk_size = 5000
chunks = textwrap.wrap(document_text, chunk_size)
print("Example chunk: " + chunks[0])



Example chunk: BRIEFING   EU Legislation in Progress     EPRS | European Parliamentary Research  Service   Author: Tambiama  Madiega   Members' Research Service  PE 698.792  –  March 2024  EN  Artificial intelligence act   OVERVIEW   European Union lawmakers reached a political agreement on the draft artificial intelligence (AI) act  in December 2023. Proposed by the European Commission in April  2021, t he draft AI act, the first  binding  worldwide  horizontal regulation  on AI,  sets a common framework for the use and supply of  AI systems in the EU. It offers  a classification for AI sy stems with different requirements and  obligations tailored on a ' risk-based approach '. Some AI systems presenting 'unacceptable ' risks are  prohibited. A wide range of 'high -risk' AI systems that can have a detrimental impact on people' s  health, safety or on their fundamental rights  are authorised, but subject to a set of requirements and  obligations to gain access to the EU market. AI syst

In [None]:
# Prepare data for insertion into Milvus
embeddings = np.array([generate_embedding(chunk) for chunk in chunks], dtype = np.float32)
ids = list(range(len(embeddings)))
data = [ids, embeddings, chunks]

# Insert into Milvus
collection.insert(data)
collection.flush()
print("Embeddings inserted into Milvus")

In [32]:
index_params = {
    "index_type": "IVF_FLAT",
    "params": {"nlist": 128},
    "metric_type": "L2"
}

# Create an index on the embedding field in Milvus
collection.create_index(field_name = "embedding", index_params = index_params)

"""
  Perform a similarity search based on user query
  Parameters:
      query_embeddings (numpy array): user's query embedding
      top_k (int): max number of similar text chunks to return
  Returns list (strings) of similar text chunks from pdf
"""
def query_milvus(query_embedding, top_k = 3):
    collection.load()
    # if not collection.has_index():
    #     collection.create_index("embedding", index_params)

    results = collection.search(# Gathers list of metadata of the top sources
        data = query_embedding,
        anns_field = "embedding",
        param = {"metric_type": "L2", "params": {"nprobe": 15}},
        limit = top_k,
        output_fields=["text"]
    )
    print(results)
    retrieved_chunks = [hit.entity.get("text") for hit in results[0]]
    return retrieved_chunks


user_query = "What are the rules around deep fakes?"
query_embedding = np.array([generate_embedding(user_query)], dtype = np.float32)
relevant_text_chunks = query_milvus(query_embedding, 3)
context = " ".join(relevant_text_chunks)
prompt = f"Answer the question based on the following context: {context}\n\n{user_query}"

# Print metadata of the top retrieved text chunks from Milvus
print(user_query + "\n")
print("Top Sources:")
num = 1
for chunk in relevant_text_chunks:
    print("   " + str(num) + ". " + chunk[:200])
    num += 1

response = client.completions.create(
    model = deployment_name,
    prompt = prompt,
    temperature = 1,
    max_tokens = 100,
    top_p = 0.5,
    frequency_penalty = 0,
    presence_penalty = 0,
    stop = None
)
print("\nResponse: " + response.choices[0].text)


data: ['[\'id: 7, distance: 0.4337833523750305, entity: {\\\'text\\\': "do not adhere to an  approved code of practice will be required to demonstrate adequate alternative means of compliance.   Sandboxing and real- world testing   The measures to support investment  in AI systems have been strengthened. National  authorities  must establish at least one AI regulatory sandbox at national level to facilitate the development and  testing of innovative AI systems under strict regulatory oversight.18 Such regulatory sandbox es  provide for a controlled environment that fosters innovation and facilitates the developm ent,  training, testing and validation of innovative AI systems for a limited time before their placement on the market or entry  into service . The AI regulatory sandbox must enable, where appropriate, t esting  of AI systems in real -world conditions outside o f a laboratory for a limited period (subject to  compliance with EU data protection law rules and principles). Furthe