### Learning RAG with HuggingFace Transformers Library

In [1]:
# Install the notebook's dependencies into the kernel's environment.
# -q keeps pip's output short; -U upgrades packages that are already installed.
%pip install -qU sentence-transformers huggingface-hub langchain-community langchain-text-splitters pypdf tqdm pymilvus

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.8/212.8 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.9/5.9 MB[0m [31m104.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.6/411.6 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 MB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
%%bash

# Download the AI Act proposal PDF into the current working directory.
# The file-exists guard makes the cell safe to re-run: a plain `wget` would
# fetch the file again and save it as "The-AI-Act.pdf.1", "…pdf.2", and so on.
[ -f The-AI-Act.pdf ] || wget -q https://artificialintelligenceact.eu/wp-content/uploads/2021/08/The-AI-Act.pdf

In [3]:
from pathlib import Path

from langchain_community.document_loaders import PyPDFLoader

# The download cell saves the PDF into the current working directory, so resolve
# it from there instead of hard-coding Colab's "/content" directory. On Colab
# (where the working directory is /content) this yields the same absolute path.
pdf_path = Path.cwd() / "The-AI-Act.pdf"

# load() returns a list of Document objects parsed from the PDF.
loader = PyPDFLoader(str(pdf_path))
documents = loader.load()

print(f"Totally {len(documents)} Document(s) Found!")

Totally 108 Document(s) Found!


In [4]:
# Import from `langchain_text_splitters`, the package the install cell actually
# installs. The previous `langchain.text_splitter` path needs the separate
# `langchain` package, which this notebook never installs.
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the loaded documents into chunks of at most 1000 characters, with a
# 100-character overlap so text cut at a chunk boundary keeps some context.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(documents)

print(f"Totally {len(docs)} Document(s) Splitted Found!")

Totally 399 Document(s) Splitted Found!


In [5]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for both the indexed chunks and the query.
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")


def embed_text(text):
  """Embed a single string and return its vector as a plain Python list.

  The text is wrapped in a one-element batch for `encode`, which is asked to
  normalize the embeddings; the only row of the result is returned.
  """
  batch = embedding_model.encode([text], normalize_embeddings=True)
  rows = batch.tolist()
  return rows[0]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
from pymilvus import MilvusClient

# Name of the collection that will hold the embedded document chunks.
collection_name = "rag_collection"

# Client bound to a local database file next to the notebook.
milvus_client = MilvusClient(uri="./hf_milvus_demo.db")

DEBUG:pymilvus.milvus_client.milvus_client:Created new connection using: 387318bd56f249d69e592b092f30cafa


In [7]:
# Start from a clean slate: remove any collection left over from an earlier run
# so that re-running the notebook does not fail or mix old and new data.
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

In [8]:
# Embed a throwaway sentence to find the vector length the model produces;
# that length is used as the collection's dimension when it is created.
test_embedding = embed_text("This is a simple test text")
embedding_dimension = len(test_embedding)

In [9]:
# Create the collection sized to the embedding model's output.
# "IP" (inner product) is used as the similarity metric; since embed_text
# requests normalized embeddings, inner product ranks like cosine similarity.
milvus_client.create_collection(
    collection_name=collection_name,
    dimension=embedding_dimension,
    metric_type="IP",
    consistency_level="Strong",
)

DEBUG:pymilvus.milvus_client.milvus_client:Successfully created collection: rag_collection
DEBUG:pymilvus.milvus_client.milvus_client:Successfully created an index on collection: rag_collection


In [10]:
from tqdm import tqdm

# Raw text of every chunk produced by the splitter.
text_lines = [chunk.page_content for chunk in docs]

# One record per chunk: its position as the id, the text itself and its
# embedding. tqdm wraps the iteration to show embedding progress.
data = [
    {"id": idx, "text": passage, "vector": embed_text(passage)}
    for idx, passage in enumerate(tqdm(text_lines, desc="Creating Embeddings"))
]

# Insert all records in a single call and report how many were stored.
insert_response = milvus_client.insert(collection_name=collection_name, data=data)

print(f"Totally {insert_response['insert_count']} Document(s) Inserted!")

Creating Embeddings: 100%|██████████| 399/399 [00:04<00:00, 80.91it/s]


Totally 399 Document(s) Inserted!


In [12]:
question = "what is the legal basis for the proposal?"

# Embed the question with the same model used for the chunks and fetch the
# three closest chunks by inner product, returning their stored text.
search_response = milvus_client.search(
    collection_name=collection_name,
    data=[embed_text(question)],
    limit=3,
    search_params={"metric_type": "IP", "params": {}},
    output_fields=["text"],
)

In [13]:
import json

# search_response holds one result list per query vector; only one query was
# sent, so take the first list and keep each hit's text with its score.
hits = search_response[0]
retrieved_lines_with_distance = [
    (hit["entity"]["text"], hit["distance"]) for hit in hits
]

print(json.dumps(retrieved_lines_with_distance, indent=4))

[
    [
        "EN 6  EN \n2. LEGAL BASIS, SUBSIDIARITY AND PROPORTIONALITY \n2.1. Legal basis \nThe legal basis for the proposal is in the first place Article 114 of the Treaty on the \nFunctioning of the European Union (TFEU), which provides for the adoption of measures to \nensure the establishment and functioning of the internal market.  \nThis proposal constitutes a core part of the EU digital single market strategy. The primary \nobjective of this proposal is to ensure the proper functioning of the internal market by setting \nharmonised rules in particular on the development, placing on the Union market and the use \nof products and services making use of AI technologies or provided as stand -alone AI \nsystems. Some Member States are already considering national rules to ensure that AI is safe \nand is developed and used in compliance with fundamental rights obligations. This will likely \nlead to two main problems: i) a fragmentation of the internal market on essential elemen

In [15]:
# Join the retrieved chunk texts (scores dropped) into one newline-separated
# context string for the prompt.
context = "\n".join(passage for passage, _score in retrieved_lines_with_distance)

context

'EN 6  EN \n2. LEGAL BASIS, SUBSIDIARITY AND PROPORTIONALITY \n2.1. Legal basis \nThe legal basis for the proposal is in the first place Article 114 of the Treaty on the \nFunctioning of the European Union (TFEU), which provides for the adoption of measures to \nensure the establishment and functioning of the internal market.  \nThis proposal constitutes a core part of the EU digital single market strategy. The primary \nobjective of this proposal is to ensure the proper functioning of the internal market by setting \nharmonised rules in particular on the development, placing on the Union market and the use \nof products and services making use of AI technologies or provided as stand -alone AI \nsystems. Some Member States are already considering national rules to ensure that AI is safe \nand is developed and used in compliance with fundamental rights obligations. This will likely \nlead to two main problems: i) a fragmentation of the internal market on essential elements\nto request i

In [16]:
# Prompt template with two str.format placeholders, {context} and {question},
# each wrapped in its own tag so the model can tell them apart.
prompt = """
Use the following context provided inside <context> tag and provide answer to the question enclosed with <question> tag.

<context>
{context}
</context>

<question>
{question}
</question>
"""

In [26]:
from huggingface_hub import InferenceClient

# Hugging Face Hub model id passed to the InferenceClient for answer generation.
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

In [18]:
# Fill the template with the retrieved context and the user's question, then
# print the final prompt so it can be inspected before it is sent to the model.
PROMPT = prompt.format(context=context, question=question)

print(PROMPT)


Use the following context provided inside <context> tag and provide answer to the question enclosed with <question> tag.

<context>
EN 6  EN 
2. LEGAL BASIS, SUBSIDIARITY AND PROPORTIONALITY 
2.1. Legal basis 
The legal basis for the proposal is in the first place Article 114 of the Treaty on the 
Functioning of the European Union (TFEU), which provides for the adoption of measures to 
ensure the establishment and functioning of the internal market.  
This proposal constitutes a core part of the EU digital single market strategy. The primary 
objective of this proposal is to ensure the proper functioning of the internal market by setting 
harmonised rules in particular on the development, placing on the Union market and the use 
of products and services making use of AI technologies or provided as stand -alone AI 
systems. Some Member States are already considering national rules to ensure that AI is safe 
and is developed and used in compliance with fundamental rights obligations. Th

In [27]:
import os


In [28]:
from getpass import getpass

# SECURITY: never hard-code an access token in a notebook. The token that was
# previously committed here must be treated as leaked and revoked.
# Read the token from the environment, or prompt for it without echoing.
hf_token = (
    os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    or os.environ.get("HF_TOKEN")
    or getpass("Hugging Face API token: ")
)

# Keep the environment variable populated for any later code that reads it.
if hf_token:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token

# Pass the token explicitly: huggingface_hub looks for HF_TOKEN rather than
# HUGGINGFACEHUB_API_TOKEN, so setting only the latter would leave the client
# unauthenticated. An empty value falls back to the library's default lookup.
llm = InferenceClient(
    model=repo_id,
    token=hf_token or None,
    timeout=120,
)

# Generate the answer from the fully formatted prompt and trim whitespace.
answer = llm.text_generation(
    PROMPT,
    max_new_tokens=1000,
).strip()

print(answer)

The legal basis for the proposal is Article 114 of the Treaty on the Functioning of the European Union (TFEU). This article provides for the adoption of measures to ensure the establishment and functioning of the internal market. The proposal is a core part of the EU digital single market strategy and aims to ensure the proper functioning of the internal market by setting harmonised rules for AI technologies and systems.
