In [None]:
import os

# Credentials are read from the environment so that no secret is stored in the
# notebook file. Export HUGGING_FACE_TOKEN and GEMINI_TOKEN before starting the
# kernel (or set them with `%env` in a cell that is never committed).
# NOTE(security): tokens that were previously written in plain text in this
# cell must be considered leaked and should be revoked and regenerated.
HUGGING_FACE_TOKEN = os.environ.get("HUGGING_FACE_TOKEN", "")
GEMINI_TOKEN = os.environ.get("GEMINI_TOKEN", "")

# Warn early: with an empty token the API calls made later in the notebook
# would otherwise fail with a less obvious authentication error.
_missing_tokens = [
    name
    for name, value in (
        ("HUGGING_FACE_TOKEN", HUGGING_FACE_TOKEN),
        ("GEMINI_TOKEN", GEMINI_TOKEN),
    )
    if not value
]
if _missing_tokens:
    print(f"Attenzione: variabili d'ambiente non impostate: {', '.join(_missing_tokens)}")

## **1** Estrazione del testo dal libro

In [None]:
import PyPDF2

def extract_text_from_pdf(pdf_path, output_txt_path):
    """Extract the text of every page of a PDF and save it to a UTF-8 text file.

    The text of each page is followed by a newline in the output. A page for
    which the PDF reader yields no text (``None`` or an empty string) is written
    as an empty line instead of aborting the whole extraction.

    NOTE: a later cell of this notebook defines another function with the same
    name but a single ``pdf_path`` parameter; once that cell runs, it shadows
    this definition.

    Args:
        pdf_path: Path of the PDF file to read.
        output_txt_path: Path of the text file to create (overwritten if present).

    Returns:
        None. The outcome is reported with ``print``; any exception is caught
        and printed rather than propagated (best-effort behaviour).
    """
    try:
        with open(pdf_path, "rb") as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # Build the text with a single join instead of growing a string
            # page by page; `or ""` keeps a page without extractable text from
            # raising TypeError on `None + "\n"`.
            text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

        with open(output_txt_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(text)

        print(f"Testo estratto e salvato in {output_txt_path}")
    except Exception as e:
        # Deliberately broad: the notebook reports the failure and keeps going.
        print(f"Errore durante l'estrazione del testo: {e}")

# Example usage: extract the book PDF into a plain-text file.
pdf_file_path = "data/book.pdf"
output_text_path = "data/book.txt"
extract_text_from_pdf(
    pdf_path=pdf_file_path,
    output_txt_path=output_text_path,
)


## **2** Inizializzazione del Retriever (FAISS) 🔍

📌 FAISS: Cos'è, Come Funziona e a Cosa Serve
FAISS (Facebook AI Similarity Search) è una libreria sviluppata da Meta AI per eseguire ricerche veloci su grandi set di dati vettoriali. È ottimizzata per trovare il Nearest Neighbor (NN) in spazi ad alta dimensionalità, rendendola ideale per compiti di similarity search come la ricerca di documenti, immagini o frasi simili.

<ul>
    <li>
        <b>Generazione degli Embeddings:</b>
        Un modello NLP (es. Sentence Transformers) converte il testo in vettori numerici.
        Ogni documento viene trasformato in una rappresentazione densa in uno spazio vettoriale.
    </li>
    <li>
        <b>Creazione dell’Indice FAISS:</b>
        FAISS memorizza questi vettori in una struttura dati ottimizzata per ricerche veloci.
        Supporta diversi tipi di indicizzazione (es. Flat, HNSW, IVF) a seconda delle esigenze.
    </li>
    <li>
    <b>Ricerca e Recupero:</b>
    Un nuovo testo viene trasformato in un embedding.
    FAISS trova i vettori più vicini nel database (nearest neighbors) restituendo i documenti più simili.
    </li>
</ul>

In [5]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.document_loaders import TextLoader

# Load the extracted book text as LangChain documents.
document_path = "data/book.txt"
loader = TextLoader(document_path, encoding="utf-8")
doc_loader = loader.load()

# Split the text into overlapping chunks to improve retrieval.
# NOTE(review): 8000-character chunks are probably much longer than what the
# MiniLM sentence-embedding model below encodes (longer input is presumably
# truncated) — confirm against the model's maximum sequence length.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=8000,
    chunk_overlap=150,
)
split_docs = text_splitter.split_documents(doc_loader)

# Create the embeddings with an open-source model.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
#embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")

# Build the FAISS database from the chunks and persist it to disk.
vectorstore = FAISS.from_documents(split_docs, embedding_model)
vectorstore.save_local("faiss_index__all-MiniLM-L6-v2")
print("Retriever FAISS inizializzato e salvato.")


Retriever FAISS inizializzato e salvato.


## *3*  Preprocessing delle informazioni

In [2]:
import google.generativeai as genai
# Authenticate the Gemini client with the token defined in the first cell.
genai.configure(api_key=GEMINI_TOKEN)

# Module-level model handle used by call_llm().
model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest")

def call_llm(prompt, max_retries=3, retry_delay=30.0):
    """Send a prompt to the module-level Gemini ``model`` and return the reply text.

    Rate-limit failures (HTTP 429 / ``ResourceExhausted``, which interrupted a
    previous run of this notebook) are retried with exponential backoff instead
    of failing on the first occurrence. Any other exception, or a rate-limit
    error that persists after ``max_retries`` retries, is re-raised unchanged.

    Args:
        prompt: The prompt passed to ``model.generate_content``.
        max_retries: Maximum number of retries after a rate-limit error.
        retry_delay: Seconds to wait before the first retry; the wait doubles
            on every further retry.

    Returns:
        The ``text`` attribute of the model response.
    """
    import time  # local import so this cell does not rely on another cell's imports

    attempt = 0
    while True:
        try:
            response = model.generate_content(prompt)
            return response.text
        except Exception as exc:
            # Detect rate limiting without importing the client's exception
            # classes: check the HTTP status code exposed on the exception
            # (assumed to be `code == 429` — TODO confirm) or the class name
            # seen in the notebook output.
            rate_limited = (
                getattr(exc, "code", None) == 429
                or type(exc).__name__ == "ResourceExhausted"
            )
            if not rate_limited or attempt >= max_retries:
                raise
            time.sleep(retry_delay * (2 ** attempt))
            attempt += 1

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os
import glob
import pypdf
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Load the FAISS database.
# NOTE(review): no visible cell saves an index under this path (the build cells
# save to "faiss_index__all-MiniLM-L6-v2" and "ALL_faiss_index__all-MiniLM-L6-v2")
# — confirm the index folder was moved here manually.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
vectorstore = FAISS.load_local(
    "data/faiss_index/ALL_faiss_index__all-MiniLM-L6-v2/",
    embedding_model,
    # As the flag name indicates, loading deserializes stored data unsafely:
    # only load index folders that you created yourself.
    allow_dangerous_deserialization=True,
)

def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF file, skipping pages that yield no text.

    NOTE: this redefines (and, once the cell runs, shadows) the two-argument
    ``extract_text_from_pdf(pdf_path, output_txt_path)`` defined earlier in the
    notebook.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The texts of the non-empty pages joined with newlines (an empty string
        when no page yields text).
    """
    with open(pdf_path, "rb") as f:
        reader = pypdf.PdfReader(f)
        # Extract each page exactly once: the previous version called
        # extract_text() twice per page (once to filter, once for the value).
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)

def retrieve_relevant_info(text, k=5):
    """Return the content of the ``k`` FAISS entries most similar to ``text``.

    Args:
        text: Query text passed to the module-level ``vectorstore``.
        k: Number of documents requested from the similarity search.

    Returns:
        The ``page_content`` of the retrieved documents, joined with newlines.
    """
    matches = vectorstore.similarity_search(text, k=k)
    contents = []
    for match in matches:
        contents.append(match.page_content)
    return "\n".join(contents)

def generate_better_text(text, additional_info):
    """Ask the LLM for a clearer, enriched rewrite of ``text``.

    The prompt consists of fixed rewriting instructions followed by the
    original text and the additional information; it is sent through
    ``call_llm`` and the reply is returned as is.

    Args:
        text: The original text to rewrite.
        additional_info: Supplementary material appended to the prompt.

    Returns:
        Whatever ``call_llm`` returns for the assembled prompt.
    """
    instructions = (
        "Rewrite the provided text to make it clearer, more detailed, and comprehensive. "
        "Analyze the main topics discussed, integrate relevant insights, and enrich the content with additional details. "
        "Ensure that the resulting text is structured naturally and flows smoothly, without explicit references to the original sources. "
        "Maintain the original meaning while incorporating pertinent explanations and context.\n\n"
    )
    # Assemble the prompt from its sections, in order.
    sections = [
        instructions,
        "Original Text:\n",
        text,
        "\n\n",
        "Additional Information:\n",
        additional_info,
    ]
    return call_llm("".join(sections))



def process_pdfs(input_directory, output_directory, skip_first=0):
    """Process every PDF in a directory and save an improved text for each one.

    For each ``*.pdf`` file (in lexicographic order of its path) the text is
    extracted, related content is retrieved from the FAISS index, the LLM is
    asked for an enriched rewrite, and the result is written to
    ``<output_directory>/<pdf name without extension>.txt``.

    Args:
        input_directory: Directory containing the PDF files.
        output_directory: Directory for the generated ``.txt`` files; created
            if it does not exist.
        skip_first: Number of leading files (in sorted order) to skip, e.g. to
            resume an interrupted run. Defaults to 0 (process everything); the
            previous hardcoded value silently skipped the first 10 files.

    Returns:
        None. Progress is reported with ``print``.
    """
    os.makedirs(output_directory, exist_ok=True)
    # Lexicographic order: "09. x.pdf" sorts before "1. y.pdf".
    pdf_files = sorted(glob.glob(os.path.join(input_directory, "*.pdf")))

    # max() guards against a negative value being read as "count from the end".
    for pdf_path in pdf_files[max(skip_first, 0):]:
        print(f"Processando: {pdf_path}")
        extracted_text = extract_text_from_pdf(pdf_path)
        if not extracted_text.strip():
            print(f"Nessun testo estratto da {pdf_path}")
            continue

        additional_info = retrieve_relevant_info(extracted_text, k=3)
        improved_text = generate_better_text(extracted_text, additional_info)

        # Swap only the final extension; str.replace would also rewrite any
        # ".pdf" occurring in the middle of the file name.
        stem, _extension = os.path.splitext(os.path.basename(pdf_path))
        output_path = os.path.join(output_directory, stem + ".txt")

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(improved_text)
        print(f"Salvato: {output_path}")


# Enrich every slide deck and store the generated texts.
input_directory = "data/slides/original/"
output_directory = "data/merged/preprocessed_by_gemini"
process_pdfs(
    input_directory=input_directory,
    output_directory=output_directory,
)


Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 13 0 (offset 0)
Ignoring wrong pointing object 15 0 (offset 0)
Ignoring wrong pointing object 25 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)
Ignoring wrong pointing object 29 0 (offset 0)
Ignoring wrong pointing object 47 0 (offset 0)
Ignoring wrong pointing object 49 0 (offset 0)
Ignoring wrong pointing object 51 0 (offset 0)
Ignoring wrong pointing object 53 0 (offset 0)
Ignoring wrong pointing object 55 0 (offset 0)
Ignoring wrong pointing object 64 0 (offset 0)
Ignoring wrong pointing object 72 0 (offset 0)


Processando: data/slides/original\0. Course Introduction.pdf
Salvato: data/merged/preprocessed_by_gemini\0. Course Introduction.txt
Processando: data/slides/original\09. Transformers I.pdf


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 31 0 (offset 0)
Ignoring wrong pointing object 41 0 (offset 0)
Ignoring wrong pointing object 43 0 (offset 0)
Ignoring wrong pointing object 45 0 (offset 0)
Ignoring wrong pointing object 53 0 (offset 0)
Ignoring wrong pointing object 61 0 (offset 0)
Ignoring wrong pointing object 63 0 (offset 0)
Ignoring wrong pointing object 65 0 (offset 0)
Ignoring wrong pointing object 67 0 (offset 0)
Ignoring wrong pointing object 69 0 (offset 0)
Ignoring wrong pointing object 74 0 (offset 0)
Ignoring wrong pointing object 164 0 (offset 0)
Ignoring wrong pointing object 166 0 (offset 0)
Ignoring wron

Salvato: data/merged/preprocessed_by_gemini\09. Transformers I.txt
Processando: data/slides/original\1. NLP Overview.pdf
Salvato: data/merged/preprocessed_by_gemini\1. NLP Overview.txt
Processando: data/slides/original\10. Transformers II.pdf
Salvato: data/merged/preprocessed_by_gemini\10. Transformers II.txt
Processando: data/slides/original\11. From Transformers to LLMs.pdf
Salvato: data/merged/preprocessed_by_gemini\11. From Transformers to LLMs.txt
Processando: data/slides/original\12. HuggingFace.pdf
Salvato: data/merged/preprocessed_by_gemini\12. HuggingFace.txt
Processando: data/slides/original\13. Encoder-only Transformers.pdf
Salvato: data/merged/preprocessed_by_gemini\13. Encoder-only Transformers.txt
Processando: data/slides/original\14. Decoder-only Transformers.pdf
Salvato: data/merged/preprocessed_by_gemini\14. Decoder-only Transformers.txt
Processando: data/slides/original\15. Encoder-Decoder Transformers.pdf
Salvato: data/merged/preprocessed_by_gemini\15. Encoder-Decode

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 31 0 (offset 0)
Ignoring wrong pointing object 33 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)
Ignoring wrong pointing object 37 0 (offset 0)
Ignoring wrong pointing object 42 0 (offset 0)
Ignoring wrong pointing object 44 0 (offset 0)
Ignoring wrong pointing object 46 0 (offset 0)
Ignoring wrong pointing object 48 0 (offset 0)
Ignoring wrong pointing object 53 0 (offset 0)
Ignoring wrong pointing object 55 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)
Ignoring wrong pointing object 62 0 (offset 0)
Ignoring wrong pointing object 64 0 (offset 0)
Ignoring wrong 

Salvato: data/merged/preprocessed_by_gemini\17. Fine tuning.txt
Processando: data/slides/original\18. Prompt Engineering.pdf


ResourceExhausted: 429 Resource has been exhausted (e.g. check quota).

## *4* Unisco in un unico File

In [None]:
import os

def merge_text_files(folder_path, output_file):
    """Concatenate every ``.txt`` file of a folder into a single UTF-8 file.

    Files are taken in sorted (lexicographic) order of their names and each
    file's content is followed by a newline in the merged output.

    Args:
        folder_path: Directory whose ``.txt`` files are merged.
        output_file: Path of the merged file to create (overwritten if present).

    Returns:
        None. A completion message is printed.
    """
    names = [entry for entry in os.listdir(folder_path) if entry.endswith(".txt")]
    names.sort()

    with open(output_file, 'w', encoding='utf-8') as merged:
        for name in names:
            source_path = os.path.join(folder_path, name)
            with open(source_path, 'r', encoding='utf-8') as part:
                content = part.read()
            merged.write(content + '\n')

    print(f"Unione completata: {output_file}")

# Example usage: merge all enriched texts into a single file.
folder_path = "data/merged/preprocessed_by_gemini"
output_file = "data/all_preprocessed_by_gemini.txt"
merge_text_files(
    folder_path=folder_path,
    output_file=output_file,
)


In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader

# Load the merged, LLM-enriched text as LangChain documents.
document_path = "data/all_preprocessed_by_gemini.txt"
loader = TextLoader(document_path, encoding="utf-8")
doc_loader = loader.load()

# Split the text into overlapping chunks to improve retrieval.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
split_docs = text_splitter.split_documents(doc_loader)

# Create the embeddings with an open-source model.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
#embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")

# Build the FAISS database from the chunks and persist it to disk.
vectorstore = FAISS.from_documents(split_docs, embedding_model)
vectorstore.save_local("ALL_faiss_index__all-MiniLM-L6-v2")
print("Retriever FAISS inizializzato e salvato.")

## *5* RAG

In [None]:
import gradio as gr
import requests
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Model configuration: Hugging Face Inference API endpoint and auth header.
MODEL_LLM_PATH = "mistralai/Mistral-7B-Instruct-v0.2"
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_LLM_PATH}"
HEADERS = {
    "Authorization": f"Bearer {HUGGING_FACE_TOKEN}",
}

# Load the embedding model and the FAISS index saved by the previous step.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
vectorstore = FAISS.load_local(
    "ALL_faiss_index__all-MiniLM-L6-v2",
    embeddings,
    # As the flag name indicates, loading deserializes stored data unsafely:
    # only load index folders that you created yourself.
    allow_dangerous_deserialization=True,
)

def generate_response(question, debug=False, timeout=60):
    """Answer a question with retrieval-augmented generation.

    The 10 chunks most similar to ``question`` are fetched from the module-level
    ``vectorstore`` and placed in a prompt that is POSTed to ``API_URL`` with
    ``HEADERS``. The answer is the part of the generated text that follows the
    ``-----END_QUESTION-----`` marker closing the prompt.

    Failures are reported as (Italian) error strings rather than raised:
    network errors, non-200 status codes, invalid JSON bodies and JSON bodies
    that are not a non-empty list of objects.

    Args:
        question: The user question.
        debug: When true, print the status code and raw API response.
        timeout: Seconds to wait for the API before giving up; without it a
            stalled connection would block the cell indefinitely.

    Returns:
        The cleaned answer, or an error message string.
    """
    docs = vectorstore.similarity_search(question, k=10)
    context = "\n".join(doc.page_content for doc in docs)

    prompt = (
        "You are an AI assistant using Retrieval-Augmented Generation (RAG). "
        "Use the following context to answer the question. If the answer is not in the context, say you don't know. "
        "Give discursive answers\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}\n"
        "-----END_QUESTION-----"
    )

    payload = {"inputs": prompt}
    try:
        response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Connection errors and timeouts follow the same "return an error
        # string" convention as the other failure paths below.
        return f"Errore di rete durante la chiamata API: {exc}"

    if debug:
        print(f"API Response Code: {response.status_code}")
        print(f"Total Response: \n{response}\n\n\n\n")
        print(f"API Raw Response: {response.text}")

    if response.status_code != 200:
        return f"Errore API ({response.status_code}): {response.text}"

    try:
        response_json = response.json()
    except ValueError:
        # JSON decoding errors are ValueError subclasses; catching ValueError
        # avoids depending on requests.exceptions.JSONDecodeError being present.
        return "Errore nel parsing della risposta JSON"

    generation_error = "Errore nella generazione della risposta"
    # The expected payload is a non-empty list of objects; anything else
    # (e.g. an {"error": ...} object) is reported instead of raising.
    first_item = response_json[0] if isinstance(response_json, list) and response_json else None
    if not isinstance(first_item, dict):
        return generation_error

    generated_text = first_item.get("generated_text", generation_error)
    if not isinstance(generated_text, str):
        return generation_error

    # Keep only what follows the marker that closes the prompt (the generated
    # text is expected to echo the prompt before the answer).
    return generated_text.split("-----END_QUESTION-----")[-1].strip()


#### ESEMPIO DI UTILIZZO

In [38]:
# Example usage: ask the RAG pipeline a question and print the answer.
response = generate_response(
    "Talk me about Gardrails",
    debug=False,
)
print(response)

Guardrails are crucial mechanisms designed to mitigate risks associated with Large Language Models (LLMs) by implementing policies and technical solutions. They ensure that LLMs generate outputs that are safe, accurate, and contextually relevant, fostering trust and enabling reliable real-world applications.

Without guardrails, LLMs can unintentionally perpetuate harmful stereotypes, generate misinformation, or produce outputs that are illegal or unethical. They may also be susceptible to adversarial attacks, where users deliberately try to circumvent safety measures, further highlighting the importance of robust guardrails.

There are several types of guardrails, including ethical guardrails and operational guardrails. Ethical guardrails focus on avoiding bias, misinformation, and ensuring fairness in the LLM's responses. Operational guardrails align outputs with business or user objectives and can incite politeness, help, and consistency with brand guidelines.

Several techniques ca