In [None]:
import os

# API credentials are read from environment variables so that no secret is
# stored in the notebook source. Export the variables named below before
# starting the kernel. A variable that is not set yields an empty string.
# SECURITY: the literal tokens that used to live in this cell were exposed in
# the notebook source and should be revoked and regenerated.
HUGGING_FACE_TOKEN = os.environ.get("HUGGING_FACE_TOKEN", "")
RS_TOKEN = os.environ.get("RS_TOKEN", "")

JV_GEMINI_TOKEN = os.environ.get("JV_GEMINI_TOKEN", "")
RS_GEMINI_TOKEN = os.environ.get("RS_GEMINI_TOKEN", "")
VM_GEMINI_TOKEN = os.environ.get("VM_GEMINI_TOKEN", "")

## Estrazione del testo dal libro

In [None]:
import PyPDF2

def extract_text_from_pdf(pdf_path, output_txt_path):
    """Extract the text of every page of a PDF and save it as a UTF-8 text file.

    Each page's text is followed by a newline in the output. A page for which
    the PDF library returns no text (``None`` or an empty string) contributes
    only that newline instead of aborting the whole extraction.

    Args:
        pdf_path: Path of the PDF file to read.
        output_txt_path: Path of the text file to create or overwrite.

    Returns:
        None. The outcome is reported on standard output; any exception is
        caught and printed rather than propagated (best-effort behaviour).
    """
    try:
        with open(pdf_path, "rb") as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # `or ""` guards against a page whose extraction yields None,
            # which would otherwise make the concatenation raise TypeError.
            page_texts = [(page.extract_text() or "") + "\n" for page in reader.pages]

        # Join once instead of growing a string inside the loop.
        text = "".join(page_texts)

        with open(output_txt_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(text)

        print(f"Testo estratto e salvato in {output_txt_path}")
    except Exception as e:
        print(f"Errore durante l'estrazione del testo: {e}")

# Source PDF of the book and destination of its plain-text extraction.
pdf_file_path, output_text_path = "data/book.pdf", "data/book.txt"
extract_text_from_pdf(pdf_file_path, output_text_path)

## Inizializzazione del Retriever (FAISS) del LIBRO

📌 FAISS: Cos'è, Come Funziona e a Cosa Serve
FAISS (Facebook AI Similarity Search) è una libreria sviluppata da Meta AI per eseguire ricerche veloci su grandi set di dati vettoriali. È ottimizzata per trovare il Nearest Neighbor (NN) in spazi ad alta dimensionalità, rendendola ideale per compiti di similarity search come la ricerca di documenti, immagini o frasi simili.

Il flusso di lavoro tipico si articola in tre fasi:

<ul>
    <li>
        <b>Generazione degli Embeddings:</b>
        Un modello NLP (es. Sentence Transformers) converte il testo in vettori numerici.
        Ogni documento viene trasformato in una rappresentazione densa in uno spazio vettoriale.
    </li>
    <li>
        <b>Creazione dell’Indice FAISS:</b>
        FAISS memorizza questi vettori in una struttura dati ottimizzata per ricerche veloci.
        Supporta diversi tipi di indicizzazione (es. Flat, HNSW, IVF) a seconda delle esigenze.
    </li>
    <li>
        <b>Ricerca e Recupero:</b>
        Un nuovo testo viene trasformato in un embedding.
        FAISS trova i vettori più vicini nel database (nearest neighbors) restituendo i documenti più simili.
    </li>
</ul>

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.document_loaders import TextLoader

# Single source of truth for the embedding model. The index directory name is
# derived from it below, so switching model can no longer save vectors under
# the label of a different model.
BOOK_EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# BOOK_EMBEDDING_MODEL_NAME = "BAAI/bge-m3"

document_path = "data/book.txt"
loader = TextLoader(document_path, encoding="utf-8")
doc_loader = loader.load()

# NOTE(review): 8000-character chunks are far longer than the input length
# all-MiniLM-L6-v2 embeds before truncating (256 word pieces per its model
# card) -- confirm that chunk size and model are meant to be used together.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=150)
split_docs = text_splitter.split_documents(doc_loader)

# Build the embeddings with an open-source model.
embedding_model = HuggingFaceEmbeddings(model_name=BOOK_EMBEDDING_MODEL_NAME)

# Build the FAISS store from the chunks.
vectorstore = FAISS.from_documents(split_docs, embedding_model)

# "sentence-transformers/all-MiniLM-L6-v2" -> "all-MiniLM-L6-v2", which gives
# "data/faiss_index/BOOK_faiss_index__all-MiniLM-L6-v2" for the active model.
book_index_path = (
    "data/faiss_index/BOOK_faiss_index__" + BOOK_EMBEDDING_MODEL_NAME.split("/")[-1]
)
vectorstore.save_local(book_index_path)
print("Retriever FAISS inizializzato e salvato.")


## *3*  Preprocessing delle informazioni

### Estrarre le informazioni dalle slides

In [None]:
import os
import glob
import pypdf
import pytesseract
from pdf2image import convert_from_path

def extract_text_from_pdf(pdf_path):
    """Extract the embedded text of a PDF file using pypdf.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages that yielded non-empty text, joined with a
        newline. Pages whose extraction is empty or None are skipped.
    """
    with open(pdf_path, "rb") as f:
        reader = pypdf.PdfReader(f)
        # Extract each page exactly once (extraction is the expensive step)
        # while the file is still open; filtering happens afterwards.
        page_texts = [page.extract_text() for page in reader.pages]
    return "\n".join(text for text in page_texts if text)

def extract_text_and_ocr_from_pdf(pdf_path, lang, ocr_dpi=300):
    """Return the embedded text of a PDF together with the OCR of its pages.

    Args:
        pdf_path: Path of the PDF file to process.
        lang: Language code forwarded to ``pytesseract.image_to_string``.
        ocr_dpi: Resolution forwarded to ``convert_from_path``.

    Returns:
        A single string: a header followed by the stripped embedded text, then
        a second header followed by the stripped OCR text of each rendered
        image, separated by blank lines.
    """
    embedded_text = extract_text_from_pdf(pdf_path)

    # Render the PDF to images so that they can be OCR-ed.
    page_images = convert_from_path(pdf_path, dpi=ocr_dpi)

    # OCR every image, trimming surrounding whitespace from each result.
    ocr_pages = [
        pytesseract.image_to_string(page_image, lang=lang).strip()
        for page_image in page_images
    ]

    # Assemble the two labelled sections.
    sections = [
        "=== Extracted text from PDF ===\n",
        embedded_text.strip(),
        "\n\n=== Extracted Text from images (OCR) ===\n",
        "\n\n".join(ocr_pages),
    ]
    return "".join(sections)

def process_pdfs(input_directory, output_directory, ocr_dpi=300, lang="eng"):
    """Convert every ``*.pdf`` of a directory into a ``.txt`` file.

    Args:
        input_directory: Directory scanned (non-recursively) for ``*.pdf`` files.
        output_directory: Directory receiving the text files; created if missing.
        ocr_dpi: Resolution forwarded to ``extract_text_and_ocr_from_pdf``.
        lang: OCR language forwarded to ``extract_text_and_ocr_from_pdf``.

    Returns:
        None. One UTF-8 ``<pdf name>.txt`` file is written per PDF, in sorted
        path order, and progress is printed.
    """
    os.makedirs(output_directory, exist_ok=True)

    pattern = os.path.join(input_directory, "*.pdf")
    for source_pdf in sorted(glob.glob(pattern)):
        print(f"Processing {source_pdf} ...")
        text = extract_text_and_ocr_from_pdf(source_pdf, ocr_dpi=ocr_dpi, lang=lang)

        # Same file name as the PDF, with the extension swapped for ".txt".
        stem = os.path.splitext(os.path.basename(source_pdf))[0]
        target_txt = os.path.join(output_directory, stem + ".txt")

        with open(target_txt, "w", encoding="utf-8") as out:
            out.write(text)
        print(f"Saved output to {target_txt}")
        
        
        
#------------------------------------------------------------#

# Original slide PDFs -> STEP_1 text files (embedded text plus OCR).
input_dir, output_dir = "data/slides/original", "data/slides/preprocessed/STEP_1"
process_pdfs(input_dir, output_dir)


### Riscrivere meglio il testo dalle slides

In [None]:
import google.generativeai as genai
# Authenticate the Gemini client and create the module-level model handle
# that call_llm sends its prompts to.
GEMINI_MODEL_NAME = "gemini-1.5-pro-latest"
genai.configure(api_key=VM_GEMINI_TOKEN)
model = genai.GenerativeModel(GEMINI_MODEL_NAME)

def call_llm(prompt):
    """Send ``prompt`` to the module-level ``model`` and return the reply text.

    Args:
        prompt: Prompt passed to ``model.generate_content``.

    Returns:
        The ``text`` attribute of the generated response.
    """
    return model.generate_content(prompt).text

In [None]:
import os
import glob

def generate_better_text_of_slide(text):
    """Ask the LLM to reformat poorly laid-out text extracted from a PDF.

    Args:
        text: Raw extracted text, appended at the end of the prompt.

    Returns:
        Whatever ``call_llm`` returns for the assembled prompt.
    """
    instruction_lines = [
        "The following text has been extracted from a PDF and is poorly formatted, with inconsistent spacing, line breaks, and structure. ",
        "Your task is to rewrite the text to improve its readability and formatting. Specifically:\n\n",
        "1. Remove unnecessary line breaks and spaces to create a smooth, continuous flow of text.\n",
        "2. Correct any formatting issues, such as misplaced punctuation, inconsistent capitalization, or fragmented sentences.\n",
        "3. Ensure the text is clean and easy to read, with proper spacing and structure.\n",
        "4. Is important that you don't lose any information!.\n\n",
        "Here is the text to reformat:\n\n",
    ]
    # Fixed instructions first, then the text to be reformatted.
    full_prompt = "".join(instruction_lines) + f"{text}"
    return call_llm(full_prompt)

def improve_slides(input_directory, output_directory, skip_first=19):
    """Rewrite every text file of a directory with the LLM and save the results.

    Args:
        input_directory: Directory whose regular files are read as text.
        output_directory: Directory receiving the rewritten files (same file
            names); created if missing.
        skip_first: Number of files, in sorted path order, to skip before
            processing starts. The default keeps the offset that used to be
            hard-coded in the function body; pass 0 to process every file.

    Returns:
        None. Rewritten files are written to ``output_directory`` and progress
        is printed.
    """
    os.makedirs(output_directory, exist_ok=True)

    # glob returns paths in arbitrary order: sorting makes `skip_first` skip
    # the same files on every run. Directories matched by "*" are dropped
    # because they cannot be opened as text files.
    files = sorted(
        path
        for path in glob.glob(os.path.join(input_directory, "*"))
        if os.path.isfile(path)
    )

    for file_path in files[max(skip_first, 0):]:
        print(f"Processing: {file_path}")

        # Read the file, falling back to latin-1 when it is not valid UTF-8.
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                extracted_text = file.read()
        except UnicodeDecodeError:
            print(f"Errore di codifica nel file {file_path}. Tentativo con codifica 'latin-1'.")
            with open(file_path, 'r', encoding='latin-1') as file:
                extracted_text = file.read()

        # Nothing to send to the LLM for blank files.
        if not extracted_text.strip():
            print(f"No text extracted from {file_path}")
            continue

        improved_text = generate_better_text_of_slide(extracted_text)

        # Save the result under the same file name in the output directory.
        output_path = os.path.join(output_directory, os.path.basename(file_path))
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(improved_text)

        print(f"Saved: {output_path}")

# Input directory (STEP_1 text files) and output directory (STEP_2 rewrites).
input_directory, output_directory = (
    "data/slides/preprocessed/STEP_1",
    "data/slides/preprocessed/STEP_2",
)

improve_slides(input_directory, output_directory)

### Unire Informazioni di slides e libro

In [None]:
import os
import glob
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# The model used to embed queries has to be the one the index was built with.
# The index directory is therefore derived from the model name, so the two
# cannot be toggled out of step.
BOOK_QUERY_EMBEDDING_MODEL_NAME = "BAAI/bge-m3"
# BOOK_QUERY_EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

embedding_model = HuggingFaceEmbeddings(model_name=BOOK_QUERY_EMBEDDING_MODEL_NAME)

# "BAAI/bge-m3" -> "data/faiss_index/BOOK_faiss_index__bge-m3"
faiss_index = (
    "data/faiss_index/BOOK_faiss_index__" + BOOK_QUERY_EMBEDDING_MODEL_NAME.split("/")[-1]
)

# Fail early with an actionable message when the index has not been built
# for the selected model.
if not os.path.isdir(faiss_index):
    raise FileNotFoundError(
        f"FAISS index not found at {faiss_index}: build the book index with "
        f"the embedding model {BOOK_QUERY_EMBEDDING_MODEL_NAME} first."
    )

# SECURITY: allow_dangerous_deserialization=True opts in to pickle-based
# loading of the stored index data. Only load index directories that were
# produced locally by a trusted run of this notebook.
vectorstore = FAISS.load_local(
    faiss_index,
    embedding_model,
    allow_dangerous_deserialization=True
)

def retrieve_relevant_info(text, k=5):
    """Return the content of the ``k`` stored chunks most similar to ``text``.

    Args:
        text: Query passed to ``vectorstore.similarity_search``.
        k: Number of results requested from the vector store.

    Returns:
        The ``page_content`` of the returned documents, joined with newlines.
    """
    matches = vectorstore.similarity_search(text, k=k)
    contents = []
    for match in matches:
        contents.append(match.page_content)
    return "\n".join(contents)

def generate_better_text(text, additional_info):  
    prompt = (  
        "### Task Description\n"
        "Enhance the given text by preserving all its original information while improving clarity, coherence, and depth. "
        "Expand on key concepts by integrating relevant insights and additional context without altering the meaning or omitting any details. "
        "Ensure that the enhanced text flows naturally and remains logically structured.\n\n"

        "### Provided Information\n"
        "**Original Text:**\n" + text + "\n\n"
        "**Additional Context:**\n" + additional_info + "\n\n"

        "### Guidelines & Constraints\n"
        "- Retain all information from the original text without omitting any details.\n"
        "- Add relevant explanations and context to enrich understanding.\n"
        "- Improve readability, coherence, and logical flow.\n"
        "- Do not introduce personal opinions or unverifiable information.\n"
        "- Maintain a structured format with sections separated by the delimiter:\n"
        "  `<----------section---------->`\n\n"
    )  
    return call_llm(prompt)  


def generate_final_textfile(input_directory, output_directory, skip_first=4):
    """Enrich every text file of a directory with retrieved context and save it.

    For each file, related content is fetched with ``retrieve_relevant_info``
    (``k=20``) and passed with the file text to ``generate_better_text``.

    Args:
        input_directory: Directory whose regular files are read as text.
        output_directory: Directory receiving the enriched files (same file
            names); created if missing.
        skip_first: Number of files, in sorted path order, to skip before
            processing starts. The default keeps the offset that used to be
            hard-coded in the function body; pass 0 to process every file.

    Returns:
        None. Enriched files are written to ``output_directory`` and progress
        is printed.
    """
    os.makedirs(output_directory, exist_ok=True)

    # glob returns paths in arbitrary order: sorting makes `skip_first` skip
    # the same files on every run. Directories matched by "*" are dropped
    # because they cannot be opened as text files.
    files = sorted(
        path
        for path in glob.glob(os.path.join(input_directory, "*"))
        if os.path.isfile(path)
    )

    for file_path in files[max(skip_first, 0):]:
        print(f"Processing: {file_path}")

        # Read the file, falling back to latin-1 when it is not valid UTF-8.
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                extracted_text = file.read()
        except UnicodeDecodeError:
            print(f"Errore di codifica nel file {file_path}. Tentativo con codifica 'latin-1'.")
            with open(file_path, 'r', encoding='latin-1') as file:
                extracted_text = file.read()

        # Nothing to enrich for blank files.
        if not extracted_text.strip():
            print(f"No text extracted from {file_path}")
            continue

        additional_info = retrieve_relevant_info(extracted_text, k=20)
        improved_text = generate_better_text(extracted_text, additional_info)
        output_path = os.path.join(output_directory, os.path.basename(file_path))

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(improved_text)
        print(f"Salvato: {output_path}")


# STEP_2 rewrites -> STEP_3 files enriched with content retrieved from the index.
input_directory, output_directory = (
    "data/slides/preprocessed/STEP_2",
    "data/slides/preprocessed/STEP_3_BGE-m",
)
generate_final_textfile(input_directory, output_directory)

## *4* Unisco in un unico File

In [2]:
import os

def merge_text_files(folder_path, output_file,
                     additional_files=("course.txt", "curricula.txt"),
                     additional_dir="data"):
    """Concatenate the ``.txt`` files of a folder, plus extras, into one file.

    Every merged file is followed by the section separator
    ``"\\n<----------section---------->\\n\\n"``.

    Args:
        folder_path: Directory whose ``.txt`` files are merged in sorted
            name order.
        output_file: Path of the merged file to create or overwrite. If it
            lies inside ``folder_path`` it is excluded from the merge, so a
            re-run does not fold the previous output into the new one.
        additional_files: File names appended after the folder's files.
            Defaults to the two names that used to be hard-coded.
        additional_dir: Directory in which ``additional_files`` are looked up.
            Defaults to the previously hard-coded ``"data"``.

    Returns:
        None. The merged file is written and a completion message is printed.
        Additional files that do not exist are reported and skipped.
    """
    separator = "\n<----------section---------->\n\n"
    output_abs = os.path.abspath(output_file)

    # The listing is taken before the output is opened; the output itself is
    # filtered out in case it lives in the same folder.
    text_files = sorted(
        f for f in os.listdir(folder_path)
        if f.endswith(".txt")
        and os.path.abspath(os.path.join(folder_path, f)) != output_abs
    )

    with open(output_file, 'w', encoding='utf-8') as outfile:
        # Folder files first, each followed by the separator.
        for file_name in text_files:
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as infile:
                outfile.write(infile.read() + separator)

        # Then the additional files, each followed by the separator.
        for extra_file in additional_files:
            extra_file_path = os.path.join(additional_dir, extra_file)
            if os.path.exists(extra_file_path):
                with open(extra_file_path, 'r', encoding='utf-8') as infile:
                    outfile.write(infile.read() + separator)
            else:
                # Make the omission visible instead of skipping silently.
                print(f"File aggiuntivo non trovato, ignorato: {extra_file_path}")

    print(f"Unione completata: {output_file}")

# Merge the STEP_3 files (plus the extra files handled by merge_text_files)
# into a single text file.
folder_path, output_file = (
    "data/slides/preprocessed/STEP_3_BGE-m",
    "data/3Steps_10Marzo2025.txt",
)
merge_text_files(folder_path, output_file)


Unione completata: data/3Steps_10Marzo2025.txt


#### FAISS A DIMENSIONE FISSA

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader

# NOTE(review): no cell visible in this notebook writes this file (the merge
# step writes "data/3Steps_10Marzo2025.txt") -- confirm where it comes from.
document_path = "data/all_preprocessed_by_gemini.txt"
loader = TextLoader(document_path, encoding="utf-8")
doc_loader = loader.load()

# Fixed-size chunking: chunks of at most 1000 characters with 100 characters
# of overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
split_docs = text_splitter.split_documents(doc_loader)

# Embeddings come from an open-source model
# (alternative that was tried: "BAAI/bge-m3").
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Build the FAISS store from the chunks.
vectorstore = FAISS.from_documents(split_docs, embedding_model)

# Persist the store; the path is relative to the current working directory.
vectorstore.save_local("ALL_faiss_index__all-MiniLM-L6-v2")
print("Retriever FAISS inizializzato e salvato.")

#### FAISS A DIMENSIONE VARIABILE

In [None]:

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


from langchain.schema import Document

# Delimiter written between sections of the merged document.
SECTION_SEPARATOR = "<----------section---------->"

document_path = "data/3Steps_10Marzo2025.txt"

# Load the merged document.
loader = TextLoader(document_path, encoding="utf-8")
doc_loader = loader.load()

# One chunk per logical section. RecursiveCharacterTextSplitter is
# deliberately not used here: after splitting on the separator it merges
# consecutive pieces back together until its chunk_size (4000 characters when
# not specified) is reached, so short sections would be packed into shared
# chunks instead of being indexed on their own.
split_docs = [
    Document(page_content=section.strip(), metadata=dict(source_doc.metadata))
    for source_doc in doc_loader
    for section in source_doc.page_content.split(SECTION_SEPARATOR)
    if section.strip()  # drop the empty piece after the trailing separator
]

# Embeddings come from an open-source model
# (alternative that was tried: "sentence-transformers/all-MiniLM-L6-v2").
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")

# Build the FAISS store from the per-section documents.
vectorstore = FAISS.from_documents(split_docs, embedding_model)

# Persist the store.
vectorstore.save_local("data/faiss_index/ALL__11Marzo2025__bge-m3")
print("Retriever FAISS inizializzato e salvato con suddivisione basata su separatore.")


  embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")
  from .autonotebook import tqdm as notebook_tqdm



