In [1]:
import os
import fitz  # PyMuPDF
import pandas as pd
import pprint
import uuid
import chromadb
from chromadb.config import Settings
from chromadb import HttpClient

In [2]:
# Initialize ChromaDB client
chroma_client = HttpClient(host='localhost', port=8200)  # Ensure this is the correct port for ChromaDB

In [3]:
from chromadb.utils import embedding_functions

# Use a different sentence transformer: all-mpnet-base-v2
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

  from tqdm.autonotebook import tqdm, trange


In [4]:
# Create a collection
collection = chroma_client.get_or_create_collection(name="ML_doc_collection", embedding_function=sentence_transformer_ef)

In [5]:
# Function to read PDF and convert to text
def read_pdf(file_path):
    try:
        with fitz.open(file_path) as pdf_document:
            text = ""
            for page_num in range(pdf_document.page_count):
                page = pdf_document[page_num]
                text += page.get_text()
        return text
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""

In [6]:
# Understand the Data (analyze PDF files)
def analyze_pdf_structure(directory):
    file_structure = {}
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                file_path = os.path.join(root, file)
                content = read_pdf(file_path)
                if content:
                    file_structure[file_path] = content
                else:
                    print(f"No content found in {file_path}")
    return file_structure

In [7]:
# Load and Preprocess the Text
def load_and_preprocess(pdf_structure):
    preprocessed_data = []
    for file_path, content in pdf_structure.items():
        lines = content.split('\n')
        preprocessed_data.append({
            "file_path": file_path,
            "lines": lines,
            "line_count": len(lines)
        })
    if preprocessed_data:
        return pd.DataFrame(preprocessed_data)
    else:
        print("No data to preprocess")
        return None


In [8]:
# Chunk the text data and index in ChromaDB
def chunk_code(preprocessed_code, collection, chunk_size=50 ):
    code_chunks = []
    
    for index, row in preprocessed_code.iterrows():
        lines = row['lines']
        for i in range(0, len(lines), chunk_size):
            chunk = lines[i:i + chunk_size]
            unique_id = str(uuid.uuid4())  # Generate a unique ID
            chunk_content = "\n".join(chunk)
            chunk_data = {
                "id": unique_id,
                "file_path": row['file_path'],
                "chunk_start": i,
                "chunk_end": i + len(chunk),
                "chunk_content": chunk_content,
            }
            code_chunks.append(chunk_data)
            
            # Insert the chunk in ChromaDB
            collection.add(ids=[unique_id], documents=[chunk_content])  # Pass only the content as string
    
    return pd.DataFrame(code_chunks)


In [9]:
#!ls ../../data

In [10]:
# File path to the directory containing PDF files
pdf_directory = "../../data"
pdf_structure = analyze_pdf_structure(pdf_directory)  # Uses pdf_directory

# Debug statement to check pdf_structure
#print(f"PDF Structure: {pdf_structure}")

No content found in ../../data/Generative Artificial Intelligence in the Metaverse Era.pdf
No content found in ../../data/AIML/Artificial Intelligence & Generative AI for Beginners.pdf


In [11]:
preprocessed_pdf = load_and_preprocess(pdf_structure)

# Debug statement to check preprocessed_pdf
#preprocessed_pdf

In [12]:
if preprocessed_pdf is not None:
    pdf_chunks = chunk_code(preprocessed_pdf, collection)
    print(pdf_chunks.head())
else:
    print("Preprocessed PDF data is None")

                                     id  \
0  93bdddbe-646f-48e5-b8a4-4691d979cee6   
1  1f7cb548-a28c-4611-b4b6-9d68d6497096   
2  e50ba8a6-842a-432e-b03b-6eea56bf9b52   
3  20fa0730-db72-4662-b944-c3a78191023b   
4  d1434cde-3c6a-46e4-8b4b-8b6178623adb   

                                           file_path  chunk_start  chunk_end  \
0  ../../data/Non-Expert Programmers in the Gener...            0         50   
1  ../../data/Non-Expert Programmers in the Gener...           50        100   
2  ../../data/Non-Expert Programmers in the Gener...          100        150   
3  ../../data/Non-Expert Programmers in the Gener...          150        200   
4  ../../data/Non-Expert Programmers in the Gener...          200        250   

                                       chunk_content  
0  Non-Expert Programmers in the Generative AI Fu...  
1  International 4.0 License.\nThis is the author...  
2  adapt code from various\nsources [46]. Even mo...  
3  whether model-generated code meets th

In [13]:
pdf_chunks.head(20)

Unnamed: 0,id,file_path,chunk_start,chunk_end,chunk_content
0,93bdddbe-646f-48e5-b8a4-4691d979cee6,../../data/Non-Expert Programmers in the Gener...,0,50,Non-Expert Programmers in the Generative AI Fu...
1,1f7cb548-a28c-4611-b4b6-9d68d6497096,../../data/Non-Expert Programmers in the Gener...,50,100,International 4.0 License.\nThis is the author...
2,e50ba8a6-842a-432e-b03b-6eea56bf9b52,../../data/Non-Expert Programmers in the Gener...,100,150,adapt code from various\nsources [46]. Even mo...
3,20fa0730-db72-4662-b944-c3a78191023b,../../data/Non-Expert Programmers in the Gener...,150,200,whether model-generated code meets their needs...
4,d1434cde-3c6a-46e4-8b4b-8b6178623adb,../../data/Non-Expert Programmers in the Gener...,200,250,Our work focuses on envisioning a future for n...
5,7f8b392c-0591-477f-9bb0-02bf3d06cd64,../../data/Non-Expert Programmers in the Gener...,250,300,custom-built programming assistant in PyCharm....
6,a314f6e6-a2f1-4107-8865-c7352865f7cb,../../data/Non-Expert Programmers in the Gener...,300,350,"ment, finding that access to the Code LLM was ..."
7,1a46f71c-ce44-46e6-9818-93cce99472e5,../../data/Non-Expert Programmers in the Gener...,350,400,the program should do.\nAnother design decisio...
8,c6422178-1f8d-42dd-a95a-ca987f0689c9,../../data/Non-Expert Programmers in the Gener...,400,450,College under a reliance agreement with Oberli...
9,a06ce627-7c73-43e0-a065-45be1fd89fdf,../../data/Non-Expert Programmers in the Gener...,450,500,"to 4, removing the more difficult timed tasks,..."


In [14]:
len(pdf_chunks)

903

In [15]:
pprint.pprint(collection.peek())

{'data': None,
 'documents': ['personal data, internal company data, NSFW (‘Not Safe for '
               'Work’) content, or literature. \n'
               'R20. \n'
               'Embedding Inversion \n'
               'To enable LLMs to process texts, they are typically embedded '
               'into a vector space. Embedding inversion aims \n'
               'to reconstruct the original input text from these embeddings. '
               'Such attacks are particularly relevant in the \n'
               'context of LLM-integrated applications. In these cases, data '
               'necessary for operation are stored as \n'
               'embeddings in corresponding vector databases, which are often '
               'hosted by external service providers. Morris \n'
               'et al., for example, present an algorithm that iteratively '
               'adjusts an input text to achieve a given target \n'
               'embedding (Morris, et al., 2023).  \n'
               'R21.

In [16]:
collection.count()

1969