In [2]:
import os
import fitz  # PyMuPDF
import pandas as pd
import pprint
import uuid
import chromadb
from chromadb.config import Settings
from chromadb import HttpClient

In [3]:
# Initialize ChromaDB client
chroma_client = HttpClient(host='localhost', port=8200)  # Ensure this is the correct port for ChromaDB

In [4]:
from chromadb.utils import embedding_functions

# Use a different sentence transformer: all-mpnet-base-v2
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

  from tqdm.autonotebook import tqdm, trange


In [5]:
# Create a collection
collection = chroma_client.get_or_create_collection(name="ML_doc_collection", embedding_function=sentence_transformer_ef)

In [6]:
# Function to read PDF and convert to text
def read_pdf(file_path):
    try:
        with fitz.open(file_path) as pdf_document:
            text = ""
            for page_num in range(pdf_document.page_count):
                page = pdf_document[page_num]
                text += page.get_text()
        return text
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""

In [7]:
# Understand the Data (analyze PDF files)
def analyze_pdf_structure(directory):
    file_structure = {}
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                file_path = os.path.join(root, file)
                content = read_pdf(file_path)
                if content:
                    file_structure[file_path] = content
                else:
                    print(f"No content found in {file_path}")
    return file_structure

In [8]:
# Load and Preprocess the Text
def load_and_preprocess(pdf_structure):
    preprocessed_data = []
    for file_path, content in pdf_structure.items():
        lines = content.split('\n')
        preprocessed_data.append({
            "file_path": file_path,
            "lines": lines,
            "line_count": len(lines)
        })
    if preprocessed_data:
        return pd.DataFrame(preprocessed_data)
    else:
        print("No data to preprocess")
        return None


In [9]:
# Chunk the text data and index in ChromaDB
def chunk_code(preprocessed_code, collection, chunk_size=50 ):
    code_chunks = []
    
    for index, row in preprocessed_code.iterrows():
        lines = row['lines']
        for i in range(0, len(lines), chunk_size):
            chunk = lines[i:i + chunk_size]
            unique_id = str(uuid.uuid4())  # Generate a unique ID
            chunk_content = "\n".join(chunk)
            chunk_data = {
                "id": unique_id,
                "file_path": row['file_path'],
                "chunk_start": i,
                "chunk_end": i + len(chunk),
                "chunk_content": chunk_content,
            }
            code_chunks.append(chunk_data)
            
            # Insert the chunk in ChromaDB
            collection.add(ids=[unique_id], documents=[chunk_content])  # Pass only the content as string
    
    return pd.DataFrame(code_chunks)


In [10]:
#!ls ../../data

In [11]:
# File path to the directory containing PDF files
pdf_directory = "../../data"
pdf_structure = analyze_pdf_structure(pdf_directory)  # Uses pdf_directory

# Debug statement to check pdf_structure
#print(f"PDF Structure: {pdf_structure}")

No content found in ../../data/Generative Artificial Intelligence in the Metaverse Era.pdf
No content found in ../../data/AIML/Artificial Intelligence & Generative AI for Beginners.pdf


In [12]:
preprocessed_pdf = load_and_preprocess(pdf_structure)

# Debug statement to check preprocessed_pdf
#preprocessed_pdf

In [13]:
if preprocessed_pdf is not None:
    pdf_chunks = chunk_code(preprocessed_pdf, collection)
    print(pdf_chunks.head())
else:
    print("Preprocessed PDF data is None")

                                     id  \
0  db3e08ef-009c-4af9-9d35-e91ca17bbecc   
1  40b3923d-30a7-4dd4-8468-185595090b71   
2  8ef99920-f769-4491-92b2-610fe5c3e810   
3  337de023-a790-4878-a765-f87f8f52a322   
4  f9c416dd-9ba6-4c2b-9c32-04a641eef76e   

                                           file_path  chunk_start  chunk_end  \
0  ../../data/Non-Expert Programmers in the Gener...            0         50   
1  ../../data/Non-Expert Programmers in the Gener...           50        100   
2  ../../data/Non-Expert Programmers in the Gener...          100        150   
3  ../../data/Non-Expert Programmers in the Gener...          150        200   
4  ../../data/Non-Expert Programmers in the Gener...          200        250   

                                       chunk_content  
0  Non-Expert Programmers in the Generative AI Fu...  
1  International 4.0 License.\nThis is the author...  
2  adapt code from various\nsources [46]. Even mo...  
3  whether model-generated code meets th

In [14]:
pdf_chunks.head(20)

Unnamed: 0,id,file_path,chunk_start,chunk_end,chunk_content
0,db3e08ef-009c-4af9-9d35-e91ca17bbecc,../../data/Non-Expert Programmers in the Gener...,0,50,Non-Expert Programmers in the Generative AI Fu...
1,40b3923d-30a7-4dd4-8468-185595090b71,../../data/Non-Expert Programmers in the Gener...,50,100,International 4.0 License.\nThis is the author...
2,8ef99920-f769-4491-92b2-610fe5c3e810,../../data/Non-Expert Programmers in the Gener...,100,150,adapt code from various\nsources [46]. Even mo...
3,337de023-a790-4878-a765-f87f8f52a322,../../data/Non-Expert Programmers in the Gener...,150,200,whether model-generated code meets their needs...
4,f9c416dd-9ba6-4c2b-9c32-04a641eef76e,../../data/Non-Expert Programmers in the Gener...,200,250,Our work focuses on envisioning a future for n...
5,9fa0bc0d-5bf8-4268-9c1b-3179afc7c8c8,../../data/Non-Expert Programmers in the Gener...,250,300,custom-built programming assistant in PyCharm....
6,6946cd9f-59d2-4ee9-831c-1f4fa6ee8122,../../data/Non-Expert Programmers in the Gener...,300,350,"ment, finding that access to the Code LLM was ..."
7,8f18a147-48dc-4b4e-baf4-7ca40ac2cfb5,../../data/Non-Expert Programmers in the Gener...,350,400,the program should do.\nAnother design decisio...
8,328223fa-c693-4dd2-995d-e1bd615aec4c,../../data/Non-Expert Programmers in the Gener...,400,450,College under a reliance agreement with Oberli...
9,bf802fe2-a6cd-4129-84e1-70a746cf4423,../../data/Non-Expert Programmers in the Gener...,450,500,"to 4, removing the more difficult timed tasks,..."


In [15]:
len(pdf_chunks)

903

In [16]:
pprint.pprint(collection.peek())

{'data': None,
 'documents': ['and testing unintended memorization in neural networks. In '
               '28th USENIX Security Symposium (USENIX\n'
               'Security 19), pp. 267–284, 2019.\n'
               'Nicholas Carlini, Florian Tramer, Eric Wallace, Matthew '
               'Jagielski, Ariel Herbert-Voss, Katherine Lee, Adam\n'
               'Roberts, Tom Brown, Dawn Song, Ulfar Erlingsson, et al. '
               'Extracting training data from large language\n'
               'models. In 30th USENIX Security Symposium (USENIX Security '
               '21), pp. 2633–2650, 2021.\n'
               'Nicholas Carlini, Steve Chien, Milad Nasr, Shuang Song, '
               'Andreas Terzis, and Florian Tramer. Membership\n'
               'inference attacks from first principles.\n'
               'In 2022 IEEE Symposium on Security and Privacy (SP), pp.\n'
               '1897–1914. IEEE, 2022a.\n'
               'Nicholas Carlini, Daphne Ippolito, Matthew Jagielski, '
   

In [17]:
collection.count()

2872