In [11]:
import os
import fitz  # PyMuPDF
import pandas as pd
import pprint
import uuid
import chromadb
from chromadb.config import Settings
from chromadb import HttpClient

In [12]:
# Initialize ChromaDB client
chroma_client = HttpClient(host='localhost', port=8200)  # Ensure this is the correct port for ChromaDB

In [31]:
from chromadb.utils import embedding_functions

# Use a different sentence transformer: all-mpnet-base-v2
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

In [32]:
# Create a collection
collection = chroma_client.get_or_create_collection(name="ML_doc_collection", embedding_function=sentence_transformer_ef)

In [14]:
# Function to read PDF and convert to text
def read_pdf(file_path):
    try:
        with fitz.open(file_path) as pdf_document:
            text = ""
            for page_num in range(pdf_document.page_count):
                page = pdf_document[page_num]
                text += page.get_text()
        return text
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""

In [15]:
# Understand the Data (analyze PDF files)
def analyze_pdf_structure(directory):
    file_structure = {}
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                file_path = os.path.join(root, file)
                content = read_pdf(file_path)
                if content:
                    file_structure[file_path] = content
                else:
                    print(f"No content found in {file_path}")
    return file_structure

In [16]:
# Load and Preprocess the Text
def load_and_preprocess(pdf_structure):
    preprocessed_data = []
    for file_path, content in pdf_structure.items():
        lines = content.split('\n')
        preprocessed_data.append({
            "file_path": file_path,
            "lines": lines,
            "line_count": len(lines)
        })
    if preprocessed_data:
        return pd.DataFrame(preprocessed_data)
    else:
        print("No data to preprocess")
        return None


In [17]:
# Chunk the text data and index in ChromaDB
def chunk_code(preprocessed_code, collection, chunk_size=50 ):
    code_chunks = []
    
    for index, row in preprocessed_code.iterrows():
        lines = row['lines']
        for i in range(0, len(lines), chunk_size):
            chunk = lines[i:i + chunk_size]
            unique_id = str(uuid.uuid4())  # Generate a unique ID
            chunk_content = "\n".join(chunk)
            chunk_data = {
                "id": unique_id,
                "file_path": row['file_path'],
                "chunk_start": i,
                "chunk_end": i + len(chunk),
                "chunk_content": chunk_content,
            }
            code_chunks.append(chunk_data)
            
            # Insert the chunk in ChromaDB
            collection.add(ids=[unique_id], documents=[chunk_content])  # Pass only the content as string
    
    return pd.DataFrame(code_chunks)


In [25]:
#!ls ../../data

In [29]:
# File path to the directory containing PDF files
pdf_directory = "../../data"
pdf_structure = analyze_pdf_structure(pdf_directory)  # Uses pdf_directory

# Debug statement to check pdf_structure
#print(f"PDF Structure: {pdf_structure}")

No content found in ../../data/Artificial Intelligence & Generative AI for Beginners.pdf
No content found in ../../data/Generative Artificial Intelligence in the Metaverse Era.pdf


In [30]:
preprocessed_pdf = load_and_preprocess(pdf_structure)

# Debug statement to check preprocessed_pdf
#preprocessed_pdf

In [33]:
if preprocessed_pdf is not None:
    pdf_chunks = chunk_code(preprocessed_pdf, collection)
    print(pdf_chunks.head())
else:
    print("Preprocessed PDF data is None")

                                     id  \
0  6d21733f-ecba-4d30-88a1-bfd7cf09905c   
1  5c1d556c-fe6e-4597-99bf-edb7d7364bfe   
2  0e125d8c-88bb-448b-9a31-6239a34363d2   
3  0ef921fe-e91c-40b0-870c-ed3f78de0620   
4  6b03be47-74d7-4d5d-8029-8d3062a29119   

                                           file_path  chunk_start  chunk_end  \
0  ../../data/A Primer on Generative Artificial I...            0         50   
1  ../../data/A Primer on Generative Artificial I...           50        100   
2  ../../data/A Primer on Generative Artificial I...          100        150   
3  ../../data/A Primer on Generative Artificial I...          150        200   
4  ../../data/A Primer on Generative Artificial I...          200        250   

                                       chunk_content  
0  Citation: Kalota, F. A Primer on\nGenerative A...  
1  Simply put, generative AI generates content, w...  
2  beings [9]. The realization of AGI is many yea...  
3  If the patient has a cough, a fever, 

In [36]:
pdf_chunks.head(20)

Unnamed: 0,id,file_path,chunk_start,chunk_end,chunk_content
0,6d21733f-ecba-4d30-88a1-bfd7cf09905c,../../data/A Primer on Generative Artificial I...,0,50,"Citation: Kalota, F. A Primer on\nGenerative A..."
1,5c1d556c-fe6e-4597-99bf-edb7d7364bfe,../../data/A Primer on Generative Artificial I...,50,100,"Simply put, generative AI generates content, w..."
2,0e125d8c-88bb-448b-9a31-6239a34363d2,../../data/A Primer on Generative Artificial I...,100,150,beings [9]. The realization of AGI is many yea...
3,0ef921fe-e91c-40b0-870c-ed3f78de0620,../../data/A Primer on Generative Artificial I...,150,200,"If the patient has a cough, a fever, and a sor..."
4,6b03be47-74d7-4d5d-8029-8d3062a29119,../../data/A Primer on Generative Artificial I...,200,250,"exploration, dimension reduction, or pattern r..."
5,3aa032ac-46b7-4403-9896-18fa0e4df1be,../../data/A Primer on Generative Artificial I...,250,300,Non-linear complex correlational models.\nThe ...
6,71d48313-fbc3-4f85-ba86-8533dec4e3f0,../../data/A Primer on Generative Artificial I...,300,350,"Before going to the next step, let us consider..."
7,e1d90f50-d8e5-4b55-a0d9-47e23cab07b3,../../data/A Primer on Generative Artificial I...,350,400,"Now, going back to the discussion of ANN, the ..."
8,5e75c4be-2476-4824-90e9-585f8e2da298,../../data/A Primer on Generative Artificial I...,400,450,lot higher than “is”.\n6.3. Transformer\nA tra...
9,d6c60be7-98ab-4661-b4e8-058dbe24ce19,../../data/A Primer on Generative Artificial I...,450,500,"For businesses, generative AI can be utilized ..."


In [37]:
len(pdf_chunks)

903

In [40]:
pprint.pprint(collection.peek())

{'data': None,
 'documents': ['including earlier waves of AI, primarily affected \n'
               'workers with less education and training. As a result, \n'
               'income inequality tended to increase in the U.S. and \n'
               'many other developed nations. In contrast, generative \n'
               'AI has the potential to affect many types of work that \n'
               'have primarily been done by well-compensated people \n'
               'including writers, executives, entrepreneurs, scientists, \n'
               'and artists. This may reverse some of the past effects \n'
               'of IT and AI when it comes to inequality. So far, there \n'
               'have been speculation and case examples, but not \n'
               'much systematic empirical evidence either way.\n'
               'At Stanford Digital Economy Lab, we are cataloging \n'
               'the list of economic activities likely to be affected \n'
               'by generative AI and 

In [41]:
collection.count()

903