In [2]:
import os
import pprint
import uuid

import chromadb
import fitz  # PyMuPDF
import nltk
import pandas as pd
from chromadb import HttpClient
from chromadb.config import Settings
from langchain import OpenAI
from sentence_transformers import SentenceTransformer

nltk.download('punkt')

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ashwinikumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Connection settings for the ChromaDB server
CHROMA_HOST = 'localhost'
CHROMA_PORT = 8200  # Ensure this is the correct port for ChromaDB

# Initialize ChromaDB client
chroma_client = HttpClient(host=CHROMA_HOST, port=CHROMA_PORT)

In [4]:
# Name of the pretrained sentence-transformers model to load
EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'

# Initialize the sentence transformer model
model = SentenceTransformer(EMBEDDING_MODEL_NAME)



: 

In [None]:
# Fetch the "pdf_chunks" collection, creating it on the first run.
# get_or_create_collection does both steps in a single call, so this cell no
# longer depends on the exact wording of the client's "does not exist" error
# message; any other failure (e.g. the server being unreachable) still raises.
collection = chroma_client.get_or_create_collection(name="pdf_chunks")

In [None]:
# Extract the plain text of a PDF file
def read_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``, concatenated.

    Pages are read in index order through ``fitz.open``. Any exception raised
    while opening or reading the document is reported with ``print`` and an
    empty string is returned instead of propagating the error.
    """
    try:
        with fitz.open(file_path) as pdf_document:
            total_pages = pdf_document.page_count
            return "".join(
                pdf_document[page_index].get_text()
                for page_index in range(total_pages)
            )
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""

In [None]:
# Sentence segmentation via nltk
def split_text_into_sentences(text):
    """Return the result of ``nltk.sent_tokenize`` applied to ``text``."""
    return nltk.sent_tokenize(text)

In [None]:
# Function to process text and index in ChromaDB
def split_and_index_text(preprocessed_code, collection, batch_size=500):
    """Turn every sentence into a chunk record and add it to ``collection``.

    Args:
        preprocessed_code: DataFrame whose rows provide ``'file_path'`` and
            ``'lines'`` (an iterable of sentences for that file).
        collection: Object exposing ``add(ids=..., documents=..., metadatas=...)``.
        batch_size: Maximum number of chunks sent per ``collection.add`` call.

    Returns:
        DataFrame with one row per sentence and the columns ``id``,
        ``file_path``, ``chunk_start``, ``chunk_end`` and ``chunk_content``.
        The columns are present even when there is nothing to index.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    columns = ["id", "file_path", "chunk_start", "chunk_end", "chunk_content"]

    code_chunks = []
    for _, row in preprocessed_code.iterrows():
        file_path = row['file_path']
        for i, sentence in enumerate(row['lines']):
            code_chunks.append({
                "id": str(uuid.uuid4()),  # Generate a unique ID
                "file_path": file_path,
                "chunk_start": i,
                "chunk_end": i + 1,
                "chunk_content": sentence,
            })

    # Add in batches rather than issuing one call per sentence, and store the
    # source location as metadata so a stored document can be traced back to
    # the file and sentence index it came from.
    for start in range(0, len(code_chunks), batch_size):
        batch = code_chunks[start:start + batch_size]
        collection.add(
            ids=[chunk["id"] for chunk in batch],
            documents=[chunk["chunk_content"] for chunk in batch],
            metadatas=[
                {
                    "file_path": chunk["file_path"],
                    "chunk_start": chunk["chunk_start"],
                    "chunk_end": chunk["chunk_end"],
                }
                for chunk in batch
            ],
        )

    return pd.DataFrame(code_chunks, columns=columns)

In [None]:
# Function to analyze PDF structure and preprocess
def analyze_pdf_structure(directory):
    """Walk ``directory`` recursively and read the text of every PDF found.

    Files are selected by a case-insensitive ``.pdf`` suffix, so ``report.PDF``
    is picked up as well. Sub-directories and file names are visited in sorted
    order so repeated runs produce the same key order.

    Args:
        directory: Root directory to search.

    Returns:
        dict mapping each PDF's path to the text returned by ``read_pdf``.
        Files for which ``read_pdf`` returns an empty result are reported with
        ``print`` and left out of the mapping.
    """
    file_structure = {}
    for root, dirs, files in os.walk(directory):
        dirs.sort()  # in-place sort steers os.walk's top-down traversal order
        for file in sorted(files):
            if not file.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(root, file)
            content = read_pdf(file_path)
            if content:
                file_structure[file_path] = content
            else:
                print(f"No content found in {file_path}")
    return file_structure

def load_and_preprocess(pdf_structure):
    """Build a per-file table of sentences from a path -> text mapping.

    Each entry's text is passed through ``split_text_into_sentences``. The
    resulting DataFrame has the columns ``file_path``, ``lines`` (the list of
    sentences) and ``line_count`` (its length). When the mapping yields no
    entries, a message is printed and ``None`` is returned.
    """
    tokenized = (
        (path, split_text_into_sentences(raw_text))
        for path, raw_text in pdf_structure.items()
    )
    rows = [
        {"file_path": path, "lines": parts, "line_count": len(parts)}
        for path, parts in tokenized
    ]
    if not rows:
        print("No data to preprocess")
        return None
    return pd.DataFrame(rows)

In [None]:
# File path to the directory containing PDF files
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another machine.
pdf_directory = "/Users/ashwinikumar/AI_Bootcamp/Student_AI_repos/final_project/data"
pdf_structure = analyze_pdf_structure(pdf_directory)  # Uses pdf_directory

# Debug statement to check pdf_structure
print(f"PDF Structure: {pdf_structure}")

preprocessed_pdf = load_and_preprocess(pdf_structure)

# Debug statement to check preprocessed_pdf
print(f"Preprocessed PDF: {preprocessed_pdf}")

# Bind pdf_chunks up front so the name exists even when there was nothing to
# preprocess; the final dump lives inside the guarded branch for the same reason.
pdf_chunks = None
if preprocessed_pdf is not None:
    # split_and_index_text is the chunking/indexing function defined in this
    # notebook; it writes each sentence to `collection` and returns the chunk table.
    pdf_chunks = split_and_index_text(preprocessed_pdf, collection)
    print(pdf_chunks.head())
    pprint.pprint(pdf_chunks)
else:
    print("Preprocessed PDF data is None")