In [4]:
import os
import nltk
import fitz  # PyMuPDF
import pandas as pd
import uuid
import pprint
from sentence_transformers import SentenceTransformer
from langchain import OpenAI
import chromadb
from chromadb import HttpClient
from chromadb.config import Settings

nltk.download('punkt')

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ashwinikumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Initialize ChromaDB client
chroma_client = HttpClient(host='localhost', port=8200)  # Ensure this is the correct port for ChromaDB

In [6]:
# Initialize the sentence transformer model
# model = SentenceTransformer('all-mpnet-base-v2')

In [7]:
# Check if the collection exists, otherwise create it
try:
    collection = chroma_client.get_collection(name="pdf_chunks")
except Exception as e:
    if 'does not exist' in str(e):
        collection = chroma_client.create_collection(name="pdf_chunks")
    else:
        raise e

In [9]:
# Function to read PDF and convert to text
def read_pdf(file_path):
    try:
        with fitz.open(file_path) as pdf_document:
            text = ""
            for page_num in range(pdf_document.page_count):
                page = pdf_document[page_num]
                text += page.get_text()
        return text
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""

In [11]:
# Function to analyze PDF structure
def analyze_pdf_structure(directory):
    file_structure = {}
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                file_path = os.path.join(root, file)
                content = read_pdf(file_path)
                if content:
                    file_structure[file_path] = content
                else:
                    print(f"No content found in {file_path}")
    return file_structure

In [12]:
# Function to split text into sentences using nltk
def split_text_into_sentences(text):
    sentences = nltk.tokenize.sent_tokenize(text, language='english')
    return sentences

In [13]:
# Load and Preprocess Text 
def load_and_preprocess(pdf_structure):
    preprocessed_data = []
    for file_path, content in pdf_structure.items():
        sentences = split_text_into_sentences(content)
        preprocessed_data.append({
            "file_path": file_path,
            "sentences": sentences,
            "sentence_count": len(sentences)
        })
    if preprocessed_data:
        return pd.DataFrame(preprocessed_data)
    else:
        print("No data to preprocess")
        return None

In [14]:
# Function to Index Sentences in ChromaDB
def index_sentences(preprocessed_data, collection):
    code_chunks = []
    for index, row in preprocessed_data.iterrows():
        sentences = row['sentences']
        for i, sentence in enumerate(sentences):
            unique_id = str(uuid.uuid4())  # Generate a unique ID
            chunk_data = {
                "id": unique_id,
                "file_path": row['file_path'],
                "sentence_index": i,
                "sentence_content": sentence,
            }
            code_chunks.append(chunk_data)
            
            # Insert the sentence in ChromaDB
            collection.add(ids=[unique_id], documents=[sentence])  # Pass only the content as string
    
    return pd.DataFrame(code_chunks)

In [15]:
# File path to the directory containing PDF files
pdf_directory = "../../data"
pdf_structure = analyze_pdf_structure(pdf_directory)  # Uses pdf_directory

preprocessed_pdf = load_and_preprocess(pdf_structure)

if preprocessed_pdf is not None:
    index_sentences(preprocessed_pdf, collection)
    print(preprocessed_pdf.head())
else:
    print("Preprocessed PDF data is None")

No content found in ../../data/Generative Artificial Intelligence in the Metaverse Era.pdf
No content found in ../../data/AIML/Artificial Intelligence & Generative AI for Beginners.pdf
                                           file_path  \
0  ../../data/Non-Expert Programmers in the Gener...   
1  ../../data/Principles of Generative AI A Techn...   
2  ../../data/Responsible Generative AI - What to...   
3  ../../data/Generative AI and the future of edu...   
4  ../../data/Generative AI and ChatGPT  Applicat...   

                                           sentences  sentence_count  
0  [Non-Expert Programmers in the Generative AI F...             986  
1  [ \nKaran Singh, Assistant Professor of Operat...             151  
2  [Responsible Generative AI: What to Generate a...            3368  
3  [The International Journal of Management Educa...             477  
4  [Full Terms & Conditions of access and use can...             831  


In [16]:
preprocessed_pdf

Unnamed: 0,file_path,sentences,sentence_count
0,../../data/Non-Expert Programmers in the Gener...,[Non-Expert Programmers in the Generative AI F...,986
1,../../data/Principles of Generative AI A Techn...,"[ \nKaran Singh, Assistant Professor of Operat...",151
2,../../data/Responsible Generative AI - What to...,[Responsible Generative AI: What to Generate a...,3368
3,../../data/Generative AI and the future of edu...,[The International Journal of Management Educa...,477
4,../../data/Generative AI and ChatGPT Applicat...,[Full Terms & Conditions of access and use can...,831
5,../../data/Scientists' Perspectives on Potenti...,[arXiv:2304.01420v1 [cs.CY] 4 Apr 2023\nScie...,677
6,../../data/Generative AI At Work.pdf,[NBER WORKING PAPER SERIES\nGENERATIVE AI AT W...,841
7,../../data/Generative Artificial Intelligence.pdf,"[Vol., :(0123456789)\n1 3\nElectronic Markets ...",818
8,../../data/The Widening Gap - The Benefits and...,[The Widening Gap: The Benefits and Harms of G...,953
9,../../data/Generative AI in Introductory Progr...,[1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14...,787


In [None]:
# # File path to the directory containing PDF files
# pdf_directory = "/Users/ashwinikumar/AI_Bootcamp/Student_AI_repos/final_project/data"
# pdf_structure = analyze_pdf_structure(pdf_directory)  # Uses pdf_directory

# # Debug statement to check pdf_structure
# print(f"PDF Structure: {pdf_structure}")

# preprocessed_pdf = load_and_preprocess(pdf_structure)

# # Debug statement to check preprocessed_pdf
# print(f"Preprocessed PDF: {preprocessed_pdf}")

# if preprocessed_pdf is not None:
#     pdf_chunks = chunk_code(preprocessed_pdf)
#     print(pdf_chunks.head())
# else:
#     print("Preprocessed PDF data is None")

# pprint.pprint(pdf_chunks)

PDF Structure: {}
No data to preprocess
Preprocessed PDF: None
Preprocessed PDF data is None


NameError: name 'pdf_chunks' is not defined