In [1]:
%pip install PyPDF2 pymongo 

Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pymongo
  Downloading pymongo-4.13.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Using cached dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Downloading pymongo-4.13.2-cp313-cp313-macosx_11_0_arm64.whl (965 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m965.9/965.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached dnspython-2.7.0-py3-none-any.whl (313 kB)
Installing collected packages: PyPDF2, dnspython, pymongo
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [pymongo]m2/3[0m [pymongo]n]
[1A[2KSuccessfully installed PyPDF2-3.0.1 dnspython-2.7.0 pymongo-4.13.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
import PyPDF2
from pymongo import MongoClient
import re

In [2]:
# extracting text from input pdf
def extract_text_from_pdf(pdf_path):
    """Read every page of a PDF and return the combined text.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of all pages joined with newlines, or None if the file
        could not be opened or parsed (the error is printed, not raised).
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Extraction must happen while the file is still open.
            # A falsy result is treated as empty text so it cannot break the join.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        # Join with a newline so the last word of one page is not glued to
        # the first word of the next page.
        return "\n".join(page_texts)
    except Exception as e:
        print(f"Error extracting text from PDF: {str(e)}")
        return None

# before sentence splitting: collapse every run of whitespace (including newlines) into a single space and trim both ends
def clean_text(text):
    """Normalize whitespace in extracted text.

    Every run of whitespace is collapsed to one space and the result is
    trimmed at both ends. None is passed through unchanged.
    """
    if text is None:
        return None
    collapsed = re.sub(r'\s+', ' ', text)
    # The `\s+` substitution already replaced newlines, so this second pass
    # finds nothing to change; only the final strip has an effect.
    return collapsed.replace('\n', ' ').strip()

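# split text into sentences at . ? or ! followed by whitespace and a capital letter, without breaking after two-letter titles such as "Dr." or single-letter initials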
def custom_sentence_split(text):
    """Split text into a list of trimmed, non-empty sentences.

    A boundary is whitespace that follows '.', '?' or '!' and precedes an
    uppercase letter, unless the period belongs to a capital+lowercase
    abbreviation (e.g. "Dr.") or a single capital initial (e.g. "J.").
    Falsy input (None or an empty string) yields an empty list.
    """
    if not text:
        return []
    boundary = r'(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|\!)\s+(?=[A-Z])'
    result = []
    for piece in re.split(boundary, text):
        trimmed = piece.strip()
        if trimmed:
            result.append(trimmed)
    return result

def store_sentences_in_mongodb(sentences, pdf_filename):
    """Insert one document per sentence into `pdf_database.sentences`.

    Args:
        sentences: Sequence of sentence strings, in document order.
        pdf_filename: Value stored in each document's 'pdf_filename' field.

    Returns:
        The inserted ids on success, an empty list when there is nothing
        to store, or None if connecting or inserting failed (the error is
        printed, not raised).
    """
    if not sentences:
        # Skip the round trip: there is nothing to insert for empty input.
        print("No sentences to store in MongoDB")
        return []

    # Bound before the try so the finally clause can always test it, even
    # when constructing the client itself raises.
    client = None
    try:
        client = MongoClient('mongodb://localhost:27017/')
        collection = client['pdf_database']['sentences']

        documents = [
            {
                'pdf_filename': pdf_filename,
                'sentence_index': index,
                'sentence_text': sentence,
                'page_number': None
            }
            for index, sentence in enumerate(sentences)
        ]
        result = collection.insert_many(documents)
        print(f"Successfully stored {len(result.inserted_ids)} sentences in MongoDB")
        return result.inserted_ids
    except Exception as e:
        print(f"Error storing sentences in MongoDB: {str(e)}")
        return None
    finally:
        if client is not None:
            client.close()

In [4]:
# Sanity check: run the splitter on a short sample that mixes '.', '!' and '?'
# endings with the abbreviations "Dr." and "Mr.", then list the pieces.
test_text = "This is a test sentence. This is another test sentence! Is this working? Dr. Li went to the store. Mr. Jones was there too."
test_sentences = custom_sentence_split(test_text)

print("Testing sentence splitting:")
for position, piece in enumerate(test_sentences, start=1):
    print(f"{position}. {piece}")

Testing sentence splitting:
1. This is a test sentence.
2. This is another test sentence!
3. Is this working?
4. Dr. Li went to the store.
5. Mr. Jones was there too.


In [None]:
# process PDF file (REPLACE THE FILENAME)
pdf_path = 'REPLACE.pdf'

# Pipeline: extract -> clean -> split. If extraction fails, `text` is None,
# clean_text passes None through, and the splitter returns an empty list.
text = extract_text_from_pdf(pdf_path)
cleaned_text = clean_text(text)
sentences = custom_sentence_split(cleaned_text)

# Preview only the first few sentences rather than dumping the whole document.
print("\nFirst 5 sentences from PDF:")
for number, preview in enumerate(sentences[:5], start=1):
    print(f"{number}. {preview}")

In [None]:
# store in mongoDB
def test_mongodb():
    """Check whether the local MongoDB server is reachable.

    Opens a client with a 2000 ms server-selection timeout, calls
    `server_info()` to force a round trip, and prints the outcome.

    Returns:
        True if the server answered, False on any exception.
    """
    # Bound before the try so the finally clause can always test it.
    client = None
    try:
        client = MongoClient('mongodb://localhost:27017/', serverSelectionTimeoutMS=2000)
        client.server_info()
        print("MongoDB connection worked!")
        return True
    except Exception as e:
        print(f"MongoDB connection didn't work: {str(e)}")
        return False
    finally:
        # The probe client is only needed for this check; close it so each
        # call does not leave a connection open.
        if client is not None:
            client.close()

# Only attempt the insert when the connectivity probe succeeded.
mongodb_available = test_mongodb()
if mongodb_available:
    inserted_ids = store_sentences_in_mongodb(sentences, pdf_path)

In [None]:
# check stored data 
def verify_stored_data(pdf_filename, limit=5):
    """Print the first stored sentences for one PDF from `pdf_database.sentences`.

    Args:
        pdf_filename: Value of the 'pdf_filename' field to filter on.
        limit: Maximum number of documents to print (default 5).

    Returns:
        None. Any connection or query error is printed, not raised.
    """
    # Bound before the try so the finally clause can always test it, even
    # when constructing the client itself raises.
    client = None
    try:
        client = MongoClient('mongodb://localhost:27017')
        collection = client['pdf_database']['sentences']

        # Sort ascending (1) by sentence_index so "first" means first in the
        # document, not whatever order the server happens to return.
        stored_sentences = (
            collection.find({'pdf_filename': pdf_filename})
            .sort('sentence_index', 1)
            .limit(limit)
        )
        print(f"\nFirst {limit} stored sentences from MongoDB:")
        for i, doc in enumerate(stored_sentences, 1):
            print(f"{i}. {doc['sentence_text']}")

    except Exception as e:
        print(f"Error querying MongoDB: {str(e)}")
    finally:
        if client is not None:
            client.close()

# Read back what was stored for this PDF, using the default preview limit.
verify_stored_data(pdf_filename=pdf_path)