In [1]:
# Prints a fixed greeting; presumably a quick check that the kernel is alive -- TODO confirm or remove.
print("Hello")

Hello


In [8]:
import subprocess
import sys

# Third-party packages (PyPI names) that this notebook installs into the
# interpreter running the kernel.
#
# NOTE: "asyncio" is intentionally not listed. It ships with the Python
# standard library; the PyPI distribution of the same name is an obsolete
# backport, and installing it can shadow the built-in module.
libraries = [
    "pymongo",
    "PyPDF2",
    "pdfplumber",
    "nltk",
    "transformers",
    "scikit-learn",
    # "textract",  # disabled in the original notebook
]

# Base command: run pip through the kernel's own interpreter so packages land
# in the environment the notebook actually imports from.
pip_install = [sys.executable, "-m", "pip", "install"]

# Packages that still failed after the forced-reinstall retry.
failed_libraries = []

# Install libraries with one retry each
for library in libraries:
    install_status = subprocess.call(pip_install + [library])

    if install_status != 0:
        print(f"Error installing {library}. Retrying...")
        retry_status = subprocess.call(
            pip_install + ["--upgrade", "--force-reinstall", library]
        )
        # The retry can fail too; remember it so the failure is not silent.
        if retry_status != 0:
            failed_libraries.append(library)

if failed_libraries:
    print(f"Could not install: {', '.join(failed_libraries)}")



In [12]:
# Replace the installed pdfminer.six with the pinned 20231228 release.
# %pip (rather than !pip) runs pip for the environment of the running kernel,
# so the pin applies to the interpreter that later imports pdfplumber.
# NOTE(review): the reason for this exact pin is not recorded here -- presumably
# a pdfplumber compatibility constraint; confirm.
%pip uninstall -y pdfminer.six
%pip install pdfminer.six==20231228


Found existing installation: pdfminer.six 20240706
Uninstalling pdfminer.six-20240706:
  Successfully uninstalled pdfminer.six-20240706
Collecting pdfminer.six==20231228
  Using cached pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Using cached pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
Installing collected packages: pdfminer.six
Successfully installed pdfminer.six-20231228


In [1]:
import os
# import textract
from PyPDF2 import PdfReader
import pdfplumber
from pymongo import MongoClient

# from summarization import summarize_text

from src.summarization import summarize_text
from src.keyword_extraction import extract_keywords

# from keyword_extraction import extract_keywords

# Open a client against the local MongoDB server, then select the database
# and the collection that receive one document per processed PDF.
client = MongoClient("mongodb://localhost:27017/")
db = client.get_database("pdf_summary_db")
collection = db.get_collection("pdf_documents")

# Root folder that is scanned for PDF files
base_dir = 'pdfs/'

# Function to read PDFs using pdfplumber for better accuracy
def read_pdf(file):
    """Return the concatenated text of every page in a PDF.

    Args:
        file: Path (or other object accepted by ``pdfplumber.open``) of the
            PDF to read.

    Returns:
        The text of all pages joined with no separator, where pages that
        yield no text (``None`` or ``''``) contribute nothing. Returns
        ``None`` if opening or extracting raises; the error is printed
        rather than propagated so a batch run can continue.
    """
    try:
        with pdfplumber.open(file) as pdf:
            # Call extract_text() exactly once per page: each call repeats the
            # extraction work, so testing and then re-calling doubles the cost.
            page_texts = [page.extract_text() for page in pdf.pages]
            # Join once instead of growing a string with repeated `+=`.
            return ''.join(text for text in page_texts if text)
    except Exception as e:
        print(f'Error reading PDF {file}: {e}')
        return None

# Walk through the directory structure and process PDFs.
# For every PDF under base_dir: extract its text, summarize it, pull keywords,
# and insert one document {pdf_name, summary, keywords} into `collection`.
# NOTE(review): insert_one() appends on every run, so re-executing this cell
# stores each PDF again -- confirm whether that is intended.
for root, dirs, files in os.walk(base_dir):
    for file_name in files:
        # Print the file name for debugging purposes
        print(f'Processing file: {file_name}')

        # Get the file path
        file_path = os.path.join(root, file_name)

        # Handle only PDFs; compare case-insensitively so names such as
        # "REPORT.PDF" are not rejected as an unsupported format.
        if not file_name.lower().endswith('.pdf'):
            print(f'Skipping file: {file_name} (unsupported format)')
            continue

        content = read_pdf(file_path)

        # read_pdf() signals a read/parse failure with None.
        if content is None:
            print(f'Failed to read file {file_name}. Content is None.')
            continue

        # A PDF with no text layer (e.g. scanned images) yields an empty
        # string; there is nothing to summarize, so do not store an empty record.
        if not content.strip():
            print(f'Skipping file: {file_name} (no extractable text)')
            continue

        # Summarize the text
        summary = summarize_text(content, method="textrank")  # You can use the transformer-based method here

        # Extract keywords from the text
        keywords = extract_keywords(content, num_keywords=5)

        # Store the results in MongoDB
        document = {
            "pdf_name": file_name,
            "summary": summary,
            "keywords": keywords
        }
        collection.insert_one(document)
        print(f'Successfully processed and saved {file_name}')

print('PDF processing complete, summaries and keywords saved to MongoDB successfully!')


PDF processing complete, summaries and keywords saved to MongoDB successfully!
