In [2]:
import nltk

# Download required NLTK resources if they’re not available
def download_nltk_resources():
    resources = ['punkt', 'stopwords', 'wordnet']
    for resource in resources:
        try:
            nltk.data.find(f'tokenizers/{resource}' if resource == 'punkt' else f'corpora/{resource}')
        except LookupError:
            nltk.download(resource)

download_nltk_resources()  # Call the function to download resources if missing


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
import os
import fitz  # PyMuPDF
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import json
from rank_bm25 import BM25Okapi

nltk.download('punkt')
nltk.download('stopwords')

def read_files_from_directory(directory_path):
    documents = {}
    for filename in os.listdir(directory_path):
        if filename.endswith('.pdf'):
            file_path = os.path.join(directory_path, filename)
            with fitz.open(file_path) as pdf_document:
                text = ""
                for page_number in range(len(pdf_document)):
                    page = pdf_document.load_page(page_number)
                    text += page.get_text()
                if text.strip():  # Check if the extracted text is not empty
                    documents[filename] = text
                else:
                    print(f"Warning: {filename} is empty after text extraction and will be skipped.")
    return documents

def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.lower() not in stop_words and token not in string.punctuation]
    return tokens

def index_files(directory_path):
    file_contents = read_files_from_directory(directory_path)
    if not file_contents:
        raise ValueError("No PDF files found in the specified directory")
    
    documents = [preprocess_text(content) for content in file_contents.values()]
    if not documents:
        raise ValueError("All PDF files are empty after preprocessing")
    
    bm25 = BM25Okapi(documents)
    return bm25, file_contents

def search_documents(query, bm25, file_contents, top_n=5):
    query_tokens = preprocess_text(query)
    scores = bm25.get_scores(query_tokens)
    ranked_scores = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
    
    results = [(list(file_contents.keys())[idx], list(file_contents.values())[idx]) for idx, score in ranked_scores[:top_n]]
    return results

if __name__ == "__main__":
    try:
        # Indexing
        directory_path = 'POM'
        bm25, file_contents = index_files(directory_path)
        
        # Searching
        query = 'Organizational goals'
        results = search_documents(query, bm25, file_contents)

        for filename, content in results:  # Print top results
            print(f"File: {filename}")
            print(f"Content: {content[:200]}...")  # Print first 200 characters of the content
            print()
    except ValueError as e:
        print(e)


In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
import os
import nltk
nltk_data_path = os.path.join(os.path.expanduser("~"), "nltk_data")
if not os.path.exists(nltk_data_path):
    os.makedirs(nltk_data_path)
nltk.data.path.append(nltk_data_path)

nltk.download('punkt', download_dir=nltk_data_path)
nltk.download('stopwords', download_dir=nltk_data_path)
nltk.download('wordnet', download_dir=nltk_data_path)


[nltk_data] Downloading package punkt to C:\Users\HP\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to C:\Users\HP\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to C:\Users\HP\nltk_data...


True