In [2]:
pip install PyPDF2


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import PyPDF2
import pytesseract
from pdf2image import convert_from_path
from docx import Document

# Function to extract text from text-based PDFs
def extract_text_from_pdf(pdf_path):
    """Return the embedded text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to open (opened in binary mode).

    Returns:
        The text extracted from each page, concatenated in page order with no
        separator added between pages. Empty string when no page yields text.

    Raises:
        OSError: If the file cannot be opened.
        Any exception raised by ``PyPDF2.PdfReader`` or ``extract_text`` while
        parsing is propagated unchanged.
    """
    with open(pdf_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Keep extraction inside the `with` block: the reader was built from
        # this open handle. `or ""` is defensive -- a page for which
        # extract_text() yields a falsy value (e.g. None) contributes nothing
        # instead of breaking the concatenation with a TypeError.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "".join(page_texts)


def extract_text_from_scanned_pdf(pdf_path):
    """OCR a PDF and return the recognised text as one string.

    The file is passed to ``convert_from_path`` and every image it returns is
    fed to ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF file to convert and OCR.

    Returns:
        The OCR result of each image, concatenated in the order the images
        were returned, with no separator added between them.
    """
    page_images = convert_from_path(pdf_path)
    return "".join(
        pytesseract.image_to_string(page_image) for page_image in page_images
    )


def extract_text_from_docx(docx_path):
    """Return the paragraph text of a .docx file as one string.

    Args:
        docx_path: Path handed to ``Document`` to load the file.

    Returns:
        The ``text`` of every entry in ``doc.paragraphs``, in order, each
        followed by a newline. Empty string when there are no paragraphs.
    """
    paragraphs = Document(docx_path).paragraphs
    lines = [paragraph.text + "\n" for paragraph in paragraphs]
    return "".join(lines)


def process_directory(directory_path):
    """Extract text from each PDF and DOCX file directly inside a directory.

    PDFs for which ``is_scanned_pdf`` returns True go through
    ``extract_text_from_scanned_pdf``; other PDFs go through
    ``extract_text_from_pdf``. ``.docx`` files go through
    ``extract_text_from_docx``. Everything else is ignored. Sub-directories
    are not descended into.

    Args:
        directory_path: Directory whose entries are scanned.

    Returns:
        A list with one extracted-text string per processed file, ordered by
        sorted file name.

    Raises:
        OSError: If ``directory_path`` cannot be listed.
        Any exception raised by the extractor functions is propagated.
    """
    corpus = []

    # os.listdir() returns entries in arbitrary order; sort them so the
    # resulting corpus is the same on every run.
    for filename in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)

        # A sub-directory can be named like "x.pdf"; only regular files can be
        # parsed, so skip anything else instead of failing in an extractor.
        if not os.path.isfile(file_path):
            continue

        # Compare extensions case-insensitively so "REPORT.PDF" is not skipped.
        lowered_name = filename.lower()

        if lowered_name.endswith(".pdf"):
            # NOTE: is_scanned_pdf() runs extract_text_from_pdf() itself, so a
            # text-based PDF is parsed twice (once to classify, once here).
            if is_scanned_pdf(file_path):
                text = extract_text_from_scanned_pdf(file_path)
            else:
                text = extract_text_from_pdf(file_path)
        elif lowered_name.endswith(".docx"):
            text = extract_text_from_docx(file_path)
        else:
            continue

        corpus.append(text)

    return corpus


def is_scanned_pdf(pdf_path):
    """Guess whether a PDF has no extractable text layer.

    Runs ``extract_text_from_pdf`` and reports True when the result is empty
    or whitespace only, or when extraction fails.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        True if no non-whitespace text was extracted or extraction raised an
        ordinary exception; False otherwise.
    """
    try:
        text = extract_text_from_pdf(pdf_path)
        # No text extracted -> likely a scanned (image-only) PDF.
        return not text.strip()
    except Exception:
        # Best-effort: a PDF the text extractor cannot read is treated as
        # scanned. Catch Exception rather than using a bare `except` so that
        # KeyboardInterrupt and SystemExit still propagate.
        return True


# Directory (relative to the current working directory) that is scanned for
# PDF and DOCX files.
directory_path = "files" 
# One extracted-text string per processed file.
corpus = process_directory(directory_path)


# Merge the per-file texts into a single string, separated by one newline.
knowledge_base = "\n".join(corpus)

# Write the merged text as UTF-8; mode "w" replaces any existing file of the
# same name.
output_file_path = "knowledge_base.txt"
with open(output_file_path, "w", encoding="utf-8") as file:
    file.write(knowledge_base)

print(f"Text extraction complete! Knowledge base saved to '{output_file_path}'.")


Text extraction complete! Knowledge base saved to 'knowledge_base.txt'.
