In [7]:
import pdfplumber
import camelot
from pdf2image import convert_from_path
import pytesseract
import cv2
import os
import glob

In [9]:
# Base folder
base_folder = "/Users/kumarpersonal/Downloads/Inquiry-Assistant/Context"

# PDF input folder
pdf_folder = os.path.join(base_folder, "PDFs")
pdf_files = glob.glob(os.path.join(pdf_folder, "*.pdf"))

# Output folders
text_folder = os.path.join(base_folder, "Text")
table_folder = os.path.join(base_folder, "Tables")

# 1. TEXT EXTRACTION with pdfplumber
def extract_text_pdfplumber(pdf_path):
    all_text = []
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            text = page.extract_text()
            all_text.append(f"\n--- Page {i+1} ---\n{text}")
    return "\n".join(all_text)

# 2. TABLE EXTRACTION with Camelot 
def extract_tables(pdf_path):
    tables = camelot.read_pdf(pdf_path, pages='all', flavor='lattice')
    extracted_tables = []
    for i, table in enumerate(tables):
        extracted_tables.append(table.df)
    return extracted_tables

# 3. IMAGE OCR (text from charts/images) 
def ocr_from_pdf_images(pdf_path):
    pages = convert_from_path(pdf_path)
    ocr_texts = []
    for i, image in enumerate(pages):
        img_path = f"temp_page_{i+1}.png"
        image.save(img_path, "PNG")

        img = cv2.imread(img_path)
        text = pytesseract.image_to_string(img)
        ocr_texts.append(f"\n--- OCR from Page {i+1} ---\n{text}")

        os.remove(img_path)
    return "\n".join(ocr_texts)

# Remove old output files for a given filename prefix
def cleanup_old_outputs(base_name):
    text_pattern = os.path.join(text_folder, f"{base_name}_text.txt")
    ocr_pattern = os.path.join(text_folder, f"{base_name}_ocr.txt")
    table_pattern = os.path.join(table_folder, f"{base_name}_table_*.csv")

    for file in glob.glob(text_pattern):
        os.remove(file)
        print(f"Deleted old file: {file}")

    for file in glob.glob(ocr_pattern):
        os.remove(file)
        print(f"Deleted old file: {file}")

    for file in glob.glob(table_pattern):
        os.remove(file)
        print(f"Deleted old file: {file}")

# Run All Steps for each file
if __name__ == "__main__":
    for pdf_path in pdf_files:
        filename = os.path.splitext(os.path.basename(pdf_path))[0]
        print(f"\nProcessing: {filename}.pdf")

        # Step 0: Cleanup previous output for this file
        cleanup_old_outputs(filename)

        # 1. Text
        print("Extracting structured text...")
        text_data = extract_text_pdfplumber(pdf_path)
        with open(os.path.join(text_folder, f"{filename}_text.txt"), "w", encoding="utf-8") as f:
            f.write(text_data)

        # 2. Tables
        print("Extracting tables...")
        tables = extract_tables(pdf_path)
        for idx, table_df in enumerate(tables):
            table_df.to_csv(os.path.join(table_folder, f"{filename}_table_{idx+1}.csv"), index=False)

        # 3. OCR
        print("Performing OCR on images...")
        ocr_text = ocr_from_pdf_images(pdf_path)
        with open(os.path.join(text_folder, f"{filename}_ocr.txt"), "w", encoding="utf-8") as f:
            f.write(ocr_text)

        print(f"Finished processing {filename}.pdf")

    print("\nAll PDF files processed. Clean and updated outputs are ready.")



Processing: placement-records-data-science.pdf
Extracting structured text...
Extracting tables...
Performing OCR on images...
Finished processing placement-records-data-science.pdf

Processing: about-scaler.pdf
Extracting structured text...
Extracting tables...
Performing OCR on images...


Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value


Finished processing about-scaler.pdf

Processing: ssb-prospectus.pdf
Extracting structured text...


Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P2' is an invalid float value
Cannot set gray stroke color because /'P3' is an invalid float value
Cannot set gray stroke color because /'P4' is an invalid float value
Cannot set gray stroke color because /'P5' is an invalid float value
Cannot set gray stroke color because /'P6' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P2' is an invalid float value
Cannot set gray stroke color because /'P3' is an invalid float value
Cannot set gray stroke color because /'P4' is an invalid float value
Cannot set gray stroke color becau

Extracting tables...


Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P2' is an invalid float value
Cannot set gray stroke color because /'P3' is an invalid float value
Cannot set gray stroke color because /'P4' is an invalid float value
Cannot set gray stroke color because /'P5' is an invalid float value
Cannot set gray stroke color because /'P6' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P2' is an invalid float value
Cannot set gray stroke color becau

Performing OCR on images...


Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P2' is an invalid float value
Cannot set gray stroke color because /'P3' is an invalid float value
Cannot set gray stroke color because /'P4' is an invalid float value
Cannot set gray stroke color because /'P5' is an invalid float value
Cannot set gray stroke color because /'P6' is an invalid float value
Cannot set gray stroke color because /'P7' is an invalid float value
Cannot set gray stroke color because /'P8' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P2' is an invalid float value


Finished processing ssb-prospectus.pdf

Processing: devops-brochure.pdf
Extracting structured text...


Cannot set gray stroke color because /'P0' is an invalid float value


Extracting tables...


Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P2' is an invalid float value
Cannot set gray stroke color because /'P3' is an invalid float value
Cannot set gray stroke color because /'P4' is an invalid float value
Cannot set gray stroke color because /'P5' is an invalid float value
Cannot set gray stroke color because /'P6' is an invalid float value
Cannot set gray stroke color because /'P7' is an invalid float value
Cannot set gray stroke color because /'P8' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P2' is an invalid float value


Performing OCR on images...
Finished processing devops-brochure.pdf

Processing: placement-records-academy.pdf
Extracting structured text...
Extracting tables...
Performing OCR on images...


Cannot set gray stroke color because /'P0' is an invalid float value


Finished processing placement-records-academy.pdf

Processing: bachelors-programme.pdf
Extracting structured text...
Extracting tables...


Cannot set gray stroke color because /'P0' is an invalid float value


Performing OCR on images...
Finished processing bachelors-programme.pdf

Processing: ai-ml-brochure.pdf
Extracting structured text...
Extracting tables...




Performing OCR on images...


Cannot set gray stroke color because /'P0' is an invalid float value


Finished processing ai-ml-brochure.pdf

Processing: masters-programme.pdf
Extracting structured text...


Cannot set gray stroke color because /'P0' is an invalid float value


Extracting tables...


Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value


Performing OCR on images...
Finished processing masters-programme.pdf

Processing: sst-prospectus.pdf
Extracting structured text...
Extracting tables...




Performing OCR on images...
Finished processing sst-prospectus.pdf

Processing: academy-brochure.pdf
Extracting structured text...


Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value


Extracting tables...


Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value


Performing OCR on images...
Finished processing academy-brochure.pdf

Processing: ds-ml-brochure.pdf
Extracting structured text...


Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P2' is an invalid float value
Cannot set gray stroke color because /'P3' is an invalid float value
Cannot set gray stroke color because /'P4' is an invalid float value


Extracting tables...


Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P2' is an invalid float value
Cannot set gray stroke color because /'P3' is an invalid float value
Cannot set gray stroke color because /'P4' is an invalid float value


Performing OCR on images...
Finished processing ds-ml-brochure.pdf

All PDF files processed. Clean and updated outputs are ready.


In [None]:
corpus = []

for txt_file in glob.glob("/Users/kumarpersonal/Downloads/Inquiry-Assistant/Context/Text/*.txt"):
    with open(txt_file, "r", encoding="utf-8") as f:
        corpus.append(f.read())

full_text = "\n".join(corpus)

'\n--- Page 1 ---\n\n\n--- Page 2 ---\nABOUT\nSCALER\nScaler (by InterviewBit) is an outcome-focused leading\ned-tech platform for tech enthusiasts. We are a\ntransformative tech school devoted to creating a growth\necosystem to assist software professionals in unlocking\ntalent & opportunities at every stage of their career. We are\nempowering learners with cutting-edge AI tools and\nframeworks, ensuring that our learners are both\nindustry-ready and future-ready. Our curriculum\nintegrates AI-driven approaches to problem-solving,\nmaking Scaler graduates stand out in the age of Artificial\nIntelligence.\n\n--- Page 3 ---\nindex\n04 Tech\nLandscape\n05 Build\nAI Skills\n15 Success\nStories\n06 Academy\nOverview 18 Batch\nProfile\n07 Scaler\nAdvantage 19 Meet our\nInstructors\n08 Who is\nthis for? 21 Meet our\nMentors\n09 Curriculum\nOutline\n24 Scaler\nSupport\n13 Level Up With\nIIT-Roorkee\n26 How to\nEnrol?\n14 IIT-Roorkee\nCampus\n27 Your\nImmersion\nCertificates\n\n--- Page 4 ---\

In [None]:
with open("/Users/kumarpersonal/Downloads/Inquiry-Assistant/Context/corpus.txt", "w", encoding="utf-8") as f_out:
    f_out.write(full_text)