# PDF Data Preprocessing

This notebook demonstrates how to preprocess PDF files for text extraction and analysis.

## Import necessary libraries

In [7]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [8]:
# Import Libraries for PDF preprocessing for fine-tuning an LLM
import os
import re
import pandas as pd
import numpy as np
import random
import json
import string
import time
import logging
import argparse
import sys
import shutil
import glob
import subprocess

# For PDF text extraction
import PyPDF2

# For progress bar
from tqdm.notebook import tqdm

# For creating output directory if it doesn't exist
import os.path as osp

In [11]:
# The PDFs are stored in the data directory

data_dir = "data"
output_dir = "data"

for dirpath, dirnames, filenames in os.walk(data_dir):
    for filename in filenames:
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(dirpath, filename)
            # Exclude pdfs that have already been processed
            if os.path.exists(os.path.join(output_dir, f"{os.path.splitext(filename)[0]}.txt")):
                print(f"Skipping {pdf_path}, already processed.")
                continue
            # Skip encrypted PDFs
            try:
                with open(pdf_path, "rb") as file:
                    reader = PyPDF2.PdfReader(file)
                    if reader.is_encrypted:
                        print(f"Skipping {pdf_path}, it is encrypted.")
                        continue
            except Exception as e:
                print(f"Error reading {pdf_path}: {e}")
                continue
            # Process the PDF file here
            print(f"Processing {pdf_path}...")
            # Add your PDF processing code here

# Use PyPDF2 to extract text from the PDF file

            with open(pdf_path, "rb") as file:
                reader = PyPDF2.PdfReader(file)
                text = ""
                for page in reader.pages:
                    text += page.extract_text()

            # Save the extracted text to a new, single text file
            output_file = os.path.join(output_dir, f"{os.path.splitext(filename)[0]}.txt")
            with open(output_file, "w", encoding="utf-8") as text_file:
                text_file.write(text)

            print(f"Extracted text saved to {output_file}")




Skipping data\AN_PRC 117F User manual.pdf, already processed.
Skipping data\AN_PRC 117G User manual.pdf, already processed.
Skipping data\BGAN Explorer 710 User Manual.pdf, already processed.
Skipping data\BGAN Explorer_710 User manual.pdf, already processed.
Skipping data\Cannon PIXMA iP100 User manual.pdf, already processed.
Skipping data\CISCO 1900_series User manual.pdf, already processed.
Skipping data\CISCO 2500_series User manual.pdf, already processed.
Skipping data\CISCO 2800_series User manual.pdf, already processed.
Skipping data\CISCO 2811__voice_security_bundle_router User manual.pdf, already processed.
Skipping data\CISCO ASA_5505 User manual.pdf, already processed.
Skipping data\CISCO catalyst_3560 User manual.pdf, already processed.
Skipping data\CISCO Catalyst_3750 User manual.pdf, already processed.
Skipping data\Garmin GPSMAP_78 User manual User manual.pdf, already processed.
Skipping data\GETAC B300G7 User Manual.pdf, already processed.
Skipping data\GETAC CF-19 Use

In [12]:
# Merge all the text files into a single file

with open(os.path.join(output_dir, "merged_text.txt"), "w", encoding="utf-8") as outfile:
    for dirpath, dirnames, filenames in os.walk(output_dir):
        for filename in filenames:
            if filename.endswith(".txt") and filename != "merged_text.txt":
                with open(os.path.join(dirpath, filename), "r", encoding="utf-8") as infile:
                    outfile.write(infile.read())
                    outfile.write("\n")  # Add a newline between files

# Print the number of files processed
print(f"Processed {len([f for f in filenames if f.endswith('.txt')])} text files.")

# Print the total number of words in the merged file
with open(os.path.join(output_dir, "merged_text.txt"), "r", encoding="utf-8") as infile:
    text = infile.read()
    word_count = len(text.split())
    print(f"Total number of words in merged file: {word_count}")

Processed 26 text files.
Total number of words in merged file: 971051
