## PDF Detection V1

In [1]:
import codecs                                 # Unknown
import io                                     # Something about messing with memory
import os
import subprocess                             # Run external tools (pdftocairo) without going through a shell
import sys

import cv2                                    # Open Computer Vision library
import numpy as np
import pandas as pd
import PyPDF2                                 # All things PDF format related
import pytesseract                            # API for letting python interface with Google's tesseract OCR software
from wand.image import Image                  # For messing with images
from PIL import Image as Im                   # Likewise images

**import data**
- Extract each page of each PDF
- Convert each page to PNG

In [2]:
# Alternative process that uses command-line tool pdftocairo directly
def pdf_reader_cairo(filename):
    """Render ./example_data_PDF/<filename>.pdf to PNG at 300 dpi with pdftocairo.

    The external `pdftocairo` tool is invoked with ./converted/<filename> as
    its output root. The ./converted directory is created if it is missing.

    Parameters
    ----------
    filename : str
        PDF name without the ".pdf" extension.

    Returns
    -------
    str
        "Converted <filename> to png" when pdftocairo exits with status 0,
        otherwise a "Failed to convert ..." message describing the problem.
    """
    os.makedirs("./converted", exist_ok=True)

    command = [
        "pdftocairo", "-r", "300", "-png",
        "./example_data_PDF/" + filename + ".pdf",
        "./converted/" + filename,
    ]

    try:
        # Passing an argument list (no shell) means spaces, quotes and shell
        # metacharacters in the filename are never re-interpreted.
        completed = subprocess.run(command)
    except OSError as exc:
        # Raised e.g. when the pdftocairo executable cannot be found.
        return "Failed to convert " + filename + " to png: " + str(exc)

    if completed.returncode != 0:
        return ("Failed to convert " + filename + " to png (pdftocairo exit code "
                + str(completed.returncode) + ")")

    return "Converted " + filename + " to png"

In [3]:
# Grab filenames: the stem of every entry in ./example_data_PDF that ends in ".pdf".
# splitext strips only the final extension, so a dotted name such as
# "report.v2.pdf" keeps its full stem ("report.v2") and still maps back to the
# real file when ".pdf" is re-appended. endswith avoids picking up names that
# merely contain ".pdf" (e.g. "scan.pdf.bak"). Sorted for a reproducible order.
files = [os.path.splitext(filename)[0]
         for filename in sorted(os.listdir("./example_data_PDF"))
         if filename.endswith(".pdf")]

**Convert PDF to PNG**

In [None]:
# Convert each PDF in turn and echo the status message returned for it
for pdf_name in files:
    status_message = pdf_reader_cairo(pdf_name)
    print(status_message)

**Apply image transformations to improve machine readability**

In [6]:
# Read in images, greyscale, apply filters, save
def pre_process(filename):
    """Clean up every converted page image of one PDF and save it for OCR.

    Scans ./converted for files named "<filename>-<digits>.png" (the naming
    pdftocairo is expected to produce for per-page PNG output). Each page is
    loaded as greyscale, thresholded to pure black/white at 127, closed with
    a 2x2 kernel while inverted, and written under the same name to
    ./preprocessed (created if missing).

    Pages that cannot be read or written are reported and skipped rather
    than aborting the whole document.

    Parameters
    ----------
    filename : str
        PDF name without extension, i.e. the output root used for conversion.

    Returns
    -------
    int
        Always 1.
    """
    page_prefix = filename + "-"

    def _is_page_of_pdf(pngname):
        # Exact "<filename>-<digits>.png" match. A plain substring test would
        # also pick up other documents' pages ("doc1" is inside "doc10-1.png").
        stem, extension = os.path.splitext(pngname)
        return (extension == ".png"
                and stem.startswith(page_prefix)
                and stem[len(page_prefix):].isdigit())

    png_files = [pngname for pngname in os.listdir("./converted") if _is_page_of_pdf(pngname)]

    # cv2.imwrite does not create missing directories; it just reports failure.
    os.makedirs("./preprocessed", exist_ok=True)

    # Structuring element for the closing step; identical for every page.
    kernel = np.ones((2,2), np.uint8)

    for pngname in png_files:

        # Load as greyscale (flag 0). imread returns None instead of raising
        # when the file is missing or not a decodable image.
        concatenated = cv2.imread('./converted/'+pngname, 0)
        if concatenated is None:
            print("Could not read ./converted/" + pngname + ", skipping")
            continue

        # Threshold image to black/white
        _, grey_composite = cv2.threshold(concatenated, 127, 255, cv2.THRESH_BINARY)

        # Invert so the closing operates on the originally-dark pixels
        inverted_composite = 255-grey_composite

        # Morphological closing (dilation followed by erosion)
        closed_composite = cv2.morphologyEx(inverted_composite, cv2.MORPH_CLOSE, kernel)

        # Undo inversion
        closed_composite = 255-closed_composite

        # Write to file ready for OCR; imwrite signals failure via its return value
        if not cv2.imwrite("./preprocessed/"+pngname, closed_composite):
            print("Could not write ./preprocessed/" + pngname)

    print("Image pre-processing complete for " + filename)

    return(1)

In [None]:
# Run the image clean-up step over the converted pages of every PDF
for pdf_name in files:
    pre_process(pdf_name)

**Apply OCR to every pre-processed image**
- OCR is Optical Character Recognition. OCR systems transform a two-dimensional image of text into machine-readable text
- We spit these out into a CSV so we can inspect the data manually.

In [None]:
# Run OCR over every pre-processed page and save the text returned by
# pytesseract.image_to_data to ./ocr_output/<page image name>._ocr_data.csv
os.makedirs("./ocr_output", exist_ok=True)

for filename in files:

    # Find the pre-processed pages of this PDF: exactly "<filename>-<digits>.png".
    # A substring test would also match other documents ("doc1" in "doc10-1.png").
    page_prefix = filename + "-"
    png_files = []
    for pngname in os.listdir("./preprocessed"):
        stem, extension = os.path.splitext(pngname)
        if (extension == ".png"
                and stem.startswith(page_prefix)
                and stem[len(page_prefix):].isdigit()):
            png_files.append(pngname)

    for pngname in png_files:
        # Iterate through all of the pages.
        # OCR first, so a failure does not leave an empty output file behind;
        # the context managers release the image and file handles promptly.
        with Im.open("./preprocessed/"+pngname) as page_image:
            ocr_data = pytesseract.image_to_data(page_image)

        # Explicit UTF-8 so non-ASCII OCR text is written the same way on
        # every platform (and matches pandas' default when read back).
        with open("./ocr_output/"+pngname+"._ocr_data.csv", "w", encoding="utf-8") as f:
            f.write(ocr_data)

        print("OCR'ed " + pngname)


In [None]:
# Iterate through original filenames, create one csv of extracted data per file
os.makedirs("./ocr_output_compiled", exist_ok=True)

for filename in files:

    # Find this PDF's per-page OCR files, named
    # "<filename>-<page number>.png._ocr_data.csv". Matching the exact pattern
    # keeps other documents out ("doc1" is a substring of "doc10-..."), and
    # sorting on the numeric page number keeps page 10 after page 2 even when
    # the page numbers are not zero-padded.
    page_prefix = filename + "-"
    page_suffix = ".png._ocr_data.csv"
    numbered_csvs = []
    for csvname in os.listdir("./ocr_output"):
        if csvname.startswith(page_prefix) and csvname.endswith(page_suffix):
            page_label = csvname[len(page_prefix):-len(page_suffix)]
            if page_label.isdigit():
                numbered_csvs.append((int(page_label), csvname))
    csv_files = [csvname for _, csvname in sorted(numbered_csvs)]

    # Collect the page frames and concatenate once at the end
    # (DataFrame.append was removed in pandas 2.0 and was quadratic in a loop).
    page_frames = []

    for csv_num, each in enumerate(csv_files, start=1):

        try:
            # Reading csv is tricky, weird save format separated by spaces + tabs
            read_options = dict(sep=' |\t', engine='python')
            try:
                # pandas >= 1.3 spelling of "drop malformed lines, with a warning"
                df_page = pd.read_csv("./ocr_output/" + each,
                                      on_bad_lines='warn',
                                      **read_options)
            except TypeError:
                # Older pandas does not know on_bad_lines; use the legacy flag
                df_page = pd.read_csv("./ocr_output/" + each,
                                      error_bad_lines=False,
                                      **read_options)

            # Append csv (page) number
            df_page['csv_num'] = csv_num

            page_frames.append(df_page)

            print("Processed "+filename+" page "+str(csv_num))

        except Exception as exc:
            # Keep going with the remaining pages, but say why this one failed
            print("Failed on "+filename+" page "+str(csv_num)+": "+repr(exc))

    # pd.concat raises on an empty list, so fall back to an empty frame
    df_doc = pd.concat(page_frames) if page_frames else pd.DataFrame()

    df_doc.to_csv("./ocr_output_compiled/"+filename+".csv")

### **Takeaways with this method:**
- It's a good first step into the world of PDF extraction, but it lacks functionality. We don't want to have to search line by line for the data we're looking for, and this output format makes the data difficult to understand.