In [1]:
import fitz
import re
import os
import pandas as pd
from collections import Counter
from tqdm import tqdm

In [2]:
def parse_file(filepath):
    """Parse a pdf file into a dictionary of per-page block strings.

    Each page is read with PyMuPDF's "blocks" extraction and only element 4
    of every block tuple (the block's string content) is kept.

    Args:
        filepath (str):  Filepath for .pdf file

    Returns:
        dict: Dictionary where key=pagenum (1-based, in document order) and
        value=list of block strings (paragraphs) found on that page.
    """
    with fitz.open(filepath) as doc:
        block_dict = {}
        for page_number, page in enumerate(doc, start=1):
            # PyMuPDF renamed the camelCase ``getText`` to ``get_text``; prefer
            # the new name and fall back to the old one so both old and new
            # PyMuPDF releases keep working.
            extract_text = getattr(page, "get_text", None) or page.getText
            block_dict[page_number] = [block[4] for block in extract_text("blocks")]
    return block_dict


def toy_clean(pdf_dict):
    """Normalise whitespace and drop image / InDesign paragraphs, page by page.

    For every paragraph, each run of whitespace (spaces, newlines, non-breaking
    spaces, ...) is collapsed into a single space. Paragraphs containing
    '<image:' or '.indd' are then discarded.

    Args:
        pdf_dict: Dictionary of paragraphs where key=pagenum and value=paragraph list

    Returns:
        pdf_dict: The same dictionary object, with each page's paragraph list
        replaced by its cleaned version (the input is modified in place).
    """
    for page_number in list(pdf_dict):
        kept_paragraphs = []
        for paragraph in pdf_dict[page_number]:
            normalised = re.sub(r'\s+', ' ', u"{0}".format(paragraph))
            if '<image:' in normalised or '.indd' in normalised:
                continue
            kept_paragraphs.append(normalised)
        pdf_dict[page_number] = kept_paragraphs
    return pdf_dict
        

### Defining functions with specific requirements

In [6]:
import warnings
from math import log10, floor

def round_sig(x, sig=2):
    """
    Returns number rounded to `sig` significant figures (2 by default).

    Zero has no leading significant digit (log10(0) is undefined), so it is
    returned unchanged instead of raising a math domain error.
    """
    if x == 0:
        return x
    # Position of the leading digit decides how many decimals to keep.
    return round(x, sig-int(floor(log10(abs(x))))-1)


def search_and_count_chars(dictionary, lookup):
    """
    Returns the total number of non-overlapping matches of the lookup regex
    across every string in every list held as a value of the dictionary
    """
    total_matches = 0
    for paragraphs in dictionary.values():
        for paragraph in paragraphs:
            total_matches += len(re.findall(lookup, paragraph))
    return total_matches

def rep_char_search(pdf_dict):
    """
    Checks a parsed and cleaned pdf dictionary for remaining replacement characters. Does not modify the input.

    Emits a warning (via warnings.warn) giving the number of replacement
    characters and their share of all non-whitespace characters when at least
    one is found. Returns the dictionary that was passed in.
    """

    replacement_characters = search_and_count_chars(pdf_dict, '�')

    if replacement_characters != 0:
        # The percentage is only computed here: a replacement character is
        # itself non-whitespace, so all_characters >= 1 and the division is
        # safe. Computing it unconditionally raised ZeroDivisionError for
        # documents with no non-whitespace characters at all.
        all_characters = search_and_count_chars(pdf_dict, r'[\S]')
        percent_non_readable = (100*replacement_characters/all_characters)
        warnings.warn("Warning! This document contains {0} non-readable characters: {1}% of the entire document.".format(replacement_characters,round_sig(percent_non_readable,2)))

    return pdf_dict


### Testing Function 

In [7]:
# Run the full pipeline on one report: parse -> clean -> check for replacement characters.
fname = "/home/{0}/Aug21_Pivigo_S/data/raw/annual_reports/2020/{1}_Annual_Report_2020.pdf".format(
    'juliet_t', 'DCC'
)
pdf_dict = parse_file(fname)

# toy_clean works on pdf_dict in place; rep_char_search hands the same dict back
# after (possibly) emitting its warning.
partially_cleaned = rep_char_search(toy_clean(pdf_dict))




### Checking total occurrences across entire dataset

In [None]:
# Iterate through every PDF in `directory` and record, per report, the number of
# � replacement characters, the number of non-whitespace characters and the year.

directory = ("/home/{0}/Aug21_Pivigo_S/data/raw/annual_reports/2020".format("juliet_t"))
filenames = []
replacement_characters_present={}
all_characters={}  # kept for compatibility; nothing in this cell fills it

# The year is the last four characters of the directory path; it does not
# change between files, so read it once.
year = int(directory[-4:])

# sorted() makes the processing order reproducible (os.listdir order is arbitrary).
for filename in tqdm(sorted(os.listdir(directory))):
    if not filename.endswith(".pdf"):
        continue
    filenames.append(filename)
    doc = toy_clean(parse_file(os.path.join(directory, filename)))
    # The key is the filename with its last three "_"-separated tokens dropped.
    report_key = " ".join(filename.split("_")[:-3])
    replacement_characters_present[report_key] = [
        search_and_count_chars(doc, '�'),
        search_and_count_chars(doc, r'[\S]'),
        year,
    ]


In [None]:
def print_lst(dct):
    """
    Pretty-prints a dictionary as "key (value)" lines under an "Instances:"
    heading, ordered from the largest value to the smallest.
    """
    print("Instances:")
    ranked_pairs = sorted(dct.items(), key=lambda pair: pair[1], reverse=True)
    for label, count in ranked_pairs:
        print("{0} ({1})".format(label, count))

# print_lst(replacement_characters_present)

In [None]:
# replacement_characters_present_2019 = dict(sorted(replacement_characters_present.items(), key=lambda item: item[1], reverse=True))

# replacement_characters_present_2019

In [None]:
# replacement_characters_present_2020 = dict(sorted(replacement_characters_present.items(), key=lambda item: item[1], reverse=True))
# replacement_characters_present_2020

In [None]:
# NOTE(review): replacement_characters_present_2019 and
# replacement_characters_present_2020 are only assigned in commented-out cells of
# this notebook, so on a fresh kernel this cell raises NameError unless they are
# created by hand first.

# One frame per year: index = dictionary key, columns = the three recorded values.
replacement_chars_2019, replacement_chars_2020 = (
    pd.DataFrame.from_dict(
        yearly_counts,
        orient='index',
        columns=['Replacement_Character_Count', 'Total_Character_Count', 'Year'],
    )
    for yearly_counts in (
        replacement_characters_present_2019,
        replacement_characters_present_2020,
    )
)

# Stack both years and order by replacement count, then year, largest first.
replacement_chars = pd.concat(
    [replacement_chars_2019, replacement_chars_2020], axis=0
).sort_values(by=['Replacement_Character_Count', 'Year'], ascending=False)

# Share of replacement characters among all counted characters, in percent.
replacement_chars['%_of_total_chars'] = (
    100 * replacement_chars['Replacement_Character_Count']
    / replacement_chars['Total_Character_Count']
)

In [None]:
# Display the first 25 rows; the frame was sorted by replacement-character count
# (then year) in descending order when it was built.
replacement_chars.head(n=25)

### Comparing string and character counts

In [None]:
# Parse a single report so one page can be inspected by eye.
fname = ("/home/{0}/Aug21_Pivigo_S/data/raw/annual_reports/2020/{1}_Annual_Report_2020.pdf".format('juliet_t','Amazon'))

pdf_dict = parse_file(fname)
# Cleaning is switched off here: partially_cleaned is just another name for the
# raw parsed dictionary, so the strings below are shown as extracted.
# partially_cleaned = toy_clean(pdf_dict)
partially_cleaned = pdf_dict

# Page number (dictionary key) to display.
test_pn=22

# Last expression of the cell: shows the list of block strings for that page.
partially_cleaned[test_pn]

In [None]:
def search_and_count(myDict, lookup):
    """
    Returns how many strings, across all the lists stored as values of the
    dictionary, contain the specified lookup as a substring
    """
    matching_strings = 0
    for strings in myDict.values():
        for text in strings:
            if lookup in text:
                matching_strings += 1
    return matching_strings

In [None]:
def search_and_count_chars(myDict, lookup):
    """
    Returns the total number of non-overlapping matches of the lookup regex
    found in all strings of all lists stored as values of the dictionary.

    Note: a function with this name is also defined earlier in the notebook;
    running this cell replaces that earlier definition.
    """
    matches_per_string = (
        len(re.findall(lookup, text))
        for strings in myDict.values()
        for text in strings
    )
    return sum(matches_per_string)

In [None]:
# Contrast the two counts: the first is the number of strings that contain at
# least one replacement character, the second is the total number of
# replacement characters across all strings.
print("strings containing �:",search_and_count(partially_cleaned, '�'))
print("characters equal to �:",search_and_count_chars(partially_cleaned, '�'))