Elements for automated cleaning:

- remove special characters
- try reconnecting line endings (check against standard spell check dictionary - enchant)
- remove multiple periods
- for now, leave modifying words for later (if needed)

In [1]:
import enchant

In [2]:
# broker = enchant.Broker()
# broker.describe()
# broker.list_languages()

In [3]:
# import enchant
import os

In [4]:
source_directory = "../data/text/ocr/"
output_directory = "../data/text/auto_corrected/"

In [5]:
def open_file(filepath):
    
    with open(filepath) as f:
        content = f.read()
    
    return content

In [6]:
import re

def remove_extra_punct(content):
    """
    Function identifies periods that occur in multiple sequence.
    """
    
    content = re.sub(r'\.{2,}|[\. ]{2,}|:{2,}|;{2}|,{2,}|\s{2,}', " ", content)
#     content = re.sub(r'', " ", content)
    
    return content

In [7]:
def normalize_dashes(content):
    
    # Substitute for all other dashes
    content = re.sub(r"—-—–‑", r"-", content)
    
    return content

In [8]:
def reconnect_lines(content):
    # load dictionary
    d = enchant.Dict("en_US")
    
    # find line endings
    splits = re.findall(r'(\w+)(\-\s{1,})([a-z]+)', content)
    for split in splits:
        # combine into word
        test_word = "".join([split[0], split[2]])
        if d.check(test_word) is True:
            pattern = "".join(split)
#             print(pattern)
            content = re.sub(pattern, test_word, content)
        else:
            continue 
    
    return content

In [9]:
def remove_special_chars(content):
    """Use regex to remove special characters except for punctuation.
    Note:
        Modify this function before use if content includes characters from languages other than English.
    Args:
        content(str): File content as string
    Returns:
        str: File content with special characters removed.

    """
    # Replace all special characters with a space (as these tend to occur at the end of lines)
    return re.sub(r"[^a-zA-Z0-9\s,.!?$:;\&\'\"]", r" ", content)

In [10]:
for f in os.listdir(source_directory):
    content = open_file(os.path.join(source_directory, f))
    corrected = remove_extra_punct(remove_special_chars(reconnect_lines(normalize_dashes(content))))
    print(corrected)
    with open(os.path.join(output_directory, f), 'w') as o:
        o.write(corrected)

 " " O
C
'l'HE HEALTH REFORMER 
: , r  c ,; :CY ,c:,c ,  ; ;: x ; x x  o   ;  ' I Sickness has come to be the ruling con !! tl l :t 0 ' l n l dition of mankind, and health tbc cxcep  " "' " ! tion; and a better state of things cannot be expected until the laws of life rnd health are better understood and obeyed 
To the Reader 
BY H s 1,.n, M o These laws we shall endeavor faithfully to I , 1 fi t t b , th explain ; and shall inculcate a strict and n comm" 1or tie rs 1me eiore e  "' ! rntelhgent obedience thereto public as an editor of a He.ilth Reform: Th b f II Ith 
t d e sn JCCt o ea 1s no a en01m Jonrnal, 1t may be expected that we say I b' t All 1 d II natLOna su cc c asses an a para word to om readers m reference to what II 
te d 
urh l , ties arc equa y m reste m 1t n 1 e we rntend to do, and what we design shall ' b l l d l t f h  a 1 therefore, we shall advocate the proper e t 1e sty e an c iarae er o t e peno 1 
 care of our bodies, or the presen at ton of cal ot which wchave 