# Text Cleaning

In [1]:
import csv
import pandas
import numpy
import random

import re
import string
import spacy
import enchant
from enchant.checker import SpellChecker

In [2]:
# spaCy pipeline loaded under the name 'en'; the entity-replacement and
# lemmatization helpers in this notebook call `nlp(...)` on each string.
# NOTE(review): newer spaCy releases may require a full model name instead of
# the 'en' shortcut -- confirm against the installed version.
nlp = spacy.load('en')
# en_US enchant dictionary; autocorrect() uses it for check() and suggest().
d = enchant.Dict("en_US")

In [3]:
# Reload the previously cleaned output so work can resume from it; missing
# values are replaced by empty strings.
# The final cell of this notebook writes '05 Combined Cleaned.csv' with
# encoding='utf-8', so it must be read back as UTF-8 as well -- decoding it as
# latin-1 would silently garble every non-ASCII character.
combined = pandas.read_csv('05 Combined Cleaned.csv', delimiter=',', encoding='utf-8').fillna('')
combined[:1]

Unnamed: 0,ID,Process Step,Problem Type,Contributing Factors,Overall Severity,Incident Description,Language,Translated,Neat Cleaned,Bony Cleaned
0,2511,Treatment delivery,"Wrong, missing, mislabeled, or damaged treatme...",Distraction or diversions involving staff,,Non prescribed bolus . Bolus was not prescribe...,En,Non prescribed bolus . Bolus was not prescribe...,non prescribed bolus . bolus was not prescribe...,non prescribe bolus bolus prescribe md pt rece...


In [3]:
# Load the translated incident table, decoding the file as latin-1 and
# replacing missing values with empty strings.
# NOTE: this rebinds `combined`; if the cell that loads
# '05 Combined Cleaned.csv' was run first, its frame is replaced by this one.
combined = pandas.read_csv('04 Combined Translated.csv', delimiter=',', encoding='latin-1').fillna('')
# Preview the first row.
combined[:1]

Unnamed: 0,ID,Process Step,Problem Type,Contributing Factors,Overall Severity,Incident Description,Language,Translated
0,2511,Treatment delivery,"Wrong, missing, mislabeled, or damaged treatme...",Distraction or diversions involving staff,,Non prescribed bolus . Bolus was not prescribe...,En,Non prescribed bolus . Bolus was not prescribe...


In [4]:
# Characters that isolate_punctuations() surrounds with spaces.
# The apostrophe (') is not part of this set, so it is left attached to words.
punctuations = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'

def remove_breaks(string):
    """Return ``string`` with each carriage return and line feed turned into '.'.

    Every break character is replaced individually, so a '\\r\\n' pair
    becomes two periods.
    """
    pieces = []
    for char in string:
        if char in ('\r', '\n'):
            pieces.append('.')
        else:
            pieces.append(char)
    return ''.join(pieces)

def replace_entities(string):
    """Replace selected named entities in ``string`` by their lowercased label.

    The text is run through the module-level spaCy pipeline ``nlp``. Each
    entity labelled TIME, DATE, PERCENT, CARDINAL, QUANTITY or ORDINAL is
    substituted by its label in lower case, padded with one space on each
    side (for example ``' cardinal '``). All other text is kept unchanged.

    Entities are spliced out by their character offsets rather than with a
    global ``str.replace`` on the entity text. A global replace also rewrites
    unrelated occurrences of the same characters (an entity ``'1'`` would
    corrupt ``'10'``) and can destroy the text of a later entity before it
    is processed.

    Returns:
        The string with the selected entities replaced.
    """
    replaced_labels = ('TIME', 'DATE', 'PERCENT', 'CARDINAL', 'QUANTITY', 'ORDINAL')
    pieces = []
    cursor = 0  # index in `string` of the first character not yet copied
    for ent in nlp(string).ents:
        if ent.label_ not in replaced_labels:
            continue
        # Copy the untouched text before the entity, then the placeholder.
        pieces.append(string[cursor:ent.start_char])
        pieces.append(' ' + ent.label_.lower() + ' ')
        cursor = ent.end_char
    pieces.append(string[cursor:])
    return ''.join(pieces)

def stop_words_removal_and_lemmatization(string):
    """Return the lemmas of the non-stop-word tokens of ``string``.

    The text is tokenized with the module-level spaCy pipeline ``nlp``;
    tokens flagged ``is_stop`` are dropped and the ``lemma_`` of each
    remaining token is kept. The lemmas are joined with single spaces.
    """
    kept_lemmas = []
    for token in nlp(string):
        if token.is_stop:
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)

def stop_words_removal_and_lemmatization_and_entity_replacement(string):
    """Replace entities in ``string``, then drop stop words and lemmatize.

    Entity replacement runs first, so the inserted label placeholders are
    themselves passed through stop-word removal and lemmatization.
    """
    with_entity_labels = replace_entities(string)
    return stop_words_removal_and_lemmatization(with_entity_labels)

def remove_spaces(string):
    """Collapse runs of spaces in ``string`` and strip leading/trailing ones.

    Only the space character ``' '`` is treated as a separator; tabs and
    other whitespace are left inside their tokens.

    Returns:
        The non-empty space-separated tokens joined by single spaces.
    """
    # Truthiness test instead of `is not ''`: identity comparison against a
    # string literal is implementation-dependent and warns on Python 3.8+.
    return ' '.join(token for token in string.split(' ') if token)

def autocorrect(string):
    """Spell-correct the space-separated tokens of ``string``.

    Only tokens longer than 4 characters that the enchant dictionary ``d``
    rejects are touched (the original note on why shorter tokens are exempt
    was left unfinished). 'isocentre' and 'isocenter' are both normalized to
    'isocentre'. Any other rejected token is replaced by the dictionary's
    first suggestion; if obtaining it fails, the token is kept as is.

    Returns:
        The tokens, corrected where applicable, joined by single spaces.
    """
    corrected = []
    for token in string.split(' '):
        # Short tokens and tokens the dictionary accepts pass through.
        if len(token) <= 4 or d.check(token):
            corrected.append(token)
            continue
        if token in ('isocentre', 'isocenter'):
            corrected.append('isocentre')
            continue
        try:
            replacement = d.suggest(token)[0]
        except Exception:
            # Best effort: no usable suggestion, keep the original token.
            replacement = token
        corrected.append(replacement)
    return ' '.join(corrected)

def remove_punctuations(string):
    """Replace each character that is neither word nor whitespace by a space.

    Underscores count as word characters for the regular expression used
    here and are therefore kept.
    """
    non_word_non_space = r'[^\w\s]'
    return re.sub(pattern=non_word_non_space, repl=' ', string=string)

def isolate_punctuations(string):
    """Surround each punctuation character of ``string`` with spaces.

    A character counts as punctuation when it occurs in the module-level
    ``punctuations`` string. The one exception is a period immediately
    followed by a digit (as in '3.5' or '.5'), which is left untouched so
    decimal numbers stay in one piece.

    Returns:
        The string with a space inserted on both sides of every isolated
        punctuation character; extra spaces are not collapsed here.
    """
    pieces = []
    last_index = len(string) - 1
    for index, char in enumerate(string):
        # Compared with `==`; the original used `is '.'`, which tests object
        # identity and only works by an interpreter implementation detail.
        is_decimal_point = (
            char == '.' and index < last_index and string[index + 1].isdigit()
        )
        if char in punctuations and not is_decimal_point:
            pieces.append(' ' + char + ' ')
        else:
            pieces.append(char)
    return ''.join(pieces)

def remove_numerals(string):
    """Return ``string`` with every character for which ``isdigit()`` holds removed."""
    kept = []
    for char in string:
        if char.isdigit():
            continue
        kept.append(char)
    return ''.join(kept)

def isolate_numerals(string):
    """Separate numeric runs in ``string`` from the surrounding text by spaces.

    A numeric run starts at a digit and extends over every following digit
    or period (so '3.5' and a trailing '2.0.' are each one run). The string
    is cut at the start and end of every run and the pieces are joined with
    single spaces. Cuts at the very beginning or end of the string produce
    empty pieces, so a string that starts or ends with a number gains a
    leading or trailing space.

    Returns:
        The string with a space at each numeric-run boundary.
    """
    length = len(string)
    boundaries = [0]
    index = 0
    while index < length:
        if string[index].isdigit():
            end = index
            # `==` rather than `is '.'`: identity comparison against a string
            # literal only works by an interpreter implementation detail.
            while end < length and (string[end].isdigit() or string[end] == '.'):
                end += 1
            boundaries.extend((index, end))
            index = end
        else:
            index += 1
    boundaries.append(length)
    return ' '.join(
        string[start:stop] for start, stop in zip(boundaries, boundaries[1:])
    )

def lower(string):
    """Return a lowercased copy of ``string``."""
    lowered = string.lower()
    return lowered

# So called because neat_clean makes text neat and tidy.
# One can still understand the cleaned string.
def neat_clean(string):
    """Return a tidy, still readable version of ``string``.

    Steps, in order: line breaks become periods, punctuation is surrounded
    by spaces, runs of spaces are collapsed, and the result is lowercased.
    """
    text = remove_breaks(string)
    text = isolate_punctuations(text)
    # A single remove_spaces pass suffices: its output contains no empty
    # tokens, so the second consecutive call in the original was a no-op.
    text = remove_spaces(text)
    return lower(text)

# "Bony" because only the key words remain: stop words are dropped and the
# remaining words are replaced by their lemmas. The cleaned string is not
# meant to be readable.
def bony_clean(string):
    """Reduce ``string`` to lowercased lemmas of its key words.

    Steps, in order: line breaks become periods; entities are replaced by
    labels, stop words removed and tokens lemmatized; spaces are collapsed;
    punctuation is replaced by spaces; digits are removed; spaces are
    collapsed again; the result is lowercased.
    """
    text = remove_breaks(string)
    text = stop_words_removal_and_lemmatization_and_entity_replacement(text)
    text = remove_spaces(text)
    text = remove_punctuations(text)
    text = remove_numerals(text)
    text = remove_spaces(text)
    return lower(text)

def neat_clean_with_autocorrect(string):
    """Tidy ``string`` like a neat clean, adding spell correction and entity labels.

    Steps, in order: line breaks become periods; punctuation is surrounded
    by spaces; spaces are collapsed; tokens are autocorrected; selected
    entities are replaced by their labels; spaces are collapsed again; the
    result is lowercased.
    """
    text = remove_breaks(string)
    text = isolate_punctuations(text)
    text = remove_spaces(text)
    text = autocorrect(text)
    text = replace_entities(text)
    text = remove_spaces(text)
    return lower(text)

def bony_clean_with_autocorrect(string):
    """Reduce ``string`` to lowercased key-word lemmas, with spell correction.

    Steps, in order: line breaks become periods; entities are replaced by
    labels, stop words removed and tokens lemmatized; spaces are collapsed;
    tokens are autocorrected; punctuation is replaced by spaces; digits are
    removed; spaces are collapsed again; the result is lowercased.
    """
    text = remove_breaks(string)
    text = stop_words_removal_and_lemmatization_and_entity_replacement(text)
    text = remove_spaces(text)
    text = autocorrect(text)
    text = remove_punctuations(text)
    text = remove_numerals(text)
    text = remove_spaces(text)
    return lower(text)

In [10]:
# The 'Neat Cleaned' column (built from the raw 'Incident Description') is
# currently disabled; only 'Bony Cleaned' is recomputed in this cell.
# NOTE(review): the neat variant reads 'Incident Description' while the bony
# variant reads 'Translated' -- confirm that the difference is intended.
# combined['Neat Cleaned'] = combined['Incident Description'].apply(neat_clean_with_autocorrect)
# Apply the autocorrecting bony clean to every translated description.
combined['Bony Cleaned'] = combined['Translated'].apply(bony_clean_with_autocorrect)
# Preview the first row.
combined[:1]

Unnamed: 0,ID,Process Step,Problem Type,Contributing Factors,Overall Severity,Incident Description,Language,Translated,Neat Cleaned,Bony Cleaned
0,2511,Treatment delivery,"Wrong, missing, mislabeled, or damaged treatme...",Distraction or diversions involving staff,,Non prescribed bolus . Bolus was not prescribe...,En,Non prescribed bolus . Bolus was not prescribe...,non prescribed bolus . bolus was not prescribe...,non prescribed bolus bolus prescribe md pt rec...


In [19]:
# Persist the cleaned table as UTF-8 without the DataFrame index;
# QUOTE_NONNUMERIC puts quotes around every non-numeric field.
combined.to_csv('05 Combined Cleaned.csv', encoding='utf-8', index=False, quoting=csv.QUOTE_NONNUMERIC)