In [1]:
import re
import string
import itertools
from collections import Counter
import nltk
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
import pkg_resources
from symspellpy import SymSpell, Verbosity

In [2]:
# Number of leading records kept from the loaded dataset; the same value is
# embedded in the name of the pickle file written at the end of the notebook.
nRecords = 4000

In [3]:
def removePunctuations(text):
    """Replace every punctuation character except the apostrophe with a space.

    Args:
        text: Input string.

    Returns:
        A copy of ``text`` in which each character of ``string.punctuation``
        other than ``'`` has been replaced by a single space.
    """
    remove = string.punctuation.replace("'", "")
    # Escape the characters before embedding them in a character class so that
    # regex metacharacters (']', '\', '^', '-') are always matched literally
    # instead of depending on where they happen to sit in string.punctuation.
    pattern = r"[{}]".format(re.escape(remove))
    return re.sub(pattern, " ", text)

def replaceDigitsWithd(text):
    """Replace every digit character in ``text`` with the letter ``d``.

    Args:
        text: Input string.

    Returns:
        A copy of ``text`` with each digit replaced by ``'d'``.
    """
    # Raw string: a backslash-d inside a plain string literal is an invalid
    # escape sequence and triggers a Deprecation/SyntaxWarning on newer Pythons.
    return re.sub(r'\d', 'd', text)

def preprocessText(text):
    """Normalize a raw text string and split it into tokens.

    The text is lower-cased, punctuation (except apostrophes) is replaced by
    spaces via ``removePunctuations``, digits are replaced by ``d`` via
    ``replaceDigitsWithd``, and the result is split on whitespace.

    Args:
        text: Raw input string.

    Returns:
        A list of non-empty tokens.
    """
    text = removePunctuations(text.lower())
    text = replaceDigitsWithd(text)
    # split() with no separator breaks on any run of whitespace (spaces,
    # newlines, tabs, carriage returns) and never yields empty strings, so
    # tabs or "\r" can no longer end up embedded inside a token.
    return text.split()

def getWordCounts(data):
    """Count how often each token occurs across all records.

    Args:
        data: Iterable of records, each itself an iterable of tokens.

    Returns:
        A plain ``dict`` mapping each token to its total number of occurrences.
    """
    tally = Counter(token for record in data for token in record)
    return dict(tally)

def findCorrectSpelling(inputWord, countDict, minFreq):
    """Return the closest sufficiently frequent vocabulary word to ``inputWord``.

    Args:
        inputWord: Token to find a replacement for.
        countDict: Mapping from token to its occurrence count.
        minFreq: Minimum count a token must have to be eligible as a
            correction.

    Returns:
        The eligible token with the smallest edit distance to ``inputWord``
        (the first one encountered wins ties), or ``inputWord`` unchanged when
        no other token reaches ``minFreq``.
    """
    minDistance = float("inf")
    correctSpelling = inputWord
    for word, count in countDict.items():
        # Skip the word itself, and skip rare tokens: without the frequency
        # check a misspelling could be "corrected" to another misspelling.
        if word == inputWord or count < minFreq:
            continue
        distance = nltk.edit_distance(word, inputWord)
        if distance < minDistance:
            minDistance = distance
            correctSpelling = word
    return correctSpelling

def getSpellingCorrections(countDict, minFreq = 5):
    """Build a correction table for every token rarer than ``minFreq``.

    Args:
        countDict: Mapping from token to its occurrence count.
        minFreq: Tokens with a count strictly below this value get an entry.

    Returns:
        A dict mapping each rare token to the result of
        ``findCorrectSpelling`` for that token.
    """
    return {
        token: findCorrectSpelling(token, countDict, minFreq)
        for token, freq in countDict.items()
        if freq < minFreq
    }

def preprocess(inputdata):
    """Tokenize and spell-correct a collection of raw texts.

    Each text is tokenized with ``preprocessText`` and the resulting token
    list is passed to ``spellify``. ``spellify`` takes exactly one record and
    has no ``minFreq`` parameter (which tokens count as rare is decided by the
    module-level ``wrongSpellings`` collection it reads), so it is called once
    per record rather than once with the whole corpus and a keyword argument.

    Args:
        inputdata: Iterable of raw text strings.

    Returns:
        A list with one spell-corrected token list per input text.
    """
    return [spellify(preprocessText(text)) for text in inputdata]

In [4]:
# NOTE(review): unpickling executes arbitrary code embedded in the file; only
# load cleanData.pkl if it comes from a trusted source.
dataset = pd.read_pickle('./cleanData.pkl')
# Take an explicit copy of the slice: columns of `dataset` are reassigned
# later, and writing into a slice of the loaded frame can raise
# SettingWithCopyWarning.
dataset = dataset[:nRecords].copy()

In [5]:
# Correct spellings
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt"
)

# load_dictionary reports a missing file through its return value rather than
# by raising; fail loudly so the spell-check step does not silently run
# against an empty dictionary.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(
        "SymSpell frequency dictionary not found: {}".format(dictionary_path)
    )

tokenFreq = Counter(itertools.chain.from_iterable(dataset['PREPROCESSED_TEXT']))

# Tokens occurring at most this many times are treated as likely misspellings.
maxRareFreq = 5
# A set (instead of a list) makes the `word in wrongSpellings` test performed
# for every token in spellify a constant-time lookup.
wrongSpellings = {token for token, freq in tokenFreq.items() if freq <= maxRareFreq}

# 1-based index for every distinct token, in first-seen order.
word2idx = {k: i+1 for i, k in enumerate(tokenFreq.keys())}

# Registers progress_apply on pandas objects.
tqdm.pandas()

def spellify(text):
    """Spell-correct the rare tokens of a single record.

    Tokens that are not in the module-level ``wrongSpellings`` collection are
    kept unchanged. Every other token is looked up with ``sym_spell`` and
    replaced by the first suggestion returned; if the lookup returns no
    suggestion the token is omitted from the result.

    Args:
        text: Iterable of tokens making up one record.

    Returns:
        A new list of tokens for the record.
    """
    corrected = []
    for token in text:
        if token not in wrongSpellings:
            corrected.append(token)
            continue
        candidates = sym_spell.lookup(token, Verbosity.CLOSEST, max_edit_distance=2)
        if candidates:
            corrected.append(candidates[0].term)
        # No candidate: the token is dropped from the record.
    return corrected

# Run spellify on every entry of the PREPROCESSED_TEXT column and overwrite
# the column with the results; progress_apply is the progress-bar variant of
# apply registered by the tqdm.pandas() call above.
dataset['PREPROCESSED_TEXT'] = dataset['PREPROCESSED_TEXT'].progress_apply(spellify)

100%|██████████| 4000/4000 [02:21<00:00, 28.23it/s]


In [6]:
from ast import literal_eval

# Round-trip each ICD9_CODE entry through its string form and literal_eval so
# that the column holds the evaluated Python literals.
dataset['ICD9_CODE'] = dataset['ICD9_CODE'].apply(
    lambda rawCodes: literal_eval(str(rawCodes))
)

In [7]:
# Persist the processed subset; the record count is part of the file name.
with open('./cleanData-{}.pkl'.format(nRecords), 'wb') as handle:
    pickle.dump(dataset, handle)