# Weill Cornell MedSync Data Processing
## How might we recognize handwritten medical charts and convert them to searchable text?

In [12]:
# Import packages
import enchant
import string
import pandas as pd
import docx
import os
from sklearn.feature_extraction.text import CountVectorizer
#import textract

## 1. Import Data

In [13]:
containsTranscriptions = []
noTranscriptions = []
docNames = []

def getDocuments2(loc, f0, f1, f2) :
    if f2.endswith(".docx") :
        docNames.append(f2)
    elif f2 == "Completed Transcriptions" :
        for f3 in os.listdir(loc + f0 + "/" + f1 + "/" + f2) :
            if f3.endswith(".docx") :
                docNames.append(f3)

def getDocuments1(loc, f0, f1) :
    if f1 == "Transcribed Documents" :
        noTranscriptions.remove(f0)
        containsTranscriptions.append(f0)
        for f2 in os.listdir(loc + f0 + "/" + f1) :
            getDocuments2(loc, f0, f1, f2)

def getDocuments0(loc) :
    for f0 in os.listdir(loc) :
        if f0 != ".DS_Store" :
            noTranscriptions.append(f0)
            for f1 in os.listdir(loc + f0) :
                getDocuments1(loc, f0, f1)

def getDocuments00(loc) :
    titles = [] ; words = []
    for f in os.listdir(loc) :
        if f.endswith(".txt") :
            titles.append(f.replace(' copy.txt',''))
            f1 = open("TranscribedFolder/" + f)
            words.append(f1.read().replace('\n', ''))
    
    return titles, words
    
# getDocuments0("Transcriptions/")
allTitles, allWords = getDocuments00("TranscribedFolder/")

In [14]:
# Ensure Titles are consistent
# t10 = [] ; t20 = []
# for f in os.listdir("TranscriptionImages/") :
#     if f.endswith(".png") :
#         t10.append(f.replace('.png',''))

# print(list(set(allTitles) - set(t10)))
# print(list(set(t10) - set(allTitles)))

In [15]:
noTranscriptions

[]

In [16]:
docNames

[]

## 2. Clean and Store Data

Medical dictionary from the [Pacific Northwest University of Health Sciences](http://www.pnwu.edu/inside-pnwu/departments/technology-resources/medical-dictionary/).

In [17]:
# Get English dictionary from PyEnchant library
english = enchant.Dict("en_us")
# Get Medical dictionary
f1 = open("medicalVocabulary.txt")
medicalVocab = f1.read() ; f1.close()
medicalVocab = medicalVocab.lower() ; medicalVocab = medicalVocab.split("\n") 
# Get Stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')

# Function to check if String contains number
def digitExists(data):
    return any(x.isdigit() for x in data)

# Strip punctuation function
def stripPunctuation(term) :
    punctuation = list(string.punctuation) # String of punctuation characters
    punctuation.remove('.') # To prevent issues with ellipses.
    whiteList = ['—', '.'] # We want to replace em-dashes and ellipses with whitespace
    punctuation.append("+") ; punctuation.append("-") ; punctuation.append(".")
    stripped = ''
    for character in term:
        if character not in punctuation and character in whiteList:
            stripped = stripped + ' '
        elif character not in punctuation and character not in whiteList:
            stripped = stripped + character
    return stripped

# Function to clean list of Strings into lists of lists of tokens
def clean(w):
    cleaned = []
    for doc in w:
        temp = []
        for term in doc.split() :
            if term not in stop and len(term) > 4  and (digitExists(term) == False) :
                temp.append(stripPunctuation(term.lower()))
        cleaned.append(temp)
    return cleaned

# Function that takes in a single document, the list of non-English words,
# the list of medical terms, and the vocabulary of English words from document
def process(words) :
    totalV = []; error = [] ; medical = []
    for l in words:
        for word in l:
            if word : # Check if String is not empty
                val = int(english.check(word))
                if word in medicalVocab : # Check if word is medical
                    medical.append(word)
                elif english.check(word) :
                    totalV.append(word)   
                else :
                    error.append(word)
    # Remove reundant words
    totalV = list(set(totalV))
    medical = sorted(list(set(medical)))
    error = list(set(error))
    
    return totalV, medical, error

# Function to generate Pandas DataFrame from processed data
def generateDF(titles, documents) :
    df = pd.DataFrame(columns = ('File', 'Words'))
    for i in range(0, len(titles)) :
        df.loc[i] = [titles[i], documents[i]]
        df.loc[i] = [titles[i], ' '.join(documents[i])]
    return df

In [18]:
allCleaned = clean(allWords)
voc, medical, error = process(allCleaned)
df = generateDF(allTitles, allCleaned)

In [19]:
len(voc)

1249

In [20]:
len(medical)

85

In [21]:
len(error)

1030

In [23]:
df.head()

Unnamed: 0,File,Words
0,1st_Surgical _1854-1855_P235,inflamation ankle joint catherine lundy marrie...
1,1st_Surgical _1854-1855_P301,poisoned sarah miller single amdnov markel adm...
2,1st_Surgical Casebook_1857-58_p116,spinal disease haine sep gauburen unfortunate ...
3,1st_Surgical Casebook_1857-58_P172,wound chest tityen germany bartendersep patie...
4,1st_Surgical Casebook_1857-58_P3,iliac abscess horton ireland laborer admjuly p...


## 3. Process Data (Bag-of-Words)

In [24]:
cv = CountVectorizer(vocabulary = voc, min_df = 50)
allCounts = cv.fit_transform(df['Words'].values)
trainDF = pd.DataFrame(allCounts.todense(), columns = cv.get_feature_names())
wordCounts = pd.concat([df['File'], trainDF], axis = 1)

In [25]:
len(cv.get_feature_names())

1249

In [26]:
wc = wordCounts[wordCounts != 0].count()
wc = wc.sort_values(ascending = False)

In [27]:
wc

File            50
pulse           27
patient         25
since           24
right           22
admission       21
discharged      19
tongue          18
treatment       17
still           16
bowels          15
night           15
slight          15
appetite        14
cured           13
cough           13
pains           13
considerable    13
taken           12
first           11
fever           11
weeks           11
better          11
oz              10
rather          10
three           10
chill           10
there           10
moist            9
removed          9
                ..
suite            1
warmth           1
cocks            1
frost            1
median           1
afforded         1
distant          1
death            1
drank            1
tinge            1
scattered        1
gammon           1
shape            1
moderate         1
depth            1
wheels           1
become           1
die              1
enlarged         1
inflicting       1
vegetable        1
d           

In [28]:
# wordCounts.sum()

In [29]:
cv1 = CountVectorizer(vocabulary = medical, min_df = 0.3)
mCounts = cv1.fit_transform(df['Words'].values)
medDF = pd.DataFrame(mCounts.todense(), columns = cv1.get_feature_names())
medicalCounts = pd.concat([df['File'], medDF], axis = 1)

In [30]:
len(cv1.get_feature_names())

85

In [31]:
os.chdir("Output/")
wordCounts.to_csv('allCounts.csv', encoding='utf-8', index=False)
medicalCounts.to_csv('medicalCounts.csv', encoding='utf-8', index=False)