# Weill Cornell MedSync Data Processing
## How might we recognize handwritten medical charts and convert them to searchable text?

In [1]:
# Import packages
import enchant
import string
import pandas as pd
import docx
import os
from sklearn.feature_extraction.text import CountVectorizer
#import textract

## 1. Import Data

In [2]:
# Module-level accumulators that the getDocuments* helpers append to:
# transcriber folders with / without transcriptions, and collected .docx names.
containsTranscriptions, noTranscriptions, docNames = [], [], []

def getDocuments2(loc, f0, f1, f2) :
    """Record the .docx transcription(s) found at one entry of a transcriber's folder.

    loc -- root directory that holds one folder per transcriber
    f0  -- transcriber folder name
    f1  -- sub-folder of f0 that is being scanned
    f2  -- one entry of f1: a .docx file or the "Completed Transcriptions" folder

    Appends the matching file names (names only, not paths) to the
    module-level docNames list. Returns nothing.
    """
    if f2.endswith(".docx") :
        docNames.append(f2)
    elif f2 == "Completed Transcriptions" :
        # os.path.join supplies the separators, so loc works with or
        # without a trailing slash.
        completed = os.path.join(loc, f0, f1, f2)
        docNames.extend(f3 for f3 in os.listdir(completed) if f3.endswith(".docx"))

def getDocuments1(loc, f0, f1) :
    """Scan one sub-folder of a transcriber folder for transcriptions.

    loc -- root directory that holds one folder per transcriber
    f0  -- transcriber folder name
    f1  -- entry inside f0; only "Transcribed Documents" is processed

    When f1 is "Transcribed Documents", f0 is moved from the module-level
    noTranscriptions list to containsTranscriptions and every entry of the
    sub-folder is handed to getDocuments2. Returns nothing.
    """
    if f1 != "Transcribed Documents" :
        return
    # Guarded so a direct call for a folder that was never listed in
    # noTranscriptions does not raise ValueError.
    if f0 in noTranscriptions :
        noTranscriptions.remove(f0)
    containsTranscriptions.append(f0)
    # os.path.join supplies the separators, so loc works with or without
    # a trailing slash.
    for f2 in os.listdir(os.path.join(loc, f0, f1)) :
        getDocuments2(loc, f0, f1, f2)

def getDocuments0(loc) :
    """Walk every transcriber folder under loc and classify it.

    loc -- root directory that holds one folder per transcriber

    Each folder name is first appended to the module-level noTranscriptions
    list; getDocuments1 moves it to containsTranscriptions if it finds a
    "Transcribed Documents" sub-folder. Returns nothing.
    """
    for f0 in os.listdir(loc) :
        folder = os.path.join(loc, f0)
        # Only directories can be listed below; skip .DS_Store and any
        # other stray file instead of crashing in os.listdir.
        if f0 == ".DS_Store" or not os.path.isdir(folder) :
            continue
        noTranscriptions.append(f0)
        for f1 in os.listdir(folder) :
            getDocuments1(loc, f0, f1)

def getDocuments00(loc) :
    """Load every .txt transcription found directly inside loc.

    loc -- directory to scan (with or without a trailing slash)

    Returns (titles, words): titles[i] is the file name with ' copy.txt'
    removed, words[i] is that file's text with newline characters deleted.
    """
    titles = [] ; words = []
    for f in os.listdir(loc) :
        if not f.endswith(".txt") :
            continue
        titles.append(f.replace(' copy.txt',''))
        # Open the file inside loc (the folder used to be hard-coded to
        # "TranscribedFolder/") and let the with-block close the handle.
        with open(os.path.join(loc, f)) as f1 :
            # NOTE(review): deleting '\n' fuses the last word of a line with
            # the first word of the next one unless the line ends in a
            # space -- confirm this is intended.
            words.append(f1.read().replace('\n', ''))

    return titles, words
    
# Scan the per-transcriber folders; this fills containsTranscriptions,
# noTranscriptions and docNames.
getDocuments0(loc = "Transcriptions/")
# Read the plain-text transcriptions into parallel lists of titles and texts.
allTitles, allWords = getDocuments00(loc = "TranscribedFolder/")

In [3]:
# Ensure Titles are consistent
# t10 = [] ; t20 = []
# for f in os.listdir("TranscriptionImages/") :
#     if f.endswith(".png") :
#         t10.append(f.replace('.png',''))

# print(list(set(allTitles) - set(t10)))
# print(list(set(t10) - set(allTitles)))

In [4]:
# Transcriber folders in which a "Transcribed Documents" sub-folder was found
containsTranscriptions

['Transcriber 01',
 'Transcriber 02',
 'Transcriber 03',
 'Transcriber 04',
 'Transcriber 05',
 'Transcriber 06',
 'Transcriber 07',
 'Transcriber 08',
 'Transcriber 09',
 'Transcriber 10',
 'Transcriber 11',
 'Transcriber 12',
 'Transcriber 13',
 'Transcriber 14',
 'Transcriber 16',
 'Transcriber 17',
 'Transcriber 18']

In [5]:
# Transcriber folders without a "Transcribed Documents" sub-folder
noTranscriptions

['Transcriber 15',
 'Transcriber 19',
 'Transcriber 20',
 'Transcriber 21',
 'Transcriber 22',
 'Transcriber 23',
 'Transcriber 24']

In [6]:
# Names of the .docx files collected from the transcriber folders
docNames

['NYH Med Div 1837_39_p311.docx',
 'NYH Med Div 1850_51_P5.docx',
 'NYH med div 1850_51_p526.docx',
 'NYH Med Div 1850_51_pg12.docx',
 'Medical_1847-48_P335.docx',
 'Medical_1855-1856_P2.docx',
 'Medical_1855-56_P125.docx',
 'Medical_1855-56_P126.docx',
 'Medical_New York Hospital 1st Surgical Division Casebook_1840-42_P5.docx',
 '1st_Surgical_1846-47 Part One P4.docx',
 '1st_Surgical_1846-47 Part One P46.docx',
 '1st_Surgical_1846-47 Part One P88.docx',
 '1st_Surgical_1846-47 Part One P89.docx',
 '1st_Surgical_1850-51_P108.docx',
 '1st_Surgical_1850_P2.docx',
 '1t_Surgical_1850-51_P69.docx',
 '1st_Surgical _1854-1855_P235.docx',
 '1st_Surgical _1854-1855_P301.docx',
 '1st_Surgical Casebook_1857-58_p116.docx',
 '1st_Surgical Casebook_1857-58_P172.docx',
 '1st_Surgical Casebook_1857-58_P3.docx',
 '1st_Surgical Casebook_1857_58_P527.docx',
 '2nd_Medical_1862-63_P121.docx',
 '2nd_Medical_1862-63_P122.docx',
 '2nd_Medical_1862-63_P2.docx',
 '2nd_Medical_1862-63_P3.docx',
 '2nd_Surgical_184

## 2. Clean and Store Data

The medical dictionary used below comes from the [Pacific Northwest University of Health Sciences](http://www.pnwu.edu/inside-pnwu/departments/technology-resources/medical-dictionary/).

In [None]:
# Get English dictionary from PyEnchant library
english = enchant.Dict("en_us")
# Get Medical dictionary: one term per line, lower-cased so it can be
# compared against lower-cased tokens. The with-block closes the file even
# if reading fails.
with open("medicalVocabulary.txt") as f1 :
    medicalVocab = f1.read().lower().split("\n")
# Get Stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')

# Function to check if String contains number
def digitExists(data):
    """Return True when at least one character of data is a digit, else False."""
    for character in data:
        if character.isdigit():
            return True
    return False

# Strip punctuation function
# Translation table built once: every ASCII punctuation mark is deleted,
# except that em-dashes and periods (ellipses) are turned into a space
# because they separate words.
_PUNCTUATION_TABLE = {ord(mark): None for mark in string.punctuation}
_PUNCTUATION_TABLE.update({ord('—'): ' ', ord('.'): ' '})

def stripPunctuation(term) :
    """Return term without punctuation.

    term -- the string to clean

    Em-dashes and periods are replaced by a space so that the words on
    either side stay separate ("wait...what" -> "wait   what"); all other
    characters of string.punctuation are removed. Leading and trailing
    whitespace is trimmed, so a sentence-final period does not leave a
    trailing space ("end." -> "end").

    The previous version removed '.' from its punctuation list and then
    appended it again, so periods were deleted instead of replaced and the
    neighbouring words were fused together.
    """
    return term.translate(_PUNCTUATION_TABLE).strip()

# Function to clean list of Strings into lists of lists of tokens
def clean(w):
    """Tokenise and filter a list of document strings.

    w -- list of strings, one per document

    Returns a list with one token list per document. Each whitespace-
    separated term is lower-cased and passed through stripPunctuation; the
    result is split again, because stripPunctuation can introduce spaces.
    A token is kept only if it is longer than 4 characters, is not a stop
    word, and contains no digit.

    The filters run on the normalised token. Previously they ran on the raw
    term, so capitalised stop words ("There") and stop words followed by
    punctuation ("which,") slipped through, and empty or space-containing
    strings were stored as tokens.
    """
    stopSet = set(stop) # Set lookup instead of scanning the list per token
    cleaned = []
    for doc in w:
        temp = []
        for term in doc.split() :
            for token in stripPunctuation(term.lower()).split() :
                if len(token) > 4 and token not in stopSet and not digitExists(token) :
                    temp.append(token)
        cleaned.append(temp)
    return cleaned

# Function that takes the cleaned documents (a list of token lists) and sorts
# every distinct token into an English, a medical, or an unrecognised vocabulary
def process(words) :
    """Classify every distinct token of the cleaned documents.

    words -- list of token lists, one per document

    Returns (totalV, medical, error), three sorted lists without duplicates:
    totalV  -- tokens accepted by the English dictionary (and not medical)
    medical -- tokens found in medicalVocab
    error   -- tokens found in neither

    All three lists are sorted so that repeated runs give the same order.
    """
    medicalLookup = set(medicalVocab) # One set build instead of a list scan per word
    totalV = set() ; medical = set() ; error = set()
    for l in words:
        for word in l:
            if not word : # Skip empty strings
                continue
            if word in medicalLookup : # Check if word is medical
                medical.add(word)
            elif english.check(word) :
                totalV.add(word)
            else :
                error.add(word)

    return sorted(totalV), sorted(medical), sorted(error)

# Function to generate Pandas DataFrame from processed data
def generateDF(titles, documents) :
    """Build a DataFrame with one row per document.

    titles    -- list of file titles
    documents -- list of token lists; documents[i] belongs to titles[i]

    Returns a DataFrame with columns 'File' (the title) and 'Words' (the
    tokens joined by single spaces), indexed 0..len(titles)-1.
    Raises IndexError if documents is shorter than titles.
    """
    # Collect all rows first and build the frame once. The old loop grew the
    # frame row by row and wrote each row twice (the first write stored the
    # raw token list and was immediately overwritten).
    rows = [(title, ' '.join(documents[i])) for i, title in enumerate(titles)]
    return pd.DataFrame(rows, columns = ('File', 'Words'))

In [None]:
# Tokenise and filter every transcription
allCleaned = clean(w = allWords)
# Split the distinct tokens into English, medical and unrecognised vocabularies
voc, medical, error = process(words = allCleaned)
# One row per file: title plus its cleaned tokens joined by spaces
df = generateDF(titles = allTitles, documents = allCleaned)

In [None]:
# Number of distinct non-medical tokens accepted by the English dictionary
len(voc)

In [None]:
# Number of distinct tokens found in the medical vocabulary
len(medical)

In [None]:
# Number of distinct tokens found in neither the medical vocabulary nor the English dictionary
len(error)

In [None]:
# Preview the first rows: one row per file with its cleaned tokens joined by spaces
df.head()

## 3. Process Data (Bag-of-Words)

In [None]:
# Count, per document, the occurrences of every word of the English vocabulary.
# NOTE(review): scikit-learn documents min_df as ignored whenever an explicit
# vocabulary is supplied, so min_df = 50 does not drop any word here --
# confirm whether filtering by document frequency was intended.
cv = CountVectorizer(vocabulary = voc, min_df = 50)
allCounts = cv.fit_transform(df['Words'].values)
# Column labels are read from vocabulary_ (term -> column index) and ordered
# by column index; get_feature_names() is not available in newer
# scikit-learn releases.
allFeatures = sorted(cv.vocabulary_, key = cv.vocabulary_.get)
trainDF = pd.DataFrame(allCounts.toarray(), columns = allFeatures)
wordCounts = pd.concat([df['File'], trainDF], axis = 1)

In [None]:
# Number of count columns; vocabulary_ maps each term to its column, and
# get_feature_names() is not available in newer scikit-learn releases.
len(cv.vocabulary_)

In [None]:
# For every column, count the rows whose entry is non-zero (zeros are masked
# out before counting), then order the columns from most to fewest rows.
wc = wordCounts.where(wordCounts != 0).count().sort_values(ascending = False)

In [None]:
# Per-column count of rows with a non-zero entry, in descending order
wc

In [None]:
# wordCounts.sum()

In [None]:
# Count, per document, the occurrences of every word of the medical vocabulary.
# NOTE(review): scikit-learn documents min_df as ignored whenever an explicit
# vocabulary is supplied, so min_df = 0.3 does not drop any word here --
# confirm whether filtering by document frequency was intended.
cv1 = CountVectorizer(vocabulary = medical, min_df = 0.3)
mCounts = cv1.fit_transform(df['Words'].values)
# Column labels are read from vocabulary_ (term -> column index) and ordered
# by column index; get_feature_names() is not available in newer
# scikit-learn releases.
medFeatures = sorted(cv1.vocabulary_, key = cv1.vocabulary_.get)
medDF = pd.DataFrame(mCounts.toarray(), columns = medFeatures)
medicalCounts = pd.concat([df['File'], medDF], axis = 1)

In [None]:
# Number of medical count columns; vocabulary_ maps each term to its column,
# and get_feature_names() is not available in newer scikit-learn releases.
len(cv1.vocabulary_)

In [None]:
# Write both count tables into Output/ through explicit paths. The working
# directory is left unchanged (no os.chdir), so this cell and the relative
# paths used by earlier cells keep working when the notebook is re-run.
outputDir = "Output"
os.makedirs(outputDir, exist_ok = True) # Create the folder on first run
wordCounts.to_csv(os.path.join(outputDir, 'allCounts.csv'), encoding='utf-8', index=False)
medicalCounts.to_csv(os.path.join(outputDir, 'medicalCounts.csv'), encoding='utf-8', index=False)