In [None]:
import pandas as pd
import re
import nltk
import sklearn
import pickle
import numpy as np
import xmltodict as xml
import random

# Location of the exported pathology reports (XML).
file = "/home/hodaya/Downloads/PathologyCOPY.xml"

# Read the whole file as text first, then hand the string to xmltodict,
# which turns the XML tree into nested dictionaries.
with open(file) as xml_handle:
    raw_xml = xml_handle.read()
doc = xml.parse(raw_xml)

# preprocess the data (stripping special characters and lowercasing)
def preprocessing(data):
    """Extract the five report fields and normalise the free text.

    Args:
        data: Parsed XML document shaped like
            ``{'dataroot': {'Pathology': [report, ...]}}``. Each report is a
            mapping with the keys ``Report_Date_Time``, ``Report_Type``,
            ``Report_Status``, ``Report_Description`` and ``Report_Text``.
            A single report mapping in place of the list is accepted too,
            because xmltodict does not wrap a lone child element in a list.

    Returns:
        A list with one ``(date, type, status, description, text)`` tuple per
        report. Only ``text`` is modified: every run of non-word characters
        becomes a single space, then the result is lowercased and stripped.

    Raises:
        KeyError: If ``data`` or a report lacks one of the expected keys.
    """
    reports = data['dataroot']['Pathology']
    if isinstance(reports, dict):
        # A document with exactly one <Pathology> element parses to a dict,
        # not a one-element list; iterating it would yield key names.
        reports = [reports]

    preprocessed = []
    for report in reports:
        date = report['Report_Date_Time']
        typ = report['Report_Type']
        stat = report['Report_Status']
        desc = report['Report_Description']
        # An empty <Report_Text/> element parses to None; treat it as "".
        text = report['Report_Text'] or ''
        # Raw string: '\W' in a normal literal is an invalid escape sequence.
        text = re.sub(r'\W+', ' ', text).lower().strip()
        preprocessed.append((date, typ, stat, desc, text))
    return preprocessed

# De-identification rules (to protect the patient), applied to every field
# strictly in this order; later rules see the output of earlier ones.
# Patterns are compiled once here instead of on every field.
_DEID_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # accession number: format of a## ######
        (r'[a-z]{1,2}[0-9]{2}\s[0-9]{3,6}', 'ACCESSION_NUMBER'),
        # names
        # "ordering provider" followed by two name words
        (r'(?<=ordering\sprovider\s)[a-z]{3,11}\s[a-z]{4,9}', 'ORDERING_PROVIDER'),
        # (4-10 letters) f (3-8 letters) 'md'/'m d'/'ct ascp'
        (r'[a-z]{4,10}\s[a-z]\s[a-z]{3,8}\s(md|m\sd|ct\sascp)', 'PHYSICIAN_NAME'),
        # (4-10 letters) (3-8 letters) 'md'/'m d'/'ct ascp'
        (r'[a-z]{4,10}\s[a-z]{3,8}\s(md|m\sd|ct\sascp)', 'PHYSICIAN_NAME'),
        # dr/mr/ms/mrs firstname a lastname
        (r'(dr\s|mr\s|ms\s|mrs\s)[a-z]{4,10}\s[a-z]\s[a-z]{4,8}', 'NAME'),
        # dr/mr/ms/mrs firstname lastname
        (r'(dr\s|mr\s|ms\s|mrs\s)[a-z]{4,10}\s[a-z]{4,8}', 'NAME'),
        # dictated by xx xxx
        (r'(?<=dictated\sby\s)[a-z]{2}\s[a-z]{3}', 'PERSON_DICTATING'),
        # dictated by xx
        (r'(?<=dictated\sby\s)[a-z]{2}', 'PERSON_DICTATING'),
        # "labeled" followed by one of the known labeller names
        (r'(?<=labeled\s)(jami\sr\srubins|jami\srubins|carmelina\sackerley)', 'PERSON_LABELING'),
        # date: m/dd/yy, mm/dd/yy, mm/dd/yyyy (separated by '/' or whitespace)
        # NOTE(review): inside [...] the '|' is a literal character, so '|' is
        # also accepted as a separator -- probably unintended; pattern kept as is.
        (r'([1-9]|[0][1-9]|[1][0-2])[\s|/]([0-9]|0[1-9]|[12][0-9]|3[01])[\s|/]([0-9]{2,4})(?!(\s[0-9]{2}){3,})', 'DATE'),
        # time: needs literal colons and upper-case AM/PM, so it cannot match
        # text that has already been lowercased / stripped of punctuation
        (r'[0-9]{1,2}:[0-9]{2}:[0-9]{2}\s(AM|PM)', 'TIME'),
        # hospital address & telephone
        (r'2014\swashington\sstreet\snewton\sma\s02462', 'HOSPITAL_ADDRESS'),
        (r'tel\s617\s243\s6140', 'HOSPITAL_TELEPHONE'),
    )
)

# Spot-check patterns: a hit means a physician-style name survived the rules.
# The numbering in the messages (2nd/3rd) is the original one; the broader
# 1st/4th checks were already disabled in the original code.
_DEID_LEAK_CHECKS = (
    # firstname m lastname md/ct ascp
    (re.compile(r'[a-z]{4,10}\s[a-z]\s[a-z]{4,9}\s(md|m\sd|ct\sascp)'),
     "A case that wasn't deidentified was caught (2nd)"),
    # firstname lastname md/ct ascp
    (re.compile(r'[a-z]{4,10}\s[a-z]{4,9}\s(md|m\sd|ct\sascp)'),
     "A case that wasn't deidentified was caught (3rd)"),
)

# Maximum number of de-identified fields that are spot-checked per call.
_DEID_SAMPLE_SIZE = 20


def deidentification(data):
    """Replace identifying substrings in every field with placeholder tokens.

    Args:
        data: Iterable of records, each an iterable of string fields (for
            example the tuples produced by ``preprocessing``). ``None``
            fields are passed through unchanged.

    Returns:
        A flat list containing every field of every record, in order, after
        all rules in ``_DEID_RULES`` have been applied. If the random
        spot-check finds a name pattern that survived, the corresponding
        warning message (a ``str``) is returned instead of the list, so
        callers must check the type of the result.

    Raises:
        TypeError: If a field is neither ``None`` nor a string.
    """
    deidentified = []
    for line in data:
        for field in line:
            # None carries no text to scrub; anything else must be a string
            # and is allowed to raise so unscrubbed data never slips through.
            if field is not None:
                for pattern, replacement in _DEID_RULES:
                    field = pattern.sub(replacement, field)
            deidentified.append(field)

    # Validate on a random sample of fields. The sample size is capped at the
    # number of available fields (random.sample raises ValueError otherwise),
    # and every sampled field is inspected.
    candidates = [field for field in deidentified if field is not None]
    sample_size = min(_DEID_SAMPLE_SIZE, len(candidates))
    for report in random.sample(candidates, sample_size):
        for pattern, message in _DEID_LEAK_CHECKS:
            if pattern.search(report) is not None:
                return message
    return deidentified

# Run the pipeline on the parsed document: normalise the report fields first,
# then scrub identifying details from them.
preprocessed = preprocessing(data=doc)
deidentified = deidentification(data=preprocessed)

# Shows either the de-identified fields or, if the spot-check inside
# deidentification() caught a leftover name, its warning message.
print(deidentified)

In [None]:
# HOW YOU CAN MANIPULATE THE DICTIONARY
# Every line below is a bare expression: it is evaluated but shows nothing
# unless it is the last line of a notebook cell or wrapped in print(...).

i = 2
reports = doc['dataroot']['Pathology']  # every pathology report in the file
report = reports[i]                     # the report at index i

# print(report['Report_Type'])
# print(report['Report_Status'])
# print(report['Report_Text'])
# print(report['Report_Description'])

# individual fields of one report
report['Report_Type']
report['Report_Status']
report['Report_Text']
report['Report_Description']

# whole reports and how many there are
reports[2]
len(reports)  # 24 for the original data set (per the original note)
reports[23]   # index 23 is the last of those 24 reports

# print(reports[0].keys())

# number of fields stored for the first report
len(reports[0].keys())

reports  # shows all the pathologies
len(reports)

# top-level keys under the XML root element
doc['dataroot'].keys()

# (key, value) pairs of the first report
reports[0].items()