In [1]:
from html.parser import HTMLParser
import xml.etree.ElementTree as ET
import os
import csv
import re
import pickle as pkl
from IPython.display import display, Markdown

In [2]:
# Class that parses html files and returns dictionary of ID:note
class ChHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.notes = {}
        self.idx = ''
        self.mytag = ""
        
    def handle_starttag(self, tag, attrs):
        self.mytag = tag
        if self.mytag == 'record' or self.mytag == 'doc':
            idn, self.idx = attrs[0]

    def handle_data(self, data):
        if self.mytag == 'text':
            if not re.match(r'\n$', data):
                if self.idx in self.notes.keys():
                    raise IndexError("Duplicated note IDs, please check each note has a unique ID to avoid overwriting.")
                self.notes[self.idx] = data.strip('\n')

In [3]:
# # Txt file reader (first line corresponds to the record number)
# def read_txt(file, header=True):
#     with open(file, 'r') as f:
#         rd = csv.reader(f)
#         if header:
#             rn = next(rd)[0]
#             out = ' '.join([r[0] for r in rd if len(r)>0])
#             return rn, out
#         else:
#             out = ' '.join([r[0] for r in rd if len(r)>0])
#             return out
def read_txt(file):
    """
    Txt file reader (first line corresponds to the record number if any)
    :param file: file name
    :return: note
    """
    f = open(file, mode='r', encoding='utf-8-sig')
    text = f.read()
    return text

In [4]:
# XML reader with CDATA sections
def readXML(file):
    tree = ET.parse(file)
    root = tree.getroot()
    for child in root:
        if child.tag == 'TEXT':
            return child.text

In [5]:
data_folder = './data'

# Notes from i2b2 challenges

## Smoking status (2006)

In [516]:
# Training and test sets with smoking status. xml format.
ss_train_file = data_folder + '/2006_smoking_status/smokers_surrogate_train_all_version2.xml'
ss_test_file = data_folder + '/2006_smoking_status/smokers_surrogate_test_all_groundtruth_version2.xml'

In [518]:
parser_smk_tr = ChHTMLParser()
with open(ss_train_file) as f:
    file = f.read()
parser_smk_tr.feed(file)
display(Markdown(f"**Training set notes**: {len(parser_smk_tr.notes)}"))

parser_smk_ts = ChHTMLParser()
with open(ss_test_file) as f:
    file = f.read()
parser_smk_ts.feed(file)
display(Markdown(f"**Test set notes**: {len(parser_smk_ts.notes)}"))

display(Markdown(f"**Total**: {len(parser_smk_tr.notes) + len(parser_smk_ts.notes)}"))

**Training set notes**: 398

**Test set notes**: 104

**Total**: 502

## Obesity and comorbidities (2008)

In [519]:
ob_train_file1 = data_folder + '/2008_obesity/obesity_patient_records_training.xml'
ob_train_file2 = data_folder + '/2008_obesity/obesity_patient_records_training2.xml'
ob_test_file = data_folder + '/2008_obesity/obesity_patient_records_test.xml'

In [520]:
parser_ob_tr = ChHTMLParser()
with open(ob_train_file1) as f:
    file1 = f.read()
with open(ob_train_file2) as f:
    file2 = f.read()
parser_ob_tr.feed(file1)
parser_ob_tr.feed(file2)
display(Markdown(f"**Training set notes**: {len(parser_ob_tr.notes)}"))

parser_ob_ts = ChHTMLParser()
with open(ob_test_file) as f:
    file = f.read()
parser_ob_ts.feed(file)
display(Markdown(f"**Test set notes**: {len(parser_ob_ts.notes)}"))

display(Markdown(f"**Total**: {len(parser_ob_tr.notes) + len(parser_ob_ts.notes)}"))

**Training set notes**: 730

**Test set notes**: 507

**Total**: 1237

## Medication extraction (2009)

In [47]:
med_tr_files = []
for ff in os.listdir(data_folder + '/2009_medication/training.sets.released'):
    for tf in os.listdir(os.path.join(data_folder + '/2009_medication/training.sets.released', ff)):
        if not re.match('^\.', tf):
            med_tr_files.append('/'.join([ff, tf]))

med_ts_files = [tf for tf in os.listdir(data_folder + '/2009_medication/train.test.released.8.17.09/') if tf not in [fftf.split('/')[1] for fftf in med_tr_files] and not re.match('^\.', tf)]

In [48]:
parser_med_tr = {tf.split('/')[1]: read_txt(os.path.join(data_folder + '/2009_medication/training.sets.released', tf)) for tf in med_tr_files}
parser_med_ts = {tf: read_txt(os.path.join(data_folder + '/2009_medication/train.test.released.8.17.09/', tf)) for tf in med_ts_files}

In [524]:
display(Markdown(f"**Training set notes**: {len(parser_med_tr)}"))
display(Markdown(f"**Test set notes**: {len(parser_med_ts)}"))
display(Markdown(f"**Total**: {len(parser_med_tr) + len(parser_med_ts)}"))

**Training set notes**: 696

**Test set notes**: 553

**Total**: 1249

**Remark**: Authors report 1,243 deidentified discharge summaries, <u>696 in the training</u> and <u>547 in the test set</u>$^1$.
1. Uzuner, O., Solti, I., & Cadag, E. (2010). Extracting medication information from clinical text. Journal of the American Medical Informatics Association : JAMIA, 17(5), 514–518. https://doi.org/10.1136/jamia.2010.003947

## Concept extraction, assertion and relation classification (2010)

In [526]:
cac_tr_files = []
for ff in os.listdir(data_folder + '/2010_relations/concept_assertion_relation_training_data'):
    for sff in os.listdir(os.path.join(data_folder + '/2010_relations/concept_assertion_relation_training_data', ff)):
        if sff == 'txt' or sff == 'unannotated':
            for tf in os.listdir(os.path.join(data_folder + '/2010_relations/concept_assertion_relation_training_data', ff, sff)):
                if not re.match('^\.', tf):
                    cac_tr_files.append('/'.join([ff, sff, tf]))
cac_ts_files = [tf for tf in os.listdir(data_folder + '/2010_relations/test_data') if not re.match('^\.', tf)]

In [527]:
parser_cac_tr = {}
for tr_path in cac_tr_files:
    text = read_txt(os.path.join(data_folder + '/2010_relations/concept_assertion_relation_training_data', tr_path))
    if re.match(r" *A[Dd][Mm][Ii][Ss][Ss][Ii][Oo][Nn] D[Aa][Tt][Ee] *:?", text):
        tr_file = tr_path.split('/')[2]
        idx = re.search(r'[0-9]+(_[a-z]{1})*', tr_file).group(0)
        parser_cac_tr[idx] = text
    else:
        idx = re.match(r'[0-9]+(_[a-z])*', text)
        parser_cac_tr[idx.group(0)] = text[idx.span()[1]:]

parser_cac_ts = {}
for ts_file in cac_ts_files:
    text = read_txt(os.path.join(data_folder + '/2010_relations/test_data', ts_file))
    if re.match(r" *A[Dd][Mm][Ii][Ss][Ss][Ii][Oo][Nn] D[Aa][Tt][Ee] *:?", text):
        idx = re.search(r'[0-9]+', ts_file).group(0)
        parser_cac_ts[idx] = text
    else:
        idx = re.match(r'[0-9]+(_[a-z])*', text)
        parser_cac_ts[idx.group(0)] = text[idx.span()[1]:]

In [528]:
display(Markdown(f"**Training set notes**: {len(parser_cac_tr)}"))
display(Markdown(f"**Test set notes**: {len(parser_cac_ts)}"))
display(Markdown(f"**Total**: {len(parser_cac_tr) + len(parser_cac_ts)}"))

**Training set notes**: 437

**Test set notes**: 255

**Total**: 692

**Remark**: authors report a total of <u>394 training reports</u>, <u>477 test reports</u>, and <u>877 unannotated reports</u>$^1$. One note in the test set is a duplicate, specifically _0017.txt_ and _0094.txt_ are the same (MRN: 139391631).

1. Uzuner, Ö., South, B. R., Shen, S., & DuVall, S. L. (2011). 2010 i2b2/VA challenge on concepts, assertions, and relations in clinical text. Journal of the American Medical Informatics Association : JAMIA, 18(5), 552–556. https://doi.org/10.1136/amiajnl-2011-000203

## Coreference resolution (2011)

In [529]:
cr_tr_files = []
for ff in os.listdir(data_folder + '/2011_coreference/i2b2_train/'):
    if not re.match('^\.', ff):
        for sff in os.listdir(os.path.join(data_folder + '/2011_coreference/i2b2_train', ff)):
            if sff == 'docs':
                for tf in os.listdir(os.path.join(data_folder + '/2011_coreference/i2b2_train', ff, sff)):
                    if not re.match('^\.', tf):
                        cr_tr_files.append('/'.join([ff, sff, tf]))
cr_ts_files = []
for ff in os.listdir(data_folder + '/2011_coreference/i2b2_Test/'):
    for sff in os.listdir(os.path.join(data_folder + '/2011_coreference/i2b2_Test', ff)):
        if sff == 'docs':
            for tf in os.listdir(os.path.join(data_folder + '/2011_coreference/i2b2_Test', ff, sff)):
                if not re.match('^\.', tf):
                    cr_ts_files.append('/'.join([ff, sff, tf]))

In [532]:
parser_cr_tr = {}
for tr_path in cr_tr_files:
    text = read_txt(os.path.join(data_folder + '/2011_coreference/i2b2_train', tr_path))
    if re.match(r" *A[Dd][Mm][Ii][Ss][Ss][Ii][Oo][Nn] D[Aa][Tt][Ee] *:?", text):
        tr_file = tr_path.split('/')[2]
        idx = re.search(r'[0-9]+', tr_file).group(0)
        parser_cr_tr[idx] = text
    else:
        idx = re.match(r'[0-9]+(_[a-z])*', text)
        parser_cr_tr[idx.group(0)] = text[idx.span()[1]:]
        
parser_cr_ts = {}
for ts_path in cr_ts_files:
    text = read_txt(os.path.join(data_folder + '/2011_coreference/i2b2_Test', ts_path))
    if re.match(r" *A[Dd][Mm][Ii][Ss][Ss][Ii][Oo][Nn] D[Aa][Tt][Ee] *:?", text):
        ts_file = ts_path.split('/')[2]
        idx = re.search(r'[0-9]+', ts_file).group(0)
        parser_cr_ts[idx] = text
    else:
        idx = re.match(r'[0-9]+(_[a-z])*', text)
        parser_cr_ts[idx.group(0)] = text[idx.span()[1]:]

In [533]:
display(Markdown(f"**Training set notes**: {len(parser_cr_tr)}"))
display(Markdown(f"**Test set notes**: {len(parser_cr_ts)}"))
display(Markdown(f"**Total**: {len(parser_cr_tr) + len(parser_cr_ts)}"))

**Training set notes**: 251

**Test set notes**: 172

**Total**: 423

In [573]:
# Check if duplicated from 2010
old_dt = [v.strip('\n') for v in parser_cac_tr.values()] + [v.strip('\n') for v in parser_cac_ts.values()]

count_tr = 0
count_ts = 0
for idx, note in parser_cr_tr.items():
    if note.strip('\n') in old_dt:
        count_tr += 1
for idx, note in parser_cr_ts.items():
    if note.strip('\n') in old_dt:
        count_ts += 1

In [574]:
display(Markdown(f"**Notes from 2010 challenge**: Tr: {count_tr}, Ts: {count_ts}"))

**Notes from 2010 challenge**: Tr: 251, Ts: 171

One note is missing from the test set because the duplicated version (MRN: 139391631) differs by the presence of string "[ report_end ]" and the 2010 dataset version is imported without.

**Remark**: the notes included in this challenge are a subset of those from 2010 challenge. In the paper <u>590 notes are included in the training set</u> and <u>388 in the testing set</u> but only the _Beth_ (N=194) and _Partners_ (N=230) datasets are available and the number of notes are consistent with those reported in the paper$^1$. As in the 2010 dataset a note is duplicated (same MRN: 139391631), see files _clinical-147.txt_ and _clinical-91.txt_ in the test set.

1. Uzuner O, Bodnari A, Shen S, Forbush T, Pestian J, South BR. Evaluating the state of the art in coreference resolution for electronic medical records. J Am Med Inform Assoc. 2012 Sep-Oct;19(5):786-91. doi: 10.1136/amiajnl-2011-000784. Epub 2012 Feb 24. PMID: 22366294; PMCID: PMC3422835.

## Temporal relations (2012)

In [210]:
tr_tr_files = []
tr_ts_files = []
for ff in os.listdir(data_folder + '/2012_temporal_relations/'):
    if re.search('release-fix', ff):
        for tf in os.listdir(os.path.join(data_folder + '/2012_temporal_relations', ff)):
            if re.search('.txt', tf):
                tr_tr_files.append('/'.join([ff, tf]))
    elif re.search('test-data-release', ff):
        for tf in os.listdir(os.path.join(data_folder + '/2012_temporal_relations', ff, 'txt')):
            if not re.match('^\.', tf):
                        tr_ts_files.append('/'.join([ff, 'txt', tf]))

In [575]:
parser_tr_tr = {}
for tr_path in tr_tr_files:
    text = read_txt(os.path.join(data_folder + '/2012_temporal_relations', tr_path))
    tr_file = tr_path.split('/')[1]
    tr_span = re.search(r'[0-9]+', tr_file).span()
    idx = tr_file[tr_span[0]:tr_span[1]]
    parser_tr_tr[idx] = text
parser_tr_ts = {}
for ts_path in tr_ts_files:
    text = read_txt(os.path.join(data_folder + '/2012_temporal_relations', ts_path))
    ts_file = ts_path.split('/')[2]
    ts_span = re.search(r'[0-9]+', ts_file).span()
    idx = ts_file[ts_span[0]:ts_span[1]]
    parser_tr_ts[idx] = text

In [576]:
display(Markdown(f"**Training set notes**: {len(parser_tr_tr)}"))
display(Markdown(f"**Test set notes**: {len(parser_tr_ts)}"))
display(Markdown(f"**Total**: {len(parser_tr_tr) + len(parser_tr_ts)}"))

**Training set notes**: 190

**Test set notes**: 120

**Total**: 310

In [577]:
# Check if duplicated from 2010
old_dt = [v.strip('\n').lower() for v in parser_cac_tr.values()] + [v.strip('\n').lower() for v in parser_cac_ts.values()]

count_tr = 0
count_ts = 0
for note in parser_tr_tr.values():
    if note.strip('\n').lower() in old_dt:
        count_tr += 1
for note in parser_tr_ts.values():
    if note.strip('\n').lower() in old_dt:
        count_ts += 1

In [578]:
display(Markdown(f"**Notes from 2010 challenge**: Tr: {count_tr}, Ts: {count_ts}"))

**Notes from 2010 challenge**: Tr: 0, Ts: 0

## Longitudinal annotated dataset (2014)

In [579]:
long_tr_files = [tf for tf in os.listdir(data_folder + '/2014_heart_disease/training-RiskFactors-Complete-Set1/') if not re.match(r'^\.', tf)]
for tf in os.listdir(data_folder + '/2014_heart_disease/training-RiskFactors-Complete-Set2/'):
    if not re.match(r'^\.', tf):
        long_tr_files.append(tf)
long_ts_files = [tf for tf in os.listdir(data_folder + '/2014_heart_disease/testing-RiskFactors-Complete/') if not re.match(r'^\.', tf)]

In [580]:
parser_long_tr = {}
for tr_file in sorted(long_tr_files):
    try:
        text = readXML(os.path.join(data_folder + '/2014_heart_disease', 'training-RiskFactors-Complete-Set1', tr_file))
    except FileNotFoundError:
        text = readXML(os.path.join(data_folder + '/2014_heart_disease', 'training-RiskFactors-Complete-Set2', tr_file))
    span = re.search(r'[0-9]+-[0-9]+', tr_file).span()
    idxt = tr_file[span[0]:span[1]].split('-')
    idx = idxt[0]
    n = idxt[1]
    if idx in parser_long_tr:
        parser_long_tr[idx][n] = text
    else:
        parser_long_tr[idx] = {n: text}

parser_long_ts = {}
for ts_file in sorted(long_ts_files):
    text = readXML(os.path.join(data_folder + '/2014_heart_disease/testing-RiskFactors-Complete/', ts_file))
    span = re.search(r'[0-9]+-[0-9]+', ts_file).span()
    idxt = ts_file[span[0]:span[1]].split('-')
    idx = idxt[0]
    n = idxt[1]
    if idx in parser_long_ts:
        parser_long_ts[idx][n] = text
    else:
        parser_long_ts[idx] = {n: text}

In [581]:
display(Markdown(f"**Training set**: **patients** {len(parser_long_tr)}; **notes** {sum([len(val) for val in parser_long_tr.values()])}"))
display(Markdown(f"**Test set**: **patients** {len(parser_long_ts)}; **notes** {sum([len(val) for val in parser_long_ts.values()])}"))
display(Markdown(f"**Total**: **patients** {len(parser_long_tr) + len(parser_long_ts)}; **notes**: {sum([len(val) for val in parser_long_tr.values()]) + sum([len(val) for val in parser_long_ts.values()])}"))

**Training set**: **patients** 178; **notes** 790

**Test set**: **patients** 118; **notes** 514

**Total**: **patients** 296; **notes**: 1304

## Cohort selection (2018 - Task 1)

In [582]:
parser_cs_tr = {}
for tf in os.listdir(data_folder + '/2018_cohort_selection/train/'):
    if not re.match(r'\.', tf):
        text = readXML(os.path.join(data_folder + '/2018_cohort_selection/train/', tf))
        parser_cs_tr[tf.split('.')[0]] = text

parser_cs_ts = {}
for tf in os.listdir(data_folder + '/2018_cohort_selection/test_notags/'):
    if not re.match(r'\.', tf):
        text = readXML(os.path.join(data_folder + '/2018_cohort_selection/test_notags/', tf))
        parser_cs_ts[tf.split('.')[0]] = text

In [583]:
display(Markdown(f"**Training set notes**: {len(parser_cs_tr)}"))
display(Markdown(f"**Test set notes**: {len(parser_cs_ts)}"))
display(Markdown(f"**Total**: {len(parser_cs_tr) + len(parser_cs_ts)}"))

**Training set notes**: 202

**Test set notes**: 86

**Total**: 288

**Remark**: Notes for this task are taken from the 2014 longitudinal dataset. On a first impression it looks like each note is obtained concatenating longitudinal notes for each patient. Hereafter we investigate if that is the case.

In [584]:
#Training
count_same = []
count_diff = []
for idx in parser_cs_tr.keys():
    try:
        count_same.append(len(re.findall(r'Record date: [0-9]{4}-[0-9]{2}-[0-9]{2}', parser_cs_tr[idx])) - len(parser_long_tr[idx]))
    except KeyError:
        count_diff.append(len(re.findall(r'Record date: [0-9]{4}-[0-9]{2}-[0-9]{2}', parser_cs_tr[idx])) - len(parser_long_ts[idx]))
        
display(Markdown(f"Training/Training perfect concatenation: {len([c for c in count_same if c==0])}"))
display(Markdown(f"Training/Training concatenation with one note less: {len([c for c in count_same if c==-1])}"))
display(Markdown(f"Training/Testing perfect concatenation: {len([c for c in count_diff if c==0])}"))
display(Markdown(f"**Total training count**: {len([c for c in count_same if c==0]) + len([c for c in count_same if c==-1]) + len([c for c in count_diff if c==0])}"))

Training/Training perfect concatenation: 125

Training/Training concatenation with one note less: 3

Training/Testing perfect concatenation: 74

**Total training count**: 202

In [585]:
#Testing
count_same = []
count_diff = []
for idx in parser_cs_ts.keys():
    try:
        count_same.append(len(re.findall(r'Record date: [0-9]{4}-[0-9]{2}-[0-9]{2}', parser_cs_ts[idx])) - len(parser_long_ts[idx]))
    except KeyError:
        count_diff.append(len(re.findall(r'Record date: [0-9]{4}-[0-9]{2}-[0-9]{2}', parser_cs_ts[idx])) - len(parser_long_tr[idx]))
        
display(Markdown(f"Testing/Testing perfect concatenation: {len([c for c in count_same if c==0])}"))
display(Markdown(f"Testing/Training perfect concatenation: {len([c for c in count_diff if c==0])}"))
display(Markdown(f"**Total training count**: {len([c for c in count_same if c==0]) + len([c for c in count_diff if c==0])}"))

Testing/Testing perfect concatenation: 39

Testing/Training perfect concatenation: 47

**Total training count**: 86

## Medication extraction (2018 - Task 2)

In [6]:
parser_me_tr = {}
for tf in os.listdir(data_folder + '/2018_medication_extraction/training_20180910/'):
    if re.search('.txt', tf):
        text = read_txt(os.path.join(data_folder + '/2018_medication_extraction/training_20180910', tf))
        idx = tf.split('.')[0]
        parser_me_tr[idx] = text
parser_me_ts = {}
for tf in os.listdir(data_folder + '/2018_medication_extraction/test/'):
    if re.search('.txt', tf):
        text = read_txt(os.path.join(data_folder + '/2018_medication_extraction/test', tf))
        idx = tf.split('.')[0]
        parser_me_ts[idx] = text

In [587]:
display(Markdown(f"**Training set notes**: {len(parser_me_tr)}"))
display(Markdown(f"**Test set notes**: {len(parser_me_ts)}"))
display(Markdown(f"**Total**: {len(parser_me_tr) + len(parser_me_ts)}"))

**Training set notes**: 303

**Test set notes**: 202

**Total**: 505

# Create unique dataset for language modeling

In [588]:
tr_vect = [parser_smk_tr.notes, 
           parser_ob_tr.notes, 
           parser_med_tr, 
           parser_cac_tr, 
           parser_tr_tr, 
           {k: ' '.join([n for n in notes.values()]) for k, notes in parser_long_tr.items()}, 
           parser_me_tr]

dt_tr = {}

for chk, tr in enumerate(tr_vect):
    for k, note in tr.items():
        if k in dt_tr:
            i = len([idx for idx in dt_tr.keys() if re.match(k, idx)])
            dt_tr['_'.join([k, str(i)])] = note
        else:
            dt_tr[k] = note
            
display(Markdown(f"Training Notes: {len(dt_tr)} -- Notes from datasets: {sum([len(dd) for dd in tr_vect])}"))

Training Notes: 2932 -- Notes from datasets: 2932

In [589]:
ts_vect = [parser_smk_ts.notes, 
           parser_ob_ts.notes, 
           parser_med_ts, 
           parser_cac_ts, 
           parser_tr_ts, 
           {k: ' '.join([n for n in notes.values()]) for k, notes in parser_long_ts.items()}, 
           parser_me_ts]

dt_ts = {}

for chk, ts in enumerate(ts_vect):
    for k, note in ts.items():
        if k in dt_ts:
            i = len([idx for idx in dt_ts.keys() if re.match(k, idx)])
            dt_ts['_'.join([k, str(i)])] = note
        else:
            dt_ts[k] = note
            
display(Markdown(f"Testing Notes: {len(dt_ts)} -- Notes from datasets: {sum([len(dd) for dd in ts_vect])}"))

Testing Notes: 1859 -- Notes from datasets: 1859

In [40]:
notes = pkl.load(open('./data/train_n2c2_datasets_preprocessed.pkl', 'rb'))
for n in notes:
    fa = re.findall(' us ?\. ', n[1])
    if len(fa) > 0:
#         print(n)
#         print(re.sub(r'\t+', ' ', n[1]))
#         print(re.findall(' [a-z]{1,3}\. ', n[1]))
#         print(re.findall(' [a-z]{1,3}\. ?[a-z]+\. ', n[1]))
        print(n[1][re.search(' us ?\. ', n[1]).span()[0]-100:re.search(' us ?\. ', n[1]).span()[0]+100])

e . however she had area of cancer which will required surgical resection . thus she was referred to us. in order to undergo aortic balloon valvuloplasty as a bridge to her larynx surgery . allergies 
oking no etoh pt lives alone in die he was born raised in lo and flew planes in wwii after coming to us. in 1950's he worked as radiology tech @ sun de ed course afebrile vss exam notable for firm dis
. - gyn ct pelvis did show r adnexal cyst on prelim read which will need further characterization by us. and outpt follow up. - cv she has extensive cardiac history as above. her fall is not likely re
y of significant coronary artery disease and other medical problems as stated above who presented to us. in the emergency department complaining of chest pain. the patient was admitted recently on jan
e mental status changes but these studies were deferred given the lack of interventions available to us. given the patient's decreased functional status. we did however obtained a transthoracic ec

In [37]:
notes = pkl.load(open('./data/test_n2c2_datasets_preprocessed.pkl', 'rb'))
for n in notes:
    fa = re.findall(' nt\. bs\. ', n[1])
    if len(fa) > 0:
#         print(n)
#         print(re.sub(r'\t+', ' ', n[1]))
#         print(re.findall(' [a-z]{1,3}\. ', n[1]))
#         print(re.findall(' [a-z]{1,3}\. ?[a-z]+\. ', n[1]))
        print(n[1][re.search(' nt\. bs\. ', n[1]).span()[0]-100:re.search(' nt\. bs\. ', n[1]).span()[0]+100])

In [749]:
prova = "279::03".split('::')
a, b = prova[0], prova[1]
print(a, b)

279 03


In [798]:
from collections import namedtuple

wn_redundancy = namedtuple('wn_redundancy', ['note_id', 'nr_score', 'counts', 'challenge'])
bp_redundancy = namedtuple('bp_redundancy', ['sen_A', 'sen_B', 'align_A', 'align_B', 'align_score', 'ovrlp_score'])

notes = pkl.load(open(os.path.join(ut.data_folder, 'train_bp_redundancy.pkl'), 'rb'))

In [799]:
notes

[bp_redundancy(sen_A='hypertension', sen_B='abdomen is soft nontender positive bowel sounds', align_A=['hypertension', '-', '-', '-', '-', '-', '-', '-'], align_B=['-', 'abdomen', 'is', 'soft', 'nontender', 'positive', 'bowel', 'sounds'], align_score=-1.6, ovrlp_score=0.0),
 bp_redundancy(sen_A='hypertension', sen_B='lower extremities no edema nontender', align_A=['hypertension', '-', '-', '-', '-', '-'], align_B=['-', 'lower', 'extremities', 'no', 'edema', 'nontender'], align_score=-1.4, ovrlp_score=0.0),
 bp_redundancy(sen_A='hypertension', sen_B='briefly 70 yo f w htn dm2 stable angina who is transferred to nsh on 4/3/89 with aortic valve endocarditis complicated by ai and chf', align_A=['hypertension', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-'], align_B=['-', 'briefly', '70', 'yo', 'f', 'w', 'htn', 'dm2', 'stable', 'angina', 'who', 'is', 'transferred', 'to', 'nsh', 'on', '4/3/89', 'with', 'aortic', 'v

In [787]:
with open('./data/test_n2c2_datasets_preprocessed.csv', 'w') as f:
    wr = csv.writer(f, delimiter='|')
    wr.writerow(["ID", "TEXT"])
    for el in notes:
        wr.writerow(list(el))

In [791]:
notes[0].challenge

'long'

In [800]:
import spacy
nlp = spacy.load('en_core_sci_md', disable=['ner'])

In [826]:
prova = "ciao come stai. io sto bene grazie"

In [827]:
doc = nlp(prova)

In [828]:
sentences = [str(s).strip('\.') for s in doc.sents]
tokens = [[t.text for t in s if t.text != '.'] for s in doc.sents]
tokens
sentences

['ciao come stai', 'io sto bene grazie']

In [830]:
tokens[0]

['ciao', 'come', 'stai']

In [831]:
import itertools

list(itertools.chain.from_iterable([[1,2], [3,4]]))

[1, 2, 3, 4]

In [858]:
def iterative_levenshtein(s, t, **weight_dict):
    """ 
        iterative_levenshtein(s, t) -> ldist
        ldist is the Levenshtein distance between the strings 
        s and t.
        For all i and j, dist[i,j] will contain the Levenshtein 
        distance between the first i characters of s and the 
        first j characters of t
        
        weight_dict: keyword parameters setting the costs for characters,
                     the default value for a character will be 1
    """

    rows = len(s)+1
    cols = len(t)+1
    
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    w = dict( (x, (1, 1, 1)) for x in alphabet + alphabet.upper())
    if weight_dict:
        w.update(weight_dict)
    
    dist = [[0 for x in range(cols)] for x in range(rows)]
    print(dist)
    print('\n')

    # source prefixes can be transformed into empty strings 
    # by deletions:
    for row in range(1, rows):
        dist[row][0] = dist[row-1][0] + w[s[row-1]][0]
    print(dist)
    print('\n')
    
    # target prefixes can be created from an empty source string
    # by inserting the characters
    for col in range(1, cols):
        dist[0][col] = dist[0][col-1] + w[t[col-1]][1]
    print(dist)
    print('\n')
    
    for col in range(1, cols):
        for row in range(1, rows):
            deletes = w[s[row-1]][0]
            inserts = w[t[col-1]][1]
            subs = max( (w[s[row-1]][2], w[t[col-1]][2]))
            if s[row-1] == t[col-1]:
                subs = 0
            else:
                subs = subs

            dist[row][col] = min(dist[row-1][col] + deletes,
                                 dist[row][col-1] + inserts,
                                 dist[row-1][col-1] + subs) # substitution

#     for r in range(rows):
#         print(dist[r])
    
 
    return dist[row][col]

In [859]:
print(iterative_levenshtein("abx", 
                            "xya") )

[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]


[[0, 0, 0, 0], [1, 0, 0, 0], [2, 0, 0, 0], [3, 0, 0, 0]]


[[0, 1, 2, 3], [1, 0, 0, 0], [2, 0, 0, 0], [3, 0, 0, 0]]


3


In [860]:
def levenshtein_sen(sen1, sen2):

    rows = len(sen1) + 1
    cols = len(sen2) + 1

    dist = np.zeros((rows, cols))
    dist[:, 0] = range(rows)
    dist[0, :] = range(cols)

    for col in range(1, cols):
        for row in range(1, rows):
            if sen1[row - 1] == sen2[col - 1]:
                cost = 0
            else:
                cost = 1
            dist[row, col] = min(dist[row - 1, col] + 1,
                                 dist[row, col - 1] + 1,
                                 dist[row - 1, col - 1] + cost)
    return dist[-1, -1]

In [861]:
levenshtein_sen("abx", "xya")

3.0

In [873]:
bp_notes = pkl.load(open('./data/test_n2c2_datasets_sentences_preprocessed.pkl', 'rb'))

In [866]:
prova = 'ciao ciao .'
prova.strip(r' ?\.? ?')

'ciao ciao'

In [876]:
def levenshtein_sen(sen1, sen2):
    """
    Compute Levenshtein distance between two sentences
    :param sen1: list of tokens (words)
    :param sen2: list of tokens (words)
    :return: edit distance (Levenshtein)
    """
    rows = len(sen1) + 1
    cols = len(sen2) + 1

    dist = np.zeros((rows, cols))
    dist[:, 0] = range(rows)
    dist[0, :] = range(cols)

    for col, row in itertools.product(range(1, cols), range(1, rows)):
        if sen1[row - 1] == sen2[col - 1]:
            cost = 0
        else:
            cost = 1
        dist[row, col] = min(dist[row - 1, col] + 1,
                             dist[row, col - 1] + 1,
                             dist[row - 1, col - 1] + cost)
    return dist[-1, -1]

In [877]:
levenshtein_sen(["ciao", "come", "stai"], ["tutto", "ok"])

3.0