Clean before dumping into Mongo

In [1]:
import json
from glob import glob
from pathlib import Path

import dask.bag as db
import pandas as pd
from tqdm.notebook import tqdm

from functions.data import parse_text, raw_text_to_dict

In [2]:
# Notebook-wide settings.
save = True                  # when True, the dump cells below (re)write their JSON files under data/
data_dir = 'data/downloads'  # the raw notes are read from <data_dir>/data/

# Get Data

In [3]:
# Load the clinical-domain labels for the training files.
# The CSV is read as headerless, so both column names are supplied here.
train_labels_df = pd.read_csv(
    "data/downloads/trainLabels.csv",
    header=None,
    names=['file', 'clinical_domain'],
)

# Lookup table: file name -> clinical domain.
file_to_label = {
    name: domain
    for name, domain in zip(train_labels_df['file'], train_labels_df['clinical_domain'])
}

# Training Set

In [4]:
# Warning! Overwrites data/train.json if it already exists.
# Each note is parsed and written straight to disk, so only the record currently being
# processed is held in memory.
# The result is a single-line JSON array: '[' + records joined by ', ' + ']'.

train_file = 'train.json'

if save:
    # A single handle for the whole run; mode 'w' truncates any previous file.
    with open(f'data/{train_file}', 'w') as out:
        out.write('[')

        for position, file in enumerate(tqdm(train_labels_df['file'])):
            filename = file.replace('.txt', '')
            with open(f'{data_dir}/data/{file}', 'r', errors='replace') as f:
                text = f.readlines()

            parsed_text = parse_text(text)
            text_dict = dict([raw_text_to_dict(line) for line in parsed_text])

            # add labels
            text_dict['index_'] = filename
            # `file` is the key exactly as it appears in the labels CSV
            text_dict['clinical_domain'] = file_to_label.get(file)

            # Separator goes before every record except the first one.
            if position:
                out.write(', ')  # add sep
            json.dump(text_dict, out)

        out.write(']')

print("Complete")

HBox(children=(FloatProgress(value=0.0, max=826.0), HTML(value='')))


Complete


In [5]:
# Reload data/train.json in full and parse it as one JSON document.
with open('data/train.json') as f:
    raw_data = f.read()
data = json.loads(raw_data)
data[:2]  # first two records

[{'CC': 'Difficulty with word finding.',
  'HX': 'This 27y/o RHF experienced sudden onset word finding difficulty and slurred speech on the evening of 2/19/96. She denied any associated dysphagia, diplopia, numbness or weakness of her extremities. She went to sleep with her symptoms on 2/19/96, and awoke with them on 2/20/96. She also awoke with a headache (HA) and mild neck stiffness. She took a shower and her HA and neck stiffness resolved. Throughout the day she continued to have difficulty with word finding and had worsening of her slurred speech. That evening, she began to experience numbness and weakness in the lower right face. She felt like there was a &quot;rubber-band&quot; wrapped around her tongue. For 3 weeks prior to presentation, she experienced transient episodes of a &quot;boomerang&quot; shaped field cut in the left eye. The episodes were not associated with any other symptoms. One week prior to presentation, she went to a local ER for menorrhagia. She had just resume

# Test Set

In [6]:
# Warning! Overwrites data/test.json if it already exists.
# Each note is parsed and written straight to disk, so only the record currently being
# processed is held in memory.
# The result is a single-line JSON array: '[' + records joined by ', ' + ']'.

# Bare file names of every .txt note in the download directory, in sorted order.
# Path(...).name strips the directory whatever path separator glob returns.
files = [Path(filepath).name for filepath in sorted(glob(f'{data_dir}/data/*.txt'))]

# Keep only the test set: names from '1827.txt' upward.
# NOTE: this is a string (lexicographic) comparison; it agrees with numeric order only
# while every file stem has the same number of digits.
files = [name for name in files if name >= '1827.txt']
test_file = 'test.json'

if save:
    # A single handle for the whole run; mode 'w' truncates any previous file.
    with open(f'data/{test_file}', 'w') as out:
        out.write('[')

        for position, file in enumerate(tqdm(files)):
            filename = file.replace('.txt', '')
            with open(f'{data_dir}/data/{file}', 'r', errors='replace') as f:
                text = f.readlines()

            parsed_text = parse_text(text)
            text_dict = dict([raw_text_to_dict(line) for line in parsed_text])

            # add labels (no clinical_domain available)
            text_dict['index_'] = filename

            # Separator goes before every record except the first one.
            if position:
                out.write(', ')  # add sep
            json.dump(text_dict, out)

        out.write(']')

print("Complete")

HBox(children=(FloatProgress(value=0.0, max=413.0), HTML(value='')))


Complete


In [4]:
# Reload data/test.json in full and parse it as one JSON document.
with open('data/test.json') as f:
    raw_data = f.read()
data = json.loads(raw_data)
data[:2]  # first two records

[{'PREOPERATIVE DIAGNOSIS': 'Anemia.',
  'PROCEDURE': 'Upper gastrointestinal endoscopy.',
  'POSTOPERATIVE DIAGNOSES': '1.  Severe duodenitis. 2.  Gastroesophageal junction small ulceration seen. 3.  No major bleeding seen in the stomach.',
  'PROCEDURE IN DETAIL': 'The patient was put in left lateral position.  Olympus scope was inserted from the mouth, under direct visualization advanced to the upper part of the stomach, upper part of esophagus, middle of esophagus, GE junction, and some intermittent bleeding was seen at the GE junction.  Advanced into the upper part of the stomach into the antrum.  The duodenum showed extreme duodenitis and the scope was then brought back.  Retroflexion was performed, which was normal.  Scope was then brought back slowly.  Duodenitis was seen and a little bit of ulceration seen at GE junction.',
  'FINDING': 'Severe duodenitis, may be some source of bleeding from there, but no active bleeding at this time.',
  'index_': '1827'},
 {'CC': 'Lethargy.'

# Check

In [5]:
# Read data/train.json through dask and parse each line as JSON.
# NOTE: db.read_text yields one element per line, but the cell that writes train.json
# emits no newlines (it is a single-line JSON array). The bag should therefore hold one
# element -- the whole parsed list -- so take(2) returns that list wrapped in a tuple
# rather than the first two records.
text_full_bag = (
    db.read_text("data/train.json")
    .map(json.loads)
)
text_tuple = text_full_bag.take(2)
text_tuple

([{'CC': 'Difficulty with word finding.',
   'HX': 'This 27y/o RHF experienced sudden onset word finding difficulty and slurred speech on the evening of 2/19/96. She denied any associated dysphagia, diplopia, numbness or weakness of her extremities. She went to sleep with her symptoms on 2/19/96, and awoke with them on 2/20/96. She also awoke with a headache (HA) and mild neck stiffness. She took a shower and her HA and neck stiffness resolved. Throughout the day she continued to have difficulty with word finding and had worsening of her slurred speech. That evening, she began to experience numbness and weakness in the lower right face. She felt like there was a &quot;rubber-band&quot; wrapped around her tongue. For 3 weeks prior to presentation, she experienced transient episodes of a &quot;boomerang&quot; shaped field cut in the left eye. The episodes were not associated with any other symptoms. One week prior to presentation, she went to a local ER for menorrhagia. She had just resu

In [6]:
# Read data/test.json through dask and parse each line as JSON.
# NOTE: db.read_text yields one element per line, but the cell that writes test.json
# emits no newlines (it is a single-line JSON array). The bag should therefore hold one
# element -- the whole parsed list -- so take(2) returns that list wrapped in a tuple
# rather than the first two records.
text_full_bag = (
    db.read_text("data/test.json")
    .map(json.loads)
)
text_tuple = text_full_bag.take(2)
text_tuple

([{'PREOPERATIVE DIAGNOSIS': 'Anemia.',
   'PROCEDURE': 'Upper gastrointestinal endoscopy.',
   'POSTOPERATIVE DIAGNOSES': '1.  Severe duodenitis. 2.  Gastroesophageal junction small ulceration seen. 3.  No major bleeding seen in the stomach.',
   'PROCEDURE IN DETAIL': 'The patient was put in left lateral position.  Olympus scope was inserted from the mouth, under direct visualization advanced to the upper part of the stomach, upper part of esophagus, middle of esophagus, GE junction, and some intermittent bleeding was seen at the GE junction.  Advanced into the upper part of the stomach into the antrum.  The duodenum showed extreme duodenitis and the scope was then brought back.  Retroflexion was performed, which was normal.  Scope was then brought back slowly.  Duodenitis was seen and a little bit of ulceration seen at GE junction.',
   'FINDING': 'Severe duodenitis, may be some source of bleeding from there, but no active bleeding at this time.',
   'index_': '1827'},
  {'CC': 'Let