In [1]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import tarfile
import zipfile
import random
import os
import shutil
import re
tqdm.pandas()

## Goal
Filter notes to only include those with Brain MRI or CT Head
- Search through notes with regex for keywords 
    - If any are there, keep it, else, drop it

In [11]:
admission_notes_df = pd.read_csv('2_admission_notes_df.csv')
admission_notes_df.head()

Unnamed: 0,BDSPPatientID,AdmissionDate,ICD,NoteDate,NoteTitle
0,150000004,2012-11-05,430,2012-11-05,Notes_1129858847_225334705_20121105.txt
1,150000004,2012-11-05,430,2012-11-21,Notes_1129858847_903347020_20121121.txt
2,150000004,2012-11-05,430,2012-11-21,Notes_1129858847_903347022_20121121.txt
3,150000004,2012-11-05,430,2012-11-25,Notes_1129858847_903347025_20121125.txt
4,150000004,2012-11-05,430,2012-11-28,Notes_1129858847_903347027_20121128.txt


In [12]:
# RANDOMIZE order of df, 25 seed for reproducability
admission_notes_df = admission_notes_df.sample(frac=1, random_state=25).reset_index(drop=True)
admission_notes_df.head()

Unnamed: 0,BDSPPatientID,AdmissionDate,ICD,NoteDate,NoteTitle
0,150022059,2021-02-28,I60.9,2021-03-01,Notes_1129881170_2470102250_20210301.txt
1,151153367,2013-06-08,430,2013-06-08,Notes_1131012445_413930272_20130608.txt
2,150828106,2017-03-23,I60.9,2017-04-17,Notes_1130686598_332462314_20170417.txt
3,150018559,2020-05-10,I60.8,2020-05-12,Notes_1129877493_3385243575_20200512.txt
4,151330933,2020-06-06,I60.32,2020-06-18,Notes_1131189610_3490844981_20200618.txt


In [17]:
def convert_tar_to_zip(tar_path, zip_path):
    with tarfile.open(tar_path, 'r') as tar:
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for member in tqdm(tar.getmembers()):
                file_obj = tar.extractfile(member)
                if file_obj is not None:
                    zipf.writestr(member.name, file_obj.read())

# This will extract the year from the file name
def extract_year(file_name):
    match = re.search(r'_(\d{8})\.txt$', file_name)
    if match:
        return match.group(1)[:4]
    return None

def read_file_from_zip(base_path, file_name):
    year = extract_year(file_name)
    if year:
        zip_path = f'{base_path}/bidmc_notes_{year}.zip'
        try:
            with zipfile.ZipFile(zip_path, 'r') as zipf:
                with zipf.open(f'{year}/{file_name}') as file:
                    content = file.read()
                    return content.decode('utf-8', errors = 'ignore')
        except (KeyError, FileNotFoundError):
            return None
    return None



In [6]:
year = 2010
for i in tqdm(range(0, 15)):
    tar_file_path = f'/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/BIDMC/BIDMC_Deidentified_Notes_March14th2024/bidmc_notes_{year + i}.tar'
    zip_file_path = f'/home/jsearle/bigDrive/NAX/zipNotes/BIDMC/bidmc_notes_{year + i}.zip'
    convert_tar_to_zip(tar_file_path, zip_file_path)

100%|██████████| 317737/317737 [00:19<00:00, 16051.40it/s]
100%|██████████| 990519/990519 [01:02<00:00, 15831.22it/s]
100%|██████████| 1389600/1389600 [01:30<00:00, 15391.17it/s]
100%|██████████| 1477627/1477627 [01:50<00:00, 13324.20it/s]
100%|██████████| 1587770/1587770 [01:50<00:00, 14309.74it/s]
100%|██████████| 1684960/1684960 [02:05<00:00, 13458.68it/s]
100%|██████████| 1791824/1791824 [02:20<00:00, 12714.67it/s]
100%|██████████| 1927996/1927996 [02:36<00:00, 12298.14it/s]
100%|██████████| 2157142/2157142 [03:06<00:00, 11547.08it/s]
100%|██████████| 2313867/2313867 [03:17<00:00, 11729.14it/s]
100%|██████████| 2443733/2443733 [03:39<00:00, 11117.47it/s]
100%|██████████| 2497818/2497818 [03:34<00:00, 11626.06it/s]
100%|██████████| 2486341/2486341 [03:33<00:00, 11620.09it/s]
100%|██████████| 1764965/1764965 [02:20<00:00, 12526.77it/s]
100%|██████████| 549695/549695 [00:37<00:00, 14492.05it/s]
100%|██████████| 15/15 [57:48<00:00, 231.21s/it]


In [18]:
base_path = '/home/jsearle/bigDrive/NAX/zipNotes/BIDMC'

pattern1 = re.compile(r'(MRI|MRA|MR ANGIO|CT|CTA|CT ANGIO)\s*(of)*\s*(the)*\s*(brw|brain|brn|neck|head|hed)', re.IGNORECASE)
pattern2 = re.compile(r'(brw|brain|brn|neck|head|hed)\s*(of)*\s*(the)*\s*(MRI|MRA|MR ANGIO|CT|CTA|CT ANGIO)', re.IGNORECASE)

def read_and_filter(note_title, base_path):
    note_text = read_file_from_zip(base_path, note_title)
    if note_text and (pattern1.search(note_text) or pattern2.search(note_text)):
        return note_text
    else:
        return None


chunk_size = 3000
start = 0
end = min(start + chunk_size, len(admission_notes_df))
chunk = admission_notes_df.iloc[start:end].copy()


# Apply the filtering function
chunk['text'] = chunk['NoteTitle'].progress_apply(lambda x: read_and_filter(x, base_path))

# Drop rows where 'text' is None
filtered_chunk = chunk.dropna(subset=['text'])

filtered_chunk.head()
print(len(filtered_chunk))

100%|██████████| 3000/3000 [9:41:13<00:00, 11.62s/it]  

1516





In [1]:
filtered_chunk.to_csv('3_cpt_filtered_chunk.csv', index=False, header=True, sep=',', na_rep='NA')

NameError: name 'filtered_chunk' is not defined