In [2]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import tarfile
import zipfile
import random
import os
import shutil
import re
tqdm.pandas()

# Goal of this notebook:
 - Add a column to the dataframe which includes the text of the note associated with the row
 - Plan:
    - Figure out how to open tar archive from here
    - Create a function that opens the tar archive, finds the file, and extracts the text
    - Make a pandas series from these text blocks and append to the initial dataframe

In [3]:
rand_filtered_df = pd.read_csv('4_rand_filtered_df.csv')
rand_filtered_df.head()

Unnamed: 0,BDSPPatientID,DateICD,ICD,DateNote,NoteTextFile
0,114316265,2023-01-24,430,2022-12-25,Notes_13605828901_9002141910_20221225.txt
1,120110560,2023-06-12,430,2023-06-25,Notes_13629086394_8327863776_20230625.txt
2,119573164,2021-02-13,I60.9,2021-02-23,Notes_13589736564_5567001557_20210223.txt
3,112030232,2021-05-07,430,2021-05-18,Notes_13566019013_5701887197_20210518.txt
4,111905857,2020-10-26,430,2020-10-02,Notes_13534548179_4160128051_20201002.txt


In [4]:
def convert_tar_to_zip(tar_path, zip_path):
    with tarfile.open(tar_path, 'r') as tar:
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for member in tqdm(tar.getmembers()):
                file_obj = tar.extractfile(member)
                if file_obj is not None:
                    zipf.writestr(member.name, file_obj.read())

# This will extract the year from the file name
def extract_year(file_name):
    match = re.search(r'_(\d{8})\.txt$', file_name)
    if match:
        return match.group(1)[:4]
    return None
def read_file_from_zip(base_path, file_name):
    year = extract_year(file_name)
    if year:
        zip_path = f'{base_path}/mgb_notes_{year}.zip'
        try:
            with zipfile.ZipFile(zip_path, 'r') as zipf:
                with zipf.open(f'{year}/{file_name}') as file:
                    content = file.read()
                    return content.decode('utf-8', errors = 'ignore')
        except (KeyError, FileNotFoundError):
            return None
    return None

In [7]:
year = 2013
for i in tqdm(range(0, 12)):
    tar_file_path = f'/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/MGB/MGB_Deidentified_Notes_March12th2024/mgb_notes_{year + i}.tar'
    zip_file_path = f'/home/jsearle/bigDrive/NAX/zipNotes/MGB/mgb_notes_{year + i}.zip'
    convert_tar_to_zip(tar_file_path, zip_file_path)

100%|██████████| 10/10 [00:00<00:00, 13815.23it/s]
100%|██████████| 262376/262376 [00:19<00:00, 13284.03it/s]
100%|██████████| 987181/987181 [01:11<00:00, 13777.94it/s]
100%|██████████| 2139314/2139314 [02:46<00:00, 12834.39it/s]
100%|██████████| 3060365/3060365 [04:09<00:00, 12254.40it/s]
100%|██████████| 3349298/3349298 [04:39<00:00, 12003.57it/s]
100%|██████████| 3364270/3364270 [04:36<00:00, 12182.74it/s]
100%|██████████| 3593080/3593080 [05:02<00:00, 11859.20it/s]
100%|██████████| 3610844/3610844 [05:13<00:00, 11532.53it/s]
100%|██████████| 3487666/3487666 [05:14<00:00, 11073.25it/s]
100%|██████████| 2434870/2434870 [03:28<00:00, 11662.12it/s]
100%|██████████| 726732/726732 [00:47<00:00, 15150.90it/s]
100%|██████████| 12/12 [1:01:27<00:00, 307.31s/it]


In [8]:
base_path = '/home/jsearle/bigDrive/NAX/zipNotes/MGB'

rand_filtered_df['text'] = rand_filtered_df['NoteTextFile'].progress_apply(lambda x: read_file_from_zip(base_path, x))

rand_filtered_df.to_csv('cohort_plus_text.csv', index=False, header=True, sep=',', na_rep='NA')

100%|██████████| 1113/1113 [5:38:41<00:00, 18.26s/it] 


In [10]:
rand_filtered_df.head()
print(len(rand_filtered_df))

1113


In [11]:
def check_word_count(text, threshold):
    word_count = len(text.split())
    return word_count > threshold

In [12]:
threshold = 500
rand_filtered_df['valid'] = rand_filtered_df['text'].apply(lambda x: check_word_count(x, threshold))
rand_filtered_df.head()

Unnamed: 0,BDSPPatientID,DateICD,ICD,DateNote,NoteTextFile,text,valid
0,114316265,2023-01-24,430,2022-12-25,Notes_13605828901_9002141910_20221225.txt,Physician ***** ***** Admit date: ****...,True
1,120110560,2023-06-12,430,2023-06-25,Notes_13629086394_8327863776_20230625.txt,Physician ***** ***** Admit date: ****...,True
2,119573164,2021-02-13,I60.9,2021-02-23,Notes_13589736564_5567001557_20210223.txt,Physician ***** ***** Admit date: ****...,True
3,112030232,2021-05-07,430,2021-05-18,Notes_13566019013_5701887197_20210518.txt,Physician ***** ***** Admit date: ****...,True
4,111905857,2020-10-26,430,2020-10-02,Notes_13534548179_4160128051_20201002.txt,Physician ***** ***** Admit date: ****...,True


In [13]:
numValid = sum(rand_filtered_df['valid'])
print(numValid)

1087


In [14]:
false_indeces = rand_filtered_df.index[~rand_filtered_df['valid']].tolist()
print(false_indeces)

[37, 64, 83, 201, 249, 270, 303, 336, 409, 415, 449, 451, 454, 468, 506, 524, 559, 582, 588, 592, 858, 884, 946, 996, 1074, 1105]


In [16]:
final_icd_plus_df = rand_filtered_df.drop(index=false_indeces)
print(len(final_icd_plus_df))
final_icd_plus_df.head()

1087


Unnamed: 0,BDSPPatientID,DateICD,ICD,DateNote,NoteTextFile,text,valid
0,114316265,2023-01-24,430,2022-12-25,Notes_13605828901_9002141910_20221225.txt,Physician ***** ***** Admit date: ****...,True
1,120110560,2023-06-12,430,2023-06-25,Notes_13629086394_8327863776_20230625.txt,Physician ***** ***** Admit date: ****...,True
2,119573164,2021-02-13,I60.9,2021-02-23,Notes_13589736564_5567001557_20210223.txt,Physician ***** ***** Admit date: ****...,True
3,112030232,2021-05-07,430,2021-05-18,Notes_13566019013_5701887197_20210518.txt,Physician ***** ***** Admit date: ****...,True
4,111905857,2020-10-26,430,2020-10-02,Notes_13534548179_4160128051_20201002.txt,Physician ***** ***** Admit date: ****...,True


In [17]:
# write to csv
final_icd_plus_df.to_csv('5_final_icd_plus_df.csv', index=False)

In [None]:
final_icd_plus_df = pd.read_csv('5_final_icd_plus_df.csv')

In [None]:
# df cleanup
keepColumns = ['BDSPPatientID', 'DateNote',  'NoteTextFile', 'text']
clean_final_icd_plus_df = final_icd_plus_df[keepColumns]

clean_final_icd_plus_df.head()