In [6]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import tarfile
import zipfile
import random
import os
import shutil
import re
# Register tqdm's `progress_apply` on pandas objects; the note-extraction cell
# uses it to show a progress bar while reading notes.
tqdm.pandas()

# Goal of this notebook:
 - Add a column to the dataframe which includes the text of the note associated with the row
 - Plan:
    - Figure out how to open tar archive from here
    - Create a function that opens the tar archive, finds the file, and extracts the text
    - Make a pandas series from these text blocks and append to the initial dataframe

In [7]:
# Load the cohort table; each row identifies one note by patient ID, note date and note file name.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a previously saved index —
# confirm whether it should be read with index_col=0 instead of being carried along as data.
cohort_neg = pd.read_csv('cohort_neg.csv')
cohort_neg.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle
0,118374115,2018-11-20 00:00:00,Notes_13333491432_2130196704_20181120.txt
1,116931824,2022-12-29 00:00:00,Notes_13563868206_7830407523_20221229.txt
2,118587891,2021-05-30 00:00:00,Notes_13532685798_4823397982_20210530.txt
3,114557355,2022-07-22 00:00:00,Notes_13620259495_8489742841_20220722.txt
4,114329029,2018-05-15 00:00:00,Notes_13356996149_1753656538_20180515.txt


In [8]:
# Pull out the note file names; they are the keys used to look up each note's text.
notes = cohort_neg['NoteTitle']
first_note_title = notes[0]
print(first_note_title)

Notes_13333491432_2130196704_20181120.txt


Before continuing, we will make some helper functions that will make the process easier.
 - In order to go through the whole list of notes and extract their text, we will need to know which zip file each note is stored in
 - To do this we will use regex to extract the year and append it to the base file path
 - We also need to modify the read_from_zip method

In [9]:
# This will extract the year from the file name
def extract_year(file_name):
    """Return the four-digit year embedded in a note file name, or None.

    The file name must end in ``_<8 digits>.txt``; the first four of those
    digits are returned as a string. Names that do not match yield None.
    """
    date_stamp = re.search(r'_(\d{8})\.txt$', file_name)
    return date_stamp.group(1)[:4] if date_stamp else None

In [10]:
# Open archive handles keyed by zip path. Reusing a handle means each yearly
# archive is opened once per kernel session instead of once per note.
_OPEN_ARCHIVES = {}


def read_file_from_zip(base_path, file_name):
    """Return the decoded text of one note stored in a per-year zip archive.

    The note is looked up as member ``{year}/{file_name}`` inside
    ``{base_path}/mgb_notes_{year}.zip``, where ``year`` is taken from
    ``extract_year(file_name)``.

    Args:
        base_path: Directory that contains the ``mgb_notes_<year>.zip`` files.
        file_name: Note file name ending in ``_<YYYYMMDD>.txt``.

    Returns:
        The note contents decoded as UTF-8 (undecodable bytes are dropped),
        or None when the year cannot be extracted, the archive does not
        exist, or the archive has no such member.
    """
    year = extract_year(file_name)
    if not year:
        return None

    zip_path = f'{base_path}/mgb_notes_{year}.zip'
    try:
        archive = _OPEN_ARCHIVES.get(zip_path)
        if archive is None:
            # A missing archive raises FileNotFoundError here and is never cached,
            # so it is retried on the next call.
            archive = zipfile.ZipFile(zip_path, 'r')
            _OPEN_ARCHIVES[zip_path] = archive
        with archive.open(f'{year}/{file_name}') as member:
            return member.read().decode('utf-8', errors = 'ignore')
    except (KeyError, FileNotFoundError):
        # KeyError: member not in the archive; FileNotFoundError: archive not on disk.
        return None

With these updated helper functions we should be able to attach the text of each note to the corresponding row

In [11]:
base_path = '/home/jsearle/bigDrive/NAX/zipNotes/MGB'


def _load_note_text(note_title):
    """Fetch the text for one note title from the archives under ``base_path``."""
    return read_file_from_zip(base_path, note_title)


# Titles with no matching archive entry come back as None and are written as 'NA' below.
# NOTE: the recorded run of this cell took about 5h20m for 1087 notes.
cohort_neg['text'] = cohort_neg['NoteTitle'].progress_apply(_load_note_text)

# Checkpoint the frame with the note text attached.
cohort_neg.to_csv('cohort_plus_text.csv', sep=',', na_rep='NA', header=True, index=False)

100%|██████████| 1087/1087 [5:20:51<00:00, 17.71s/it] 


In [12]:
cohort_neg.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text
0,118374115,2018-11-20 00:00:00,Notes_13333491432_2130196704_20181120.txt,Physician ***** ***** Admit date: ****...
1,116931824,2022-12-29 00:00:00,Notes_13563868206_7830407523_20221229.txt,Physician ***** ***** Admit date: ****...
2,118587891,2021-05-30 00:00:00,Notes_13532685798_4823397982_20210530.txt,Physician ***** ***** Admit date: ****...
3,114557355,2022-07-22 00:00:00,Notes_13620259495_8489742841_20220722.txt,Physician ***** ***** Admit date: ****...
4,114329029,2018-05-15 00:00:00,Notes_13356996149_1753656538_20180515.txt,Physician ***** ***** Admit date: ****...


I forgot to check the character counts -- let's make sure each note has more than 500 characters

In [13]:
def check_word_count(text, threshold):
    """Return True when ``text`` is longer than ``threshold`` characters.

    Despite the name, this compares the character length (``len(text)``),
    not the number of words.

    Args:
        text: The note text. ``None`` or a float (e.g. the NaN pandas uses for
            a missing value) is treated as a missing note.
        threshold: Minimum length; the text must be strictly longer than this.

    Returns:
        True if the text has more than ``threshold`` characters, otherwise
        False. Missing text is always False.
    """
    # A note that could not be read arrives as None (or NaN after a CSV round trip);
    # calling len() on either would raise TypeError, so treat it as invalid.
    if text is None or isinstance(text, float):
        return False
    return len(text) > threshold

In [14]:
threshold = 500
# Flag each note whose text is longer than `threshold` characters.
cohort_neg['valid'] = cohort_neg['text'].apply(check_word_count, args=(threshold,))

In [15]:
# Preview the frame, which now carries the boolean `valid` column.
cohort_neg.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,valid
0,118374115,2018-11-20 00:00:00,Notes_13333491432_2130196704_20181120.txt,Physician ***** ***** Admit date: ****...,True
1,116931824,2022-12-29 00:00:00,Notes_13563868206_7830407523_20221229.txt,Physician ***** ***** Admit date: ****...,True
2,118587891,2021-05-30 00:00:00,Notes_13532685798_4823397982_20210530.txt,Physician ***** ***** Admit date: ****...,True
3,114557355,2022-07-22 00:00:00,Notes_13620259495_8489742841_20220722.txt,Physician ***** ***** Admit date: ****...,True
4,114329029,2018-05-15 00:00:00,Notes_13356996149_1753656538_20180515.txt,Physician ***** ***** Admit date: ****...,True


In [16]:
# Number of notes that passed the length check (True counts as 1).
print(cohort_neg['valid'].sum())

1085


In [17]:
# Collect the index labels of the rows that failed the length check.
invalid_rows = cohort_neg.loc[~cohort_neg['valid']]
false_indeces = invalid_rows.index.tolist()
print(false_indeces)

[647, 1012]


In [18]:
# Remove the too-short notes by index label. The index is not reset afterwards, and
# re-running this cell raises KeyError because the labels are already gone.
cohort_neg = cohort_neg.drop(labels=false_indeces, axis='index')

In [19]:
# Row count after dropping the invalid notes.
print(cohort_neg.shape[0])

1085


In [21]:

# Disabled: the frame previewed in this notebook has no 'ICD_Date' or 'CodeType' columns to drop.
# cohort_neg = cohort_neg.drop(columns=['ICD_Date', 'CodeType'])
row_count = len(cohort_neg)
print(row_count)
cohort_neg.head()

1085


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,valid
0,118374115,2018-11-20 00:00:00,Notes_13333491432_2130196704_20181120.txt,Physician ***** ***** Admit date: ****...,True
1,116931824,2022-12-29 00:00:00,Notes_13563868206_7830407523_20221229.txt,Physician ***** ***** Admit date: ****...,True
2,118587891,2021-05-30 00:00:00,Notes_13532685798_4823397982_20210530.txt,Physician ***** ***** Admit date: ****...,True
3,114557355,2022-07-22 00:00:00,Notes_13620259495_8489742841_20220722.txt,Physician ***** ***** Admit date: ****...,True
4,114329029,2018-05-15 00:00:00,Notes_13356996149_1753656538_20180515.txt,Physician ***** ***** Admit date: ****...,True


Double check that the character count is valid

In [22]:
threshold = 500
# Recompute the flag on the filtered frame; every remaining row should pass.
cohort_neg['valid'] = cohort_neg['text'].apply(check_word_count, args=(threshold,))
print(cohort_neg['valid'].sum())

1085


In [23]:
# The helper `valid` flag is not part of the exported dataset.
updated_cohort_neg = cohort_neg.drop('valid', axis=1)
updated_cohort_neg.to_csv('final_icd_neg_df.csv', sep=',', na_rep='NA', header=True, index=False)

In [24]:
# Read the exported file back and preview it as a sanity check on the saved columns.
test = pd.read_csv('final_icd_neg_df.csv')
test.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text
0,118374115,2018-11-20 00:00:00,Notes_13333491432_2130196704_20181120.txt,Physician ***** ***** Admit date: ****...
1,116931824,2022-12-29 00:00:00,Notes_13563868206_7830407523_20221229.txt,Physician ***** ***** Admit date: ****...
2,118587891,2021-05-30 00:00:00,Notes_13532685798_4823397982_20210530.txt,Physician ***** ***** Admit date: ****...
3,114557355,2022-07-22 00:00:00,Notes_13620259495_8489742841_20220722.txt,Physician ***** ***** Admit date: ****...
4,114329029,2018-05-15 00:00:00,Notes_13356996149_1753656538_20180515.txt,Physician ***** ***** Admit date: ****...
