In [2]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import os
import re

In [3]:
# get note metadata for each year
# filter by discharge summary type
startYear = 2010
dfs = []
for i in tqdm(range(0, 15)):
    df = pd.read_csv(f'/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/BIDMC/BIDMC_Deidentified_Notes_March14th2024/bidmc_notes_{startYear + i}_metadata.csv')
    df = df[(df['NoteTypeFull'] == 'Initial Note') | (df['NoteTypeFull'] == 'Initial note')]
    dfs.append(df)

100%|██████████| 15/15 [00:38<00:00,  2.58s/it]


In [4]:
# show dataframe, number of admission notes

admission_notes_df = pd.concat(dfs, axis=0, ignore_index=True)
numAdmissionNotes = len(admission_notes_df)
print(numAdmissionNotes)
admission_notes_df.head()

1981750


Unnamed: 0,BDSPPatientID,NoteTypeFull,Service,CreateDate,DeidentifiedName
0,150641617,Initial Note,Psychiatry,20100813,Notes_1130500157_1156166683_20100813.txt
1,150641382,Initial Note,,20100804,Notes_1130499976_10206501630_20100804.txt
2,150641382,Initial Note,Nephrology,20100805,Notes_1130499976_10206501631_20100805.txt
3,150641382,Initial Note,General Surgery,20100805,Notes_1130499976_10206501633_20100805.txt
4,150641382,Initial Note,Vascular Surgery,20100808,Notes_1130499976_10206501636_20100808.txt


In [5]:
# clean up df, keep only relevant info
keepColumns = ['BDSPPatientID', 'CreateDate',  'DeidentifiedName']
admission_notes_df = admission_notes_df[keepColumns]
admission_notes_df.head()

Unnamed: 0,BDSPPatientID,CreateDate,DeidentifiedName
0,150641617,20100813,Notes_1130500157_1156166683_20100813.txt
1,150641382,20100804,Notes_1130499976_10206501630_20100804.txt
2,150641382,20100805,Notes_1130499976_10206501631_20100805.txt
3,150641382,20100805,Notes_1130499976_10206501633_20100805.txt
4,150641382,20100808,Notes_1130499976_10206501636_20100808.txt


In [6]:
# rename columns
rename_dict = { 
    'CreateDate': 'NoteDate', 
    'DeidentifiedName': 'NoteTitle', 
}

admission_notes_df = admission_notes_df.rename(columns=rename_dict)

# change date column from str to datetime
admission_notes_df.loc[:, 'NoteDate'] = pd.to_datetime(admission_notes_df['NoteDate'], format='%Y%m%d')
admission_notes_df.head()

['2010-08-13 00:00:00', '2010-08-04 00:00:00', '2010-08-05 00:00:00',
 '2010-08-05 00:00:00', '2010-08-08 00:00:00', '2010-12-11 00:00:00',
 '2010-10-19 00:00:00', '2010-05-19 00:00:00', '2010-11-02 00:00:00',
 '2010-06-21 00:00:00',
 ...
 '2024-05-12 00:00:00', '2024-01-19 00:00:00', '2024-02-07 00:00:00',
 '2024-01-04 00:00:00', '2024-04-01 00:00:00', '2024-06-22 00:00:00',
 '2024-05-17 00:00:00', '2024-05-17 00:00:00', '2024-04-14 00:00:00',
 '2024-06-07 00:00:00']
Length: 1981750, dtype: datetime64[ns]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  admission_notes_df.loc[:, 'NoteDate'] = pd.to_datetime(admission_notes_df['NoteDate'], format='%Y%m%d')


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle
0,150641617,2010-08-13,Notes_1130500157_1156166683_20100813.txt
1,150641382,2010-08-04,Notes_1130499976_10206501630_20100804.txt
2,150641382,2010-08-05,Notes_1130499976_10206501631_20100805.txt
3,150641382,2010-08-05,Notes_1130499976_10206501633_20100805.txt
4,150641382,2010-08-08,Notes_1130499976_10206501636_20100808.txt


In [7]:
# read in icd filtered df
icd_filter_df = pd.read_csv('1_icd_pos_df.csv')
# change column from str to datetime
icd_filter_df['AdmissionDate'] = pd.to_datetime(icd_filter_df['AdmissionDate'])

# merge based on patient id
ICD_plus_AN_df = pd.merge(icd_filter_df, admission_notes_df, on='BDSPPatientID', how='inner')
ICD_plus_AN_df.head()


Unnamed: 0,BDSPPatientID,AdmissionDate,ICD,NoteDate,NoteTitle
0,150000004,2012-11-05,430,2012-11-05,Notes_1129858847_225334705_20121105.txt
1,150000004,2012-11-05,430,2012-11-21,Notes_1129858847_903347020_20121121.txt
2,150000004,2012-11-05,430,2012-11-21,Notes_1129858847_903347022_20121121.txt
3,150000004,2012-11-05,430,2012-11-25,Notes_1129858847_903347025_20121125.txt
4,150000004,2012-11-05,430,2012-11-28,Notes_1129858847_903347027_20121128.txt


In [8]:
# filter for DateICD +/- 30 days of DateNote

from datetime import timedelta


time_window = timedelta(days=30)

# Filter rows where DateICD is within 1 month before or after DateNote
filtered_df = ICD_plus_AN_df[abs(ICD_plus_AN_df['AdmissionDate'] - ICD_plus_AN_df['NoteDate']) <= time_window]

print(len(filtered_df))
filtered_df.head()


14579


Unnamed: 0,BDSPPatientID,AdmissionDate,ICD,NoteDate,NoteTitle
0,150000004,2012-11-05,430,2012-11-05,Notes_1129858847_225334705_20121105.txt
1,150000004,2012-11-05,430,2012-11-21,Notes_1129858847_903347020_20121121.txt
2,150000004,2012-11-05,430,2012-11-21,Notes_1129858847_903347022_20121121.txt
3,150000004,2012-11-05,430,2012-11-25,Notes_1129858847_903347025_20121125.txt
4,150000004,2012-11-05,430,2012-11-28,Notes_1129858847_903347027_20121128.txt


In [9]:
# save as csv
filtered_df.to_csv('2_admission_notes_df.csv', index=False)