In [1]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import os
import re

In [2]:
# get note metadata for each year
# filter by discharge summary type
startYear = 2010
dfs = []
for i in tqdm(range(0, 15)):
    df = pd.read_csv(f'/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/BIDMC/BIDMC_Deidentified_Notes_March14th2024/bidmc_notes_{startYear + i}_metadata.csv')
    df = df[(df['NoteTypeFull'] == 'Initial Note') | (df['NoteTypeFull'] == 'Initial note')]
    dfs.append(df)

100%|██████████| 15/15 [00:42<00:00,  2.84s/it]


In [3]:
# show dataframe, number of admission notes

admission_notes_df = pd.concat(dfs, axis=0, ignore_index=True)
numAdmissionNotes = len(admission_notes_df)
print(numAdmissionNotes)
admission_notes_df.tail()

1981750


Unnamed: 0,BDSPPatientID,NoteTypeFull,Service,CreateDate,DeidentifiedName
1981745,150422899,Initial note,Podiatry,20240622,Notes_1130282026_3877943256_20240622.txt
1981746,150422914,Initial note,Cardiology,20240517,Notes_1130281835_3878143156_20240517.txt
1981747,150422914,Initial note,Cardiology,20240517,Notes_1130281835_3878143157_20240517.txt
1981748,150167280,Initial note,Neurosurgery,20240414,Notes_1130026292_523257393_20240414.txt
1981749,150167280,Initial note,Pain Management,20240607,Notes_1130026292_523257395_20240607.txt


In [5]:
# clean up df, keep only relevant info
keepColumns = ['BDSPPatientID', 'CreateDate',  'DeidentifiedName']
admission_notes_df = admission_notes_df[keepColumns]
admission_notes_df.head()

Unnamed: 0,BDSPPatientID,CreateDate,DeidentifiedName
0,150641617,20100813,Notes_1130500157_1156166683_20100813.txt
1,150641382,20100804,Notes_1130499976_10206501630_20100804.txt
2,150641382,20100805,Notes_1130499976_10206501631_20100805.txt
3,150641382,20100805,Notes_1130499976_10206501633_20100805.txt
4,150641382,20100808,Notes_1130499976_10206501636_20100808.txt


In [6]:
# rename columns
rename_dict = { 
    'CreateDate': 'NoteDate', 
    'DeidentifiedName': 'NoteTitle', 
}

admission_notes_df = admission_notes_df.rename(columns=rename_dict)

# change date column from str to datetime
admission_notes_df.loc[:, 'NoteDate'] = pd.to_datetime(admission_notes_df['NoteDate'], format='%Y%m%d')
admission_notes_df.head()

['2010-08-13 00:00:00', '2010-08-04 00:00:00', '2010-08-05 00:00:00',
 '2010-08-05 00:00:00', '2010-08-08 00:00:00', '2010-12-11 00:00:00',
 '2010-10-19 00:00:00', '2010-05-19 00:00:00', '2010-11-02 00:00:00',
 '2010-06-21 00:00:00',
 ...
 '2024-05-12 00:00:00', '2024-01-19 00:00:00', '2024-02-07 00:00:00',
 '2024-01-04 00:00:00', '2024-04-01 00:00:00', '2024-06-22 00:00:00',
 '2024-05-17 00:00:00', '2024-05-17 00:00:00', '2024-04-14 00:00:00',
 '2024-06-07 00:00:00']
Length: 1981750, dtype: datetime64[ns]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  admission_notes_df.loc[:, 'NoteDate'] = pd.to_datetime(admission_notes_df['NoteDate'], format='%Y%m%d')


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle
0,150641617,2010-08-13,Notes_1130500157_1156166683_20100813.txt
1,150641382,2010-08-04,Notes_1130499976_10206501630_20100804.txt
2,150641382,2010-08-05,Notes_1130499976_10206501631_20100805.txt
3,150641382,2010-08-05,Notes_1130499976_10206501633_20100805.txt
4,150641382,2010-08-08,Notes_1130499976_10206501636_20100808.txt


In [7]:
# read in icd filtered df
icd_filter_df = pd.read_csv('1_icd_neg_df.csv')
# change column from str to datetime
icd_filter_df['AdmissionDate'] = pd.to_datetime(icd_filter_df['AdmissionDate'])

# merge based on patient id
ICD_plus_AN_df = pd.merge(icd_filter_df, admission_notes_df, on='BDSPPatientID', how='inner')
ICD_plus_AN_df.head()


Unnamed: 0,BDSPPatientID,AdmissionDate,ICD,NoteDate,NoteTitle
0,150963606,2017-11-02,M79.1,2014-01-02,Notes_1130822660_2159918624_20140102.txt
1,150963606,2017-11-02,M79.1,2014-04-25,Notes_1130822660_2159918627_20140425.txt
2,150963606,2017-11-02,M79.1,2014-05-09,Notes_1130822660_2159918628_20140509.txt
3,150963606,2017-11-02,M79.1,2017-06-21,Notes_1130822660_2159918636_20170621.txt
4,150963606,2017-11-02,M79.1,2017-10-15,Notes_1130822660_2159918640_20171015.txt


In [8]:
# filter for DateICD +/- 1 month of DateNote

from datetime import timedelta


time_window = timedelta(days=30)

# Filter rows where DateICD is within 1 month before or after DateNote
filtered_df = ICD_plus_AN_df[abs(ICD_plus_AN_df['AdmissionDate'] - ICD_plus_AN_df['NoteDate']) <= time_window]

# # Filter rows where DateICD is within 1 month after DateNote
# filtered_df = ICD_plus_DS_df[(ICD_plus_DS_df['DateICD'] >= ICD_plus_DS_df['DateNote']) & (ICD_plus_DS_df['DateICD'] <= ICD_plus_DS_df['DateNote'] + time_window)]

print(len(filtered_df))
filtered_df.head()


64346


Unnamed: 0,BDSPPatientID,AdmissionDate,ICD,NoteDate,NoteTitle
4,150963606,2017-11-02,M79.1,2017-10-15,Notes_1130822660_2159918640_20171015.txt
29,150770050,2012-10-10,V49.86,2012-09-14,Notes_1130629128_16902710199_20120914.txt
30,150770050,2012-10-10,V49.86,2012-09-20,Notes_1130629128_16902710204_20120920.txt
31,150770050,2012-10-10,V49.86,2012-10-10,Notes_1130629128_16902710211_20121010.txt
32,150770050,2012-10-10,V49.86,2012-10-16,Notes_1130629128_16902710215_20121016.txt


In [11]:
# RANDOMIZE order of df, 25 seed for reproducability
filtered_df = filtered_df.sample(frac=1, random_state=25).reset_index(drop=True)
filtered_df.head()

Unnamed: 0,BDSPPatientID,AdmissionDate,ICD,NoteDate,NoteTitle
0,150005970,2020-12-09,I25.2,2020-12-22,Notes_1129864534_17441726072_20201222.txt
1,150787076,2018-02-04,R33.9,2018-01-27,Notes_1130645711_1863357139_20180127.txt
2,151344919,2022-05-12,I48.91,2022-04-30,Notes_1131203861_3479700929_20220430.txt
3,151076568,2012-06-26,621.0,2012-06-10,Notes_1130935590_2521399596_20120610.txt
4,150994453,2018-04-11,C61,2018-04-12,Notes_1130852937_2253966504_20180412.txt


In [12]:
#filter to only include unique patients
filtered_df = filtered_df.drop_duplicates(subset='BDSPPatientID', keep='first')
print(len(filtered_df))

19168


In [14]:
# save as csv
filtered_df.to_csv('2_admission_notes_df.csv', index=False)