In [21]:
# imports
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import re

### Discharge Summaries
 - Read in all note metadata CSV files and filter InpatientNoteTypeDSC column for only 'Discharge Summary'
 - Keep track of: Note title, patient ID, date

In [2]:
# get note metadata for each year
# filter by discharge summary type
startYear = 2013
dfs = []
for i in range(0, 12):
    df = pd.read_csv(f'/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/MGB/MGB_Deidentified_Notes_March12th2024/mgb_notes_{startYear + i}_metadata.csv')
    df = df[df['InpatientNoteTypeDSC'] == 'Discharge Summary']
    dfs.append(df)

In [22]:
# show dataframe, number of discharge summaries

discharge_summaries_df = pd.concat(dfs, axis=0, ignore_index=True)
numDischargeSummaries = len(discharge_summaries_df)
print(numDischargeSummaries)
discharge_summaries_df.head()

539579


Unnamed: 0,BDSPPatientID,ContactDate,InpatientNoteTypeDSC,DeidentifiedName
0,113976830,20141225,Discharge Summary,Notes_13243172392_371031605_20141225.txt
1,116909940,20141006,Discharge Summary,Notes_13157546274_374917354_20141006.txt
2,115431378,20141214,Discharge Summary,Notes_13153784024_372493064_20141214.txt
3,113222063,20140913,Discharge Summary,Notes_13200816127_370251728_20140913.txt
4,121246362,20140509,Discharge Summary,Notes_13172591129_379609047_20140509.txt


In [23]:
# clean up df, keep only relevant info
keepColumns = ['BDSPPatientID', 'ContactDate',  'DeidentifiedName']
discharge_summaries_df = discharge_summaries_df[keepColumns]
discharge_summaries_df.head()

Unnamed: 0,BDSPPatientID,ContactDate,DeidentifiedName
0,113976830,20141225,Notes_13243172392_371031605_20141225.txt
1,116909940,20141006,Notes_13157546274_374917354_20141006.txt
2,115431378,20141214,Notes_13153784024_372493064_20141214.txt
3,113222063,20140913,Notes_13200816127_370251728_20140913.txt
4,121246362,20140509,Notes_13172591129_379609047_20140509.txt


In [24]:
# rename columns
rename_dict = { 
    'ContactDate': 'DateNote', 
    'DeidentifiedName': 'NoteTextFile', 
}

discharge_summaries_df = discharge_summaries_df.rename(columns=rename_dict)

# change date column from str to datetime
discharge_summaries_df.loc[:, 'DateNote'] = pd.to_datetime(discharge_summaries_df['DateNote'], format='%Y%m%d')
discharge_summaries_df.head()

['2014-12-25 00:00:00', '2014-10-06 00:00:00', '2014-12-14 00:00:00',
 '2014-09-13 00:00:00', '2014-05-09 00:00:00', '2014-12-27 00:00:00',
 '2014-08-23 00:00:00', '2014-08-22 00:00:00', '2014-06-15 00:00:00',
 '2014-11-03 00:00:00',
 ...
 '2024-04-17 00:00:00', '2024-01-17 00:00:00', '2024-01-15 00:00:00',
 '2024-04-27 00:00:00', '2024-01-07 00:00:00', '2024-05-16 00:00:00',
 '2024-02-13 00:00:00', '2024-03-27 00:00:00', '2024-03-17 00:00:00',
 '2024-02-03 00:00:00']
Length: 539579, dtype: datetime64[ns]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  discharge_summaries_df.loc[:, 'DateNote'] = pd.to_datetime(discharge_summaries_df['DateNote'], format='%Y%m%d')


Unnamed: 0,BDSPPatientID,DateNote,NoteTextFile
0,113976830,2014-12-25,Notes_13243172392_371031605_20141225.txt
1,116909940,2014-10-06,Notes_13157546274_374917354_20141006.txt
2,115431378,2014-12-14,Notes_13153784024_372493064_20141214.txt
3,113222063,2014-09-13,Notes_13200816127_370251728_20140913.txt
4,121246362,2014-05-09,Notes_13172591129_379609047_20140509.txt


In [29]:
# read in icd filtered df
icd_filter_df = pd.read_csv('1_icd_plus_df.csv')
# change column from str to datetime
icd_filter_df['DateICD'] = pd.to_datetime(icd_filter_df['DateICD'])

# merge based on patient id
ICD_plus_DS_df = pd.merge(icd_filter_df, discharge_summaries_df, on='BDSPPatientID', how='inner')
ICD_plus_DS_df.head()


Unnamed: 0,BDSPPatientID,DateICD,ICD,DateNote,NoteTextFile
0,116398048.0,2018-03-09,430,2018-03-20,Notes_13437596114_2097038735_20180320.txt
1,116398048.0,2018-03-09,430,2019-12-13,Notes_13526276424_3921053460_20191213.txt
2,119133865.0,2021-03-07,430,2019-03-25,Notes_13444933668_2214037131_20190325.txt
3,122523107.0,2023-02-09,430,2014-10-20,Notes_13273793077_378347858_20141020.txt
4,122523107.0,2023-02-09,430,2022-09-21,Notes_13743206583_9247349955_20220921.txt


In [36]:
# filter for DateICD +/- 1 month of DateNote

from datetime import timedelta


time_window = timedelta(days=30)

# Filter rows where DateICD is within 1 month before or after DateNote
filtered_df = ICD_plus_DS_df[abs(ICD_plus_DS_df['DateICD'] - ICD_plus_DS_df['DateNote']) <= time_window]

# # Filter rows where DateICD is within 1 month after DateNote
# filtered_df = ICD_plus_DS_df[(ICD_plus_DS_df['DateICD'] >= ICD_plus_DS_df['DateNote']) & (ICD_plus_DS_df['DateICD'] <= ICD_plus_DS_df['DateNote'] + time_window)]

print(len(filtered_df))
filtered_df.head()




9829


Unnamed: 0,BDSPPatientID,DateICD,ICD,DateNote,NoteTextFile
0,116398048.0,2018-03-09,430,2018-03-20,Notes_13437596114_2097038735_20180320.txt
7,113080217.0,2022-10-10,430,2022-10-28,Notes_13604817952_9087332343_20221028.txt
8,113027749.0,2015-06-07,430,2015-06-20,Notes_13246458168_980626991_20150620.txt
20,118319033.0,2015-11-16,430,2015-11-06,Notes_13276057829_1044044733_20151106.txt
21,120036000.0,2020-02-28,430,2020-03-01,Notes_13476104223_4893859211_20200301.txt


In [38]:
# save as csv
filtered_df.to_csv('2_discharge_summaries_df.csv', index=False)
