# Filter by those who don't have SAH ICD code
The kernel won't be able to handle the ICD- cohort, so I'll change the order and filter by discharge summary first, then by negative SAH ICD code.  Then I will do randomized selection of the same number of ICD+ and include notes text.

In [1]:
# imports
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import re
import os

In [2]:
#filter all notes to only include discharge summaries
# year = 2013
# dfs = []
# for i in range(0, 12):
#     df = pd.read_csv(f'/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/MGB/MGB_Deidentified_Notes_March12th2024/mgb_notes_{year + i}_metadata.csv')
#     df = df[df['InpatientNoteTypeDSC'] == 'Discharge Summary']
#     dfs.append(df)

In [3]:
# show number of total discharge summaries then show example
# discharge_summaries = pd.concat(dfs, axis=0, ignore_index=True)
# print(len(discharge_summaries))
# discharge_summaries.head()

In [4]:
output_dir = '/home/jsearle/bigDrive/NAX/NLP-SAH_identification/code/newPipeline/MGB/6csv'

# Concatenate all filtered CSV files
all_files = [os.path.join(output_dir, f) for f in os.listdir(output_dir) if f.startswith('discharge_summaries_')]
discharge_summaries = pd.concat((pd.read_csv(f) for f in all_files), axis=0, ignore_index=True)

# Show number of total discharge summaries
print(len(discharge_summaries))

# Show example
discharge_summaries.head()

539579


Unnamed: 0,BDSPPatientID,ContactDate,InpatientNoteTypeDSC,DeidentifiedName
0,121552394,20190106,Discharge Summary,Notes_13407429969_1897463845_20190106.txt
1,121552394,20190106,Discharge Summary,Notes_13407429969_1897464004_20190106.txt
2,121210730,20190105,Discharge Summary,Notes_13338301602_1897145379_20190105.txt
3,118988577,20190114,Discharge Summary,Notes_13351931736_1915990662_20190114.txt
4,116506911,20190118,Discharge Summary,Notes_13339873460_1914565554_20190118.txt


In [5]:
#filter by unique patient ID
discharged_patients = set(discharge_summaries['BDSPPatientID'])
print(len(discharged_patients))

69684


### Randomization
 - Setting seed (25) just to be able to reproduce results

In [6]:
rand_DS_comp = discharge_summaries.sample(frac=1, random_state=25).reset_index(drop=True)
rand_DS_comp.head()

Unnamed: 0,BDSPPatientID,ContactDate,InpatientNoteTypeDSC,DeidentifiedName
0,115620551,20150304,Discharge Summary,Notes_13177882086_440619766_20150304.txt
1,114670821,20140924,Discharge Summary,Notes_13211093280_453355769_20140924.txt
2,112187363,20150303,Discharge Summary,Notes_13275669540_440764122_20150303.txt
3,117169043,20141206,Discharge Summary,Notes_13201865993_476529782_20141206.txt
4,122816355,20160824,Discharge Summary,Notes_13404964868_1469502913_20160824.txt


In [7]:
rand_DS = rand_DS_comp.drop_duplicates(subset='BDSPPatientID', keep='first')
print(len(rand_DS))

69684


### Combine with ICD table
 - I will be looking at all ICD codes within the month leading up to the date of the discharge summary
 - Check the format of the dates
 - Merge based on patient ID and date

In [8]:
output_dir = '/home/jsearle/bigDrive/NAX/NLP-SAH_identification/code/newPipeline/MGB/6csv'

all_files = [os.path.join(output_dir, f) for f in os.listdir(output_dir) if f.startswith('filtered_ICD_partition_')]
final_df = pd.concat((pd.read_csv(f) for f in all_files), axis=0, ignore_index=True)

print(len(final_df))

# Show example
final_df.head()

27129594


Unnamed: 0,BDSPEncounterID,EncounterLineNBR,BDSPPatientID,ShiftedContactDTS,ICDLineNBR,ICDCD,ICDDSC,DiagnosisNM,DiagnosisDSC,PrimaryDiagnosisFLG,DiagnosisChronicFLG,ShiftedUpdateDTS,DiagnosisLinkedProblemID,BDSPLastModifiedDTS,code_type
0,13626500000.0,1,118474377.0,2023-05-20 00:00:00.0000000,1.0,157.1,Malignant neoplasm of body of pancreas,Malignant neoplasm of body of pancreas,,Y,N,2023-05-20 13:25:00.0000000,135026244.0,2023-08-16 02:47:04.0620000,ICD
1,13581500000.0,3,122336922.0,2022-08-06 00:00:00.0000000,1.0,788.99,Other symptoms involving urinary system,Lower urinary tract symptoms,,N,N,2022-08-08 03:51:00.0000000,96060426.0,2022-04-27 13:40:53.7930000,ICD
2,13427830000.0,11,120522435.0,2018-09-30 00:00:00.0000000,1.0,278.01,Morbid obesity,Class 2 severe obesity due to excess calories ...,,N,N,2019-01-09 10:12:00.0000000,33794057.0,2022-04-27 14:57:52.3070000,ICD
3,13354680000.0,3,115980947.0,2018-05-10 00:00:00.0000000,1.0,309.81,Posttraumatic stress disorder,Posttraumatic stress disorder,,N,N,2024-01-20 19:05:00.0000000,18320222.0,2023-08-15 19:01:32.7250000,ICD
4,13725100000.0,1,114241241.0,2023-01-20 00:00:00.0000000,1.0,465.9,Acute upper respiratory infections of unspecif...,Viral URI,,Y,N,2023-01-20 11:45:00.0000000,,2023-08-16 04:41:57.7350000,ICD


In [9]:
#Take random sample of 500000 to make it smaller and easier to work with.

final_df = final_df.sample(n=500000, replace=False, random_state=1)

In [10]:
#filter by ICD-
code_regex = "^(?!I60|430).*"
filtered_final_df = final_df[final_df['ICDCD'].astype(str).str.match(code_regex, flags=re.I)]
del final_df

In [11]:
# change from string to datetime
filtered_final_df['ShiftedContactDTS'] = pd.to_datetime(filtered_final_df['ShiftedContactDTS'])
print(filtered_final_df['ShiftedContactDTS'][2])
print(type(filtered_final_df['ShiftedContactDTS'][2]))

KeyError: 2

In [None]:
rand_DS.loc[:, 'ContactDate'] = pd.to_datetime(rand_DS['ContactDate'], format='%Y%m%d')
print(rand_DS['ContactDate'][2])
print(type(rand_DS['ContactDate'][2]))

2015-03-03 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [None]:
merged_df = pd.merge(filtered_final_df, rand_DS, on='BDSPPatientID')

In [None]:
merged_df.head()

Unnamed: 0,BDSPEncounterID,EncounterLineNBR,BDSPPatientID,ShiftedContactDTS,ICDLineNBR,ICDCD,ICDDSC,DiagnosisNM,DiagnosisDSC,PrimaryDiagnosisFLG,DiagnosisChronicFLG,ShiftedUpdateDTS,DiagnosisLinkedProblemID,BDSPLastModifiedDTS,code_type,ContactDate,InpatientNoteTypeDSC,DeidentifiedName
0,13626500000.0,1,118474377.0,2023-05-20,1.0,157.1,Malignant neoplasm of body of pancreas,Malignant neoplasm of body of pancreas,,Y,N,2023-05-20 13:25:00.0000000,135026244.0,2023-08-16 02:47:04.0620000,ICD,2019-11-26 00:00:00,Discharge Summary,Notes_13385975835_2644414416_20191126.txt
1,13581500000.0,3,122336922.0,2022-08-06,1.0,788.99,Other symptoms involving urinary system,Lower urinary tract symptoms,,N,N,2022-08-08 03:51:00.0000000,96060426.0,2022-04-27 13:40:53.7930000,ICD,2023-12-05 00:00:00,Discharge Summary,Notes_13705144828_9349774420_20231205.txt
2,13427830000.0,11,120522435.0,2018-09-30,1.0,278.01,Morbid obesity,Class 2 severe obesity due to excess calories ...,,N,N,2019-01-09 10:12:00.0000000,33794057.0,2022-04-27 14:57:52.3070000,ICD,2015-12-16 00:00:00,Discharge Summary,Notes_13310872420_756262157_20151216.txt
3,13354680000.0,3,115980947.0,2018-05-10,1.0,309.81,Posttraumatic stress disorder,Posttraumatic stress disorder,,N,N,2024-01-20 19:05:00.0000000,18320222.0,2023-08-15 19:01:32.7250000,ICD,2016-04-10 00:00:00,Discharge Summary,Notes_13254167247_450575706_20160410.txt
4,13725100000.0,1,114241241.0,2023-01-20,1.0,465.9,Acute upper respiratory infections of unspecif...,Viral URI,,Y,N,2023-01-20 11:45:00.0000000,,2023-08-16 04:41:57.7350000,ICD,2015-02-04 00:00:00,Discharge Summary,Notes_13264279068_433538463_20150204.txt


In [None]:
# +/- one month
merged_df_one_month = merged_df[(merged_df['ShiftedContactDTS'] >= merged_df['ContactDate'] - pd.DateOffset(months=1)) & 
                                (merged_df['ShiftedContactDTS'] <= merged_df['ContactDate'] + pd.DateOffset(months=1))]


In [None]:
print(len(merged_df_one_month))
merged_df_one_month.head()

1087


Unnamed: 0,BDSPEncounterID,EncounterLineNBR,BDSPPatientID,ShiftedContactDTS,ICDLineNBR,ICDCD,ICDDSC,DiagnosisNM,DiagnosisDSC,PrimaryDiagnosisFLG,DiagnosisChronicFLG,ShiftedUpdateDTS,DiagnosisLinkedProblemID,BDSPLastModifiedDTS,code_type,ContactDate,InpatientNoteTypeDSC,DeidentifiedName
22669430,13613300000.0,1,114216511.0,2022-03-30,2.0,E933.1,Antineoplastic and immunosuppressive drugs cau...,Chemotherapy induced neutropenia,,N,N,2023-12-22 11:23:00.0000000,,2023-08-16 01:32:06.3650000,ICD,2022-03-29 00:00:00,Discharge Summary,Notes_13612877798_6185656669_20220329.txt
18676571,13673120000.0,1,115825493.0,2024-04-06,1.0,596.54,Neurogenic bladder NOS,Neurogenic urinary bladder disorder,,Y,N,2024-04-06 19:14:00.0000000,131508324.0,2023-08-16 05:21:23.1540000,ICD,2024-04-24 00:00:00,Discharge Summary,Notes_13680450872_10025641612_20240424.txt
9654591,13727200000.0,8,122397082.0,2021-12-05,1.0,V65.49,Other specified counseling,"Goals of care, counseling/discussion",,N,N,2021-12-06 01:23:00.0000000,132294772.0,2022-11-04 23:30:15.4210000,ICD,2021-11-10 00:00:00,Discharge Summary,Notes_13719986847_8638533275_20211110.txt
13371446,13377570000.0,2,117886365.0,2018-05-21,1.0,473.1,Chronic frontal sinusitis,Chronic frontal sinusitis,,N,N,2024-01-26 18:17:00.0000000,,2023-08-15 18:53:39.1440000,ICD,2018-06-16 00:00:00,Discharge Summary,Notes_13378253243_1622441108_20180616.txt
18924275,13592660000.0,11,112990024.0,2021-11-10,1.0,R09.02,Hypoxemia,Hypoxia,,N,N,2021-11-26 13:18:00.0000000,122633658.0,2022-04-27 13:10:13.8100000,ICD,2021-12-03 00:00:00,Discharge Summary,Notes_13592663582_7705422690_20211203.txt


In [None]:

columns_to_keep = ['BDSPPatientID', 'ShiftedContactDTS', 'ICDCD', 'code_type', 'ContactDate', 'DeidentifiedName']
clean_df = merged_df_one_month[columns_to_keep]
clean_df.head()

ValueError: Cannot take a larger sample than population when 'replace=False'

In [None]:
rename_dict = {
    'ShiftedContactDTS': 'ICD_Date', 
    'ICDCD': 'ICD', 
    'code_type': 'CodeType', 
    'ContactDate': 'NoteDate', 
    'DeidentifiedName': 'NoteTitle'
}

clean_df = clean_df.rename(columns=rename_dict)
clean_df.head()

Unnamed: 0,BDSPPatientID,ICD_Date,ICD,CodeType,NoteDate,NoteTitle
22669430,114216511.0,2022-03-30,E933.1,ICD,2022-03-29 00:00:00,Notes_13612877798_6185656669_20220329.txt
18676571,115825493.0,2024-04-06,596.54,ICD,2024-04-24 00:00:00,Notes_13680450872_10025641612_20240424.txt
9654591,122397082.0,2021-12-05,V65.49,ICD,2021-11-10 00:00:00,Notes_13719986847_8638533275_20211110.txt
13371446,117886365.0,2018-05-21,473.1,ICD,2018-06-16 00:00:00,Notes_13378253243_1622441108_20180616.txt
18924275,112990024.0,2021-11-10,R09.02,ICD,2021-12-03 00:00:00,Notes_13592663582_7705422690_20211203.txt


In [None]:
new_order = ['BDSPPatientID', 'ICD_Date', 'NoteDate', 'ICD', 'CodeType', 'NoteTitle']
clean_df = clean_df[new_order]
clean_df.head()

Unnamed: 0,BDSPPatientID,ICD_Date,NoteDate,ICD,CodeType,NoteTitle
22669430,114216511.0,2022-03-30,2022-03-29 00:00:00,E933.1,ICD,Notes_13612877798_6185656669_20220329.txt
18676571,115825493.0,2024-04-06,2024-04-24 00:00:00,596.54,ICD,Notes_13680450872_10025641612_20240424.txt
9654591,122397082.0,2021-12-05,2021-11-10 00:00:00,V65.49,ICD,Notes_13719986847_8638533275_20211110.txt
13371446,117886365.0,2018-05-21,2018-06-16 00:00:00,473.1,ICD,Notes_13378253243_1622441108_20180616.txt
18924275,112990024.0,2021-11-10,2021-12-03 00:00:00,R09.02,ICD,Notes_13592663582_7705422690_20211203.txt


In [None]:
print(len(clean_df))

1087


In [None]:
clean_df.loc[:, 'ICD'] = clean_df['ICD'].astype(str)
grouped = clean_df.groupby('NoteTitle')['ICD'].apply(lambda x: ', '.join(x)).reset_index()
icd_neg_unique = clean_df.drop_duplicates(subset=['NoteTitle']).drop(columns=['ICD'])
icd_neg_unique = pd.merge(icd_neg_unique, grouped, on='NoteTitle', how='left')

In [None]:
print(len(icd_neg_unique))

1049


In [None]:
#negative cohort same size as positive
cohort_neg = icd_neg_unique.sample(n=1087, replace=False, random_state=1)
print(len(cohort_neg))

In [None]:
cohort_neg = cohort_neg.drop(columns=['ICD_Date', 'CodeType', 'ICD'])
cohort_neg.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle
0,114216511.0,2022-03-29 00:00:00,Notes_13612877798_6185656669_20220329.txt
1,115825493.0,2024-04-24 00:00:00,Notes_13680450872_10025641612_20240424.txt
2,122397082.0,2021-11-10 00:00:00,Notes_13719986847_8638533275_20211110.txt
3,117886365.0,2018-06-16 00:00:00,Notes_13378253243_1622441108_20180616.txt
4,112990024.0,2021-12-03 00:00:00,Notes_13592663582_7705422690_20211203.txt


In [None]:
# cohort_neg.to_csv('6_icd_minus.csv', index=False, header=True, sep=',', na_rep='NA')