In [1]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import re

In [2]:
# Open the BIDMC ICD thunderpack and print how many keys it contains.
bidmc_pack_path = '/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/BIDMC/thunderpack_icd_9_10_nax_1m_BIDMC'
reader = ThunderReader(bidmc_pack_path)
pack_keys = list(reader.keys())
key_length = len(pack_keys)
print(key_length)

36


In [3]:
# Walk keys ICD_partition_1 .. ICD_partition_<key_length> and keep only the rows
# whose dotted diagnosis code starts with I60 or 430 (case-insensitive).
code_regex = '^(I60|430)'
dfs = []
for i in tqdm(range(1, key_length + 1)):
    partition = reader[f'ICD_partition_{i}']
    # Cast to str first so non-string values cannot yield a NaN mask entry.
    codes = partition['DiagnosisCodeWithDots'].astype(str)
    is_match = codes.str.match(code_regex, flags=re.I)
    dfs.append(partition[is_match])

100%|██████████| 36/36 [00:30<00:00,  1.18it/s]


In [4]:
# Stack the per-partition matches into one frame with a fresh 0..n-1 index,
# print how many SAH-coded rows were found, then preview the first rows.
filtered_icd_df = pd.concat(dfs, ignore_index=True)
print(filtered_icd_df.shape[0])
filtered_icd_df.head()

4203


Unnamed: 0,BDSPPatientID,BDSPEncounterID,DiagnosisSequenceNumber,DiagnosisCode,DiagnosisPoaInd,DiagnosisCodeWithDots,ShortDescription,LongDescription,DiagnosisType,AdmissionDate,DischargeDate,BDSPLastModifiedDTS,code_type
0,150000004,132917345,1,430,Y,430,SUBARACHNOID HEMORRHAGE,SUBARACHNOID HEMORRHAGE ...,ICD9,2012-11-05,2012-12-14,2023-07-17 10:28:55.1290270,ICD9
1,150000137,133023864,1,430,Y,430,SUBARACHNOID HEMORRHAGE,SUBARACHNOID HEMORRHAGE ...,ICD9,2013-06-14,2013-07-24,2023-07-17 10:28:55.1290270,ICD9
2,150000004,557677029,1,430,Y,430,SUBARACHNOID HEMORRHAGE,SUBARACHNOID HEMORRHAGE ...,ICD9,2012-11-05,2012-11-05,2023-07-17 10:28:55.1290270,ICD9
3,150063730,558573586,1,430,Y,430,SUBARACHNOID HEMORRHAGE,SUBARACHNOID HEMORRHAGE ...,ICD9,2014-03-07,2014-03-07,2023-07-17 11:58:29.1092480,ICD9
4,150000004,922613611,3,430,,430,SUBARACHNOID HEMORRHAGE,SUBARACHNOID HEMORRHAGE ...,ICD9,2013-02-22,2013-02-22,2023-07-17 10:28:55.1290270,ICD9


In [5]:
# Narrow the frame to the three columns carried forward:
# patient ID, admission date and dotted diagnosis code.
keepColumns = ['BDSPPatientID', 'AdmissionDate', 'DiagnosisCodeWithDots']
clean_icd_df = filtered_icd_df.loc[:, keepColumns]
clean_icd_df.head()

Unnamed: 0,BDSPPatientID,AdmissionDate,DiagnosisCodeWithDots
0,150000004,2012-11-05,430
1,150000137,2013-06-14,430
2,150000004,2012-11-05,430
3,150063730,2014-03-07,430
4,150000004,2013-02-22,430


In [6]:
# Rename the BIDMC code/date columns to ICD and DateICD.
rename_dict = {'DiagnosisCodeWithDots': 'ICD', 'AdmissionDate': 'DateICD'}
bi_df = clean_icd_df.rename(rename_dict, axis='columns')
bi_df.head()

Unnamed: 0,BDSPPatientID,DateICD,ICD
0,150000004,2012-11-05,430
1,150000137,2013-06-14,430
2,150000004,2012-11-05,430
3,150063730,2014-03-07,430
4,150000004,2013-02-22,430


MGB

In [7]:
# Open the MGB ICD thunderpack (rebinding `reader`) and print how many keys it contains.
mgb_pack_path = '/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/MGB/thunderpack_icd_9_10_1m_MGB'
reader = ThunderReader(mgb_pack_path)
pack_keys = list(reader.keys())
key_length = len(pack_keys)
print(key_length)

511


In [8]:
# Walk keys ICD_partition_1 .. ICD_partition_<key_length> and keep only the rows
# whose ICDCD value starts with I60 or 430 (case-insensitive).
code_regex = '^(I60|430)'
dfs = []
for i in tqdm(range(1, key_length + 1)):
    partition = reader[f'ICD_partition_{i}']
    # Cast to str first so non-string values cannot yield a NaN mask entry.
    keep_mask = partition['ICDCD'].astype(str).str.match(code_regex, flags=re.I)
    dfs.append(partition.loc[keep_mask])

100%|██████████| 511/511 [25:08<00:00,  2.95s/it]


In [9]:
# Stack the per-partition matches into one frame with a fresh 0..n-1 index,
# print how many SAH-coded rows were found, then preview the first rows.
filtered_icd_df = pd.concat(dfs, ignore_index=True)
print(filtered_icd_df.shape[0])
filtered_icd_df.head()

41796


Unnamed: 0,BDSPEncounterID,EncounterLineNBR,BDSPPatientID,ShiftedContactDTS,ICDLineNBR,ICDCD,ICDDSC,DiagnosisNM,DiagnosisDSC,PrimaryDiagnosisFLG,DiagnosisChronicFLG,ShiftedUpdateDTS,DiagnosisLinkedProblemID,BDSPLastModifiedDTS,code_type
0,13437640000.0,2,116398048.0,2018-03-09 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,Subarachnoid hemorrhage,,N,N,2019-07-26 09:49:00.0000000,52214845.0,2022-04-27 13:27:03.6830000,ICD9
1,13394370000.0,2,119744866.0,2019-10-20 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,SAH (subarachnoid hemorrhage),,N,N,2023-04-28 12:05:00.0000000,81182497.0,2023-08-16 01:27:07.9010000,ICD9
2,13584730000.0,1,116790672.0,2022-05-13 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,SAH (subarachnoid hemorrhage),,N,N,2022-05-24 09:42:00.0000000,,2022-04-27 15:51:06.4400000,ICD9
3,13556260000.0,1,122243491.0,2020-06-12 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,Subarachnoid hemorrhage,,Y,N,2020-06-12 10:21:00.0000000,96911003.0,2022-04-27 13:21:20.6900000,ICD9
4,13544450000.0,5,119133865.0,2021-03-07 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,Subarachnoid hemorrhage,,N,N,2021-03-07 14:15:00.0000000,54423881.0,2022-04-27 14:08:48.7770000,ICD9


In [10]:
# Narrow the frame to the three columns carried forward:
# patient ID, shifted contact timestamp and ICD code.
keepColumns = ['BDSPPatientID', 'ShiftedContactDTS', 'ICDCD']
clean_icd_df = filtered_icd_df.loc[:, keepColumns]
clean_icd_df.head()

Unnamed: 0,BDSPPatientID,ShiftedContactDTS,ICDCD
0,116398048.0,2018-03-09 00:00:00.0000000,430
1,119744866.0,2019-10-20 00:00:00.0000000,430
2,116790672.0,2022-05-13 00:00:00.0000000,430
3,122243491.0,2020-06-12 00:00:00.0000000,430
4,119133865.0,2021-03-07 00:00:00.0000000,430


In [11]:
# Rename the MGB date/code columns to DateICD and ICD.
rename_dict = {'ShiftedContactDTS': 'DateICD', 'ICDCD': 'ICD'}
mgb_df = clean_icd_df.rename(rename_dict, axis='columns')
mgb_df.head()


Unnamed: 0,BDSPPatientID,DateICD,ICD
0,116398048.0,2018-03-09 00:00:00.0000000,430
1,119744866.0,2019-10-20 00:00:00.0000000,430
2,116790672.0,2022-05-13 00:00:00.0000000,430
3,122243491.0,2020-06-12 00:00:00.0000000,430
4,119133865.0,2021-03-07 00:00:00.0000000,430


In [14]:
# Write each site's table to its own CSV file, leaving out the row index.
site_exports = {
    'all_bidmc_icd_pos_dates.csv': bi_df,
    'all_mgb_icd_pos_dates.csv': mgb_df,
}
for csv_name, site_df in site_exports.items():
    site_df.to_csv(csv_name, index=False)

Combine

In [12]:
# Stack the BIDMC and MGB rows into a single frame with a fresh 0..n-1 index.
#
# In the previews, BIDMC patient IDs are integers while MGB patient IDs are
# floats (e.g. 116398048.0). A plain concat therefore upcasts the whole
# BDSPPatientID column to float64, and the combined CSV would contain IDs such
# as "150000004.0". Casting to pandas' nullable Int64 keeps IDs integral while
# still allowing missing values; a non-integral ID raises instead of being
# silently truncated.
#
# NOTE(review): DateICD also differs between the sites in the previews
# (date-only strings vs. full timestamp strings). It is left untouched here --
# confirm whether downstream consumers need a single format.
combined_df = (
    pd.concat([bi_df, mgb_df], ignore_index=True)
    .assign(
        BDSPPatientID=lambda frame: pd.to_numeric(frame['BDSPPatientID']).astype('Int64')
    )
)

In [13]:
# Save the combined table to CSV, leaving out the row index.
combined_csv_name = 'all_icd_pos_dates.csv'
combined_df.to_csv(combined_csv_name, index=False)