In [22]:
import pandas as pd
from thunderpack import ThunderReader


In [23]:
# Regex applied to DiagnosisCode when filtering the ICD partitions.
code_regex = "^S06"
# Year of the notes export; interpolated into the notes metadata CSV path.
year = "2023"

In [24]:
# Open the ThunderPack store from which the ICD partitions are read.
reader = ThunderReader('/home/cdac-c-8/Documents/ICD')
# Count the keys in the store without materialising them as a list.
# NOTE(review): the filtering loop treats this count as the number of
# 'ICD_partition_<i>' entries -- confirm the store holds no other keys.
key_length = sum(1 for _key in reader.keys())

In [25]:
# Walk the partitions (numbered from 1) and keep only the rows whose
# DiagnosisCode matches code_regex.
dfs = []
for part_no in range(1, key_length + 1):
    partition = reader[f'ICD_partition_{part_no}']
    # Cast to str before using the .str accessor, then build the row mask.
    code_hits = partition['DiagnosisCode'].astype(str).str.match(code_regex)
    dfs.append(partition[code_hits])

In [26]:
# Stack the per-partition matches row-wise into one frame with a fresh index.
filtered_icd_df = pd.concat(dfs, ignore_index=True)
# Report how many matching diagnosis rows were found, then preview them.
print(filtered_icd_df.shape[0])
filtered_icd_df.head()

17521


Unnamed: 0,BDSPPatientID,BDSPEncounterID,DiagnosisSequenceNumber,DiagnosisCode,DiagnosisPoaInd,DiagnosisCodeWithDots,ShortDescription,LongDescription,DiagnosisType,AdmissionDate,DischargeDate,BDSPLastModifiedDTS,code_type
0,150077892,133252741,8,S06300S,E,S06.300S,UNS FOCAL TBI NO LOC SEQ,"UNSP FOCAL TBI W/O LOSS OF CONSCIOUSNESS, SEQU...",ICD10,2016-01-09,2016-01-14,2023-07-18 11:57:37.2163870,ICD10
1,150065771,133252861,3,S060X9A,Y,S06.0X9A,CONCUSS LOC UNS DUR INIT,CONCUSSION W LOSS OF CONSCIOUSNESS OF UNSP DUR...,ICD10,2016-07-12,2016-07-13,2023-07-18 11:57:37.2163870,ICD10
2,150074748,133267050,1,S066X0A,Y,S06.6X0A,TRAUMATIC SA HEMORRHAGE,"TRAUM SUBRAC HEM W/O LOSS OF CONSCIOUSNESS, IN...",ICD10,2015-07-22,2015-07-23,2023-07-18 11:57:37.2163870,ICD10
3,150062827,133281540,1,S065X0A,Y,S06.5X0A,TRAUMAT SUBDURAL HEMOR N,"TRAUM SUBDR HEM W/O LOSS OF CONSCIOUSNESS, INI...",ICD10,2015-12-06,2015-12-09,2023-07-18 11:57:37.2163870,ICD10
4,150073380,133299208,1,S06300A,Y,S06.300A,UNS FOCAL TBI W/O LOC IN,"UNSP FOCAL TBI W/O LOSS OF CONSCIOUSNESS, INIT...",ICD10,2015-06-29,2015-06-30,2023-07-18 11:57:37.2163870,ICD10


In [27]:
# Load the notes metadata for the configured year.
notes_df = pd.read_csv(
    f'/home/cdac-c-8/Documents/notes/bidmc_notes_{year}/bidmc_notes_{year}_metadata.csv'
)
# Keep only the rows whose note type is exactly 'Initial note'.
is_initial_note = notes_df['NoteTypeFull'].eq('Initial note')
notes_df = notes_df.loc[is_initial_note]
print(notes_df.shape[0])
notes_df.head()

Unnamed: 0,BDSPPatientID,NoteTypeFull,Service,CreateDate,DeidentifiedName
0,150644433,Initial note,Cardiology,20230613,Notes_1130503252_1151799443_20230613.txt
1,150644433,Initial note,Spiritual Care,20230614,Notes_1130503252_1151799444_20230614.txt
2,150644433,Initial note,Infectious Disease,20230614,Notes_1130503252_1151799445_20230614.txt
3,150644433,Progress note,Nursing,20230614,Notes_1130503252_1151799446_20230614.txt
4,150644433,Progress note,General Medicine/Primary Care,20230614,Notes_1130503252_1151799447_20230614.txt


In [33]:
# Distinct note types present in notes_df when this cell runs.
# NOTE(review): notes_df is filtered to 'Initial note' where it is loaded, so
# a top-to-bottom run prints only that type; to list every type, run this
# against the unfiltered metadata.
noteTypes = {*notes_df['NoteTypeFull']}
print(noteTypes)

{'Op Report', 'Final/Sign Off', 'Procedure', 'Post Operative', 'Event', 'Telephone', 'Communication', 'Transfer', 'Team Meeting', 'Progress note', 'Initial note', 'Letter', 'Pre Operative'}


In [29]:
# Turn AdmissionDate into an integer YYYYMMDD key so it can be joined against
# the notes' CreateDate column in the merge cell.
admission_dates = filtered_icd_df['AdmissionDate']
# Idempotence guard: this cell overwrites the column in place. Once it holds
# integers, feeding them back through pd.to_datetime would read them as
# nanoseconds since the epoch and turn every date into 19700101, so only
# convert while the column is still non-numeric (strings or datetimes).
if not pd.api.types.is_numeric_dtype(admission_dates):
    filtered_icd_df['AdmissionDate'] = pd.to_numeric(
        pd.to_datetime(admission_dates).dt.strftime('%Y%m%d')
    )
filtered_icd_df.head()

Unnamed: 0,BDSPPatientID,BDSPEncounterID,DiagnosisSequenceNumber,DiagnosisCode,DiagnosisPoaInd,DiagnosisCodeWithDots,ShortDescription,LongDescription,DiagnosisType,AdmissionDate,DischargeDate,BDSPLastModifiedDTS,code_type
0,150077892,133252741,8,S06300S,E,S06.300S,UNS FOCAL TBI NO LOC SEQ,"UNSP FOCAL TBI W/O LOSS OF CONSCIOUSNESS, SEQU...",ICD10,20160109,2016-01-14,2023-07-18 11:57:37.2163870,ICD10
1,150065771,133252861,3,S060X9A,Y,S06.0X9A,CONCUSS LOC UNS DUR INIT,CONCUSSION W LOSS OF CONSCIOUSNESS OF UNSP DUR...,ICD10,20160712,2016-07-13,2023-07-18 11:57:37.2163870,ICD10
2,150074748,133267050,1,S066X0A,Y,S06.6X0A,TRAUMATIC SA HEMORRHAGE,"TRAUM SUBRAC HEM W/O LOSS OF CONSCIOUSNESS, IN...",ICD10,20150722,2015-07-23,2023-07-18 11:57:37.2163870,ICD10
3,150062827,133281540,1,S065X0A,Y,S06.5X0A,TRAUMAT SUBDURAL HEMOR N,"TRAUM SUBDR HEM W/O LOSS OF CONSCIOUSNESS, INI...",ICD10,20151206,2015-12-09,2023-07-18 11:57:37.2163870,ICD10
4,150073380,133299208,1,S06300A,Y,S06.300A,UNS FOCAL TBI W/O LOC IN,"UNSP FOCAL TBI W/O LOSS OF CONSCIOUSNESS, INIT...",ICD10,20150629,2015-06-30,2023-07-18 11:57:37.2163870,ICD10


In [30]:
# Inner-join diagnoses to notes: same patient, and the note's CreateDate
# equal to the diagnosis AdmissionDate.
merged_df = filtered_icd_df.merge(
    notes_df,
    how='inner',
    left_on=['BDSPPatientID', 'AdmissionDate'],
    right_on=['BDSPPatientID', 'CreateDate'],
)
# Pull out the note file names and the matching diagnosis descriptions.
filtered_notes, long_description = (
    merged_df['DeidentifiedName'],
    merged_df['LongDescription'],
)
print(len(filtered_notes))
print(filtered_notes)

2131
0        Notes_1129931751_228471823_20230528.txt
1       Notes_1129859087_8071100573_20230526.txt
2       Notes_1129859087_8071100574_20230526.txt
3       Notes_1129859087_8071100575_20230526.txt
4       Notes_1129859087_8071100576_20230526.txt
                          ...                   
2126     Notes_1129860217_261601159_20231226.txt
2127    Notes_1130093026_1390944355_20230816.txt
2128     Notes_1130092715_162606223_20230627.txt
2129    Notes_1129895423_1241537011_20230209.txt
2130     Notes_1130092715_273956932_20230716.txt
Name: DeidentifiedName, Length: 2131, dtype: object


In [31]:
import random

In [32]:
# Pick one merged row uniformly at random and show its note file name next to
# the diagnosis description, as a manual spot check.
num = random.randrange(len(filtered_notes))
# Both Series come from merged_df, so the same position refers to the same row.
print(filtered_notes.iloc[num])
print(long_description.iloc[num])


Notes_1129875278_3165726355_20230119.txt
TRAUMATIC BRAIN COMPRESSION WITHOUT HERNIATION, INIT                       
