In [1]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import os
import re
from datetime import timedelta

In [2]:
year = 2010
output_dir = '/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cohortReconstruction/BIDMC/CSVs'

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Read and concatenate all CSV files
all_data = []
for i in tqdm(range(0, 12)):
    df = pd.read_csv(f'/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/BIDMC/BIDMC_Deidentified_Notes_March14th2024/bidmc_notes_{year + i}_metadata.csv')
    all_data.append(df)

# Concatenate all data into a single DataFrame
all_notes = pd.concat(all_data, axis=0, ignore_index=True)

# Sample 10,000 random notes
sampled_notes = all_notes.sample(frac=1, random_state=2024)
sampled_notes = sampled_notes.drop_duplicates(subset='BDSPPatientID')
sampled_notes = sampled_notes.sample(n=10000, random_state=2024)

# Save the sampled notes to a CSV file
sampled_notes.to_csv(os.path.join(output_dir, 'random_sampled_notes.csv'), index=False)

# Show number of total sampled notes
print(len(sampled_notes))

# Show example
sampled_notes.head()

100%|██████████| 12/12 [00:34<00:00,  2.88s/it]


10000


Unnamed: 0,BDSPPatientID,NoteTypeFull,Service,CreateDate,DeidentifiedName
17731536,150049399,Progress note,Nursing,20201002,Notes_1129907921_198649726_20201002.txt
8782162,151229866,Letter,Dermatology,20161218,Notes_1131088794_442709467_20161218.txt
8328323,150012216,Progress note,Case Management,20160716,Notes_1129870751_2819162312_20160716.txt
4344950,150720117,Progress note,Case Management,20140224,Notes_1130578531_1381358127_20140224.txt
1895263,150334304,Letter,Gastroenterology,20120929,Notes_1130193228_2601394523_20120929.txt


In [11]:
# clean up df, keep only relevant info
keepColumns = ['BDSPPatientID', 'CreateDate',  'DeidentifiedName']
sampled_notes = sampled_notes[keepColumns]
sampled_notes.head()

Unnamed: 0,BDSPPatientID,CreateDate,DeidentifiedName
17731536,150049399,20201002,Notes_1129907921_198649726_20201002.txt
8782162,151229866,20161218,Notes_1131088794_442709467_20161218.txt
8328323,150012216,20160716,Notes_1129870751_2819162312_20160716.txt
4344950,150720117,20140224,Notes_1130578531_1381358127_20140224.txt
1895263,150334304,20120929,Notes_1130193228_2601394523_20120929.txt


In [12]:
# rename columns
rename_dict = { 
    'CreateDate': 'NoteDate', 
    'DeidentifiedName': 'NoteTitle', 
}

sampled_notes = sampled_notes.rename(columns=rename_dict)

# change date column from str to datetime
sampled_notes.loc[:, 'NoteDate'] = pd.to_datetime(sampled_notes['NoteDate'], format='%Y%m%d')
sampled_notes.head()

['2020-10-02 00:00:00', '2016-12-18 00:00:00', '2016-07-16 00:00:00',
 '2014-02-24 00:00:00', '2012-09-29 00:00:00', '2014-05-06 00:00:00',
 '2021-09-16 00:00:00', '2014-04-22 00:00:00', '2014-09-09 00:00:00',
 '2020-12-23 00:00:00',
 ...
 '2015-08-15 00:00:00', '2020-04-09 00:00:00', '2018-07-29 00:00:00',
 '2015-06-25 00:00:00', '2017-06-27 00:00:00', '2017-06-25 00:00:00',
 '2020-05-04 00:00:00', '2018-04-10 00:00:00', '2017-05-05 00:00:00',
 '2017-01-07 00:00:00']
Length: 10000, dtype: datetime64[ns]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  sampled_notes.loc[:, 'NoteDate'] = pd.to_datetime(sampled_notes['NoteDate'], format='%Y%m%d')


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle
17731536,150049399,2020-10-02,Notes_1129907921_198649726_20201002.txt
8782162,151229866,2016-12-18,Notes_1131088794_442709467_20161218.txt
8328323,150012216,2016-07-16,Notes_1129870751_2819162312_20160716.txt
4344950,150720117,2014-02-24,Notes_1130578531_1381358127_20140224.txt
1895263,150334304,2012-09-29,Notes_1130193228_2601394523_20120929.txt


In [13]:
# Get unique patient IDs from the sampled notes
unique_patient_ids = set(sampled_notes['BDSPPatientID'])
print(len(unique_patient_ids))

10000


In [6]:
reader = ThunderReader('/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/BIDMC/thunderpack_icd_9_10_nax_1m_BIDMC')
key_length = len(list(reader.keys()))
print(key_length)

36


In [7]:
output_dir = '/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cohortReconstruction/BIDMC/CSVs'
os.makedirs(output_dir, exist_ok=True)

for i in tqdm(range(1, key_length + 1)):
    df = reader[f'ICD_partition_{i}']
    df = df[df['BDSPPatientID'].isin(unique_patient_ids)]
    df.to_csv(os.path.join(output_dir, f'filtered_ICD_partition_{i}.csv'), index=False)

100%|██████████| 36/36 [00:43<00:00,  1.21s/it]


In [8]:
all_files = [os.path.join(output_dir, f) for f in os.listdir(output_dir) if f.startswith('filtered_ICD_partition_')]
ICD_df = pd.concat((pd.read_csv(f) for f in all_files), axis=0, ignore_index=True)
print(len(ICD_df))
ICD_df.head()

794783


Unnamed: 0,BDSPPatientID,BDSPEncounterID,DiagnosisSequenceNumber,DiagnosisCode,DiagnosisPoaInd,DiagnosisCodeWithDots,ShortDescription,LongDescription,DiagnosisType,AdmissionDate,DischargeDate,BDSPLastModifiedDTS,code_type
0,151037991,1051842220,1,F4322,,F43.22,ADJUSTMENT DISORDER WIT,ADJUSTMENT DISORDER WITH ANXIETY ...,ICD10,2017-05-18,2017-05-18,2023-07-19 19:09:12.4626110,ICD10
1,151037991,1051842220,2,K140,,K14.0,GLOSSITIS,GLOSSITIS ...,ICD10,2017-05-18,2017-05-18,2023-07-19 19:09:12.4626110,ICD10
2,150056604,1050860700,1,R51,,R51,HEADACHE,HEADACHE ...,ICD10,2016-12-23,2016-12-23,2023-07-18 11:44:35.2589490,ICD10
3,150056604,1050860700,2,M5481,,M54.81,OCCIPITAL NEURALGIA,OCCIPITAL NEURALGIA ...,ICD10,2016-12-23,2016-12-23,2023-07-18 11:44:35.2589490,ICD10
4,151035244,1054296240,1,I82A11,,I82.A11,ACUTE EMBO THROMB RT AXI,ACUTE EMBOLISM AND THROMBOSIS OF RIGHT AXILLAR...,ICD10,2022-08-25,2022-08-25,2023-07-19 19:09:12.4626110,ICD10


In [9]:
ICD_df['AdmissionDate'] = pd.to_datetime(ICD_df['AdmissionDate'])
print(ICD_df['AdmissionDate'][2])
print(type(ICD_df['AdmissionDate'][2]))

2016-12-23 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [14]:
# Regex pattern, simplified by removing redundant `(?I)`
code_regex = "^(I60|430).*"

def check_icd_in_period(id_, date):
    time_window = timedelta(days=30)

    # Filter the rows for the specific BDSPPatientID and within the 30-day window
    filtered_df = ICD_df[
        (ICD_df['BDSPPatientID'] == id_) &
        (abs(ICD_df['AdmissionDate'] - date) <= time_window)
    ]
    
    # Check if any of the filtered rows match the ICD code pattern
    if filtered_df['DiagnosisCodeWithDots'].str.match(code_regex, flags=re.I).any():
        return 1
    return 0

# Apply the check_icd_in_period function to each row in sampled_notes
sampled_notes['ICD'] = sampled_notes.apply(lambda row: check_icd_in_period(row['BDSPPatientID'], row['NoteDate']), axis=1)

In [15]:
print(len(sampled_notes))
sampled_notes.head()

10000


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,ICD
17731536,150049399,2020-10-02,Notes_1129907921_198649726_20201002.txt,0
8782162,151229866,2016-12-18,Notes_1131088794_442709467_20161218.txt,0
8328323,150012216,2016-07-16,Notes_1129870751_2819162312_20160716.txt,0
4344950,150720117,2014-02-24,Notes_1130578531_1381358127_20140224.txt,0
1895263,150334304,2012-09-29,Notes_1130193228_2601394523_20120929.txt,0


In [17]:
# Count the number of 1s and 0s in the ICDpos column
icd_counts = sampled_notes['ICD'].value_counts()

numPos = icd_counts.get(1, 0)
numNeg = icd_counts.get(0, 0)

prevalence = numPos/len(sampled_notes)


print('BI')
print('ICD Positive Cases: ', numPos)
print('ICD Negative Cases: ', numNeg)
print('Prevalence: ', prevalence)

BI
ICD Positive Cases:  21
ICD Negative Cases:  9979
Prevalence:  0.0021
