In [44]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import os
import re
from datetime import timedelta


MGB

In [45]:
year = 2013
output_dir = '/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cohortReconstruction/CSVs'

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Read and concatenate all CSV files
all_data = []
for i in tqdm(range(0, 12)):
    df = pd.read_csv(f'/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/MGB/MGB_Deidentified_Notes_March12th2024/mgb_notes_{year + i}_metadata.csv')
    all_data.append(df)

# Concatenate all data into a single DataFrame
all_notes = pd.concat(all_data, axis=0, ignore_index=True)

# Sample 10,000 random notes
sampled_notes = all_notes.sample(frac=1, random_state=2024)
sampled_notes = sampled_notes.drop_duplicates(subset='BDSPPatientID')
sampled_notes = sampled_notes.sample(n=10000, random_state=2024)

# Save the sampled notes to a CSV file
sampled_notes.to_csv(os.path.join(output_dir, 'random_sampled_notes.csv'), index=False)

# Show number of total sampled notes
print(len(sampled_notes))

# Show example
sampled_notes.head()

100%|██████████| 12/12 [00:33<00:00,  2.76s/it]


10000


Unnamed: 0,BDSPPatientID,ContactDate,InpatientNoteTypeDSC,DeidentifiedName
9564516,112932836,20181103,Consults,Notes_13377519543_2592692819_20181103.txt
662403,119764569,20150420,Discharge Summary,Notes_13194883924_442003354_20150420.txt
25548921,113659421,20230725,Patient Instructions,Notes_13687733606_9228224427_20230725.txt
18776324,118767582,20210801,Assessment & Plan Note,Notes_13543524172_6416789708_20210801.txt
1459988,111655576,20160128,Telephone Encounter,Notes_13245063691_1170263719_20160128.txt


In [46]:
# change date column from str to datetime
sampled_notes.loc[:, 'ContactDate'] = pd.to_datetime(sampled_notes['ContactDate'], format='%Y%m%d')
sampled_notes.head()

['2018-11-03 00:00:00', '2015-04-20 00:00:00', '2023-07-25 00:00:00',
 '2021-08-01 00:00:00', '2016-01-28 00:00:00', '2014-04-28 00:00:00',
 '2023-04-12 00:00:00', '2021-09-16 00:00:00', '2021-09-12 00:00:00',
 '2018-03-23 00:00:00',
 ...
 '2017-04-01 00:00:00', '2022-06-20 00:00:00', '2015-12-24 00:00:00',
 '2020-09-11 00:00:00', '2014-04-22 00:00:00', '2018-07-09 00:00:00',
 '2018-04-05 00:00:00', '2021-11-28 00:00:00', '2014-08-11 00:00:00',
 '2017-08-29 00:00:00']
Length: 10000, dtype: datetime64[ns]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  sampled_notes.loc[:, 'ContactDate'] = pd.to_datetime(sampled_notes['ContactDate'], format='%Y%m%d')


Unnamed: 0,BDSPPatientID,ContactDate,InpatientNoteTypeDSC,DeidentifiedName
9564516,112932836,2018-11-03,Consults,Notes_13377519543_2592692819_20181103.txt
662403,119764569,2015-04-20,Discharge Summary,Notes_13194883924_442003354_20150420.txt
25548921,113659421,2023-07-25,Patient Instructions,Notes_13687733606_9228224427_20230725.txt
18776324,118767582,2021-08-01,Assessment & Plan Note,Notes_13543524172_6416789708_20210801.txt
1459988,111655576,2016-01-28,Telephone Encounter,Notes_13245063691_1170263719_20160128.txt


In [47]:
# Get unique patient IDs from the sampled notes
unique_patient_ids = set(sampled_notes['BDSPPatientID'])
print(len(unique_patient_ids))

10000


In [48]:
reader = ThunderReader('/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/MGB/thunderpack_icd_9_10_1m_MGB')
key_length = len(list(reader.keys()))
print(key_length)

511


In [49]:
# Only run once, then CSVs should be saved
# Skip this cell if CSVs is already full
output_dir = '/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cohortReconstruction/CSVs'
os.makedirs(output_dir, exist_ok=True)

for i in tqdm(range(1, key_length + 1)):
    df = reader[f'ICD_partition_{i}']
    df = df[df['BDSPPatientID'].isin(unique_patient_ids)]
    df.to_csv(os.path.join(output_dir, f'filtered_ICD_partition_{i}.csv'), index=False)

100%|██████████| 511/511 [24:23<00:00,  2.86s/it]


In [50]:

all_files = [os.path.join(output_dir, f) for f in os.listdir(output_dir) if f.startswith('filtered_ICD_partition_')]
ICD_df = pd.concat((pd.read_csv(f) for f in all_files), axis=0, ignore_index=True)
print(len(ICD_df))

3544714


In [51]:
ICD_df.head()

Unnamed: 0,BDSPEncounterID,EncounterLineNBR,BDSPPatientID,ShiftedContactDTS,ICDLineNBR,ICDCD,ICDDSC,DiagnosisNM,DiagnosisDSC,PrimaryDiagnosisFLG,DiagnosisChronicFLG,ShiftedUpdateDTS,DiagnosisLinkedProblemID,BDSPLastModifiedDTS,code_type
0,13384590000.0,1,116948270.0,2018-07-06 00:00:00.0000000,1.0,787.7,Abnormal feces,Abnormal stools,,Y,N,2022-11-11 19:55:00.0000000,54544730.0,2023-08-16 01:10:11.7310000,ICD
1,13267530000.0,1,115152158.0,2016-09-02 00:00:00.0000000,1.0,IMO0001,,Uncontrolled type 2 diabetes mellitus without ...,,Y,N,2022-06-19 17:36:00.0000000,,2023-08-15 18:16:15.9490000,ICD
2,13767760000.0,1,117357448.0,2022-10-21 00:00:00.0000000,1.0,780.39,Other convulsions,Seizure,,Y,N,2022-10-21 11:19:00.0000000,143078966.0,2023-08-16 06:22:54.5000000,ICD
3,13534730000.0,1,118512146.0,2021-02-02 00:00:00.0000000,1.0,202.80,"Other malignant lymphomas, unspecified site, e...","Diffuse large B-cell lymphoma, unspecified bod...",,Y,N,2021-02-02 15:15:00.0000000,95796118.0,2022-04-27 15:51:06.4400000,ICD
4,13649660000.0,1,122087242.0,2022-04-29 00:00:00.0000000,1.0,E888.9,Unspecified fall,"Fall in home, initial encounter",,N,N,2022-04-29 10:48:00.0000000,,2023-08-16 03:16:42.5390000,ICD


In [52]:
ICD_df['ShiftedContactDTS'] = pd.to_datetime(ICD_df['ShiftedContactDTS'])
print(ICD_df['ShiftedContactDTS'][2])
print(type(ICD_df['ShiftedContactDTS'][2]))

2022-10-21 00:00:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [55]:
# Regex pattern, simplified by removing redundant `(?I)`
code_regex = "^(I60|430).*"

def check_icd_in_period(id_, date):
    time_window = timedelta(days=30)

    # Filter the rows for the specific BDSPPatientID and within the 30-day window
    filtered_df = ICD_df[
        (ICD_df['BDSPPatientID'] == id_) &
        (abs(ICD_df['ShiftedContactDTS'] - date) <= time_window)
    ]
    
    # Check if any of the filtered rows match the ICD code pattern
    if filtered_df['ICDCD'].str.match(code_regex, flags=re.I).any():
        return 1
    return 0

# Apply the check_icd_in_period function to each row in sampled_notes
sampled_notes['ICD'] = sampled_notes.apply(lambda row: check_icd_in_period(row['BDSPPatientID'], row['ContactDate']), axis=1)

In [56]:
print(len(sampled_notes))
sampled_notes.head()

10000


Unnamed: 0,BDSPPatientID,ContactDate,InpatientNoteTypeDSC,DeidentifiedName,ICD
9564516,112932836,2018-11-03,Consults,Notes_13377519543_2592692819_20181103.txt,0
662403,119764569,2015-04-20,Discharge Summary,Notes_13194883924_442003354_20150420.txt,0
25548921,113659421,2023-07-25,Patient Instructions,Notes_13687733606_9228224427_20230725.txt,0
18776324,118767582,2021-08-01,Assessment & Plan Note,Notes_13543524172_6416789708_20210801.txt,0
1459988,111655576,2016-01-28,Telephone Encounter,Notes_13245063691_1170263719_20160128.txt,0


In [59]:
# Count the number of 1s and 0s in the ICDpos column
icd_counts = sampled_notes['ICD'].value_counts()

numPos = icd_counts.get(1, 0)
numNeg = icd_counts.get(0, 0)

posPrevalence = numPos/len(sampled_notes)
negPrevalence = numNeg/len(sampled_notes)

print('MGB')
print('ICD Positive Cases: ', numPos)
print('ICD Negative Cases: ', numNeg)
print('ICD Positive Prevalence: ', posPrevalence)
print('ICD Negative Prevalence: ', negPrevalence)

MGB
ICD Positive Cases:  85
ICD Negative Cases:  9915
ICD Positive Prevalence:  0.0085
ICD Negative Prevalence:  0.9915
