In [52]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import re
import glob

# Create Feature Matrix Format
- include info from cohort, add annotations column
- add icd feature which includes those with pos ICD +/- 6 months of Note Date

In [53]:
# create basic frame for feature matrix
# create list of all patient IDs that were used in annotation
cohort = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/test&trainCohorts/combined_cohort_train.csv')

# read in annotations
annotations = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/annotationTools/cleanAnnotations/full_annotations_final.csv')

print(len(cohort))
print(len(annotations))

1548
3096


In [54]:
startMatrix = cohort.merge(annotations, left_on='BDSPPatientID', right_on='empi', how='inner')
print(len(startMatrix))
startMatrix.head()

1548


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,empi,annot
0,117032881,2021-09-29 00:00:00,Notes_13689094716_7824448998_20210929.txt,Physician ***** ***** Admit date: ****...,117032881,0
1,120402560,2021-09-01,Notes_13598482458_7493034182_20210901.txt,Physician ***** ***** Admit date: ****...,120402560,0
2,111454037,2023-11-15,Notes_13666481048_10665439216_20231115.txt,Discharge Summary Name: ***** *****...,111454037,0
3,121582882,2018-10-10 00:00:00,Notes_13329742924_1945106714_20181010.txt,Physician ***** ***** Admit date: ****...,121582882,0
4,111678728,2016-12-29 00:00:00,Notes_13278714866_1420969733_20161229.txt,Physician ***** ***** Admit date: ****...,111678728,0


In [55]:
matrix = startMatrix.drop(columns=['empi'])
matrix.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,annot
0,117032881,2021-09-29 00:00:00,Notes_13689094716_7824448998_20210929.txt,Physician ***** ***** Admit date: ****...,0
1,120402560,2021-09-01,Notes_13598482458_7493034182_20210901.txt,Physician ***** ***** Admit date: ****...,0
2,111454037,2023-11-15,Notes_13666481048_10665439216_20231115.txt,Discharge Summary Name: ***** *****...,0
3,121582882,2018-10-10 00:00:00,Notes_13329742924_1945106714_20181010.txt,Physician ***** ***** Admit date: ****...,0
4,111678728,2016-12-29 00:00:00,Notes_13278714866_1420969733_20161229.txt,Physician ***** ***** Admit date: ****...,0


## MGB ICDs

In [10]:
path = '/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cohortExtractionPipeline/MGB/CSVs/'

patientIDs = matrix['BDSPPatientID']
print(len(patientIDs))

1548


In [11]:
csv_files = glob.glob(path + "filtered_ICD_partition_*.csv")

In [12]:
filtered_dfs = []

for file in tqdm(csv_files):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file)
    
    # Filter the DataFrame to include only rows with matching BDSPPatientID
    filtered_df = df[df['BDSPPatientID'].isin(patientIDs)]
    
    # Append the filtered DataFrame to the list
    filtered_dfs.append(filtered_df)

# Concatenate all filtered DataFrames into one
relevantPatientICDs = pd.concat(filtered_dfs, ignore_index=True)

relevantPatientICDs.head()

100%|██████████| 511/511 [01:48<00:00,  4.71it/s]


Unnamed: 0,BDSPEncounterID,EncounterLineNBR,BDSPPatientID,ShiftedContactDTS,ICDLineNBR,ICDCD,ICDDSC,DiagnosisNM,DiagnosisDSC,PrimaryDiagnosisFLG,DiagnosisChronicFLG,ShiftedUpdateDTS,DiagnosisLinkedProblemID,BDSPLastModifiedDTS,code_type
0,13567050000.0,1,117562273.0,2023-07-11 00:00:00.0000000,1.0,351.9,"Facial nerve disorder, unspecified",Facial nerve disorder,,Y,N,2023-07-11 20:10:00.0000000,,2022-09-25 07:01:48.2070000,ICD
1,13280630000.0,2,122057217.0,2018-02-26 00:00:00.0000000,1.0,787.20,"Dysphagia, unspecified","Dysphagia, unspecified type",,N,N,2023-11-07 17:34:00.0000000,41582426.0,2023-08-15 18:43:12.0060000,ICD
2,13519680000.0,1,118601252.0,2021-04-30 00:00:00.0000000,1.0,V65.49,Other specified counseling,Counseling and coordination of care,,Y,N,2021-05-01 11:00:00.0000000,27553194.0,2022-04-27 14:08:48.7770000,ICD
3,13420680000.0,1,118129222.0,2019-08-21 00:00:00.0000000,2.0,571.5,Cirrhosis of liver without mention of alcohol,Liver cirrhosis secondary to nonalcoholic stea...,,N,N,2019-09-02 15:07:00.0000000,76499212.0,2022-04-27 12:58:36.0030000,ICD
4,13292480000.0,1,120791202.0,2016-11-10 00:00:00.0000000,1.0,401.9,Unspecified essential hypertension,Essential hypertension,,Y,N,2022-04-24 20:18:00.0000000,20902954.0,2023-08-15 18:01:15.4060000,ICD


In [13]:
# Save the combined DataFrame to a new CSV file
relevantPatientICDs.to_csv("MGBrelevantPatientICDs.csv", index=False)

## BIDMC ICDs

In [14]:
path = '/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cohortExtractionPipeline/BIDMC/CSVs/'

patientIDs = matrix['BDSPPatientID']
print(len(patientIDs))

1548


In [15]:
csv_files = glob.glob(path + "filtered_ICD_partition_*.csv")

In [16]:
filtered_dfs = []

for file in tqdm(csv_files):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file)
    
    # Filter the DataFrame to include only rows with matching BDSPPatientID
    filtered_df = df[df['BDSPPatientID'].isin(patientIDs)]
    
    # Append the filtered DataFrame to the list
    filtered_dfs.append(filtered_df)

# Concatenate all filtered DataFrames into one
relevantPatientICDs = pd.concat(filtered_dfs, ignore_index=True)

relevantPatientICDs.head()

  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
100%|██████████| 36/36 [01:12<00:00,  2.02s/it]


Unnamed: 0,BDSPPatientID,BDSPEncounterID,DiagnosisSequenceNumber,DiagnosisCode,DiagnosisPoaInd,DiagnosisCodeWithDots,ShortDescription,LongDescription,DiagnosisType,AdmissionDate,DischargeDate,BDSPLastModifiedDTS,code_type
0,151039569,1051843626,1,C50311,,C50.311,MAL NEO LW-INNER QUAD RT,MALIG NEOPLM OF LOWER-INNER QUADRANT OF RIGHT ...,ICD10,2016-11-21,2016-11-21,2023-07-19 19:09:12.4626110,ICD10
1,151041279,1052728849,1,C7951,,C79.51,SECONDARY MALIGNANT NEOP,SECONDARY MALIGNANT NEOPLASM OF BONE ...,ICD10,2018-07-01,2018-07-01,2023-07-19 19:10:23.4213520,ICD10
2,151041279,1052728849,2,G939,,G93.9,DISORDER OF BRAIN UNSPEC,"DISORDER OF BRAIN, UNSPECIFIED ...",ICD10,2018-07-01,2018-07-01,2023-07-19 19:10:23.4213520,ICD10
3,151039569,1051845284,1,C50311,,C50.311,MAL NEO LW-INNER QUAD RT,MALIG NEOPLM OF LOWER-INNER QUADRANT OF RIGHT ...,ICD10,2016-11-22,2016-11-22,2023-07-19 19:09:12.4626110,ICD10
4,151041279,1052733141,1,C7951,,C79.51,SECONDARY MALIGNANT NEOP,SECONDARY MALIGNANT NEOPLASM OF BONE ...,ICD10,2018-07-03,2018-07-03,2023-07-19 19:10:23.4213520,ICD10


In [17]:
# Save the combined DataFrame to a new CSV file
relevantPatientICDs.to_csv("BIDMCrelevantPatientICDs.csv", index=False)

# Create new csv that filters by ICD code
### MGB First

In [19]:
MGBrelevantPatientICDs = pd.read_csv("MGBrelevantPatientICDs.csv")
BIDMCrelevantPatientICDs = pd.read_csv("BIDMCrelevantPatientICDs.csv")

In [22]:
code_regex = '^(I60|430)'

MGBrelevantPatientICDs = MGBrelevantPatientICDs.dropna(subset=['ICDCD'])

allPosICDsMGB = MGBrelevantPatientICDs[MGBrelevantPatientICDs['ICDCD'].str.match(code_regex)]

allPosICDsMGB.head()



Unnamed: 0,BDSPEncounterID,EncounterLineNBR,BDSPPatientID,ShiftedContactDTS,ICDLineNBR,ICDCD,ICDDSC,DiagnosisNM,DiagnosisDSC,PrimaryDiagnosisFLG,DiagnosisChronicFLG,ShiftedUpdateDTS,DiagnosisLinkedProblemID,BDSPLastModifiedDTS,code_type
217,13721230000.0,4,121501383.0,2022-07-03 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,Subarachnoid hemorrhage,,N,N,2022-07-04 23:08:00.0000000,127299861.0,2022-09-15 07:00:26.8080000,ICD
225,13614300000.0,2,113797450.0,2021-06-22 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,Ruptured (congenital) cerebral aneurysm,,N,N,2021-06-22 18:14:00.0000000,112494133.0,2022-06-13 17:43:46.0660000,ICD
251,13543920000.0,3,111244917.0,2021-02-07 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,SAH (subarachnoid hemorrhage),,N,N,2021-02-07 17:13:00.0000000,85747190.0,2022-04-27 13:40:53.7930000,ICD
484,13685260000.0,1,116587087.0,2022-10-20 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,SAH (subarachnoid hemorrhage),,Y,N,2022-11-17 13:20:00.0000000,124551050.0,2022-06-13 17:47:33.6620000,ICD
525,13440950000.0,2,113252439.0,2018-04-04 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,Subarachnoid hemorrhage,,N,N,2022-08-25 19:37:00.0000000,54503085.0,2023-08-16 01:13:14.4420000,ICD


In [23]:
BIDMCrelevantPatientICDs = BIDMCrelevantPatientICDs.dropna(subset=['DiagnosisCodeWithDots'])

allPosICDsBIDMC = BIDMCrelevantPatientICDs[BIDMCrelevantPatientICDs['DiagnosisCodeWithDots'].str.match(code_regex)]

allPosICDsBIDMC.head()

Unnamed: 0,BDSPPatientID,BDSPEncounterID,DiagnosisSequenceNumber,DiagnosisCode,DiagnosisPoaInd,DiagnosisCodeWithDots,ShortDescription,LongDescription,DiagnosisType,AdmissionDate,DischargeDate,BDSPLastModifiedDTS,code_type
534,151055839,134384936,1,I6012,Y,I60.12,NONTRAUM SA HEMOR LT MID,NTRM SUBARACH HEMORRHAGE FROM LEFT MIDDLE CERE...,ICD10,2018-05-18,2018-06-05,2023-07-19 19:10:23.4213520,ICD10
616,151053565,134426070,2,I609,N,I60.9,NONTRAUMATIC SUBARACH HE,"NONTRAUMATIC SUBARACHNOID HEMORRHAGE, UNSPECIF...",ICD10,2019-01-30,2019-02-11,2023-07-19 19:10:23.4213520,ICD10
758,151066006,134656049,1,I604,Y,I60.4,NONTRAUM SA HEMORR BASIL,NONTRAUMATIC SUBARACHNOID HEMORRHAGE FROM BASI...,ICD10,2021-03-30,2021-04-11,2023-07-19 19:11:59.3749140,ICD10
930,151055839,984158717,1,I609,,I60.9,NONTRAUMATIC SUBARACH HE,"NONTRAUMATIC SUBARACHNOID HEMORRHAGE, UNSPECIF...",ICD10,2018-09-16,2018-09-16,2023-07-19 19:10:23.4213520,ICD10
978,151055839,982329054,1,I609,,I60.9,NONTRAUMATIC SUBARACH HE,"NONTRAUMATIC SUBARACHNOID HEMORRHAGE, UNSPECIF...",ICD10,2018-07-10,2018-07-10,2023-07-19 19:10:23.4213520,ICD10


In [56]:
allPosICDsMGB.to_csv("allPosICDsMGB.csv", index=False)
allPosICDsBIDMC.to_csv("allPosICDsBIDMC.csv", index=False)

## Now adding feature for ICD code received 6 months +/- the NoteDate 

In [57]:
# Rename the date columns to a common name
allPosICDsMGB = allPosICDsMGB.rename(columns={'ShiftedContactDTS': 'ICDDate'})
allPosICDsBIDMC = allPosICDsBIDMC.rename(columns={'AdmissionDate': 'ICDDate'})

In [58]:
allPosICDsMGB.head()

Unnamed: 0,BDSPEncounterID,EncounterLineNBR,BDSPPatientID,ICDDate,ICDLineNBR,ICDCD,ICDDSC,DiagnosisNM,DiagnosisDSC,PrimaryDiagnosisFLG,DiagnosisChronicFLG,ShiftedUpdateDTS,DiagnosisLinkedProblemID,BDSPLastModifiedDTS,code_type
217,13721230000.0,4,121501383.0,2022-07-03,1.0,430,Subarachnoid hemorrhage,Subarachnoid hemorrhage,,N,N,2022-07-04 23:08:00.0000000,127299861.0,2022-09-15 07:00:26.8080000,ICD
225,13614300000.0,2,113797450.0,2021-06-22,1.0,430,Subarachnoid hemorrhage,Ruptured (congenital) cerebral aneurysm,,N,N,2021-06-22 18:14:00.0000000,112494133.0,2022-06-13 17:43:46.0660000,ICD
251,13543920000.0,3,111244917.0,2021-02-07,1.0,430,Subarachnoid hemorrhage,SAH (subarachnoid hemorrhage),,N,N,2021-02-07 17:13:00.0000000,85747190.0,2022-04-27 13:40:53.7930000,ICD
484,13685260000.0,1,116587087.0,2022-10-20,1.0,430,Subarachnoid hemorrhage,SAH (subarachnoid hemorrhage),,Y,N,2022-11-17 13:20:00.0000000,124551050.0,2022-06-13 17:47:33.6620000,ICD
525,13440950000.0,2,113252439.0,2018-04-04,1.0,430,Subarachnoid hemorrhage,Subarachnoid hemorrhage,,N,N,2022-08-25 19:37:00.0000000,54503085.0,2023-08-16 01:13:14.4420000,ICD


In [59]:
allPosICDsBIDMC.head()

Unnamed: 0,BDSPPatientID,BDSPEncounterID,DiagnosisSequenceNumber,DiagnosisCode,DiagnosisPoaInd,DiagnosisCodeWithDots,ShortDescription,LongDescription,DiagnosisType,ICDDate,DischargeDate,BDSPLastModifiedDTS,code_type
534,151055839,134384936,1,I6012,Y,I60.12,NONTRAUM SA HEMOR LT MID,NTRM SUBARACH HEMORRHAGE FROM LEFT MIDDLE CERE...,ICD10,2018-05-18,2018-06-05,2023-07-19 19:10:23.4213520,ICD10
616,151053565,134426070,2,I609,N,I60.9,NONTRAUMATIC SUBARACH HE,"NONTRAUMATIC SUBARACHNOID HEMORRHAGE, UNSPECIF...",ICD10,2019-01-30,2019-02-11,2023-07-19 19:10:23.4213520,ICD10
758,151066006,134656049,1,I604,Y,I60.4,NONTRAUM SA HEMORR BASIL,NONTRAUMATIC SUBARACHNOID HEMORRHAGE FROM BASI...,ICD10,2021-03-30,2021-04-11,2023-07-19 19:11:59.3749140,ICD10
930,151055839,984158717,1,I609,,I60.9,NONTRAUMATIC SUBARACH HE,"NONTRAUMATIC SUBARACHNOID HEMORRHAGE, UNSPECIF...",ICD10,2018-09-16,2018-09-16,2023-07-19 19:10:23.4213520,ICD10
978,151055839,982329054,1,I609,,I60.9,NONTRAUMATIC SUBARACH HE,"NONTRAUMATIC SUBARACHNOID HEMORRHAGE, UNSPECIF...",ICD10,2018-07-10,2018-07-10,2023-07-19 19:10:23.4213520,ICD10


In [71]:
allPosICDsMGB['ICDDate'] = pd.to_datetime(allPosICDsMGB['ICDDate'])
allPosICDsBIDMC['ICDDate'] = pd.to_datetime(allPosICDsBIDMC['ICDDate'])

In [72]:
icd_df = pd.concat([allPosICDsMGB, allPosICDsBIDMC], ignore_index=True)

In [73]:
matrix = startMatrix

In [74]:
def parse_dates(date_str):
    for fmt in ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
        try:
            return pd.to_datetime(date_str, format=fmt)
        except ValueError:
            continue
    # If all formats fail, return NaT
    return pd.NaT

# Apply the custom function to convert dates
matrix['NoteDate'] = matrix['NoteDate'].apply(parse_dates)
matrix.head()


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,empi,annot
0,117032881,2021-09-29,Notes_13689094716_7824448998_20210929.txt,Physician ***** ***** Admit date: ****...,117032881,0
1,120402560,2021-09-01,Notes_13598482458_7493034182_20210901.txt,Physician ***** ***** Admit date: ****...,120402560,0
2,111454037,2023-11-15,Notes_13666481048_10665439216_20231115.txt,Discharge Summary Name: ***** *****...,111454037,0
3,121582882,2018-10-10,Notes_13329742924_1945106714_20181010.txt,Physician ***** ***** Admit date: ****...,121582882,0
4,111678728,2016-12-29,Notes_13278714866_1420969733_20161229.txt,Physician ***** ***** Admit date: ****...,111678728,0


In [75]:
matrix = matrix.drop(columns=['empi'])
matrix.head()


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,annot
0,117032881,2021-09-29,Notes_13689094716_7824448998_20210929.txt,Physician ***** ***** Admit date: ****...,0
1,120402560,2021-09-01,Notes_13598482458_7493034182_20210901.txt,Physician ***** ***** Admit date: ****...,0
2,111454037,2023-11-15,Notes_13666481048_10665439216_20231115.txt,Discharge Summary Name: ***** *****...,0
3,121582882,2018-10-10,Notes_13329742924_1945106714_20181010.txt,Physician ***** ***** Admit date: ****...,0
4,111678728,2016-12-29,Notes_13278714866_1420969733_20161229.txt,Physician ***** ***** Admit date: ****...,0


In [76]:
matrix['NoteDate'] = pd.to_datetime(matrix['NoteDate'])

In [77]:
def icd_received_within_6_months(row, icd_df):
    patient_id = row['BDSPPatientID']
    note_date = row['NoteDate']
    start_date = note_date - pd.DateOffset(months=6)
    end_date = note_date + pd.DateOffset(months=6)
    
    patient_icd_df = icd_df[icd_df['BDSPPatientID'] == patient_id]
    return 1 if ((patient_icd_df['ICDDate'] >= start_date) & 
            (patient_icd_df['ICDDate'] <= end_date)).any() else 0

# Apply the function to each row in the matrix
matrix['ICD'] = matrix.apply(icd_received_within_6_months, axis=1, icd_df=icd_df)

matrix.head()


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,annot,ICD
0,117032881,2021-09-29,Notes_13689094716_7824448998_20210929.txt,Physician ***** ***** Admit date: ****...,0,0
1,120402560,2021-09-01,Notes_13598482458_7493034182_20210901.txt,Physician ***** ***** Admit date: ****...,0,0
2,111454037,2023-11-15,Notes_13666481048_10665439216_20231115.txt,Discharge Summary Name: ***** *****...,0,0
3,121582882,2018-10-10,Notes_13329742924_1945106714_20181010.txt,Physician ***** ***** Admit date: ****...,0,0
4,111678728,2016-12-29,Notes_13278714866_1420969733_20161229.txt,Physician ***** ***** Admit date: ****...,0,0


In [80]:
matrix.to_csv('matrix_ICD_feature_training.csv', index=False)