In [1]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import re
import glob

# Create Feature Matrix Format
- include info from cohort, add annotations column
- add icd feature which includes those with pos ICD +/- 6 months of Note Date

### Combined Cohort
 - run this for combined hospital cohort

In [2]:
# create basic frame for feature matrix
# create list of all patient IDs that were used in annotation
cohort = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/test&trainCohorts/combined_cohort_test.csv')

# read in annotations
annotations = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/annotationTools/cleanAnnotations/full_annotations_final.csv')

print(len(cohort))
print(len(annotations))

1548
3096


In [3]:
startMatrix = cohort.merge(annotations, left_on='BDSPPatientID', right_on='empi', how='inner')
print(len(startMatrix))
startMatrix.head()

1548


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,hospital,empi,annot
0,115883980,2018-04-14,Notes_13414009311_1956375582_20180414.txt,Physician ***** ***** Admit date: ****...,MGB,115883980,1
1,116483510,2022-05-12 00:00:00,Notes_13622415618_8165687294_20220512.txt,Physician ***** ***** Admit date: ****...,MGB,116483510,0
2,150009858,2013-08-29,Notes_1129868316_2608699887_20130829.txt,\n\nNote Date: *****/*****/*****\n\nNote Type:...,BIDMC,150009858,1
3,150009896,2022-02-04,Notes_1129868625_2609280946_20220204.txt,\n\nNote Date: *****/*****/***** Time: 0813\n...,BIDMC,150009896,1
4,116023288,2022-01-03 00:00:00,Notes_13528991310_6365444126_20220103.txt,***** ***** ***** Medical Psychiatry D...,MGB,116023288,0


In [4]:
matrix = startMatrix.drop(columns=['empi'])
matrix.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,hospital,annot
0,115883980,2018-04-14,Notes_13414009311_1956375582_20180414.txt,Physician ***** ***** Admit date: ****...,MGB,1
1,116483510,2022-05-12 00:00:00,Notes_13622415618_8165687294_20220512.txt,Physician ***** ***** Admit date: ****...,MGB,0
2,150009858,2013-08-29,Notes_1129868316_2608699887_20130829.txt,\n\nNote Date: *****/*****/*****\n\nNote Type:...,BIDMC,1
3,150009896,2022-02-04,Notes_1129868625_2609280946_20220204.txt,\n\nNote Date: *****/*****/***** Time: 0813\n...,BIDMC,1
4,116023288,2022-01-03 00:00:00,Notes_13528991310_6365444126_20220103.txt,***** ***** ***** Medical Psychiatry D...,MGB,0


## MGB ICDs

In [5]:
path = '/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cohortExtractionPipeline/MGB/CSVs/'

patientIDs = matrix['BDSPPatientID']
print(len(patientIDs))

1548


In [6]:
csv_files = glob.glob(path + "filtered_ICD_partition_*.csv")

In [7]:
filtered_dfs = []

for file in tqdm(csv_files):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file)
    
    # Filter the DataFrame to include only rows with matching BDSPPatientID
    filtered_df = df[df['BDSPPatientID'].isin(patientIDs)]
    
    # Append the filtered DataFrame to the list
    filtered_dfs.append(filtered_df)

# Concatenate all filtered DataFrames into one
relevantPatientICDs = pd.concat(filtered_dfs, ignore_index=True)

relevantPatientICDs.head()

100%|██████████| 511/511 [02:40<00:00,  3.19it/s]


Unnamed: 0,BDSPEncounterID,EncounterLineNBR,BDSPPatientID,ShiftedContactDTS,ICDLineNBR,ICDCD,ICDDSC,DiagnosisNM,DiagnosisDSC,PrimaryDiagnosisFLG,DiagnosisChronicFLG,ShiftedUpdateDTS,DiagnosisLinkedProblemID,BDSPLastModifiedDTS,code_type
0,13442640000.0,3,118526681.0,2020-10-03 00:00:00.0000000,1.0,486.0,"Pneumonia, organism unspecified",Pneumonia of both lower lobes due to infectiou...,,N,N,2020-10-23 14:16:00.0000000,86181325.0,2022-04-27 13:04:38.9700000,ICD
1,13406130000.0,2,111886016.0,2018-12-31 00:00:00.0000000,1.0,272.4,Other and unspecified hyperlipidemia,"Hyperlipidemia, unspecified hyperlipidemia type",,N,N,2018-12-31 13:13:00.0000000,30182803.0,2022-04-27 13:27:03.6830000,ICD
2,13567050000.0,1,117562273.0,2023-07-11 00:00:00.0000000,1.0,351.9,"Facial nerve disorder, unspecified",Facial nerve disorder,,Y,N,2023-07-11 20:10:00.0000000,,2022-09-25 07:01:48.2070000,ICD
3,13458340000.0,1,119565757.0,2020-01-06 00:00:00.0000000,1.0,854.06,Intracranial injury of other and unspecified n...,Traumatic brain injury with loss of consciousn...,,Y,N,2020-01-06 13:13:00.0000000,79494062.0,2022-04-27 15:51:06.4400000,ICD
4,13280630000.0,2,122057217.0,2018-02-26 00:00:00.0000000,1.0,787.2,"Dysphagia, unspecified","Dysphagia, unspecified type",,N,N,2023-11-07 17:34:00.0000000,41582426.0,2023-08-15 18:43:12.0060000,ICD


In [8]:
# Save the combined DataFrame to a new CSV file
relevantPatientICDs.to_csv("helperCSVs/MGBrelevantPatientICDs.csv", index=False)

## BIDMC ICDs

In [10]:
path = '/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cohortExtractionPipeline/BIDMC/CSVs/'

patientIDs = matrix['BDSPPatientID']
print(len(patientIDs))

1548


In [11]:
csv_files = glob.glob(path + "filtered_ICD_partition_*.csv")

In [12]:
filtered_dfs = []

for file in tqdm(csv_files):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file)
    
    # Filter the DataFrame to include only rows with matching BDSPPatientID
    filtered_df = df[df['BDSPPatientID'].isin(patientIDs)]
    
    # Append the filtered DataFrame to the list
    filtered_dfs.append(filtered_df)

# Concatenate all filtered DataFrames into one
relevantPatientICDs = pd.concat(filtered_dfs, ignore_index=True)

relevantPatientICDs.head()

  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
100%|██████████| 36/36 [01:47<00:00,  2.99s/it]


Unnamed: 0,BDSPPatientID,BDSPEncounterID,DiagnosisSequenceNumber,DiagnosisCode,DiagnosisPoaInd,DiagnosisCodeWithDots,ShortDescription,LongDescription,DiagnosisType,AdmissionDate,DischargeDate,BDSPLastModifiedDTS,code_type
0,151039569,1051843626,1,C50311,,C50.311,MAL NEO LW-INNER QUAD RT,MALIG NEOPLM OF LOWER-INNER QUADRANT OF RIGHT ...,ICD10,2016-11-21,2016-11-21,2023-07-19 19:09:12.4626110,ICD10
1,151039569,1051845284,1,C50311,,C50.311,MAL NEO LW-INNER QUAD RT,MALIG NEOPLM OF LOWER-INNER QUADRANT OF RIGHT ...,ICD10,2016-11-22,2016-11-22,2023-07-19 19:09:12.4626110,ICD10
2,151039569,1051848011,1,C50311,,C50.311,MAL NEO LW-INNER QUAD RT,MALIG NEOPLM OF LOWER-INNER QUADRANT OF RIGHT ...,ICD10,2016-11-23,2016-11-23,2023-07-19 19:09:12.4626110,ICD10
3,151039569,1051849298,1,C50311,,C50.311,MAL NEO LW-INNER QUAD RT,MALIG NEOPLM OF LOWER-INNER QUADRANT OF RIGHT ...,ICD10,2016-11-26,2016-11-26,2023-07-19 19:09:12.4626110,ICD10
4,151039569,1051850966,1,C50311,,C50.311,MAL NEO LW-INNER QUAD RT,MALIG NEOPLM OF LOWER-INNER QUADRANT OF RIGHT ...,ICD10,2016-11-27,2016-11-27,2023-07-19 19:09:12.4626110,ICD10


In [13]:
# Save the combined DataFrame to a new CSV file
relevantPatientICDs.to_csv("helperCSVs/BIDMCrelevantPatientICDs.csv", index=False)

# Create new csv that filters by ICD code
### MGB First

In [15]:
MGBrelevantPatientICDs = pd.read_csv("helperCSVs/MGBrelevantPatientICDs.csv")
BIDMCrelevantPatientICDs = pd.read_csv("helperCSVs/BIDMCrelevantPatientICDs.csv")

In [16]:
code_regex = '^(I60|430)'

MGBrelevantPatientICDs = MGBrelevantPatientICDs.dropna(subset=['ICDCD'])

allPosICDsMGB = MGBrelevantPatientICDs[MGBrelevantPatientICDs['ICDCD'].str.match(code_regex)]

allPosICDsMGB.head()



Unnamed: 0,BDSPEncounterID,EncounterLineNBR,BDSPPatientID,ShiftedContactDTS,ICDLineNBR,ICDCD,ICDDSC,DiagnosisNM,DiagnosisDSC,PrimaryDiagnosisFLG,DiagnosisChronicFLG,ShiftedUpdateDTS,DiagnosisLinkedProblemID,BDSPLastModifiedDTS,code_type
202,13721230000.0,4,121501383.0,2022-07-03 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,Subarachnoid hemorrhage,,N,N,2022-07-04 23:08:00.0000000,127299861.0,2022-09-15 07:00:26.8080000,ICD
207,13614300000.0,2,113797450.0,2021-06-22 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,Ruptured (congenital) cerebral aneurysm,,N,N,2021-06-22 18:14:00.0000000,112494133.0,2022-06-13 17:43:46.0660000,ICD
273,13688070000.0,1,113270682.0,2023-09-19 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,Subarachnoid hemorrhage,,N,N,2023-09-19 16:24:00.0000000,102014310.0,2023-08-16 07:08:36.4310000,ICD
281,13588750000.0,1,120491495.0,2023-03-07 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,Cerebral aneurysm rupture,,N,N,2023-03-07 10:47:00.0000000,91821754.0,2022-08-17 06:09:29.6700000,ICD
438,13685260000.0,1,116587087.0,2022-10-20 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,SAH (subarachnoid hemorrhage),,Y,N,2022-11-17 13:20:00.0000000,124551050.0,2022-06-13 17:47:33.6620000,ICD


In [17]:
BIDMCrelevantPatientICDs = BIDMCrelevantPatientICDs.dropna(subset=['DiagnosisCodeWithDots'])

allPosICDsBIDMC = BIDMCrelevantPatientICDs[BIDMCrelevantPatientICDs['DiagnosisCodeWithDots'].str.match(code_regex)]

allPosICDsBIDMC.head()

Unnamed: 0,BDSPPatientID,BDSPEncounterID,DiagnosisSequenceNumber,DiagnosisCode,DiagnosisPoaInd,DiagnosisCodeWithDots,ShortDescription,LongDescription,DiagnosisType,AdmissionDate,DischargeDate,BDSPLastModifiedDTS,code_type
209,151055839,134384936,1,I6012,Y,I60.12,NONTRAUM SA HEMOR LT MID,NTRM SUBARACH HEMORRHAGE FROM LEFT MIDDLE CERE...,ICD10,2018-05-18,2018-06-05,2023-07-19 19:10:23.4213520,ICD10
299,151055679,134606361,1,I608,Y,I60.8,OTH NONTRAUM SUBARACHNOI,OTHER NONTRAUMATIC SUBARACHNOID HEMORRHAGE ...,ICD10,2019-12-17,2019-12-24,2023-07-19 19:10:23.4213520,ICD10
309,151055679,134622230,1,I609,,I60.9,NONTRAUMATIC SUBARACH HE,"NONTRAUMATIC SUBARACHNOID HEMORRHAGE, UNSPECIF...",ICD10,2020-03-16,2020-03-16,2023-07-19 19:10:23.4213520,ICD10
349,151055679,562955858,1,I608,Y,I60.8,OTH NONTRAUM SUBARACHNOI,OTHER NONTRAUMATIC SUBARACHNOID HEMORRHAGE ...,ICD10,2019-12-17,2019-12-17,2023-07-19 19:10:23.4213520,ICD10
520,151055839,984158717,1,I609,,I60.9,NONTRAUMATIC SUBARACH HE,"NONTRAUMATIC SUBARACHNOID HEMORRHAGE, UNSPECIF...",ICD10,2018-09-16,2018-09-16,2023-07-19 19:10:23.4213520,ICD10


In [18]:
allPosICDsMGB.to_csv("helperCSVs/allPosICDsMGB.csv", index=False)
allPosICDsBIDMC.to_csv("helperCSVs/allPosICDsBIDMC.csv", index=False)

## Now adding feature for ICD code received 6 months +/- the NoteDate 

In [19]:
allPosICDsMGB = pd.read_csv('helperCSVs/allPosICDsMGB.csv')
allPosICDsBIDMC = pd.read_csv('helperCSVs/allPosICDsBIDMC.csv')

In [20]:
# Rename the date columns to a common name
allPosICDsMGB = allPosICDsMGB.rename(columns={'ShiftedContactDTS': 'ICDDate'})
allPosICDsBIDMC = allPosICDsBIDMC.rename(columns={'AdmissionDate': 'ICDDate'})

In [21]:
allPosICDsMGB.head()

Unnamed: 0,BDSPEncounterID,EncounterLineNBR,BDSPPatientID,ICDDate,ICDLineNBR,ICDCD,ICDDSC,DiagnosisNM,DiagnosisDSC,PrimaryDiagnosisFLG,DiagnosisChronicFLG,ShiftedUpdateDTS,DiagnosisLinkedProblemID,BDSPLastModifiedDTS,code_type
0,13721230000.0,4,121501383.0,2022-07-03 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,Subarachnoid hemorrhage,,N,N,2022-07-04 23:08:00.0000000,127299861.0,2022-09-15 07:00:26.8080000,ICD
1,13614300000.0,2,113797450.0,2021-06-22 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,Ruptured (congenital) cerebral aneurysm,,N,N,2021-06-22 18:14:00.0000000,112494133.0,2022-06-13 17:43:46.0660000,ICD
2,13688070000.0,1,113270682.0,2023-09-19 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,Subarachnoid hemorrhage,,N,N,2023-09-19 16:24:00.0000000,102014310.0,2023-08-16 07:08:36.4310000,ICD
3,13588750000.0,1,120491495.0,2023-03-07 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,Cerebral aneurysm rupture,,N,N,2023-03-07 10:47:00.0000000,91821754.0,2022-08-17 06:09:29.6700000,ICD
4,13685260000.0,1,116587087.0,2022-10-20 00:00:00.0000000,1.0,430,Subarachnoid hemorrhage,SAH (subarachnoid hemorrhage),,Y,N,2022-11-17 13:20:00.0000000,124551050.0,2022-06-13 17:47:33.6620000,ICD


In [22]:
allPosICDsBIDMC.head()

Unnamed: 0,BDSPPatientID,BDSPEncounterID,DiagnosisSequenceNumber,DiagnosisCode,DiagnosisPoaInd,DiagnosisCodeWithDots,ShortDescription,LongDescription,DiagnosisType,ICDDate,DischargeDate,BDSPLastModifiedDTS,code_type
0,151055839,134384936,1,I6012,Y,I60.12,NONTRAUM SA HEMOR LT MID,NTRM SUBARACH HEMORRHAGE FROM LEFT MIDDLE CERE...,ICD10,2018-05-18,2018-06-05,2023-07-19 19:10:23.4213520,ICD10
1,151055679,134606361,1,I608,Y,I60.8,OTH NONTRAUM SUBARACHNOI,OTHER NONTRAUMATIC SUBARACHNOID HEMORRHAGE ...,ICD10,2019-12-17,2019-12-24,2023-07-19 19:10:23.4213520,ICD10
2,151055679,134622230,1,I609,,I60.9,NONTRAUMATIC SUBARACH HE,"NONTRAUMATIC SUBARACHNOID HEMORRHAGE, UNSPECIF...",ICD10,2020-03-16,2020-03-16,2023-07-19 19:10:23.4213520,ICD10
3,151055679,562955858,1,I608,Y,I60.8,OTH NONTRAUM SUBARACHNOI,OTHER NONTRAUMATIC SUBARACHNOID HEMORRHAGE ...,ICD10,2019-12-17,2019-12-17,2023-07-19 19:10:23.4213520,ICD10
4,151055839,984158717,1,I609,,I60.9,NONTRAUMATIC SUBARACH HE,"NONTRAUMATIC SUBARACHNOID HEMORRHAGE, UNSPECIF...",ICD10,2018-09-16,2018-09-16,2023-07-19 19:10:23.4213520,ICD10


In [23]:
allPosICDsMGB['ICDDate'] = pd.to_datetime(allPosICDsMGB['ICDDate'])
allPosICDsBIDMC['ICDDate'] = pd.to_datetime(allPosICDsBIDMC['ICDDate'])

In [24]:
icd_df = pd.concat([allPosICDsMGB, allPosICDsBIDMC], ignore_index=True)

In [25]:
matrix = startMatrix

In [26]:
def parse_dates(date_str):
    for fmt in ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
        try:
            return pd.to_datetime(date_str, format=fmt)
        except ValueError:
            continue
    # If all formats fail, return NaT
    return pd.NaT

# Apply the custom function to convert dates
matrix['NoteDate'] = matrix['NoteDate'].apply(parse_dates)
matrix.head()


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,hospital,empi,annot
0,115883980,2018-04-14,Notes_13414009311_1956375582_20180414.txt,Physician ***** ***** Admit date: ****...,MGB,115883980,1
1,116483510,2022-05-12,Notes_13622415618_8165687294_20220512.txt,Physician ***** ***** Admit date: ****...,MGB,116483510,0
2,150009858,2013-08-29,Notes_1129868316_2608699887_20130829.txt,\n\nNote Date: *****/*****/*****\n\nNote Type:...,BIDMC,150009858,1
3,150009896,2022-02-04,Notes_1129868625_2609280946_20220204.txt,\n\nNote Date: *****/*****/***** Time: 0813\n...,BIDMC,150009896,1
4,116023288,2022-01-03,Notes_13528991310_6365444126_20220103.txt,***** ***** ***** Medical Psychiatry D...,MGB,116023288,0


In [27]:
matrix = matrix.drop(columns=['empi'])
matrix.head()


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,hospital,annot
0,115883980,2018-04-14,Notes_13414009311_1956375582_20180414.txt,Physician ***** ***** Admit date: ****...,MGB,1
1,116483510,2022-05-12,Notes_13622415618_8165687294_20220512.txt,Physician ***** ***** Admit date: ****...,MGB,0
2,150009858,2013-08-29,Notes_1129868316_2608699887_20130829.txt,\n\nNote Date: *****/*****/*****\n\nNote Type:...,BIDMC,1
3,150009896,2022-02-04,Notes_1129868625_2609280946_20220204.txt,\n\nNote Date: *****/*****/***** Time: 0813\n...,BIDMC,1
4,116023288,2022-01-03,Notes_13528991310_6365444126_20220103.txt,***** ***** ***** Medical Psychiatry D...,MGB,0


In [28]:
matrix['NoteDate'] = pd.to_datetime(matrix['NoteDate'])

In [29]:
def icd_received_within_6_months(row, icd_df):
    patient_id = row['BDSPPatientID']
    note_date = row['NoteDate']
    start_date = note_date - pd.DateOffset(months=6)
    end_date = note_date + pd.DateOffset(months=6)
    
    patient_icd_df = icd_df[icd_df['BDSPPatientID'] == patient_id]
    return 1 if ((patient_icd_df['ICDDate'] >= start_date) & 
            (patient_icd_df['ICDDate'] <= end_date)).any() else 0

# Apply the function to each row in the matrix
matrix['ICD'] = matrix.apply(icd_received_within_6_months, axis=1, icd_df=icd_df)

matrix.head()


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,hospital,annot,ICD
0,115883980,2018-04-14,Notes_13414009311_1956375582_20180414.txt,Physician ***** ***** Admit date: ****...,MGB,1,1
1,116483510,2022-05-12,Notes_13622415618_8165687294_20220512.txt,Physician ***** ***** Admit date: ****...,MGB,0,0
2,150009858,2013-08-29,Notes_1129868316_2608699887_20130829.txt,\n\nNote Date: *****/*****/*****\n\nNote Type:...,BIDMC,1,1
3,150009896,2022-02-04,Notes_1129868625_2609280946_20220204.txt,\n\nNote Date: *****/*****/***** Time: 0813\n...,BIDMC,1,1
4,116023288,2022-01-03,Notes_13528991310_6365444126_20220103.txt,***** ***** ***** Medical Psychiatry D...,MGB,0,0


In [30]:
matrix.to_csv('1_matrix_ICD_feature_training.csv', index=False)