In [10]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import re
import gc

# Goal of this notebook:
 - Filter notes by ICD code indicating lack of SAH:
   - ICD -
 - Plan to do so:
   - For ICD +, filter using regex to include only ICD codes that I have listed
   - For ICD -, do the opposite
   - Remember that there are more rows than unique patients.  ie: some patients will have multiple ICD codes all linked to the same note
   - To fix that - try to condense rows into one per note by making the ICD column contain a list of ICD's (not tidy but I feel like it works here)

Start by reading in prepped_df.csv

In [11]:
prepped_df = pd.read_csv('prepped_df.csv')
prepped_df.head()

Unnamed: 0,BDSPPatientID,ICD_Date,NoteDate,ICD,CodeType,NoteTitle
0,121173304,2021-03-07,2021-02-26 00:00:00,205.00,ICD,Notes_13542494271_5936347980_20210226.txt
1,112711779,2022-04-01,2022-04-05 00:00:00,793.11,ICD,Notes_13500753060_5772195018_20220405.txt
2,117553376,2017-09-19,2017-08-24 00:00:00,368.8,ICD,Notes_13336483339_1359110280_20170824.txt
3,111813998,2019-01-22,2019-02-07 00:00:00,V49.89,ICD,Notes_13416480748_2022890910_20190207.txt
4,116523553,2017-04-19,2017-04-25 00:00:00,648.83,ICD,Notes_13291229443_1699851441_20170425.txt


Let's repeat the process but matching everything except TBI codes to get our ICD- cohort

In [12]:
code_regex = "^(?!I60|430).*"
icd_neg = prepped_df[prepped_df['ICD'].astype(str).str.match(code_regex, flags=re.I)]
icd_neg.head()

Unnamed: 0,BDSPPatientID,ICD_Date,NoteDate,ICD,CodeType,NoteTitle
0,121173304,2021-03-07,2021-02-26 00:00:00,205.00,ICD,Notes_13542494271_5936347980_20210226.txt
1,112711779,2022-04-01,2022-04-05 00:00:00,793.11,ICD,Notes_13500753060_5772195018_20220405.txt
2,117553376,2017-09-19,2017-08-24 00:00:00,368.8,ICD,Notes_13336483339_1359110280_20170824.txt
3,111813998,2019-01-22,2019-02-07 00:00:00,V49.89,ICD,Notes_13416480748_2022890910_20190207.txt
4,116523553,2017-04-19,2017-04-25 00:00:00,648.83,ICD,Notes_13291229443_1699851441_20170425.txt


In [13]:
icd_neg.loc[:, 'ICD'] = icd_neg['ICD'].astype(str)
grouped = icd_neg.groupby('NoteTitle')['ICD'].apply(lambda x: ', '.join(x)).reset_index()
icd_neg_unique = icd_neg.drop_duplicates(subset=['NoteTitle']).drop(columns=['ICD'])
icd_neg_unique = pd.merge(icd_neg_unique, grouped, on='NoteTitle', how='left')

In [14]:
print(len(icd_neg_unique))

23891


Finally, let's choose our sample of 1087.  Same size as ICD +

In [15]:
cohort_neg = icd_neg_unique.sample(n=1087, replace=False, random_state=1)
print(len(cohort_neg))

1087


In [16]:
cohort_neg.head()

Unnamed: 0,BDSPPatientID,ICD_Date,NoteDate,CodeType,NoteTitle,ICD
16837,118374115,2018-10-28,2018-11-20 00:00:00,ICD,Notes_13333491432_2130196704_20181120.txt,"799.89, G93.40, Z74.09, V49.89, 437.9, E85.4, ..."
14066,116931824,2022-12-21,2022-12-29 00:00:00,ICD,Notes_13563868206_7830407523_20221229.txt,"Z01.818, R31.29, E78.00, 599.72, 780.39, 569.3..."
4406,118587891,2021-06-25,2021-05-30 00:00:00,ICD,Notes_13532685798_4823397982_20210530.txt,"332.0, 530.81, G45.9, 585.3, I95.1, 799.3, Z71..."
4120,114557355,2022-08-01,2022-07-22 00:00:00,ICD,Notes_13620259495_8489742841_20220722.txt,"V76.44, 434.91, I63.512, 434.91, J02.9, G45.9,..."
13390,114329029,2018-04-15,2018-05-15 00:00:00,ICD,Notes_13356996149_1753656538_20180515.txt,"780.39, 780.39, R07.89, S06.5XAA, R56.9, 780.6..."


I guess the ICD code really only matters for the plus cohort in case we wanted to look into the spread of certain TBI codes

In [17]:
cohort_neg = cohort_neg.drop(columns=['ICD_Date', 'CodeType', 'ICD'])
cohort_neg.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle
16837,118374115,2018-11-20 00:00:00,Notes_13333491432_2130196704_20181120.txt
14066,116931824,2022-12-29 00:00:00,Notes_13563868206_7830407523_20221229.txt
4406,118587891,2021-05-30 00:00:00,Notes_13532685798_4823397982_20210530.txt
4120,114557355,2022-07-22 00:00:00,Notes_13620259495_8489742841_20220722.txt
13390,114329029,2018-05-15 00:00:00,Notes_13356996149_1753656538_20180515.txt


In [18]:
cohort_neg.to_csv('cohort_neg.csv', index=False, header=True, sep=',', na_rep='NA')

In [19]:
test = pd.read_csv('cohort_neg.csv')
test.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle
0,118374115,2018-11-20 00:00:00,Notes_13333491432_2130196704_20181120.txt
1,116931824,2022-12-29 00:00:00,Notes_13563868206_7830407523_20221229.txt
2,118587891,2021-05-30 00:00:00,Notes_13532685798_4823397982_20210530.txt
3,114557355,2022-07-22 00:00:00,Notes_13620259495_8489742841_20220722.txt
4,114329029,2018-05-15 00:00:00,Notes_13356996149_1753656538_20180515.txt


# Goal Accomplished
Summary:
 - Used regex to search for ICD code specifics
 - Got the unique list of note titles
 - Created cohorts of 1000 (randomly)
 - Kept important columns and exported to CSV
 
Next:
 - Bram said I need to get the whole note text into a new column somehow