In [1]:
# imports
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import re

# Goal of this notebook:
 - Filter my current dataframe (2_discharge_summaries_df.csv) to only include patients who have had either a head CT or a brain MRI
 - Plan:
   - Read in the CPT dataframe
   - Filter for head CT and brain MRI codes
   - Extract the list of patient IDs
   - Read in 2_discharge_summaries_df.csv
   - Pull the ID column, compare with the CPT IDs, and keep the overlap
   - Filter 2_discharge_summaries_df based on the new list

In [2]:
# Open the CPT thunderpack and report how many keys it exposes.
reader = ThunderReader('/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/MGB/thunderpack_cpt_1m_MGB')
partition_keys = list(reader.keys())
key_length = len(partition_keys)
print(key_length)

735


### CPT Codes to filter for:
**70450**: 	CT Head or Brain without contrast\
**70460**:	CT Head or Brain with contrast\
**70470**: 	CT Head or Brain without contrast, followed by contrast and further sections\
**70551**: 	MRI Brain (including brain stem) without contrast\
**70552**: 	MRI Brain (including brain stem) with contrast\
**70553**: 	MRI Brain (including brain stem) without contrast, followed by with contrast

In [4]:
# Collect, partition by partition, every CPT row whose code is one of the six
# head-CT / brain-MRI codes listed above.
#
# `Series.str.match` only anchors the pattern at the START of each string, so the
# bare alternation would also keep any longer numeric code that merely begins with
# one of these values (e.g. "704501"). The `(?!\d)` lookahead rejects a code that
# continues with another digit.
code_regex = r"(?:70450|70460|70470|70551|70552|70553)(?!\d)"
dfs = []
# NOTE(review): assumes the keys are named cpt_partition_1 .. cpt_partition_<key_length>
# with no gaps and no other keys in the pack -- confirm against reader.keys().
for i in range(1, key_length + 1):
    partition = reader[f'cpt_partition_{i}']
    # CPT is cast to str so the regex can be applied regardless of stored dtype.
    is_imaging_code = partition['CPT'].astype(str).str.match(code_regex)
    dfs.append(partition[is_imaging_code])

In [6]:
# Stack the per-partition matches into a single frame, then report how many
# matching CPT rows there are and preview the first few.
df = pd.concat(dfs, ignore_index=True)
print(df.shape[0])
df.head()

2723303


Unnamed: 0,BDSPPatientID,BDSPEncounterID,CPT,StartDTS,EndDTS,BDSPLastModifiedDTS
0,116257406,13555739296,70553,2021-08-15 00:00:00.0000000,2021-08-17 00:00:00.0000000,2022-12-09 09:18:39.4433333
1,111316311,10342917280,70450,2019-01-01 00:00:00.0000000,2019-01-01 00:00:00.0000000,2023-01-09 10:56:39.1333333
2,120960333,10659988635,70553,2016-08-30 00:00:00.0000000,2016-08-30 00:00:00.0000000,2023-01-09 10:56:39.1333333
3,114483137,13426862981,70450,2019-08-18 00:00:00.0000000,2019-08-30 00:00:00.0000000,2022-12-09 08:29:42.8766667
4,112591416,10780820903,70450,2020-06-18 00:00:00.0000000,2020-06-18 00:00:00.0000000,2023-01-09 10:56:39.1333333


In [17]:
# Collapse the matching CPT rows to the distinct patient IDs, i.e. every patient
# with at least one head CT or brain MRI row, and report how many there are.
patient_id_column = df['BDSPPatientID']
cpt_pt_ids = set(patient_id_column)
print(len(cpt_pt_ids))

430691


In [11]:
# Load the previously prepared discharge-summaries table and preview it.
discharge_csv = '2_discharge_summaries_df.csv'
icd_discharge_df = pd.read_csv(discharge_csv)
icd_discharge_df.head()


Unnamed: 0,BDSPPatientID,DateICD,ICD,DateNote,NoteTextFile
0,116398048.0,2018-03-09,430,2018-03-20,Notes_13437596114_2097038735_20180320.txt
1,113080217.0,2022-10-10,430,2022-10-28,Notes_13604817952_9087332343_20221028.txt
2,113027749.0,2015-06-07,430,2015-06-20,Notes_13246458168_980626991_20150620.txt
3,118319033.0,2015-11-16,430,2015-11-06,Notes_13276057829_1044044733_20151106.txt
4,120036000.0,2020-02-28,430,2020-03-01,Notes_13476104223_4893859211_20200301.txt


In [12]:
# Print the Python type of the first patient ID in each frame; the two must agree
# before the ID sets can be compared.
for frame in (icd_discharge_df, df):
    print(type(frame['BDSPPatientID'][0]))

<class 'numpy.float64'>
<class 'str'>


In [15]:
# The discharge-summary IDs are floats while the CPT IDs are strings.
# Cast float -> int -> str (dropping the trailing ".0") so both sides compare equal.
ids_as_str = icd_discharge_df['BDSPPatientID'].astype(int).astype(str)
icd_discharge_df['BDSPPatientID'] = ids_as_str

# Distinct patient IDs present in the discharge-summary table.
icd_discharge_pt_ids = set(icd_discharge_df['BDSPPatientID'])

In [18]:
# Patients present in BOTH sets: those with a matching CPT row and those in the
# discharge-summary table.
overlap = cpt_pt_ids & icd_discharge_pt_ids
print(len(overlap))

1113


1,113 patients overlap. Let's filter the ICD/discharge-summary dataframe to include only those patients.

In [20]:
# Keep only the rows whose patient is in the overlap, then show the remaining
# row count (rows, not patients) and a preview.
in_overlap = icd_discharge_df['BDSPPatientID'].isin(overlap)
prepped_df = icd_discharge_df[in_overlap]
print(len(prepped_df))
prepped_df.head()

9491


Unnamed: 0,BDSPPatientID,DateICD,ICD,DateNote,NoteTextFile
0,116398048,2018-03-09,430,2018-03-20,Notes_13437596114_2097038735_20180320.txt
1,113080217,2022-10-10,430,2022-10-28,Notes_13604817952_9087332343_20221028.txt
2,113027749,2015-06-07,430,2015-06-20,Notes_13246458168_980626991_20150620.txt
3,118319033,2015-11-16,430,2015-11-06,Notes_13276057829_1044044733_20151106.txt
4,120036000,2020-02-28,430,2020-03-01,Notes_13476104223_4893859211_20200301.txt


In [22]:
# Write the filtered table to disk as comma-separated text: header row included,
# no index column, missing values written as "NA".
prepped_df.to_csv(
    '3_full_filtered.csv',
    sep=',',
    header=True,
    index=False,
    na_rep='NA',
)