In [1]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import re
import gc

# Goal of this notebook:
 - Filter the current dataframe (pre_cpt_filtering.csv) to include only patients who have had either a head CT or a brain MRI
 - Plan:
   - Read in the CPT dataframe
   - Filter for head CT and brain MRI codes
   - Extract the list of patient IDs
   - Read in pre_cpt_filtering.csv
   - Pull its ID column, compare with the CPT patient IDs, and keep the overlap
   - Filter pre_cpt_filtering based on the overlapping IDs


In [2]:
# Open the CPT thunderpack and count the keys it exposes; the count is
# printed here and reused later as the upper bound of the partition loop.
cpt_store_path = '/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/MGB/thunderpack_cpt_1m_MGB'
reader = ThunderReader(cpt_store_path)
key_length = sum(1 for _key in reader.keys())
print(key_length)

735


### CPT Codes to filter for:
**70450**: CT Head or Brain without contrast\
**70460**: CT Head or Brain with contrast\
**70470**: CT Head or Brain without contrast, followed by contrast and further sections\
**70551**: MRI Brain (including brain stem) without contrast\
**70552**: MRI Brain (including brain stem) with contrast\
**70553**: MRI Brain (including brain stem) without contrast, followed by contrast and further sequences

In [3]:
# Non-capturing alternation of the six CPT codes of interest. The adjacent
# string literals are concatenated by Python into a single pattern.
code_regex = (
    "(?:"
    "70450|70460|70470"    # head CT codes
    "|70551|70552|70553"   # brain MRI codes
    ")"
)

In [4]:
# Visit every partition (looked up as cpt_partition_1 .. cpt_partition_<key_length>)
# and collect only the rows whose CPT value matches one of the imaging codes.
dfs = []
partition_numbers = range(1, key_length + 1)
for i in tqdm(partition_numbers):
    df = reader[f'cpt_partition_{i}']
    # Compare as text; Series.str.match anchors at the start of the string only.
    cpt_as_text = df['CPT'].astype(str)
    # Rebind df to the filtered rows so the full partition is not kept around.
    df = df.loc[cpt_as_text.str.match(code_regex)]
    dfs.append(df)

100%|██████████| 735/735 [26:19<00:00,  2.15s/it]


In [5]:
# Stack the per-partition matches row-wise into one frame with a fresh
# 0..n-1 index, then report how many matching CPT rows there are.
df = pd.concat(dfs, ignore_index=True)
print(df.shape[0])

2723303


In [6]:
# Preview the first five filtered CPT rows.
df.head(n=5)

Unnamed: 0,BDSPPatientID,BDSPEncounterID,CPT,StartDTS,EndDTS,BDSPLastModifiedDTS
0,116257406,13555739296,70553,2021-08-15 00:00:00.0000000,2021-08-17 00:00:00.0000000,2022-12-09 09:18:39.4433333
1,111316311,10342917280,70450,2019-01-01 00:00:00.0000000,2019-01-01 00:00:00.0000000,2023-01-09 10:56:39.1333333
2,120960333,10659988635,70553,2016-08-30 00:00:00.0000000,2016-08-30 00:00:00.0000000,2023-01-09 10:56:39.1333333
3,114483137,13426862981,70450,2019-08-18 00:00:00.0000000,2019-08-30 00:00:00.0000000,2022-12-09 08:29:42.8766667
4,112591416,10780820903,70450,2020-06-18 00:00:00.0000000,2020-06-18 00:00:00.0000000,2023-01-09 10:56:39.1333333


In [7]:
# Collapse the matching CPT rows down to the distinct patient IDs.
cpt_patient_ids = df['BDSPPatientID']
unique_pt_ids = set(cpt_patient_ids)
n_imaging_patients = len(unique_pt_ids)
print(n_imaging_patients)

430691


The variable `unique_pt_ids` is the set of IDs of all patients who have had either a head CT or a brain MRI.\
Next steps:
 - Read in the pre_cpt_filtering.csv dataframe
 - Pull the unique patient IDs from that dataframe
 - Find the overlap

In [8]:
# Load the dataframe that is to be restricted to imaging patients and
# preview its first five rows.
pre_cpt_path = 'pre_cpt_filtering.csv'
pre_cpt_filtering = pd.read_csv(pre_cpt_path)
pre_cpt_filtering.head(n=5)

Unnamed: 0,BDSPPatientID,ICD_Date,NoteDate,ICD,CodeType,NoteTitle
0,121173304.0,2021-03-07,2021-02-26 00:00:00,205.00,ICD,Notes_13542494271_5936347980_20210226.txt
1,112711779.0,2022-04-01,2022-04-05 00:00:00,793.11,ICD,Notes_13500753060_5772195018_20220405.txt
2,117553376.0,2017-09-19,2017-08-24 00:00:00,368.8,ICD,Notes_13336483339_1359110280_20170824.txt
3,111813998.0,2019-01-22,2019-02-07 00:00:00,V49.89,ICD,Notes_13416480748_2022890910_20190207.txt
4,116523553.0,2017-04-19,2017-04-25 00:00:00,648.83,ICD,Notes_13291229443_1699851441_20170425.txt


In [9]:
# Compare the Python type of the patient ID in each frame (row label 0) to
# see whether the two ID columns can be matched against each other directly.
for frame in (pre_cpt_filtering, df):
    print(type(frame['BDSPPatientID'].loc[0]))

<class 'numpy.float64'>
<class 'str'>


In [10]:
# Convert the IDs float -> int -> str so they drop the trailing ".0" and
# become strings, like the patient IDs in the CPT frame.
patient_id_text = pre_cpt_filtering['BDSPPatientID'].astype(int).astype(str)
pre_cpt_filtering['BDSPPatientID'] = patient_id_text

In [11]:
# Preview the first five rows to confirm the patient IDs no longer show ".0".
pre_cpt_filtering.head(n=5)

Unnamed: 0,BDSPPatientID,ICD_Date,NoteDate,ICD,CodeType,NoteTitle
0,121173304,2021-03-07,2021-02-26 00:00:00,205.00,ICD,Notes_13542494271_5936347980_20210226.txt
1,112711779,2022-04-01,2022-04-05 00:00:00,793.11,ICD,Notes_13500753060_5772195018_20220405.txt
2,117553376,2017-09-19,2017-08-24 00:00:00,368.8,ICD,Notes_13336483339_1359110280_20170824.txt
3,111813998,2019-01-22,2019-02-07 00:00:00,V49.89,ICD,Notes_13416480748_2022890910_20190207.txt
4,116523553,2017-04-19,2017-04-25 00:00:00,648.83,ICD,Notes_13291229443_1699851441_20170425.txt


In [12]:
# Distinct patient IDs present in the pre-filter dataframe.
pre_cpt_patient_ids = pre_cpt_filtering.loc[:, 'BDSPPatientID']
unique_pre_cpt = set(pre_cpt_patient_ids)

In [13]:
# Patient IDs that appear in both the imaging CPT set and the pre-filter set.
overlap = unique_pt_ids & unique_pre_cpt
n_overlap = len(overlap)
print(n_overlap)

23899


There are 23,899 patients present in both sets. Let's filter the pre_cpt file to include only those patients.

In [15]:
# Keep only the rows whose patient ID is in the overlap, then report the
# number of rows that remain.
in_overlap = pre_cpt_filtering['BDSPPatientID'].isin(overlap)
prepped_df = pre_cpt_filtering.loc[in_overlap]
print(prepped_df.shape[0])

757913


In [16]:
# Preview the first five rows of the filtered dataframe.
prepped_df.head(n=5)

Unnamed: 0,BDSPPatientID,ICD_Date,NoteDate,ICD,CodeType,NoteTitle
0,121173304,2021-03-07,2021-02-26 00:00:00,205.00,ICD,Notes_13542494271_5936347980_20210226.txt
1,112711779,2022-04-01,2022-04-05 00:00:00,793.11,ICD,Notes_13500753060_5772195018_20220405.txt
2,117553376,2017-09-19,2017-08-24 00:00:00,368.8,ICD,Notes_13336483339_1359110280_20170824.txt
3,111813998,2019-01-22,2019-02-07 00:00:00,V49.89,ICD,Notes_13416480748_2022890910_20190207.txt
4,116523553,2017-04-19,2017-04-25 00:00:00,648.83,ICD,Notes_13291229443_1699851441_20170425.txt


# Goal Accomplished
Summary:
 - Read in the CPT dataframe, filtering for head CTs and brain MRIs
 - Pulled the unique list of patient IDs
 - Read in the pre_cpt_filtering csv file
 - Pulled the unique list of patient IDs from this dataframe
 - Found the overlap
 - Filtered the csv dataframe to include only this overlap

Next:
 - Save it to a new csv
 - Create cohorts based on ICD code +/-

In [17]:
# Write the filtered rows to disk: comma-separated, with a header row,
# without the index, and with missing values written as 'NA'.
prepped_df.to_csv(
    'prepped_df.csv',
    sep=',',
    header=True,
    index=False,
    na_rep='NA',
)

In [18]:
# Read the exported file back and preview it to check that the save worked.
exported_path = 'prepped_df.csv'
test = pd.read_csv(exported_path)
test.head(n=5)

Unnamed: 0,BDSPPatientID,ICD_Date,NoteDate,ICD,CodeType,NoteTitle
0,121173304,2021-03-07,2021-02-26 00:00:00,205.00,ICD,Notes_13542494271_5936347980_20210226.txt
1,112711779,2022-04-01,2022-04-05 00:00:00,793.11,ICD,Notes_13500753060_5772195018_20220405.txt
2,117553376,2017-09-19,2017-08-24 00:00:00,368.8,ICD,Notes_13336483339_1359110280_20170824.txt
3,111813998,2019-01-22,2019-02-07 00:00:00,V49.89,ICD,Notes_13416480748_2022890910_20190207.txt
4,116523553,2017-04-19,2017-04-25 00:00:00,648.83,ICD,Notes_13291229443_1699851441_20170425.txt
