In [1]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import re
import random

In [2]:
reader = ThunderReader('/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/BIDMC/thunderpack_icd_9_10_nax_1m_BIDMC')
key_length = len(list(reader.keys()))
print(key_length)

36


In [3]:
# filter by below icd codes
code_regex = "^(?!I60|430).*"
dfs = []
for i in tqdm(range(1, key_length + 1)):
    df = reader[f'ICD_partition_{i}']
    df = df[df['DiagnosisCodeWithDots'].astype(str).str.match(code_regex, flags = re.I)]
    dfs.append(df)

100%|██████████| 36/36 [00:48<00:00,  1.33s/it]


In [4]:
# show num of total SAH icd codes given
# show example of df
filtered_icd_df = pd.concat(dfs, axis=0, ignore_index=True)
print(len(filtered_icd_df))
filtered_icd_df.head()

35907203


Unnamed: 0,BDSPPatientID,BDSPEncounterID,DiagnosisSequenceNumber,DiagnosisCode,DiagnosisPoaInd,DiagnosisCodeWithDots,ShortDescription,LongDescription,DiagnosisType,AdmissionDate,DischargeDate,BDSPLastModifiedDTS,code_type
0,150048203,132798031,1,1748,Y,174.8,MALIGN NEOPL BREAST NEC,MALIG NEOPLASM BREAST OTH ...,ICD9,2009-11-16,2009-11-18,2023-07-17 10:28:55.1290270,ICD9
1,150048203,132798031,2,V860,E,V86.0,ESTROGEN RECEPT POS STAT,ESTROGEN RECEPTOR POS STATUS [ER+] ...,ICD9,2009-11-16,2009-11-18,2023-07-17 10:28:55.1290270,ICD9
2,150048203,132798031,3,2449,Y,244.9,HYPOTHYROIDISM NOS,HYPOTHYROIDISM UNSP ...,ICD9,2009-11-16,2009-11-18,2023-07-17 10:28:55.1290270,ICD9
3,150048203,132798031,4,2724,Y,272.4,HYPERLIPIDEMIA NEC/NOS,OTH & UNSP HYPERLIPIDEMIA ...,ICD9,2009-11-16,2009-11-18,2023-07-17 10:28:55.1290270,ICD9
4,150048203,132798031,5,27800,Y,278.00,"OBESITY, UNSPECIFIED",OBESITY UNSP ...,ICD9,2009-11-16,2009-11-18,2023-07-17 10:28:55.1290270,ICD9


In [5]:
# clean up df, keep only relevant info
keepColumns = ['BDSPPatientID', 'AdmissionDate', 'DiagnosisCodeWithDots']
clean_icd_df = filtered_icd_df[keepColumns]
clean_icd_df.head()

Unnamed: 0,BDSPPatientID,AdmissionDate,DiagnosisCodeWithDots
0,150048203,2009-11-16,174.8
1,150048203,2009-11-16,V86.0
2,150048203,2009-11-16,244.9
3,150048203,2009-11-16,272.4
4,150048203,2009-11-16,278.00


In [6]:
# rename columns
rename_dict = { 
    'DiagnosisCodeWithDots': 'ICD', 
}

clean_icd_df = clean_icd_df.rename(columns=rename_dict)

clean_icd_df.head()


Unnamed: 0,BDSPPatientID,AdmissionDate,ICD
0,150048203,2009-11-16,174.8
1,150048203,2009-11-16,V86.0
2,150048203,2009-11-16,244.9
3,150048203,2009-11-16,272.4
4,150048203,2009-11-16,278.00


In [7]:
# RANDOMIZE order of df, 25 seed for reproducability
clean_icd_df = clean_icd_df.sample(frac=1, random_state=25).reset_index(drop=True)
clean_icd_df.head()

Unnamed: 0,BDSPPatientID,AdmissionDate,ICD
0,150963606,2017-11-02,M79.1
1,150255564,2022-10-18,K21.9
2,150594739,2015-06-12,174.9
3,150770050,2012-10-10,V49.86
4,150728234,2013-10-05,692.74


In [8]:
#filter to only include unique patients
unique_patients_icd_df = clean_icd_df.drop_duplicates(subset='BDSPPatientID', keep='first')
print(len(unique_patients_icd_df))

476652


In [10]:
# Grab the first 50000 (already randomized) to make the df easier to work with
icd_neg_df = unique_patients_icd_df.head(50000)
print(len(icd_neg_df))

50000


In [11]:
# save df as csv
icd_neg_df.to_csv('1_icd_neg_df.csv', index=False)