In [27]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm

## GOALS
- AGE average age of all participants (and Standard Deviation)
    - per hospital
    - per ICD standing

- SEX num/percent female/male
    - per hospital
    - per icd standing

- RACE num/percent
    - per hospital
    - per icd standing


### MGB
 - BDSPPatientID
 - AGE
    - NoteDate yr - DateOfBirth yr
    - Add Age column
    - Age brackets (18-30, etc.)
 - SEX
    - SexDSC (Patient informed Gender)
        - or -
    - SexAssignedAtBirthDSC
        - or -
    - GengerIdentityDSC
 - RACE
    - PatientRace

### BIDMC

 - BDSPPatientID
 - AGE
    - NoteDate yr - DateOfBirth yr
    - Add Age column
    - Age brackets (18-30, etc.)
 - SEX
    - SexDSC (Patient informed Gender)
 - RACE
    - PatientRace

In [28]:
# BIDMC
cohort = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cleanCohorts/BIDMC_cohort_final.csv')
cohort.drop(columns=['text'], inplace=True)
print(len(cohort))
cohort.head()

1014


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle
0,151212513,2016-02-04,Notes_1131071605_435587215_20160204.txt
1,150639400,2023-06-12,Notes_1130498158_10141889500_20230612.txt
2,150057053,2020-12-17,Notes_1129915463_4302367261_20201217.txt
3,151005496,2010-06-29,Notes_1130863925_364816102_20100629.txt
4,150006448,2014-07-14,Notes_1129865503_1991829458_20140714.txt


In [29]:
patientIDs = cohort['BDSPPatientID']

reader = ThunderReader('/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/BIDMC/thunderpack_demographics_nax_1m_BIDMC')
key_length = len(list(reader.keys()))
print(key_length)
print(list(reader.keys()))

2
['demographics_partition_1', 'demographics_partition_2']


In [21]:
birthDates = pd.read_csv('/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/BIDMC/bidmc_patient_demographics_birth_and_death_dates.csv')
birthDates = birthDates[birthDates['BDSPPatientID'].isin(patientIDs)]
print(len(birthDates))
birthDates.head()

1014


Unnamed: 0,BDSPPatientID,DateOfBirth,ExpiredDate,DateOfDeathMARegistryData,BDSPLastModifiedDTS
125,150000004,1957-03-12,,,2023-05-19 17:03:37.6866667
4121,150064116,1953-05-02,2018-03-10,2018-03-10,2024-01-10 15:18:02.4866667
4856,150064837,1955-10-27,,,2023-07-06 17:38:51.9929860
9406,150000090,1949-06-15,,,2023-05-19 17:03:39.2033333
12990,150072715,1948-07-03,,,2023-07-06 17:38:51.9929860


In [30]:
filtered_dfs = []
for i in tqdm(range(1, key_length + 1)):
    df = reader[f'demographics_partition_{i}']
    filtered_df = df[df['BDSPPatientID'].isin(patientIDs)]
    filtered_dfs.append(filtered_df)

# Concatenate all filtered DataFrames into one
relevantPatientIDs = pd.concat(filtered_dfs, ignore_index=True)

relevantPatientIDs.head()

100%|██████████| 2/2 [00:00<00:00,  4.58it/s]


Unnamed: 0,BDSPPatientID,PatientRace,EthnicGroupDSC,MaritalStatusDSC,ReligionDSC,LanguageDSC,SexDSC,BDSPLastModifiedDTS
0,150000004,ASIAN,CHINESE,MARRIED,UNKNOWN,ENGLISH,Male,2023-05-19 17:03:37.6866667
1,150064116,WHITE,AMERICAN,SINGLE,CATHOLIC,ENGLISH,Female,2023-07-06 17:38:51.9929860
2,150064837,WHITE,AMERICAN INDIAN,SINGLE,BUDDHIST,ENGLISH,Male,2023-07-06 17:38:51.9929860
3,150000090,WHITE,AMERICAN,SINGLE,CATHOLIC,ENGLISH,Female,2023-05-19 17:03:39.2033333
4,150072715,WHITE,GREEK,DIVORCED,UNKNOWN,ENGLISH,Female,2023-07-06 17:38:51.9929860


In [32]:
print(len(relevantPatientIDs))
print(len(patientIDs))
relevantPatientIDs.head()

701
1014


Unnamed: 0,BDSPPatientID,PatientRace,EthnicGroupDSC,MaritalStatusDSC,ReligionDSC,LanguageDSC,SexDSC,BDSPLastModifiedDTS
0,150000004,ASIAN,CHINESE,MARRIED,UNKNOWN,ENGLISH,Male,2023-05-19 17:03:37.6866667
1,150064116,WHITE,AMERICAN,SINGLE,CATHOLIC,ENGLISH,Female,2023-07-06 17:38:51.9929860
2,150064837,WHITE,AMERICAN INDIAN,SINGLE,BUDDHIST,ENGLISH,Male,2023-07-06 17:38:51.9929860
3,150000090,WHITE,AMERICAN,SINGLE,CATHOLIC,ENGLISH,Female,2023-05-19 17:03:39.2033333
4,150072715,WHITE,GREEK,DIVORCED,UNKNOWN,ENGLISH,Female,2023-07-06 17:38:51.9929860


In [33]:
# Get a list of BDSPPatientIDs that are in patientIDs but not in relevantPatientIDs
missing_patient_ids = set(patientIDs) - set(relevantPatientIDs['BDSPPatientID'])

# Convert the set to a list
missing_patient_ids = list(missing_patient_ids)

print(len(missing_patient_ids))
print(missing_patient_ids)

499
[151134208, 150016002, 150009858, 150020100, 151306244, 150005766, 151306251, 150013964, 151271440, 151148567, 151212057, 150020131, 150562854, 150009896, 151173164, 150020141, 150020143, 151332919, 150014010, 151130172, 150005821, 151199815, 150018119, 150003803, 151113821, 150014049, 151361643, 151330933, 150020214, 151292027, 150014076, 151296127, 151296131, 150018180, 150007941, 151193734, 150007943, 151296134, 151300235, 150012047, 150014103, 150050970, 150020251, 150020255, 150020257, 150014122, 150028459, 150014123, 150055086, 151138480, 150018224, 150012082, 150014134, 150018230, 151212221, 151326914, 151210181, 150018245, 151038152, 151134414, 151142607, 151120092, 151181532, 151165150, 150055137, 151130345, 150018287, 151128306, 151191796, 150014208, 151163140, 151324936, 150014219, 151109911, 151195939, 151367972, 151095590, 150012205, 151349552, 150018364, 151204163, 150020421, 150038854, 151234893, 151159120, 151288149, 150640982, 151374169, 150006116, 150667627, 15134

In [13]:
relevantPatientIDs = relevantPatientIDs.drop_duplicates(subset='BDSPPatientID', keep='first')
print(len(relevantPatientIDs))

1458


In [8]:
relevantPatientIDs = relevantPatientIDs[['BDSPPatientID', 'DateOfBirth', 'PatientRace', 'SexDSC']]

relevantPatientIDs.head()

Unnamed: 0,BDSPPatientID,DateOfBirth,PatientRace,SexDSC
0,121298131,1972-09-21,White,Male
1,114146799,1950-09-22,White,Female
2,116372012,1957-03-10,White,Male
3,119649396,1986-09-14,White,Female
4,115251131,1985-06-07,White,Male


In [23]:
patients_df = pd.merge(birthDates, cohort, on='BDSPPatientID', how='inner')
print(len(patients_df))
patients_df.head()

1014


Unnamed: 0,BDSPPatientID,DateOfBirth,ExpiredDate,DateOfDeathMARegistryData,BDSPLastModifiedDTS,NoteDate,NoteTitle
0,150000004,1957-03-12,,,2023-05-19 17:03:37.6866667,2012-12-05,Notes_1129858847_903347041_20121205.txt
1,150064116,1953-05-02,2018-03-10,2018-03-10,2024-01-10 15:18:02.4866667,2017-09-05,Notes_1129922564_7789324346_20170905.txt
2,150064837,1955-10-27,,,2023-07-06 17:38:51.9929860,2023-01-30,Notes_1129923656_226657938_20230130.txt
3,150000090,1949-06-15,,,2023-05-19 17:03:39.2033333,2022-04-23,Notes_1129858659_7925200739_20220423.txt
4,150072715,1948-07-03,,,2023-07-06 17:38:51.9929860,2023-06-18,Notes_1129931751_934063735_20230618.txt


In [24]:
def parse_dates(date_str):
    for fmt in ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
        try:
            return pd.to_datetime(date_str, format=fmt)
        except ValueError:
            continue
    # If all formats fail, return NaT
    return pd.NaT

# Apply the custom function to convert dates
patients_df['NoteDate'] = patients_df['NoteDate'].apply(parse_dates)
patients_df['DateOfBirth'] = patients_df['DateOfBirth'].apply(parse_dates)

# Calculate age at NoteDate
patients_df['Age'] = (patients_df['NoteDate'] - patients_df['DateOfBirth']).apply(lambda x: x.days // 365)

patients_df.head()

Unnamed: 0,BDSPPatientID,DateOfBirth,ExpiredDate,DateOfDeathMARegistryData,BDSPLastModifiedDTS,NoteDate,NoteTitle,Age
0,150000004,1957-03-12,,,2023-05-19 17:03:37.6866667,2012-12-05,Notes_1129858847_903347041_20121205.txt,55
1,150064116,1953-05-02,2018-03-10,2018-03-10,2024-01-10 15:18:02.4866667,2017-09-05,Notes_1129922564_7789324346_20170905.txt,64
2,150064837,1955-10-27,,,2023-07-06 17:38:51.9929860,2023-01-30,Notes_1129923656_226657938_20230130.txt,67
3,150000090,1949-06-15,,,2023-05-19 17:03:39.2033333,2022-04-23,Notes_1129858659_7925200739_20220423.txt,72
4,150072715,1948-07-03,,,2023-07-06 17:38:51.9929860,2023-06-18,Notes_1129931751_934063735_20230618.txt,75


In [11]:
# Create a function to categorize the PatientRace values
def categorize_race(race):
    if "White" in race:
        return "White"
    elif "Black or African American" in race:
        return "Black or African American"
    elif "Asian" in race:
        return "Asian"
    elif "Unavailable" in race or "Declined" in race:
        return "Unavailable"
    else:
        return "Other"

# Apply the function to the PatientRace column to create a new categorized column
patients_df['CategorizedRace'] = patients_df['PatientRace'].apply(categorize_race)

# Drop the columns
patients_df = patients_df.drop(columns=['PatientRace', 'DateOfBirth', 'NoteDate', 'NoteTitle'])

patients_df.head()

In [14]:
neg_cohort = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cleanCohorts/MGB_neg_cohort_final.csv')
pos_cohort = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cleanCohorts/MGB_pos_cohort_final.csv')

negPatientIDs = neg_cohort['BDSPPatientID']
posPatientIDs = pos_cohort['BDSPPatientID']

negPatients = patients_df[patients_df['BDSPPatientID'].isin(negPatientIDs)]

posPatients = patients_df[patients_df['BDSPPatientID'].isin(posPatientIDs)]

print(len(negPatients))
print(len(posPatients))
print(len(negPatients)+len(posPatients)-len(patients_df))

negPatients.head()

1029
1053
0


Unnamed: 0,BDSPPatientID,SexDSC,Age,CategorizedRace
1,114146799,Female,67,White
4,115251131,Male,33,White
5,116981962,Female,62,White
6,121811577,Male,60,White
11,116085928,Male,83,White


In [17]:
patients_df.to_csv('MGB_demographics.csv', index=False)
negPatients.to_csv('MGB_neg_demographics.csv', index=False)
posPatients.to_csv('MGB_pos_demographics.csv', index=False)