In [1]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import re

## GOALS
- AGE average age of all participants (and Standard Deviation)
    - per hospital
    - per ICD standing

- SEX num/percent female/male
    - per hospital
    - per ICD standing

- RACE num/percent
    - per hospital
    - per ICD standing


### MGB
 - BDSPPatientID
 - AGE
    - NoteDate yr - DateOfBirth yr
    - Add Age column
    - Age brackets (18-30, etc.)
 - SEX
    - SexDSC (Patient informed Gender)
        - or -
    - SexAssignedAtBirthDSC
        - or -
    - GenderIdentityDSC
 - RACE
    - PatientRace

### BIDMC

 - BDSPPatientID
 - AGE
    - NoteDate yr - DateOfBirth yr
    - Add Age column
    - Age brackets (18-30, etc.)
 - SEX
    - SexDSC (Patient informed Gender)
 - RACE
    - PatientRace

In [2]:
# MGB is processed first: read the final cohort file and leave out the
# 'text' column, which is not needed for the demographics summary.
cohort = pd.read_csv(
    '/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cleanCohorts/MGB_cohort_final.csv'
).drop(columns=['text'])
print(len(cohort))
cohort.head()

2082


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle
0,118374115,2018-11-20 00:00:00,Notes_13333491432_2130196704_20181120.txt
1,116931824,2022-12-29 00:00:00,Notes_13563868206_7830407523_20221229.txt
2,118587891,2021-05-30 00:00:00,Notes_13532685798_4823397982_20210530.txt
3,114557355,2022-07-22 00:00:00,Notes_13620259495_8489742841_20220722.txt
4,114329029,2018-05-15 00:00:00,Notes_13356996149_1753656538_20180515.txt


In [3]:
# Patient identifiers of the cohort loaded above
patientIDs = cohort['BDSPPatientID']

# Open the MGB demographics thunderpack
reader = ThunderReader('/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/MGB/thunderpack_demographics_MGB')

# List the keys once and reuse the listing for both the count and the printout
reader_keys = list(reader.keys())
key_length = len(reader_keys)
print(key_length)
print(reader_keys)

11
['demographics_partition_1', 'demographics_partition_10', 'demographics_partition_2', 'demographics_partition_3', 'demographics_partition_4', 'demographics_partition_5', 'demographics_partition_6', 'demographics_partition_7', 'demographics_partition_8', 'demographics_partition_9', 'metadata']


In [4]:
# Fetch the 'metadata' entry of the thunderpack and show its contents
metadata = reader['metadata']
print(metadata)

['BDSPPatientID', 'DateOfBirth', 'DateOfDeath', 'DateOfDeathMARegistryData', 'PatientRace', 'EthnicGroupDSC', 'MaritalStatusDSC', 'ReligionDSC', 'LanguageDSC', 'VeteranStatusDSC', 'SexDSC', 'PrimaryCauseOfDeathDSC', 'UNOSPrimaryCauseOfDeathTXT', 'FirstContributoryCauseOfDeathDSC', 'UNOSContributoryCauseOfDeath01TXT', 'SecondContributoryCauseOfDeathDSC', 'UNOSContributoryCauseOfDeath02TXT', 'EducationLevelDSC', 'GenderIdentityDSC', 'SexAssignedAtBirthDSC', 'BDSPLastModifiedDTS']


In [5]:
# Pick the partition keys by name rather than assuming that every key except
# one is a partition numbered 1..N: an extra (or missing) non-partition key
# would otherwise make the loop skip a partition or raise a KeyError.
# Sorting by the numeric suffix keeps the partitions in 1, 2, ..., 10 order,
# so the row order of the concatenated frame is stable.
partition_keys = sorted(
    (key for key in reader.keys() if re.fullmatch(r'demographics_partition_\d+', key)),
    key=lambda key: int(key.rsplit('_', 1)[1]),
)

# Keep, from each partition, only the rows belonging to cohort patients
filtered_dfs = []
for partition_key in tqdm(partition_keys):
    df = reader[partition_key]
    filtered_df = df[df['BDSPPatientID'].isin(patientIDs)]
    filtered_dfs.append(filtered_df)

# Concatenate all filtered DataFrames into one
relevantPatientIDs = pd.concat(filtered_dfs, ignore_index=True)

relevantPatientIDs.head()

100%|██████████| 10/10 [00:09<00:00,  1.10it/s]


Unnamed: 0,BDSPPatientID,DateOfBirth,DateOfDeath,DateOfDeathMARegistryData,PatientRace,EthnicGroupDSC,MaritalStatusDSC,ReligionDSC,LanguageDSC,VeteranStatusDSC,...,PrimaryCauseOfDeathDSC,UNOSPrimaryCauseOfDeathTXT,FirstContributoryCauseOfDeathDSC,UNOSContributoryCauseOfDeath01TXT,SecondContributoryCauseOfDeathDSC,UNOSContributoryCauseOfDeath02TXT,EducationLevelDSC,GenderIdentityDSC,SexAssignedAtBirthDSC,BDSPLastModifiedDTS
0,121298131,1972-09-21,2019-07-12,2019-07-12,White,Not Hispanic,Married/Civil Union,Jewish,English,"No, Never Served or Is Currently Active",...,,,,,,,Graduated - High School,Male,Male,2022-11-29 10:39:17.3066667
1,114146799,1950-09-22,,2018-04-27,White,Not Hispanic,Divorced,Roman Catholic,English,"No, Never Served or Is Currently Active",...,,,,,,,Graduated - High School,,,2022-11-29 06:56:59.7100000
2,116372012,1957-03-10,,,White,Not Hispanic,Married/Civil Union,No Preference,English,"No, Never Served or Is Currently Active",...,,,,,,,Graduated - College,Male,Male,2022-07-17 17:57:14.1830000
3,119649396,1986-09-14,,,White,Not Hispanic,Married/Civil Union,Christian,English,"No, Never Served or Is Currently Active",...,,,,,,,Graduated - College,Female,Female,2022-07-17 17:57:14.1830000
4,115251131,1985-06-07,,,White,Not Hispanic,Single,Roman Catholic,English,"No, Never Served or Is Currently Active",...,,,,,,,Graduated - High School,Male,Male,2022-07-17 17:57:14.1830000


In [6]:
# Number of demographics rows found vs. number of cohort patient IDs
print(len(relevantPatientIDs), len(patientIDs), sep='\n')

2132
2082


In [7]:
# Keep only the first demographics row encountered for each patient ID
is_repeat = relevantPatientIDs.duplicated(subset='BDSPPatientID', keep='first')
relevantPatientIDs = relevantPatientIDs[~is_repeat]
print(len(relevantPatientIDs))

2082


In [8]:
# Restrict the demographics table to the fields needed for age, race and sex
demographic_columns = ['BDSPPatientID', 'DateOfBirth', 'PatientRace', 'SexDSC']
relevantPatientIDs = relevantPatientIDs.loc[:, demographic_columns]

relevantPatientIDs.head()

Unnamed: 0,BDSPPatientID,DateOfBirth,PatientRace,SexDSC
0,121298131,1972-09-21,White,Male
1,114146799,1950-09-22,White,Female
2,116372012,1957-03-10,White,Male
3,119649396,1986-09-14,White,Female
4,115251131,1985-06-07,White,Male


In [9]:
# Attach the cohort's note information to each patient's demographics row
patients_df = relevantPatientIDs.merge(cohort, how='inner', on='BDSPPatientID')
patients_df.head()

Unnamed: 0,BDSPPatientID,DateOfBirth,PatientRace,SexDSC,NoteDate,NoteTitle
0,121298131,1972-09-21,White,Male,2017-07-05,Notes_13290902125_1440517492_20170705.txt
1,114146799,1950-09-22,White,Female,2017-12-23 00:00:00,Notes_13348111085_1696677928_20171223.txt
2,116372012,1957-03-10,White,Male,2022-01-21,Notes_13547570969_7274296664_20220121.txt
3,119649396,1986-09-14,White,Female,2019-11-30,Notes_13479556208_3953465452_20191130.txt
4,115251131,1985-06-07,White,Male,2019-02-15 00:00:00,Notes_13328354473_2229396795_20190215.txt


In [10]:
def parse_dates(date_str):
    """Parse a date string that may be in one of several known layouts.

    The layouts are tried in order (timestamp with fractional seconds,
    timestamp with seconds, plain date); the result of the first one that
    parses without a ValueError is returned.

    Parameters
    ----------
    date_str : value handed to ``pd.to_datetime``

    Returns
    -------
    The parsed value from ``pd.to_datetime``, or ``pd.NaT`` when every
    layout raises ``ValueError``.
    """
    candidate_layouts = ['%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d']
    # Default result when none of the layouts match
    parsed = pd.NaT
    for layout in candidate_layouts:
        try:
            parsed = pd.to_datetime(date_str, format=layout)
        except ValueError:
            continue
        # First successful parse wins
        break
    return parsed

# Apply the custom function to convert dates
patients_df['NoteDate'] = patients_df['NoteDate'].apply(parse_dates)
patients_df['DateOfBirth'] = patients_df['DateOfBirth'].apply(parse_dates)

# Calculate age at NoteDate in completed calendar years.
# Dividing the day difference by 365 ignores leap days, which over-counts the
# age by one year for notes written shortly before a birthday (the error
# window grows by roughly one day for every four years of age). Instead, take
# the year difference and subtract one when the birthday has not yet been
# reached in the note's year.
note_dates = pd.to_datetime(patients_df['NoteDate'])
birth_dates = pd.to_datetime(patients_df['DateOfBirth'])
birthday_reached = (note_dates.dt.month > birth_dates.dt.month) | (
    (note_dates.dt.month == birth_dates.dt.month)
    & (note_dates.dt.day >= birth_dates.dt.day)
)
# Rows with a missing date yield NaN here, because the year difference is NaN
patients_df['Age'] = (
    note_dates.dt.year - birth_dates.dt.year - (~birthday_reached).astype(int)
)

patients_df.head()

Unnamed: 0,BDSPPatientID,DateOfBirth,PatientRace,SexDSC,NoteDate,NoteTitle,Age
0,121298131,1972-09-21,White,Male,2017-07-05,Notes_13290902125_1440517492_20170705.txt,44
1,114146799,1950-09-22,White,Female,2017-12-23,Notes_13348111085_1696677928_20171223.txt,67
2,116372012,1957-03-10,White,Male,2022-01-21,Notes_13547570969_7274296664_20220121.txt,64
3,119649396,1986-09-14,White,Female,2019-11-30,Notes_13479556208_3953465452_20191130.txt,33
4,115251131,1985-06-07,White,Male,2019-02-15,Notes_13328354473_2229396795_20190215.txt,33


In [11]:
# Create a function to categorize the PatientRace values
def categorize_race(race):
    """Collapse a PatientRace value into one of five coarse categories.

    The value is checked for known substrings in a fixed order and the first
    match decides the category, so a value containing several race names is
    assigned to the earliest rule that matches.

    Parameters
    ----------
    race : str, or a missing value (None / NaN)

    Returns
    -------
    str
        One of "White", "Black or African American", "Asian",
        "Unavailable" or "Other". Non-string input (e.g. a missing value)
        is reported as "Unavailable".
    """
    # Missing values are not strings; a substring test on them would raise
    # TypeError and abort the whole column-wise apply.
    if not isinstance(race, str):
        return "Unavailable"

    # (substrings to look for, resulting category), checked top to bottom
    ordered_rules = (
        (("White",), "White"),
        (("Black or African American",), "Black or African American"),
        (("Asian",), "Asian"),
        (("Unavailable", "Declined"), "Unavailable"),
    )
    for needles, category in ordered_rules:
        if any(needle in race for needle in needles):
            return category
    return "Other"

# Derive the coarse race category for every row and store it as a new column
race_categories = patients_df['PatientRace'].apply(categorize_race)
patients_df['CategorizedRace'] = race_categories

# Remove the columns that are no longer needed once Age and CategorizedRace exist
columns_to_remove = ['PatientRace', 'DateOfBirth', 'NoteDate', 'NoteTitle']
patients_df = patients_df.drop(columns=columns_to_remove)

patients_df.head()

In [14]:
# Load the negative and positive MGB sub-cohort files
neg_cohort = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cleanCohorts/MGB_neg_cohort_final.csv')
pos_cohort = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cleanCohorts/MGB_pos_cohort_final.csv')

negPatientIDs = neg_cohort['BDSPPatientID']
posPatientIDs = pos_cohort['BDSPPatientID']

# Split the demographics table by membership in each sub-cohort
in_neg_cohort = patients_df['BDSPPatientID'].isin(negPatientIDs)
in_pos_cohort = patients_df['BDSPPatientID'].isin(posPatientIDs)
negPatients = patients_df.loc[in_neg_cohort]
posPatients = patients_df.loc[in_pos_cohort]

neg_count, pos_count = len(negPatients), len(posPatients)
print(neg_count)
print(pos_count)
# Prints 0 when the two subset sizes add up to the size of patients_df
print(neg_count + pos_count - len(patients_df))

negPatients.head()

1029
1053
0


Unnamed: 0,BDSPPatientID,SexDSC,Age,CategorizedRace
1,114146799,Female,67,White
4,115251131,Male,33,White
5,116981962,Female,62,White
6,121811577,Male,60,White
11,116085928,Male,83,White


In [17]:
# Write the full table and the negative/positive subsets to CSV (index omitted)
for csv_name, frame in (
    ('MGB_demographics.csv', patients_df),
    ('MGB_neg_demographics.csv', negPatients),
    ('MGB_pos_demographics.csv', posPatients),
):
    frame.to_csv(csv_name, index=False)