In [22]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm

## GOALS
- AGE average age of all participants (and Standard Deviation)
    - per hospital
    - per ICD standing

- SEX num/percent female/male
    - per hospital
    - per icd standing

- RACE num/percent
    - per hospital
    - per icd standing


### MGB
 - BDSPPatientID
 - AGE
    - NoteDate yr - DateOfBirth yr
    - Add Age column
    - Age brackets (18-30, etc.)
 - SEX
    - SexDSC (Patient informed Gender)
        - or -
    - SexAssignedAtBirthDSC
        - or -
    - GengerIdentityDSC
 - RACE
    - PatientRace

### BIDMC

 - BDSPPatientID
 - AGE
    - NoteDate yr - DateOfBirth yr
    - Add Age column
    - Age brackets (18-30, etc.)
 - SEX
    - SexDSC (Patient informed Gender)
 - RACE
    - PatientRace

In [2]:
# BIDMC
cohort = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cleanCohorts/BIDMC_cohort_final.csv')
cohort.drop(columns=['text'], inplace=True)
print(len(cohort))
cohort.head()

1014


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle
0,151212513,2016-02-04,Notes_1131071605_435587215_20160204.txt
1,150639400,2023-06-12,Notes_1130498158_10141889500_20230612.txt
2,150057053,2020-12-17,Notes_1129915463_4302367261_20201217.txt
3,151005496,2010-06-29,Notes_1130863925_364816102_20100629.txt
4,150006448,2014-07-14,Notes_1129865503_1991829458_20140714.txt


In [3]:
patientIDs = cohort['BDSPPatientID']

reader = ThunderReader('/home/jsearle/bigDrive/Dropbox/zz_EHR_Thunderpacks/BIDMC/thunderpack_demographics_nax_1m_BIDMC')
key_length = len(list(reader.keys()))
print(key_length)
print(list(reader.keys()))

2
['patient_partition_1', 'patient_partition_2']


In [5]:
filtered_dfs = []
for i in tqdm(range(1, key_length + 1)):
    df = reader[f'patient_partition_{i}']
    filtered_df = df[df['BDSPPatientID'].isin(patientIDs)]
    filtered_dfs.append(filtered_df)

# Concatenate all filtered DataFrames into one
relevantPatientIDs = pd.concat(filtered_dfs, ignore_index=True)

relevantPatientIDs.head()

100%|██████████| 2/2 [00:01<00:00,  1.73it/s]


Unnamed: 0,BDSPPatientID,DateOfBirth,LanguageDSC,SexDSC,PatientRace,EthnicGroupDSC,MaritalStatusDSC,ReligionDSC,ExpiredDate,DateOfDeathMARegistryData,BDSPLastModifiedDTS
0,150000004,1957-03-12,ENGLISH,Male,ASIAN,CHINESE,MARRIED,UNKNOWN,,,2023-05-19 17:03:37.6866667
1,150064116,1953-05-02,ENGLISH,Female,WHITE,AMERICAN,SINGLE,CATHOLIC,2018-03-10,2018-03-10,2024-01-10 15:18:02.4866667
2,150064837,1955-10-27,ENGLISH,Male,WHITE,AMERICAN INDIAN,SINGLE,BUDDHIST,,,2023-07-06 17:38:51.9929860
3,150000090,1949-06-15,ENGLISH,Female,WHITE,AMERICAN,SINGLE,CATHOLIC,,,2023-05-19 17:03:39.2033333
4,150072715,1948-07-03,ENGLISH,Female,WHITE,GREEK,DIVORCED,UNKNOWN,,,2023-07-06 17:38:51.9929860


In [6]:
print(len(relevantPatientIDs))
print(len(patientIDs))
relevantPatientIDs.head()

1014
1014


Unnamed: 0,BDSPPatientID,DateOfBirth,LanguageDSC,SexDSC,PatientRace,EthnicGroupDSC,MaritalStatusDSC,ReligionDSC,ExpiredDate,DateOfDeathMARegistryData,BDSPLastModifiedDTS
0,150000004,1957-03-12,ENGLISH,Male,ASIAN,CHINESE,MARRIED,UNKNOWN,,,2023-05-19 17:03:37.6866667
1,150064116,1953-05-02,ENGLISH,Female,WHITE,AMERICAN,SINGLE,CATHOLIC,2018-03-10,2018-03-10,2024-01-10 15:18:02.4866667
2,150064837,1955-10-27,ENGLISH,Male,WHITE,AMERICAN INDIAN,SINGLE,BUDDHIST,,,2023-07-06 17:38:51.9929860
3,150000090,1949-06-15,ENGLISH,Female,WHITE,AMERICAN,SINGLE,CATHOLIC,,,2023-05-19 17:03:39.2033333
4,150072715,1948-07-03,ENGLISH,Female,WHITE,GREEK,DIVORCED,UNKNOWN,,,2023-07-06 17:38:51.9929860


In [8]:
relevantPatientIDs = relevantPatientIDs.drop_duplicates(subset='BDSPPatientID', keep='first')
print(len(relevantPatientIDs))

1014


In [9]:
relevantPatientIDs = relevantPatientIDs[['BDSPPatientID', 'DateOfBirth', 'PatientRace', 'SexDSC']]

relevantPatientIDs.head()

Unnamed: 0,BDSPPatientID,DateOfBirth,PatientRace,SexDSC
0,150000004,1957-03-12,ASIAN,Male
1,150064116,1953-05-02,WHITE,Female
2,150064837,1955-10-27,WHITE,Male
3,150000090,1949-06-15,WHITE,Female
4,150072715,1948-07-03,WHITE,Female


In [12]:
patients_df = pd.merge(relevantPatientIDs, cohort, on='BDSPPatientID', how='inner')
print(len(patients_df))
patients_df.head()

1014


Unnamed: 0,BDSPPatientID,DateOfBirth,PatientRace,SexDSC,NoteDate,NoteTitle
0,150000004,1957-03-12,ASIAN,Male,2012-12-05,Notes_1129858847_903347041_20121205.txt
1,150064116,1953-05-02,WHITE,Female,2017-09-05,Notes_1129922564_7789324346_20170905.txt
2,150064837,1955-10-27,WHITE,Male,2023-01-30,Notes_1129923656_226657938_20230130.txt
3,150000090,1949-06-15,WHITE,Female,2022-04-23,Notes_1129858659_7925200739_20220423.txt
4,150072715,1948-07-03,WHITE,Female,2023-06-18,Notes_1129931751_934063735_20230618.txt


In [13]:
def parse_dates(date_str):
    for fmt in ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
        try:
            return pd.to_datetime(date_str, format=fmt)
        except ValueError:
            continue
    # If all formats fail, return NaT
    return pd.NaT

# Apply the custom function to convert dates
patients_df['NoteDate'] = patients_df['NoteDate'].apply(parse_dates)
patients_df['DateOfBirth'] = patients_df['DateOfBirth'].apply(parse_dates)

# Calculate age at NoteDate
patients_df['Age'] = (patients_df['NoteDate'] - patients_df['DateOfBirth']).apply(lambda x: x.days // 365)

patients_df.head()

Unnamed: 0,BDSPPatientID,DateOfBirth,PatientRace,SexDSC,NoteDate,NoteTitle,Age
0,150000004,1957-03-12,ASIAN,Male,2012-12-05,Notes_1129858847_903347041_20121205.txt,55
1,150064116,1953-05-02,WHITE,Female,2017-09-05,Notes_1129922564_7789324346_20170905.txt,64
2,150064837,1955-10-27,WHITE,Male,2023-01-30,Notes_1129923656_226657938_20230130.txt,67
3,150000090,1949-06-15,WHITE,Female,2022-04-23,Notes_1129858659_7925200739_20220423.txt,72
4,150072715,1948-07-03,WHITE,Female,2023-06-18,Notes_1129931751_934063735_20230618.txt,75


In [17]:
# Get all unique values in the 'PatientRace' column
unique_races = patients_df['PatientRace'].unique()

# Display the unique values
print("Unique values of PatientRace:")
print(unique_races)


Unique values of PatientRace:
['ASIAN' 'WHITE' 'BLACK/AFRICAN AMERICAN' 'UNABLE TO OBTAIN'
 'UNKNOWN/NOT SPECIFIED' '' None 'OTHER RACE' 'DECLINED TO ANSWER'
 'AMERICAN INDIAN/ALASKA NATIVE'
 'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER' 'PREFER NOT TO SAY']


In [19]:
# Create a function to categorize the PatientRace values
def categorize_race(race):
    if race is None or race == '':
        return "Unavailable"
    elif "WHITE" in race:
        return "White"
    elif "BLACK/AFRICAN AMERICAN" in race:
        return "Black or African American"
    elif "ASIAN" in race:
        return "Asian"
    elif any(term in race for term in ["AMERICAN INDIAN/ALASKA NATIVE", "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER", "OTHER RACE"]):
        return "Other"
    elif any(term in race for term in ["UNABLE TO OBTAIN", "UNKNOWN/NOT SPECIFIED", "DECLINED TO ANSWER", "PREFER NOT TO SAY"]):
        return "Unavailable"
    else:
        return "Unknown"

# Apply the function to the PatientRace column to create a new categorized column
patients_df['CategorizedRace'] = patients_df['PatientRace'].apply(categorize_race)

# Drop the columns
patients_df = patients_df.drop(columns=['PatientRace', 'DateOfBirth', 'NoteDate', 'NoteTitle'])

# Display the first few rows to verify
patients_df.head()

Unnamed: 0,BDSPPatientID,SexDSC,Age,CategorizedRace
0,150000004,Male,55,Asian
1,150064116,Female,64,White
2,150064837,Male,67,White
3,150000090,Female,72,White
4,150072715,Female,75,White


In [23]:
neg_cohort = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cleanCohorts/BIDMC_neg_cohort_final_updated.csv')
pos_cohort = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cleanCohorts/BIDMC_pos_cohort_final_updated.csv')

negPatientIDs = neg_cohort['BDSPPatientID']
posPatientIDs = pos_cohort['BDSPPatientID']

negPatients = patients_df[patients_df['BDSPPatientID'].isin(negPatientIDs)]

posPatients = patients_df[patients_df['BDSPPatientID'].isin(posPatientIDs)]

print(len(negPatients))
print(len(posPatients))
print(len(negPatients)+len(posPatients)-len(patients_df))

negPatients.head()

499
515
0


Unnamed: 0,BDSPPatientID,SexDSC,Age,CategorizedRace
1,150064116,Female,64,White
6,150074292,Male,74,White
7,150000165,Female,90,Black or African American
10,150000237,Male,66,White
12,150000263,Male,56,White


In [21]:
patients_df.to_csv('BIDMC_demographics.csv', index=False)
negPatients.to_csv('BIDMC_neg_demographics.csv', index=False)
posPatients.to_csv('BIDMC_pos_demographics.csv', index=False)

In [25]:
neg_annot = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/annotationTools/cleanAnnotations/neg_annotations_final.csv')
pos_annot = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/annotationTools/cleanAnnotations/pos_annotations_final.csv')

In [26]:
def find_annotation_source(empi):
    if empi in neg_annot['empi'].values:
        return 'neg_annot'
    elif empi in pos_annot['empi'].values:
        return 'pos_annot'
    else:
        return 'Not Found'

# Step 2: Apply the function to the posPatients and negPatients DataFrames
posPatients['AnnotationSource'] = posPatients['BDSPPatientID'].apply(find_annotation_source)
negPatients['AnnotationSource'] = negPatients['BDSPPatientID'].apply(find_annotation_source)




Positive Patients with Annotation Source:
   BDSPPatientID  SexDSC  Age CategorizedRace AnnotationSource
0      150000004    Male   55           Asian        pos_annot
2      150064837    Male   67           White        pos_annot
3      150000090  Female   72           White        pos_annot
4      150072715  Female   75           White        neg_annot
5      150000137  Female   76           White        pos_annot

Negative Patients with Annotation Source:
    BDSPPatientID  SexDSC  Age            CategorizedRace AnnotationSource
1       150064116  Female   64                      White        neg_annot
6       150074292    Male   74                      White        neg_annot
7       150000165  Female   90  Black or African American        neg_annot
10      150000237    Male   66                      White        neg_annot
12      150000263    Male   56                      White        neg_annot


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  posPatients['AnnotationSource'] = posPatients['BDSPPatientID'].apply(find_annotation_source)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  negPatients['AnnotationSource'] = negPatients['BDSPPatientID'].apply(find_annotation_source)


In [29]:
# Count the number of patients from each source in posPatients
pos_source_counts = posPatients['AnnotationSource'].value_counts()

# Count the number of patients from each source in negPatients
neg_source_counts = negPatients['AnnotationSource'].value_counts()

# Display the results
print("Number of Positive Patients from Each Source:")
print(pos_source_counts)

print("\nNumber of Negative Patients from Each Source:")
print(neg_source_counts)

print(100*346/515)

Number of Positive Patients from Each Source:
AnnotationSource
pos_annot    346
neg_annot    169
Name: count, dtype: int64

Number of Negative Patients from Each Source:
AnnotationSource
neg_annot    499
Name: count, dtype: int64
67.18446601941747
