# EDA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('data/merged_5000_patient_radio_disc.csv')
df.head()

Unnamed: 0,subject_id,BMI,BMI (kg/m2),Blood Pressure,Blood Pressure Lying,Blood Pressure Sitting,Blood Pressure Standing,Blood Pressure Standing (1 min),Blood Pressure Standing (3 mins),Height,...,ELECTIVE,EU OBSERVATION,EW EMER.,OBSERVATION ADMIT,SURGICAL SAME DAY ADMISSION,URGENT,marital_status,race,radio_bioBERT_embedding,disc_bioBERT_embedding
0,10207476,20.0,25.0,120/62,119/58,115/63,,114/51,,,...,1.0,5.0,16.0,4.0,2.0,2.0,MARRIED,WHITE - OTHER EUROPEAN,[ 1.16570882e-01 -1.30169436e-01 1.55872822e-...,[ 1.38808012e-01 -1.36818409e-01 -2.57112831e-...
1,14130048,42.0,39.9,160/96,150/90,150/90,,120/73,150/90,64.0,...,1.0,5.0,9.0,7.0,,2.0,DIVORCED,WHITE,[ 7.90389553e-02 -5.54898381e-02 1.47240043e-...,[ 2.02867508e-01 -1.90792486e-01 4.20148969e-...
2,17751804,25.0,20.7,102/60,118/70,110/72,,101/68,99/72,,...,4.0,1.0,4.0,3.0,1.0,,SINGLE,WHITE,,
3,18632748,39.0,44.6,122/70,142/82,109/49,150/84,111/51,,,...,2.0,10.0,13.0,2.0,2.0,3.0,SINGLE,ASIAN - CHINESE,[ 3.53200674e-01 -2.37847850e-01 3.51036079e-...,[ 1.39690787e-01 -1.72594145e-01 -3.51392962e-...
4,18369403,,33.8,160/92,108/74,93/59,,88/64,,,...,1.0,5.0,9.0,9.0,,1.0,DIVORCED,BLACK/AFRICAN AMERICAN,[ 8.85203481e-02 -1.85742557e-01 2.28411064e-...,[ 1.61080152e-01 -2.11643443e-01 2.04216745e-...


In [3]:
df.isna().sum()

subject_id                             0
BMI                                 2554
BMI (kg/m2)                           24
Blood Pressure                        13
Blood Pressure Lying                3658
Blood Pressure Sitting              3516
Blood Pressure Standing             4813
Blood Pressure Standing (1 min)     3647
Blood Pressure Standing (3 mins)    4466
Height                              4541
Height (Inches)                       31
Weight                              3050
Weight (Lbs)                          21
eGFR                                4939
gender                                 0
anchor_age                             0
anchor_year                            0
anchor_year_group                      0
dod                                 2708
unique_drugs                        4611
diagnoses_icd_code_version             0
procedures_icd_code_version           65
AMBULATORY OBSERVATION              4046
DIRECT EMER.                        2791
DIRECT OBSERVATI

In [4]:
df['procedures_icd_code_version'].value_counts()

procedures_icd_code_version
5A2204Z_10                                                                                                                                                                                                                                                                                                                                                                                                                                                                       6
0SRD0J9_10                                                                                                                                                                                                                                                                                                                                                                                                                                                                       5
8154_9                                

In [5]:
# Extraction of unique ICD codes in dataset
icd_codes = df['procedures_icd_code_version'].apply(lambda x: list(set(str(x).split(','))))
icd_codes = icd_codes.explode()
icd_codes.value_counts()

procedures_icd_code_version
02HV33Z_10    1309
3897_9         714
3893_9         691
0DJ08ZZ_10     617
3E0G76Z_10     608
              ... 
09PK7YZ_10       1
0TC74ZZ_10       1
0D738DZ_10       1
02JA0ZZ_10       1
0SRR019_10       1
Name: count, Length: 6709, dtype: int64

In [6]:
print(icd_codes.unique())

['8801_9' '4652_9' '8938_9' ... '03WY03Z_10' '09C4XZZ_10' '0SRR019_10']


## Process Data based off All Hat Clinical Trial Qualifications

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 35 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   subject_id                        5000 non-null   int64  
 1   BMI                               2446 non-null   float64
 2   BMI (kg/m2)                       4976 non-null   float64
 3   Blood Pressure                    4987 non-null   object 
 4   Blood Pressure Lying              1342 non-null   object 
 5   Blood Pressure Sitting            1484 non-null   object 
 6   Blood Pressure Standing           187 non-null    object 
 7   Blood Pressure Standing (1 min)   1353 non-null   object 
 8   Blood Pressure Standing (3 mins)  534 non-null    object 
 9   Height                            459 non-null    object 
 10  Height (Inches)                   4969 non-null   float64
 11  Weight                            1950 non-null   float64
 12  Weight

In [8]:
import pandas as pd
import re

# Load the patient dataset
df_eligible = pd.read_csv("data/merged_5000_patient_radio_disc.csv")

# # Inclusion: Age must be 55 or older
# df_eligible = df[df['anchor_age'] >= 55]

# Assuming a column 'ICD_Codes' that contains ICD codes as a comma-separated string.
# Define regex patterns for the inclusion ICD code prefixes.
# Hypertension (must be present)
htn_regex = r'\b(401|I10)'
# Additional risk factors (at least one required):
risk_regex = r'\b(410|I21|I22|4293|I51\.7|250|E11|3051|F17\.2|272\.5|E78\.5|440|I70)'

# Exclusion criteria: codes indicating severe conditions
exclusion_regex = r'\b(428|I50|434|436|I63|I64|585|586|N18|N19)\b'

# Create masks for each criterion based on ICD codes
# It is assumed that the 'ICD_Codes' field is a string that may contain multiple codes separated by commas.
def contains_pattern(codes_str, pattern):
    if pd.isna(codes_str):
        return False
    return bool(re.search(pattern, codes_str))

# Apply the masks: 

# Patient must have a hypertension diagnosis.
htn_mask = df_eligible['diagnoses_icd_code_version'].apply(lambda x: contains_pattern(x, htn_regex))
htn_mask = df_eligible['procedures_icd_code_version'].apply(lambda x: contains_pattern(x, htn_regex))

# Patient must have at least one additional CHD risk factor.
risk_mask = df_eligible['diagnoses_icd_code_version'].apply(lambda x: contains_pattern(x, risk_regex))
risk_mask = df_eligible['procedures_icd_code_version'].apply(lambda x: contains_pattern(x, risk_regex))

# Patient should NOT have any exclusion codes.
exclusion_mask = df_eligible['diagnoses_icd_code_version'].apply(lambda x: not contains_pattern(x, exclusion_regex))
exclusion_mask = df_eligible['procedures_icd_code_version'].apply(lambda x: not contains_pattern(x, exclusion_regex))

# Combine the masks to get the final eligible group
df_final = df_eligible[htn_mask | risk_mask]

# Output the count and a sample of qualifying patients
print("Total patients qualifying for the ALLHAT trial criteria:", len(df_final))
df_final.head()

Total patients qualifying for the ALLHAT trial criteria: 163


Unnamed: 0,subject_id,BMI,BMI (kg/m2),Blood Pressure,Blood Pressure Lying,Blood Pressure Sitting,Blood Pressure Standing,Blood Pressure Standing (1 min),Blood Pressure Standing (3 mins),Height,...,ELECTIVE,EU OBSERVATION,EW EMER.,OBSERVATION ADMIT,SURGICAL SAME DAY ADMISSION,URGENT,marital_status,race,radio_bioBERT_embedding,disc_bioBERT_embedding
2,17751804,25.0,20.7,102/60,118/70,110/72,,101/68,99/72,,...,4.0,1.0,4.0,3.0,1.0,,SINGLE,WHITE,,
12,19985545,23.0,25.3,102/66,117/78,110/70,,117/80,154/79,,...,3.0,3.0,15.0,15.0,,,MARRIED,WHITE - OTHER EUROPEAN,[ 1.39633209e-01 -1.59038469e-01 1.04453407e-...,[ 1.51621208e-01 -1.55696258e-01 -9.76459309e-...
14,15534164,28.0,24.9,101/70,114/77,117/71,108/74,131/74,,66.5,...,1.0,3.0,2.0,3.0,,,MARRIED,WHITE,[ 1.83287725e-01 -2.27743670e-01 5.75758033e-...,[ 1.43734738e-01 -1.46319315e-01 1.99887436e-...
15,13976907,,31.6,116/70,114/68,149/93,113/64,88/54,,,...,1.0,12.0,9.0,9.0,,1.0,MARRIED,WHITE,[ 7.92515799e-02 -1.22456186e-01 1.36946365e-...,[ 1.62082255e-01 -1.56106740e-01 5.42580895e-...
42,15790142,23.0,20.9,132/86,122/91,125/92,,109/78,110/82,,...,,2.0,2.0,12.0,,5.0,SINGLE,WHITE,[ 5.05083352e-02 -1.85041383e-01 1.31982043e-...,[ 1.13354146e-01 -1.42289743e-01 -6.92519844e-...


removed age restriction to add noice in the data and made an or statement to assume that the clinical study had it as an optional condition for having CHD risk factor 

In [9]:
# join dataset with patient demographic dataset
df_demo = pd.read_csv('data/patients_cleaned.csv')
df_demo.head()


Unnamed: 0,subject_id,gender,anchor_age,anchor_year,insurance,language,marital_status,race,blood_pressure_systolic,blood_pressure_diastolic,bmi,height,weight,egfr
0,10000117,F,48,2174,Medicaid,English,DIVORCED,WHITE,108.0,74.0,18.9,64.0,110.0,
1,10000161,M,60,2163,Medicaid,English,SINGLE,WHITE,106.0,92.0,,,,
2,10000248,M,34,2192,Private,English,MARRIED,WHITE,,,25.5,68.0,168.0,
3,10000280,M,20,2151,Private,English,,OTHER,125.0,77.0,,,170.5,
4,10000560,F,53,2189,Private,English,MARRIED,WHITE,124.0,78.0,,,128.0,


In [20]:
# Merge the DataFrames
combined_df = pd.merge(df, df_demo, how='inner', left_index=True,
                     right_index=True, suffixes=('', '_remove'))
 
# remove the duplicate columns
combined_df.drop([i for i in combined_df.columns if 'remove' in i],
               axis=1, inplace=True)

combined_df.head()

Unnamed: 0,subject_id,BMI,BMI (kg/m2),Blood Pressure,Blood Pressure Lying,Blood Pressure Sitting,Blood Pressure Standing,Blood Pressure Standing (1 min),Blood Pressure Standing (3 mins),Height,...,radio_bioBERT_embedding,disc_bioBERT_embedding,insurance,language,blood_pressure_systolic,blood_pressure_diastolic,bmi,height,weight,egfr
0,10207476,20.0,25.0,120/62,119/58,115/63,,114/51,,,...,[ 1.16570882e-01 -1.30169436e-01 1.55872822e-...,[ 1.38808012e-01 -1.36818409e-01 -2.57112831e-...,Medicaid,English,108.0,74.0,18.9,64.0,110.0,
1,14130048,42.0,39.9,160/96,150/90,150/90,,120/73,150/90,64.0,...,[ 7.90389553e-02 -5.54898381e-02 1.47240043e-...,[ 2.02867508e-01 -1.90792486e-01 4.20148969e-...,Medicaid,English,106.0,92.0,,,,
2,17751804,25.0,20.7,102/60,118/70,110/72,,101/68,99/72,,...,,,Private,English,,,25.5,68.0,168.0,
3,18632748,39.0,44.6,122/70,142/82,109/49,150/84,111/51,,,...,[ 3.53200674e-01 -2.37847850e-01 3.51036079e-...,[ 1.39690787e-01 -1.72594145e-01 -3.51392962e-...,Private,English,125.0,77.0,,,170.5,
4,18369403,,33.8,160/92,108/74,93/59,,88/64,,,...,[ 8.85203481e-02 -1.85742557e-01 2.28411064e-...,[ 1.61080152e-01 -2.11643443e-01 2.04216745e-...,Private,English,124.0,78.0,,,128.0,


In [21]:
combined_df.drop(columns=['Blood Pressure', 'Blood Pressure Lying','Blood Pressure Sitting', 'Blood Pressure Standing', 'Blood Pressure Standing (1 min)', 'Blood Pressure Standing (3 mins)', 'BMI', 'Height', 'Weight'], inplace=True)

In [22]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, 0 to 4999
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   subject_id                   5000 non-null   int64  
 1   BMI (kg/m2)                  4976 non-null   float64
 2   Height (Inches)              4969 non-null   float64
 3   Weight (Lbs)                 4979 non-null   float64
 4   eGFR                         61 non-null     object 
 5   gender                       5000 non-null   object 
 6   anchor_age                   5000 non-null   int64  
 7   anchor_year                  5000 non-null   int64  
 8   anchor_year_group            5000 non-null   object 
 9   dod                          2292 non-null   object 
 10  unique_drugs                 389 non-null    object 
 11  diagnoses_icd_code_version   5000 non-null   object 
 12  procedures_icd_code_version  4935 non-null   object 
 13  AMBULATORY OBSERVATION 

## Generalize ICD codes for broader categories

In [28]:
def generalize_icd_by_version(code, version_filter):
    """
    Given an ICD code in the format 'SpecificCode_Version' and a version filter (e.g., "9" or "10"),
    returns the generalized code (first three characters of the SpecificCode) if it matches the version.
    Otherwise, returns None.
    """
    if "_" not in code:
        return None
    code_part, version = code.split("_")
    if version != version_filter:
        return None
    return code_part[:3]

def generalize_patient_icd_codes_by_version(icd_str, version_filter):
    """
    Processes a comma-separated string of ICD codes, generalizes each code that matches the version_filter,
    and returns a sorted, comma-separated string of unique generalized codes.
    """
    if pd.isna(icd_str):
        return ""
    codes = [x.strip() for x in icd_str.split(",") if x.strip()]
    # Use a set to collect unique generalized codes for the specified version
    generalized = {generalize_icd_by_version(code, version_filter) for code in codes}
    # Remove None values if any
    generalized = {code for code in generalized if code is not None}
    return ", ".join(sorted(generalized))

# Assume the dataset has a column "ICD_Codes" containing comma-separated ICD codes.
# Create new columns for ICD-9 and ICD-10 generalized codes.
combined_df["Generalized_Diagnoses_ICD9"] = combined_df["diagnoses_icd_code_version"].apply(lambda x: generalize_patient_icd_codes_by_version(x, "9"))
combined_df["Generalized_Diagnoses_ICD10"] = combined_df["diagnoses_icd_code_version"].apply(lambda x: generalize_patient_icd_codes_by_version(x, "10"))
combined_df["Generalized_Procedures_ICD9"] = combined_df["procedures_icd_code_version"].apply(lambda x: generalize_patient_icd_codes_by_version(x, "9"))
combined_df["Generalized_Procedures_ICD10"] = combined_df["procedures_icd_code_version"].apply(lambda x: generalize_patient_icd_codes_by_version(x, "10"))

combined_df.head()

Unnamed: 0,subject_id,BMI (kg/m2),Height (Inches),Weight (Lbs),eGFR,gender,anchor_age,anchor_year,anchor_year_group,dod,...,bmi,height,weight,egfr,Generalized_Diagnoses_ICD_Codes,Generalized_Procedures_ICD_Codes,Generalized_Diagnoses_ICD9,Generalized_Diagnoses_ICD10,Generalized_Procedures_ICD9,Generalized_Procedures_ICD10
0,10207476,25.0,66.0,152.6,,F,63,2169,2008 - 2010,2185-03-22,...,18.9,64.0,110.0,,"008, 041, 079, 205, 207, 250, 255, 266, 272, 2...","001, 004, 006, 02H, 02V, 03H, 360, 361, 372, 3...","008, 041, 079, 205, 207, 250, 255, 266, 272, 2...","A04, A41, B02, C92, C93, C94, D46, D47, D61, D...","001, 004, 006, 360, 361, 372, 389, 452, 457, 4...","02H, 02V, 03H, 4A1, B24"
1,14130048,39.9,63.0,225.4,,F,55,2157,2008 - 2010,,...,,,,,"008, 041, 242, 272, 276, 278, 280, 285, 327, 3...","02H, 0D2, 0DB, 0DD, 0DW, 348, 389, 3C1, 3E0, 4...","008, 041, 242, 272, 276, 278, 280, 285, 327, 3...","A08, B37, B97, D51, E05, E43, E44, E55, E66, E...","348, 389, 446, 463, 534, 536, 537, 560, 573, 5...","02H, 0D2, 0DB, 0DD, 0DW, 3C1, 3E0, B21"
2,17751804,20.7,68.0,162.0,,M,55,2134,2008 - 2010,2147-09-04,...,25.5,68.0,168.0,,"203, 272, 276, 284, 288, 295, 300, 305, 307, 4...","02H, 0DB, 0DH, 0DT, 0JH, 0W9, 389, 3E0, 410, 4...","203, 272, 276, 284, 288, 295, 300, 305, 307, 4...","B37, B95, B96, C18, C77, C90, D50, D62, D63, D...","389, 410, 413, 861, 992","02H, 0DB, 0DH, 0DT, 0JH, 0W9, 3E0, DPY, DWY"
3,18632748,44.6,57.0,206.0,,F,61,2145,2008 - 2010,,...,,,170.5,,"250, 272, 275, 276, 278, 285, 288, 296, 300, 3...","02H, 0FB, 0JX, 0KB, 0LS, 0RB, 0RC, 0RP, 0RR, 2...","250, 272, 275, 276, 278, 285, 288, 296, 300, 3...","A08, B17, B95, B96, D62, D64, D72, E04, E09, E...","352, 372, 389, 396, 451, 452, 681, 690, 707, 8...","02H, 0FB, 0JX, 0KB, 0LS, 0RB, 0RC, 0RP, 0RR, 2W1"
4,18369403,33.8,72.0,249.2,,M,46,2195,2008 - 2010,2208-10-25,...,,,128.0,,"008, 041, 070, 110, 250, 272, 274, 276, 278, 2...","004, 009, 027, 02H, 03H, 047, 04H, 05H, 0B9, 0...","008, 041, 070, 110, 250, 272, 274, 276, 278, 2...","A04, A41, B18, B19, B95, B96, D35, D45, D63, D...","004, 009, 389, 395, 399, 556, 860, 862, 991","027, 02H, 03H, 047, 04H, 05H, 0B9, 0BC, 0BH, 0..."


In [29]:
combined_df.to_csv('data/patient_data_merged_with_radiology_dataset.csv', index=False)