# Readmission Risk for Heart Failure Patients

In [1]:
import mimicfouretl.bigquery_utils as bq
from mimicfouretl.data_insights import display_datasets
from mimicfouretl.query_builder import QueryBuilder
from mimicfouretl.feature_engineering import FeatureEngineering
from mimicfouretl.feature_engineering import left_merge_dataframes

from pyspark.sql.functions import col

from mimicfouretl.ml_utils import MLUtils

## Build BigQuery Spark session

In [2]:
tables = bq.list_tables('mimiciv_hosp', use_local_data=True)
tables

['mimiciv_hosp.hcpcsevents',
 'mimiciv_hosp.patients',
 'mimiciv_hosp.emar',
 'mimiciv_hosp.d_hcpcs',
 'mimiciv_hosp.pharmacy',
 'mimiciv_hosp.emar_detail',
 'mimiciv_hosp.d_labitems',
 'mimiciv_hosp.prescriptions',
 'mimiciv_hosp.diagnoses_icd',
 'mimiciv_hosp.transfers',
 'mimiciv_hosp.d_icd_diagnoses',
 'mimiciv_hosp.poe',
 'mimiciv_hosp.omr',
 'mimiciv_hosp.procedures_icd',
 'mimiciv_hosp.microbiologyevents',
 'mimiciv_hosp.poe_detail',
 'mimiciv_hosp.labevents',
 'mimiciv_hosp.services',
 'mimiciv_hosp.d_icd_procedures',
 'mimiciv_hosp.admissions']

In [3]:
display_datasets()

Dropdown(description='Dataset:', options=('hosp.pharmacy', 'hosp.provider', 'hosp.poe_detail', 'hosp.admission…

Output()

In [4]:
spark = bq.get_spark_session(use_local_data=True)

24/03/30 16:07:32 WARN Utils: Your hostname, Dunyas-Laptop.local resolves to a loopback address: 127.0.0.1; using 10.0.0.117 instead (on interface en0)
24/03/30 16:07:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/30 16:07:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Get relevant ICD codes and Lab Item IDs

### Get ICD codes for heart failure diagnoses

In [64]:
qb_heart_failure_codes = QueryBuilder(dataset='hosp.d_icd_diagnoses', 
                                      columns=['icd_code', 'icd_version', 'long_title'],
                                      filters="LOWER(long_title) LIKE '%heart failure%'")
heart_failure_codes_query = qb_heart_failure_codes.generate_query()
print(heart_failure_codes_query)

SELECT long_title, icd_version, icd_code
FROM `mimiciv_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE '%heart failure%'


In [65]:
heart_failure_icd_items = bq.run_query(spark, heart_failure_codes_query, use_local_data=True)

In [66]:
heart_failure_icd_items.toPandas()

Unnamed: 0,long_title,icd_version,icd_code
0,Acute diastolic heart failure,9,42831
1,"Diastolic heart failure, unspecified",9,42830
2,"Congestive heart failure, unspecified",9,4280
3,Acute on chronic systolic heart failure,9,42823
4,"Systolic heart failure, unspecified",9,42820
5,Chronic systolic heart failure,9,42822
6,Chronic diastolic heart failure,9,42832
7,Acute systolic (congestive) heart failure,10,I5021
8,Acute on chronic diastolic (congestive) heart failure,10,I5033
9,Hypertensive heart disease with heart failure,10,I110


In [67]:
heart_failure_icd_codes_list = [row['icd_code'] for row in heart_failure_icd_items.select('icd_code').distinct().collect()]

In [68]:
heart_failure_icd_codes_str = "'" + "', '".join(heart_failure_icd_codes_list) + "'"

### Get itemids for BNP labs
#### Removed BNP filter

In [69]:
qb_bnp_labs = QueryBuilder(dataset='hosp.d_labitems', 
                           columns=['itemid', 'label', 'fluid', 'category'],
                           filters=["LOWER(label) LIKE '%bnp%'", "fluid = 'Blood'"])
bnp_labs_query = qb_bnp_labs.generate_query()
print(bnp_labs_query)

SELECT label, category, itemid, fluid
FROM `mimiciv_hosp.d_labitems`
WHERE LOWER(label) LIKE '%bnp%' AND fluid = 'Blood'


In [70]:
bnp_lab_items = bq.run_query(spark, bnp_labs_query, use_local_data=True)

In [71]:
bnp_lab_items.toPandas()

Unnamed: 0,label,category,itemid,fluid
0,NTproBNP,Chemistry,50963,Blood


In [72]:
bnp_lab_itemids_list = [row['itemid'] for row in bnp_lab_items.select('itemid').distinct().collect()]

In [73]:
bnp_lab_itemids_str = ', '.join(str(itemid) for itemid in bnp_lab_itemids_list)

## Query MIMIC IV database for Heart Failure diagnoses and BNP labs

In [74]:
# Initialize QueryBuilders
qb_diagnoses = QueryBuilder(dataset='hosp.diagnoses_icd', 
                            columns=['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version'],
                            filters=f'icd_code IN ({heart_failure_icd_codes_str})')

qb_labevents = QueryBuilder(dataset='hosp.labevents', 
                            columns=['subject_id', 'hadm_id', 'itemid', 'valuenum', 'ref_range_lower', 'ref_range_upper'],
                            filters=f'itemid IN ({bnp_lab_itemids_str})')

# Join datasets
qb_diagnoses.join_with(qb_labevents, join_type='inner', columns=['subject_id', 'hadm_id'])

# Generate query for joined data
qualifying_hosp_admissions_query = qb_diagnoses.generate_query(limit=100000)
# This query can now be used to extract the relevant joined data
print(qualifying_hosp_admissions_query)

SELECT seq_num, valuenum, ref_range_lower, icd_version, icd_code, ref_range_upper, `mimiciv_hosp.diagnoses_icd`.hadm_id, itemid, `mimiciv_hosp.diagnoses_icd`.subject_id
FROM `mimiciv_hosp.diagnoses_icd`
INNER JOIN `mimiciv_hosp.labevents` ON `mimiciv_hosp.diagnoses_icd`.subject_id = `mimiciv_hosp.labevents`.subject_id AND `mimiciv_hosp.diagnoses_icd`.hadm_id = `mimiciv_hosp.labevents`.hadm_id
WHERE icd_code IN ('I5021', 'I5033', '42830', 'I130', 'I5032', '4280', 'I110', '42832', '42822', 'I5031', 'I5030', '42831', '42823', '42820', 'I132') AND itemid IN (50963)
LIMIT 100000


In [75]:
qualifying_hosp_admissions_df = bq.run_query(spark, qualifying_hosp_admissions_query, use_local_data=True)

## Create binary feature checking if BNP value is outside of reference range

In [76]:
feature_engineer = FeatureEngineering(qualifying_hosp_admissions_df)

In [77]:
condition_str = "(valuenum < ref_range_lower) OR (valuenum > ref_range_upper)"
feature_engineer.create_conditional_feature(condition_str, "bnp_outside_ref_range")

In [78]:
qualifying_hosp_admissions_df = feature_engineer.get_processed_data()
qualifying_hosp_admissions_df.toPandas()

Unnamed: 0,seq_num,valuenum,ref_range_lower,icd_version,icd_code,ref_range_upper,hadm_id,itemid,subject_id,bnp_outside_ref_range
0,4,448.0,0.0,9,4280,624.0,24753776,50963,17464192,0
1,3,448.0,0.0,9,42832,624.0,24753776,50963,17464192,0
2,5,64622.0,0.0,10,I132,353.0,21537709,50963,12715419,1
3,3,64622.0,0.0,10,I5033,353.0,21537709,50963,12715419,1
4,20,3407.0,0.0,10,I110,353.0,28371642,50963,12468255,1
5,4,3407.0,0.0,10,I5021,353.0,28371642,50963,12468255,1
6,6,44337.0,0.0,9,4280,226.0,28660713,50963,12715419,1
7,4,44337.0,0.0,9,42822,226.0,28660713,50963,12715419,1
8,2,1245.0,0.0,10,I5033,852.0,20456036,50963,11599045,1
9,5,1245.0,0.0,10,I130,852.0,20456036,50963,11599045,1


## Get admissions data, filtered by Subject ID of Qualifying Patients

In [79]:
qualifying_hosp_admissions_subject_ids_list = [row['subject_id'] for row in qualifying_hosp_admissions_df.select('subject_id').distinct().collect()]

In [80]:
len(qualifying_hosp_admissions_subject_ids_list)

4

In [82]:
qualifying_hosp_admissions_subject_ids_str = ', '.join(str(subject_id) for subject_id in qualifying_hosp_admissions_subject_ids_list)

In [83]:
# Initialize QueryBuilders
qb_admissions = QueryBuilder(dataset='hosp.admissions',
                             filters=f'subject_id IN ({qualifying_hosp_admissions_subject_ids_str})')

# Generate query for joined data
admissions_query = qb_admissions.generate_query(limit=100000)
# This query can now be used to extract the relevant joined data
#print(admissions_query)

In [84]:
admissions_df = bq.run_query(spark, admissions_query, use_local_data=True)

In [85]:
admissions_df.toPandas()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,11599045,29825042,2119-09-07 22:00:00,2119-09-10 16:40:00,NaT,URGENT,P20N5X,TRANSFER FROM HOSPITAL,HOME,Medicare,ENGLISH,MARRIED,UNKNOWN,NaT,NaT,0
1,17464192,24753776,2154-03-30 16:42:00,2154-04-03 18:31:00,NaT,EW EMER.,P29CGZ,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,ENGLISH,WIDOWED,WHITE,2154-03-30 11:25:00,2154-03-30 18:00:00,0
2,12715419,26153129,2164-07-16 20:45:00,2164-07-20 16:45:00,NaT,EW EMER.,P14622,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,WIDOWED,BLACK/AFRICAN AMERICAN,2164-07-16 15:25:00,2164-07-16 23:35:00,0
3,12715419,28660713,2157-05-30 18:56:00,2157-06-01 17:35:00,NaT,EW EMER.,P13MPH,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,WIDOWED,BLACK/AFRICAN AMERICAN,2157-05-30 14:02:00,2157-05-30 20:08:00,0
4,12715419,21418445,2158-05-27 09:09:00,2158-05-29 16:07:00,NaT,EW EMER.,P612YK,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,WIDOWED,BLACK/AFRICAN AMERICAN,2158-05-27 06:15:00,2158-05-27 10:36:00,0
5,12715419,21789190,2162-03-03 02:33:00,2162-03-14 19:55:00,NaT,EW EMER.,P20DGX,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,WIDOWED,BLACK/AFRICAN AMERICAN,2162-03-02 23:19:00,2162-03-03 03:46:00,0
6,12715419,26716153,2157-09-22 11:06:00,2157-09-23 17:50:00,NaT,EU OBSERVATION,P20E5L,EMERGENCY ROOM,HOME,Medicare,ENGLISH,WIDOWED,BLACK/AFRICAN AMERICAN,2157-09-22 06:40:00,2157-09-22 12:40:00,0
7,12468255,28371642,2178-02-14 21:12:00,2178-02-23 16:25:00,NaT,OBSERVATION ADMIT,P327CS,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Other,ENGLISH,WIDOWED,WHITE,2178-02-14 11:42:00,2178-02-14 23:56:00,0
8,12468255,29212634,2178-01-11 15:49:00,2178-01-16 18:50:00,NaT,OBSERVATION ADMIT,P37QZV,PHYSICIAN REFERRAL,HOME,Other,ENGLISH,WIDOWED,WHITE,NaT,NaT,0
9,12468255,25722662,2177-10-26 21:27:00,2177-11-30 17:15:00,NaT,OBSERVATION ADMIT,P5516X,PROCEDURE SITE,HOME HEALTH CARE,Other,ENGLISH,WIDOWED,WHITE,NaT,NaT,0


## Count Previous Admissions for Qualifying Patients

In [86]:
feature_engineer = FeatureEngineering(admissions_df)

In [87]:
feature_engineer.count_previous_events(
    partition_column='subject_id',
    order_column='admittime',
    event_column='hadm_id'
)

In [88]:
admissions_df = feature_engineer.get_processed_data()
admissions_df.toPandas()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag,previous_hadm_id
0,11599045,29825042,2119-09-07 22:00:00,2119-09-10 16:40:00,NaT,URGENT,P20N5X,TRANSFER FROM HOSPITAL,HOME,Medicare,ENGLISH,MARRIED,UNKNOWN,NaT,NaT,0,0
1,11599045,20456036,2120-04-26 12:27:00,2120-05-06 18:58:00,NaT,OBSERVATION ADMIT,P34SHK,PHYSICIAN REFERRAL,SKILLED NURSING FACILITY,Medicare,ENGLISH,MARRIED,WHITE,2120-04-25 22:56:00,2120-04-26 15:53:00,0,1
2,12468255,25722662,2177-10-26 21:27:00,2177-11-30 17:15:00,NaT,OBSERVATION ADMIT,P5516X,PROCEDURE SITE,HOME HEALTH CARE,Other,ENGLISH,WIDOWED,WHITE,NaT,NaT,0,0
3,12468255,29212634,2178-01-11 15:49:00,2178-01-16 18:50:00,NaT,OBSERVATION ADMIT,P37QZV,PHYSICIAN REFERRAL,HOME,Other,ENGLISH,WIDOWED,WHITE,NaT,NaT,0,1
4,12468255,28371642,2178-02-14 21:12:00,2178-02-23 16:25:00,NaT,OBSERVATION ADMIT,P327CS,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Other,ENGLISH,WIDOWED,WHITE,2178-02-14 11:42:00,2178-02-14 23:56:00,0,2
5,12715419,28660713,2157-05-30 18:56:00,2157-06-01 17:35:00,NaT,EW EMER.,P13MPH,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,WIDOWED,BLACK/AFRICAN AMERICAN,2157-05-30 14:02:00,2157-05-30 20:08:00,0,0
6,12715419,26716153,2157-09-22 11:06:00,2157-09-23 17:50:00,NaT,EU OBSERVATION,P20E5L,EMERGENCY ROOM,HOME,Medicare,ENGLISH,WIDOWED,BLACK/AFRICAN AMERICAN,2157-09-22 06:40:00,2157-09-22 12:40:00,0,1
7,12715419,21418445,2158-05-27 09:09:00,2158-05-29 16:07:00,NaT,EW EMER.,P612YK,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,WIDOWED,BLACK/AFRICAN AMERICAN,2158-05-27 06:15:00,2158-05-27 10:36:00,0,2
8,12715419,21789190,2162-03-03 02:33:00,2162-03-14 19:55:00,NaT,EW EMER.,P20DGX,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,ENGLISH,WIDOWED,BLACK/AFRICAN AMERICAN,2162-03-02 23:19:00,2162-03-03 03:46:00,0,3
9,12715419,24649413,2163-09-26 21:09:00,2163-09-27 15:37:00,NaT,AMBULATORY OBSERVATION,P878WT,PROCEDURE SITE,,Medicare,ENGLISH,WIDOWED,BLACK/AFRICAN AMERICAN,NaT,NaT,0,4


## Check for Readmission within 30, 90, and 180 days

In [89]:
feature_engineer.check_event_within_timeframe(
    partition_column='subject_id',
    event_column='admittime', 
    timeframe=30
)

In [90]:
feature_engineer.check_event_within_timeframe(
    partition_column='subject_id',
    event_column='admittime', 
    timeframe=90
)

In [91]:
feature_engineer.check_event_within_timeframe(
    partition_column='subject_id',
    event_column='admittime', 
    timeframe=180
)

In [92]:
admissions_df = feature_engineer.get_processed_data()
admissions_df.toPandas()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,...,race,edregtime,edouttime,hospital_expire_flag,previous_hadm_id,next_admittime_date,days_to_next_admittime,admittime_within_30_days,admittime_within_90_days,admittime_within_180_days
0,11599045,29825042,2119-09-07 22:00:00,2119-09-10 16:40:00,NaT,URGENT,P20N5X,TRANSFER FROM HOSPITAL,HOME,Medicare,...,UNKNOWN,NaT,NaT,0,0,2120-04-26 12:27:00,232.0,0,0,0
1,11599045,20456036,2120-04-26 12:27:00,2120-05-06 18:58:00,NaT,OBSERVATION ADMIT,P34SHK,PHYSICIAN REFERRAL,SKILLED NURSING FACILITY,Medicare,...,WHITE,2120-04-25 22:56:00,2120-04-26 15:53:00,0,1,NaT,,0,0,0
2,12468255,25722662,2177-10-26 21:27:00,2177-11-30 17:15:00,NaT,OBSERVATION ADMIT,P5516X,PROCEDURE SITE,HOME HEALTH CARE,Other,...,WHITE,NaT,NaT,0,0,2178-01-11 15:49:00,77.0,0,1,1
3,12468255,29212634,2178-01-11 15:49:00,2178-01-16 18:50:00,NaT,OBSERVATION ADMIT,P37QZV,PHYSICIAN REFERRAL,HOME,Other,...,WHITE,NaT,NaT,0,1,2178-02-14 21:12:00,34.0,0,1,1
4,12468255,28371642,2178-02-14 21:12:00,2178-02-23 16:25:00,NaT,OBSERVATION ADMIT,P327CS,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Other,...,WHITE,2178-02-14 11:42:00,2178-02-14 23:56:00,0,2,NaT,,0,0,0
5,12715419,28660713,2157-05-30 18:56:00,2157-06-01 17:35:00,NaT,EW EMER.,P13MPH,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,...,BLACK/AFRICAN AMERICAN,2157-05-30 14:02:00,2157-05-30 20:08:00,0,0,2157-09-22 11:06:00,115.0,0,0,1
6,12715419,26716153,2157-09-22 11:06:00,2157-09-23 17:50:00,NaT,EU OBSERVATION,P20E5L,EMERGENCY ROOM,HOME,Medicare,...,BLACK/AFRICAN AMERICAN,2157-09-22 06:40:00,2157-09-22 12:40:00,0,1,2158-05-27 09:09:00,247.0,0,0,0
7,12715419,21418445,2158-05-27 09:09:00,2158-05-29 16:07:00,NaT,EW EMER.,P612YK,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,...,BLACK/AFRICAN AMERICAN,2158-05-27 06:15:00,2158-05-27 10:36:00,0,2,2162-03-03 02:33:00,1376.0,0,0,0
8,12715419,21789190,2162-03-03 02:33:00,2162-03-14 19:55:00,NaT,EW EMER.,P20DGX,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,...,BLACK/AFRICAN AMERICAN,2162-03-02 23:19:00,2162-03-03 03:46:00,0,3,2163-09-26 21:09:00,572.0,0,0,0
9,12715419,24649413,2163-09-26 21:09:00,2163-09-27 15:37:00,NaT,AMBULATORY OBSERVATION,P878WT,PROCEDURE SITE,,Medicare,...,BLACK/AFRICAN AMERICAN,NaT,NaT,0,4,2164-07-16 20:45:00,294.0,0,0,0


## Merge Admissions data with Qualifying Admissions data

In [93]:
merged_df = left_merge_dataframes(qualifying_hosp_admissions_df, admissions_df, ['subject_id', 'hadm_id'])

In [94]:
merged_df.toPandas()

Unnamed: 0,seq_num,valuenum,ref_range_lower,icd_version,icd_code,ref_range_upper,hadm_id,itemid,subject_id,bnp_outside_ref_range,...,race,edregtime,edouttime,hospital_expire_flag,previous_hadm_id,next_admittime_date,days_to_next_admittime,admittime_within_30_days,admittime_within_90_days,admittime_within_180_days
0,4,448.0,0.0,9,4280,624.0,24753776,50963,17464192,0,...,WHITE,2154-03-30 11:25:00,2154-03-30 18:00:00,0,0,NaT,,0,0,0
1,3,448.0,0.0,9,42832,624.0,24753776,50963,17464192,0,...,WHITE,2154-03-30 11:25:00,2154-03-30 18:00:00,0,0,NaT,,0,0,0
2,5,64622.0,0.0,10,I132,353.0,21537709,50963,12715419,1,...,BLACK/AFRICAN AMERICAN,2166-03-07 16:15:00,2166-03-08 02:10:00,0,8,2166-05-28 22:47:00,81.0,0,1,1
3,3,64622.0,0.0,10,I5033,353.0,21537709,50963,12715419,1,...,BLACK/AFRICAN AMERICAN,2166-03-07 16:15:00,2166-03-08 02:10:00,0,8,2166-05-28 22:47:00,81.0,0,1,1
4,20,3407.0,0.0,10,I110,353.0,28371642,50963,12468255,1,...,WHITE,2178-02-14 11:42:00,2178-02-14 23:56:00,0,2,NaT,,0,0,0
5,4,3407.0,0.0,10,I5021,353.0,28371642,50963,12468255,1,...,WHITE,2178-02-14 11:42:00,2178-02-14 23:56:00,0,2,NaT,,0,0,0
6,6,44337.0,0.0,9,4280,226.0,28660713,50963,12715419,1,...,BLACK/AFRICAN AMERICAN,2157-05-30 14:02:00,2157-05-30 20:08:00,0,0,2157-09-22 11:06:00,115.0,0,0,1
7,4,44337.0,0.0,9,42822,226.0,28660713,50963,12715419,1,...,BLACK/AFRICAN AMERICAN,2157-05-30 14:02:00,2157-05-30 20:08:00,0,0,2157-09-22 11:06:00,115.0,0,0,1
8,2,1245.0,0.0,10,I5033,852.0,20456036,50963,11599045,1,...,WHITE,2120-04-25 22:56:00,2120-04-26 15:53:00,0,1,NaT,,0,0,0
9,5,1245.0,0.0,10,I130,852.0,20456036,50963,11599045,1,...,WHITE,2120-04-25 22:56:00,2120-04-26 15:53:00,0,1,NaT,,0,0,0


## Get Patient Data for Qualifying Patients

In [95]:
# Initialize QueryBuilders
qb_patients = QueryBuilder(dataset='hosp.patients',
                           columns=['subject_id', 'gender', 'anchor_age'],
                           filters=f'subject_id IN ({qualifying_hosp_admissions_subject_ids_str})')

# Generate query for joined data
patients_query = qb_patients.generate_query(limit=100000)
# This query can now be used to extract the relevant joined data
# print(patients_query)

In [96]:
patients_df = bq.run_query(spark, patients_query, use_local_data=True)

In [97]:
patients_df.toPandas()

Unnamed: 0,gender,subject_id,anchor_age
0,F,12715419,62
1,F,17464192,81
2,F,12468255,65
3,M,11599045,79


### Merge Patients Data

In [98]:
merged_df = left_merge_dataframes(merged_df, patients_df, ['subject_id'])

## Create Demographic Features

In [99]:
feature_engineer = FeatureEngineering(merged_df)

In [100]:
condition_str = "LOWER(gender) LIKE '%m%'"
feature_engineer.create_conditional_feature(condition_str, "gender_male")

In [101]:
condition_str = "LOWER(race) LIKE '%white%'"
feature_engineer.create_conditional_feature(condition_str, "race_white")

In [102]:
condition_str = "LOWER(race) LIKE '%black%'"
feature_engineer.create_conditional_feature(condition_str, "race_black")

In [103]:
condition_str = "LOWER(race) LIKE '%hispanic%' OR LOWER(race) LIKE '%latino%'"
feature_engineer.create_conditional_feature(condition_str, "race_hispanic_latino")

In [104]:
condition_str = "LOWER(race) LIKE '%asian%'"
feature_engineer.create_conditional_feature(condition_str, "race_asian")

In [105]:
condition_str = "LOWER(insurance) LIKE '%medicare%'"
feature_engineer.create_conditional_feature(condition_str, "insurance_medicare")

In [106]:
condition_str = "LOWER(insurance) LIKE '%medicaid%'"
feature_engineer.create_conditional_feature(condition_str, "insurance_medicaid")

In [107]:
condition_str = "LOWER(marital_status) LIKE '%married%'"
feature_engineer.create_conditional_feature(condition_str, "marital_status_married")

In [108]:
condition_str = "LOWER(marital_status) LIKE '%single%'"
feature_engineer.create_conditional_feature(condition_str, "marital_status_single")

In [109]:
condition_str = "LOWER(marital_status) LIKE '%widowed%'"
feature_engineer.create_conditional_feature(condition_str, "marital_status_widowed")

In [110]:
condition_str = "LOWER(marital_status) LIKE '%divorced%'"
feature_engineer.create_conditional_feature(condition_str, "marital_status_divorced")

In [111]:
processed_df = feature_engineer.get_processed_data()

In [112]:
processed_df.toPandas()

Unnamed: 0,seq_num,valuenum,ref_range_lower,icd_version,icd_code,ref_range_upper,hadm_id,itemid,subject_id,bnp_outside_ref_range,...,race_white,race_black,race_hispanic_latino,race_asian,insurance_medicare,insurance_medicaid,marital_status_married,marital_status_single,marital_status_widowed,marital_status_divorced
0,4,448.0,0.0,9,4280,624.0,24753776,50963,17464192,0,...,1,0,0,0,1,0,0,0,1,0
1,3,448.0,0.0,9,42832,624.0,24753776,50963,17464192,0,...,1,0,0,0,1,0,0,0,1,0
2,5,64622.0,0.0,10,I132,353.0,21537709,50963,12715419,1,...,0,1,0,0,1,0,0,0,1,0
3,3,64622.0,0.0,10,I5033,353.0,21537709,50963,12715419,1,...,0,1,0,0,1,0,0,0,1,0
4,20,3407.0,0.0,10,I110,353.0,28371642,50963,12468255,1,...,1,0,0,0,0,0,0,0,1,0
5,4,3407.0,0.0,10,I5021,353.0,28371642,50963,12468255,1,...,1,0,0,0,0,0,0,0,1,0
6,6,44337.0,0.0,9,4280,226.0,28660713,50963,12715419,1,...,0,1,0,0,1,0,0,0,1,0
7,4,44337.0,0.0,9,42822,226.0,28660713,50963,12715419,1,...,0,1,0,0,1,0,0,0,1,0
8,2,1245.0,0.0,10,I5033,852.0,20456036,50963,11599045,1,...,1,0,0,0,1,0,1,0,0,0
9,5,1245.0,0.0,10,I130,852.0,20456036,50963,11599045,1,...,1,0,0,0,1,0,1,0,0,0


## Clean up Final DataFrame

In [113]:
processed_df = processed_df.withColumnRenamed('previous_hadm_id', 'n_prev_hosp_admissions')

In [114]:
features = ['subject_id',
 'hadm_id',
 'seq_num',
 'admittime',
 'dischtime',
 'deathtime',
 'valuenum',
 'bnp_outside_ref_range',
 'n_prev_hosp_admissions',
 'anchor_age',
 'gender_male',
 'race_white',
 'race_black',
 'race_hispanic_latino',
 'race_asian',
 'insurance_medicare',
 'insurance_medicaid',
 'marital_status_married',
 'marital_status_single',
 'marital_status_widowed',
 'marital_status_divorced',
 'admittime_within_30_days',
 'admittime_within_90_days',
 'admittime_within_180_days']

In [115]:
final_df = processed_df.select([col(f) for f in features])
final_df.toPandas()

Unnamed: 0,subject_id,hadm_id,seq_num,admittime,dischtime,deathtime,valuenum,bnp_outside_ref_range,n_prev_hosp_admissions,anchor_age,...,race_asian,insurance_medicare,insurance_medicaid,marital_status_married,marital_status_single,marital_status_widowed,marital_status_divorced,admittime_within_30_days,admittime_within_90_days,admittime_within_180_days
0,17464192,24753776,4,2154-03-30 16:42:00,2154-04-03 18:31:00,NaT,448.0,0,0,81,...,0,1,0,0,0,1,0,0,0,0
1,17464192,24753776,3,2154-03-30 16:42:00,2154-04-03 18:31:00,NaT,448.0,0,0,81,...,0,1,0,0,0,1,0,0,0,0
2,12715419,21537709,5,2166-03-08 00:38:00,2166-03-10 15:45:00,NaT,64622.0,1,8,62,...,0,1,0,0,0,1,0,0,1,1
3,12715419,21537709,3,2166-03-08 00:38:00,2166-03-10 15:45:00,NaT,64622.0,1,8,62,...,0,1,0,0,0,1,0,0,1,1
4,12468255,28371642,20,2178-02-14 21:12:00,2178-02-23 16:25:00,NaT,3407.0,1,2,65,...,0,0,0,0,0,1,0,0,0,0
5,12468255,28371642,4,2178-02-14 21:12:00,2178-02-23 16:25:00,NaT,3407.0,1,2,65,...,0,0,0,0,0,1,0,0,0,0
6,12715419,28660713,6,2157-05-30 18:56:00,2157-06-01 17:35:00,NaT,44337.0,1,0,62,...,0,1,0,0,0,1,0,0,0,1
7,12715419,28660713,4,2157-05-30 18:56:00,2157-06-01 17:35:00,NaT,44337.0,1,0,62,...,0,1,0,0,0,1,0,0,0,1
8,11599045,20456036,2,2120-04-26 12:27:00,2120-05-06 18:58:00,NaT,1245.0,1,1,79,...,0,1,0,1,0,0,0,0,0,0
9,11599045,20456036,5,2120-04-26 12:27:00,2120-05-06 18:58:00,NaT,1245.0,1,1,79,...,0,1,0,1,0,0,0,0,0,0


In [116]:
# Initialize MLUtils with the DataFrame
ml_utils = MLUtils(final_df)

# Set the target and feature columns
target_column = 'admittime_within_180_days' 
feature_columns = ['valuenum',
                   'bnp_outside_ref_range',
                   'n_prev_hosp_admissions',
                   'anchor_age',
                   'gender_male',
                   'race_white',
                   'race_black',
                   'race_hispanic_latino',
                   'race_asian',
                   'insurance_medicare',
                   'insurance_medicaid',
                   'marital_status_married',
                   'marital_status_single',
                   'marital_status_widowed',
                   'marital_status_divorced']

ml_utils.set_target(target_column)
ml_utils.set_features(feature_columns)

# Drop all rows where a feature or target value is NaN
ml_utils.clean_data(verbose=True)

# Split data into training, validation, and testing sets
ml_utils.split_data(ratio=(0.7, 0.15, 0.15))

# Train the model using XGBoost with SMOTE and undersampling
ml_utils.train_classification_model(smote=True, undersample_factor=0.5, verbose=True)

# Evaluate the model on validation and test datasets
validation_metrics = ml_utils.evaluate_classification_model(eval_type='val')
test_metrics = ml_utils.evaluate_classification_model(eval_type='test')

# Print evaluation metrics
display("Validation Metrics:", validation_metrics)
display("Test Metrics:", test_metrics)

ml_utils.display_confusion_matrix('val')
ml_utils.display_confusion_matrix('test')

Number of rows dropped: 0
Minority Class Count: 3, Majority Class Count: 4
Minority/Majority Ratio: 0.7500
Undersampling Majority. New Minority/Majority Ratio: 0.8750


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.