# Readmission Risk for Heart Failure Patients

In [None]:
import mimicfouretl.bigquery_utils as bq
from mimicfouretl.data_insights import display_datasets
from mimicfouretl.query_builder import QueryBuilder
from mimicfouretl.feature_engineering import FeatureEngineering
from mimicfouretl.feature_engineering import left_merge_dataframes

from pyspark.sql.functions import col

from mimicfouretl.ml_utils import MLUtils

## Build BigQuery Spark session

In [None]:
# Point the BigQuery helper at a local credentials file and select the project to use.
# NOTE(review): both values are hard-coded — presumably specific to the
# author's environment; adjust before running elsewhere.
bq.set_credentials_file('../bq_credentials/client_secret.json')
bq.set_project_id('mimic-iv-418015')
# Alternate project ID, kept for reference:
# bq.set_project_id('micro-vine-412020')

In [None]:
# Create the BigQuery client; service-account authentication is explicitly disabled.
client = bq.get_client(use_service_account_auth=False)

In [None]:
# List the tables of the 'physionet-data.mimiciv_hosp' dataset.
# The commented-out line is a variant targeting 'mimiciv_icu'.
# tables = bq.list_tables('mimiciv_icu', client)
tables = bq.list_tables('physionet-data.mimiciv_hosp', client)
tables  # last expression of the cell, so the notebook shows it

In [None]:
# Helper from mimicfouretl.data_insights; presumably shows the datasets that
# QueryBuilder can reference — confirm against the helper.
display_datasets()

In [None]:
# Spark session from the BigQuery helper; it is passed to every bq.run_query call below.
spark = bq.get_spark_session()

## Get relevant ICD codes and Lab Item IDs

### Get ICD codes for heart failure diagnoses

In [None]:
# Look up every ICD diagnosis entry whose long title mentions "heart failure"
# (case-insensitive via LOWER), keeping the ICD version alongside each code.
qb_heart_failure_codes = QueryBuilder(dataset='hosp.d_icd_diagnoses', 
                                      columns=['icd_code', 'icd_version', 'long_title'],
                                      filters="LOWER(long_title) LIKE '%heart failure%'")
heart_failure_codes_query = qb_heart_failure_codes.generate_query()
# Print the generated query for inspection before running it.
print(heart_failure_codes_query)

In [None]:
# Execute the lookup query through the Spark session; the result is used as a DataFrame below.
heart_failure_icd_items = bq.run_query(spark, heart_failure_codes_query)

In [None]:
# Convert to pandas so the notebook renders the matching ICD entries.
heart_failure_icd_items.toPandas()

#### Transform Heart Failure ICD codes list to string for query

In [None]:
# Pull the distinct ICD codes out of the query result into a plain Python list.
heart_failure_icd_codes_list = [
    code_row['icd_code']
    for code_row in heart_failure_icd_items.select('icd_code').distinct().collect()
]

In [None]:
# Render the codes as a single-quoted, comma-separated list, ready to be
# placed inside an "IN (...)" filter.
heart_failure_icd_codes_str = "'{}'".format("', '".join(heart_failure_icd_codes_list))

### Get itemids for BNP labs

In [None]:
# Look up lab item definitions whose label mentions "bnp" (case-insensitive
# via LOWER) and whose fluid is 'Blood'. Two filter strings are supplied;
# presumably QueryBuilder applies both — confirm in the printed query.
qb_bnp_labs = QueryBuilder(dataset='hosp.d_labitems', 
                           columns=['itemid', 'label', 'fluid', 'category'],
                           filters=["LOWER(label) LIKE '%bnp%'", "fluid = 'Blood'"])
bnp_labs_query = qb_bnp_labs.generate_query()
# Print the generated query for inspection before running it.
print(bnp_labs_query)

In [None]:
# Execute the BNP lab-item lookup through the Spark session.
bnp_lab_items = bq.run_query(spark, bnp_labs_query)

In [None]:
# Convert to pandas so the notebook renders the matching lab items.
bnp_lab_items.toPandas()

#### Transform BNP Lab item IDs to string for query

In [None]:
# Pull the distinct lab item IDs out of the query result into a plain Python list.
bnp_lab_itemids_list = [
    item_row['itemid']
    for item_row in bnp_lab_items.select('itemid').distinct().collect()
]

In [None]:
# Render the item IDs as an unquoted, comma-separated list, ready to be
# placed inside an "IN (...)" filter.
bnp_lab_itemids_str = ', '.join(map(str, bnp_lab_itemids_list))

## Query the MIMIC-IV database for Heart Failure diagnoses and BNP labs

In [None]:
# Build one QueryBuilder per source table, each filtered to the codes /
# item IDs gathered in the cells above.
qb_diagnoses = QueryBuilder(dataset='hosp.diagnoses_icd', 
                            columns=['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version'],
                            filters=f'icd_code IN ({heart_failure_icd_codes_str})')

qb_labevents = QueryBuilder(dataset='hosp.labevents', 
                            columns=['subject_id', 'hadm_id', 'itemid', 'valuenum', 'ref_range_lower', 'ref_range_upper'],
                            filters=f'itemid IN ({bnp_lab_itemids_str})')

# Inner-join lab events onto diagnoses by patient and admission. The join is
# registered on qb_diagnoses, which is the builder the query is generated from.
qb_diagnoses.join_with(qb_labevents, join_type='inner', columns=['subject_id', 'hadm_id'])

# Generate the joined query with limit=100000.
# NOTE(review): no ordering is specified here, so if the limit is reached the
# retained rows may be arbitrary — confirm against the printed query.
qualifying_hosp_admissions_query = qb_diagnoses.generate_query(limit=100000)
# Print the generated query for inspection before running it.
print(qualifying_hosp_admissions_query)

In [None]:
# Execute the joined diagnoses/lab-events query through the Spark session.
qualifying_hosp_admissions_df = bq.run_query(spark, qualifying_hosp_admissions_query)

## Create a binary feature indicating whether the BNP value is outside the reference range

In [None]:
# Wrap the joined result so derived columns can be added to it.
feature_engineer = FeatureEngineering(qualifying_hosp_admissions_df)

In [None]:
# Flag lab results whose value lies below the lower or above the upper
# reference bound.
# NOTE(review): how rows with a missing valuenum or missing bound are flagged
# depends on create_conditional_feature — confirm.
condition_str = "(valuenum < ref_range_lower) OR (valuenum > ref_range_upper)"
feature_engineer.create_conditional_feature(condition_str, "bnp_outside_ref_range")

In [None]:
# Replace the working DataFrame with the feature-engineered version and show it.
qualifying_hosp_admissions_df = feature_engineer.get_processed_data()
qualifying_hosp_admissions_df.toPandas()

## Get admissions data, filtered by Subject ID of Qualifying Patients

In [None]:
# Pull the distinct patient IDs of the qualifying rows into a plain Python list.
qualifying_hosp_admissions_subject_ids_list = [
    patient_row['subject_id']
    for patient_row in qualifying_hosp_admissions_df.select('subject_id').distinct().collect()
]

In [None]:
# Number of distinct qualifying patients.
len(qualifying_hosp_admissions_subject_ids_list)

In [None]:
# Render the patient IDs as an unquoted, comma-separated list, ready to be
# placed inside an "IN (...)" filter.
qualifying_hosp_admissions_subject_ids_str = ', '.join(
    map(str, qualifying_hosp_admissions_subject_ids_list)
)

In [None]:
# Query the admissions table for the qualifying patients only. No column
# list is given; presumably QueryBuilder then selects every column — confirm.
qb_admissions = QueryBuilder(dataset='hosp.admissions',
                             filters=f'subject_id IN ({qualifying_hosp_admissions_subject_ids_str})')

# Generate the query with limit=100000 (single table; nothing is joined here).
admissions_query = qb_admissions.generate_query(limit=100000)
# Printing is disabled: the filter embeds the full list of patient IDs.
#print(admissions_query)

In [None]:
# Execute the admissions query through the Spark session.
admissions_df = bq.run_query(spark, admissions_query)

In [None]:
# Convert to pandas so the notebook renders the admissions.
admissions_df.toPandas()

## Count Previous Admissions for Qualifying Patients

In [None]:
# Start a new feature-engineering pass over the admissions data; this rebinds
# feature_engineer, which previously wrapped the diagnoses/lab-events result.
feature_engineer = FeatureEngineering(admissions_df)

In [None]:
# Count earlier events per patient: partitioned by subject_id, ordered by
# admittime, counting hadm_id. The resulting column is presumably named
# 'previous_hadm_id' (a column of that name is renamed in the clean-up cell) — confirm.
feature_engineer.count_previous_events(
    partition_column='subject_id',
    order_column='admittime',
    event_column='hadm_id'
)

In [None]:
# Replace the admissions DataFrame with the version carrying the new count column and show it.
admissions_df = feature_engineer.get_processed_data()
admissions_df.toPandas()

## Check for Readmission within 30, 90, and 180 days

In [None]:
# Per patient, flag admissions relative to a 30-unit timeframe on admittime.
# The resulting column is presumably 'admittime_within_30_days' (selected in
# the final feature list); the unit (days) and direction of the window are
# defined by the helper — confirm.
feature_engineer.check_event_within_timeframe(
    partition_column='subject_id',
    event_column='admittime', 
    timeframe=30
)

In [None]:
# Same check with timeframe=90; the resulting column is presumably
# 'admittime_within_90_days' (selected in the final feature list) — confirm.
feature_engineer.check_event_within_timeframe(
    partition_column='subject_id',
    event_column='admittime', 
    timeframe=90
)

In [None]:
# Same check with timeframe=180; the resulting column is presumably
# 'admittime_within_180_days', which is later used as the model target — confirm.
feature_engineer.check_event_within_timeframe(
    partition_column='subject_id',
    event_column='admittime', 
    timeframe=180
)

In [None]:
# Replace the admissions DataFrame with the version carrying the timeframe flags and show it.
admissions_df = feature_engineer.get_processed_data()
admissions_df.toPandas()

## Merge Admissions data with Qualifying Admissions data

In [None]:
# Attach admission-level columns to the qualifying diagnosis/lab rows, matching
# on patient and admission; the qualifying rows are the left side of the merge.
merged_df = left_merge_dataframes(qualifying_hosp_admissions_df, admissions_df, ['subject_id', 'hadm_id'])

In [None]:
# Convert to pandas so the notebook renders the merged result.
merged_df.toPandas()

## Get Patient Data for Qualifying Patients

In [None]:
# Query gender and anchor age from the patients table for the qualifying patients only.
qb_patients = QueryBuilder(dataset='hosp.patients',
                           columns=['subject_id', 'gender', 'anchor_age'],
                           filters=f'subject_id IN ({qualifying_hosp_admissions_subject_ids_str})')

# Generate the query with limit=100000 (single table; nothing is joined here).
patients_query = qb_patients.generate_query(limit=100000)
# Printing is disabled: the filter embeds the full list of patient IDs.
# print(patients_query)

In [None]:
# Execute the patients query through the Spark session.
patients_df = bq.run_query(spark, patients_query)

In [None]:
# Convert to pandas so the notebook renders the patient rows.
patients_df.toPandas()

### Merge Patients Data

In [None]:
# Attach the patient-level columns (gender, anchor_age) by patient ID; merged_df is the left side.
merged_df = left_merge_dataframes(merged_df, patients_df, ['subject_id'])

## Create Demographic Features

In [None]:
# Maps each binary demographic feature name to the SQL condition that defines
# it. Every condition lower-cases the column first, so matching is
# case-insensitive.
demographic_features = {
    # Anchored at the start of the value: an unanchored '%m%' would also match
    # any spelled-out value that merely contains an "m" (e.g. "female").
    "gender_male": "LOWER(gender) LIKE 'm%'",
    "race_white": "LOWER(race) LIKE '%white%'",
    "race_black": "LOWER(race) LIKE '%black%'",
    # Parenthesised so the OR stays a single unit if the condition is ever
    # embedded in a larger expression.
    "race_hispanic_latino": "(LOWER(race) LIKE '%hispanic%' OR LOWER(race) LIKE '%latino%')",
    "race_asian": "LOWER(race) LIKE '%asian%'",
    "insurance_medicare": "LOWER(insurance) LIKE '%medicare%'",
    "insurance_medicaid": "LOWER(insurance) LIKE '%medicaid%'",
    "marital_status_married": "LOWER(marital_status) LIKE '%married%'",
    "marital_status_single": "LOWER(marital_status) LIKE '%single%'",
    "marital_status_widowed": "LOWER(marital_status) LIKE '%widowed%'",
    "marital_status_divorced": "LOWER(marital_status) LIKE '%divorced%'",
}

In [None]:
# Start a feature-engineering pass over the merged data and add one
# conditional feature per entry of demographic_features.
feature_engineer = FeatureEngineering(merged_df)
for feature_name, condition in demographic_features.items():
    # The helper takes the condition first and the new column name second.
    feature_engineer.create_conditional_feature(condition, feature_name)

In [None]:
# Retrieve the DataFrame with the demographic features added and show it.
processed_df = feature_engineer.get_processed_data()
processed_df.toPandas()

## Clean up Final DataFrame

In [None]:
# Give the previous-admission count a descriptive name. Presumably
# 'previous_hadm_id' is the column added by count_previous_events — confirm,
# since withColumnRenamed does nothing when the source column is absent.
processed_df = processed_df.withColumnRenamed('previous_hadm_id', 'n_prev_hosp_admissions')

In [None]:
# Columns kept in the final DataFrame, in output order.
features = [
    # Identifiers
    'subject_id', 'hadm_id', 'seq_num',
    # Admission timestamps
    'admittime', 'dischtime', 'deathtime',
    # Lab value and derived flag
    'valuenum', 'bnp_outside_ref_range',
    # Admission history and age
    'n_prev_hosp_admissions', 'anchor_age',
    # Demographic indicators
    'gender_male',
    'race_white', 'race_black', 'race_hispanic_latino', 'race_asian',
    'insurance_medicare', 'insurance_medicaid',
    'marital_status_married', 'marital_status_single',
    'marital_status_widowed', 'marital_status_divorced',
    # Timeframe flags
    'admittime_within_30_days',
    'admittime_within_90_days',
    'admittime_within_180_days',
]

In [None]:
# Keep only the listed columns, in the listed order, then show the result.
final_df = processed_df.select(list(map(col, features)))
final_df.toPandas()

In [None]:
# End-to-end baseline: configure MLUtils, clean and split the data, train a
# classifier, then report metrics and confusion matrices on the validation
# and test splits.
# NOTE(review): final_df can hold several rows per admission (the diagnoses /
# lab-events inner join upstream is keyed only on patient and admission), so a
# row-level split may place rows of one admission in different splits —
# confirm how split_data partitions.

# Initialize MLUtils with the DataFrame
ml_utils = MLUtils(final_df)

# Set the target and feature columns. Identifier, timestamp, and the 30/90-day
# flag columns of final_df are deliberately not listed as features.
target_column = 'admittime_within_180_days' 
feature_columns = ['valuenum',
                   'bnp_outside_ref_range',
                   'n_prev_hosp_admissions',
                   'anchor_age',
                   'gender_male',
                   'race_white',
                   'race_black',
                   'race_hispanic_latino',
                   'race_asian',
                   'insurance_medicare',
                   'insurance_medicaid',
                   'marital_status_married',
                   'marital_status_single',
                   'marital_status_widowed',
                   'marital_status_divorced']

ml_utils.set_target(target_column)
ml_utils.set_features(feature_columns)

# Drop all rows where a feature or target value is NaN
ml_utils.clean_data(verbose=True)

# Split data into training, validation, and testing sets (70% / 15% / 15%)
ml_utils.split_data(ratio=(0.7, 0.15, 0.15))

# Train the model using XGBoost with SMOTE and undersampling
ml_utils.train_classification_model(smote=True, undersample_factor=0.5, verbose=True)

# Evaluate the model on validation and test datasets
validation_metrics = ml_utils.evaluate_classification_model(eval_type='val')
test_metrics = ml_utils.evaluate_classification_model(eval_type='test')

# Show evaluation metrics (display is the notebook's built-in display function)
display("Validation Metrics:", validation_metrics)
display("Test Metrics:", test_metrics)

# Show confusion matrices for both evaluation splits
ml_utils.display_confusion_matrix('val')
ml_utils.display_confusion_matrix('test')

In [None]:
# Run a 500-trial Optuna search for the classification model; the study is
# stored in a local SQLite file.
ml_utils.optimize_with_optuna(model_type='classification', 
                              n_trials=500, 
                              storage_url='sqlite:///optuna_study_02.db')

In [None]:
# Repeat the evaluation after the Optuna search. Presumably
# optimize_with_optuna updates the model held by ml_utils — confirm;
# otherwise these numbers simply repeat the baseline.

# Evaluate the model on validation and test datasets
validation_metrics = ml_utils.evaluate_classification_model(eval_type='val')
test_metrics = ml_utils.evaluate_classification_model(eval_type='test')

# Show evaluation metrics (display is the notebook's built-in display function)
display("Validation Metrics:", validation_metrics)
display("Test Metrics:", test_metrics)

# Show confusion matrices for both evaluation splits
ml_utils.display_confusion_matrix('val')
ml_utils.display_confusion_matrix('test')

In [None]:
# Compute SHAP values once; every visualization cell below reuses them.
shap_values = ml_utils.compute_shap_values()

In [None]:
# Bar plot of the SHAP values.
ml_utils.visualize_shap_values(shap_values, plot_type='bar')

In [None]:
# Bee-swarm plot of the SHAP values.
ml_utils.visualize_shap_values(shap_values, plot_type='bee_swarm')

In [None]:
# Waterfall plot for a single sample (sample_number=3).
ml_utils.visualize_shap_values(shap_values, plot_type='waterfall', sample_number=3)

In [None]:
# Dependence plot for the previous-admission count feature.
ml_utils.visualize_shap_values(shap_values, plot_type='dependence', feature_name='n_prev_hosp_admissions')