In [1]:
from pathlib import Path

from IPython.display import (
    display,
    Markdown,
)
import pandas as pd
import os
from concurrent.futures import ProcessPoolExecutor, as_completed

from howso.synthesizer import Synthesizer
from howso.utilities import infer_feature_attributes

# Set the locale environment variables to UTF-8 for this process.
os.environ["LANG"] = "en_US.UTF-8"
os.environ["LC_ALL"] = "en_US.UTF-8"

# os.cpu_count() returns None when the core count cannot be determined;
# fall back to 1 so the subtraction below never receives None.
N_CORES = os.cpu_count() or 1
# Leave one core free, but always keep at least one worker.
MAX_WORKERS = max(1, N_CORES - 1)
M = 30      # number of synthetic runs (start here)
print(f'number of workers: {MAX_WORKERS}')

number of workers: 31


In [2]:
def read_project14_data(data_path=None):
    """Load the project14 CSV and prepare it for ``covid_test_result`` modelling.

    Steps performed:
      1. Read the CSV with an explicit per-column dtype mapping.
      2. Drop rows whose ``covid_test_result`` is null or equal to 3.
      3. Drop every column that is entirely null after that row filter.
      4. Reset the index so it is a clean 0..n-1 range.

    Progress information (column list, row counts, dropped columns, shapes)
    is printed along the way.

    Args:
        data_path: Optional path (``str`` or ``Path``) of the CSV to read.
            When omitted, the original SageMaker location is used.

    Returns:
        pandas.DataFrame: the filtered frame.

    Side effects:
        Sets the global pandas option ``display.max_columns`` to ``None``.
    """
    # Columns read as nullable integers.
    int_columns = (
        'age_yrs', 'alcohol_daysperweek', 'bio_sex_birth_2',
        'cc_asthma_2', 'cc_asud_2', 'cc_autoimm_2', 'cc_cancer_2', 'cc_chronickd_2',
        'cc_clung_2', 'cc_copd_2', 'cc_cvd_2', 'cc_depression_2', 'cc_diabetes_2',
        'cc_hypertension_2', 'cc_imm_2', 'cc_intrav_2', 'cc_otherchroniccond_2', 'cc_othermh_2',
        'cc_sickle_2', 'cov_pan_chal_hlth_2', 'cov_pan_chal_med_2', 'cov_pan_chlng_trans_2', 'cov_tst_mthd_2',
        'covid_abpain_2', 'covid_cough_2', 'covid_diffbreath_2', 'covid_fatique_2', 'covid_fever_2',
        'covid_headache_2', 'covid_myalgia_2', 'covid_nausea_2', 'covid_olfactory_2', 'covid_other_2',
        'covid_pandemic_challenges_abod_2', 'covid_pandemic_challenges_food_2', 'covid_pandemic_challenges_wate_2',
        'covid_runnynose_2', 'covid_skinrash_2',
        'covid_test_approval', 'covid_test_collection_setting', 'covid_test_performed_location',
        'covid_test_result', 'covid_test_specimen_collector', 'covid_test_specimen_type',
        'covid_test_study_setting',
        'covid_test_tar_dis_stat_2___1', 'covid_test_tar_dis_stat_2___2', 'covid_test_tar_dis_stat_2___3',
        'covid_test_tar_dis_stat_2___4', 'covid_test_tar_dis_stat_2___5', 'covid_test_tar_dis_stat_2___6',
        'covid_test_tar_dis_stat_2___9',
        'covid_test_type', 'covid_vaccine', 'current_employment_status', 'edu_years_of_school',
        'employed_ew', 'employed_healthcare_2', 'family_income',
        'flu_vaccine_season_3', 'flu_vaccinehistind_2', 'gender_identity_term_2',
        'hi_coverage_type', 'hi_loss_covid',
        'household_congregate_3', 'household_famgen_3', 'household_homeless',
        'isolate_maintain_job', 'jobloss_covid19_2', 'language_english',
        'language_home___1', 'language_home___2', 'language_home___3', 'language_home___4',
        'language_home___5', 'language_home___6', 'language_home___7', 'language_home___8',
        'language_home___9', 'language_home___90', 'language_home___99',
        'lifetime_use_alcohol', 'positivemonth_covidtest_2', 'positiveyear_covidtest_3',
        'pregnancy_status', 'quarantine_maintain_job',
        'race_ethn_asian_detail_3___1', 'race_ethn_asian_detail_3___2', 'race_ethn_asian_detail_3___3',
        'race_ethn_asian_detail_3___4', 'race_ethn_asian_detail_3___5', 'race_ethn_asian_detail_3___99',
        'race_ethn_hispanic',
        'race_ethn_hispanic_detail_2___1', 'race_ethn_hispanic_detail_2___2', 'race_ethn_hispanic_detail_2___3',
        'race_ethn_hispanic_detail_2___4', 'race_ethn_hispanic_detail_2___5', 'race_ethn_hispanic_detail_2___6',
        'race_ethn_hispanic_detail_2___7', 'race_ethn_hispanic_detail_2___99',
        'race_ethn_islander_detail_2___1', 'race_ethn_islander_detail_2___10', 'race_ethn_islander_detail_2___11',
        'race_ethn_islander_detail_2___12', 'race_ethn_islander_detail_2___13', 'race_ethn_islander_detail_2___14',
        'race_ethn_islander_detail_2___2', 'race_ethn_islander_detail_2___3', 'race_ethn_islander_detail_2___4',
        'race_ethn_islander_detail_2___5', 'race_ethn_islander_detail_2___6', 'race_ethn_islander_detail_2___7',
        'race_ethn_islander_detail_2___8', 'race_ethn_islander_detail_2___9', 'race_ethn_islander_detail_2___99',
        'race_ethn_race___1', 'race_ethn_race___15', 'race_ethn_race___2', 'race_ethn_race___3',
        'race_ethn_race___4', 'race_ethn_race___5', 'race_ethn_race___99',
        'recentmonth_covidtest_2', 'recentresult_covidtest', 'recentyear_covidtest_3',
        'self_reported_disability', 'self_reported_health_status_assessment',
        'sex_orient_id', 'smoker_cur_stat_2', 'smoker_number',
        'test_accesseasy_2', 'test_accesswhere_2', 'tested_for_covid', 'tested_positive_for_covid',
        'vaccine_avail',
        'vaccine_concerns_3___1', 'vaccine_concerns_3___10', 'vaccine_concerns_3___2', 'vaccine_concerns_3___3',
        'vaccine_concerns_3___4', 'vaccine_concerns_3___5', 'vaccine_concerns_3___6', 'vaccine_concerns_3___7',
        'vaccine_concerns_3___8', 'vaccine_concerns_3___9', 'vaccine_concerns_3___96',
        'vaccine_reasons_3___1', 'vaccine_reasons_3___10', 'vaccine_reasons_3___11', 'vaccine_reasons_3___2',
        'vaccine_reasons_3___3', 'vaccine_reasons_3___4', 'vaccine_reasons_3___5', 'vaccine_reasons_3___6',
        'vaccine_reasons_3___7', 'vaccine_reasons_3___8', 'vaccine_reasons_3___9', 'vaccine_reasons_3___96',
        'vaper_cur_stat', 'work_closecont_2', 'work_ppe_2', 'work_wash_2',
    )
    float_columns = ('height_cm', 'weight_lbs')
    # Read as text rather than as a number.
    string_columns = ('zip_code',)

    col_type_mapping = {
        **dict.fromkeys(int_columns, 'Int64'),
        **dict.fromkeys(float_columns, 'float64'),
        **dict.fromkeys(string_columns, 'string'),
    }
    print(f'total number of columns being mapped: {len(col_type_mapping)}')

    if data_path is None:
        data_path = Path("/home", "sagemaker-user", "phs002660", "project14_DATA_origcopy_v3_processed.csv")
    df = pd.read_csv(data_path, dtype=col_type_mapping)
    df = df.reset_index(drop=True)
    print(df.columns)
    print(f'total number of rows: {len(df)}')

    # For model comparison on covid_test_result prediction, rows with a null
    # target are removed, as are rows whose target equals 3.
    target_feature = 'covid_test_result'
    df = df.dropna(subset=[target_feature])
    df = df[df[target_feature] != 3]
    print(f'after removing null and extraneous rows, df.shape: {df.shape}')

    # Columns that are entirely null after the row filter are dropped.
    null_only_cols = df.columns[df.isna().all()].tolist()
    print(null_only_cols)
    df = df.drop(columns=null_only_cols)
    df = df.reset_index(drop=True)
    print(df.shape)

    # Global display setting; kept because later cells may rely on it.
    pd.set_option('display.max_columns', None)
    return df

def read_wastewater_clinical_data(data_path=None):
    """Load the wastewater clinical CSV and prepare it for ``patient_stay_length`` modelling.

    Steps performed:
      1. Read the CSV with an explicit per-column dtype mapping.
      2. Drop rows whose ``patient_stay_length`` is null.
      3. Drop every column that is entirely null after that row filter.
      4. Reset the index so it is a clean 0..n-1 range.

    Progress information (column list, row counts, dropped columns, shapes)
    is printed along the way.

    Args:
        data_path: Optional path (``str`` or ``Path``) of the CSV to read.
            When omitted, the original SageMaker location is used.

    Returns:
        pandas.DataFrame: the filtered frame.

    Side effects:
        Sets the global pandas option ``display.max_columns`` to ``None``.
    """
    col_type_mapping = {
        'study_id': 'string', 'race': 'Int64', 'ethnicity': 'Int64', 'age': 'Int64',
        'sex': 'Int64', 'zip': 'string', 'insurance': 'Int64', 'vaping': 'Int64',
        'nicotine': 'Int64', 'alcohol_use': 'Int64', 'asthma': 'Int64', 'cancer': 'Int64',
        'cardiovascular_disease': 'Int64', 'chronic_kidney_disease': 'Int64',
        'chronic_lung_disease': 'Int64', 'diabetes': 'Int64', 'hypertension': 'Int64',
        'immunosuppressive_conditio': 'Int64', 'serious_mental_illness': 'Int64',
        'sickle_cell_disease': 'Int64', 'pregnancy_status': 'Int64', 'height_feet': 'Int64',
        'height_inches': 'Int64', 'weight_lbs': 'float64', 'patient_data_date': 'string',
        'patient_covid_test_complete': 'string', 'covid_test_result': 'Int64',
        'covid_test_type': 'Int64', 'patient_stay_length': 'Int64', 'patient_ventilator': 'string',
        'patient_icu': 'string', 'patient_remdesivir_use': 'string',
        'patient_mortality': 'string', 'country': 'string',
    }
    print(f'total number of columns being mapped: {len(col_type_mapping)}')

    if data_path is None:
        data_path = Path("/home", "sagemaker-user", "phs002525", "rad_015_941-01_deidentified_clinicaldata_DATA_origcopy_v1_processed.csv")
    df = pd.read_csv(data_path, dtype=col_type_mapping)
    df = df.reset_index(drop=True)
    print(df.columns)
    print(f'total number of rows: {len(df)}')

    # For model comparison on patient_stay_length prediction, rows with a null
    # target are removed.
    target_feature = 'patient_stay_length'
    df = df.dropna(subset=[target_feature])
    print(f'after removing null rows, df.shape: {df.shape}')

    # Columns that are entirely null after the row filter are dropped.
    null_only_cols = df.columns[df.isna().all()].tolist()
    print(null_only_cols)
    df = df.drop(columns=null_only_cols)
    df = df.reset_index(drop=True)
    print(df.shape)

    # Global display setting; kept because later cells may rely on it.
    pd.set_option('display.max_columns', None)
    return df


In [3]:
def get_wastewater_features(df):
    """Build the feature-attribute mapping used to synthesize the wastewater clinical data.

    Attributes are inferred from ``df`` via ``infer_feature_attributes`` and
    then adjusted:
      * ``country`` and ``zip`` receive explicit subtypes;
      * any feature inferred as continuous other than the four listed in
        ``continuous_names`` is switched to nominal (a message is printed
        for each);
      * every nominal/ordinal feature without an explicit subtype is flagged
        ``non_sensitive``;
      * the bounds of ``patient_stay_length`` are replaced with the observed
        min/max of ``df`` and nulls are disallowed.

    Args:
        df: frame containing at least the columns referenced below.

    Returns:
        The adjusted feature-attributes object returned by
        ``infer_feature_attributes``.
    """
    target_feature = 'patient_stay_length'
    continuous_names = ['age', 'weight_lbs', 'patient_stay_length', 'height_inches']
    # feature -> subtype
    subtype_map = {"country": "country", "zip": "int-id"}

    features = infer_feature_attributes(
        df,
        attempt_infer_extended_nominals=True,
        infer_bounds=True,
        # Separate list copies are passed, as the original used two literals.
        tight_bounds=list(continuous_names),
        types={"continuous": list(continuous_names)},
        dependent_features={
            "race": ["sickle_cell_disease"],
            "sex": ["pregnancy_status"],
        },
    )

    for feature_name in subtype_map:
        features[feature_name]["subtype"] = subtype_map[feature_name]

    # Anything else that was inferred as continuous is treated as nominal.
    for feature_name in features.get_names(types="continuous"):
        if feature_name in continuous_names:
            continue
        print(f'changing inferred continuous feature: {feature_name} to nominal')
        features[feature_name]["type"] = "nominal"

    # Evaluated after the conversion above, so converted features are included.
    for feature_name in features.get_names(types=("nominal", "ordinal")):
        if feature_name in subtype_map:
            continue
        features[feature_name]["non_sensitive"] = True

    target_values = df[target_feature]
    features[target_feature]["bounds"] = {
        'allow_null': False,
        'min': target_values.min(),
        'max': target_values.max(),
    }

    # Return value is not used here.
    features.to_dataframe()
    return features

def get_project14_features(df):
    """Build the feature-attribute mapping used to synthesize the project14 data.

    Attributes are inferred from ``df`` via ``infer_feature_attributes`` and
    then adjusted:
      * ``zip_code`` is forced to nominal with subtype ``postcode``;
      * any feature inferred as continuous other than ``age_yrs``,
        ``weight_lbs`` and ``height_cm`` is switched to nominal (a message is
        printed for each);
      * every nominal/ordinal feature is flagged ``non_sensitive``;
      * the bounds entry of ``covid_test_result`` is replaced with one that
        only disallows nulls.

    The final attributes of ``covid_test_result`` are printed.

    Args:
        df: frame containing at least the columns referenced below.

    Returns:
        The adjusted feature-attributes object returned by
        ``infer_feature_attributes``.
    """
    target_feature = 'covid_test_result'
    continuous_names = ['age_yrs', 'weight_lbs', 'height_cm']

    features = infer_feature_attributes(
        df,
        attempt_infer_extended_nominals=True,
        infer_bounds=True,
        tight_bounds=[target_feature],
        types={"continuous": list(continuous_names)},
    )

    # This is a recipe notebook, not an attempt at creating anonymous data, so
    # the nominal features are marked `non_sensitive` below.
    # When creating anonymous data, only mark nominal features as
    # non_sensitive if they truly are non-sensitive. In that setting we
    # strongly recommend that nominals are encoded, which can be achieved with:
    # features = infer_feature_attributes(df_orig, attempt_infer_extended_nominals=True)
    zip_attributes = features["zip_code"]
    zip_attributes["type"] = "nominal"
    zip_attributes["subtype"] = "postcode"

    # Anything else that was inferred as continuous is treated as nominal.
    for feature_name in features.get_names(types="continuous"):
        if feature_name in continuous_names:
            continue
        print(f'changing inferred continuous feature: {feature_name} to nominal')
        features[feature_name]["type"] = "nominal"

    # Evaluated after the conversion above, so converted features are included.
    for feature_name in features.get_names(types=("nominal", "ordinal")):
        features[feature_name]["non_sensitive"] = True

    # Replaces the whole bounds entry for the target.
    features[target_feature]["bounds"] = {'allow_null': False}

    # Return value is not used here.
    features.to_dataframe()
    print(features[target_feature])
    return features

In [4]:
def generate_one_synth(run_id, df, features):
    """Train a fresh Synthesizer on ``df`` and generate one synthetic dataset.

    Args:
        run_id: identifier of this run; returned unchanged so the caller can
            tell results apart.
        df: training frame; the synthetic output has the same number of rows.
        features: feature attributes passed to ``Synthesizer.train``.

    Returns:
        tuple: ``(run_id, gen_df)`` where ``gen_df`` is the result of
        ``synthesize_cases``.
    """
    with Synthesizer(privacy_override=True, use_id_privacy=False) as s:
        # Train on the given dataset.
        s.train(df, features=features)
        # Generate as many synthetic cases as there are training rows.
        gen_df = s.synthesize_cases(n_samples=df.shape[0],
                                    desired_conviction=5,
                                    generate_new_cases='no'
                                   )
        return run_id, gen_df

In [5]:
RUN_IDS = list(range(M))

input_df = read_wastewater_clinical_data()
input_feats = get_wastewater_features(input_df)

# Create the destination before starting the synthesis runs, so a missing
# directory cannot make the first to_csv call fail after the work is done.
OUTPUT_DIR = os.path.join("/home", "sagemaker-user", "stability", "wastewater_clinical")
os.makedirs(OUTPUT_DIR, exist_ok=True)

with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # One independent synthesis run per run id.
    futures = {
        executor.submit(generate_one_synth, run_id, input_df, input_feats)
        for run_id in RUN_IDS
    }

    # Write each result as soon as its run finishes. The run id comes back
    # from the worker and is used to name the output file.
    for fut in as_completed(futures):
        run_id, synth_df = fut.result()
        data_filename = f"synthesized_data_for_wastewater_run_{run_id}.csv"
        data_path = os.path.join(OUTPUT_DIR, data_filename)
        synth_df.to_csv(data_path, index=False)


total number of columns being mapped: 34
Index(['race', 'ethnicity', 'age', 'sex', 'zip', 'insurance', 'vaping',
       'nicotine', 'alcohol_use', 'asthma', 'cancer', 'cardiovascular_disease',
       'chronic_kidney_disease', 'chronic_lung_disease', 'diabetes',
       'hypertension', 'immunosuppressive_conditio', 'serious_mental_illness',
       'sickle_cell_disease', 'pregnancy_status', 'height_inches',
       'weight_lbs', 'patient_stay_length', 'patient_ventilator',
       'patient_icu', 'patient_remdesivir_use', 'patient_mortality',
       'country'],
      dtype='object')
total number of rows: 7285
after removing null rows, df.shape: (7285, 28)
[]
(7285, 28)


Version 50.2.2 of Howso Engine™ is available. You are using version 50.2.1.
Version 50.2.2 of Howso Engine™ is available. You are using version 50.2.1.
Version 50.2.2 of Howso Engine™ is available. You are using version 50.2.1.
Version 50.2.2 of Howso Engine™ is available. You are using version 50.2.1.
Version 50.2.2 of Howso Engine™ is available. You are using version 50.2.1.
Version 50.2.2 of Howso Engine™ is available. You are using version 50.2.1.
Version 50.2.2 of Howso Engine™ is available. You are using version 50.2.1.
Version 50.2.2 of Howso Engine™ is available. You are using version 50.2.1.
Version 50.2.2 of Howso Engine™ is available. You are using version 50.2.1.
Version 50.2.2 of Howso Engine™ is available. You are using version 50.2.1.
Version 50.2.2 of Howso Engine™ is available. You are using version 50.2.1.
Version 50.2.2 of Howso Engine™ is available. You are using version 50.2.1.
Version 50.2.2 of Howso Engine™ is available. You are using version 50.2.1.
Version 50.2