<h2>Dataprep for Recall Optimized Modeling</h2>

There is nothing too interesting happening in here - this notebook just loads and preps data for usage in recall_optimized_modeling.ipynb. Run it in its entirety before moving onto that notebook!

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

BASE_DIR = Path().resolve().parent

<h2>Tidying Up</h2>

In [2]:
df = pd.read_csv(
    BASE_DIR / 'data_main' / "natality_aligned_10pct_sample.csv",
    on_bad_lines="skip"
)

df.dtypes

dob_yy        int64
dob_mm      float64
dob_tt      float64
dob_wk        int64
bfacil      float64
             ...   
ca_disor    float64
ca_hypo     float64
date         object
time_sin    float64
time_cos    float64
Length: 89, dtype: object

In [3]:
df['cig_rec'].value_counts(normalize=True)

cig_rec
0.0    0.954764
1.0    0.045236
Name: proportion, dtype: float64

In [4]:
df[['mar_p', 'rf_fedrg', 'rf_artec', 'me_trial']] = df[['mar_p', 'rf_fedrg', 'rf_artec', 'me_trial']].replace('X', np.nan).replace('U', np.nan)

In [5]:
df['fracehisp'].value_counts()

fracehisp
1.0    1168782
7.0     554937
8.0     324490
2.0     303889
4.0     135960
6.0      47583
3.0      12947
5.0       6391
Name: count, dtype: int64

In [6]:
df['priordead'].value_counts(normalize=True)

priordead
0.0     9.886328e-01
1.0     9.607956e-03
2.0     1.148206e-03
3.0     2.635036e-04
4.0     1.276650e-04
5.0     6.967082e-05
6.0     4.281447e-05
8.0     2.997013e-05
7.0     2.958090e-05
10.0    2.296412e-05
12.0    1.440123e-05
11.0    1.011978e-05
14.0    3.892224e-07
Name: proportion, dtype: float64

In [7]:
binary_cols = [col for col in df.columns if df[col].nunique() == 2 and df[col].dtype == 'int64']

binary_cols

['sex']

In [8]:
df['date'] = pd.to_datetime(df['dob_yy'].astype(str) + df['dob_wk'].astype(str) + '1', format='%G%V%u')

# Fourier terms for time of day as a type of "seasonality"
df['dob_tt'] = pd.to_numeric(df['dob_tt'], errors='coerce').fillna(0).astype(int)
df['time_str'] = df['dob_tt'].astype(str).str.zfill(4)
df['hour'] = df['time_str'].str[:2].astype(int)
df['minute'] = df['time_str'].str[2:].astype(int)

df['minute_of_day'] = df['hour'] * 60 + df['minute']
df['time_sin'] = np.sin(2 * np.pi * df['minute_of_day'] / 1440)
df['time_cos'] = np.cos(2 * np.pi * df['minute_of_day'] / 1440)

df = df.drop(['dob_yy', 'dob_tt', 'dob_wk', 'time_str', 'minute_of_day', 'minute', 'hour'], axis=1)

df['sex'] = np.where(df['sex'] == 'M', 1, 0)

df = df.replace('X', np.nan)

In [None]:
df = df.rename(columns={'no_mmorb': 'morbidity_reported'})

df = df[df['morbidity_reported'] != 9]

List of variables that require some cleaning in their encodings - lots of sentinel values.

- fagecomb - 99 is a sentinel value; will need to drop or fill it
- priorlive - 99 is a sentinel value
- priordead - 99 sentinel value
- priorterm - 99
- *illb_r (Interval Since Last Live Birth Recode) - 000–003 are sentinel; 888 - Not applicable — first live birth; 999 - Unknown or not stated
- ilop_r (Interval Since Last Other Pregnancy Recode) - not sure if we want to use; same sentinel as illb_r
- *ilp_r (Interval Since Last Pregnancy Recode) - not sure we want to use; same sentinel as illb_r
- ilp_r11 (Interval Since Last Pregnancy Recode 11) - same as above
- precare - 99
- previs - 99
- cig_0 - cig_3 - 99
- bmi - 99.9
- pwgt_r - 999
- dwgt_r - 999
- wtgain - 99
- rf_cesarn - 99
- combgest - 99
- dbwt - 999


In [10]:
cols_99 = [
    'fagecomb', 'priorlive', 'priordead', 'priorterm',
    'precare', 'previs',
    'wtgain', 'rf_cesarn', 'combgest'
]
df[cols_99] = df[cols_99].replace(99, np.nan)

df[['pwgt_r', 'dwgt_r', 'dbwt']] = df[['pwgt_r', 'dwgt_r', 'dbwt']].replace([999, 9999], np.nan)

df['bmi'] = df['bmi'].replace(99.9, np.nan)
df.loc[df['bmi'] > 90, 'bmi'] = np.nan

df[['illb_r', 'ilp_r']] = df[['illb_r', 'ilp_r']].replace(
    [0, 1, 2, 3, 888, 999],
    np.nan
)

df = df.dropna(subset=['ca_down', 'ca_disor'])
df[['ca_down', 'ca_disor']] = df[['ca_down', 'ca_disor']].replace('C', 2).astype(int)
binary_cols.append('ca_down')

# These seem redundant, so removed for now - can always comment out.
df = df.drop(columns=['ilop_r', 'ilp_r11'], axis=1)

In [None]:
continuous_cols = [
    'bmi', 'time_sin', 'time_cos'
]

discrete_cols = [
    'priorlive', 'priordead', 'priorterm',
    "precare", 'previs', 'pwgt_r', 'wtgain', 'rf_cesarn', 'combgest', 'dbwt',
    'mager', 'fagecomb', 'dwgt_r'
]

cat_cols = [
    'mracehisp', 'mar_p', 'meduc', 'fracehisp', 'feduc',
    'rf_pdiab', 'rf_gdiab', 'rf_phype', 'rf_ghype', 'cig_rec',
    'rf_ehype', 'rf_ppterm', 'rf_inftr', 'rf_fedrg', 'rf_artec',
    'rf_cesar', 'ip_gon', 'ip_syph', 'ip_chlam', 'ip_hepb', 'ip_hepc',
    'ld_indl', 'ld_augm', 'ld_ster', 'ld_antb', 'ld_chor', 'ld_anes',
    'me_pres', 'me_rout', 'me_trial', 'mm_mtr', 'mm_plac', 'mm_rupt',
    'attend', 'pay', 'dplural',
    'ab_aven1', 'ab_aven6', 'ab_nicu', 'ab_surf', 'ab_anti', 'ab_seiz',
    'dob_mm' # using monthly seasonality as one-hot instead of Fourier. Will be more descriptive.
]

In [12]:
missing_df = (
    df.isna()
      .sum()
      .sort_values(ascending=False)
      .reset_index()
      .set_axis(['variable', 'missing'], axis=1)
)
missing_df['missing_pct'] = missing_df['missing'] / len(df)

missing_df

Unnamed: 0,variable,missing,missing_pct
0,rf_artec,2515496,0.978668
1,rf_fedrg,2515496,0.978668
2,me_trial,1755128,0.682842
3,mar_p,1660073,0.645861
4,ilp_r,1132927,0.440772
...,...,...,...
79,ca_anen,0,0.000000
80,mracehisp,0,0.000000
81,dplural,0,0.000000
82,sex,0,0.000000


Drop features that have a count of missing values above a specific threshold. Impute based on the median of the data points' given year of measurement otherwise.

In [13]:
threshold = 0.20

df = df.drop(columns=missing_df[missing_df['missing_pct'] > threshold]['variable'], axis=1)

for var in missing_df['variable']:
    if var not in df.columns:
        continue

    missing_pct = missing_df.loc[missing_df['variable'] == var, 'missing_pct'].values[0]

    if missing_pct > 0 and df[var].dtype in ['float64']:
        df[var] = (
            df.groupby(df['date'].dt.year)[var]
              .transform(lambda x: x.fillna(x.median()))
        )

In [14]:
missing_df = (
    df.isna()
      .sum()
      .sort_values(ascending=False)
      .reset_index()
      .set_axis(['variable', 'missing'], axis=1)
)
missing_df['missing_pct'] = missing_df['missing'] / len(df)

missing_df

Unnamed: 0,variable,missing,missing_pct
0,dob_mm,0,0.0
1,mm_aicu,0,0.0
2,dbwt,0,0.0
3,combgest,0,0.0
4,sex,0,0.0
...,...,...,...
73,rf_ehype,0,0.0
74,rf_ghype,0,0.0
75,rf_phype,0,0.0
76,rf_gdiab,0,0.0


In [None]:
df["prior_dead_term_binary"] = ((df["priordead"] > 0) | (df["priorterm"] > 0)).astype(int)

df["precare_binary"] = (df["precare"] > 0).astype(int)

binary_cols.extend(["precare_binary", "prior_dead_term_binary"])

to_remove = ['priorlive', 'priordead', 'priorterm']
discrete_cols = [c for c in discrete_cols if c not in to_remove and c not in ['mar_p', 'rf_fedrg', 'rf_artec', 'me_trial']]
cat_cols = [c for c in cat_cols if c not in to_remove and c not in ['mar_p', 'rf_fedrg', 'rf_artec', 'me_trial']]
binary_cols = [c for c in binary_cols if c not in to_remove and c not in ['mar_p', 'rf_fedrg', 'rf_artec', 'me_trial']]

df.drop(to_remove, axis=1, inplace=True)

binary_cols.extend(['ca_disor', 'bfacil'])

In [16]:
df['bfacil']

0          1.0
1          1.0
2          1.0
3          1.0
4          1.0
          ... 
2576837    1.0
2576838    1.0
2576839    1.0
2576840    1.0
2576841    1.0
Name: bfacil, Length: 2570327, dtype: float64

In [None]:
export_df = df[['date', 'morbidity_reported'] + binary_cols + cat_cols + continuous_cols + discrete_cols].copy()
export_df["hospital_birth_binary"] = export_df["bfacil"].eq(1).astype(int)
export_df = export_df.drop(columns=["bfacil"])

# These are actually a part of the target; they're morbidities
export_df = export_df.drop(columns=['mm_mtr', 'mm_plac', 'mm_rupt'], axis=1)

cols_to_remove = ['mm_mtr', 'mm_plac', 'mm_rupt',
    'rf_inftr', 'rf_cesar',
    'me_trial'
]

discrete_cols = [c for c in discrete_cols if c not in cols_to_remove]
cat_cols = [c for c in cat_cols if c not in cols_to_remove]

In [21]:
cat_cols

['mracehisp',
 'meduc',
 'fracehisp',
 'feduc',
 'rf_pdiab',
 'rf_gdiab',
 'rf_phype',
 'rf_ghype',
 'cig_rec',
 'rf_ehype',
 'rf_ppterm',
 'ip_gon',
 'ip_syph',
 'ip_chlam',
 'ip_hepb',
 'ip_hepc',
 'ld_indl',
 'ld_augm',
 'ld_ster',
 'ld_antb',
 'ld_chor',
 'ld_anes',
 'me_pres',
 'me_rout',
 'attend',
 'pay',
 'dplural',
 'ab_aven1',
 'ab_aven6',
 'ab_nicu',
 'ab_surf',
 'ab_anti',
 'ab_seiz',
 'dob_mm']

In [22]:
discrete_cols

['precare',
 'previs',
 'pwgt_r',
 'wtgain',
 'rf_cesarn',
 'combgest',
 'dbwt',
 'mager',
 'fagecomb',
 'dwgt_r']

In [23]:
continuous_cols

['bmi', 'time_sin', 'time_cos']

In [24]:
binary_vars = [
    col for col in export_df.columns if export_df[col].nunique() == 2
]

binary_vars

['morbidity_reported',
 'ca_down',
 'precare_binary',
 'prior_dead_term_binary',
 'ca_disor',
 'rf_pdiab',
 'rf_gdiab',
 'rf_phype',
 'rf_ghype',
 'cig_rec',
 'rf_ehype',
 'rf_ppterm',
 'rf_inftr',
 'rf_cesar',
 'ip_gon',
 'ip_syph',
 'ip_chlam',
 'ip_hepb',
 'ip_hepc',
 'ld_indl',
 'ld_augm',
 'ld_ster',
 'ld_antb',
 'ld_chor',
 'ld_anes',
 'ab_aven1',
 'ab_aven6',
 'ab_nicu',
 'ab_surf',
 'ab_anti',
 'ab_seiz',
 'hospital_birth_binary']

In [None]:
export_df.to_csv(BASE_DIR / 'recall_modeling' / 'data' / 'natality_10yr_test_data_cat.csv', index=False)

In [None]:
enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

for col in cat_cols:
    col_data = export_df[col].astype(int).astype("string")

    encoded = enc.fit_transform(col_data.to_frame())
    encoded_cols = enc.get_feature_names_out([col])
    encoded_df = pd.DataFrame(encoded, columns=encoded_cols, index=df.index)

    export_df = pd.concat([export_df, encoded_df], axis=1)
    export_df = export_df.drop(columns=[col])

export_df

In [28]:
export_df = export_df.drop(columns=['mracehisp_1', 'meduc_1', 'fracehisp_1', 'feduc_1', 'rf_pdiab_0', 
                 'rf_gdiab_0', 'rf_phype_0', 'rf_ghype_0', 'rf_ehype_0', 'rf_ppterm_0',
                 'ip_gon_0', 'ip_syph_0', 'ip_chlam_0', 'ip_hepb_0', 'ip_hepc_0', 'ld_indl_0', 'ld_augm_0',
                 'ld_ster_0', 'ld_antb_0', 'ld_chor_0', 'ld_anes_0', 'me_pres_1',
                 'pay_2', 'dplural_1', 'ab_aven1_0', 'ab_aven6_0',
                 'ab_nicu_0', 'ab_surf_0', 'ab_anti_0', 'ab_seiz_0', 'dob_mm_1'
])

In [29]:
binary_vars = [
    col for col in export_df.columns if export_df[col].nunique() == 2
]

binary_vars

['morbidity_reported',
 'ca_down',
 'precare_binary',
 'prior_dead_term_binary',
 'ca_disor',
 'rf_inftr',
 'rf_cesar',
 'hospital_birth_binary',
 'mracehisp_2',
 'mracehisp_3',
 'mracehisp_4',
 'mracehisp_5',
 'mracehisp_6',
 'mracehisp_7',
 'mracehisp_8',
 'meduc_2',
 'meduc_3',
 'meduc_4',
 'meduc_5',
 'meduc_6',
 'meduc_7',
 'meduc_8',
 'fracehisp_2',
 'fracehisp_3',
 'fracehisp_4',
 'fracehisp_5',
 'fracehisp_6',
 'fracehisp_7',
 'fracehisp_8',
 'feduc_2',
 'feduc_3',
 'feduc_4',
 'feduc_5',
 'feduc_6',
 'feduc_7',
 'feduc_8',
 'rf_pdiab_1',
 'rf_gdiab_1',
 'rf_phype_1',
 'rf_ghype_1',
 'cig_rec_0',
 'cig_rec_1',
 'rf_ehype_1',
 'rf_ppterm_1',
 'ip_gon_1',
 'ip_syph_1',
 'ip_chlam_1',
 'ip_hepb_1',
 'ip_hepc_1',
 'ld_indl_1',
 'ld_augm_1',
 'ld_ster_1',
 'ld_antb_1',
 'ld_chor_1',
 'ld_anes_1',
 'me_pres_2',
 'me_pres_3',
 'me_rout_1',
 'me_rout_2',
 'me_rout_3',
 'me_rout_4',
 'attend_1',
 'attend_2',
 'attend_3',
 'attend_4',
 'attend_5',
 'pay_1',
 'pay_3',
 'pay_4',
 'pay_5',


In [30]:
export_df

Unnamed: 0,date,morbidity_reported,sex,ca_down,precare_binary,prior_dead_term_binary,ca_disor,rf_inftr,rf_cesar,bmi,...,dob_mm_10,dob_mm_11,dob_mm_12,dob_mm_2,dob_mm_3,dob_mm_4,dob_mm_5,dob_mm_6,dob_mm_7,dob_mm_8
0,2020-01-27,0.0,0,0,1,0,0,0.0,0.0,22.299999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2020-02-03,0.0,0,0,1,0,0,0.0,0.0,29.200001,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-06,0.0,0,0,1,0,0,0.0,0.0,18.600000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2020-01-13,0.0,0,0,1,0,0,0.0,0.0,22.100000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2019-12-30,0.0,0,0,1,1,0,0.0,0.0,25.100000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2576837,2019-01-07,0.0,0,0,1,1,0,0.0,1.0,30.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2576838,2019-02-11,0.0,0,0,1,0,0,0.0,0.0,49.099998,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2576839,2018-12-31,0.0,0,0,1,0,0,0.0,0.0,20.299999,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2576840,2019-01-14,0.0,0,0,1,1,0,0.0,0.0,33.700001,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
export_df.to_csv(BASE_DIR / 'recall_modeling' / 'data' / 'natality_10yr_test_data_one_hot.csv', index=False)