<h1>
Some Pruned Variables, Pruned Values, No Specific Target (Separate Morbidities)
</h1>

In [1]:
from pathlib import Path
import re
import pandas as pd
import numpy as np

BASE_DIR = Path().resolve().parent

<h2>Tidying Up</h2>

In [2]:
df = pd.read_csv(BASE_DIR / 'natality_7yr_test_data.csv')

df.dtypes

  df = pd.read_csv(BASE_DIR / 'natality_7yr_test_data.csv')


dob_yy       int64
dob_mm       int64
dob_tt       int64
dob_wk       int64
bfacil       int64
             ...  
ca_cleft     int64
ca_clpal     int64
ca_down     object
ca_disor    object
ca_hypo      int64
Length: 87, dtype: object

In [3]:
df['cig_rec'].value_counts(normalize=True)

cig_rec
0    0.924543
1    0.066752
2    0.008705
Name: proportion, dtype: float64

In [4]:
df.nunique()

dob_yy         7
dob_mm        12
dob_tt      1441
dob_wk         7
bfacil         8
            ... 
ca_cleft       2
ca_clpal       2
ca_down        5
ca_disor       5
ca_hypo        2
Length: 87, dtype: int64

In [5]:
df['priordead'].value_counts(normalize=True)

priordead
0     0.982029
1     0.010548
99    0.005638
2     0.001162
3     0.000329
4     0.000133
9     0.000052
6     0.000043
5     0.000024
7     0.000014
12    0.000010
11    0.000010
8     0.000010
Name: proportion, dtype: float64

In [6]:
binary_cols = [col for col in df.columns if df[col].nunique() == 2 and df[col].dtype == 'int64']

binary_cols

['dmar',
 'ca_anen',
 'ca_mnsb',
 'ca_cchd',
 'ca_cdh',
 'ca_omph',
 'ca_gast',
 'ca_limb',
 'ca_cleft',
 'ca_clpal',
 'ca_hypo']

In [7]:
df['imp_sex'].unique()

array([nan,  1.])

In [8]:
df = df.drop('imp_sex', axis=1)

df['date'] = pd.to_datetime(df['dob_yy'].astype(str) + df['dob_wk'].astype(str) + '1', format='%G%V%u')

# Fourier terms for time of day as a type of "seasonality"
df['time_str'] = df['dob_tt'].astype(str).str.zfill(4)
df['hour'] = df['time_str'].str[:2].astype(int)
df['minute'] = df['time_str'].str[2:].astype(int)

df['minute_of_day'] = df['hour'] * 60 + df['minute']
df['time_sin'] = np.sin(2 * np.pi * df['minute_of_day'] / 1440)
df['time_cos'] = np.cos(2 * np.pi * df['minute_of_day'] / 1440)

df = df.drop(['dob_yy', 'dob_tt', 'dob_wk', 'time_str', 'minute_of_day', 'minute', 'hour'], axis=1)

df['sex'] = np.where(df['sex'] == 'M', 1, 0)

In [9]:
# clean outcome and drop where we don't have a label (we aren't at risk of data-shortage lol)

# df = df.rename(columns={'no_mmorb': 'morbidity_reported'})

# df = df[df['morbidity_reported'] != 9]
# flipping the binary so morbidity is the positive class
# df['morbidity_reported'] = 1 - df['morbidity_reported']

df = df.drop('no_mmorb', axis=1)

List of variables that require some cleaning in their encodings - lots of sentinel values.

- fagecomb - 99 is a sentinel value; will need to drop or fill it
- priorlive - 99 is a sentinel value
- priordead - 99 sentinel value
- priorterm - 99
- *illb_r (Interval Since Last Live Birth Recode) - 000–003 are sentinel; 888 - Not applicable — first live birth; 999 - Unknown or not stated
- ilop_r (Interval Since Last Other Pregnancy Recode) - not sure if we want to use; same sentinel as illb_r
- *ilp_r (Interval Since Last Pregnancy Recode) - not sure we want to use; same sentinel as illb_r
- ilp_r11 (Interval Since Last Pregnancy Recode 11) - same as above
- precare - 99
- previs - 99
- cig_0 - cig_3 - 99
- bmi - 99.9
- pwgt_r - 999
- dwgt_r - 999
- wtgain - 99
- rf_cesarn - 99
- combgest - 99
- dbwt - 999


In [None]:
cols_99 = [
    'fagecomb', 'priorlive', 'priordead', 'priorterm',
    'precare', 'previs', 'cig_1', 'cig_2', 'cig_3',
    'wtgain', 'rf_cesarn', 'combgest'
]
df[cols_99] = df[cols_99].replace(99, np.nan)

df[['pwgt_r', 'dwgt_r', 'dbwt']] = df[['pwgt_r', 'dwgt_r', 'dbwt']].replace([999, 9999], np.nan)

df['bmi'] = df['bmi'].replace(99.9, np.nan)
df.loc[df['bmi'] > 90, 'bmi'] = np.nan

df[['illb_r', 'ilp_r']] = df[['illb_r', 'ilp_r']].replace(
    [0, 1, 2, 3, 888, 999],
    np.nan
)

df[['ca_down', 'ca_disor']] = df[['ca_down', 'ca_disor']].replace('C', 2).astype(int)
binary_cols.append('ca_down')

# These seem redundant, so removed for now - can always comment out.
df = df.drop(columns=['ilop_r', 'ilp_r11'], axis=1)

In [11]:
continuous_cols = [ # 'illb_r', 'ilp_r' <- dropped later on due to too many missing values
    'bmi', 'time_sin', 'time_cos' #, 'month_sin', 'month_cos'
]

discrete_cols = [
    'cig_0', 'cig_1', 'cig_2', 'cig_3', 'priorlive', 'priordead', 'priorterm',
    "precare", 'previs', 'pwgt_r', 'wtgain', 'rf_cesarn', 'combgest', 'dbwt',
    'mager', 'fagecomb'
]

cat_cols = [
    'bfacil', 'mracehisp', 'mar_p', 'meduc', 'fracehisp', 'feduc',
    'rf_pdiab', 'rf_gdiab', 'rf_phype', 'rf_ghype', 'cig_rec',
    'rf_ehype', 'rf_ppterm', 'rf_inftr', 'rf_fedrg', 'rf_artec',
    'rf_cesar', 'ip_gon', 'ip_syph', 'ip_chlam', 'ip_hepb', 'ip_hepc',
    'ld_indl', 'ld_augm', 'ld_ster', 'ld_antb', 'ld_chor', 'ld_anes',
    'me_pres', 'me_rout', 'me_trial', 'mm_mtr', 'mm_plac', 'mm_rupt',
    'mm_uhyst', 'mm_aicu', 'attend', 'pay', 'dplural',
    'ab_aven1', 'ab_aven6', 'ab_nicu', 'ab_surf', 'ab_anti', 'ab_seiz',
    'ca_down', 'ca_disor', 'dob_mm' # using monthly seasonality as one-hot instead of Fourier. Will be more descriptive.
]

In [12]:
missing_df = (
    df.isna()
      .sum()
      .sort_values(ascending=False)
      .reset_index()
      .set_axis(['variable', 'missing'], axis=1)
)
missing_df['missing_pct'] = missing_df['missing'] / len(df)

missing_df

Unnamed: 0,variable,missing,missing_pct
0,ilp_r,86451,0.411671
1,illb_r,84784,0.403733
2,fagecomb,31469,0.149852
3,wtgain,6484,0.030876
4,dwgt_r,3591,0.017100
...,...,...,...
78,ld_ster,0,0.000000
79,ld_antb,0,0.000000
80,ld_chor,0,0.000000
81,ld_anes,0,0.000000


Drop features that have a count of missing values above a specific threshold. Impute based on the median of the data points' given year of measurement otherwise.

In [13]:
threshold = 0.20

df = df.drop(columns=missing_df[missing_df['missing_pct'] > threshold]['variable'], axis=1)

for var in missing_df['variable']:
    if var not in df.columns:
        continue

    missing_pct = missing_df.loc[missing_df['variable'] == var, 'missing_pct'].values[0]

    if missing_pct > 0 and df[var].dtype in ['float64']:
        df[var] = (
            df.groupby(df['date'].dt.year)[var]
              .transform(lambda x: x.fillna(x.median()))
        )

In [14]:
missing_df = (
    df.isna()
      .sum()
      .sort_values(ascending=False)
      .reset_index()
      .set_axis(['variable', 'missing'], axis=1)
)
missing_df['missing_pct'] = missing_df['missing'] / len(df)

missing_df

Unnamed: 0,variable,missing,missing_pct
0,dob_mm,0,0.0
1,ld_augm,0,0.0
2,dbwt,0,0.0
3,combgest,0,0.0
4,sex,0,0.0
...,...,...,...
76,rf_phype,0,0.0
77,rf_gdiab,0,0.0
78,rf_pdiab,0,0.0
79,wtgain,0,0.0


Some columns are also extremely zero-inflated, so let's create seprate binary versions to encode both y/n and "magnitude". We can select / prune later on.

In [15]:
df["cig_0_binary"] = (df["cig_0"] > 0).astype(int)

df["cig_1_binary"] = (df["cig_1"] > 0).astype(int)

df["cig_2_binary"] = (df["cig_2"] > 0).astype(int)

df["cig_3_binary"] = (df["cig_3"] > 0).astype(int)

#df["priorlive_binary"] = (df["priorlive"] > 0).astype(int)

df["prior_dead_term_binary"] = ((df["priordead"] > 0) | (df["priorterm"] > 0)).astype(int)

df["precare_binary"] = (df["precare"] > 0).astype(int)

binary_cols.extend(["cig_0_binary", "cig_1_binary", "cig_2_binary", "cig_3_binary", "precare_binary"])
continuous_cols.append('prior_dead_term_binary')

to_remove = ['cig_0', 'cig_1', 'cig_2', 'cig_3', 'priorlive', 'priordead', 'priorterm']
discrete_cols = [c for c in discrete_cols if c not in to_remove]
cat_cols = [c for c in cat_cols if c not in to_remove]

df.drop(to_remove, axis=1, inplace=True)

In [16]:
list(binary_cols)

['dmar',
 'ca_anen',
 'ca_mnsb',
 'ca_cchd',
 'ca_cdh',
 'ca_omph',
 'ca_gast',
 'ca_limb',
 'ca_cleft',
 'ca_clpal',
 'ca_hypo',
 'cig_0_binary',
 'cig_1_binary',
 'cig_2_binary',
 'cig_3_binary',
 'precare_binary']

In [17]:
export_df = df[['date'] + binary_cols + cat_cols + continuous_cols + discrete_cols].copy()


In [18]:
export_df.head()

Unnamed: 0,date,dmar,ca_anen,ca_mnsb,ca_cchd,ca_cdh,ca_omph,ca_gast,ca_limb,ca_cleft,...,prior_dead_term_binary,precare,previs,pwgt_r,wtgain,rf_cesarn,combgest,dbwt,mager,fagecomb
0,2020-01-20,1,0,0,0,0,0,0,0,0,...,0,4.0,10.0,145.0,35.0,0.0,40.0,3570.0,25,30.0
1,2020-02-03,1,0,0,0,0,0,0,0,0,...,0,3.0,15.0,140.0,10.0,0.0,39.0,3560.0,28,27.0
2,2020-02-03,1,0,0,0,0,0,0,0,0,...,0,3.0,8.0,190.0,47.0,0.0,39.0,3130.0,36,35.0
3,2020-02-03,1,0,0,0,0,0,0,0,0,...,0,3.0,9.0,140.0,26.0,0.0,40.0,2760.0,32,34.0
4,2020-02-03,1,0,0,0,0,0,0,0,0,...,0,2.0,11.0,130.0,28.0,2.0,38.0,2948.0,39,44.0


In [19]:
# congenital_anomalies = ['ca_anen', 'ca_mnsb', 'ca_cchd', 'ca_cdh', 'ca_omph', 'ca_gast', 'ca_cleft', 'ca_clpal', 'ca_hypo', 'ca_limb']

# export_df["congenital_anomalies"] = (
#     export_df.loc[:, congenital_anomalies].eq(1).any(axis=1).astype(int)
# )

# export_df = export_df.drop(columns=congenital_anomalies)

# binary_cols = [c for c in binary_cols if c not in congenital_anomalies]
# binary_cols.append("congenital_anomalies")

In [20]:
smoking = ['cig_0_binary', 'cig_1_binary', 'cig_2_binary', 'cig_3_binary', 'cig_rec']

export_df["smoking"] = (
    export_df.loc[:, smoking].eq(1).any(axis=1).astype(int)
)
export_df = export_df.drop(columns=smoking)

binary_cols = [c for c in binary_cols if c not in smoking]
binary_cols.append("smoking")

cat_cols = [c for c in cat_cols if c not in smoking]

In [21]:
# Merge home birth
export_df["hospital_birth_binary"] = export_df["bfacil"].eq(1).astype(int)
export_df = export_df.drop(columns=["bfacil"])

cat_cols.remove("bfacil")
binary_cols.append("bfacil")

In [22]:
export_df['ca_disor'].value_counts(normalize=True)

ca_disor
0    0.999362
2    0.000638
Name: proportion, dtype: float64

In [23]:
export_df = export_df[(export_df['mracehisp'] != 8) & (export_df['fracehisp'] != 8)]
export_df = export_df[(export_df['meduc'] != 9) & (export_df['feduc'] != 9)]
export_df = export_df[export_df['rf_pdiab'] != 2]
export_df = export_df[export_df['rf_gdiab'] != 2]
export_df = export_df[export_df['rf_phype'] != 2]
export_df = export_df[export_df['rf_ghype'] != 2]
export_df = export_df[export_df['rf_ehype'] != 2]
export_df = export_df[export_df['rf_ppterm'] != 2]
export_df = export_df[export_df['ld_indl'] != 2]
export_df = export_df[export_df['ld_augm'] != 2]
export_df = export_df[export_df['ld_ster'] != 2]
export_df = export_df[export_df['ld_antb'] != 2]
export_df = export_df[export_df['ld_anes'] != 2]
export_df = export_df[export_df['ab_aven1'] != 2]
export_df = export_df[export_df['ab_aven6'] != 2]
export_df = export_df[export_df['ab_nicu'] != 2]
export_df = export_df[export_df['ab_anti'] != 2]

# 1 = y, 2 = no, 3 = unknown, 0 = Not Applicable
# Gonna remove, as I don't know this belongs in
# a causal graph.
export_df = export_df.drop(columns=['mar_p'])

# 8 is other, 9 is unknown
export_df = export_df[~export_df['pay'].isin([8, 9])]

# 3 is other, 9 is unknown
export_df = export_df[~export_df['me_pres'].isin([3, 9])]

# 5 is other, 9 is unknown
export_df = export_df[~export_df['attend'].isin([5, 9])]

# 9 is unknown; 2 is forceps and pretty sparse
# Will drop it if needed
# export_df = export_df[~export_df['me_rout'].isin([9])]
export_df = export_df.drop('me_rout', axis=1)

# 1 & 2 are pretty rare; we'll drop whole column if needed
# export_df = export_df[export_df['ld_chor'] != 2]
export_df = export_df.drop('ld_chor', axis=1)

# This includes rf_fedrg and rf_artec; breaking out
export_df = export_df.drop('rf_inftr', axis=1)

# We have a continuous column (rf_cesarn) for the number of c-sections.
# We can swap if needed
export_df = export_df.drop('rf_cesar', axis=1)

# Combined into one since they're all sparse otherwise
stds = ['ip_gon', 'ip_syph', 'ip_chlam', 'ip_hepb', 'ip_hepc']
# export_df['std_pos'] = (
#     export_df.loc[:, stds].eq(1).any(axis=1).astype(int)
# )
export_df = export_df[~export_df[stds].eq(2).any(axis=1)]

# export_df = export_df.drop(columns=stds)

# me_trial - attempts before c-section; might be too correlated with c-section y/n
export_df = export_df.drop('me_trial', axis=1)

# TODO: keeping these for now instead of one target.
# These are actually a part of the target; they're morbidities
# export_df = export_df.drop(columns=['mm_mtr', 'mm_plac', 'mm_rupt'], axis=1)

# Merging twins + together
# export_df['twins_plus'] = np.where(export_df['dplural'] > 1, 1, 0)

# pretty rare; can drop later if needed
export_df = export_df.drop('ab_surf', axis=1)
# export_df = export_df[export_df['ab_surf'] != 2]

# exceedingly rare
export_df = export_df.drop('ab_seiz', axis=1)

cat_to_binary = [
    'rf_pdiab', 'rf_gdiab', 'rf_phype', 'rf_ghype', 'rf_ehype',
    'rf_ppterm', 'ld_indl', 'ld_augm', 'ld_ster', 'ld_antb', 'ld_anes',
    'ab_aven1', 'ab_aven6', 'ab_nicu', 'ab_anti', 'twins_plus', 'ab_surf',
    'ip_gon', 'ip_syph', 'ip_chlam', 'ip_hepb', 'ip_hepc'
]

cols_to_remove = [ # mm_mtr', 'mm_plac', 'mm_rupt', 'dplural
    'ab_seiz', 'mar_p', 'rf_inftr', 'rf_cesar',
    'me_trial', 'me_rout', 'ld_chor'
]

discrete_cols = [c for c in discrete_cols if c not in cols_to_remove]
cat_cols = [c for c in cat_cols if c not in cols_to_remove and c not in cat_to_binary]
binary_cols.extend(cat_to_binary)

In [24]:
export_df

Unnamed: 0,date,dmar,ca_anen,ca_mnsb,ca_cchd,ca_cdh,ca_omph,ca_gast,ca_limb,ca_cleft,...,previs,pwgt_r,wtgain,rf_cesarn,combgest,dbwt,mager,fagecomb,smoking,hospital_birth_binary
0,2020-01-20,1,0,0,0,0,0,0,0,0,...,10.0,145.0,35.0,0.0,40.0,3570.0,25,30.0,0,1
1,2020-02-03,1,0,0,0,0,0,0,0,0,...,15.0,140.0,10.0,0.0,39.0,3560.0,28,27.0,0,1
2,2020-02-03,1,0,0,0,0,0,0,0,0,...,8.0,190.0,47.0,0.0,39.0,3130.0,36,35.0,0,1
3,2020-02-03,1,0,0,0,0,0,0,0,0,...,9.0,140.0,26.0,0.0,40.0,2760.0,32,34.0,0,1
4,2020-02-03,1,0,0,0,0,0,0,0,0,...,11.0,130.0,28.0,2.0,38.0,2948.0,39,44.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209995,2019-01-14,2,0,0,0,0,0,0,0,0,...,14.0,201.0,13.0,0.0,39.0,3345.0,19,22.0,0,1
209996,2019-01-14,2,0,0,0,0,0,0,0,0,...,12.0,130.0,25.0,0.0,39.0,4586.0,20,24.0,1,1
209997,2019-01-14,2,0,0,0,0,0,0,0,0,...,11.0,200.0,19.0,0.0,39.0,3240.0,28,29.0,0,1
209998,2019-01-14,2,0,0,0,0,0,0,0,0,...,7.0,130.0,62.0,0.0,35.0,1985.0,27,37.0,0,1


In [25]:
export_df['precare'].value_counts(normalize=True)

precare
2.0     0.384377
3.0     0.353266
4.0     0.101843
5.0     0.049522
6.0     0.028665
1.0     0.026161
7.0     0.022735
8.0     0.015192
0.0     0.013278
9.0     0.004873
10.0    0.000089
Name: proportion, dtype: float64

In [26]:
export_df.to_csv(BASE_DIR / 'dag_side_quest' / 'natality_7yr_test_data_for_dag_2.csv', index=False)