# Setup

In [1]:
!pip3 install gpy
!pip3 install git+https://github.com/BRML/climin
!pip3 install -U imbalanced-learn

Collecting gpy
[?25l  Downloading https://files.pythonhosted.org/packages/ce/4a/43d6f07b8b493bc216ecf1d5c447809e8c9d0b1b18b0b9db496dfadd87ea/GPy-1.10.0.tar.gz (959kB)
[K     |████████████████████████████████| 962kB 8.0MB/s 
Collecting paramz>=0.9.0
[?25l  Downloading https://files.pythonhosted.org/packages/d8/37/4abbeb78d30f20d3402887f46e6e9f3ef32034a9dea65d243654c82c8553/paramz-0.9.5.tar.gz (71kB)
[K     |████████████████████████████████| 71kB 10.1MB/s 
Building wheels for collected packages: gpy, paramz
  Building wheel for gpy (setup.py) ... [?25l[?25hdone
  Created wheel for gpy: filename=GPy-1.10.0-cp37-cp37m-linux_x86_64.whl size=2565007 sha256=a469b47a3901948b3c566148d7214f3b9fb00533dd2b8b38f0b33d00001e9b71
  Stored in directory: /root/.cache/pip/wheels/23/99/8d/d0c3dee7db4af58190cde6abdb45e6a7ded2f9f01ff528dd0f
  Building wheel for paramz (setup.py) ... [?25l[?25hdone
  Created wheel for paramz: filename=paramz-0.9.5-cp37-none-any.whl size=102566 sha256=1a65220264a309a2

In [2]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


# Finish data preprocessing
- Import mostly-preprocessed dataset (see load_vbac_data.py) 
- Check data looks right
- Add indicator columns for missing features
- Alter some features to make them more processable by the models


##  Load Data

In [16]:
import pandas as pd
import os
import numpy as np
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

In [17]:
headers = ['FACILITY_RECODE', 'MOTHERS_AGE_RECODE', 'MARITAL_STATUS', 'MOTHERS_EDUCATION', 'PRIOR_BIRTHS_NOW_LIVING', 'PRIOR_BIRTHS_NOW_DEAD', 
           'PRIOR_OTHER_TERMINATIONS', 'LIVE_BIRTH_ORDER_RECODE', 'TOTAL_BIRTH_ORDER_RECODE', 'INTERVAL_SINCE_LAST_LIVE_BIRTH_RECODE', 
           'MONTH_PRENATAL_CARE_BEGAN_RECODE', 'NUMBER_OF_PRENATAL_VISITS_RECODE', 'CIGARETTES_BEFORE_PREGNANCY_RECODE', 
           'CIGARETTES_FIRST_TRIMESTER_RECODE', 'CIGARETTES_SECOND_TRIMESTER_RECODE', 'CIGARETTES_THIRD_TRIMESTER_RECODE', 
           'MOTHERS_HEIGHT_IN_TOTAL_INCHES', 'MOTHERS_BMI_RECODE', 'PRE_PREGNANCY_WEIGHT_RECODE', 'DELIVERY_WEIGHT_RECODE', 'WEIGHT_GAIN', 
           'PRE_PREGNANCY_DIABETES', 'GESTATIONAL_DIABETES', 'PRE_PREGNANCY_HYPERTENSION', 'GESTATIONAL_HYPERTENSION', 
           'HYPERTENSION_ECLAMPSIA', 'PREVIOUS_PRETERM_BIRTH', 'PREVIOUS_CESAREAN', 'NUMBER_OF_PREVIOUS_CESAREANS', 'NO_INFECTIONS_REPORTED', 
           'INDUCTION_OF_LABOR', 'AUGMENTATION_OF_LABOR', 'CHORIOAMNIONITIS', 'ATTENDANT_AT_BIRTH', 'PAYMENT_SOURCE_FOR_DELIVERY', 
           'PLURALITY_RECODE', 'SEX_OF_INFANT', 'COMBINED_GESTATION_RECODE', 'BIRTH_WEIGHT_RECODE', 'TOL_ATTEMPTED', 
           'DELIVERY_METHOD_1', 'DELIVERY_METHOD_2']

result = ['successful_vbac']
headers.extend(result)

# Default values taken from UserGuide2019-508.pdf
list_of_cols_with_missing_vals_and_their_default_numb = [
        ('FACILITY_RECODE', 3),
        ('MARITAL_STATUS', 9),
        ('MOTHERS_EDUCATION', 9),
        ('PRIOR_BIRTHS_NOW_LIVING', 99),
        ('PRIOR_BIRTHS_NOW_DEAD', 99),
        ('PRIOR_OTHER_TERMINATIONS', 99),
        ('LIVE_BIRTH_ORDER_RECODE', 9),
        ('TOTAL_BIRTH_ORDER_RECODE', 9),
        ('MONTH_PRENATAL_CARE_BEGAN_RECODE', 5),
        ('NUMBER_OF_PRENATAL_VISITS_RECODE', 12),
        ('CIGARETTES_BEFORE_PREGNANCY_RECODE', 6),
        ('CIGARETTES_FIRST_TRIMESTER_RECODE', 6),
        ('CIGARETTES_SECOND_TRIMESTER_RECODE', 6),
        ('CIGARETTES_THIRD_TRIMESTER_RECODE', 6),
        ('MOTHERS_HEIGHT_IN_TOTAL_INCHES', 99),
        ('MOTHERS_BMI_RECODE', 9),
        ('PRE_PREGNANCY_WEIGHT_RECODE', 999),
        ('DELIVERY_WEIGHT_RECODE', 999),
        ('WEIGHT_GAIN', 99),
        ('PRE_PREGNANCY_DIABETES', -1),
        ('GESTATIONAL_DIABETES', -1),
        ('PRE_PREGNANCY_HYPERTENSION', -1),
        ('GESTATIONAL_HYPERTENSION', -1),
        ('HYPERTENSION_ECLAMPSIA', -1),
        ('PREVIOUS_PRETERM_BIRTH', -1),
        ('NUMBER_OF_PREVIOUS_CESAREANS', 99),
        ('NO_INFECTIONS_REPORTED', 9),
        ('INDUCTION_OF_LABOR', -1),
        ('AUGMENTATION_OF_LABOR', -1),
        ('CHORIOAMNIONITIS', -1),
        ('ATTENDANT_AT_BIRTH', 9),
        ('PAYMENT_SOURCE_FOR_DELIVERY', 9),
        ('BIRTH_WEIGHT_RECODE', 12),
        ('INTERVAL_SINCE_LAST_LIVE_BIRTH_RECODE', 999),
        ('COMBINED_GESTATION_RECODE', 99)]

In [19]:
filename = '2019_vbac_data'
# Load data - edith MYPATH to contain the proper path to the dataset
MYPATH = 'gdrive/MyDrive/AA222'
data_path = os.getcwd() + f'/{MYPATH}/{filename}.csv'
birth_df = pd.read_csv(data_path, header=None, names=headers, index_col=False, skip_blank_lines=True, dtype=float)

In [20]:
X = pd.DataFrame(birth_df).iloc[:, :-1]
y = pd.DataFrame(birth_df).iloc[:, -1:] # 1 if successful VBAC, 0 if failed.

## Sanity check dataset

In [22]:
# Check that 100% of samples have had a prior cesarean
X["PREVIOUS_CESAREAN"].value_counts()

1.0    109126
Name: PREVIOUS_CESAREAN, dtype: int64

In [24]:
# Check that INTERVAL_SINCE_LAST_LIVE_BIRTH_RECODE does not have many 888s (first time births) 
print((X["INTERVAL_SINCE_LAST_LIVE_BIRTH_RECODE"] == 888.0).value_counts())

# If not too many, just make these 999 (unknown value) - probably a mistake inputting data, 
# as we saw above that 100% of samples had a previous cesarean. 
X["INTERVAL_SINCE_LAST_LIVE_BIRTH_RECODE"].apply(lambda x : 999.0 if x == 888.0 else x)

False    108908
True        218
Name: INTERVAL_SINCE_LAST_LIVE_BIRTH_RECODE, dtype: int64


0         45.0
1         56.0
2         21.0
3         75.0
4         82.0
          ... 
109121    22.0
109122    33.0
109123    47.0
109124    13.0
109125    45.0
Name: INTERVAL_SINCE_LAST_LIVE_BIRTH_RECODE, Length: 109126, dtype: float64

In [25]:
# Make sure values are equivalent (data consistency/reliability reasons)
# For delivery_method_recode, 2 = VBAC and 4 = CBAC
# For delivery_method_recode_2, 1 = Vaginal and 2 = Cesarean

print(X["DELIVERY_METHOD_1"].value_counts())
print(X["DELIVERY_METHOD_2"].value_counts())

2.0    80289
4.0    28837
Name: DELIVERY_METHOD_1, dtype: int64
1.0    80289
2.0    28837
Name: DELIVERY_METHOD_2, dtype: int64


## Reform data to make more processable by models 

In [26]:
# Add indicator columns to indicate if a feature is missing
# Set missing feature values to median 

for col, val in list_of_cols_with_missing_vals_and_their_default_numb:
    X[col+'_MISSING'] = X[col].apply(lambda x: 1 if x==val or x==-1 else 0)
    the_median = X[X[col]!=val][col].median()
    X[col] = X[col].apply(lambda x: the_median if x==val or x==-1 else x)

In [28]:
# Change facility_recode column to "in_hospital" for interpretability
X['IN_HOSPITAL'] = X['FACILITY_RECODE'].apply(lambda x: x if x==1 else 0)
X = X.drop(columns='FACILITY_RECODE')

In [29]:
# Change marital status to 1 or 0 
X['MARITAL_STATUS'].apply(lambda x: x if x == 1 else 0)

0         1.0
1         0.0
2         1.0
3         1.0
4         1.0
         ... 
109121    0.0
109122    1.0
109123    0.0
109124    1.0
109125    0.0
Name: MARITAL_STATUS, Length: 109126, dtype: float64

In [30]:
# Change 'month prenatal care began' value from 0 to 11 if never got prenatal care to avoid confusing the regression
X['MONTH_PRENATAL_CARE_BEGAN_RECODE'].value_counts()
X['MONTH_PRENATAL_CARE_BEGAN_RECODE'].apply(lambda x: x if x != 0 else 11)

0         1.0
1         1.0
2         1.0
3         1.0
4         1.0
         ... 
109121    1.0
109122    1.0
109123    3.0
109124    1.0
109125    1.0
Name: MONTH_PRENATAL_CARE_BEGAN_RECODE, Length: 109126, dtype: float64

## Standardize dataset & delete unused columns

In [31]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns.values)

In [32]:
X_scaled

Unnamed: 0,MOTHERS_AGE_RECODE,MARITAL_STATUS,MOTHERS_EDUCATION,PRIOR_BIRTHS_NOW_LIVING,PRIOR_BIRTHS_NOW_DEAD,PRIOR_OTHER_TERMINATIONS,LIVE_BIRTH_ORDER_RECODE,TOTAL_BIRTH_ORDER_RECODE,INTERVAL_SINCE_LAST_LIVE_BIRTH_RECODE,MONTH_PRENATAL_CARE_BEGAN_RECODE,NUMBER_OF_PRENATAL_VISITS_RECODE,CIGARETTES_BEFORE_PREGNANCY_RECODE,CIGARETTES_FIRST_TRIMESTER_RECODE,CIGARETTES_SECOND_TRIMESTER_RECODE,CIGARETTES_THIRD_TRIMESTER_RECODE,MOTHERS_HEIGHT_IN_TOTAL_INCHES,MOTHERS_BMI_RECODE,PRE_PREGNANCY_WEIGHT_RECODE,DELIVERY_WEIGHT_RECODE,WEIGHT_GAIN,PRE_PREGNANCY_DIABETES,GESTATIONAL_DIABETES,PRE_PREGNANCY_HYPERTENSION,GESTATIONAL_HYPERTENSION,HYPERTENSION_ECLAMPSIA,PREVIOUS_PRETERM_BIRTH,NUMBER_OF_PREVIOUS_CESAREANS,NO_INFECTIONS_REPORTED,INDUCTION_OF_LABOR,AUGMENTATION_OF_LABOR,CHORIOAMNIONITIS,ATTENDANT_AT_BIRTH,PAYMENT_SOURCE_FOR_DELIVERY,PLURALITY_RECODE,SEX_OF_INFANT,COMBINED_GESTATION_RECODE,BIRTH_WEIGHT_RECODE,FACILITY_RECODE_MISSING,MARITAL_STATUS_MISSING,MOTHERS_EDUCATION_MISSING,PRIOR_BIRTHS_NOW_LIVING_MISSING,PRIOR_BIRTHS_NOW_DEAD_MISSING,PRIOR_OTHER_TERMINATIONS_MISSING,LIVE_BIRTH_ORDER_RECODE_MISSING,TOTAL_BIRTH_ORDER_RECODE_MISSING,MONTH_PRENATAL_CARE_BEGAN_RECODE_MISSING,NUMBER_OF_PRENATAL_VISITS_RECODE_MISSING,CIGARETTES_BEFORE_PREGNANCY_RECODE_MISSING,CIGARETTES_FIRST_TRIMESTER_RECODE_MISSING,CIGARETTES_SECOND_TRIMESTER_RECODE_MISSING,CIGARETTES_THIRD_TRIMESTER_RECODE_MISSING,MOTHERS_HEIGHT_IN_TOTAL_INCHES_MISSING,MOTHERS_BMI_RECODE_MISSING,PRE_PREGNANCY_WEIGHT_RECODE_MISSING,DELIVERY_WEIGHT_RECODE_MISSING,WEIGHT_GAIN_MISSING,PRE_PREGNANCY_DIABETES_MISSING,GESTATIONAL_DIABETES_MISSING,PRE_PREGNANCY_HYPERTENSION_MISSING,GESTATIONAL_HYPERTENSION_MISSING,HYPERTENSION_ECLAMPSIA_MISSING,PREVIOUS_PRETERM_BIRTH_MISSING,NUMBER_OF_PREVIOUS_CESAREANS_MISSING,NO_INFECTIONS_REPORTED_MISSING,INDUCTION_OF_LABOR_MISSING,AUGMENTATION_OF_LABOR_MISSING,CHORIOAMNIONITIS_MISSING,ATTENDANT_AT_BIRTH_MISSING,PAYMENT_SOURCE_FOR_DELIVERY_MISSING,BIRTH_WEIGHT_RECODE_MISSING,INTERVAL_SINCE_LAST_LIVE_BIRTH_RECODE_MISSING,COMBINED_GESTATION_RECODE_MISSING,IN_HOSPITAL
0,0.193403,-0.671916,0.954163,0.693069,-0.127855,-0.557039,0.728496,0.267364,-0.083619,-0.506592,-0.312010,-0.249119,-0.221734,-0.207742,-0.199706,-0.610735,-0.902375,-0.947583,-0.746788,0.588918,-0.106066,-0.285453,-0.155023,-0.250265,-0.045555,2.880007,1.955618,0.163434,-0.555142,1.713356,-0.153122,-0.451572,-0.698059,-0.130803,-0.983567,0.062695,0.765562,-0.006769,-0.324771,-0.125232,-0.028409,-0.042742,-0.045048,-0.044945,-0.056398,-0.165169,-0.183905,-0.07479,-0.075407,-0.075037,-0.123904,-0.075037,-0.167526,-0.157455,-0.125307,-0.193641,0.0,0.0,0.0,0.0,0.0,0.0,-0.048301,-0.050171,-0.0142,-0.0142,-0.0142,-0.02792,-0.085666,-0.038078,-0.230425,-0.027589,0.142706
1,-0.738851,1.488280,-0.165883,-0.009398,-0.127855,-0.557039,-0.015901,-0.337604,0.133456,-0.506592,-0.312010,4.772193,5.854838,6.667542,7.148770,-0.958911,0.766892,0.310841,0.280435,-0.108464,-0.106066,-0.285453,-0.155023,-0.250265,-0.045555,-0.347221,-0.327097,0.163434,-0.555142,1.713356,-0.153122,-0.451572,-0.698059,-0.130803,-0.983567,0.062695,-0.041793,-0.006769,-0.324771,-0.125232,-0.028409,-0.042742,-0.045048,-0.044945,-0.056398,-0.165169,-0.183905,-0.07479,-0.075407,-0.075037,-0.123904,-0.075037,-0.167526,-0.157455,-0.125307,-0.193641,0.0,0.0,0.0,0.0,0.0,0.0,-0.048301,-0.050171,-0.0142,-0.0142,-0.0142,-0.02792,-0.085666,-0.038078,-0.230425,-0.027589,0.142706
2,1.125657,-0.671916,1.514186,-0.009398,-0.127855,-0.557039,-0.015901,-0.337604,-0.557238,-0.506592,-0.312010,-0.249119,-0.221734,-0.207742,-0.199706,0.085616,-0.902375,-0.824208,-0.546354,0.798132,-0.106066,-0.285453,-0.155023,-0.250265,-0.045555,-0.347221,-0.327097,0.163434,-0.555142,-0.583650,-0.153122,-0.451572,0.214255,-0.130803,1.016708,1.416676,-0.041793,-0.006769,-0.324771,-0.125232,-0.028409,-0.042742,-0.045048,-0.044945,-0.056398,-0.165169,-0.183905,-0.07479,-0.075407,-0.075037,-0.123904,-0.075037,-0.167526,-0.157455,-0.125307,-0.193641,0.0,0.0,0.0,0.0,0.0,0.0,-0.048301,-0.050171,-0.0142,-0.0142,-0.0142,-0.02792,-0.085666,-0.038078,-0.230425,-0.027589,0.142706
3,1.125657,-0.671916,0.954163,-0.009398,-0.127855,-0.557039,-0.015901,-0.337604,0.508405,-0.506592,0.178962,-0.249119,-0.221734,-0.207742,-0.199706,-0.262560,0.766892,0.236816,-0.170541,-0.038726,-0.106066,-0.285453,-0.155023,-0.250265,-0.045555,-0.347221,-0.327097,0.163434,1.801340,-0.583650,-0.153122,-0.451572,2.951195,-0.130803,-0.983567,-0.614295,-0.041793,-0.006769,-0.324771,-0.125232,-0.028409,-0.042742,-0.045048,-0.044945,-0.056398,6.054389,5.437601,-0.07479,-0.075407,-0.075037,-0.123904,-0.075037,-0.167526,-0.157455,7.980378,5.164191,0.0,0.0,0.0,0.0,0.0,0.0,-0.048301,19.931635,-0.0142,-0.0142,-0.0142,-0.02792,-0.085666,-0.038078,-0.230425,-0.027589,0.142706
4,2.057912,-0.671916,-0.165883,-0.009398,-0.127855,-0.557039,-0.015901,-0.337604,0.646544,-0.506592,-0.312010,-0.249119,-0.221734,-0.207742,-0.199706,-0.610735,-0.902375,-0.750183,-0.195595,1.565252,-0.106066,-0.285453,-0.155023,-0.250265,-0.045555,-0.347221,-0.327097,0.163434,1.801340,-0.583650,-0.153122,-0.451572,0.214255,-0.130803,1.016708,0.062695,0.765562,-0.006769,-0.324771,-0.125232,-0.028409,-0.042742,-0.045048,-0.044945,-0.056398,-0.165169,-0.183905,-0.07479,-0.075407,-0.075037,-0.123904,-0.075037,-0.167526,-0.157455,-0.125307,-0.193641,0.0,0.0,0.0,0.0,0.0,0.0,-0.048301,-0.050171,-0.0142,-0.0142,-0.0142,-0.02792,-0.085666,-0.038078,-0.230425,-0.027589,0.142706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109121,-1.671105,1.488280,-1.285930,-0.711866,-0.127855,-0.557039,-0.760298,-0.942572,-0.537504,-0.506592,0.669934,-0.249119,-0.221734,-0.207742,-0.199706,-2.351614,-0.067741,-0.651483,-1.072493,-1.154537,-0.106066,-0.285453,-0.155023,-0.250265,-0.045555,-0.347221,-0.327097,0.163434,-0.555142,-0.583650,-0.153122,-0.451572,-0.698059,-0.130803,-0.983567,-0.614295,-0.849149,-0.006769,-0.324771,-0.125232,-0.028409,-0.042742,-0.045048,-0.044945,-0.056398,-0.165169,-0.183905,-0.07479,-0.075407,-0.075037,-0.123904,-0.075037,-0.167526,-0.157455,-0.125307,-0.193641,0.0,0.0,0.0,0.0,0.0,0.0,-0.048301,-0.050171,-0.0142,-0.0142,-0.0142,-0.02792,-0.085666,-0.038078,-0.230425,-0.027589,0.142706
109122,1.125657,-0.671916,-1.285930,0.693069,-0.127855,3.212842,0.728496,2.687236,-0.320429,-0.506592,-0.312010,-0.249119,-0.221734,-0.207742,-0.199706,0.781968,0.766892,1.125116,1.157332,0.031012,-0.106066,-0.285453,-0.155023,-0.250265,-0.045555,-0.347221,-0.327097,0.163434,-0.555142,-0.583650,-0.153122,-0.451572,-0.698059,-0.130803,-0.983567,1.416676,1.572918,-0.006769,-0.324771,-0.125232,-0.028409,-0.042742,-0.045048,-0.044945,-0.056398,-0.165169,-0.183905,-0.07479,-0.075407,-0.075037,-0.123904,-0.075037,-0.167526,-0.157455,-0.125307,-0.193641,0.0,0.0,0.0,0.0,0.0,0.0,-0.048301,-0.050171,-0.0142,-0.0142,-0.0142,-0.02792,-0.085666,-0.038078,-0.230425,-0.027589,0.142706
109123,1.125657,1.488280,0.954163,-0.711866,-0.127855,-0.557039,-0.760298,-0.942572,-0.044151,2.391567,-1.293955,-0.249119,-0.221734,-0.207742,-0.199706,-1.307087,-0.902375,-0.996933,-0.947221,0.170489,-0.106066,-0.285453,-0.155023,-0.250265,-0.045555,-0.347221,-0.327097,0.163434,-0.555142,-0.583650,-0.153122,-0.451572,-0.698059,-0.130803,1.016708,2.093666,-0.849149,-0.006769,-0.324771,-0.125232,-0.028409,-0.042742,-0.045048,-0.044945,-0.056398,-0.165169,-0.183905,-0.07479,-0.075407,-0.075037,-0.123904,-0.075037,-0.167526,-0.157455,-0.125307,-0.193641,0.0,0.0,0.0,0.0,0.0,0.0,-0.048301,-0.050171,-0.0142,-0.0142,-0.0142,-0.02792,-0.085666,-0.038078,-0.230425,-0.027589,0.142706
109124,0.193403,-0.671916,-0.725907,2.098003,-0.127855,0.385431,2.217290,2.082268,-0.715111,-0.506592,-0.312010,-0.249119,-0.221734,-0.207742,-0.199706,0.781968,-0.067741,0.236816,-0.170541,-1.154537,-0.106066,-0.285453,-0.155023,-0.250265,-0.045555,-0.347221,-0.327097,0.163434,-0.555142,-0.583650,-0.153122,-0.451572,-0.698059,-0.130803,-0.983567,-1.291286,-0.041793,-0.006769,-0.324771,-0.125232,-0.028409,-0.042742,-0.045048,-0.044945,-0.056398,-0.165169,-0.183905,-0.07479,-0.075407,-0.075037,-0.123904,-0.075037,-0.167526,-0.157455,-0.125307,-0.193641,0.0,0.0,0.0,0.0,0.0,0.0,-0.048301,-0.050171,-0.0142,-0.0142,-0.0142,-0.02792,-0.085666,-0.038078,-0.230425,-0.027589,0.142706


In [None]:
# Find empty columns
X_scaled.describe()

Unnamed: 0,MOTHERS_AGE_RECODE,MARITAL_STATUS,MOTHERS_EDUCATION,PRIOR_BIRTHS_NOW_LIVING,PRIOR_BIRTHS_NOW_DEAD,PRIOR_OTHER_TERMINATIONS,LIVE_BIRTH_ORDER_RECODE,TOTAL_BIRTH_ORDER_RECODE,INTERVAL_SINCE_LAST_LIVE_BIRTH_RECODE,MONTH_PRENATAL_CARE_BEGAN_RECODE,NUMBER_OF_PRENATAL_VISITS_RECODE,CIGARETTES_BEFORE_PREGNANCY_RECODE,CIGARETTES_FIRST_TRIMESTER_RECODE,CIGARETTES_SECOND_TRIMESTER_RECODE,CIGARETTES_THIRD_TRIMESTER_RECODE,MOTHERS_HEIGHT_IN_TOTAL_INCHES,MOTHERS_BMI_RECODE,PRE_PREGNANCY_WEIGHT_RECODE,DELIVERY_WEIGHT_RECODE,WEIGHT_GAIN,PRE_PREGNANCY_DIABETES,GESTATIONAL_DIABETES,PRE_PREGNANCY_HYPERTENSION,GESTATIONAL_HYPERTENSION,HYPERTENSION_ECLAMPSIA,PREVIOUS_PRETERM_BIRTH,NUMBER_OF_PREVIOUS_CESAREANS,NO_INFECTIONS_REPORTED,INDUCTION_OF_LABOR,AUGMENTATION_OF_LABOR,CHORIOAMNIONITIS,ATTENDANT_AT_BIRTH,PAYMENT_SOURCE_FOR_DELIVERY,PLURALITY_RECODE,SEX_OF_INFANT,COMBINED_GESTATION_RECODE,BIRTH_WEIGHT_RECODE,FACILITY_RECODE_MISSING,MARITAL_STATUS_MISSING,MOTHERS_EDUCATION_MISSING,PRIOR_BIRTHS_NOW_LIVING_MISSING,PRIOR_BIRTHS_NOW_DEAD_MISSING,PRIOR_OTHER_TERMINATIONS_MISSING,LIVE_BIRTH_ORDER_RECODE_MISSING,TOTAL_BIRTH_ORDER_RECODE_MISSING,MONTH_PRENATAL_CARE_BEGAN_RECODE_MISSING,NUMBER_OF_PRENATAL_VISITS_RECODE_MISSING,CIGARETTES_BEFORE_PREGNANCY_RECODE_MISSING,CIGARETTES_FIRST_TRIMESTER_RECODE_MISSING,CIGARETTES_SECOND_TRIMESTER_RECODE_MISSING,CIGARETTES_THIRD_TRIMESTER_RECODE_MISSING,MOTHERS_HEIGHT_IN_TOTAL_INCHES_MISSING,MOTHERS_BMI_RECODE_MISSING,PRE_PREGNANCY_WEIGHT_RECODE_MISSING,DELIVERY_WEIGHT_RECODE_MISSING,WEIGHT_GAIN_MISSING,PRE_PREGNANCY_DIABETES_MISSING,GESTATIONAL_DIABETES_MISSING,PRE_PREGNANCY_HYPERTENSION_MISSING,GESTATIONAL_HYPERTENSION_MISSING,HYPERTENSION_ECLAMPSIA_MISSING,PREVIOUS_PRETERM_BIRTH_MISSING,NUMBER_OF_PREVIOUS_CESAREANS_MISSING,NO_INFECTIONS_REPORTED_MISSING,INDUCTION_OF_LABOR_MISSING,AUGMENTATION_OF_LABOR_MISSING,CHORIOAMNIONITIS_MISSING,ATTENDANT_AT_BIRTH_MISSING,PAYMENT_SOURCE_FOR_DELIVERY_MISSING,BIRTH_WEIGHT_RECODE_MISSING,INTERVAL_SINCE_LAST_LIVE_BIRTH_RECODE_MISSING,COMBINED_GESTATION_RECODE_MISSING,IN_HOSPITAL
count,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0,109126.0
mean,2.264939e-14,-1.799044e-15,-4.590282e-15,9.200503e-15,1.98227e-15,-1.503709e-14,6.279719e-15,-7.58027e-15,7.925695e-16,1.344328e-14,-1.475439e-15,5.750471e-14,6.768001e-15,-2.628479e-14,-3.42004e-14,3.785009e-15,-3.246086e-15,-1.282122e-15,6.18143e-15,4.557235e-15,-6.622707e-15,6.250855e-15,7.248151000000001e-17,-4.49817e-15,-4.012963e-15,-3.852565e-14,-1.439385e-14,8.364722e-15,-3.198312e-14,8.730193e-15,2.784553e-15,-1.421809e-14,-1.014034e-15,-2.394209e-14,4.565724e-16,-5.737308000000001e-17,3.824714e-15,-3.486904e-16,-4.039649e-13,-3.440561e-14,1.020674e-15,-1.057158e-14,9.511585e-16,2.91955e-15,-8.216552e-15,-1.499143e-14,1.70679e-14,-1.145405e-14,1.247236e-14,-1.181669e-14,-8.318734e-15,-5.024552e-15,-6.00456e-16,1.956915e-14,-1.818337e-14,-1.058516e-14,0.0,0.0,0.0,0.0,0.0,0.0,6.184666e-15,-1.252049e-14,5.587774e-15,5.587774e-15,5.587774e-15,4.123841e-15,4.984366e-14,3.030865e-15,6.781958e-14,1.1446e-15,-1.992752e-14
std,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,0.0,0.0,0.0,0.0,0.0,0.0,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005,1.000005
min,-3.535614,-0.6719165,-1.845953,-1.414333,-0.1278545,-0.5570391,-1.504695,-1.547539,-0.9124527,-0.5065923,-2.766871,-0.2491192,-0.2217337,-0.2077425,-0.1997061,-11.75236,-1.737009,-2.107308,-2.199932,-1.921657,-0.1060656,-0.285453,-0.1550235,-0.2502652,-0.04555544,-0.3472214,-0.3270972,-6.118675,-0.5551422,-0.5836499,-0.1531223,-0.4515723,-0.6980586,-0.1308031,-0.983567,-3.999247,-4.885927,-0.006769098,-0.3247713,-0.1252319,-0.02840877,-0.04274239,-0.04504766,-0.04494542,-0.05639798,-0.1651694,-0.1839046,-0.07478964,-0.07540666,-0.07503704,-0.1239043,-0.07503704,-0.1675257,-0.1574547,-0.1253074,-0.1936412,0.0,0.0,0.0,0.0,0.0,0.0,-0.04830129,-0.0501715,-0.01420009,-0.01420009,-0.01420009,-0.02791994,-0.08566574,-0.03807843,-0.2304253,-0.02758927,-7.007408
25%,-0.7388512,-0.6719165,-0.7259067,-0.7118656,-0.1278545,-0.5570391,-0.7602978,-0.9425716,-0.4783018,-0.5065923,-0.3120103,-0.2491192,-0.2217337,-0.2077425,-0.1997061,-0.6107354,-0.9023752,-0.7501833,-0.6966793,-0.5966313,-0.1060656,-0.285453,-0.1550235,-0.2502652,-0.04555544,-0.3472214,-0.3270972,0.1634341,-0.5551422,-0.5836499,-0.1531223,-0.4515723,-0.6980586,-0.1308031,-0.983567,-0.6142953,-0.8491491,-0.006769098,-0.3247713,-0.1252319,-0.02840877,-0.04274239,-0.04504766,-0.04494542,-0.05639798,-0.1651694,-0.1839046,-0.07478964,-0.07540666,-0.07503704,-0.1239043,-0.07503704,-0.1675257,-0.1574547,-0.1253074,-0.1936412,0.0,0.0,0.0,0.0,0.0,0.0,-0.04830129,-0.0501715,-0.01420009,-0.01420009,-0.01420009,-0.02791994,-0.08566574,-0.03807843,-0.2304253,-0.02758927,0.1427061
50%,0.1934031,-0.6719165,-0.1658834,-0.00939833,-0.1278545,-0.5570391,-0.01590079,-0.3376036,-0.2414922,-0.5065923,0.178962,-0.2491192,-0.2217337,-0.2077425,-0.1997061,0.08561616,-0.06774143,-0.2320086,-0.1705409,-0.03872582,-0.1060656,-0.285453,-0.1550235,-0.2502652,-0.04555544,-0.3472214,-0.3270972,0.1634341,-0.5551422,-0.5836499,-0.1531223,-0.4515723,0.2142548,-0.1308031,-0.983567,0.06269509,-0.04179345,-0.006769098,-0.3247713,-0.1252319,-0.02840877,-0.04274239,-0.04504766,-0.04494542,-0.05639798,-0.1651694,-0.1839046,-0.07478964,-0.07540666,-0.07503704,-0.1239043,-0.07503704,-0.1675257,-0.1574547,-0.1253074,-0.1936412,0.0,0.0,0.0,0.0,0.0,0.0,-0.04830129,-0.0501715,-0.01420009,-0.01420009,-0.01420009,-0.02791994,-0.08566574,-0.03807843,-0.2304253,-0.02758927,0.1427061
75%,1.125657,1.48828,0.9541632,0.6930689,-0.1278545,0.3854312,0.7284963,0.2673643,0.1926587,-0.5065923,0.6699342,-0.2491192,-0.2217337,-0.2077425,-0.1997061,0.7819678,0.7668923,0.4835661,0.5309771,0.5889178,-0.1060656,-0.285453,-0.1550235,-0.2502652,-0.04555544,-0.3472214,-0.3270972,0.1634341,-0.5551422,1.713356,-0.1531223,-0.4515723,0.2142548,-0.1308031,1.016708,0.7396855,0.7655622,-0.006769098,-0.3247713,-0.1252319,-0.02840877,-0.04274239,-0.04504766,-0.04494542,-0.05639798,-0.1651694,-0.1839046,-0.07478964,-0.07540666,-0.07503704,-0.1239043,-0.07503704,-0.1675257,-0.1574547,-0.1253074,-0.1936412,0.0,0.0,0.0,0.0,0.0,0.0,-0.04830129,-0.0501715,-0.01420009,-0.01420009,-0.01420009,-0.02791994,-0.08566574,-0.03807843,-0.2304253,-0.02758927,0.1427061
max,3.92242,1.48828,2.07421,11.23008,48.03873,21.11978,3.706084,2.687236,16.55225,3.840647,2.142851,8.119734,9.905886,11.25106,12.04775,4.960077,2.43616,5.295189,5.316332,4.912685,9.428129,3.503204,6.450636,3.995761,21.95127,2.880007,17.93462,0.1634341,1.80134,1.713356,6.530727,4.680513,5.688135,15.12737,1.016708,2.093666,3.187629,147.7302,3.079089,7.985187,35.2004,23.39598,22.19871,22.24921,17.73113,6.054389,5.437601,13.37084,13.26143,13.32675,8.070748,13.32675,5.969234,6.351032,7.980378,5.164191,0.0,0.0,0.0,0.0,0.0,0.0,20.70338,19.93163,70.4221,70.4221,70.4221,35.81669,11.67328,26.26159,4.339802,36.24598,0.1427061


In [34]:
# Remove empty columns

cols_that_are_empty = [
'PRE_PREGNANCY_DIABETES_MISSING',
'GESTATIONAL_DIABETES_MISSING',
'PRE_PREGNANCY_HYPERTENSION_MISSING',
'GESTATIONAL_HYPERTENSION_MISSING',
'HYPERTENSION_ECLAMPSIA_MISSING',
'PREVIOUS_PRETERM_BIRTH_MISSING']


X_scaled = X_scaled.drop(columns=cols_that_are_empty)

In [None]:
# We don't want these anymore as they indicate the result 
# Used for data sanity checks

X_scaled = X_scaled.drop(columns=['PREVIOUS_CESAREAN', 'TOL_ATTEMPTED', 
           'DELIVERY_METHOD_1', 'DELIVERY_METHOD_2'])

## Balance dataset

In [36]:
import climin
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.under_sampling import AllKNN
from imblearn.over_sampling import RandomOverSampler

In [37]:
undersample = True
oversample = False

if undersample:
  # rus = RandomUnderSampler(random_state=42)
  rus = AllKNN()
  X_res, y_res = rus.fit_resample(X_scaled, y)

elif oversample:
  X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)
  ros = RandomOverSampler(sampling_strategy='minority', random_state=44)
  X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
  X_test_res, y_test_res = rus.fit_resample(X_test, y_test)

else:
  X_res, y_res = X_scaled, y

In [38]:
# Check that dataset is properly balanced 

y_res["successful_vbac"].value_counts() if not oversample else y_test_res["successful_vbac"].value_counts()

1.0    33623
0.0    28837
Name: successful_vbac, dtype: int64

# Logistic Regression Model

## Train Model (10-fold cross validation)

In [42]:
model = LogisticRegression(max_iter=1000)
scores = []
kfold = KFold(n_splits=10)

if not oversample:
   regression_X, regression_Y = X_res, y_res
else:
  # Combine test + train data again, as using K-fold & Logistic Regression
  regression_X, regression_Y = X_train_res.append(X_test_res), y_train_res.append(y_test_res)
  
for i, (train, test) in enumerate(kfold.split(regression_X, regression_Y)):
  model.fit(regression_X.iloc[train,:], regression_Y.iloc[train,:].values.ravel())
  score = model.score(regression_X.iloc[test,:], regression_Y.iloc[test,:].values.ravel())
  scores.append(score)

## Print Results

In [43]:
print(scores)

[0.5358629522894652, 0.7566442523214858, 0.7305475504322767, 0.7718539865513929, 0.8152417547230227, 0.6685878962536023, 0.7367915465898175, 0.7672110150496317, 0.7436759526096702, 0.7423951328850464]


In [44]:
model.intercept_

array([-0.05333046])

In [46]:
import statistics
from sklearn.metrics import roc_auc_score, auc
import matplotlib.pyplot as plt

print("Mean Accuracy: ", statistics.mean(scores))
print("Stddev : ", statistics.stdev(scores))
print("AUC: ", roc_auc_score(regression_Y, model.predict_proba(regression_X)[:, 1]))
# plt.hist(model.predict_proba(regression_X)[:, 1])

Mean Accuracy:  0.7268812039705411
Stddev :  0.07663981817600765
AUC:  0.885929264083745


In [45]:
sorted_indexes = np.argsort(np.abs(model.coef_))[0][::-1]
print("Features with highest +/- coefficient:\n")
for i, feature in enumerate(sorted_indexes[:10]):
  print(f'{i+1})      {regression_X.columns[feature]}: {round(model.coef_[0][feature] , 3)}')
  print("")

Features with highest +/- coefficient:

1)      LIVE_BIRTH_ORDER_RECODE: 1.445

2)      ATTENDANT_AT_BIRTH: 1.084

3)      NUMBER_OF_PREVIOUS_CESAREANS: -0.912

4)      IN_HOSPITAL: -0.786

5)      PRIOR_BIRTHS_NOW_LIVING: -0.757

6)      TOTAL_BIRTH_ORDER_RECODE: 0.65

7)      MOTHERS_HEIGHT_IN_TOTAL_INCHES: 0.44

8)      PRIOR_OTHER_TERMINATIONS: -0.396

9)      CIGARETTES_THIRD_TRIMESTER_RECODE_MISSING: 0.39

10)      DELIVERY_WEIGHT_RECODE: -0.333



# Gradient Boosted Decision Tree

In [47]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()

In [48]:
if not oversample:
  X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2)
else:
  X_train, X_test, y_train, y_test = X_train_res, X_test_res, y_train_res, y_test_res

## Hyperparameter sweep 

Skip this if not changing the dataset.

In [None]:
n_estimators = [int(x) for x in np.linspace(50, 500, 5)]
max_depth = [int(x) for x in np.linspace(2, 25, 5)]
# Add the default as a possible value
max_depth.append(3)
max_depth.append(4)

max_features = ['auto', 'log2', .5, .75]
subsample = [.4,.6,.8,1.]
criterion = ['mse', 'friedman_mse']
min_samples_split = [int(x) for x in np.linspace(2, 200, 6)]
min_impurity_decrease = [0.02, 0.05, 0.1]

# creating hyper param grid to search over
hyper_param_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'max_features': max_features,
    'subsample': subsample,
    'criterion': criterion,
    'min_samples_split': min_samples_split,
    'min_impurity_decrease': min_impurity_decrease
  }


In [None]:
from sklearn.model_selection import RandomizedSearchCV


gbc_CV_tuner = RandomizedSearchCV(estimator = gbc, param_distributions = hyper_param_grid, scoring='f1',
                               n_iter = 50, cv = 6, verbose = 50,#, random_state = 100, 
                               n_jobs = 1, refit=False)

gbc_CV_tuner.fit(X_train, np.ravel(y_train))

In [None]:
# Then run this to find the winner:
gbc_CV_tuner.best_params_

Input these values into the ```optimized_gbc``` instantiation below.

## Fit model 

In [50]:
# Model
optimized_gbc = GradientBoostingClassifier(n_estimators=275,min_samples_split=160, min_impurity_decrease=0.05, max_features='log2', max_depth=13, criterion='friedman_mse', subsample=0.8)
optimized_gbc.fit(X_train, np.ravel(y_train))

GradientBoostingClassifier(max_depth=13, max_features='log2',
                           min_impurity_decrease=0.05, min_samples_split=160,
                           n_estimators=275, subsample=0.8)

## Evaluate Model

In [51]:
gbc_predictions = optimized_gbc.predict(X_test)
gbc_probs = optimized_gbc.predict_proba(X_test)  # Gives probability of output (prediction before rounding)

## Print results

In [52]:
from sklearn.metrics import accuracy_score, f1_score
print('GBC Accuracy: ' + str(round(accuracy_score(y_test, gbc_predictions), 3)))
print('GBC F1: ' + str(round(f1_score(y_test, gbc_predictions), 2)))
print('AUC: ' + str(round(roc_auc_score(y_test, gbc_predictions), 2)))

GBC Accuracy: 0.839
GBC F1: 0.85
AUC: 0.84


In [53]:
sorted_indexes = np.argsort(optimized_gbc.feature_importances_)[::-1]
print("Features with highest GBDT feature importances:\n")
for i, feature in enumerate(sorted_indexes[:10]):
  print(f'{i+1})      {X_train.columns[feature]}: {round(optimized_gbc.feature_importances_[feature], 3)}')
  print("")

Features with highest GBDT feature importances:

1)      ATTENDANT_AT_BIRTH: 0.196

2)      NUMBER_OF_PREVIOUS_CESAREANS: 0.094

3)      PRIOR_BIRTHS_NOW_LIVING: 0.093

4)      LIVE_BIRTH_ORDER_RECODE: 0.058

5)      TOTAL_BIRTH_ORDER_RECODE: 0.05

6)      INTERVAL_SINCE_LAST_LIVE_BIRTH_RECODE: 0.041

7)      DELIVERY_WEIGHT_RECODE: 0.041

8)      MOTHERS_HEIGHT_IN_TOTAL_INCHES: 0.033

9)      PRE_PREGNANCY_WEIGHT_RECODE: 0.031

10)      MOTHERS_BMI_RECODE: 0.025

