# Train Test Validation Split


### Importing the big volume of data

In [6]:
import pandas as pd

# features identified that may be useful
usecols = [
    # outcome variable
    "matchs",

    # maternal demographics
    "mager9",         # mother's age (9-category)
    "mar",            # marital status
    "meduc",          # mother's education
    "mbrace",         # mother's bridged race
    "umhisp",         # mother's Hispanic origin

    # obstetric history & prenatal care
    "lbo",            # live birth order
    "tbo",            # total birth order
    "precare_rec",    # month prenatal care began (recode)
    "previs_rec",     # number of prenatal visits (recode)
    "wtgain_rec",     # gestational weight gain (recode)

    # maternal health risks
    "urf_diab",       # diabetes
    "urf_chyper",     # chronic hypertension
    "urf_phyper",     # pregnancy-associated hypertension
    "urf_eclam",      # eclampsia
    "rf_ppterm",      # previous preterm birth
    "rf_ppoutc",      # poor previous pregnancy outcome
    "rf_cesar",       # previous cesarean
    "rf_ncesar",      # number of previous cesareans

    # infant characteristics at birth
    "sex",            # sex of infant
    "combgest",       # gestation (weeks, detailed)
    "gestrec10",      # gestation recode 10
    "dbwt",           # birth weight (grams)
    "bwtr14",         # birth weight recode 14
    "apgar5r",        # 5-minute Apgar recode
    "dplural",        # plurality (singleton/twins/etc.)

    # neonatal complications / interventions
    "ab_vent",        # assisted ventilation
    "ab_vent6",       # assisted ventilation > 6 hrs
    "ab_nicu",        # NICU admission

    # congenital anomalies
    "ca_anen",
    "ca_menin",
    "ca_heart",
    "ca_ompha",
    "ca_gastro",
    "ca_limb",
    "ca_cleftlp",
    "ca_cleft",
    "ca_downs",
    "ca_chrom",
]

df = pd.read_csv("linkco2013us_den.csv", usecols=usecols, low_memory=False)
df.to_csv("InfantMortality2013_Narrowed.csv", index=False)
df

Unnamed: 0,mager9,mbrace,umhisp,mar,meduc,lbo,tbo,precare_rec,previs_rec,wtgain_rec,...,ca_menin,ca_heart,ca_ompha,ca_gastro,ca_limb,ca_cleftlp,ca_cleft,ca_downs,ca_chrom,matchs
0,3,1,1,1,3,2,9,1,5,1,...,N,N,N,N,N,N,N,N,N,2
1,4,21,0,1,4,4,5,1,5,2,...,N,N,N,N,N,N,N,N,N,2
2,4,1,0,1,6,1,1,1,7,4,...,N,N,N,N,N,N,N,N,N,2
3,5,1,0,1,6,3,5,2,5,3,...,N,N,N,N,N,N,N,N,N,2
4,3,3,0,2,2,1,1,1,9,1,...,N,N,N,N,N,N,N,N,N,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1832763,6,1,0,1,7,1,1,1,9,3,...,N,N,N,N,N,N,N,N,N,2
1832764,4,1,0,2,2,2,2,1,8,4,...,N,N,N,N,N,N,N,N,N,2
1832765,6,2,0,2,6,4,4,1,6,5,...,N,N,N,N,N,N,N,N,N,2
1832766,4,1,0,1,6,1,1,2,4,5,...,N,N,N,N,N,N,N,N,N,2


In [7]:
df["matchs"] = pd.to_numeric(df["matchs"], errors="coerce")
print(df["matchs"].value_counts(dropna=False))

matchs
2.0       1822352
1.0          9846
NaN           509
0.0            38
3.0             4
4.0             4
8.0             4
5.0             2
39.0            2
2012.0          2
12.0            1
6.0             1
9.0             1
11.0            1
38.0            1
Name: count, dtype: int64


### Dropping features that may lead to a label leak (i.e. cause of death)
- dont want the model to learn that certain features are tied to infant mortality for certain

In [8]:
df = df[df["matchs"].isin([1.0, 2.0])]
df["infant_death"] = (df["matchs"] == 1).astype(int)

must_drop_columns = [
    # death timing / age
    "aged",
    "ager5",
    "ager22",
    # circumstances of death
    "manner",
    "dispo",
    "autopsy",
    "place",
    # cause of death codes
    "ucod",
    "ucod130",
    # infant id
    "idnumber",
    "recwt",
]
df = df.drop(columns=[c for c in must_drop_columns if c in df.columns], inplace=False)

# drop columns wiht more than 80% missing data
n_rows = len(df)
missing_frac = df.isnull().mean()
missing_data_columns = missing_frac[missing_frac > 0.8].index
df = df.drop(columns=missing_data_columns)
df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["infant_death"] = (df["matchs"] == 1).astype(int)


Unnamed: 0,mager9,mbrace,umhisp,mar,meduc,lbo,tbo,precare_rec,previs_rec,wtgain_rec,...,ca_heart,ca_ompha,ca_gastro,ca_limb,ca_cleftlp,ca_cleft,ca_downs,ca_chrom,matchs,infant_death
0,3,1,1,1,3,2,9,1,5,1,...,N,N,N,N,N,N,N,N,2.0,0
1,4,21,0,1,4,4,5,1,5,2,...,N,N,N,N,N,N,N,N,2.0,0
2,4,1,0,1,6,1,1,1,7,4,...,N,N,N,N,N,N,N,N,2.0,0
3,5,1,0,1,6,3,5,2,5,3,...,N,N,N,N,N,N,N,N,2.0,0
4,3,3,0,2,2,1,1,1,9,1,...,N,N,N,N,N,N,N,N,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1832762,3,2,0,2,2,2,2,1,9,4,...,N,N,N,N,N,N,N,N,2.0,0
1832763,6,1,0,1,7,1,1,1,9,3,...,N,N,N,N,N,N,N,N,2.0,0
1832764,4,1,0,2,2,2,2,1,8,4,...,N,N,N,N,N,N,N,N,2.0,0
1832765,6,2,0,2,6,4,4,1,6,5,...,N,N,N,N,N,N,N,N,2.0,0


### Prepare features and target variable

In [9]:
y = df["infant_death"]
X = df.drop(columns=["infant_death"])

### Train test Validation

In [10]:
from sklearn.model_selection import train_test_split
X_main, X_test, y_main, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_main, y_main, test_size=0.25, random_state=42, stratify=y_main)



### output as csv

In [11]:
train_df = X_train.copy()
train_df["infant_death"] = y_train.values

val_df = X_val.copy()
val_df["infant_death"] = y_val.values

test_df = X_test.copy()
test_df["infant_death"] = y_test.values

train_df.to_csv("nvss_train.csv", index=False)
val_df.to_csv("nvss_val.csv", index=False)
test_df.to_csv("nvss_test.csv", index=False)
