# Creating target data files
### NOTE: This notebook does not scale or normalize the data; that is done in CSL_create_targets_NORM.ipynb.

In [1]:
import pandas as pd
import numpy as np
import os
from IPython.display import display, HTML

In [2]:
# Render at most 30 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 30)

# Note: this file name currently assumes the symbolic link has been updated.
fields_csv_path = '../data/csl/MFM_CSL_d6_fields.csv'
df = pd.read_csv(fields_csv_path, index_col=0)  # first CSV column becomes the index

# Quick sanity check: dimensions followed by the first few rows.
print(df.shape)
display(df.head())

(185413, 210)


Unnamed: 0_level_0,trans_loss,transfus_yes,transfus_all,transfus_hyster,MomNearMiss,Hysterectomy,Momdeath,MomICU,Postbleed,high_MomLOS,Bloodproduct,Posttransfus,EBLoss,high_EBLoss,hemorrhage,...,UnspecHBP,Urupture,uscar,version9,vertex,Activeherpes,iufd9,R_preg,high_BMI,high_Age,high_Gravidity,high_height,intra_abruptio,intra_previa,pre_PROM
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
41-00002,0,0,0,0,0,0,0,0,0,3,0,0,350.0,0,0,...,0,0,1,0,9,0,0,1,0,0,2,2,0,0,0
41-00003,0,0,0,0,0,0,0,0,0,2,0,0,250.0,0,0,...,0,0,0,0,1,0,0,1,0,0,3,2,0,0,0
41-00004,0,0,0,0,0,0,0,0,0,1,0,0,250.0,0,0,...,0,0,0,0,1,0,0,1,0,0,4,3,0,0,0
41-00005,1,1,1,1,0,0,0,0,0,2,0,1,350.0,0,1,...,0,0,0,0,1,0,0,1,0,0,3,2,0,0,0
41-00006,0,0,0,0,0,0,0,0,0,2,0,0,450.0,0,0,...,0,0,0,0,1,0,0,1,0,0,1,3,0,0,0


In [3]:
# Drop all extra targets and collinear data fields.
# trans_loss and transfus_yes are not in this list; later cells read them from df.
drop_list = [
    'transfus_all', 'transfus_hyster',
    'MomNearMiss', 'Hysterectomy', 'Momdeath', 'MomICU', 'Postbleed',
    'high_MomLOS', 'Bloodproduct', 'Posttransfus', 'EBLoss', 'high_EBLoss',
    'hemorrhage', 'postpartum_hemorrhage9',
]

# Only drop columns that are still present, so re-running this cell on an
# already-trimmed df does not raise KeyError. Anything missing is reported
# rather than silently ignored, so a typo in drop_list is still visible.
missing_cols = [col for col in drop_list if col not in df.columns]
if missing_cols:
    print(f'Columns not found in df (already dropped?): {missing_cols}')

# Reassign instead of inplace=True: same resulting frame, no in-place mutation.
df = df.drop(columns=[col for col in drop_list if col in df.columns])
display(df.head())

Unnamed: 0_level_0,trans_loss,transfus_yes,Accrete,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,Admreason,AdmSBP,Alcohol,Analgesia,Anteabruption,...,UnspecHBP,Urupture,uscar,version9,vertex,Activeherpes,iufd9,R_preg,high_BMI,high_Age,high_Gravidity,high_height,intra_abruptio,intra_previa,pre_PROM
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
41-00002,0,0,0,8,8,8,99.0,83,0.0,77,1,150.0,0,7,0,...,0,0,1,0,9,0,0,1,0,0,2,2,0,0,0
41-00003,0,0,0,6,3,2,2.9,77,70.0,1,3,105.0,0,8,0,...,0,0,0,0,1,0,0,1,0,0,3,2,0,0,0
41-00004,0,0,0,5,3,2,3.3,65,70.0,1,3,122.0,0,8,0,...,0,0,0,0,1,0,0,1,0,0,4,3,0,0,0
41-00005,1,1,0,11,2,3,4.0,77,80.0,77,4,138.0,0,8,0,...,0,0,0,0,1,0,0,1,0,0,3,2,0,0,0
41-00006,0,0,0,11,1,3,5.0,98,100.0,77,4,134.0,0,8,0,...,0,0,0,0,1,0,0,1,0,0,1,3,0,0,0


## Break up file into groups of columns used for different versions of output files 

In [4]:
# Keep the trans_loss and transfus_yes columns as Series for use in later cells
tl_series, ty_series = df['trans_loss'], df['transfus_yes']

In [5]:
# Load the table that assigns every variable code a "timing" label
# (pre, intra, post, etc). The file is tab-delimited.
varTimingPath = '../data/csl/V3_d5_code_timing.csv'
var_times_df = pd.read_csv(
    varTimingPath,
    delimiter='\t',
)
var_times_df

Unnamed: 0,Code,timing
0,delivery,drop
1,fever9,drop
2,ga,drop
3,Lac_Unkn,drop
4,onepregflag,index
...,...,...
245,MomID,target
246,Postbleed,target
247,postpartum_hemorrhage9,target
248,Posttransfus,target


In [6]:
# Split df by timing label: columns whose code is labelled "pre" ...
is_pre = var_times_df['timing'] == 'pre'
pre_codes = var_times_df.loc[is_pre, 'Code'].to_list()
pre_df = df[pre_codes]
display(pre_df)

# ... and columns whose code is labelled "intra".
is_intra = var_times_df['timing'] == 'intra'
intra_codes = var_times_df.loc[is_intra, 'Code'].to_list()
intra_df = df[intra_codes]
display(intra_df)

Unnamed: 0_level_0,Activeherpes,Admreason,Alcohol,Anteabruption,Anteanemia,Anteasthma,Antebleed3,AnteCHBP,Antechorio,Antefetaldth,Antefetdistress,AnteGBS,AnteGDM,Antehospital,AnteLGA,...,Preeclampsia,prelaborCD,prim_hypo,R_preg,renal_disease_comb,renal_disease9,Rhincompat,Smoke,TD_nos,ThreatenedPB,threatpb9,TTTwin,UnspecHBP,uscar,version9
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
41-00002,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
41-00003,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
41-00004,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
41-00005,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
41-00006,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52-07247,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
52-07248,0,2,0,1,0,0,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
52-07249,0,5,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0
52-07250,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0


Unnamed: 0_level_0,Accrete,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,AdmSBP,Analgesia,Augment,BESTGA,breech,breech9,chorio,...,Meconium,Momseizure,MthInd_AROM,MthInd_Oxy,Operative,Presentdel,Prolapse,ROM,ROMmeth,SE_pre,Shoulder,spontlabor,TrialLabor,Urupture,vertex
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
41-00002,0,8,8,8,99.0,83,0.0,77,150.0,7,1,38.2,0,0,0,...,1,0,0,0,0,88,0,0,1,0,0,1,0,0,9
41-00003,0,6,3,2,2.9,77,70.0,1,105.0,8,88,39.0,0,0,0,...,1,0,1,1,0,1,0,0,1,0,0,0,1,0,1
41-00004,0,5,3,2,3.3,65,70.0,1,122.0,8,88,39.1,0,0,0,...,1,0,1,1,0,1,0,0,1,0,0,0,1,0,1
41-00005,0,11,2,3,4.0,77,80.0,77,138.0,8,88,38.4,0,0,0,...,4,0,0,0,0,1,0,0,2,0,0,1,1,0,1
41-00006,0,11,1,3,5.0,98,100.0,77,134.0,8,88,39.6,0,0,0,...,1,0,0,0,0,1,0,0,2,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52-07247,0,8,8,8,77.0,74,70.0,1,124.4,5,4,37.1,0,0,0,...,5,0,0,1,1,1,0,0,1,0,1,0,1,0,1
52-07248,0,8,8,8,99.0,74,0.0,2,124.4,5,0,24.6,1,1,0,...,1,0,0,1,0,2,0,0,8,0,0,0,1,0,0
52-07249,0,8,8,8,66.0,74,0.0,77,124.4,9,0,28.7,0,0,0,...,1,0,0,0,0,88,0,0,2,0,0,0,0,0,9
52-07250,0,8,8,8,66.0,74,90.0,1,124.4,5,0,37.4,0,0,0,...,1,0,0,0,0,1,0,0,1,0,0,1,1,0,1


In [7]:
# transfus_yes + intra + pre: intra columns first, then pre columns,
# with transfus_yes placed in the first column position.
ty_PI = pd.concat((intra_df, pre_df), axis='columns')
ty_PI.insert(loc=0, column='transfus_yes', value=ty_series)

print(ty_PI.shape)
ty_PI.head()

(185413, 195)


Unnamed: 0_level_0,transfus_yes,Accrete,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,AdmSBP,Analgesia,Augment,BESTGA,breech,breech9,...,Preeclampsia,prelaborCD,prim_hypo,R_preg,renal_disease_comb,renal_disease9,Rhincompat,Smoke,TD_nos,ThreatenedPB,threatpb9,TTTwin,UnspecHBP,uscar,version9
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
41-00002,0,0,8,8,8,99.0,83,0.0,77,150.0,7,1,38.2,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
41-00003,0,0,6,3,2,2.9,77,70.0,1,105.0,8,88,39.0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
41-00004,0,0,5,3,2,3.3,65,70.0,1,122.0,8,88,39.1,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
41-00005,1,0,11,2,3,4.0,77,80.0,77,138.0,8,88,38.4,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
41-00006,0,0,11,1,3,5.0,98,100.0,77,134.0,8,88,39.6,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0


In [8]:
# transfus_yes + pre columns only.
# Work on an explicit copy so that insert() below does not modify pre_df itself;
# .copy() states that intent directly instead of concatenating a single frame.
ty_Pre = pre_df.copy()
ty_Pre.insert(0, 'transfus_yes', ty_series)  # transfus_yes goes in the first column
print(ty_Pre.shape)
ty_Pre.head()

(185413, 120)


Unnamed: 0_level_0,transfus_yes,Activeherpes,Admreason,Alcohol,Anteabruption,Anteanemia,Anteasthma,Antebleed3,AnteCHBP,Antechorio,Antefetaldth,Antefetdistress,AnteGBS,AnteGDM,Antehospital,...,Preeclampsia,prelaborCD,prim_hypo,R_preg,renal_disease_comb,renal_disease9,Rhincompat,Smoke,TD_nos,ThreatenedPB,threatpb9,TTTwin,UnspecHBP,uscar,version9
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
41-00002,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
41-00003,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
41-00004,0,0,3,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
41-00005,1,0,4,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
41-00006,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0


In [9]:
# trans_loss + intra + pre: intra columns first, then pre columns,
# with trans_loss placed in the first column position.
tl_PI = pd.concat((intra_df, pre_df), axis='columns')
tl_PI.insert(loc=0, column='trans_loss', value=tl_series)

print(tl_PI.shape)
tl_PI.head()

(185413, 195)


Unnamed: 0_level_0,trans_loss,Accrete,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,AdmSBP,Analgesia,Augment,BESTGA,breech,breech9,...,Preeclampsia,prelaborCD,prim_hypo,R_preg,renal_disease_comb,renal_disease9,Rhincompat,Smoke,TD_nos,ThreatenedPB,threatpb9,TTTwin,UnspecHBP,uscar,version9
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
41-00002,0,0,8,8,8,99.0,83,0.0,77,150.0,7,1,38.2,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
41-00003,0,0,6,3,2,2.9,77,70.0,1,105.0,8,88,39.0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
41-00004,0,0,5,3,2,3.3,65,70.0,1,122.0,8,88,39.1,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
41-00005,1,0,11,2,3,4.0,77,80.0,77,138.0,8,88,38.4,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
41-00006,0,0,11,1,3,5.0,98,100.0,77,134.0,8,88,39.6,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0


In [10]:
# trans_loss + pre columns only.
# Work on an explicit copy so that insert() below does not modify pre_df itself;
# .copy() states that intent directly instead of concatenating a single frame.
tl_Pre = pre_df.copy()
tl_Pre.insert(0, 'trans_loss', tl_series)  # trans_loss goes in the first column
print(tl_Pre.shape)
tl_Pre.head()

(185413, 120)


Unnamed: 0_level_0,trans_loss,Activeherpes,Admreason,Alcohol,Anteabruption,Anteanemia,Anteasthma,Antebleed3,AnteCHBP,Antechorio,Antefetaldth,Antefetdistress,AnteGBS,AnteGDM,Antehospital,...,Preeclampsia,prelaborCD,prim_hypo,R_preg,renal_disease_comb,renal_disease9,Rhincompat,Smoke,TD_nos,ThreatenedPB,threatpb9,TTTwin,UnspecHBP,uscar,version9
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
41-00002,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0
41-00003,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
41-00004,0,0,3,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
41-00005,1,0,4,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
41-00006,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0


### Use the "overwrite" variable to control whether the output files are written
### @todo - Compute the current date and use it in the output filenames, so that each run creates new files instead of reusing a hard-coded date.

In [11]:
# Gate for the export cells below: each CSV is written only when this is True;
# when False, each export cell just prints the filename it did not write.
overwrite = False

In [12]:
# Export the transfus_yes "pre_intra" frame (ty_PI), unless writing is disabled
filename = '../data/csl/MFM_CSL_d6_transfus_yes_Pre-Intra_2021-03-26.csv'
if not overwrite:
    # Writing is switched off: only report which file was skipped.
    print(f'File not written: {filename}')
else:
    ty_PI.to_csv(filename, header=True)

In [13]:
# Export the transfus_yes "pre"-only frame (ty_Pre), unless writing is disabled
filename = '../data/csl/MFM_CSL_d6_transfus_yes_Pre_2021-03-26.csv'
if not overwrite:
    # Writing is switched off: only report which file was skipped.
    print(f'File not written: {filename}')
else:
    ty_Pre.to_csv(filename, header=True)

In [14]:
# Export the trans_loss "pre_intra" frame (tl_PI), unless writing is disabled
filename = '../data/csl/MFM_CSL_d6_trans_loss_Pre-Intra_2021-03-26.csv'
if not overwrite:
    # Writing is switched off: only report which file was skipped.
    print(f'File not written: {filename}')
else:
    tl_PI.to_csv(filename, header=True)

In [15]:
# Export the trans_loss "pre"-only frame (tl_Pre), unless writing is disabled
filename = '../data/csl/MFM_CSL_d6_trans_loss_Pre_2021-03-26.csv'
if not overwrite:
    # Writing is switched off: only report which file was skipped.
    print(f'File not written: {filename}')
else:
    tl_Pre.to_csv(filename, header=True)