## CSL_d5_to_d6
This notebook takes the "data5" datafile, cleans up the text data, and removes numerous variables that are not considered useful. It also reduces the precision of the "BESTGA" field to 2 decimal places. The resulting dataset is then saved as "data6".

In [1]:
import sys
import os
import pandas as pd
import numpy as np

In [2]:
# "Original" df5 version from Laritza
# Work from the dataset directory so the relative paths used later in the notebook resolve
data_dir = '/MFMDatasets/MFM_bopf/data/csl'
os.chdir(data_dir)
# The first CSV column is used as the row index
csl = pd.read_csv('MFM_data5.csv', index_col=0)

In [3]:
# Render at most 30 columns per frame, then preview the first rows
pd.set_option('display.max_columns', 30)
csl.head(5)

Unnamed: 0,MomID,Sitenum,Momdeath,MomICU,Bloodproduct,Posttransfus,onepregflag,AB_any,AB_ceph,AB_clind,AB_doxy,AB_eryth,AB_flagyl,AB_gent,AB_peni,...,Activeherpes,ga,hie_new1,ich_new1,iufd9,myocardio_new1,R_preg,Outcome1,high_BMI,high_Age,transfus_yes,high_EBLoss,high_Gravidity,high_MomLOS,high_height
1,b'41-00002',b'41',1,1,1,1,1,1,1,1,1,1,1,1,1,...,1,38,1,1,0,1,1,1,0,0,1,0,2,3,2.0
2,b'41-00003',b'41',1,1,1,1,1,1,1,1,1,1,1,1,1,...,1,39,1,1,0,1,1,1,0,0,1,0,3,2,2.0
3,b'41-00004',b'41',1,1,1,1,1,1,1,1,1,1,1,1,1,...,1,39,1,1,0,1,1,1,0,0,1,0,4,1,3.0
4,b'41-00005',b'41',1,1,1,2,1,1,1,1,1,1,1,1,1,...,1,38,1,1,0,1,1,1,0,0,2,0,3,2,2.0
5,b'41-00006',b'41',1,1,1,1,1,1,1,1,1,1,1,1,1,...,1,39,1,1,0,1,1,1,0,0,1,0,1,2,3.0


# Remove extraneous text characters

In [4]:
# Remove the leftover bytes-literal wrapper from the ID strings,
# e.g. "b'41-00002'" -> "41-00002".
# The pattern is anchored to the start/end of each value so only the wrapper (an optional
# leading b plus the surrounding quotes) is removed; an unanchored replace of 'b' would
# also delete any legitimate "b" inside an identifier.
# regex=True is explicit because the default of Series.str.replace differs between
# pandas versions.
# Had to use .loc[row, col] to avoid warning.
for id_col in ('MomID', 'Sitenum'):
    csl.loc[:, id_col] = csl[id_col].str.replace(r"^b?'|'$", '', regex=True)

In [5]:
# Make MomID the row index, then preview the result
csl = csl.set_index('MomID')
csl.head(5)

Unnamed: 0_level_0,Sitenum,Momdeath,MomICU,Bloodproduct,Posttransfus,onepregflag,AB_any,AB_ceph,AB_clind,AB_doxy,AB_eryth,AB_flagyl,AB_gent,AB_peni,AB_vanc,...,Activeherpes,ga,hie_new1,ich_new1,iufd9,myocardio_new1,R_preg,Outcome1,high_BMI,high_Age,transfus_yes,high_EBLoss,high_Gravidity,high_MomLOS,high_height
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
41-00002,41,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...,1,38,1,1,0,1,1,1,0,0,1,0,2,3,2.0
41-00003,41,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...,1,39,1,1,0,1,1,1,0,0,1,0,3,2,2.0
41-00004,41,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...,1,39,1,1,0,1,1,1,0,0,1,0,4,1,3.0
41-00005,41,1,1,1,2,1,1,1,1,1,1,1,1,1,1,...,1,38,1,1,0,1,1,1,0,0,2,0,3,2,2.0
41-00006,41,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...,1,39,1,1,0,1,1,1,0,0,1,0,1,2,3.0


In [6]:
# onepregflag is constant across all rows, so it needs to be dropped;
# its value counts are printed first as evidence.
onepreg_counts = csl['onepregflag'].value_counts()
print(onepreg_counts)
csl = csl.drop(columns='onepregflag')

1    185413
Name: onepregflag, dtype: int64


In [7]:
# Sitenum isn't relevant, so it needs to be dropped;
# the per-site record counts are printed before the column is removed.
site_counts = csl['Sitenum'].value_counts()
print(site_counts)
csl = csl.drop(columns='Sitenum')

41    42856
49    24275
44    16534
48    16088
51    16060
50    13868
47    13434
45    13246
43     9665
46     6913
52     6616
42     5858
Name: Sitenum, dtype: int64


In [8]:
# Can't have any NaN values or many algorithms will puke.
# Compute one has-NaN flag per column, then report the offending columns in column order.
nan_flags = csl.isna().any()
for col, has_nan in nan_flags.items():
    if has_nan:
        print(f'{col} ***** COLUMN HAS NaN values ****')

In [9]:
# Check for any variables that only have 1 value (NaN is not counted) and drop them.
# The constant columns are collected first and removed in a single call, so the frame
# is never modified while its columns are being iterated.
constant_cols = [col for col in csl if csl[col].nunique() == 1]
for col in constant_cols:
    print(f'{col} has only 1 value: dropping')
csl.drop(columns=constant_cols, inplace=True)

TTTwin has only 1 value: dropping
R_preg has only 1 value: dropping


# Move 2-value categoricals to [0, 1]

In [10]:
# Make all 2-value categoricals (0,1)
# Known two-level codings, keyed by the SORTED pair of observed values. value_counts()
# orders its index by frequency, so matching on the sorted pair makes the result
# independent of which level happens to be more common in a column.
# A value of None means the column is already coded (0,1) and is left unchanged.
binary_codings = {
    (0, 1): None,     # Already (0,1)
    (1, 2): [1, 2],
    (1, 5): [1, 5],   # delivery
    (1, 3): [1, 3],   # HosEpitype
}
csl_mod = csl.copy()
for col in csl:
    cnts = csl[col].value_counts()   # NaN is not counted as a level
    if cnts.index.shape[0] == 2:
        levels = tuple(sorted(cnts.index))
        if levels not in binary_codings:
            # Unknown coding: report it and leave the column untouched
            print(f'Unexpected 2-value column = {col}')
            print(cnts.index.values)
            continue
        old_codes = binary_codings[levels]
        if old_codes is not None:
            csl_mod[col] = csl[col].replace(old_codes, [0, 1])
    elif col == 'Inoxy_incrdose':   # Inoxy_incrdose
        # Multi-level column: shrink the 555/666/777 codes to single digits
        csl_mod[col] = csl[col].replace([1,2,555,666,777], [0,1,5,6,7])
csl_mod

Unnamed: 0_level_0,Momdeath,MomICU,Bloodproduct,Posttransfus,AB_any,AB_ceph,AB_clind,AB_doxy,AB_eryth,AB_flagyl,AB_gent,AB_peni,AB_vanc,abruptio9,abruption,...,Woundsep,Activeherpes,ga,hie_new1,ich_new1,iufd9,myocardio_new1,Outcome1,high_BMI,high_Age,transfus_yes,high_EBLoss,high_Gravidity,high_MomLOS,high_height
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
41-00002,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,38,0,0,0,0,1,0,0,0,0,2,3,2.0
41-00003,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,39,0,0,0,0,1,0,0,0,0,3,2,2.0
41-00004,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,39,0,0,0,0,1,0,0,0,0,4,1,3.0
41-00005,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,38,0,0,0,0,1,0,0,1,0,3,2,2.0
41-00006,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,39,0,0,0,0,1,0,0,0,0,1,2,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52-07247,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,37,0,0,0,0,3,0,3,0,0,1,2,2.0
52-07248,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,24,0,0,1,0,3,0,0,0,0,2,2,3.0
52-07249,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,28,0,0,1,0,3,0,0,1,3,3,3,2.0
52-07250,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,37,0,0,1,0,5,0,3,0,0,1,2,2.0


## Create new abruption variable

In [11]:
# Print the value distribution of every column whose name contains 'abrupt'
abrupt_cols = [name for name in csl_mod.columns if 'abrupt' in name]
for col in abrupt_cols:
    print(f'{col}.value_counts(): \n{csl_mod[col].value_counts().to_string()}')

abruptio9.value_counts(): 
0    183220
1      2193
abruption.value_counts(): 
0    183141
1      2272
Anteabruption.value_counts(): 
0    183707
1      1706
CS_Pabrupt.value_counts(): 
0    185041
1       372
Intraabrupt.value_counts(): 
0    184179
1      1234


In [12]:
# intra_abruptio is 1 when any of the listed abruption indicators equals 1, else 0
abruption_sources = ['abruptio9', 'abruption', 'CS_Pabrupt', 'Intraabrupt']
any_abruption = csl_mod[abruption_sources].eq(1).any(axis=1)
csl_mod['intra_abruptio'] = any_abruption.astype('int64')
csl_mod['intra_abruptio'].value_counts()

0    182373
1      3040
Name: intra_abruptio, dtype: int64

## Create new Previa variable

In [13]:
# Print the value distribution of every column whose name contains 'revia'
# (the substring matches both 'Previa' and 'previa' spellings)
previa_cols = [name for name in csl_mod.columns if 'revia' in name]
for col in previa_cols:
    print(f'{col}.value_counts(): \n{csl_mod[col].value_counts().to_string()}')

Anteprevia.value_counts(): 
0    184325
1      1088
CS_Pprevia.value_counts(): 
0    184818
1       595
Previa.value_counts(): 
0    185042
1       371
previa_chart.value_counts(): 
0    184304
1      1109
previa9.value_counts(): 
0    184426
1       987


In [14]:
# intra_previa is 1 when any of the listed previa indicators equals 1, else 0
previa_sources = ['previa9', 'Previa', 'CS_Pprevia', 'previa_chart']
any_previa = csl_mod[previa_sources].eq(1).any(axis=1)
csl_mod['intra_previa'] = any_previa.astype('int64')
csl_mod['intra_previa'].value_counts()

0    184059
1      1354
Name: intra_previa, dtype: int64

In [15]:
# Re-list the 'revia' columns; the listing now also picks up the new intra_previa column
previa_cols_after = [name for name in csl_mod.columns if 'revia' in name]
for col in previa_cols_after:
    print(f'{col}.value_counts(): \n{csl_mod[col].value_counts().to_string()}')

Anteprevia.value_counts(): 
0    184325
1      1088
CS_Pprevia.value_counts(): 
0    184818
1       595
Previa.value_counts(): 
0    185042
1       371
previa_chart.value_counts(): 
0    184304
1      1109
previa9.value_counts(): 
0    184426
1       987
intra_previa.value_counts(): 
0    184059
1      1354


## Create new PROM variable

In [16]:
# Print the value distribution of every column whose name contains 'prom' or 'PROM'
prom_cols = [name for name in csl_mod.columns if any(tag in name for tag in ('prom', 'PROM'))]
for col in prom_cols:
    print(f'{col}.value_counts(): \n{csl_mod[col].value_counts().to_string()}')

Ind_PROM.value_counts(): 
0    183795
1      1618
PPROM_new.value_counts(): 
0    181028
1      4385
PROM.value_counts(): 
0    175198
1     10215
PROM_new.value_counts(): 
0    172419
1     12994
prom9.value_counts(): 
0    177673
1      7740


In [17]:
# pre_PROM is 1 when any of the listed PROM indicators equals 1, else 0
prom_sources = ['prom9', 'PROM', 'PROM_new', 'PPROM_new', 'Ind_PROM']
any_prom = csl_mod[prom_sources].eq(1).any(axis=1)
csl_mod['pre_PROM'] = any_prom.astype('int64')
csl_mod['pre_PROM'].value_counts()

0    169824
1     15589
Name: pre_PROM, dtype: int64

## Remove all "undesirable" variables

In [18]:
# Load the tab-separated table that labels each variable code with a "timing"
# category (pre, intra, post, etc)
pd.set_option('display.max_rows', None)   # remove the row limit so the table is shown in full
varTimingPath = './V3_d5_code_timing.csv'
var_times_df = pd.read_csv(varTimingPath, sep='\t')
var_times_df

Unnamed: 0,Code,timing
0,delivery,drop
1,fever9,drop
2,ga,drop
3,Lac_Unkn,drop
4,onepregflag,index
5,Accrete,intra
6,AdmBishop,intra
7,Admcervpos,intra
8,Admconsistency,intra
9,Admcontract,intra


In [19]:
# Collect the codes whose timing is 'drop', 'post', or 'merged'; removing them leaves
# the pre, intra, and target variables.
# NOTE: most of the targets will be deleted in the next step.
unwanted_timings = ['drop', 'post', 'merged']
is_unwanted = var_times_df['timing'].isin(unwanted_timings)
drop_list = var_times_df.loc[is_unwanted, 'Code'].tolist()
drop_list

['delivery',
 'fever9',
 'ga',
 'Lac_Unkn',
 'abruptio9',
 'abruption',
 'CS_Pabrupt',
 'Intraabrupt',
 'CS_Pprevia',
 'Previa',
 'previa_chart',
 'previa9',
 'Ind_PROM',
 'PPROM_new',
 'PROM',
 'PROM_new',
 'prom9',
 'AB_any',
 'AB_ceph',
 'AB_clind',
 'AB_doxy',
 'AB_eryth',
 'AB_flagyl',
 'AB_gent',
 'AB_peni',
 'AB_vanc',
 'Endometritis',
 'hie_new1',
 'Hyp_diur',
 'ich_new1',
 'Ileus',
 'Ligation',
 'myocardio_new1',
 'Outcome1',
 'Postfever',
 'Postpulembol',
 'Postthrombosis',
 'Pulmemb_new',
 'pulmonary_embolism9',
 'S_pre',
 'Woundinf',
 'Woundsep']

In [20]:
# Remove every column named in drop_list, then preview what is left
csl_mod = csl_mod.drop(columns=drop_list)
csl_mod.head(5)

Unnamed: 0_level_0,Momdeath,MomICU,Bloodproduct,Posttransfus,Accrete,AdmBishop,Admcervpos,Admconsistency,Admcontract,AdmDBP,Admefface,Admpresent,Admreason,AdmSBP,Alcohol,...,uscar,version9,vertex,Activeherpes,iufd9,high_BMI,high_Age,transfus_yes,high_EBLoss,high_Gravidity,high_MomLOS,high_height,intra_abruptio,intra_previa,pre_PROM
MomID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
41-00002,0,0,0,0,0,8,8,8,99.0,83,0.0,77,1,150.0,0,...,1,0,9,0,0,0,0,0,0,2,3,2.0,0,0,0
41-00003,0,0,0,0,0,6,3,2,2.9,77,70.0,1,3,105.0,0,...,0,0,1,0,0,0,0,0,0,3,2,2.0,0,0,0
41-00004,0,0,0,0,0,5,3,2,3.3,65,70.0,1,3,122.0,0,...,0,0,1,0,0,0,0,0,0,4,1,3.0,0,0,0
41-00005,0,0,0,1,0,11,2,3,4.0,77,80.0,77,4,138.0,0,...,0,0,1,0,0,0,0,1,0,3,2,2.0,0,0,0
41-00006,0,0,0,0,0,11,1,3,5.0,98,100.0,77,4,134.0,0,...,0,0,1,0,0,0,0,0,0,1,2,3.0,0,0,0


In [21]:
from datetime import datetime

# Switch that controls whether the save cell actually writes files
out_flag = False

# Today's date as YYYY-MM-DD, used to stamp the output filename
datestr = datetime.today().date().isoformat()

In [22]:
# Save new dataset as a date-stamped "MFM_CSL_d6_<date>.csv" and point the
# "CSL_d6.csv" symlink at it.
filename = './MFM_CSL_d6_' + datestr + '.csv'
linkname = './CSL_d6.csv'
# NOTE: only write out files if out_flag = True
if out_flag:
    csl_mod.to_csv(filename)
    # Remove a previous link only when one is present: on a first run there is nothing
    # to remove and an unconditional os.remove() raises FileNotFoundError.
    # lexists() is used rather than exists() so a dangling symlink is also replaced.
    if os.path.lexists(linkname):
        os.remove(linkname)
    os.symlink(filename, linkname)
else:
    print("out_flag False: File not written")

out_flag False: File not written
