 # part 4: descriptive statistics and bariclot calculation

 * create summary statistics of the entire dataset using `tableone`
 * compare training, validation, and testing sets for both leak and clot
 * calculate bariclot score for each patient in the clot test set

 this will generate

 * a `results` directory
 * a `results/descriptive_stats` subdirectory - holds all descriptive data generated by tableone
 * a `results/bariclot` subdirectory - holds bariclot results for the test cohort. we also produce a sanity check for bariclot on 2015 data to make sure it performs as expected

 preliminaries

In [0]:
import pandas as pd
import numpy as np
import python_modules.constants as constants
import os
from tableone import TableOne



In [0]:
# Fix NumPy's global random state so any stochastic step below is repeatable.
SEED = 1872
np.random.seed(SEED)



In [0]:
# Maximum number of rows pandas shows before truncating a displayed frame.
# The full option name is used: abbreviated names only work through pandas'
# partial matching, which can break if a similarly named option is added.
pd.set_option('display.max_rows', 100)

# Maximum number of columns pandas shows before truncating a displayed frame.
pd.set_option('display.max_columns', 50)



In [0]:
# Input location and output locations used throughout the notebook.
PATH_IMPORT = 'study_data/'
PATH_RESULTS = 'results/'
PATH_DESCRIPTIVE = 'results/descriptive_stats/'
PATH_BARICLOT = 'results/bariclot/'


# Make the directories that hold the outputs. exist_ok=True keeps this cell
# idempotent: re-running the notebook no longer fails when the folders are
# already present (os.mkdir raised FileExistsError in that case).
for output_dir in (PATH_RESULTS, PATH_DESCRIPTIVE, PATH_BARICLOT):
    os.makedirs(output_dir, exist_ok=True)


 import data, specify variables to be included in table 1

In [0]:
# Load the split study data. The first CSV column becomes the row index, and
# low_memory=False lets pandas infer each column's dtype from the whole file.
# os.path.join avoids the doubled separator that f'{PATH_IMPORT}/...' produced
# (PATH_IMPORT already ends with '/') and works with or without a trailing slash.
df_main = pd.read_csv(
    os.path.join(PATH_IMPORT, 'study_data_split.csv'),
    low_memory=False,
    index_col=0,
)



In [0]:
# Legacy variable selection, disabled and kept for reference only; it assigns
# the same three names as the active code below and appears to be superseded.
#labs_type = 'continuous'
#lim_intra = False
#
#table_one_cats = constants.categorical(labs = labs_type, lim_intra = lim_intra) + ['LEAK', 'CLOT']
#table_one_cons = constants.continuous(labs = labs_type, lim_intra = lim_intra)
#table_one_include = table_one_cats + table_one_cons

# Columns summarised as categorical: constants.CATEGORICAL_PRE plus the
# outcome columns in constants.OUTCOME.
table_one_cats = constants.CATEGORICAL_PRE + constants.OUTCOME
# Columns summarised as continuous: constants.CONTINUOUS_PRE plus
# constants.CONTINUOUS_POST.
# NOTE(review): the continuous list includes a *_POST group while the
# categorical list only uses *_PRE (plus outcomes) - confirm this is intended.
table_one_cons = constants.CONTINUOUS_PRE + constants.CONTINUOUS_POST
# Every column passed to TableOne, categorical columns first.
table_one_include = table_one_cats + table_one_cons


 ## descriptive statistics

 we split the data into quarters in the last notebook and somewhat confusingly named part of the training set `val_1`. therefore, create another column for each outcome (clot and leak) that maps patients to the actual training, validation, and testing sets, ultimately creating the final training set by adding `val_1` patients to `train` patients.

In [0]:
# Collapse the four split labels into three analysis groups:
# 0 = training ('train' and 'val_1'), 1 = validation ('val_2'), 2 = testing.
final_analysis_pop_dict = {
    'train': 0,
    'val_1': 0,
    'val_2': 1,
    'test': 2,
}

# Add one consolidated group column per outcome. Labels that are missing from
# the mapping come out of .map() as NaN.
for set_column, group_column in (
    ('CLOT_SET', 'consolidated_clot_groups'),
    ('LEAK_SET', 'consolidated_leak_groups'),
):
    df_main[group_column] = df_main[set_column].map(final_analysis_pop_dict)


 build summary tables

In [0]:
# Arguments are passed by keyword so that the group-by column cannot be bound
# to a different parameter if TableOne's positional order differs between
# tableone releases.

# Whole-dataset summary (no grouping).
mytable = TableOne(
    df_main,
    columns=table_one_include,
    categorical=table_one_cats,
)

# Summary stratified by consolidated clot group, with p-values across groups.
mytable_clot = TableOne(
    df_main,
    columns=table_one_include,
    categorical=table_one_cats,
    groupby='consolidated_clot_groups',
    pval=True,
)

# Summary stratified by consolidated leak group, with p-values across groups.
mytable_leak = TableOne(
    df_main,
    columns=table_one_include,
    categorical=table_one_cats,
    groupby='consolidated_leak_groups',
    pval=True,
)


 save them



In [0]:
# Write each summary table to its own CSV in the descriptive-stats folder.
summary_outputs = (
    (mytable, f'{PATH_DESCRIPTIVE}dataset_summary.csv'),
    (mytable_clot, f'{PATH_DESCRIPTIVE}clot_set_summary.csv'),
    (mytable_leak, f'{PATH_DESCRIPTIVE}leak_set_summary.csv'),
)
for summary_table, csv_path in summary_outputs:
    summary_table.to_csv(csv_path)


 ## bariclot calculation

 dicts and functions to parse data into format needed for bariclot calculation

In [0]:
# Numeric codes for pre-surgery functional status (looked up by label).
d_funcstat = {
    'Independent': 0,
    'Partially Dependent': 1,
    'Totally Dependent': 2,
}

# Numeric codes for the DVT-history flag (looked up by label).
d_clothist = {
    'No': 0,
    'Yes': 1,
}


def d_race(race):
    """Return 1 when `race` is 'Black or African American', otherwise 0.

    Any other value, including a missing one, yields 0.
    """
    return 1 if race == 'Black or African American' else 0


 bariclot calculation
 * Dang JT, Switzer N, Delisle M, Laffin M, Gill R, Birch DW, Karmali S. Predicting venous thromboembolism following laparoscopic bariatric surgery: development of the BariClot tool using the MBSAQIP database. Surg Endosc. 2018; PMID:30003351; http://dx.doi.org/10.1007/s00464-018-6348-0


In [0]:
def calculate_bariclot(row):
    """Compute the BariClot score for one patient record.

    Parameters
    ----------
    row : object exposing the attributes FUNSTATPRESURG, HISTORY_DVT,
        race_PUF and OPLENGTH (e.g. one row passed by DataFrame.apply).

    Returns
    -------
    The sum of four components: 3 x the functional-status code, 9 x the
    DVT-history code, 3 x the race indicator, and OPLENGTH / 60.

    Raises
    ------
    KeyError
        If FUNSTATPRESURG or HISTORY_DVT holds a label that is missing from
        d_funcstat / d_clothist (a missing value counts as such a label).
    """
    functional_status_points = 3 * d_funcstat[row.FUNSTATPRESURG]
    clot_history_points = 9 * d_clothist[row.HISTORY_DVT]
    race_points = 3 * d_race(row.race_PUF)
    # presumably OPLENGTH is recorded in minutes, making this hours - TODO confirm
    operative_time_points = row.OPLENGTH / 60

    components = [
        functional_status_points,
        clot_history_points,
        race_points,
        operative_time_points,
    ]
    return np.sum(components)

def calculate_bariclot_groups(row):
    """Bin a patient's BariClot score into an ordinal group 0-3.

    Group 0: score < 1; group 1: 1 <= score < 7; group 2: 7 <= score < 10;
    group 3: everything else. A NaN score fails every comparison and
    therefore also lands in group 3.
    """
    score = calculate_bariclot(row)

    # Exclusive upper bounds of groups 0, 1 and 2, checked in ascending order.
    for group, upper_bound in enumerate((1, 7, 10)):
        if score < upper_bound:
            return group
    return 3


 ### test population

 some patients (333 in the total study cohort) have missing data for operative duration; replace these with the mean operative duration.

In [0]:
# Impute missing operative durations with the mean duration of the whole
# cohort. The filled column is assigned back to df_main: calling
# fillna(inplace=True) on the df_main['OPLENGTH'] selection is a chained
# in-place update, which leaves df_main unchanged under pandas Copy-on-Write
# and would let NaN durations flow into the BariClot scores.
df_main['OPLENGTH'] = df_main['OPLENGTH'].fillna(df_main['OPLENGTH'].mean())


 select down to clot test population

In [0]:
# Keep only the patients whose clot split label is 'test'.
in_clot_test_set = df_main['CLOT_SET'] == 'test'
df_test_clot = df_main.loc[in_clot_test_set]



In [0]:
# Number of patients in the clot test cohort.
df_test_clot.shape[0]


 ### run the calculator

In [0]:
# Row-wise pass over the clot test cohort: first the numeric BariClot score,
# then the ordinal group (0-3) derived from that score.
bariclot_scores = df_test_clot.apply(calculate_bariclot, axis='columns')
bariclot_score_groups = df_test_clot.apply(calculate_bariclot_groups, axis='columns')


 save targets and values for post-processing in R

In [0]:
#dataframe to hold results
df_bariclot = pd.DataFrame()

#populate dataframe with bariclot scores and target outcomes
df_bariclot['scores'] = bariclot_scores
df_bariclot['targs'] = df_test_clot['CLOT']



In [0]:
# Replace the index carried over from the source rows with a fresh 0..n-1
# range; drop=True discards the old labels instead of keeping them as a column.
df_bariclot = df_bariclot.reset_index(drop=True)



In [0]:
# Save test-cohort scores and targets for post-processing. to_csv writes the
# row index as an unnamed first column by default, so the file has three columns.
df_bariclot.to_csv(f'{PATH_BARICLOT}bariclot_test.csv')


 ### look at auc for 2015 population just as a sanity check

 (it checks out when we run stats in R)

In [0]:
# Keep only operations performed in 2015 (all splits) for the sanity check.
operated_in_2015 = df_main['OPYEAR'] == 2015
df_2015_clot = df_main.loc[operated_in_2015]



In [0]:
# Score every 2015 patient with the BariClot calculator.
bariclot_scores_2015 = df_2015_clot.apply(calculate_bariclot, axis='columns')

# Pair each score with the observed clot outcome, then renumber rows 0..n-1.
df_bariclot_2015 = pd.DataFrame({
    'scores': bariclot_scores_2015,
    'targs': df_2015_clot['CLOT'],
}).reset_index(drop=True)

# Save for the AUC sanity check run outside this notebook.
df_bariclot_2015.to_csv(f'{PATH_BARICLOT}bariclot_2015.csv')


