In [0]:
from IPython import get_ipython


 # part 1: parse original data

 this notebook contains code to parse raw mbsaqip files after they have been uncompressed locally

 note that this is designed to merge tables in a way that allows us to specify composite outcomes of interest to the present study.

In [0]:
# Session setup: inline matplotlib output, then load the autoreload extension
# and switch it to mode 2.
ipython_shell = get_ipython()
for magic_name, magic_args in (
    ('matplotlib', 'inline'),
    ('reload_ext', 'autoreload'),
    ('autoreload', '2'),
):
    ipython_shell.run_line_magic(magic_name, magic_args)



In [0]:
import numpy as np
import pandas as pd
import os



In [0]:
# Raise pandas' display limits so wide/long frames can be inspected inline.
# Option keys are written out in full: pandas resolves an abbreviated key such
# as 'display.max_row' by pattern matching, which only works while the
# pattern matches exactly one registered option.
pd.set_option('display.max_rows', 100)     # show up to 100 rows

pd.set_option('display.max_columns', 50)   # show up to 50 columns



In [0]:
# Relative root folder of the source data; the trailing slash matters because
# file paths are built from it by plain string interpolation.
PATH = "mbsaqip_originals/"


 * run the following cells only when building and joining from the original data tables

In [0]:
# Year sub-directories (with trailing slash) that are appended to PATH.
PATH_YEARS = [
    '2015/',
    '2016/',
    '2017/',
]

# Table file stems; each table is stored as <stem>.txt inside a year directory.
table_names = [
    'main',
    'intv',
    'bmi',
    'reop',
    'read',
]


 ### step 1: load from txt files and merge tables within each year

 in the following loop, for each year of data:

 * load the year's data from the tab-delimited .txt files
 * merge tables to get the data needed for the composite endpoint as defined in the Dang et al. Bariclot paper
  * readmission for dvt or pe
  * reoperation for dvt or pe
  * reintervention for dvt or pe
 * save the resultant dataframe to a new .csv

 run the following cell to build and save the merged data table for each year

In [0]:
# Build one merged table per data year and save it inside that year's folder.
#
# Each year is processed exactly once: the tables are loaded explicitly by
# name, so `table_names` is not iterated here (looping over it would only
# repeat the identical read/merge/write once per table name).  The bmi table
# contributes nothing to the merged output and is therefore not loaded.
#
# NOTE(review): if the read/reop/intv tables can hold several rows for the
# same CASEID, the left merges below repeat that case's main-table row once
# per match -- confirm against the data whether that is intended.
for year in PATH_YEARS:
    year_dir = f'{PATH}{year}'

    # The raw files are tab-delimited text.
    df_main = pd.read_csv(f'{year_dir}main.txt', sep='\t', low_memory=False)
    df_read = pd.read_csv(f'{year_dir}read.txt', sep='\t', low_memory=False)
    df_reop = pd.read_csv(f'{year_dir}reop.txt', sep='\t', low_memory=False)
    df_intv = pd.read_csv(f'{year_dir}intv.txt', sep='\t', low_memory=False)

    # Attach the reason column of each event table to the main table, in the
    # order readmission -> reoperation -> intervention.  how='left' keeps
    # every main-table row, with NaN where a case has no matching event.
    for df_events, reason_col in (
        (df_read, 'SUSPREASON'),
        (df_reop, 'REOP_SUSPECTED_REASON_BAR'),
        (df_intv, 'INTV_REASON_BAR'),
    ):
        df_main = pd.merge(df_main,
                           df_events[['CASEID', reason_col]],
                           on='CASEID',
                           how='left')

    # The index is written as the first CSV column.
    df_main.to_csv(f'{year_dir}year_joined.csv')


 ### step 3: load merged csv file for each year and concatenate

 * load each year of data
 * inspect and compare the columns for each year
 * fix one header that is inconsistent between years
  * `agegt80` in 2015 is called `ageGT80` in 2016 and 2017
  * this does not affect this analysis, but it may prove useful to consolidate this column for other analyses
 * drop non-intersecting columns

In [0]:
# Load the merged table of each year; the first CSV column is used as the
# row index (index_col=0).
joined_csv_options = {'low_memory': False, 'index_col': 0}

df_joined_fiv = pd.read_csv(f'{PATH}2015/year_joined.csv', **joined_csv_options)
df_joined_six = pd.read_csv(f'{PATH}2016/year_joined.csv', **joined_csv_options)
df_joined_sev = pd.read_csv(f'{PATH}2017/year_joined.csv', **joined_csv_options)



In [0]:
# Print the number of columns of each yearly table (2015, 2016, 2017 order).
for yearly_frame in (df_joined_fiv, df_joined_six, df_joined_sev):
    print(len(yearly_frame.columns))



In [0]:
# Align the 2015 header with 2016/2017: copy `agegt80` into `ageGT80`, then
# remove the old lower-case column.
df_joined_fiv = (
    df_joined_fiv
    .assign(ageGT80=df_joined_fiv['agegt80'])
    .drop(columns=['agegt80'])
)



In [0]:
# Columns that exist in the 2016 table but not in the 2015 table.
# Index.difference replaces the `^` operator: on a pandas Index `^` is no
# longer a set operation in pandas 2.x, and a symmetric difference could also
# contain 2015-only labels, which a drop on the 2016 table rejects with a
# KeyError.
six_not_fiv = df_joined_six.columns.difference(df_joined_fiv.columns)



In [0]:
# Drop every column listed in `six_not_fiv` from the 2016 table.
df_joined_six = df_joined_six.drop(labels=six_not_fiv, axis='columns')



In [0]:
# Columns that exist in the 2017 table but not in the 2016 table, removed so
# that 2017 carries no column the 2016 table lacks.
# Index.difference replaces the `^` operator: on a pandas Index `^` is no
# longer a set operation in pandas 2.x, and a symmetric difference could also
# contain 2016-only labels, which the drop below would reject with a KeyError.
# NOTE(review): only the 2017 table is trimmed here; columns of the 2015/2016
# tables that are missing from 2017 stay in place -- confirm that is intended.
sev_not_six = df_joined_sev.columns.difference(df_joined_six.columns)
df_joined_sev = df_joined_sev.drop(columns=sev_not_six)



In [0]:
# Stack the three yearly tables row-wise; sort=False leaves the column order
# unsorted, and each table keeps its own row index values.
yearly_frames = [df_joined_fiv, df_joined_six, df_joined_sev]
df_all_years = pd.concat(yearly_frames, sort=False)


 ### step 4: quick data integrity test

 make sure we got the right number of total patients:

In [0]:
# Print the row count of each year, then their sum, then the row count of the
# concatenated table; the last two numbers should be equal.
rows_per_year = [len(df_joined_fiv), len(df_joined_six), len(df_joined_sev)]
print(*rows_per_year)
print(sum(rows_per_year))
print(len(df_all_years))


 ### step 5: write concatenated data to a new file

 if this cell throws an error because the `all_years` directory (inside `mbsaqip_originals/`) already exists, just delete the directory and re-run the cell (or write some additional code for better file handling).

In [0]:
# make a dir to hold data from all years
# exist_ok=True lets this cell be re-run: os.mkdir raised FileExistsError
# whenever the directory was already there from an earlier run.
all_years_dir = f'{PATH}all_years'
os.makedirs(all_years_dir, exist_ok=True)

# save the data (the row index is written as the first CSV column)
df_all_years.to_csv(f'{all_years_dir}/all_years.csv')


 the data for all available years is now concatenated