In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Do not truncate wide frames: show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

In [2]:
# Read the year-5 extract; cells holding the literal string 'Unknown' are
# parsed as missing values. The printed shape is (rows, columns).
data = pd.read_csv(
    'data/IBD NSQIP Year 5 merged v1.csv',
    na_values='Unknown',
)
print(data.shape)

(1339, 236)


In [3]:
# Show every column label of the loaded frame as a plain Python list.
data.columns.tolist()

['case.number',
 'completion.status',
 'date.of.birth',
 'age.at.time.of.surgery',
 'gender',
 'race',
 'hispanic.ethnicity',
 'cpt.code',
 'cpt.description',
 'principal.operative.procedure',
 'in.out.patient.status',
 'elective.surgery',
 'origin.status',
 'hospital.admission.date',
 'operation.date',
 'principal.anesthesia.technique',
 'additional.anesthesia.technique.s.',
 'surgical.specialty',
 'height',
 'height.unit',
 'height.unknown',
 'weight',
 'weight.unit',
 'weight.unknown',
 'bmi',
 'diabetes.mellitus',
 'current.smoker.within.1.year',
 'dyspnea',
 'functional.heath.status',
 'ventilator.dependent',
 'history.of.severe.copd',
 'ascites.w.in.30.days.prior.to.surgery',
 'congestive.heart.failure.w.in.30.days.prior.to.surgery',
 'hypertension.requiring.medication',
 'acute.renal.failure',
 'currently.requiring.or.on.dialysis',
 'disseminated.cancer',
 'open.wound.with.or.without.infection',
 'steroid.immunosuppressant.use.for.chronic.condition',
 'malnourishment',
 'bleedin

In [4]:
# Cohort definition: keep rows with CPT code 44205 or 44160, a non-missing
# 'ileostomy.formation', and Crohn's disease as the primary indication.
# The three conditions are independent row filters, so they are combined into
# one mask. `.copy()` makes `data` an independent frame, so the column
# assignments below never write into a slice of the frame that was read from
# disk (avoids SettingWithCopyWarning / lost writes).
cohort_mask = (
    data['cpt.code'].isin([44205, 44160])
    & data['ileostomy.formation'].notna()
    & data['colectomy.primary.indication.for.surgery'].isin(["Crohn's Disease"])
)
data = data.loc[cohort_mask].copy()

# Collapse the detailed operative approaches into three groups. Every
# "... w/ unplanned conversion to open" approach maps to the SAME label:
# previously the Endoscopic variant used a label without the comma and the
# SILS variant was mapped to plain 'MIS', which would have split or
# misclassified the converted cases.
mis_label = 'MIS'
converted_label = 'MIS, unplanned conversion to open'
approach_groups = {
    'Endoscopic w/ unplanned conversion to open': converted_label,
    'Hybrid': mis_label,
    'Laparoscopic': mis_label,
    'Laparoscopic w/ open assist': mis_label,
    'Laparoscopic w/ unplanned conversion to open': converted_label,
    'Open (planned)': 'Open',
    'Robotic': mis_label,
    'Robotic w/ open assist': mis_label,
    'Robotic w/ unplanned conversion to open': converted_label,
    'SILS': mis_label,
    'SILS w/ unplanned conversion to open': converted_label,
}
data['colectomy.operative.approach'] = data['colectomy.operative.approach'].replace(approach_groups)

# Binarize the outcome: every listed ileostomy variant becomes 'Ileostomy'.
# Values that are not keys of the mapping are left unchanged.
ileostomy_groups = {
    'New ileostomy': 'Ileostomy',
    'Other ileostomy revision': 'Ileostomy',
    'Conversion of end ileostomy to loop': 'Ileostomy',
    'Ileostomy from prior is unaltered': 'Ileostomy',
}
data['ileostomy.formation'] = data['ileostomy.formation'].replace(ileostomy_groups)


In [6]:
# Normalise height to centimetres and weight to kilograms.
# Rows whose 'height.unit' is 'in' are multiplied by 2.54; rows whose
# 'weight.unit' is 'lb' are multiplied by 0.453592. All other rows (including
# rows with a missing unit) are left as they are.
#
# This replaces a per-row loop that used chained assignment
# (data['height'][x] = ...), which pandas does not guarantee to write back to
# the frame. The unit columns are also updated after conversion so that
# re-running this cell cannot convert the same rows a second time.
CM_PER_INCH = 2.54
KG_PER_LB = 0.453592

# Work on an independent copy so the assignments below never target a slice.
data = data.copy()

height_in_inches = data['height.unit'].eq('in')
weight_in_pounds = data['weight.unit'].eq('lb')

# Series.mask replaces values where the condition is True and keeps the rest.
data['height'] = data['height'].mask(height_in_inches, data['height'] * CM_PER_INCH)
data['weight'] = data['weight'].mask(weight_in_pounds, data['weight'] * KG_PER_LB)

# Keep the unit labels truthful for the converted rows.
data.loc[height_in_inches, 'height.unit'] = 'cm'
data.loc[weight_in_pounds, 'weight.unit'] = 'kg'

In [5]:
# Frequency of each remaining 'ileostomy.formation' value in the cohort.
# There is an outcome called 'ibd_ileost' that wasn't addressed in
# preproc.ipynb; it is not a key of the recoding mapping, so it shows up here
# as its own category.
data['ileostomy.formation'].value_counts()

None          200
Ileostomy      54
ibd_ileost      2
Name: ileostomy.formation, dtype: int64

In [6]:
# Frequency of each operative-approach group; value_counts() leaves out
# missing values by default, so NA rows are not part of these counts.
data['colectomy.operative.approach'].value_counts()

MIS                                  47
Open                                 13
MIS, unplanned conversion to open     6
Name: colectomy.operative.approach, dtype: int64

In [7]:
# Number of rows with a missing 'colectomy.operative.approach' — a large
# amount of NA compared with the non-missing counts.
data['colectomy.operative.approach'].isna().sum()

190

In [11]:
# Rename columns with literal (non-regex) substring replacements.
# The pairs are applied strictly in the order listed, each one to the result
# of the previous one, so the ORDER IS SIGNIFICANT: the generic rules
# ('x.', 'w.in', '...', '..') rewrite text that other rules look for.
# NOTE(review): several specific patterns contain ',', '>' or '#', characters
# that appear to be produced only by the generic rules further down the list;
# those patterns may therefore match only when this cell is executed a second
# time — confirm against the raw column names.
column_renames = [
    ('surgical.wound.s..closure', 'surgical.wound(s).closure'),
    ('sepsis,.sirs.sepsis.septic.shock.>.48h.', 'sepsis.(sirs/sepsis/septic.shock).(48h)'),
    ('duration.of.surgical.procedure,.in.minutes.', 'duration.of.surgical.procedure.(in.minutes)'),
    ('#.of.postop.transfusion.intraop,.postop,.72h.of.surgery.start.time.', '#.of.postop.transfusion.intraop/.postop.(72h.of.surgery.start.time)'),
    ('x.', '#'),
    ('w.in', 'w/in'),
    ('...', '.>.'),
    ('..', ',.'),
    ('procedure.surgery', 'procedure/surgery'),
    ('organ.space', 'organ/space'),
    ('steroid.immunosuppressant', 'steroid/immunosuppressant'),
    ('in.out.patient.status', 'in/out-patient.status'),
    ('preop.transfusions,.rbc.w/in.72.hrs.prior.to.surgery.start.time.', 'preop.transfusions.(rbc.w/in.72.hrs.prior.to.surgery.start.time)'),
    ('ast.sgot', 'ast/sgot'),
    ('#.of.postop.transfusion.intraop..postop..72h.of.surgery.start.time.', '#.of.postop.transfusion.intraop/.postop.(72h.of.surgery.start.time)'),
    ('preop.transfusions..rbc.w/in.72.hrs.prior.to.surgery.start.time.', 'preop.transfusions.(rbc.w/in.72.hrs.prior.to.surgery.start.time)'),
    ('#.of.postop.c,.diff', '#.of.postop.c..diff'),
    ('colectomy.non.emergent.indication.icd10.code', 'colectomy.non-emergent.indication.icd10.code'),
]

renamed_columns = data.columns
for old_text, new_text in column_renames:
    renamed_columns = renamed_columns.str.replace(old_text, new_text, regex=False)
data.columns = renamed_columns

In [14]:
#list(data)

In [8]:
# Read the combined years 1-4 data ('Unknown' cells are parsed as missing)
# and keep its column labels for comparison with the year-5 frame.
data_1_4 = pd.read_csv(
    'data/comb4_data.csv',
    na_values='Unknown',
)
data_1_4_vars = data_1_4.columns.tolist()

In [13]:
# Compare the years 1-4 column labels with the year-5 column labels.
# Prints "OUT <name>" for every years 1-4 column that is absent from the
# year-5 frame, then prints how many years 1-4 columns are present in it.
# The counter is no longer called `sum`, which shadowed the builtin sum() for
# every later cell in the kernel session.
yr_5_vars = list(data)
yr_5_lookup = set(yr_5_vars)  # set membership instead of scanning the list each time

missing_from_yr_5 = [var for var in data_1_4_vars if var not in yr_5_lookup]
for var in missing_from_yr_5:
    print("OUT", var)

n_shared_vars = len(data_1_4_vars) - len(missing_from_yr_5)
print(n_shared_vars)


OUT >10%.loss.of.body.weight.in.the.6.months.prior.to.surgery
121
