In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [3]:
#load the admissions table
dfAdmissions = pd.read_csv("../data/ADMISSIONS.csv")

#load the diagnoses table; ICD9_CODE is forced to text because it is later
#compared against string codes, and letting pandas infer the type can turn
#purely numeric codes into numbers, which would never equal a string code
dfDiagnoses = pd.read_csv("../data/DIAGNOSES_ICD.csv", dtype={"ICD9_CODE": str})

#NOTE(review): dfTitle is not used anywhere in the visible part of the notebook
dfTitle = pd.read_csv("../data/d_icd_diagnoses.csv")

#one row per (admission row, diagnosis row) pair that shares a SUBJECT_ID
df = pd.merge(dfAdmissions, dfDiagnoses, how="inner", on=['SUBJECT_ID'])

#drop na values
#NOTE(review): dropna() without `subset` removes every row that has a missing
#value in ANY column of the merged frame, not only in the columns used by the
#model further down -- confirm that this filtering is intended
df = df.dropna()

In [4]:
#ICD-9 codes (as strings, without the decimal point) that this analysis treats
#as pneumonia-related HAI / HAP; each code is listed exactly once

hcapCode = ["4820", "4822",
            "48230", "48231", "48232", "48233", "48234", "48235", "48236", "48237", "48238", "48239",
            "48240", "48241", "48242", "48243", "48244", "48245", "48246", "48247", "48248", "48249",
            "48281", "48282", "48283", "48284", "48289",
            "4830"]

#boolean flag per diagnosis row: True when the row's ICD9_CODE is one of the codes above
df['is_it_HAI'] = df['ICD9_CODE'].isin(hcapCode)

## Total number of HAI cases

In [5]:
#number of distinct patients that have at least one HAI-flagged diagnosis row
df.loc[df['is_it_HAI'], 'SUBJECT_ID'].unique().size

123

## Total number of cases

In [6]:
#distinct patients present in the merged table
subjectID_All = pd.unique(df['SUBJECT_ID'])

#distinct patients with at least one HAI-flagged diagnosis row
subjectID_HAI = pd.unique(df.loc[df['is_it_HAI'], 'SUBJECT_ID'])

#total number of distinct patients
print(subjectID_All.size)

2053


In [7]:
#patient-level lookup table: one row per patient with an HAI diagnosis, flagged with 1
dfHAI = pd.DataFrame({'SUBJECT_ID': subjectID_HAI})
dfHAI['is_it_HAI'] = 1
print('Total number of HAI cases', dfHAI.shape)

#keep a single row per patient (the first one encountered)
df = df.drop_duplicates('SUBJECT_ID')

#left merge keeps every patient; both frames carry an is_it_HAI column, so pandas
#suffixes them: is_it_HAI_x is the row-level flag from df, is_it_HAI_y the
#patient-level flag from dfHAI
dfFinal = pd.merge(df, dfHAI, how="left", on=['SUBJECT_ID'])

#patients absent from dfHAI have a missing is_it_HAI_y after the left merge; mark
#them as 0. fillna is used because replace('NaN', 0) targets the literal string
#'NaN', not missing values, which leaves NaN in the label
dfFinal['is_it_HAI_y'] = dfFinal['is_it_HAI_y'].fillna(0)
print('Total number of cases', dfFinal.shape)
print('Total number of HAI Cases', dfFinal['is_it_HAI_y'].sum())

Total number of HAI cases (123, 2)
Total number of cases (2053, 25)
Total number of HAI Cases 123.0


In [8]:
#keep only the categorical predictors and the patient-level HAI label used by the model
dfFinal = dfFinal.loc[:, ['INSURANCE', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY', 'is_it_HAI_y']]

In [9]:
#create dummy variables: one indicator column per category, named <prefix>_<category>

dummy_insurance = pd.get_dummies(dfFinal['INSURANCE'], prefix='INSURANCE')
dummy_language = pd.get_dummies(dfFinal['LANGUAGE'], prefix = 'LANGUAGE')
dummy_religion = pd.get_dummies(dfFinal['RELIGION'], prefix = 'RELIGION')
dummy_maritalStatus = pd.get_dummies(dfFinal['MARITAL_STATUS'], prefix = 'MARITAL_STATUS')
dummy_ethnicity = pd.get_dummies(dfFinal['ETHNICITY'], prefix = 'ETHNICITY')

#create a new data frame to fit the dummy variables
#the label column goes first, so every later column is a predictor
colsToKeep = ['is_it_HAI_y']

#each label slice keeps the named dummy column and everything after it; columns
#before that label are left out (presumably the reference category of each
#variable -- confirm the labels match the categories present in your data).
#.loc is used for the label slice because .ix was removed in pandas 1.0
data = dfFinal[colsToKeep].join(dummy_insurance.loc[:, 'INSURANCE_Medicaid':])
data = data.join(dummy_language.loc[:, 'LANGUAGE_* FU':])
data = data.join(dummy_religion.loc[:, 'RELIGION_BAPTIST':])
data = data.join(dummy_maritalStatus.loc[:, 'MARITAL_STATUS_LIFE PARTNER':])
data = data.join(dummy_ethnicity.loc[:, 'ETHNICITY_AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE':])

In [10]:
#logistic regression on the full data set

from sklearn.linear_model import LogisticRegression

#every column after the first one is used as a predictor
train_cols = data.columns[1:]

#fit() returns the fitted estimator itself, so logitModel and logistic are the same object
logistic = LogisticRegression()
logitModel = logistic.fit(data.loc[:, train_cols], data.loc[:, 'is_it_HAI_y'])

In [11]:
# examine the coefficients
# coef_ holds the fitted weights, in the same order as the train_cols passed to fit()

print(logitModel.coef_)

[[ 0.00527     0.17007421 -0.13722057 -0.6319552  -0.15180349 -0.04178479
  -0.20659271 -0.06117407 -0.04688902 -0.10099955 -0.07161925 -0.05793265
  -0.10099955 -0.12673119  0.86296729 -0.14897084  0.86738604  0.71294436
  -0.08935306  0.17708546 -0.03637894 -0.37870897 -0.23843291 -0.59149185
   0.26425863 -0.13167681 -0.38545577 -0.04508461 -0.16841202 -0.07013045
  -0.08762275  1.19128529 -0.16648692 -0.08512805 -0.35026016 -1.07986765
  -0.14987721 -0.55769807 -0.42599703 -0.7073037  -0.35706717 -0.34416275
  -0.24857694  0.42987268 -0.07930218 -0.30094864 -0.37613544 -0.09092447
   0.5444868  -0.19574878  0.22265011  0.244857    0.33959508 -0.33745692
  -0.67272629 -0.0982113  -0.09244441 -0.18026697 -0.31710303 -0.65924671
  -0.10766175  0.19876177 -0.09128789 -0.08935306  0.0545893  -0.10280664
  -0.03985465 -0.05735032 -0.1887334  -0.16275921 -0.121808   -0.34485402
  -0.37364753 -0.11867913 -0.0564717  -0.03890417 -0.35868147 -0.14951263
  -0.34261459  0.22992886 -0.31511951 

In [43]:
#predict the values
#predicted class label for every row of the full data set, using whatever model
#logitModel currently refers to
#NOTE(review): this cell's execution count (43) is higher than that of the
#retraining cell further down (42), so the recorded output presumably comes from
#the model refit on the training split -- re-run the notebook top to bottom to confirm

logitModel.predict(data[train_cols])


array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [31]:
#split into training and testing data set

#fixed seed so the random split, and every result computed from it, is
#reproducible across kernel restarts
SPLIT_SEED = 42
splitRng = np.random.RandomState(SPLIT_SEED)

#one uniform draw per row; rows whose draw is below 0.5 go to the test set
split = splitRng.rand(len(data)) < 0.5

test = data[split]

train = data[~split]


In [42]:
#fit on the training split only
#note: logistic is refit in place, so the earlier fit on the full data set is overwritten

logitModel = logistic.fit(train[train_cols], train['is_it_HAI_y'])

#number of rows of the held-out test split that the model predicts as HAI;
#predictions are made on `test` (not `train`) so that the split actually
#measures behaviour on data the model has not seen

logitModel.predict(test[train_cols]).sum()

0.0