In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw encounter data from the CSV file.
# A forward slash is used as the separator: in a normal string literal '\d'
# is an invalid escape sequence, and a backslash-separated path only resolves
# on Windows. Forward slashes work on Windows, Linux and macOS alike.
df = pd.read_csv('data/diabetic_data.csv')

If we look at IDs_mapping.csv, we can see that the discharge_disposition_id values 11, 13, 14, 19, 20 and 21 are related to death or hospice.

We should remove these samples from the predictive model, since these patients cannot be readmitted.

In [None]:
# Drop encounters whose discharge disposition is one of the death/hospice
# codes; those rows are excluded from the predictive model.
excluded_dispositions = [11, 13, 14, 19, 20, 21]

# .copy() makes the filtered frame independent of the original one, so that
# adding columns to it later does not raise pandas' SettingWithCopyWarning.
df = df.loc[~df.discharge_disposition_id.isin(excluded_dispositions)].copy()
len(df)

In [None]:
# Binary target column: 1 where 'readmitted' equals '<30', 0 otherwise.
is_early_readmit = df['readmitted'].eq('<30')
df['OUTPUT_LABEL'] = is_early_readmit.astype('int')

In [None]:
def calc_prevalence(y_actual):
    """Return the fraction of positive labels in ``y_actual``.

    Args:
        y_actual: Sized iterable of 0/1 labels (e.g. a list or NumPy array).

    Returns:
        The sum of the labels divided by the number of labels.

    Raises:
        ZeroDivisionError: If ``y_actual`` is empty.
    """
    n_positive = sum(y_actual)
    n_total = len(y_actual)
    return n_positive / n_total

In [None]:
# Share of rows carrying the positive label, printed to three decimals.
prevalence = calc_prevalence(df['OUTPUT_LABEL'].values)
print('Prevalence:%.3f' % prevalence)

Around 11% of the population is rehospitalized. This makes it an imbalanced classification problem.

Let's see how to address the imbalance in data. To begin with, let's inspect data in each column in the dataset by looking at them in groups of 10.

Inspecting the data, I see that there are a lot of categorical (non-numeric) variables. Note that the variables ending in _id are also categorical, as these ids refer to categories listed in the IDs_mapping.csv file.

Let's take a look at the unique values for each column.

In [None]:
# Summarise every column: list the distinct values when there are fewer than
# 30 of them, otherwise report only how many distinct values exist.
for col_name in df.columns:
    distinct = df[col_name].unique()
    n_distinct = len(distinct)
    if n_distinct >= 30:
        print(col_name + ': ' + str(n_distinct) + ' unique values')
        continue
    print(col_name)
    print(distinct)

I see there is a mix of categorical (non-numeric) and numerical data. A few things to point out,
- encounter_id and patient_nbr are unique identifiers that are not useful variables for this model.
- age and weight are categorical in this data set
- admission_type_id,discharge_disposition_id,admission_source_id are numerical here. But are mapped to certain categories (see IDs_mapping).
- examide and citoglipton only have 1 value, so these variables can be ignored in the model.
- diag1, diag2, diag3 - are categorical and have a lot of values. I will not use these as part of this model. I will use number_diagnoses to capture some of this information in the model.
ToDo: Group diagnostics into ICD codes to reduce the dimension and use them in modelling. 
- medical_specialty has many distinct categories, but I will still use it as a feature for this model (the less frequent specialties are grouped together below).

# Performing Exploratory Data Analysis (EDA)

### Check for Correlation if any

In [None]:
# Pairwise scatter plots of a few numeric features, to check whether any two
# of them are so strongly related that only one of them needs to be kept.
# scatter_matrix is reached through the already-imported pandas namespace, so
# no import statement is needed in the middle of the notebook.
pd.plotting.scatter_matrix(
    df[['num_procedures', 'num_medications', 'number_emergency']],
    figsize=(10, 10),
)
plt.show()  # render the figure without echoing the array of Axes objects


From the above, we can see that there is no multicollinearity problem among these variables. We can also see that as number_emergency increases, num_medications decreases.

In [None]:
# Strip plot of the number of medications per age bracket. The frame is
# sorted on 'age' first so the brackets appear in label order on the x axis.
df_by_age = df.sort_values(by='age')
meds_ax = sns.stripplot(data=df_by_age, x="age", y="num_medications", color='red')
sns.despine()  # drop the top and right spines
meds_ax.figure.set_size_inches(10, 6)
meds_ax.set(xlabel='Age', ylabel='Number of Medications')
meds_ax.set_title('Number of Medications vs. Age')
plt.show()

In [None]:
# Row counts per gender, split by the OUTPUT_LABEL target.
gender_ax = sns.countplot(data=df, x='gender', hue='OUTPUT_LABEL')
sns.despine()
gender_ax.figure.set_size_inches(7, 6.5)
gender_ax.legend(labels=('No', 'Yes'), title='Readmitted patients')
gender_ax.set_title('Readmissions Balance by Gender')
plt.show()

In [None]:

# Row counts per age bracket, split by the OUTPUT_LABEL target. The bracket
# labels are sorted so the bars follow label order instead of data order.
age_order = sorted(df['age'].unique())

age_ax = sns.countplot(data=df, x='age', hue='OUTPUT_LABEL', order=age_order)
sns.despine()
age_ax.figure.set_size_inches(7, 6.5)
age_ax.legend(labels=('No', 'Yes'), title='Readmitted within 30 days')
age_ax.set_title('Readmissions Balance by Age')
plt.show()

In [None]:
# Count plots for four categorical columns on a 2x2 grid.
# One loop replaces four copy-pasted calls; the age panel uses sorted bracket
# labels (rather than first-appearance order); each panel gets a title; and
# plt.show() keeps the Axes repr out of the cell output.
fig, ax = plt.subplots(figsize=(15, 10), ncols=2, nrows=2)

panel_columns = ['readmitted', 'race', 'gender', 'age']
for column, panel in zip(panel_columns, ax.flat):  # ax.flat walks the grid row by row
    # Only the age brackets get an explicit order; None keeps seaborn's default.
    bar_order = sorted(df['age'].unique()) if column == 'age' else None
    sns.countplot(x=column, data=df, ax=panel, order=bar_order)
    panel.set_title('Count by ' + column)

fig.tight_layout()
plt.show()

## Feature Engineering
Missing values in the data set are encoded as a question mark ('?'). We replace them with np.nan so that pandas recognises them as missing.

In [None]:

# Missing entries are stored as '?'; turn them into NaN so that pandas'
# null-handling functions can see them.
df = df.replace('?', np.nan)

# Numerical features
cols_num = [
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
]

print(df[cols_num].isna().sum())

# Categorical features
cols_cat = [
    'race',
    'gender',
    'max_glu_serum',
    'A1Cresult',
    'metformin',
    'repaglinide',
    'nateglinide',
    'chlorpropamide',
    'glimepiride',
    'acetohexamide',
    'glipizide',
    'glyburide',
    'tolbutamide',
    'pioglitazone',
    'rosiglitazone',
    'acarbose',
    'miglitol',
    'troglitazone',
    'tolazamide',
    'insulin',
    'glyburide-metformin',
    'glipizide-metformin',
    'glimepiride-pioglitazone',
    'metformin-rosiglitazone',
    'metformin-pioglitazone',
    'change',
    'diabetesMed',
    'payer_code',
    'medical_specialty',
]

print(df[cols_cat].isna().sum())

In [None]:
# Give missing values in these categorical columns their own 'UNK' category.
for col_name in ('race', 'payer_code', 'medical_specialty'):
    df[col_name] = df[col_name].fillna('UNK')


In [None]:
# Count rows per medical specialty (most frequent first) and keep the
# specialties that occur more than 300 times.
print('Number medical specialty:', df['medical_specialty'].nunique())
num_specialties = df.groupby('medical_specialty').size().sort_values(ascending=False)
top_specialties = num_specialties[num_specialties > 300].index.tolist()
print(top_specialties)
print(len(top_specialties))

In [None]:
# Derive 'med_spec' from 'medical_specialty': specialties listed in
# top_specialties are kept, every other value is collapsed into 'Other'.
is_top_specialty = df['medical_specialty'].isin(top_specialties)
df['med_spec'] = df['medical_specialty'].where(is_top_specialty, 'Other')

df.groupby('med_spec').size()

In [None]:
# One-hot encode the categorical features.
# The *_id columns hold numeric codes; they are cast to strings first so that
# get_dummies encodes them as categories.
cols_cat_num = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']
df[cols_cat_num] = df[cols_cat_num].astype('str')

cols_to_encode = cols_cat + cols_cat_num + ['med_spec']
df_cat = pd.get_dummies(df[cols_to_encode], drop_first=True)
df_cat.head()

In [None]:
# Remember the names of the one-hot columns and append them to df.
cols_all_cat = list(df_cat.columns)

# Dummy columns left over from an earlier run of this cell are dropped first
# (errors='ignore' makes this a no-op on the first run), so re-running the
# cell does not append duplicate columns.
df = pd.concat([df.drop(columns=cols_all_cat, errors='ignore'), df_cat], axis=1)

In [None]:
# Peek at the raw 'age' and 'weight' columns.
df.loc[:, ['age', 'weight']].head()

In [None]:
# Number of rows that have a non-missing weight.
df['weight'].notna().sum()

In [None]:
# Age is stored as 10-year bracket labels. Encode each bracket by its lower
# bound so the feature lies on one consistent, evenly spaced scale.
# (The first five brackets used to map to 0-4 while the remaining ones mapped
# to 50-90, which made the 40s -> 50s step look like a jump from 4 to 50.)
age_map = {
    '[0-10)': 0,
    '[10-20)': 10,
    '[20-30)': 20,
    '[30-40)': 30,
    '[40-50)': 40,
    '[50-60)': 50,
    '[60-70)': 60,
    '[70-80)': 70,
    '[80-90)': 80,
    '[90-100)': 90,
}

# .map() yields NaN for any label missing from age_map, so an unexpected
# bracket shows up in the null check instead of staying behind as a string.
df['age_group'] = df.age.map(age_map)

# Weight is not converted to a number; it is reduced to a 0/1 indicator of
# whether a weight value is present.
df['has_weight'] = df.weight.notnull().astype('int')

cols_extra = ['age_group', 'has_weight']

In [None]:
# Report how many features each group contributes.
n_num, n_cat, n_extra = len(cols_num), len(cols_all_cat), len(cols_extra)
print('Total number of features:', n_num + n_cat + n_extra)
print('Numerical Features:', n_num)
print('Categorical Features:', n_cat)
print('Extra features:', n_extra)

In [None]:
# Missing-value count per feature column, largest first; show the top ten.
feature_cols = cols_num + cols_all_cat + cols_extra
null_counts = df[feature_cols].isna().sum()
null_counts.sort_values(ascending=False).head(10)

In [107]:
# Save the processed feature columns together with the target column.
col2use = cols_num + cols_all_cat + cols_extra
df_data = df[col2use + ['OUTPUT_LABEL']]

# A forward slash is used as the separator: in a normal string literal '\d'
# is an invalid escape sequence, and a backslash-separated path only resolves
# on Windows.
df_data.to_csv('data/df_data_features.csv')