In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data Preprocessing

In [None]:
# Load the raw table; the first CSV column becomes the index.
#
# The placeholders '?' and 'Unknown/Invalid' (plus a blank field) are parsed
# as missing directly by read_csv instead of being replaced afterwards.
#
# keep_default_na=False is deliberate: pandas >= 2.0 lists the string 'None'
# among its default NA markers. Leaving the defaults on would turn the 'None'
# category that max_glu_serum / A1Cresult are encoded from further down into
# NaN, and the later dropna() would then discard those rows.
replace_values = {'?': np.nan,
                  'Unknown/Invalid': np.nan}
df = pd.read_csv('./data/diabetic_data.csv',
                 header=0,
                 index_col=0,
                 na_values=['', *replace_values],
                 keep_default_na=False)

df.shape

In [None]:
# distinct values present in the gender column
df['gender'].unique()

In [None]:
# Preview the first rows of the loaded frame (DataFrame.head default count).
df.head()

In [None]:
# Per-column dtype and non-null count summary of the loaded frame.
df.info()

## Readmission label

In [None]:
# how many rows fall into each distinct `readmitted` value
df.groupby(by='readmitted').size()

In [None]:
# binary target column: 1 where `readmitted` equals '<30', 0 everywhere else
df['LABEL'] = df['readmitted'].eq('<30').astype('int')

In [None]:
# number of rows whose LABEL is 1
df['LABEL'].sum()

## Missing values

In [None]:
# percentage of missing values per column; show the ten worst columns
na_per_column = df.isnull().sum() * 100 / df.shape[0]
na_per_column.sort_values(ascending=False).head(10)

In [None]:
# percentage of missing fields per row; show the thirty worst rows
na_per_row = df.isnull().sum(axis=1) * 100 / df.shape[1]
na_per_row.sort_values(ascending=False).iloc[:30]

In [None]:
# remove the columns singled out for a high NA percentage (>30%)
df = df.drop(columns=['weight', 'medical_specialty', 'payer_code'])

In [None]:
# per-row missing percentage again, now that the sparse columns are gone;
# show the twenty worst rows
na_per_row = df.isnull().sum(axis=1) * 100 / df.shape[1]
na_per_row.sort_values(ascending=False).iloc[:20]

In [None]:
# discard every row that still contains at least one missing value
df = df.dropna()

# sanity check after dropping rows: per-row missing percentage, ten worst rows
na_per_row = df.isnull().sum(axis=1) * 100 / df.shape[1]
na_per_row.sort_values(ascending=False).iloc[:10]

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

## Encoding categorical variables

In [None]:
diag_cols = ['diag_1', 'diag_2', 'diag_3']

# (group label, inclusive lower code, inclusive upper code, extra single code)
# The ranges do not overlap, so the order carries no meaning.
# NOTE(review): the ICD-9 circulatory chapter runs 390-459; the upper bound
# here is 458, so code 459 ends up in 'Others' -- confirm this is intended.
# NOTE(review): 'Muscoloskeletal' is misspelled but is a data value that later
# cells may match on, so it is kept as is.
DIAG_GROUPS = [
    ('Diabetes', 250, 250, None),
    ('Circulatory', 390, 458, 785),
    ('Respiratory', 460, 519, 786),
    ('Digestive', 520, 579, 787),
    ('Genitourinary', 580, 629, 788),
    ('Injury', 800, 999, None),
    ('Muscoloskeletal', 710, 739, None),
    ('Neoplasms', 140, 239, None),
]


def group_diagnosis(codes):
    """Map a column of raw diagnosis code strings to coarse group labels.

    Parameters
    ----------
    codes : pd.Series of str
        Raw diagnosis codes. Codes containing 'E' or 'V' have that letter
        turned into a minus sign, so they parse as negative numbers and fall
        through to 'Others'.

    Returns
    -------
    pd.Series of object
        One label per input row, on the same index: a label from DIAG_GROUPS,
        or 'Others' when no range matches (missing codes included).

    Raises
    ------
    ValueError
        If a code is still not numeric after the letter replacement.
    """
    # Literal (non-regex) replacement, independent of the pandas version default.
    cleaned = (codes.str.replace('E', '-', regex=False)
                    .str.replace('V', '-', regex=False))
    # Collapse the whole 250.xx family onto 250. A prefix match is used
    # (not a substring match) so unrelated codes cannot be swept in.
    is_diabetes = cleaned.str.startswith('250', na=False)
    numeric = cleaned.mask(is_diabetes, '250').astype(float)

    conditions = []
    labels = []
    for label, low, high, extra in DIAG_GROUPS:
        in_group = numeric.between(low, high)
        if extra is not None:
            in_group = in_group | (numeric == extra)
        conditions.append(in_group.to_numpy())
        labels.append(label)

    # Build the label column in one step; this avoids writing strings into a
    # float column, which newer pandas versions reject.
    grouped = np.select(conditions, labels, default='Others')
    return pd.Series(grouped, index=codes.index, dtype=object)


# diagnosis grouping
for col in diag_cols:
    df[col] = group_diagnosis(df[col])

df.dropna(inplace=True)

# Drop the first character of the age string and keep the text before the
# first '-' as an integer (presumably the lower bound of a bracket such as
# '[70-80)' -- confirm against the raw data).
# Not re-runnable: once the column is int, the .str accessor no longer applies.
age_lower_bound = df['age'].str.slice(1).str.split('-').str[0]
df['age'] = age_lower_bound.astype(int)
def recode(series, mapping):
    """Swap every value of `series` that is a key of `mapping` for its code.

    Values that are not keys of `mapping` (missing values included) are left
    untouched, matching what `Series.replace(mapping)` does. The result dtype
    is inferred from the recoded values, so a fully mapped string column comes
    back as an integer column. `Series.replace` only achieves that through
    silent object-to-int downcasting, which pandas 2.2 deprecates.

    Parameters
    ----------
    series : pd.Series
        Column to recode.
    mapping : dict
        Raw value -> numeric code.

    Returns
    -------
    pd.Series
        Recoded column on the same index.
    """
    return series.map(lambda value: mapping.get(value, value))


max_glu_serum_dict = {'None': 0,
                      'Norm': 100,
                      '>200': 200,
                      '>300': 300}

A1Cresult_dict = {'None': 0,
                  'Norm': 5,
                  '>7': 7,
                  '>8': 8}

change_dict = {'No': -1,
               'Ch': 1}

diabetesMed_dict = {'No': -1,
                    'Yes': 1}

# shared coding for the 23 medication columns listed in d23_cols
d23_feature_dict = {'Up': 10,
                    'Down': -10,
                    'Steady': 0,
                    'No': -20}
d23_cols = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
            'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
            'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
            'miglitol', 'troglitazone', 'tolazamide', 'examide',
            'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
            'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone']

# column -> mapping, applied in the same order as the individual statements
column_codes = {
    'max_glu_serum': max_glu_serum_dict,
    'A1Cresult': A1Cresult_dict,
    'change': change_dict,
    'diabetesMed': diabetesMed_dict,
    **dict.fromkeys(d23_cols, d23_feature_dict),
}
for col, mapping in column_codes.items():
    df[col] = recode(df[col], mapping)

# Collapse `readmitted` to a flag: 0 where the value is 'NO', 1 otherwise.
# Not re-runnable: on the already-encoded integer column nothing equals 'NO',
# so a second pass would set every row to 1.
condition = df['readmitted'].ne('NO')
df['readmitted'] = condition.astype(int)

df.head()

In [None]:
# distinct values of the encoded age column
df['age'].unique()

In [None]:
# Per-column dtype and non-null count summary, to check the encoded columns.
df.info()

## Examine outliers

In [None]:
# box plots for stay length, procedure count and diagnosis count
stay_cols = ['time_in_hospital', 'num_procedures', 'number_diagnoses']
bp = df.boxplot(column=stay_cols)

In [None]:
# box plots for medication count and lab-procedure count
workload_cols = ['num_medications', 'num_lab_procedures']
bp = df.boxplot(column=workload_cols)

In [None]:
# box plots for the outpatient, inpatient and emergency counters
visit_cols = ['number_outpatient', 'number_inpatient', 'number_emergency']
bp = df.boxplot(column=visit_cols)

## Plot nominal variables with LABEL

In [None]:
# row counts per discharge_disposition_id, split by LABEL (horizontal bars)
fig, ax = plt.subplots(figsize=(8, 15))
sns.countplot(data=df, y='discharge_disposition_id', hue='LABEL', ax=ax)
ax.set_title('Discharge Type VS. Readmission')

In [None]:
# row counts per age value, split by LABEL (horizontal bars)
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(data=df, y='age', hue='LABEL', ax=ax)
ax.set_title('Age of Patient VS. Readmission')

In [None]:
# row counts per gender value, split by LABEL (horizontal bars)
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(data=df, y='gender', hue='LABEL', ax=ax)
ax.set_title('Gender of Patient VS. Readmission')

In [None]:
# row counts per race value, split by LABEL (horizontal bars)
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(data=df, y='race', hue='LABEL', ax=ax)
ax.set_title('Race of Patient VS. Readmission')

In [None]:
# Horizontal bar chart of the missing-value percentage per column, sorted
# ascending.
# NOTE(review): when the cells run top to bottom, the dropna() calls earlier
# in the notebook have already removed every row with a missing value, so all
# bars are presumably zero here -- confirm whether this plot was meant to use
# the raw data.
plt.figure(figsize=(10,8))
na_percent = df.isnull().sum()/len(df)*100
missing = na_percent.to_frame('na_percent')
missing.insert(0, 'column', df.columns)
missing = missing.sort_values('na_percent')
plt.barh(missing['column'],width=missing['na_percent']);