In [758]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [759]:
df = pd.read_csv('./data/data.csv')

FileNotFoundError: [Errno 2] No such file or directory: './data/data.csv'

In [None]:
df.dtypes

In [None]:
column_list = df.columns
column_list

# FUNCTIONS

In [None]:
def percent_na_df(df,feature_list):

    '''
    Summarise how much of each requested column is missing.

    Parameters
    ----------
    df : DataFrame holding the data
    feature_list : column labels of df to inspect

    Returns
    -------
    DataFrame with a 'features' column and a 'percent_na' column
    (percentage of rows that are NAN, rounded to 2 decimals),
    ordered from the lowest percentage to the highest.
    '''

    total_rows = df.shape[0]

    # one (feature, percent missing) pair per requested column
    pairs = [
        (name, round(df[name].isna().sum() / total_rows * 100, 2))
        for name in df[feature_list].columns
    ]

    # rank from most to least missing first; this ordering determines the
    # integer row labels carried by the returned frame
    pairs.sort(key=lambda pair: pair[1], reverse=True)

    summary = pd.DataFrame(pairs, columns=['features', 'percent_na'])
    return summary.sort_values(['percent_na'], ascending=True)


def select_correlated(df, target='diabetes_mellitus', max_percent_na=58, min_abs_corr=.03):

    '''
    Rank the columns of a dataframe by their correlation with the target
    and keep only the usable ones.

    (1) calculates the percent NA of every column (via percent_na_df)
    (2) calculates the correlation of each column with the target
    (3) drops the features whose percent NA is max_percent_na or more
    (4) drops the features whose absolute correlation with the target
        is not greater than min_abs_corr

    Parameters
    ----------
    df : DataFrame that contains the target column
    target : name of the target column (default 'diabetes_mellitus')
    max_percent_na : exclusive upper bound on percent NA (default 58)
    min_abs_corr : exclusive lower bound on |correlation| (default .03)

    Returns
    -------
    DataFrame with columns 'features', 'percent_na' and <target>
    (the correlation), sorted by correlation, highest first.
    '''

    # the original called an undefined `percent_na(df)`; the helper defined
    # in this notebook is percent_na_df and it needs the column list as well
    nans_df = percent_na_df(df, df.columns)

    corr_ = df.corr()[[target]].reset_index()
    corr_nans = nans_df.merge(corr_, left_on='features', right_on='index')
    corr_nans = corr_nans.drop(columns='index')

    corr_sort = corr_nans.sort_values([target], ascending=False)

    # NaN correlations compare False and are therefore dropped as well
    keep = (corr_sort['percent_na'] < max_percent_na) & \
           (corr_sort[target].abs() > min_abs_corr)

    return corr_sort[keep].copy()

    
def correlated_features(df):
    '''
    Return, as a plain list, the 'features' column of the dataframe
    produced by select_correlated(df).
    '''

    # the original called `show_correlated`, which is not defined anywhere
    # in this notebook; select_correlated is the function that builds the table
    selected = select_correlated(df)
    return list(selected['features'])


def na_feature_list(df,feature_list):

    '''Return the names of the features in feature_list that contain NAN values.

    The result is the 'features' column (a Series) of the percent_na_df
    summary, restricted to rows whose percent NA is not 0.'''

    summary = percent_na_df(df,feature_list)
    has_missing = summary['percent_na'] != 0

    return summary.loc[has_missing, 'features']

def no_nulls_features(df,feature_list):

    '''Return the names of the features in feature_list that DO NOT contain NAN values.

    The result is the 'features' column (a Series) of the percent_na_df
    summary, restricted to rows whose percent NA equals 0.'''

    summary = percent_na_df(df,feature_list)
    fully_populated = summary['percent_na'] == 0

    return summary.loc[fully_populated, 'features']

# SUPER BASIC

In [None]:
continuous_features = []
categorical_features = []
def type_groups(df):
    '''
    Sort the columns of df into the module-level lists by dtype.

    Columns with dtype int64 or float64 go to `continuous_features`,
    columns with dtype object go to `categorical_features`; any other
    dtype is ignored. Both lists are emptied first, so calling the
    function again (e.g. re-running the cell) does not append every
    column a second time.

    Returns None; the result is read from the two global lists.
    '''
    # reset in place so repeated calls are idempotent
    continuous_features.clear()
    categorical_features.clear()

    for column in df.columns:
        dtype = df[column].dtype
        if dtype == 'int64' or dtype == 'float64':
            continuous_features.append(column)
        elif dtype == 'object':
            categorical_features.append(column)
    return

In [None]:
type_groups(df)

In [None]:
# isolate continuous features minus the irrelevant 'Unnamed: 0', 'encounter_id', 'hospital_id'
# filter by name rather than slicing off the first three entries, so that
# re-running this cell does not keep dropping three more features

irrelevant_ids = {'Unnamed: 0', 'encounter_id', 'hospital_id'}
continuous_features = [col for col in continuous_features if col not in irrelevant_ids]

In [None]:
# insert target to our categorical_features list
# for correlation calculation later
# (guarded so that re-running the cell does not add the target twice,
# which would produce duplicate columns when the list is used to index df)

if 'diabetes_mellitus' not in categorical_features:
    categorical_features.append('diabetes_mellitus')

In [None]:
# set up one DF with the continuous variables and one with the categorical
# variables so the two groups can be handled separately
# (categorical_features also holds the target, so df_cat carries it too)

df_cont = df[continuous_features]
df_cat = df[categorical_features]

In [None]:
df_cat.to_csv('./data/categorical_df.csv',index=False)

In [None]:
feature_list = correlated_features(df_cont)[:15]
feature_list

In [None]:
first_pass = df_cont[feature_list]
df_fillna = first_pass.fillna(first_pass.median())
df_fillna

In [None]:
# split predictors from the target by name instead of assuming the target
# is the first column of df_fillna
y = df_fillna['diabetes_mellitus']
X = df_fillna.drop(columns='diabetes_mellitus')

X_train, X_hold, y_train, y_hold = train_test_split(X, y, test_size=0.20, random_state=18)

X_train.to_csv('./data/processed_first_X_15.csv',index=False)
y_train.to_csv('./data/processed_first_y_15.csv',index=False)

# DUMMY CATEGORICAL VARIABLE HANDLING

In [None]:
# dummify

cat_dummies = pd.get_dummies(df_cat,columns=['ethnicity','gender','hospital_admit_source',
                                             'icu_admit_source','icu_stay_type','icu_type'],
                             dummy_na=True)

In [None]:
# fold the ethnicity "Other/Unknown" and ethnicity "nan" dummies into a
# single 'ethnicity_other_na' column, then remove the two originals
# together with the icu stay type / icu type nan dummies

other_unknown = cat_dummies['ethnicity_Other/Unknown']
missing_ethnicity = cat_dummies['ethnicity_nan']
cat_dummies['ethnicity_other_na'] = other_unknown + missing_ethnicity

superseded = ['ethnicity_Other/Unknown','ethnicity_nan',
              'icu_stay_type_nan','icu_type_nan']
cat_dummies.drop(columns=superseded,inplace=True)

In [None]:
# drop some other categories

cat_dummies.drop(columns=['ethnicity_Native American','gender_nan',
                          'icu_type_CTICU','icu_stay_type_readmit'],axis=1,inplace=True)

# APPLY FUNCTIONS

In [None]:
# continuous variable correlation DF

select_correlated(df_cont)

In [None]:
# continuous variable correlated features list
# NOTE: this assignment rebinds the name `correlated_features` from the
# function defined earlier in the notebook to the list that function returns.
# From this cell on `correlated_features(...)` is no longer callable, so
# re-running this cell (or calling the function again later) raises TypeError
# until the function-definition cell is executed again.

correlated_features = correlated_features(df_cont)

In [None]:
# percent NAN of selected correlated feature (37) DF

percent_na_df(df_cont,correlated_features)

In [None]:
# features with NAN in our selected correlated features

na_features = na_feature_list(df_cont,correlated_features)

In [None]:
na_features #35 features -- REMINDER arf_apache not in list 

In [None]:
# features with NO NAN values that we will use to train model

notnull_features = no_nulls_features(df_cont,continuous_features)

# remove the target by name: every feature here ties at 0 percent NA, so the
# position of the target in the sorted result is not guaranteed and slicing
# off the first entry could drop a real predictor and keep the target
notnull_features = notnull_features[notnull_features != 'diabetes_mellitus']
notnull_features

# IMPUTE NAN WITH KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
df_copy = df_cont.copy()

In [None]:
def impute_these_NAN(df,notnull_features,na_features):
    '''
    Fill the missing values of each feature in na_features with the
    prediction of a 3-nearest-neighbour classifier (distance weighted).

    For every feature the classifier is fit on the rows where that feature
    is present, using the notnull_features columns as predictors, and is
    then asked to predict the rows where the feature is missing.

    Note that the observed values are cast to int before fitting and are
    treated as class labels, so every imputed value is one of the
    (truncated) integer values already seen for that feature.

    Parameters
    ----------
    df : DataFrame containing both groups of columns; it is not modified
    notnull_features : column labels used as predictors
    na_features : column labels whose missing values are imputed

    Returns
    -------
    DataFrame holding only the na_features columns, with NAN filled in.
    '''
    imputed = df.copy()
    predictors = list(notnull_features)
    targets = list(na_features)

    for feature in targets:
        missing = imputed[feature].isnull()
        if not missing.any():
            # nothing to fill; predicting on an empty frame would raise
            continue

        known = imputed.loc[~missing]
        X = known[predictors]
        y = known[feature].astype(int)

        knn = KNeighborsClassifier(3,weights='distance')
        model = knn.fit(X,y)

        # predictions come back in the row order of the missing rows, so a
        # single boolean-mask assignment puts each one in the right place;
        # this replaces the chained `copy[feature].iloc[i] = ...` loop, which
        # writes through a temporary object and is not guaranteed to update
        # the frame
        imputed.loc[missing, feature] = model.predict(imputed.loc[missing, predictors])

    return imputed[targets]

In [None]:
impute_df = impute_these_NAN(df_copy,notnull_features,na_features)

In [None]:
impute_df

In [None]:
# sanity check ...did we actually impute?
impute_df.info()

In [None]:
select_correlated(cat_dummies)

In [None]:
# narrow down categorical features
# select_correlated is called directly here: by this point the name
# `correlated_features` has been reassigned to a list of continuous feature
# names, so calling it as a function would raise TypeError

cat_correlated_features = list(select_correlated(cat_dummies)['features'])
df_cat_select = cat_dummies[cat_correlated_features]
cat_correlated_features

In [None]:
df_all_but_one = df_cat_select.merge(impute_df,how='inner',left_index=True,right_index=True)

In [None]:
df_all = df_all_but_one.merge(df_cont['arf_apache'],left_index=True,right_index=True)

In [None]:
df_tableau_= df_cat.merge(impute_df,how='inner',left_index=True,right_index=True)

In [None]:
df_tableau = df_tableau_.merge(df_cont['arf_apache'],left_index=True, right_index=True)

In [None]:
df_tableau.columns

In [None]:
df_tableau.to_csv('./data/for_tableau.csv',index=False)

In [None]:
df_all.to_csv('./data/processed_all.csv',index=False)

In [None]:
# split predictors from the target by name instead of assuming the target
# is the first column of df_all
y = df_all['diabetes_mellitus']
X = df_all.drop(columns='diabetes_mellitus')

X_train, X_hold, y_train, y_hold = train_test_split(X, y, test_size=0.20, random_state=18)

X_train.to_csv('./data/processed_second_X.csv',index=False)
y_train.to_csv('./data/processed_second_y.csv',index=False)

In [None]:
X_hold.to_csv('./data/processed_HOLDOUT_X.csv',index=False)
y_hold.to_csv('./data/processed_HOLDOUT_y.csv',index=False)