In [152]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [153]:
df = pd.read_csv('./data/data.csv')

In [3]:
df.dtypes

Unnamed: 0                       int64
encounter_id                     int64
hospital_id                      int64
age                            float64
bmi                            float64
                                ...   
immunosuppression                int64
leukemia                         int64
lymphoma                         int64
solid_tumor_with_metastasis      int64
diabetes_mellitus                int64
Length: 181, dtype: object

In [154]:
column_list = df.columns
column_list

Index(['Unnamed: 0', 'encounter_id', 'hospital_id', 'age', 'bmi',
       'elective_surgery', 'ethnicity', 'gender', 'height',
       'hospital_admit_source',
       ...
       'h1_pao2fio2ratio_max', 'h1_pao2fio2ratio_min', 'aids', 'cirrhosis',
       'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma',
       'solid_tumor_with_metastasis', 'diabetes_mellitus'],
      dtype='object', length=181)

In [155]:
continuous_features = []
categorical_features = []
def type_groups(df):
    for column in df.columns:
        if df[column].dtype == 'int64' or df[column].dtype == 'float64':
            continuous_features.append(column)
        elif df[column].dtype == 'object':
            categorical_features.append(column)
        else:
            pass
    return

In [156]:
type_groups(df)

In [157]:
continuous_features = continuous_features[3:].copy()

In [158]:
categorical_features.append('diabetes_mellitus')

In [159]:
df_cont = df[continuous_features]
df_cat = df[categorical_features]

In [160]:
cat_dummies = pd.get_dummies(df_cat,columns=['ethnicity','gender','hospital_admit_source',
                                             'icu_admit_source','icu_stay_type','icu_type'],
                             dummy_na=True)

In [161]:
cat_dummies['ethnicity_other_na'] = cat_dummies['ethnicity_Other/Unknown'] + \
                                                                cat_dummies['ethnicity_nan']
cat_dummies.drop(columns=['ethnicity_Other/Unknown','ethnicity_nan',
                          'icu_stay_type_nan','icu_type_nan'],inplace=True,axis=1)

In [162]:
cat_dummies.drop(columns=['ethnicity_Native American','gender_nan',
                          'icu_type_CTICU','icu_stay_type_readmit'],axis=1,inplace=True)

In [176]:
def percent_na_df(df,feature_list):
    
    '''
    This function takes in a dataframe and feature list
    (1) calculates the percentage of NAN per feature
    
    Returns a dataframe showing percent NAN per feature
    '''
    
    num_na = []
    col_list = []
    for column in df[feature_list].columns:
        col_list.append(column)
        num_na.append(round(df[column].isna().sum()/df.shape[0]*100,2))

    zipped = list(zip(col_list,num_na))
    zorted = sorted(zipped,key=lambda x:x[1],reverse=True)
    
    zorted_vals = [[val[1]] for val in zorted]
    zorted_feats = [[feat[0]]for feat in zorted]
    
    nans_ = pd.DataFrame(zorted_vals,columns=['percent_na'])
    feats_ = pd.DataFrame(zorted_feats,columns=['features'])
    nans_df = feats_.merge(nans_,left_index=True,right_index=True)
    nans_df_sorted = nans_df.sort_values(['percent_na'],ascending=True)
    return nans_df_sorted


def select_correlated(df):
       
    '''
    This function takes in a dataframe
    (1) calculates the correlation of each feature with the target
    (2) creates a dataframe of features, correlations, and percent NA
    (3) drops the features where more than 58% of the data is NA
    (4) drops the features that are not very correlated with target
    
    Returns the dataframe
    '''
    
    nans_df = percent_na(df)
    corr_ = df.corr()[['diabetes_mellitus']].reset_index()
    corr_nans = nans_df.merge(corr_,left_on='features',right_on='index')
    corr_nans.drop(columns='index',axis=1,inplace=True)
    
    corr_sort = corr_nans.sort_values(['diabetes_mellitus'],ascending=False)
    corr_sort_drop = corr_sort[corr_sort['percent_na'] < 58].copy()
    corr_sort_drop_2 = corr_sort_drop[(corr_sort_drop['diabetes_mellitus'] > .03) | \
                                      (corr_sort_drop['diabetes_mellitus'] < -.03)]
    
    return (corr_sort_drop_2)

    
def correlated_features(df):
    '''
    This function returns the feature list from the above function
    '''
    
    corr_sort_drop_2 = show_correlated(df)  
    return (corr_sort_drop_2['features'])


def na_feature_list(df,feature_list):
    
    nans_df = percent_na_df(df,feature_list)
    nans_df_drop = nans_df[nans_df['percent_na'] != 0]
    
    return nans_df_drop['features']

def no_nulls_features(df,feature_list):

    df_ = percent_na_df(df,feature_list)
    nonull_df = df_[df_['percent_na'] == 0]
    
    return nonull_df['features']

In [164]:
percent_na_df(df_cont,continuous_features)

Unnamed: 0,features,percent_na
171,diabetes_mellitus,0.00
156,elective_surgery,0.00
157,icu_id,0.00
158,pre_icu_los_days,0.00
159,readmission_status,0.00
...,...,...
4,h1_lactate_max,91.02
3,h1_albumin_min,91.43
2,h1_albumin_max,91.43
1,h1_bilirubin_min,92.09


In [168]:
select_correlated(df_cont)

Unnamed: 0,features,percent_na,diabetes_mellitus
0,diabetes_mellitus,0.0,1.0
63,d1_glucose_max,6.33,0.400742
78,glucose_apache,11.29,0.354359
102,h1_glucose_max,57.68,0.316847
101,h1_glucose_min,57.68,0.30452
47,bmi,3.45,0.169043
44,weight,2.66,0.155517
77,d1_bun_max,10.55,0.14699
93,bun_apache,19.52,0.145241
76,d1_bun_min,10.55,0.137304


In [170]:
correlated_features = correlated_features(df_cont)

In [181]:
percent_na_df(df_cont,correlated_features)

Unnamed: 0,features,percent_na
36,arf_apache,0.0
35,diabetes_mellitus,0.0
33,d1_sysbp_max,0.21
34,d1_diasbp_min,0.21
32,d1_sysbp_noninvasive_max,1.25
31,d1_diasbp_noninvasive_min,1.26
30,weight,2.66
29,bmi,3.45
28,age,3.83
27,h1_sysbp_max,4.24


In [213]:
na_features = na_feature_list(df_cont,correlated_features)

In [214]:
na_features

33                 d1_sysbp_max
34                d1_diasbp_min
32     d1_sysbp_noninvasive_max
31    d1_diasbp_noninvasive_min
30                       weight
29                          bmi
28                          age
27                 h1_sysbp_max
26                h1_diasbp_min
25                h1_diasbp_max
24               d1_glucose_min
23               d1_glucose_max
22     h1_sysbp_noninvasive_max
21    h1_diasbp_noninvasive_min
20    h1_diasbp_noninvasive_max
19             d1_potassium_min
18             d1_potassium_max
16            d1_creatinine_min
15            d1_creatinine_max
17                d1_sodium_min
14                   d1_bun_min
13                   d1_bun_max
12               glucose_apache
11            d1_hematocrit_max
10            d1_hematocrit_min
8             d1_hemaglobin_min
9             d1_hemaglobin_max
7                d1_calcium_max
6                   d1_hco3_min
5                 sodium_apache
4             creatinine_apache
3       

In [187]:
notnull_features = no_nulls_features(df_cont,continuous_features)[1:]
df_cont[notnull_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130157 entries, 0 to 130156
Data columns (total 15 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   elective_surgery             130157 non-null  int64  
 1   icu_id                       130157 non-null  int64  
 2   pre_icu_los_days             130157 non-null  float64
 3   readmission_status           130157 non-null  int64  
 4   apache_post_operative        130157 non-null  int64  
 5   arf_apache                   130157 non-null  int64  
 6   intubated_apache             130157 non-null  int64  
 7   solid_tumor_with_metastasis  130157 non-null  int64  
 8   aids                         130157 non-null  int64  
 9   lymphoma                     130157 non-null  int64  
 10  leukemia                     130157 non-null  int64  
 11  ventilated_apache            130157 non-null  int64  
 12  hepatic_failure              130157 non-null  int64  
 13 

In [85]:
from sklearn.neighbors import KNeighborsClassifier

In [197]:
DF = df_cont[df_cont['d1_sysbp_max'].notnull()]
X = DF[notnull_features]
y = DF['d1_sysbp_max']

In [198]:
knn = KNeighborsClassifier(3, weights='distance')
model = knn.fit(X,y)

In [211]:
missing_ = df_cont['d1_sysbp_max'].isnull()
df_missing_ = pd.DataFrame(df_cont[notnull_features][missing_])
impute = model.predict(df_missing_)
df_missing_['d1_sysbp_max_hat'] = impute

In [212]:
def impute_these_NAN(df,notnull_features,na_features)
    
    for feature in na_features:
        df = df[df[feature].notnull()]
        X = df[notnull_features]
        y = df[null_feature]
    
        knn = KNeighborsClassifier(3,weights='distance')
        model = knn.fit(X,y)

        missing = df[feature].isnull()
        df_missing = pd.DataFrame(df[notnull_features][missing])
        impute = model.predict(df_missing)
        feature_name = feature + 'y_hat'
        df_missing[feature_name] = impute
    
    

Unnamed: 0,elective_surgery,icu_id,pre_icu_los_days,readmission_status,apache_post_operative,arf_apache,intubated_apache,solid_tumor_with_metastasis,aids,lymphoma,leukemia,ventilated_apache,hepatic_failure,cirrhosis,immunosuppression,d1_sysbp_max_hat
1867,0,92,0.001389,0,0,0,0,0,0,0,0,0,0,0,0,147.0
9839,0,143,0.050000,0,0,0,0,0,0,0,0,0,0,0,0,112.0
10409,0,195,0.000694,0,0,0,0,0,0,0,0,0,0,0,0,120.0
10429,0,176,0.201389,0,0,0,0,0,0,0,0,0,0,0,0,127.0
11224,0,174,0.000000,0,0,0,0,0,0,0,0,0,0,0,0,130.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120189,0,1045,0.414583,0,0,0,0,0,0,0,0,0,0,0,0,144.0
122469,0,1071,0.275694,0,0,0,0,0,0,0,0,0,0,0,0,128.0
123174,0,1076,0.036111,0,0,0,0,0,0,0,0,0,0,0,0,149.0
128929,0,1109,0.101389,0,0,0,0,0,0,0,0,0,0,0,0,136.0


In [None]:
cat_corr = percent_na(cat_dummies)
cat_feat = cat_corr['features']
cat_corr_final = cat_dummies[cat_feat]
cat_corr_final.merge(cont_corr_final,how='inner',on='diabetes_mellitus')

In [196]:
df_fillna = df_17.fillna(df_15.median())
df_fillna

Unnamed: 0,diabetes_mellitus,d1_glucose_max,glucose_apache,h1_glucose_max,h1_glucose_min,bmi,weight,d1_bun_max,bun_apache,d1_bun_min,d1_glucose_min,d1_creatinine_max,d1_creatinine_min,creatinine_apache,arf_apache,d1_potassium_max,age
0,1,168.0,168.0,140.0,134.0,22.732803,73.9,31.0,31.0,30.0,109.0,2.51,2.23,2.51,0,4.0,68.0
1,1,145.0,145.0,145.0,143.0,27.421875,70.2,11.0,9.0,9.0,128.0,0.71,0.56,0.56,0,4.2,77.0
2,0,150.0,133.0,140.0,134.0,31.952749,95.3,19.0,19.0,17.0,108.0,1.00,0.94,0.97,0,,25.0
3,0,185.0,185.0,140.0,134.0,22.635548,61.7,19.0,19.0,17.0,88.0,1.00,0.94,0.97,0,5.0,81.0
4,0,150.0,133.0,140.0,134.0,27.564749,80.0,19.0,19.0,17.0,108.0,1.00,0.94,0.97,0,,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130152,0,150.0,133.0,140.0,134.0,29.287256,90.0,19.0,19.0,17.0,108.0,1.00,0.94,0.97,0,,50.0
130153,0,139.0,139.0,140.0,134.0,29.653433,78.4,32.0,32.0,32.0,139.0,1.07,1.07,1.07,0,3.6,79.0
130154,1,346.0,346.0,140.0,134.0,32.265371,102.0,31.0,30.0,30.0,55.0,2.65,2.41,2.43,0,4.4,73.0
130155,0,156.0,137.0,140.0,134.0,24.408579,83.9,45.0,19.0,45.0,137.0,2.21,2.21,0.97,0,5.7,81.0


In [215]:
X = df_fillna.iloc[:,1:]
y = df_fillna['diabetes_mellitus']

In [216]:
X_train, X_hold, y_train, y_hold = train_test_split(X, y, test_size=0.20, random_state=18)

In [220]:
y_train

16444    0
96784    1
86146    1
86115    0
5525     0
        ..
1726     0
35653    0
91256    0
94483    0
60714    0
Name: diabetes_mellitus, Length: 104125, dtype: int64

In [221]:
X_train.to_csv('./data/processed_first_X_17feat.csv',index=False)
y_train.to_csv('./data/processed_first_y_17feat.csv',index=False)