In [1]:
# Imports

import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

import psycopg2
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import pearsonr
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
import statsmodels.api as sm
from sklearn.model_selection import GridSearchCV


pd.set_option('display.max_rows', None) #magic command
pd.set_option('display.max_columns', None)



In [2]:
conn = psycopg2.connect("dbname=mimic user=mimic password=Mimic@4@plhi")
cur = conn.cursor()

In [3]:
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "Machine_learning_ch-4"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [4]:
def create_pandas_table(sql_query, database = conn):
    table = pd.read_sql_query(sql_query, database)
    return table

df_labevents_1 = create_pandas_table("""select itemid, label, loinc_code from mimic_hosp.d_labitems """)
print (df_labevents_1.shape)
df_labevents_1.head(1625)
# df_labevents_1['loinc_code'].isnull().sum

(1625, 3)


Unnamed: 0,itemid,label,loinc_code
0,51898,,
1,51532,11-Deoxycorticosterone,
2,51952,17-Hydroxycorticosteroids,
3,51953,"17-Ketosteroids, Urine",
4,52063,24 Hr,
5,51066,24 hr Calcium,
6,51067,24 hr Creatinine,
7,51068,24 hr Protein,
8,50853,25-OH Vitamin D,
9,51533,3t,


In [5]:
# dropping the Null values
df_labevents_1.dropna(inplace = True)
df_labevents_1

Unnamed: 0,itemid,label,loinc_code
12,51535,5' Nucleotidase,1690-7
17,52064,Absolute Basophil Count,704-7
20,51130,Absolute CD3 Count,8124-0
21,51131,Absolute CD4 Count,8128-1
23,51132,Absolute CD8 Count,8138-0
24,52068,Absolute Eosinophil Count,711-2
27,51133,Absolute Lymphocyte Count,731-0
29,52069,Absolute Monocyte Count,742-7
31,52070,Absolute Neutrophil Count,751-8
34,51134,Acanthocytes,7789-1


In [6]:
# df_labevents = create_pandas_table("""select subject_id, hadm_id, itemid, flag, value from mimic_hosp.labevents""")
# print (df_labevents.shape)
# df_labevents.head(10) 

In [7]:
# df_labevents.to_csv('df_labevents.csv', index = False)

In [8]:
df_labevents = pd.read_csv('df_labevents.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
itemid_label = df_labevents_1.set_index('itemid').to_dict()['label']
itemid_label

{51535: "5' Nucleotidase",
 52064: 'Absolute Basophil Count',
 51130: 'Absolute CD3 Count',
 51131: 'Absolute CD4 Count',
 51132: 'Absolute CD8 Count',
 52068: 'Absolute Eosinophil Count',
 51133: 'Absolute Lymphocyte Count',
 52069: 'Absolute Monocyte Count',
 52070: 'Absolute Neutrophil Count',
 51134: 'Acanthocytes',
 50856: 'Acetaminophen',
 50857: 'Acetone',
 50858: 'Acid Phosphatase',
 51539: 'Acid Phosphatase, Prostatic Fraction',
 51540: 'Adrenocorticotrophic Hormone',
 50861: 'Alanine Aminotransferase (ALT)',
 51542: '(Albumin)',
 50862: 'Albumin',
 51070: 'Albumin/Creatinine, Urine',
 51544: 'Aldosterone',
 50863: 'Alkaline Phosphatase',
 51545: 'Alpha-1',
 51547: 'Alpha-2',
 50864: 'Alpha-Fetoprotein',
 51462: 'Amorphous Crystals',
 50867: 'Amylase',
 50868: 'Anion Gap',
 51137: 'Anisocytosis',
 51556: 'Anti-Microsomal Antibodies',
 51557: 'Anti-Microsomal Antibody',
 50873: 'Anti-Nuclear Antibody',
 50874: 'Anti-Nuclear Antibody, Titer',
 50877: 'Anti-Thyroglobulin Antibodi

In [10]:
df_labevents['label'] = df_labevents['itemid'].map(itemid_label)

In [11]:
df_labevents.head(10)

Unnamed: 0,subject_id,hadm_id,itemid,flag,value,label
0,16047719,,51492,,,Protein
1,16117323,24152682.0,51492,,,Protein
2,16117323,23463879.0,51492,,,Protein
3,16117323,24152682.0,51492,,,Protein
4,16117323,24152682.0,51492,,,Protein
5,16045617,,51492,,,Protein
6,12403460,,51492,,,Protein
7,12403460,,51492,,,Protein
8,12403460,,51492,,,Protein
9,12403460,,51492,,,Protein


In [12]:
df_labevents.shape

(122289828, 6)

In [13]:
df_labevents.dropna(inplace = True)

In [14]:
df_labevents.shape

(19736755, 6)

In [15]:
# # df_labevents_ = df_labevents.loc[df_labevents['label'].isin(['Absolute Lymphocyte Count', 'Absolute Neutrophil Count', 'WBC',  'RBC', 'RDW', 'Basophils', 'Eosinophils','Neutrophils', 'Monocytes',
#                                                                   'Hematocrit', 'Hemoglobin', 'MCV', 'Platelet Count', 'Alanine Aminotransferase (ALT)', 'Asparate Aminotransferase (AST)', 'Alkaline Phosphatase','PT', 'Albumin', 'Globulin',
#                                                                  'Bilirubin, Total', 'Potassium', 'Sodium', 'Creatine Kinase (CK)', 'Cholesterol, HDL', 'Cholesterol, LDL, Calculated', 'Cholesterol, Total', 'C-Reactive Protein', 
#                                                                    'Creatinine', 'Urea Nitrogen', 'Uric Acid', 'Lactate Dehydrogenase (LD)',
#                                                     ])]

In [16]:
df_labevents.dtypes


subject_id      int64
hadm_id       float64
itemid          int64
flag           object
value          object
label          object
dtype: object

In [17]:
df_labevents['flag'] = df_labevents['flag'].fillna('normal')

In [18]:
df_labevents.isnull().sum()


subject_id    0
hadm_id       0
itemid        0
flag          0
value         0
label         0
dtype: int64

In [19]:
# x = pd.read_csv('df_admissions_data.csv')

In [20]:
# x.head(10)

In [21]:
# x.drop(['blood', 'circulatory','congenital', 'digestive', 'endocrine', 'genitourinary', 'infectious', 'injury', 'mental', 'misc','muscular','neoplasms', 'nervous', 'pregnancy', 'prenatal', 'respiratory', 'skin'], axis=1)
# x.head(10)

In [22]:
df_labevents['value'].value_counts()

1.2                                                                            198987
1.3                                                                            196023
21                                                                             182639
1.4                                                                            160122
1.5                                                                            151376
8.3                                                                            126054
8.2                                                                            118545
0.00                                                                           113685
8.1                                                                            110393
8.0                                                                            101773
1.6                                                                            101472
2.6                                                   

In [23]:
# convert datatype of values in column 'value' :- https://stackoverflow.com/questions/15891038/change-column-type-in-pandas
df_labevents["value"] = pd.to_numeric(df_labevents["value"], downcast = 'integer',  errors='coerce') # coerce invalid values to NaN as follows using the errors keyword argument

In [24]:
#Reshaping by pivoting DataFrame objects :- https://pandas-docs.github.io/pandas-docs-travis/user_guide/reshaping.html
df_labevents = df_labevents.pivot_table(index="subject_id", columns="label", values="value")

In [25]:
df_labevents.shape

(196833, 162)

In [26]:
df_labevents.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 196833 entries, 10000019 to 19999987
Columns: 162 entries, % Hemoglobin A1c to pH
dtypes: float64(162)
memory usage: 244.8 MB


In [27]:
df_labevents.head(50)

label,% Hemoglobin A1c,Absolute Basophil Count,Absolute CD3 Count,Absolute CD4 Count,Absolute CD8 Count,Absolute Eosinophil Count,Absolute Lymphocyte Count,Absolute Monocyte Count,Absolute Neutrophil Count,Acetaminophen,Alanine Aminotransferase (ALT),Albumin,"Albumin/Creatinine, Urine",Alkaline Phosphatase,Alpha-Fetoprotein,Amylase,Anion Gap,Anisocytosis,Anti-Thyroglobulin Antibodies,Asparate Aminotransferase (AST),Atypical Lymphocytes,Bacteria,Bands,Basophilic Stippling,Basophils,Bicarbonate,"Bilirubin, Direct","Bilirubin, Total",Blasts,C-Reactive Protein,"Calcium, Total",Calculated Free Testosterone,Calculated TBG,Carbamazepine,Carcinoembyronic Antigen (CEA),Chloride,"Cholesterol, HDL","Cholesterol, LDL, Calculated","Cholesterol, Total",Cortisol,Creatine Kinase (CK),"Creatine Kinase, MB Isoenzyme",Creatinine,Cyclosporin,DHEA-Sulfate,Digoxin,Eosinophils,Ethanol,Factor II,Factor IX,Factor V,Factor VII,Factor VIII,Factor X,Factor XI,Factor XII,Ferritin,Follicle Stimulating Hormone,Gamma Glutamyltransferase,Gentamicin,Globulin,Glucose,Granular Casts,Haptoglobin,Hematocrit,Hemoglobin,Hemoglobin A2,Hemoglobin C,Hemoglobin F,Hemogloblin S,Hepatitis B Virus Core Antibody,Homocysteine,Howell-Jolly Bodies,Human Chorionic Gonadotropin,Hyaline Casts,Hypersegmented Neutrophils,Hypochromia,INR(PT),Immunoglobulin A,Immunoglobulin G,Immunoglobulin M,Iron,Ketone,Lactate Dehydrogenase (LD),Leukocyte Alkaline Phosphatase,Lithium,Luteinizing Hormone,Lymphocytes,MCH,MCHC,MCV,Macrocytes,Magnesium,Microcytes,Monocytes,Neutrophils,Nucleated Red Cells,Other Cells,Ovalocytes,PT,PTT,Pappenheimer Bodies,Parathyroid Hormone,Phenobarbital,Phenytoin,"Phenytoin, Free",Phosphate,Platelet Count,Platelet Smear,Poikilocytosis,Polychromasia,Potassium,Procainamide,Prolactin,Promyelocytes,Prostate Specific Antigen,Protein,"Protein, Total",Protein/Creatinine Ratio,RBC,RBC Casts,RDW,Red Blood Cells,Rheumatoid Factor,Salicylate,Schistocytes,Sedimentation Rate,Serum Viscosity,Sex Hormone Binding Globulin,Sickle Cells,Sodium,Specific Gravity,Spherocytes,Target Cells,Teardrop Cells,Testosterone,"Testosterone, Free",Theophylline,Thyroglobulin,Thyroid Stimulating Hormone,Thyroxine (T4),"Thyroxine (T4), Free",Tobramycin,Transferrin,Triglycerides,Triiodothyronine (T3),Troponin T,Uptake Ratio,Urea Nitrogen,Uric Acid,"Urine Casts, Other",Urobilinogen,Valproic Acid,Vancomycin,Vitamin B12,Von Willebrand Factor Activity,Von Willebrand Factor Antigen,WBC,WBC Casts,Waxy Casts,White Blood Cells,pH
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1
10000019,,,,,,,,,,,,,,,,,,,,,,,,,,,0.4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,52.6,18.0,,,,,,,,,,,,,,,,,,,,,,25.0,34.8,,,,,,,66.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,16.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,20.3,
10000032,,,,,,,,,,,109.833333,2.7,,111.0,,,6.0,,,168.0,,,,,,21.0,,2.2,,,7.8,,,,,93.4,,,,,,,0.3,,,,,,,,,,,,,,,,,,,111.25,,,34.1,11.566667,,,,,,,,,,,,1.666667,,,,,,,,,,,34.933333,,101.333333,,,,,,,,,17.966667,37.6,,,,,,2.333333,106.166667,,,,5.716667,,,,,,,,,,15.88,3.43,,,,,,,,126.571429,,,,,,,,,,,,,,,,,,30.5,,,4.0,,,,,,13.0,,,,
10000074,,,,,,,,,,,,,,,,,,,,,,,,,,,,7.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10000084,,,,,,,,1.165,,,,,,39.0,,,,,,45.0,,,,,,19.0,,,,,,,,,,,,,,,,,,,,,0.5,,,,,,,,,,,,,,,108.0,,,39.1,13.08,,,,,,,,,,,,,,,,,,,,,,,32.2,,,,,,15.9,,,,,,23.4,,,,,,,,,,,5.8,,,,,,,,,,,4.13,,,,,,,,133.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10000117,,,,,,,,,,,,,,,,,21.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,106.0,,,45.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10000200,,,,,,,,,,,,,,,,,,,,,,,,,,,,11.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10000248,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,27.525,9.875,,,,,,,,,,,,,,,,,,,,,,17.6,,35.95,,,,,,72.3,,,,,,,,,,,,,,,,,,,,,,,,,,,3.3225,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10000306,,,,,,,,,,,,,,,,,,,,,,,,,,,,14.05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10000560,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,150.0,,,33.8,11.9,,,,,,,,,,,,,,,,,,,,,,,,35.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.82,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10000674,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [28]:
df_labevents.reset_index(level=0, inplace=True)

In [29]:
df_labevents.isnull().sum()

label
subject_id                              0
% Hemoglobin A1c                   183735
Absolute Basophil Count            185041
Absolute CD3 Count                 196100
Absolute CD4 Count                 195907
Absolute CD8 Count                 195943
Absolute Eosinophil Count          174316
Absolute Lymphocyte Count          171062
Absolute Monocyte Count            174204
Absolute Neutrophil Count          165933
Acetaminophen                      196199
Alanine Aminotransferase (ALT)     162474
Albumin                            158524
Albumin/Creatinine, Urine          195651
Alkaline Phosphatase               164934
Alpha-Fetoprotein                  196380
Amylase                            192123
Anion Gap                          155529
Anisocytosis                       196829
Anti-Thyroglobulin Antibodies      196787
Asparate Aminotransferase (AST)    157720
Atypical Lymphocytes               184838
Bacteria                           196827
Bands                       

In [32]:
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(strategy="mean")
# imputer.fit(df_labevents_)

# imputer.statistics_

# df_labevents_.mean().values

# X = imputer.transform(df_labevents_)

# df_labevents_ = pd.DataFrame(X, columns= df_labevents_.columns,
#                      index= df_labevents_.index)

In [33]:
df_labevents_.shape

NameError: name 'df_labevents_' is not defined

In [None]:
los_data = pd.read_csv('df_admissions_data.csv')

In [None]:
los_data.head(10)

In [None]:
df_labevents_['subject_id'].value_counts()

In [None]:
# Fix NaNs and file under 'UNKNOWN'
los_data['marital_status'] = los_data['marital_status'].fillna('Unknown')
los_data['marital_status'].value_counts(dropna=False)

In [None]:
# Compress the number of ethnicity categories

los_data['ethnicity'].replace(['UNABLE TO OBTAIN', 'OTHER',  
                         'UNKNOWN'], value='OTHER/UNKNOWN', inplace=True)

los_data['ethnicity'].value_counts()

In [None]:
los_data.drop(columns=[ 'hadm_id','admittime', 'admission_location','discharge_location', 'language','Deceased','deathtime'], inplace=True)


In [None]:
# Create dummy columns for categorical variables
prefix_cols = ['ADM', 'INS', 'ETH', 'MAR']
dummy_cols = ['admission_type', 'insurance','ethnicity',  'marital_status']
los_data = pd.get_dummies(los_data, prefix=prefix_cols, columns=dummy_cols)
los_data.info()

In [None]:
LOS_ = los_data.set_index('subject_id').to_dict()['LOS']
LOS_

In [None]:
df_labevents_['LOS'] = df_labevents_['subject_id'].map(LOS_)

In [None]:
df_labevents_.head(50)

In [None]:
df_labevents_['LOS'].isnull().sum()

In [None]:
df_labevents_ = df_labevents_.fillna(0)

In [None]:
df_labevents_ = df_labevents_[df_labevents_['LOS'] >0]

In [None]:
df_labevents_.head(50)

In [None]:
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(strategy="mean")
# imputer.fit(df_labevents_['LOS'])

# imputer.statistics_

# df_labevents_['LOS'].mean().values

# X = imputer.transform(df_labevents_['LOS'])

# df_labevents_ = pd.DataFrame(X, columns= df_labevents_.columns,
#                      index= df_labevents_.index)

In [None]:
df_labevents_.head(50)

In [None]:
df_labevents_.isnull().sum()

In [None]:
main_data = df_labevents_

In [None]:
main_data.info()

In [None]:
main_data.shape

In [None]:
# main_data.to_csv('main_data.csv', index = False)

In [None]:
main_data_corr=main_data.corr()
f,ax=plt.subplots(figsize=(18,15))
sns.heatmap(main_data_corr, cmap='viridis')
plt.title("Correlation between features", 
          weight='bold', 
          fontsize=18)
plt.xticks(weight='bold')
plt.yticks(weight='bold')

plt.show()


In [None]:
main_data_corr["LOS"].sort_values(ascending=False)

In [None]:
main_data.describe()

In [None]:
main_data = main_data.drop(columns=['subject_id'])

In [None]:
# numlist = ['Alkaline Phosphatase ', 'RDW','Lactate Dehydrogenase (LD) ','MCV','Urea Nitrogen','Monocytes', 'Potassium', 'Sodium', 'PT', 
#           'Platelet Count','Neutrophils','Eosinophils', 'Creatinine','Hemoglobin', 'Hematocrit', 'Albumin ', 'Absolute Lymphocyte Count ', 'Absolute Neutrophil Count ',
#           'RBC','C-Reactive Protein', 'WBC ','Bilirubin, Total', 'Uric Acid','Alanine Aminotransferase (ALT)', 'Globulin ',
#           'Asparate Aminotransferase (AST)','Creatine Kinase (CK)', 'Cholesterol, Total ', 'Cholesterol, HDL','Cholesterol, LDL, Calculated',
#           'Basophils', 'LOS']
# for i in main_data.columns:
#     if i in numlist:
#         upper = main_data[i].mean() + 2*main_data[i].std()
#         lower = main_data[i].mean() - 2*main_data[i].std()
#         main_data[i] = np.where(main_data[i] > upper, upper,
#                                np.where(main_data[i] < lower, lower, main_data[i]))  

In [None]:
main_data.describe()

In [None]:
main_data.hist(figsize=(20,15))
plt.show()

In [None]:
main_data = main_data[main_data['Alkaline Phosphatase'] <= main_data['Alkaline Phosphatase'].quantile(0.97)]
main_data = main_data[main_data['LOS'] <= main_data['LOS'].quantile(0.97)]
print (main_data.shape) 

In [None]:
main_data.describe()

In [None]:
main_data.to_csv('main_data.csv', index = False)