### Imports

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer # Used to replace missing values in the dataset
from sklearn.compose import ColumnTransformer # Allows to apply different transformations to different columns on the dataset.
from sklearn.preprocessing import OneHotEncoder, StandardScaler # These classes are used for preprocessing the data before training a machine learning model.
from sklearn.model_selection import train_test_split #  used to split the dataset into training and testing sets.
from sklearn.naive_bayes import GaussianNB # Used to train a Naive Bayes model.
from sklearn.metrics import accuracy_score, classification_report # Used to evaluate the model.

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Loading Dataset 4

In [2]:
ds4 = pd.read_csv('dataset4.csv').drop(columns=["encounter_id", "patient_id"])
ds4.head()

Unnamed: 0,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,118,0,68.0,22.73,0,Caucasian,M,180.3,Floor,Floor,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,81,0,77.0,27.42,0,Caucasian,F,160.0,Floor,Floor,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,118,0,25.0,31.95,0,Caucasian,F,172.7,Emergency Department,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,118,0,81.0,22.64,1,Caucasian,F,165.1,Operating Room,Operating Room / Recovery,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,33,0,19.0,,0,Caucasian,M,188.0,,Accident & Emergency,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma


### Missing data

In [3]:
missing_values = ds4.isnull().sum() # Returns the number of missing values for each column
missing_values = missing_values[missing_values > 0] # Removes columns with no missing values.
missing_values_percentage = missing_values / len(ds4) * 100 # Calculate percentage of missing values
missing_values_percentage_sorted = missing_values_percentage.sort_values(ascending=False) # Sort by percentage descending
missing_values_percentage_sorted.head(20)  # Display top 10 columns with missing values

h1_bilirubin_max        92.265001
h1_bilirubin_min        92.265001
h1_lactate_min          91.992411
h1_lactate_max          91.992411
h1_albumin_max          91.398166
h1_albumin_min          91.398166
h1_pao2fio2ratio_min    87.441257
h1_pao2fio2ratio_max    87.441257
h1_arterial_ph_min      83.329517
h1_arterial_ph_max      83.329517
h1_hco3_min             82.969699
h1_hco3_max             82.969699
h1_arterial_pco2_min    82.822501
h1_arterial_pco2_max    82.822501
h1_wbc_max              82.815958
h1_wbc_min              82.815958
h1_arterial_po2_max     82.807236
h1_arterial_po2_min     82.807236
h1_calcium_min          82.717826
h1_calcium_max          82.717826
dtype: float64

In [4]:
# Separating numeric and categorical columns
numeric_columns = ds4.select_dtypes(include=['number']).columns
categorical_columns = ds4.select_dtypes(exclude=['number']).columns

# Imputing numeric columns with median
numeric_data = ds4[numeric_columns]
imputer_numeric = SimpleImputer(strategy='median') # Median strategy to impute missing values in numeric columns.
numeric_data_imputed = pd.DataFrame(imputer_numeric.fit_transform(numeric_data), columns=numeric_columns)

# Imputing categorical columns with mode (most frequent)
categorical_data = ds4[categorical_columns]
imputer_categorical = SimpleImputer(strategy='most_frequent')
categorical_data_imputed = pd.DataFrame(imputer_categorical.fit_transform(categorical_data), columns=categorical_columns)

# Merging numeric and categorical data back together
data_preprocessed = pd.concat([numeric_data_imputed, categorical_data_imputed], axis=1)

# Checking if all missing values are addressed
data_preprocessed.isnull().sum().max()  # should be 0 if no missing values remain

0

### Data Preprocessing and Train-Test Splitting for Classification Model

In [5]:
y = data_preprocessed["hospital_death"]  # Target variable
X = data_preprocessed.drop("hospital_death", axis=1)  # Features

# Defining numeric and categorical column indices
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Defining the column transformer with one-hot encoding for categorical variables and scaling for numeric variables
preprocessor = ColumnTransformer(
	transformers=[
		('num', StandardScaler(), numeric_columns), # standardize the numeric features.
		('cat', OneHotEncoder(), categorical_columns) #  encode categorical variables using one-hot encoding.
	],
	remainder='passthrough'  # Pass through any columns not specified in transformers
)

# Applying the transformations
X_processed = preprocessor.fit_transform(X) # transformation scales numeric features and encodes categorical features

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Displaying the shapes of the train and test sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((73370, 235), (18343, 235), (73370,), (18343,))

### Naive Bayes Model

In [6]:
gnb = GaussianNB()
gnb_model = gnb.fit(X_train, y_train)
y_pred = gnb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [7]:
accuracy

0.8132802704028785

In [8]:
confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

Predicted,0.0,1.0
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,13918,2838
1.0,587,1000


In [9]:
# Displaying the classification report as a dataframe
classification_rep = classification_report(y_test, y_pred, output_dict=True)
df_classification_rep = pd.DataFrame(classification_rep)
df_report = pd.DataFrame(classification_rep).transpose()
df_report

Unnamed: 0,precision,recall,f1-score,support
0.0,0.959531,0.830628,0.890439,16756.0
1.0,0.260552,0.63012,0.368664,1587.0
accuracy,0.81328,0.81328,0.81328,0.81328
macro avg,0.610042,0.730374,0.629551,18343.0
weighted avg,0.899057,0.81328,0.845296,18343.0
