## Imports

In [8]:
import pandas as pd
from sklearn.impute import SimpleImputer # Used to replace missing values in the dataset
from sklearn.preprocessing import OneHotEncoder, StandardScaler # These classes are used for preprocessing the data before training a machine learning model.
from sklearn.compose import ColumnTransformer # Allows you to apply different transformations to different columns in your dataset.
from sklearn.model_selection import train_test_split #  used to split the dataset into training and testing sets.
from sklearn.svm import SVC # Imports SVC supervised learning model used for classification tasks
from sklearn.metrics import accuracy_score, classification_report # These classes are used to evaluate the model's performance.

## Loading Dataset 4

In [9]:

ds4 = pd.read_csv('dataset4.csv')
ds4.head()

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,66154,25312,118,0,68.0,22.73,0,Caucasian,M,180.3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,114252,59342,81,0,77.0,27.42,0,Caucasian,F,160.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,119783,50777,118,0,25.0,31.95,0,Caucasian,F,172.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,79267,46918,118,0,81.0,22.64,1,Caucasian,F,165.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,92056,34377,33,0,19.0,,0,Caucasian,M,188.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma


## Missing data

In [10]:
missing_values = ds4.isnull().sum() # Returns the number of missing values for each column
missing_values = missing_values[missing_values > 0] # Removes columns with no missing values.
missing_values_percentage = missing_values / len(ds4) * 100 # Calculate percentage of missing values
missing_values_percentage_sorted = missing_values_percentage.sort_values(ascending=False) # Sort by percentage descending
missing_values_percentage_sorted.head(20)  # Display top 10 columns with missing values

h1_bilirubin_max        92.265001
h1_bilirubin_min        92.265001
h1_lactate_min          91.992411
h1_lactate_max          91.992411
h1_albumin_max          91.398166
h1_albumin_min          91.398166
h1_pao2fio2ratio_min    87.441257
h1_pao2fio2ratio_max    87.441257
h1_arterial_ph_min      83.329517
h1_arterial_ph_max      83.329517
h1_hco3_min             82.969699
h1_hco3_max             82.969699
h1_arterial_pco2_min    82.822501
h1_arterial_pco2_max    82.822501
h1_wbc_max              82.815958
h1_wbc_min              82.815958
h1_arterial_po2_max     82.807236
h1_arterial_po2_min     82.807236
h1_calcium_min          82.717826
h1_calcium_max          82.717826
dtype: float64

In [5]:
# Separating numeric and categorical columns
numeric_columns = ds4.select_dtypes(include=['number']).columns
categorical_columns = ds4.select_dtypes(exclude=['number']).columns

# Imputing numeric columns with median
numeric_data = ds4[numeric_columns]
imputer_numeric = SimpleImputer(strategy='median') # Median strategy to impute missing values in numeric columns.
numeric_data_imputed = pd.DataFrame(imputer_numeric.fit_transform(numeric_data), columns=numeric_columns)

# Imputing categorical columns with mode (most frequent)
categorical_data = ds4[categorical_columns]
imputer_categorical = SimpleImputer(strategy='most_frequent')
categorical_data_imputed = pd.DataFrame(imputer_categorical.fit_transform(categorical_data), columns=categorical_columns)

# Merging numeric and categorical data back together
data_preprocessed = pd.concat([numeric_data_imputed, categorical_data_imputed], axis=1)

# Checking if all missing values are addressed
print(data_preprocessed.isnull().sum().max())  # should be 0 if no missing values remain


0


## Data Preprocessing and Train-Test Splitting for Classification Model

In [11]:
y = data_preprocessed["hospital_death"]  # Target variable
X = data_preprocessed.drop("hospital_death", axis=1)  # Features

# Defining numeric and categorical column indices
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Defining the column transformer with one-hot encoding for categorical variables and scaling for numeric variables
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns), # standardize the numeric features.
        ('cat', OneHotEncoder(), categorical_columns) #  encode categorical variables using one-hot encoding.
    ],
    remainder='passthrough'  # Pass through any columns not specified in transformers
)

# Applying the transformations
X_processed = preprocessor.fit_transform(X) # transformation scales numeric features and encodes categorical features

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Displaying the shapes of the train and test sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((73370, 237), (18343, 237), (73370,), (18343,))

## SVM Model

In [13]:
# Training a basic SVM model
svm_model = SVC() #  Initializes a support vector classifier.
svm_model.fit(X_train, y_train) #  Learns the parameters of the SVM model to best separate the data into different classes.

# Uses the trained SVM model to make predictions on the test data
y_pred = svm_model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Evaluating the model: accuracy and classification report(precision, recall, f1-score, support)
accuracy, classification_rep

(0.9276563266641226,
 '              precision    recall  f1-score   support\n\n         0.0       0.93      0.99      0.96     16756\n         1.0       0.79      0.22      0.35      1587\n\n    accuracy                           0.93     18343\n   macro avg       0.86      0.61      0.65     18343\nweighted avg       0.92      0.93      0.91     18343\n')

In [16]:
# Displaying the classification report as a dataframe
classification_rep = classification_report(y_test, y_pred, output_dict=True)
df_classification_rep = pd.DataFrame(classification_rep)
df_report = pd.DataFrame(classification_rep).transpose()
df_report

Unnamed: 0,precision,recall,f1-score,support
0.0,0.93105,0.99445,0.961706,16756.0
1.0,0.79148,0.222432,0.34727,1587.0
accuracy,0.927656,0.927656,0.927656,0.927656
macro avg,0.861265,0.608441,0.654488,18343.0
weighted avg,0.918975,0.927656,0.908546,18343.0
