# ICU Mortality Data Analysis

## Import packages

In [None]:
# Import required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Enable high DPI plotting: ask the inline backend to render figures in
# 'retina' format. This is an IPython magic, so it only runs inside
# IPython/Jupyter, not in a plain Python interpreter.
%config InlineBackend.figure_format = 'retina'

## Data import and cleaning

In [None]:
# Load the ICU mortality dataset; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer='ICU_Mortality.csv')

In [None]:
# Preview the first five rows (five is also head()'s default)
df.head(n=5)

In [None]:
# Report the dataset dimensions, then let pandas summarise the columns
print(f"Dataset shape: {df.shape}", "\nData types:", sep="\n")
df.info()

In [None]:
# Count the missing entries in each column (isna() is the canonical name
# behind the isnull() alias)
df.isna().sum()

In [None]:
# Drop every row that contains at least one missing value (in any column),
# then report how many rows were lost.
df_clean = df.dropna(axis=0, how='any')
n_rows_before = len(df)
n_rows_after = len(df_clean)
print(f"Original dataset shape: {df.shape}")
print(f"Cleaned dataset shape: {df_clean.shape}")
print(f"Rows removed: {n_rows_before - n_rows_after}")

## Data analysis and visualization

### Age (Numerical Variable)

In [None]:
# Descriptive statistics for Age. pandas' std() and var() use ddof=1 by
# default, i.e. they are the sample standard deviation and sample variance.
age_series = df_clean['Age']
age_mean, age_median = age_series.mean(), age_series.median()
age_std, age_var = age_series.std(), age_series.var()
age_q1, age_q3 = age_series.quantile(0.25), age_series.quantile(0.75)
age_iqr = age_q3 - age_q1  # width of the middle 50% of the data

print(
    f"Mean: {age_mean:.2f}\n"
    f"Standard Deviation: {age_std:.2f}\n"
    f"Variance: {age_var:.2f}\n"
    f"Median: {age_median:.2f}\n"
    f"Q1 (25th percentile): {age_q1:.2f}\n"
    f"Q3 (75th percentile): {age_q3:.2f}\n"
    f"Interquartile Range (IQR): {age_iqr:.2f}"
)

The output above reports the sample mean $\bar{X}$ and the sample standard deviation $s$ of Age, together with the median and the interquartile range (IQR), which measures the spread of the middle 50% of the data.

In [None]:
# Box-and-whisker plot of Age, drawn on an explicit Figure/Axes pair.
# `vert=True` is intentionally not passed: a vertical box is Matplotlib's
# default, and the `vert` keyword is being deprecated in favour of
# `orientation` (Matplotlib 3.10+), so omitting it keeps the same figure
# without triggering deprecation warnings.
fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(df_clean['Age'])
ax.set_ylabel('Age (years)')
ax.set_title('Box-and-Whisker Plot of Patient Age')
ax.grid(True, alpha=0.3)  # faint grid so values can be read off the axis
plt.show()

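The box-and-whisker plot shows the distribution of patient ages: the box spans the interquartile range (Q1 to Q3) with the median marked inside it, and the whiskers extend to the most extreme observations that lie within 1.5 × IQR of the quartiles. Any points beyond the whiskers are drawn individually as outliers.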

### WCC (Numerical Variable)

In [None]:
# Descriptive statistics for WCC. pandas' std() and var() use ddof=1 by
# default, i.e. they are the sample standard deviation and sample variance.
wcc_series = df_clean['WCC']
wcc_mean, wcc_median = wcc_series.mean(), wcc_series.median()
wcc_std, wcc_var = wcc_series.std(), wcc_series.var()
wcc_q1, wcc_q3 = wcc_series.quantile(0.25), wcc_series.quantile(0.75)
wcc_iqr = wcc_q3 - wcc_q1  # width of the middle 50% of the data

print(
    f"Mean: {wcc_mean:.2f}\n"
    f"Standard Deviation: {wcc_std:.2f}\n"
    f"Variance: {wcc_var:.2f}\n"
    f"Median: {wcc_median:.2f}\n"
    f"Q1 (25th percentile): {wcc_q1:.2f}\n"
    f"Q3 (75th percentile): {wcc_q3:.2f}\n"
    f"Interquartile Range (IQR): {wcc_iqr:.2f}"
)

The white cell count (WCC) is measured in $10^9$ cells per liter. The summary statistics above show the central tendency and variability of WCC values in the patient sample.

In [None]:
# Box-and-whisker plot of WCC, drawn on an explicit Figure/Axes pair.
# `vert=True` is intentionally not passed: a vertical box is Matplotlib's
# default, and the `vert` keyword is being deprecated in favour of
# `orientation` (Matplotlib 3.10+), so omitting it keeps the same figure
# without triggering deprecation warnings.
fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(df_clean['WCC'])
ax.set_ylabel('WCC ($10^9$ cells/L)')  # mathtext renders the exponent
ax.set_title('Box-and-Whisker Plot of White Cell Count')
ax.grid(True, alpha=0.3)  # faint grid so values can be read off the axis
plt.show()

The box-and-whisker plot visualizes the distribution of white cell counts across the patient population, showing the median, quartiles, and potential outliers.

### HB (Numerical Variable)

In [None]:
# Descriptive statistics for HB. pandas' std() and var() use ddof=1 by
# default, i.e. they are the sample standard deviation and sample variance.
hb_series = df_clean['HB']
hb_mean, hb_median = hb_series.mean(), hb_series.median()
hb_std, hb_var = hb_series.std(), hb_series.var()
hb_q1, hb_q3 = hb_series.quantile(0.25), hb_series.quantile(0.75)
hb_iqr = hb_q3 - hb_q1  # width of the middle 50% of the data

print(
    f"Mean: {hb_mean:.2f}\n"
    f"Standard Deviation: {hb_std:.2f}\n"
    f"Variance: {hb_var:.2f}\n"
    f"Median: {hb_median:.2f}\n"
    f"Q1 (25th percentile): {hb_q1:.2f}\n"
    f"Q3 (75th percentile): {hb_q3:.2f}\n"
    f"Interquartile Range (IQR): {hb_iqr:.2f}"
)

The hemoglobin (HB) level is measured in gram percentage (g%). The descriptive statistics provide insights into the distribution of hemoglobin levels among ICU patients.

In [None]:
# Box-and-whisker plot of HB, drawn on an explicit Figure/Axes pair.
# `vert=True` is intentionally not passed: a vertical box is Matplotlib's
# default, and the `vert` keyword is being deprecated in favour of
# `orientation` (Matplotlib 3.10+), so omitting it keeps the same figure
# without triggering deprecation warnings.
fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(df_clean['HB'])
ax.set_ylabel('HB (g%)')
ax.set_title('Box-and-Whisker Plot of Hemoglobin')
ax.grid(True, alpha=0.3)  # faint grid so values can be read off the axis
plt.show()

The box-and-whisker plot displays the hemoglobin distribution, highlighting the central tendency and variability in the patient cohort.

### Mortality (Categorical Variable)

In [None]:
# Absolute counts and proportions for each Mortality category.
# value_counts() lists categories from most to least frequent.
mortality_column = df_clean['Mortality']
mortality_freq = mortality_column.value_counts()
mortality_rel_freq = mortality_column.value_counts(normalize=True)

print(
    "Frequency:",
    mortality_freq,
    "\nRelative Frequency:",
    mortality_rel_freq,
    sep="\n",
)

The Mortality variable shows the distribution of patient outcomes in the ICU. The frequency table shows the count of patients who died versus survived, while the relative frequency expresses these as proportions of the total sample.

In [None]:
# Bar chart of the Mortality counts on an explicit Figure/Axes pair.
# NOTE(review): the colours are applied by bar position, and value_counts()
# orders the bars by descending frequency, so which category ends up red and
# which green depends on the data rather than on the label. Confirm that the
# resulting mapping is the intended one.
fig, ax = plt.subplots(figsize=(8, 6))
mortality_freq.plot.bar(ax=ax, color=['#E74C3C', '#27AE60'])
ax.set_xlabel('Mortality Status')
ax.set_ylabel('Frequency')
ax.set_title('Bar Plot of Mortality Status')
plt.setp(ax.get_xticklabels(), rotation=0)  # keep category labels horizontal
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
plt.show()

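The bar plot provides a visual comparison of mortality outcomes, showing the number of patients who survived versus those who died in the ICU. The intended colour coding is green for survival and red for death; because the colours are assigned by bar position and the bars are ordered by descending frequency, check the x-axis labels to confirm which outcome each bar represents.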

### Diabetes (Categorical Variable)

In [None]:
# Absolute counts and proportions for each Diabetes category.
# value_counts() lists categories from most to least frequent.
diabetes_column = df_clean['Diabetes']
diabetes_freq = diabetes_column.value_counts()
diabetes_rel_freq = diabetes_column.value_counts(normalize=True)

print(
    "Frequency:",
    diabetes_freq,
    "\nRelative Frequency:",
    diabetes_rel_freq,
    sep="\n",
)

The Diabetes variable categorizes patients into three groups: those without diabetes (No), those with Type I diabetes (insulin-dependent), and those with Type II diabetes (non-insulin-dependent). The frequencies show the distribution across these categories.

In [None]:
# Bar chart of the Diabetes counts on an explicit Figure/Axes pair.
# Colours are applied by bar position (bars follow value_counts() order,
# most frequent first), one colour per bar.
fig, ax = plt.subplots(figsize=(8, 6))
diabetes_freq.plot.bar(ax=ax, color=['#3498DB', '#E67E22', '#9B59B6'])
ax.set_xlabel('Diabetes Status')
ax.set_ylabel('Frequency')
ax.set_title('Bar Plot of Diabetes Status')
plt.setp(ax.get_xticklabels(), rotation=0)  # keep category labels horizontal
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
plt.show()

The bar plot illustrates the distribution of diabetes status among ICU patients, with different colors representing each diabetes category.

### Class (Categorical Variable)

In [None]:
# Absolute counts and proportions for each disease Class category.
# value_counts() lists categories from most to least frequent.
class_column = df_clean['Class']
class_freq = class_column.value_counts()
class_rel_freq = class_column.value_counts(normalize=True)

print(
    "Frequency:",
    class_freq,
    "\nRelative Frequency:",
    class_rel_freq,
    sep="\n",
)

The Class variable categorizes patients by the type of disease for which they were admitted to the ICU: infectious or non-infectious diseases.

In [None]:
# Bar chart of the disease Class counts on an explicit Figure/Axes pair.
# Colours are applied by bar position (bars follow value_counts() order,
# most frequent first), one colour per bar.
fig, ax = plt.subplots(figsize=(8, 6))
class_freq.plot.bar(ax=ax, color=['#1ABC9C', '#F39C12'])
ax.set_xlabel('Disease Class')
ax.set_ylabel('Frequency')
ax.set_title('Bar Plot of Disease Class')
plt.setp(ax.get_xticklabels(), rotation=0)  # keep category labels horizontal
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
plt.show()

The bar plot shows the distribution of disease classes among ICU admissions, comparing infectious versus non-infectious disease presentations.

## Conclusions

This comprehensive analysis of ICU mortality data has provided valuable insights into the characteristics of patients admitted to the Intensive Care Unit. The following key findings emerge from the analysis:

**Numerical Variables:**
- Age, white cell count (WCC), and hemoglobin (HB) levels were analyzed using descriptive statistics including measures of central tendency (mean, median) and variability (standard deviation, variance, IQR).
- Box-and-whisker plots revealed the distributions of these continuous variables, highlighting the spread and potential outliers in the patient population.

**Categorical Variables:**
- Mortality outcomes, diabetes status, and disease classification (infectious vs. non-infectious) were examined through frequency and relative frequency analyses.
- Bar plots provided clear visual representations of the distribution across categorical groups.

**Data Quality:**
- The dataset required minimal cleaning, with only a small number of rows containing missing values that were removed from the analysis.
- The cleaned dataset provided a robust foundation for statistical analysis and visualization.

These descriptive statistics and visualizations form the foundation for understanding the ICU patient population and could inform future predictive modeling efforts to identify risk factors associated with mortality in critically ill patients.