In [128]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv("../datasets/sepsis.csv")

# Drop truly unnecessary columns
columns_to_drop = ["Unnamed: 0", "Patient_ID", "HospAdmTime"]
df.drop(columns=[col for col in columns_to_drop if col in df.columns], inplace=True, errors="ignore")

# Ensure SepsisLabel exists
if "SepsisLabel" not in df.columns:
    raise ValueError("SepsisLabel column is missing!")

# Step 1: Handling Missing Values
# Drop columns only if more than **85%** of values are missing (previously 70%)
threshold = len(df) * 0.15  # Allow more missing data before dropping
missing_percent = df.isnull().mean() * 100
columns_to_drop = missing_percent[missing_percent > 85].index
df.drop(columns=columns_to_drop, inplace=True)

# Fill missing values using median/mode imputation
numeric_cols = df.select_dtypes(include=np.number).columns

for col in numeric_cols:
    if df[col].isnull().sum() > 0:
        if df[col].nunique() > 10:  # Continuous variable - use median
            df[col].fillna(df[col].median(), inplace=True)
        else:  # Categorical/low unique values - use mode
            df[col].fillna(df[col].mode()[0], inplace=True)

# Step 2: Ensure Sepsis Cases Are Not Removed
sepsis_cases = df[df["SepsisLabel"] == 1]
non_sepsis_cases = df[df["SepsisLabel"] == 0]

# Check the ratio of SepsisLabel=1 cases
sepsis_ratio = len(sepsis_cases) / len(df)
print(f"Sepsis Cases Ratio: {sepsis_ratio:.4f}")

# Ensure at least a **1:10 ratio of Sepsis cases** (no extreme undersampling)
if sepsis_ratio < 0.1:  
    non_sepsis_cases = non_sepsis_cases.sample(n=min(len(sepsis_cases) * 10, len(df[df["SepsisLabel"] == 0])), random_state=42)

# Combine back
df = pd.concat([sepsis_cases, non_sepsis_cases]).sample(frac=1, random_state=42)

# Step 3: Final Checks
print("Final SepsisLabel Distribution:")
print(df["SepsisLabel"].value_counts(normalize=True) * 100)

# Save cleaned data
df.to_csv("../datasets/sepsis_cleaned.csv", index=False)
print("✅ Data cleaning complete. Saved as 'cleaned_dataset.csv'.")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

Sepsis Cases Ratio: 0.0180
Final SepsisLabel Distribution:
SepsisLabel
0    90.909091
1     9.090909
Name: proportion, dtype: float64
✅ Data cleaning complete. Saved as 'cleaned_dataset.csv'.
