Age: Integer 
Gender: Categorical 
Hypertension: Boolean 
Heart Disease: Boolean 
Smoking History: Enum 
BMI: Float 
HbA1c: Float 
Random Glucose: Float 
I want you to write a Python script that does a few things with this CSV database: 1) Removes any row that has a NULL value. 2) For Gender, maps Male to 0 and Female to 1. 3) For Smoking History, maps never = 0, current = 1, past = 2. 4) For HbA1c, maps the float to these values: (0, 4) = 0, [4, 5.6) = 1, [5.6, 6.4) = 2, [6.4, 7) = 3, [7, 15) = 4. 5) For Random Glucose, maps the float to these values: (30, 70) = 0, [70, 140) = 1, [140, 200) = 2, [200, 500) = 3.

If the script detects any other columns, it should remove them outright.

In [None]:
import pandas as pd

# ============================================================
# 🧠 Diabetes Dataset Cleaner
# ============================================================

# -----------------------------
# Input / Output paths
# -----------------------------
# Default CSV locations handed to clean_diabetes_data() when this file is
# executed as a script: the raw file to read and the cleaned file to write.
INPUT_CSV = "cleaned_patients_data.csv"
OUTPUT_CSV = "cleaned_dataset.csv"

# Text labels accepted when a measurement column already holds categories
# instead of raw numbers (matched case- and whitespace-insensitively).
_HBA1C_LABELS = {"low": 0, "normal": 1, "high": 2, "very high": 3}
_GLUCOSE_LABELS = {
    "low": 0,
    "normal": 1,
    "high": 2,
    "elevated": 2,
    "very high": 3,
}

# Lower bounds of the half-open numeric bins [low, high) from the dataset
# specification:
#   HbA1c:          <4 -> 0, [4, 5.6) -> 1, [5.6, 6.4) -> 2, [6.4, 7) -> 3, >=7 -> 4
#   Random Glucose: <70 -> 0, [70, 140) -> 1, [140, 200) -> 2, >=200 -> 3
# NOTE(review): the specification bounds the outer bins at (0, 15) and
# (30, 500); readings outside those limits are placed in the nearest category
# rather than discarded -- confirm this is the desired handling.
_HBA1C_THRESHOLDS = (4, 5.6, 6.4, 7)
_GLUCOSE_THRESHOLDS = (70, 140, 200)


def _encode_text(series, mapping, default=0):
    """Map text labels to integer codes, ignoring case and outer whitespace."""
    cleaned = series.astype(str).str.strip().str.lower()
    return cleaned.map(mapping).fillna(default).astype(int)


def _encode_measurement(series, thresholds, labels, default=1):
    """Encode a measurement column that holds numbers or category labels."""
    numeric = pd.to_numeric(series, errors="coerce")
    edges = [float("-inf"), *thresholds, float("inf")]
    # right=False makes every bin closed on the left: [low, high).
    binned = pd.cut(numeric, bins=edges, right=False, labels=False).astype(float)
    # Non-numeric cells fall back to the label lookup.
    labelled = (
        series.astype(str).str.strip().str.lower().map(labels).astype(float)
    )
    return binned.fillna(labelled).fillna(default).astype(int)


def clean_diabetes_data(
    input_path: str, output_path: str = "cleaned_dataset.csv"
) -> pd.DataFrame:
    """
    Clean and encode a diabetes dataset CSV.

    Steps, in the order they are applied:
    1) Keep only the expected columns (patient_id and Index are retained
       unchanged); every other column is dropped.
    2) Remove rows that contain a NULL in any remaining column.
    3) Encode Gender (male=0, female=1) and Smoking History
       (never=0, current=1, past=2). Unrecognised values become 0.
    4) Encode HbA1c and Random Glucose. Numeric readings are binned into the
       half-open ranges listed next to the module-level thresholds; text
       labels ("low", "normal", "high", ...) are mapped directly.
       Values that are neither become 1 ("normal").
    5) Write the result to ``output_path`` without the DataFrame index.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path of the CSV file to write.

    Returns:
        The cleaned and encoded DataFrame that was written to disk.
    """

    # ------------------------------------------------------------
    # Load data
    # ------------------------------------------------------------
    df = pd.read_csv(input_path)
    print(f"✅ Loaded dataset with {df.shape[0]} rows and {df.shape[1]} columns")

    # ------------------------------------------------------------
    # Keep only expected columns (including Index and patient_id)
    # ------------------------------------------------------------
    expected_cols = [
        "patient_id", "Index", "Age", "Gender", "Hypertension", "Heart Disease",
        "Smoking History", "BMI", "HbA1c", "Random Glucose",
    ]
    # Selecting before dropna() means NULLs in discarded columns do not
    # cause otherwise complete rows to be removed.
    df = df[[c for c in expected_cols if c in df.columns]]
    print(f"✅ Kept {len(df.columns)} relevant columns (including identifiers)")

    # ------------------------------------------------------------
    # Drop rows with NULLs
    # ------------------------------------------------------------
    df = df.dropna()
    print(f"✅ Dropped rows with missing values → {df.shape[0]} rows left")

    # ------------------------------------------------------------
    # Encode categorical text columns
    # ------------------------------------------------------------
    if "Gender" in df.columns:
        df["Gender"] = _encode_text(df["Gender"], {"male": 0, "female": 1})

    if "Smoking History" in df.columns:
        df["Smoking History"] = _encode_text(
            df["Smoking History"], {"never": 0, "current": 1, "past": 2}
        )

    # ------------------------------------------------------------
    # Encode measurement columns (numeric bins or text labels)
    # ------------------------------------------------------------
    if "HbA1c" in df.columns:
        df["HbA1c"] = _encode_measurement(
            df["HbA1c"], _HBA1C_THRESHOLDS, _HBA1C_LABELS
        )

    if "Random Glucose" in df.columns:
        df["Random Glucose"] = _encode_measurement(
            df["Random Glucose"], _GLUCOSE_THRESHOLDS, _GLUCOSE_LABELS
        )

    # ------------------------------------------------------------
    # Save cleaned dataset (keeping Index and patient_id as-is)
    # ------------------------------------------------------------
    df.to_csv(output_path, index=False)
    print(f"✅ Cleaned dataset saved to '{output_path}' ({df.shape[0]} rows)")

    return df


# ============================================================
# 🧩 Run script directly
# ============================================================
if __name__ == "__main__":
    # Script entry point: clean the default input CSV and write the result
    # to the default output path.
    clean_diabetes_data(input_path=INPUT_CSV, output_path=OUTPUT_CSV)

✅ Loaded dataset with 300 rows and 10 columns
✅ Kept 9 relevant columns (including identifiers)
✅ Dropped rows with missing values → 300 rows left
✅ Cleaned dataset saved to 'cleaned_dataset.csv' (300 rows)
