In [22]:
import os
os.makedirs("data", exist_ok=True)


In [23]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder



In [24]:
df = pd.read_csv("diabetes_dataset.csv")

# clean column names
df.columns = (
    df.columns.str.strip()
              .str.lower()
              .str.replace(":", "_")
              .str.replace("-", "_")
              .str.replace(" ", "_")
)

df.head()


Unnamed: 0,year,gender,age,location,race_africanamerican,race_asian,race_caucasian,race_hispanic,race_other,hypertension,heart_disease,smoking_history,bmi,hba1c_level,blood_glucose_level,diabetes
0,2020,Female,32.0,Alabama,0,0,0,0,1,0,0,never,27.32,5.0,100,0
1,2015,Female,29.0,Alabama,0,1,0,0,0,0,0,never,19.95,5.0,90,0
2,2015,Male,18.0,Alabama,0,0,0,0,1,0,0,never,23.76,4.8,160,0
3,2015,Male,41.0,Alabama,0,0,1,0,0,0,0,never,27.32,4.0,159,0
4,2016,Female,52.0,Alabama,1,0,0,0,0,0,0,never,23.75,6.5,90,0


In [25]:
df = df.drop(columns=["location"], errors="ignore")


In [26]:
# gender to numeric
df["gender"] = df["gender"].map({"Male": 1, "Female": 0, "Other": 2})

# smoking_history one-hot encode
df["smoking_history"] = df["smoking_history"].str.lower().str.replace(" ", "_")
df = pd.get_dummies(df, columns=["smoking_history"], drop_first=True)


In [27]:
imputer = SimpleImputer(strategy="median")
df[df.columns] = imputer.fit_transform(df)


In [28]:
scaler = StandardScaler()
df[df.columns] = scaler.fit_transform(df)


In [29]:
print(df.info())
print(df.describe())
print(df["diabetes"].value_counts(normalize=True))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 19 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   year                         100000 non-null  float64
 1   gender                       100000 non-null  float64
 2   age                          100000 non-null  float64
 3   race_africanamerican         100000 non-null  float64
 4   race_asian                   100000 non-null  float64
 5   race_caucasian               100000 non-null  float64
 6   race_hispanic                100000 non-null  float64
 7   race_other                   100000 non-null  float64
 8   hypertension                 100000 non-null  float64
 9   heart_disease                100000 non-null  float64
 10  bmi                          100000 non-null  float64
 11  hba1c_level                  100000 non-null  float64
 12  blood_glucose_level          100000 non-null  float64
 13  

In [30]:
# Separate features and target before scaling
y = df["diabetes"]
X = df.drop(columns=["diabetes"])

# Scale only features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Recombine features + unscaled target
df_clean = pd.concat([X_scaled, y], axis=1)

# Save cleaned dataset
df_clean.to_csv("data/cleaned_diabetes_data.csv", index=False)
print("Cleaned dataset saved to data/cleaned_diabetes_data.csv")

# Quick check
print(df_clean["diabetes"].value_counts(normalize=True))


Cleaned dataset saved to data/cleaned_diabetes_data.csv
diabetes
-0.304789    0.915
 3.280961    0.085
Name: proportion, dtype: float64


In [31]:
# Unscale diabetes target (convert back to 0 and 1)
df_clean["diabetes"] = (df_clean["diabetes"] > 0).astype(int)
print(df_clean["diabetes"].value_counts(normalize=True))


diabetes
0    0.915
1    0.085
Name: proportion, dtype: float64


In [32]:
df_clean.to_csv("data/cleaned_diabetes_data.csv", index=False)
print("✅ Cleaned dataset saved with unscaled target.")


✅ Cleaned dataset saved with unscaled target.
