# Diabetes Dataset

In [None]:
# Generate a synthetic dataset
import pandas as pd
import numpy as np

# Fixed seed so every run regenerates exactly the same dataset.
np.random.seed(42)
n = 1000  # number of synthetic records to generate

# NOTE: every np.random call below draws from the same global stream, so the
# order of these statements determines the generated values. Reordering or
# inserting a draw changes the whole dataset.

# --- Basic demographic and health features ---
age = np.random.randint(18, 90, size=n)  # upper bound is exclusive: 18..89
# BMI is drawn from N(27, 5), rounded to one decimal, then limited to [15, 50].
bmi = np.round(np.random.normal(27, 5, size=n), 1)
bmi = np.clip(bmi, 15, 50)

# Lifestyle and health conditions (categorical, sampled independently)
inactivity_levels = np.random.choice(
    ["Low", "Moderate", "High"], size=n, p=[0.4, 0.35, 0.25]
)
family_history = np.random.choice(["Yes", "No"], size=n, p=[0.4, 0.6])
sex = np.random.choice(["Male", "Female"], size=n)

# Hypertension correlated with age and BMI: a logistic curve centred on
# age 50 / BMI 27 gives the per-record probability, then one uniform draw per
# record decides "Yes" or "No".
hypertension_prob = 1 / (1 + np.exp(-0.08 * (age - 50) - 0.15 * (bmi - 27)))
hypertension = np.where(np.random.rand(n) < hypertension_prob, "Yes", "No")

# 0/1 indicator arrays reused by the formulas below.
is_high_inactivity = (inactivity_levels == "High").astype(int)
is_moderate_inactivity = (inactivity_levels == "Moderate").astype(int)
has_family_history = (family_history == "Yes").astype(int)
has_hypertension = (hypertension == "Yes").astype(int)

# --- Additional correlated features ---
# Diet score (lower if inactivity is high or BMI is high), with unit-variance
# noise, limited to the 1..10 scale.
diet_noise = np.random.normal(0, 1, size=n)
diet_score = np.clip(
    10
    - 0.05 * (bmi - 25)
    - 2 * is_high_inactivity
    - 1 * is_moderate_inactivity
    - diet_noise,
    1,
    10,
)

# Blood glucose (correlated with BMI, age, hypertension, diet, family history)
# around a baseline of 80, with N(0, 10) noise, limited to [60, 250].
blood_glucose = (
    80
    + 0.6 * (bmi - 25)
    + 0.3 * (age - 40)
    + 15 * has_hypertension
    - 2 * (diet_score - 5)
    + 10 * has_family_history
    + np.random.normal(0, 10, size=n)
)
blood_glucose = np.clip(blood_glucose, 60, 250)

# --- Risk score combining all factors ---
# Deterministic weighted sum; no additional randomness is drawn here.
risk_score = (
    0.03 * (age - 18)
    + 0.5 * (bmi - 18)
    + 10 * is_high_inactivity
    + 5 * is_moderate_inactivity
    + 15 * has_family_history
    + 10 * has_hypertension
    + 0.4 * (blood_glucose - 100)
    - 3 * (diet_score - 5)
)

# Normalize and categorize risk: standardise the score (z-score), squash it
# through a sigmoid into (0, 1), then bucket with fixed thresholds.
risk_prob = 1 / (1 + np.exp(-(risk_score - np.mean(risk_score)) / np.std(risk_score)))

diabetes_risk = np.where(
    risk_prob > 0.7, "High",
    np.where(risk_prob > 0.4, "Intermediate", "Low")
)

# --- Final dataset ---
# Continuous derived columns are rounded to one decimal for the output table;
# the unrounded arrays above are left untouched.
df = pd.DataFrame({
    "Age": age,
    "BMI": bmi,
    "Physical_Inactivity": inactivity_levels,
    "Family_History": family_history,
    "Hypertension": hypertension,
    "Sex": sex,
    "Blood_Glucose": np.round(blood_glucose, 1),
    "Diet_Score": np.round(diet_score, 1),
    "Diabetes_Risk": diabetes_risk
})

# Save as CSV (written relative to the current working directory).
output_path = "synthetic_diabetes_risk_dataset_v2.csv"
df.to_csv(output_path, index=False)

# The check mark was previously stored as mojibake (UTF-8 bytes mis-decoded
# as cp1252); the intended character is restored here.
print(f"✅ Synthetic dataset saved as '{output_path}'")
print(df.head())
