In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from src.med_project.config import RAW_DATA_DIR

# Read the raw dataset and discard the `diabetes_stage` column straight away.
# NOTE(review): presumably dropped because it mirrors the diagnosis target — confirm.
raw_csv_path = RAW_DATA_DIR / 'diabetes_dataset.csv'
diabetes = pd.read_csv(raw_csv_path).drop(columns=['diabetes_stage'])

## Looking for outliers: values that are logically impossible or data-entry errors.

In [None]:
# Each check below builds a boolean mask over `diabetes`, keeps the matching rows
# in a dedicated frame (their indexes are reused when deleting), and reports the count.

# Blood pressure: systolic must exceed diastolic and lie within 60-300.
systolic = diabetes['systolic_bp']
diastolic = diabetes['diastolic_bp']
bp_mask = (systolic <= diastolic) | (systolic < 60) | (systolic > 300)
impossible_bp = diabetes.loc[bp_mask]
print(f"BP (systolic must be higher than diastolic, out of possible range): {len(impossible_bp)} rows.")

# Cholesterol: total may fall at most 20 units short of HDL + LDL before being flagged.
hdl_plus_ldl = diabetes['hdl_cholesterol'] + diabetes['ldl_cholesterol']
cholesterol_mask = diabetes['cholesterol_total'] < hdl_plus_ldl - 20
cholesterol_error = diabetes.loc[cholesterol_mask]
print(f"Cholesterol (Total < HDL+LDL): {len(cholesterol_error)} rows.")

# Sleep hours, age and screen time: values outside their plausible ranges.
# Note that zero sleep is flagged, while zero screen time is accepted.
sleep_hours = diabetes['sleep_hours_per_day']
age = diabetes['age']
screen_hours = diabetes['screen_time_hours_per_day']

sleep_error = diabetes.loc[(sleep_hours <= 0) | (sleep_hours > 24)]
age_error = diabetes.loc[(age < 0) | (age > 110)]
screentime_error = diabetes.loc[(screen_hours < 0) | (screen_hours > 24)]

print(f"Sleep hours per day error(<=0 or >24h): {len(sleep_error)} rows.")
print(f"Age error: {len(age_error)} rows.")
print(f"Screen time error: {len(screentime_error)} rows.")

# WHR (waist-to-hip ratio): flagged outside 0.5-2.5.
whr = diabetes['waist_to_hip_ratio']
whr_error = diabetes.loc[(whr < 0.5) | (whr > 2.5)]
print(f"WHR error: {len(whr_error)} rows.")

# Lab values: an exact zero is reported as an error. Only the count is printed;
# the offending rows are not stored.
lab_cols = ['insulin_level', 'cholesterol_total', 'triglycerides']
for lab_col in lab_cols:
    zero_count = int((diabetes[lab_col] == 0).sum())
    if zero_count > 0:
        print(f"ERROR in {lab_col}: {zero_count}")


In [None]:
# Remove every row flagged by at least one validity check.
# NOTE(review): rows with zero lab values are only counted by the lab check, not
# collected, so they are not part of this drop set — confirm that this is intended.
flagged_frames = (
    impossible_bp,
    cholesterol_error,
    sleep_error,
    age_error,
    screentime_error,
    whr_error,
)

# Union of the row labels; a row flagged by several checks is listed once.
index_to_drop = set().union(*(frame.index for frame in flagged_frames))

# Copy so later column additions do not touch the original frame.
diabetes_clean = diabetes.drop(index=list(index_to_drop)).copy()

print(f"Data rows before: {len(diabetes)}, after: {len(diabetes_clean)}")

In [None]:
# Group patients into BMI and age categories.
#
# Bins are left-closed (right=False), so a boundary value falls into the higher
# category: BMI 25.0 -> 'Overweight', BMI 30.0 -> 'Obese I', age 30 -> '30-40',
# age 80 -> '80+'. With pandas' default right-closed bins, BMI 25.0 would be
# 'Normal', age 30 would be '<30' (contradicting its label), and age 0 would fall
# outside the first bin and get no group at all.
bmi_bins = [0, 18.5, 25, 30, 35, 40, np.inf]
bmi_labels = ['Underweight', 'Normal', 'Overweight', 'Obese I', 'Obese II', 'Obese III+']

age_bins = [0, 30, 40, 50, 60, 70, 80, np.inf]
age_labels = ['<30', '30-40', '40-50', '50-60', '60-70', '70-80', '80+']

diabetes_clean['bmi_group'] = pd.cut(
    diabetes_clean['bmi'], bins=bmi_bins, labels=bmi_labels, right=False
)
diabetes_clean['age_group'] = pd.cut(
    diabetes_clean['age'], bins=age_bins, labels=age_labels, right=False
)

# Mean of `diagnosed_diabetes` for every BMI group (rows) x age group (columns).
# observed=False keeps category combinations that have no rows.
risk_matrix = diabetes_clean.pivot_table(
    values='diagnosed_diabetes',
    index='bmi_group',
    columns='age_group',
    aggfunc='mean',
    observed=False,
)

# Heatmap of the matrix, annotated with whole-number percentages.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(risk_matrix, annot=True, fmt=".0%", cmap='YlOrRd', ax=ax)
ax.set_title('Diabetes risk (Age and BMI)')
ax.set_ylabel('BMI category')
ax.set_xlabel('Age group')
# Flip the y axis so the first BMI category sits at the bottom.
ax.invert_yaxis()
plt.show()

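Risk increases along the diagonal, rising with both age and BMI.
The high diabetes rate even among young, thin people suggests that the data are somewhat pessimistic — was the dataset perhaps balanced deliberately, so that factors other than age and BMI also have to be considered?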