In [None]:


import numpy as np 
import pandas as pd 


import os 
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
import seaborn as sns  
import matplotlib.pyplot as plt

 

# Exploratory Data Analysis (EDA)

This notebook provides an exploratory data analysis of the **Kaggle Playground Series - Season 5 Episode 12** dataset.  
The goal is to understand the distribution of features, detect relationships, compare groups, and extract insights that will guide future modeling steps.

#  Loading Raw Data
We load the training data and get an initial overview using `head()`, `describe()`, and a missing-value check.

In [None]:
# Path of the training CSV inside the Kaggle input directory.
TRAIN_CSV_PATH = '/kaggle/input/playground-series-s5e12/train.csv'
train = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Preview the first five rows (head() defaults to n=5).
train.head()

In [None]:
# Summary statistics of the training frame (describe() covers numeric columns by default).
train.describe()

# Checking Missing Values
The dataset contains **no missing values**, which simplifies preprocessing.



In [None]:
# Number of missing (NaN) entries in each column.
train.isna().sum()

# Exploratory Data Analysis (EDA): Global Distributions

In [None]:
# Global distribution of the ethnicity column.
# Series.hist returns the Axes it drew on; label it so the figure stands alone,
# and end with plt.show() so the Axes repr is not echoed as cell output.
ax = train['ethnicity'].hist()
ax.set_title("Distribution of ethnicity")
ax.set_xlabel("ethnicity")
ax.set_ylabel("count")
plt.show()

# 4. Ethnicity vs Diabetes
We examine how diabetes prevalence varies across ethnic groups.

In [None]:
# Row count (via the id column) for every ethnicity / diagnosed_diabetes combination.
group_keys = ["ethnicity", "diagnosed_diabetes"]
train.groupby(group_keys)['id'].count()

In [None]:
# Side-by-side counts per ethnicity, split by diagnosed_diabetes, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=train, x="ethnicity", hue="diagnosed_diabetes", ax=ax)
ax.set_title("Count of Diabetes vs No Diabetes per Ethnicity")
plt.show()


The distribution of the numerical features appears approximately symmetric.

In [None]:
# Columns left out of the histogram grid: the id, age, diagnosed_diabetes
# and the three *_history columns.
hist_excluded_cols = [
    'id',
    'age',
    'diagnosed_diabetes',
    'cardiovascular_history',
    'hypertension_history',
    'family_history_diabetes',
]
train.drop(columns=hist_excluded_cols).hist(bins=50, figsize=(12, 12))
plt.show()

# 5. Physical Activity vs Diabetes
People with diabetes tend to have lower physical activity levels.

In [None]:
# Weekly physical-activity minutes per diagnosed_diabetes group; outlier points are hidden.
train.boxplot(
    column='physical_activity_minutes_per_week',
    by='diagnosed_diabetes',
    showfliers=False,
)
plt.show()


# 6. Diet Score vs Diabetes
People diagnosed with diabetes tend to have a lower median diet_score.

In [None]:

# diet_score per diagnosed_diabetes group; outlier points are hidden.
train.boxplot(
    column='diet_score',
    by='diagnosed_diabetes',
    showfliers=False,
)
plt.show()


# 7.  Age vs Diabetes
Older individuals show **higher diabetes prevalence**, confirming medical expectations.


In [None]:

# Age per diagnosed_diabetes group (outlier points are kept in this plot).
train.boxplot(
    column='age',
    by='diagnosed_diabetes',
)
plt.show()

In [None]:
# Counts per smoking_status, split by diagnosed_diabetes.
sns.countplot(x='smoking_status', hue='diagnosed_diabetes', data=train)
# Rotate the category labels so they do not overlap.
plt.xticks(rotation=45)
plt.title("Count of Diabetes vs No Diabetes per Smoking Status")
# Finish with plt.show() so the tuple returned by plt.xticks is not echoed as cell output.
plt.show()


In [None]:
# Overall number of rows in each diagnosed_diabetes class.
sns.countplot(data=train, x='diagnosed_diabetes')
plt.show()


# Proportion Diagnosed with Diabetes

Proportion by family history of diabetes.

In [None]:
# Mean of diagnosed_diabetes within each family_history_diabetes group.
# This reads as a proportion only if diagnosed_diabetes is coded 0/1 — TODO confirm.
prop = train.groupby('family_history_diabetes')['diagnosed_diabetes'].mean()
prop.plot(kind='bar')
# Axis label spelling corrected ("disgnosed" -> "diagnosed").
plt.ylabel('proportion diagnosed diabetes')
plt.xticks(rotation=0)
plt.show()

Proportion among people who have a cardiovascular history, compared with those who do not.

In [None]:
# Mean of diagnosed_diabetes within each cardiovascular_history group.
# This reads as a proportion only if diagnosed_diabetes is coded 0/1 — TODO confirm.
prob2 = train.groupby('cardiovascular_history')['diagnosed_diabetes'].mean()
prob2.plot(kind='bar')
# Axis label spelling corrected ("disgnosed" -> "diagnosed").
plt.ylabel('proportion diagnosed diabetes')
# Keep tick labels horizontal, matching the family-history bar chart.
plt.xticks(rotation=0)
# plt.show must be called; the bare name was a no-op that only echoed the function repr.
plt.show()

In [None]:
# NOTE(review): scatter_matrix is imported here but is not called in any visible cell.
from pandas.plotting import scatter_matrix
# Pairwise correlations between all numeric columns of train.
# NOTE(review): corr_matrix is not referenced by any later visible cell; the heatmap cell computes its own.
corr_matrix=train.select_dtypes(include='number').corr()


In [None]:
# Bucket age into the intervals (18, 20], (20, 40] and (40, 60]; the result is displayed, not stored.
# NOTE(review): with these edges, ages of 18 or below and ages above 60 become NaN — confirm this is intended.
pd.cut(train['age'],bins=[18,20,40,60])

# 8.  Correlation Matrix

The heatmap below highlights important linear relationships between numerical features.  
Notably:
- **cholesterol_total** and **ldl_cholesterol** are highly correlated  
- **BMI** is correlated with **waist_to_hip_ratio**

In [None]:
# Columns left out of the correlation heatmap: the id, diagnosed_diabetes
# and the three *_history columns.
heatmap_excluded_cols = [
    'id',
    'diagnosed_diabetes',
    'cardiovascular_history',
    'hypertension_history',
    'family_history_diabetes',
]
numeric = train.select_dtypes(include="number").drop(columns=heatmap_excluded_cols)
corr = numeric.corr()

# Annotated heatmap with the diverging colour map centred on zero correlation.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, cmap="coolwarm", center=0, annot=True, fmt='.2f', ax=ax)
plt.show()


`cholesterol_total` is linearly correlated with `ldl_cholesterol`, so we could merge these two features.

In [None]:
# Draw a 1,100-row random subset to keep the scatter plots readable.
# The seed is fixed so the subset, which the later bmi vs waist-to-hip plot reuses,
# is the same on every Restart & Run All.
sample = train.sample(1100, random_state=42)
sns.scatterplot(data=sample, x='cholesterol_total', y='ldl_cholesterol')
plt.show()

In [None]:
# Display the column labels of the training frame.
train.columns 

We also have a strong correlation between `bmi` and `waist_to_hip_ratio`.

In [None]:
# bmi against waist_to_hip_ratio for the sampled rows, drawn as red point markers on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(sample['bmi'], sample['waist_to_hip_ratio'], 'r.')
ax.set_xlabel('bmi')
ax.set_ylabel('waist to hip ratio')
plt.show()

# 10. Key Insights Summary

- No missing values in the dataset  
- Older individuals have higher diabetes rates  
- Diabetes patients have lower physical activity  
- Diet quality is lower among diabetic individuals  
- Strong cholesterol-related correlations: total ↔ LDL  
- BMI and waist-to-hip ratio show meaningful correlation  
- Several numerical features have symmetric distributions  

