
# Healthy Meals – Exploratory Data Analysis (EDA)

**Author:** Yosef Reda  
**Objective:** Build a clear, portfolio‑ready EDA in English for a meals/nutrition dataset: cleaning, feature engineering, visuals, and a simple composite **Health Score (0–100)**.

**Deliverables in this notebook**
- Data loading & schema overview
- Robust cleaning (duplicates, types, plausibility checks)
- Feature engineering (kcal/100g, macro shares, Health Score)
- Key visuals (calories distribution, diet/cuisine comparisons, correlations)
- Top meals by Health Score / Rating
- Save a cleaned CSV for downstream work

> *Note:* This analysis is educational and comparative for this dataset only; it is **not medical advice**.


In [None]:

# Core imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Matplotlib defaults applied to every figure drawn below
for rc_key, rc_value in {
    'figure.dpi': 120,
    'font.size': 11,
    'font.family': 'DejaVu Sans',
}.items():
    plt.rcParams[rc_key] = rc_value

# Input / output locations used by the loading and saving cells
DATA_PATH = 'healthy_eating_dataset.csv'  # raw CSV that is read
CLEAN_PATH = 'healthy_eating_dataset_clean.csv'  # cleaned CSV that is written


In [None]:

# Read the source CSV and remember its (rows, columns) before any cleaning,
# so the summary cell can report how much was excluded.
raw = pd.read_csv(filepath_or_buffer=DATA_PATH)
raw_shape = raw.shape
raw.head(5)


In [None]:

# Basic info: raw dimensions and the dtype pandas inferred for each column
print('Original shape:', raw_shape)
# The leading '\n' escape prints a blank line before the heading; it must stay
# an escape sequence, since a literal line break inside the quotes is a SyntaxError.
print('\nDtypes:')
print(raw.dtypes)



## Cleaning Strategy
1. **Normalize columns** (snake_case; strip spaces; unify dtypes).
2. **Critical fields** required per row: `meal_name, cuisine, diet_type, calories, protein_g, carbs_g, fat_g`.
3. **Plausibility checks**:
   - Compute macro‑based kcal = `4*protein + 4*carbs + 9*fat` and compare to declared `calories`.
   - Flag rows with |declared − macro_kcal| / declared > **50%**.
   - Flag rows with extreme sodium (e.g., > **5000 mg**).
4. **Derived features**: calories per 100g, macro shares (% of kcal), composite **Health Score**.


In [None]:

# Work on a copy so `raw` stays untouched, then bring headers to snake_case:
# trim surrounding whitespace, lowercase, and turn spaces/hyphens into '_'.
df = raw.copy()
normalized_names = df.columns.str.strip().str.lower()
for separator in (' ', '-'):
    normalized_names = normalized_names.str.replace(separator, '_')
df.columns = normalized_names

df.head(3)


In [None]:

# Basic cleaning: coerce numeric columns, de-duplicate, null out impossible
# negative values, then require the critical fields on every remaining row.
num_cols = [
    'calories','protein_g','carbs_g','fat_g','fiber_g','sugar_g',
    'sodium_mg','cholesterol_mg','serving_size_g','prep_time_min',
    'cook_time_min','rating','is_healthy'
]
num_present = [c for c in num_cols if c in df.columns]

# Ensure numeric types; values that cannot be parsed become NaN instead of raising
for c in num_present:
    df[c] = pd.to_numeric(df[c], errors='coerce')

# Drop duplicates (prefer by meal_id if present)
if 'meal_id' in df.columns:
    df = df.drop_duplicates(subset=['meal_id'])
else:
    df = df.drop_duplicates()

# Negative values to NaN for the quantity columns. This runs BEFORE the
# critical-field check so that a row with, say, negative calories is dropped
# there instead of surviving with a missing critical value.
# `mask` returns a new Series (upcasting if needed) rather than writing NaN
# into a possibly integer-typed column in place. 'is_healthy' is skipped.
for c in num_present:
    if c != 'is_healthy':
        df[c] = df[c].mask(df[c] < 0)

# Enforce presence of critical fields
critical = ['meal_name','cuisine','diet_type','calories','protein_g','carbs_g','fat_g']
critical_present = [c for c in critical if c in df.columns]
df = df.dropna(subset=critical_present)

print('Shape after basic cleaning:', df.shape)


In [None]:

# Energy implied by the macros (4 kcal/g protein, 4 kcal/g carbs, 9 kcal/g fat)
# and its relative gap to the declared calories.
required_for_macro = {'protein_g', 'carbs_g', 'fat_g', 'calories'}
if required_for_macro.issubset(df.columns):
    df['macro_kcal'] = 4*df['protein_g'] + 4*df['carbs_g'] + 9*df['fat_g']
    declared_kcal = df['calories']
    relative_gap = (declared_kcal - df['macro_kcal']).abs()/declared_kcal
    # The gap is only defined for positive declared calories; otherwise NaN
    df['kcal_diff_pct'] = np.where(declared_kcal>0, relative_gap, np.nan)
else:
    df['macro_kcal'] = np.nan
    df['kcal_diff_pct'] = np.nan

# A row is implausible when the gap exceeds 50% or sodium exceeds 5000 mg.
# Comparisons against NaN are False, so rows with missing values are not flagged.
implausible_mask = df['kcal_diff_pct'] > 0.50
if 'sodium_mg' in df.columns:
    implausible_mask = implausible_mask | (df['sodium_mg'] > 5000)
df['implausible'] = implausible_mask

print('Implausible rows:', int(df['implausible'].sum()))

# Keep only plausible rows; copy so later column assignments do not touch `df`
clean = df.loc[~df['implausible']].copy()
print('Shape after plausibility filter:', clean.shape)
clean.head(3)


In [None]:

# Derived fields: energy density in kcal per 100 g of serving
if 'serving_size_g' in clean.columns:
    # A zero serving size would divide to +/-inf and distort later statistics;
    # treat it as missing, the same way the macro-share step treats zero macro_kcal.
    serving_g = clean['serving_size_g'].replace(0, np.nan)
    clean['cal_per_100g'] = (clean['calories'] / serving_g) * 100
else:
    clean['cal_per_100g'] = np.nan

# Health Score components

def minmax(s, lo, hi):
    """Score where lower is better: 1 at or below `lo`, 0 at or above `hi`.

    The value is rescaled linearly so that `lo` maps to 0 and `hi` to 1,
    clipped to [0, 1], and then inverted. NaN inputs stay NaN.
    Assumes `hi != lo` (otherwise the rescaling divides by zero).
    """
    position = (s - lo) / (hi - lo)
    return 1 - np.clip(position, 0, 1)

def pos_minmax(s, lo, hi):
    """Score where higher is better: 0 at or below `lo`, 1 at or above `hi`.

    The value is rescaled linearly so that `lo` maps to 0 and `hi` to 1 and
    then clipped to [0, 1]. NaN inputs stay NaN.
    Assumes `hi != lo` (otherwise the rescaling divides by zero).
    """
    position = (s - lo) / (hi - lo)
    return np.clip(position, 0, 1)

# Calories are always present at this point (critical field); lower is better
clean['score_cal'] = minmax(clean['calories'], 200, 800)

# Optional nutrients: (score column, source column, scorer, lo, hi).
# When the source column is absent the score column is filled with NaN.
optional_components = (
    ('score_sodium', 'sodium_mg', minmax, 0, 1500),
    ('score_sugar', 'sugar_g', minmax, 0, 25),
    ('score_fiber', 'fiber_g', pos_minmax, 5, 15),
)
for score_name, source_name, scorer, lo_bound, hi_bound in optional_components:
    if source_name in clean.columns:
        clean[score_name] = scorer(clean[source_name], lo_bound, hi_bound)
    else:
        clean[score_name] = np.nan

# Macro % shares of macro-derived energy; a zero total becomes NaN so the
# division yields NaN rather than inf.
macro_total_kcal = clean['macro_kcal'].replace(0, np.nan)
for share_name, grams_name, kcal_per_gram in (
    ('pct_p', 'protein_g', 4),
    ('pct_c', 'carbs_g', 4),
    ('pct_f', 'fat_g', 9),
):
    clean[share_name] = (kcal_per_gram*clean[grams_name] / macro_total_kcal) * 100

# Macro balance score: 1 inside the target range, falling linearly to 0 once a share is 20 percentage points outside it

def in_range_score(pct, lo, hi):
    """Score how well a percentage share sits inside the band [lo, hi].

    Returns 1 when `pct` lies within the band and decreases linearly with
    the distance outside it, reaching 0 at 20 points beyond either edge.
    Missing values (per `pd.isna`) yield NaN.
    """
    # Positive only when pct is outside the band; negative distances clip to 0
    overshoot = np.maximum(lo - pct, pct - hi)
    penalty = np.clip(overshoot / 20.0, 0, 1)
    base = np.where(pd.isna(pct), np.nan, 1.0)
    return base * (1 - penalty)

# Macro balance: average of the three per-macro band scores
protein_fit = in_range_score(clean['pct_p'], 20, 35)
carb_fit = in_range_score(clean['pct_c'], 40, 55)
fat_fit = in_range_score(clean['pct_f'], 20, 35)
clean['score_macro'] = (protein_fit + carb_fit + fat_fit) / 3.0

# Composite Health Score: row-wise mean of the available component scores
# (NaN components are skipped), scaled to 0-100.
score_cols = ['score_cal','score_sodium','score_sugar','score_fiber','score_macro']
component_mean = clean[score_cols].mean(axis=1, skipna=True)
clean['health_score'] = component_mean * 100

preview_cols = ['meal_name','cuisine','diet_type','calories','health_score']
clean[preview_cols].head()


In [None]:

# Before/after row counts and the share of rows removed by cleaning
rows_after_cleaning = int(clean.shape[0])
excluded_pct = round((1 - clean.shape[0]/raw_shape[0]) * 100, 2)

summary = dict([
    ('Rows (original)', raw_shape[0]),
    ('Columns (original)', raw_shape[1]),
    ('Rows after cleaning', rows_after_cleaning),
    ('Excluded rows %', excluded_pct),
])
summary



## Key Visuals


In [None]:

# Calories distribution: histogram of declared calories (missing values dropped)
calorie_values = clean['calories'].dropna()
hist_style = {'bins': 40, 'color': '#5DADE2', 'edgecolor': 'white'}
ax = calorie_values.plot(kind='hist', **hist_style)
ax.set_ylabel('Number of Meals')
ax.set_xlabel('Calories (kcal)')
ax.set_title('Calories Distribution')
plt.show()


In [None]:

# Calories by diet type: one box per diet_type group
has_diet_type = 'diet_type' in clean.columns
if has_diet_type:
    clean.boxplot(column='calories', by='diet_type', vert=True)
    # Blank out the figure-level title so only the title set below is shown
    plt.suptitle('')
    plt.xlabel('Diet Type')
    plt.ylabel('Calories (kcal)')
    plt.title('Calories by Diet Type')
    plt.show()


In [None]:

# Average Health Score per cuisine, limited to the 10 cuisines with most meals
if 'cuisine' in clean.columns:
    top_cuisines = clean['cuisine'].value_counts().nlargest(10).index
    in_top_cuisines = clean['cuisine'].isin(top_cuisines)
    mean_score_by_cuisine = (
        clean[in_top_cuisines]
        .groupby('cuisine')['health_score']
        .mean()
        .sort_values()
    )
    mean_score_by_cuisine.plot(kind='barh', color='#58D68D')
    plt.xlabel('Score (0–100)')
    plt.ylabel('Cuisine')
    plt.title('Avg Health Score by Cuisine (Top 10)')
    plt.show()


In [None]:

# Share of meals labeled healthy per diet type (mean of the is_healthy column)
needed_columns = ('is_healthy', 'diet_type')
if all(name in clean.columns for name in needed_columns):
    healthy_by_diet = clean.groupby('diet_type')['is_healthy']
    ratio = healthy_by_diet.mean().sort_values()
    ratio.plot(kind='barh', color='#AF7AC5')
    plt.xlabel('Share')
    plt.ylabel('Diet Type')
    plt.title('Share of Meals Labeled Healthy (by Diet Type)')
    plt.show()


In [None]:

# Correlation heatmap over numeric columns; meal_id is dropped when present
numeric_only_frame = clean.select_dtypes(include=[np.number])
num_for_corr = numeric_only_frame.drop(columns=['meal_id'], errors='ignore')
# A correlation matrix needs at least two features
if num_for_corr.shape[1] >= 2:
    corr = num_for_corr.corr(numeric_only=True)
    n_rows, n_cols = corr.shape
    plt.imshow(corr, cmap='viridis', interpolation='nearest')
    plt.colorbar(label='Correlation')
    # Label every cell row/column with its feature name
    plt.yticks(range(n_rows), corr.index)
    plt.xticks(range(n_cols), corr.columns, rotation=90)
    plt.title('Correlation Heatmap (Numeric Features)')
    plt.tight_layout()
    plt.show()



## Top Meals


In [None]:

# Five highest-scoring meals, shown with their identifying columns
cols = ['meal_name','cuisine','diet_type','calories','health_score']
ranked_by_score = clean.sort_values(by='health_score', ascending=False)
ranked_by_score[cols].head(5)


In [None]:

# Top 5 by Rating (if available)
# A notebook only auto-displays a bare expression at the top level of a cell,
# so the table is built first and then named on the last line. When there is
# no 'rating' column the value is None and nothing is shown.
rating_cols = ['meal_name','cuisine','diet_type','calories','rating']
top_rated = (
    clean.sort_values('rating', ascending=False)[rating_cols].head(5)
    if 'rating' in clean.columns
    else None
)
top_rated


In [None]:

# Persist the cleaned table (without the index column) for downstream tasks,
# then echo the output path as the cell result.
clean.to_csv(path_or_buf=CLEAN_PATH, index=False)
CLEAN_PATH



## Next Steps
- Add a **classification model** to predict `is_healthy` (e.g., Logistic Regression / Random Forest) with evaluation metrics.
- Or a **regression model** to predict `rating` and analyze key drivers.
- Publish a **dashboard** (Power BI / Excel) with slicers for Cuisine, Diet Type, Cooking Method.
- Package this notebook + figures + cleaned CSV into a public GitHub repo with a polished README.
