
# Healthy Meals – Modeling (Classification)

**Author:** Yosef Reda  
**Objective:** Build a portfolio‑ready classification model, documented in English, that predicts `is_healthy` from meals data using a reproducible ML pipeline with clean metrics and visuals.

**Contents**
- Load cleaned data
- Train/validation split (stratified)
- Preprocessing pipelines (numeric + categorical)
- Models: Logistic Regression, Random Forest (with class weighting)
- Cross‑validation (ROC AUC)
- Evaluation on hold‑out set (ROC, PR, Confusion Matrix, Classification Report)
- Feature importance (model‑based & permutation)
- Export the best model

> **Note**: Educational analysis for this dataset; not medical advice.


In [None]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (roc_auc_score, average_precision_score, roc_curve,
                             precision_recall_curve, confusion_matrix, classification_report)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
import joblib

# Notebook-wide matplotlib defaults, applied in a single call.
plt.rcParams.update({
    'figure.dpi': 120,
    'font.size': 11,
    'axes.grid': True,
})

# Seed shared by the split, the CV folds, the estimators and permutation importance.
RANDOM_STATE = 42


In [None]:

# Location of the cleaned dataset that this notebook consumes.
CLEAN_PATH = 'healthy_eating_dataset_clean.csv'

# Fail fast with an actionable message when the file is not there.
cleaned_file_present = os.path.exists(CLEAN_PATH)
if not cleaned_file_present:
    raise FileNotFoundError(f"Cleaned file not found: {CLEAN_PATH}. Please run the EDA notebook first.")

df = pd.read_csv(CLEAN_PATH)

# Quick sanity check: (rows, columns), then a preview as the cell output.
print(df.shape)
df.head()


In [None]:

# The target column is mandatory; stop early if the cleaned data lacks it.
if 'is_healthy' not in df.columns:
    raise KeyError("Column 'is_healthy' not found in cleaned data.")

# Rows without a label cannot be used for supervised training.
df = df.dropna(subset=['is_healthy']).copy()

# Candidate features: every column except the listed ID/label-like ones and the target.
exclude_cols = {'meal_id','meal_name','image_url','implausible'}
non_feature_cols = exclude_cols | {'is_healthy'}
features = [col for col in df.columns if col not in non_feature_cols]

# Numeric dtypes go to the numeric pipeline; every remaining feature is treated as categorical.
num_cols = df[features].select_dtypes(include=[np.number]).columns.tolist()
numeric_lookup = set(num_cols)
cat_cols = [col for col in features if col not in numeric_lookup]

# Design matrix and integer-encoded target.
X = df[features].copy()
y = df['is_healthy'].astype(int)

print('Numeric features:', len(num_cols))
print('Categorical features:', len(cat_cols))
print('Target positive rate:', y.mean().round(3))


In [None]:

# Stratified 80/20 hold-out split; the fixed seed keeps it reproducible.
split_kwargs = dict(test_size=0.2, stratify=y, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

# Cell output: shapes of the training and hold-out partitions.
(X_train.shape, X_test.shape)


In [None]:

# Numeric: impute median, scale
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and later releases dropped the old name. Try the current keyword first and
# fall back to the legacy one so the notebook runs on both old and new versions.
try:
    dense_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    dense_ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical: impute most_frequent, one-hot encode (dense output;
# categories unseen during fit are ignored at transform time)
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', dense_ohe)
])

# Route each column group through its own pipeline; the output column order is
# numeric features first, then the one-hot encoded categorical features.
preprocess = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])


In [None]:

from sklearn.base import clone

# Models
# A Pipeline keeps the estimator objects it is given without copying them, so
# each model gets its own unfitted clone of the preprocessing step. Otherwise
# fitting one pipeline would silently refit the transformer held by the other.
logreg = Pipeline([
    ('prep', clone(preprocess)),
    ('clf', LogisticRegression(max_iter=2000, class_weight='balanced', solver='lbfgs', random_state=RANDOM_STATE))
])

rf = Pipeline([
    ('prep', clone(preprocess)),
    ('clf', RandomForestClassifier(n_estimators=300, class_weight='balanced', random_state=RANDOM_STATE))
])

# Shuffled, stratified 5-fold CV with a fixed seed so both models see the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Cross-validated ROC AUC (training data only; preprocessing is refit inside each fold)
for name, model in [('LogisticRegression', logreg), ('RandomForest', rf)]:
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc', n_jobs=-1)
    print(f"{name} | CV ROC AUC: mean={scores.mean():.3f} ± {scores.std():.3f}")


In [None]:

# Per-model hold-out results, keyed by model name.
results = {}

for name, model in [('LogisticRegression', logreg), ('RandomForest', rf)]:
    model.fit(X_train, y_train)
    # Column 1 of predict_proba is the score for the second class
    # (label 1 for a 0/1 target); hard labels use a fixed 0.5 threshold.
    y_proba = model.predict_proba(X_test)[:,1]
    y_pred = (y_proba >= 0.5).astype(int)

    roc = roc_auc_score(y_test, y_proba)
    prc = average_precision_score(y_test, y_proba)
    cm = confusion_matrix(y_test, y_pred)
    rpt = classification_report(y_test, y_pred, digits=3)

    results[name] = {'model': model, 'roc_auc': roc, 'ap': prc, 'cm': cm, 'report': rpt, 'y_proba': y_proba, 'y_pred': y_pred}
    # Line breaks are written as "\n" escapes: a raw newline inside a
    # single-quoted f-string is a SyntaxError.
    print(
        f"\n=== {name} ===\n"
        f"ROC AUC: {roc:.3f}\n"
        f"PR AUC: {prc:.3f}\n"
        f"Confusion Matrix:\n"
        f"{cm}\n"
        f"\n"
        f"Report:\n"
        f"{rpt}"
    )

# Pick best by ROC AUC
# NOTE(review): the winner is chosen on the same hold-out set it is reported on,
# so its hold-out score is slightly optimistic; consider selecting on CV instead.
best_name = max(results, key=lambda k: results[k]['roc_auc'])
best = results[best_name]
print(f"\nBest model by ROC AUC: {best_name} -> {best['roc_auc']:.3f}")


In [None]:

# Paths of every image written by this cell, in creation order.
figures = []


def _finalize_current_figure(out_path):
    """Tighten the layout, save at 150 dpi, record the path, and display the current figure."""
    plt.tight_layout()
    plt.savefig(out_path, dpi=150)
    figures.append(out_path)
    plt.show()


# --- ROC curves: one line per model plus the chance diagonal ---
roc_path = 'en_model_roc_curve.png'
plt.figure(figsize=(6,4))
for model_name, outcome in results.items():
    proba = outcome['y_proba']
    fpr, tpr, _ = roc_curve(y_test, proba)
    auc_value = roc_auc_score(y_test, proba)
    plt.plot(fpr, tpr, label=f"{model_name} (AUC={auc_value:.2f})")
plt.plot([0,1],[0,1],'k--',alpha=0.5)
plt.title('ROC Curve (Hold-out)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
_finalize_current_figure(roc_path)

# --- Precision-recall curves: one line per model ---
pr_path = 'en_model_pr_curve.png'
plt.figure(figsize=(6,4))
for model_name, outcome in results.items():
    proba = outcome['y_proba']
    prec, rec, _ = precision_recall_curve(y_test, proba)
    ap_value = average_precision_score(y_test, proba)
    plt.plot(rec, prec, label=f"{model_name} (AP={ap_value:.2f})")
plt.title('Precision-Recall Curve (Hold-out)')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
_finalize_current_figure(pr_path)

# --- Confusion matrices: one annotated 2x2 heatmap per model ---
for model_name, outcome in results.items():
    matrix = outcome['cm']
    plt.figure(figsize=(4,4))
    plt.imshow(matrix, cmap='Blues')
    plt.title(f'Confusion Matrix – {model_name}')
    plt.xticks([0,1], ['Pred 0','Pred 1'])
    plt.yticks([0,1], ['True 0','True 1'])
    # Rows are true labels (y axis), columns are predictions (x axis).
    for row, col in np.ndindex(2, 2):
        plt.text(col, row, matrix[row, col], ha='center', va='center', color='black')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    cm_path = f'en_cm_{model_name.replace(" ", "_")}.png'
    _finalize_current_figure(cm_path)

# Cell output: list of saved figure paths.
figures


In [None]:

# Helper to get feature names from ColumnTransformer

def get_feature_names(preprocessor, num_cols, cat_cols):
    """Return the output feature names of the fitted preprocessing step.

    The result lists the numeric columns first, followed by the names the
    one-hot encoder generates for the categorical columns.

    Args:
        preprocessor: Object exposing ``named_transformers_['cat']`` whose
            ``named_steps['ohe']`` provides ``get_feature_names_out``.
        num_cols: Sequence of numeric column names.
        cat_cols: Sequence of categorical column names.

    Returns:
        A new list of feature names; the inputs are never modified.
    """
    numeric_names = list(num_cols)
    try:
        ohe = preprocessor.named_transformers_['cat'].named_steps['ohe']
        categorical_names = list(ohe.get_feature_names_out(cat_cols))
    except Exception:
        # Deliberate best-effort fallback: if the encoder is missing, unfitted,
        # or lacks get_feature_names_out, use the raw categorical column names.
        categorical_names = list(cat_cols)
    return numeric_names + categorical_names

best_pipeline = best['model']
prep = best_pipeline.named_steps['prep']
feature_names = get_feature_names(prep, num_cols, cat_cols)

# Model-based importance of the winning classifier: absolute coefficients for
# logistic regression, `feature_importances_` for the random forest.
clf = best_pipeline.named_steps['clf']
if 'LogisticRegression' in best_name:
    importance_values = np.abs(clf.coef_.ravel())
elif 'RandomForest' in best_name:
    importance_values = clf.feature_importances_
else:
    importance_values = None

# Keep only the 20 strongest features for plotting.
imp_df = None
if importance_values is not None:
    imp_df = (
        pd.DataFrame({'feature': feature_names, 'importance': importance_values})
        .sort_values('importance', ascending=False)
        .head(20)
    )

if imp_df is not None:
    # DataFrame.plot opens its own figure when no `ax` is given, so the size is
    # passed to it directly; a separate plt.figure() call would only leave an
    # empty extra figure behind. Rows are reversed so the top feature is drawn
    # at the top of the horizontal bar chart.
    imp_df[::-1].plot(kind='barh', x='feature', y='importance', color='#58D68D', legend=False, figsize=(8,6))
    plt.title(f'Top Features – {best_name}')
    plt.xlabel('Importance')
    plt.tight_layout()
    fi_path = 'en_top_features.png'
    plt.savefig(fi_path, dpi=150)
    plt.show()
else:
    print('Could not compute model-based feature importance for the best model.')

# Permutation importance (on a smaller subset for speed)
try:
    n_samples = min(1000, X_test.shape[0])
    X_sub = X_test.iloc[:n_samples]
    y_sub = y_test.iloc[:n_samples]
    perm = permutation_importance(best_pipeline, X_sub, y_sub, n_repeats=10, random_state=RANDOM_STATE, scoring='roc_auc')
    # The full pipeline is scored, so the *raw* input columns are permuted and
    # the result has one entry per column of X_sub, in that order. It must be
    # labeled with those columns, not with the one-hot-expanded feature names.
    perm_imp = pd.DataFrame({'feature': X_sub.columns, 'importance_mean': perm.importances_mean})
    perm_imp = perm_imp.sort_values('importance_mean', ascending=False).head(20)
    # Size is passed to DataFrame.plot, which opens its own figure; a separate
    # plt.figure() call would only leave an empty extra figure behind.
    perm_imp[::-1].plot(kind='barh', x='feature', y='importance_mean', color='#AF7AC5', legend=False, figsize=(8,6))
    plt.title(f'Permutation Importance – {best_name}')
    plt.xlabel('Mean Importance (AUC drop)')
    plt.tight_layout()
    pfi_path = 'en_perm_importance.png'
    plt.savefig(pfi_path, dpi=150)
    plt.show()
except Exception as e:
    # Best-effort diagnostic: report the failure without aborting the notebook.
    print('Permutation importance failed:', e)


In [None]:

# Persist the winning pipeline (preprocessing + classifier) under a
# file name derived from the model name, with spaces replaced by underscores.
model_tag = best_name.replace(" ", "_")
MODEL_PATH = f'best_is_healthy_model_{model_tag}.joblib'
joblib.dump(best['model'], MODEL_PATH)

# Cell output: where the artifact was written.
MODEL_PATH



## Summary
- Compared **Logistic Regression** and **Random Forest** with 5‑fold stratified CV (ROC AUC).
- Evaluated both models on a hold‑out set with ROC AUC, PR AUC, confusion matrix, and a full classification report, then selected the best model by hold‑out ROC AUC.
- Reported top features via model‑based importance and **permutation importance**.
- Exported the best pipeline as a `.joblib` artifact for reuse in apps/dashboards.

> Next: add hyper‑parameter tuning (e.g., `GridSearchCV`), or build a small API/Streamlit demo that loads `joblib` and scores new meals.
