## Import Library

In [11]:
# TensorFlow text classifier — copy into a notebook cell
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import joblib

## Variables

In [3]:
# Reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

In [7]:
# === Parameters ===
DATASET_PATH = "./dataset/filtered_data.csv"
TEXT_COL = "ingredients_text"
NUM_COLS = [
    "energy-kcal_100g",
    "fat_100g",
    "saturated-fat_100g",
    "carbohydrates_100g",
    "sugars_100g",
    "proteins_100g",
    "fiber_100g",
    "salt_100g",
    "sodium_100g"
]
LABEL_COL = "healthy_level"
MAX_TOKENS = 20000
SEQ_LEN = 200
EMBED_DIM = 64
BATCH_SIZE = 32
EPOCHS = 5
VALIDATION_SPLIT = 0.1

## Load Dataset

In [10]:
# Read dataset from .csv
df = pd.read_csv(DATASET_PATH)
df = df.drop(columns=["nutriscore_score", "nutriscore_grade"])
print("Loaded:", df.shape)

# Clean dataset
df = df[df[TEXT_COL].astype(str).str.len() > 0].reset_index(drop=True)

df.head()

Loaded: (42790, 11)


Unnamed: 0,ingredients_text,energy-kcal_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,proteins_100g,fiber_100g,salt_100g,sodium_100g,healthy_level
0,HONIG stillende Frauen nicht geeignet. D bestr...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.4,Medium
1,"Sojaproteinisolat, Weizen - protein, Kaffee-Ex...",358.0,2.0,0.5,6.7,1.7,76.0,10.714286,1.5,0.6,Medium
2,"Water, Leptospermum Scoparium Mel (Manuka Hone...",45.0,13.0,6.7,15.0,3.6,11.0,0.0,0.0625,0.025,Medium
3,"Farine de blé 27%, chocolat au lait 18% (sucre...",460.0,24.0,6.0,54.0,31.0,6.4,1.4,0.48,0.192,Not Healthy
4,Madeleines ChocoNoir - Madeleines nappées de c...,389.0,16.7,6.48,35.2,1.85,37.0,18.5,0.88,0.352,Medium


In [14]:
x_text = df[TEXT_COL].values
x_num = df[NUM_COLS].values
y = df[LABEL_COL].values

scaler = StandardScaler()
X_num = scaler.fit_transform(x_num)

In [15]:
x_train_text, x_test_text, x_train_num, x_test_num, y_train, y_test = train_test_split(
    x_text, x_num, y, test_size=0.2, random_state=42
)

In [16]:
VOCAB_SIZE = 20000
SEQ_LEN = 200

vectorizer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQ_LEN,
)

# Fit vectorizer
vectorizer.adapt(x_train_text)

In [29]:
# text_model = tf.keras.Sequential([
#     tf.keras.layers.Input(shape=(1,), dtype=tf.string),
#     vectorizer,
#     tf.keras.layers.Embedding(VOCAB_SIZE, 128),
#     tf.keras.layers.GlobalAveragePooling1D(),
#     tf.keras.layers.Dense(64, activation="relu"),
# ])

# num_model = tf.keras.Sequential([
#     tf.keras.layers.Input(shape=(len(NUM_COLS),)),
#     tf.keras.layers.Dense(32, activation="relu"),
#     tf.keras.layers.Dense(16, activation="relu")
# ])

# # 1) Create input tensors
text_input = tf.keras.Input(shape=(1,), dtype=tf.string)
num_input = tf.keras.Input(shape=(len(NUM_COLS),))

# # 2) Call the sequential models on these inputs
# text_output = text_model(text_input)
# num_output = num_model(num_input)

# # 3) Combine
# combined = tf.keras.layers.Concatenate()([text_output, num_output])

# z = tf.keras.layers.Dense(64, activation="relu")(combined)
# z = tf.keras.layers.Dense(32, activation="relu")(z)
# output = tf.keras.layers.Dense(1, activation="sigmoid")(z)

# # 4) Build final model
# model = tf.keras.Model(
#     inputs=[text_input, num_input],
#     outputs=output
# )

x = vectorizer(text_input)
x = tf.keras.layers.Embedding(VOCAB_SIZE, 128)(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)
x = tf.keras.layers.Dense(64, activation="relu")(x)

y = tf.keras.layers.Dense(32, activation="relu")(num_input)
y = tf.keras.layers.Dense(16, activation="relu")(y)

combined = tf.keras.layers.Concatenate()([x, y])

z = tf.keras.layers.Dense(64, activation="relu")(combined)
z = tf.keras.layers.Dense(32, activation="relu")(z)
output = tf.keras.layers.Dense(1, activation="sigmoid")(z)

model = tf.keras.Model(inputs=[text_input, num_input], outputs=output)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

model.summary()


In [31]:
history = model.fit(
    {
        "text_input": x_train_text,
        "num_input": x_train_num
    },
    y_train,
    validation_data=(
        {
            "text_input": x_test_text,
            "num_input": x_test_num
        },
        y_test
    ),
    epochs=5,
    batch_size=32
)

# ==========================
# 8. EVALUATE
# ==========================
# model.evaluate(
#     {
#         "input_layer_9": x_test_text,
#         "input_layer_10": x_test_num
#     },
#     y_test
# )

Epoch 1/5


ValueError: Missing data for input "input_layer_11". You passed a data dictionary with keys ['text_input', 'num_input']. Expected the following keys: ['input_layer_11', 'input_layer_12']

In [None]:

])

combined = tf.keras.layers.Concatenate()([text_model.output, num_model.output])

z = tf.keras.layers.Dense(64, activation="relu")(combined)
z = tf.keras.layers.Dense(32, activation="relu")(z)
output = tf.keras.layers.Dense(1, activation="sigmoid")(z)

model = tf.keras.models.Model(inputs=[text_model.input, num_model.input], outputs=output)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

model.summary()

AttributeError: The layer sequential_6 has never been called and thus has no defined output.

In [None]:
tf.keras.Seq

# === Prepare X and y ===
X = df[TEXT_COL].astype(str)
y = df[LABEL_COL].astype(str)

# Encode labels to integers
le = LabelEncoder()
y_enc = le.fit_transform(y)
num_classes = len(le.classes_)
print("Classes:", list(le.classes_), " -> num_classes:", num_classes)

# Stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, stratify=y_enc, random_state=SEED
)
print("Train / Test sizes:", len(X_train), len(X_test))

# Optionally compute class weights if classes are imbalanced
class_weights = None
try:
    cw = compute_class_weight("balanced", classes=np.unique(y_train), y=y_train)
    class_weights = {i: float(w) for i, w in enumerate(cw)}
    print("Class weights:", class_weights)
except Exception as e:
    print("Could not compute class weight:", e)

# === TextVectorization and vocabulary ===
vectorizer = TextVectorization(max_tokens=MAX_TOKENS, output_sequence_length=SEQ_LEN)
# adapt uses a tf dataset or numpy array of strings
vectorizer.adapt(X_train.values)

vocab = vectorizer.get_vocabulary()
vocab_size = len(vocab)
print("Vocab size:", vocab_size)

# === Build the model ===
model = tf.keras.Sequential([
    vectorizer,
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=EMBED_DIM),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(num_classes, activation="softmax")
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)
model.summary()

# === Train ===
history = model.fit(
    X_train.values,
    y_train,
    validation_split=VALIDATION_SPLIT,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    class_weight=class_weights  # set to None if you don't want to use it
)

# === Evaluate ===
loss, acc = model.evaluate(X_test.values, y_test, batch_size=BATCH_SIZE)
print("Test loss:", loss, " Test accuracy:", acc)

# Predictions and classification report
y_prob = model.predict(X_test.values, batch_size=BATCH_SIZE)
y_pred = np.argmax(y_prob, axis=1)
print(classification_report(y_test, y_pred, target_names=le.classes_))

# === Save model and label encoder ===
model_save_path = "text_classifier_tf.keras"
le_save_path = "label_encoder.joblib"
model.save(model_save_path)
joblib.dump(le, le_save_path)
print(f"Saved model -> {model_save_path}, label encoder -> {le_save_path}")

In [None]:
import tensorflow as tf
import numpy as np
import joblib

model = tf.keras.models.load_model("text_classifier_tf.keras")
le = joblib.load("label_encoder.joblib")


In [None]:
text = "acid, sugar, salt, palm oil, wheat flour, fruit"

# Convert to numpy array so TensorFlow accepts it
# x = np.array([text], dtype=tf.string)
x = tf.constant([text], dtype=tf.string)

pred_prob = model.predict(x)
pred_class = np.argmax(pred_prob, axis=1)[0]
label = le.inverse_transform([pred_class])[0]

print("Predicted class:", label)


In [32]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Dense, Embedding, TextVectorization, LSTM, Concatenate, Dropout, Normalization
)
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split

# ===========================
# 1. LOAD DATA
# ===========================
TEXT_COL = "ingredients_text"
NUM_COLS = [
    "energy-kcal_100g",
    "fat_100g",
    "saturated-fat_100g",
    "carbohydrates_100g",
    "sugars_100g",
    "proteins_100g",
    "fiber_100g",
    "salt_100g",
    "sodium_100g"
]
LABEL_COL = "healthy_level"

df = pd.read_csv("./dataset/filtered_data.csv")

# Remove rows with missing text or numeric data
df = df.dropna(subset=[TEXT_COL] + NUM_COLS + [LABEL_COL])

cat = df[LABEL_COL].astype("category")
print("Category order:", list(cat.cat.categories))

Category order: ['Healthy', 'Medium', 'Not Healthy']
