# Embeddings on recipes with BERT, Neural Network for classification

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

import torch
from transformers import BertTokenizer, BertModel, TFBertModel
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from tqdm import tqdm

import tensorflow as tf

from keras.layers import Input, Dense, Dropout, Flatten
from keras.models import Model, Sequential
from keras.optimizers import Adam

# Dataset Loading
We upload the old dataset (the one used for the slide presentations): `recipes_df_r.csv` is made of 5.000 recipes.

In [3]:
df = pd.read_csv('./dataset/recipes_df_r.csv')
len(df)

10000

In [4]:
df.columns

Index(['cooking_method', 'ingredients', 'recipe_name', 'tags',
       'Vegetarian&Desserts', 'Others&D', 'Vegetarian', 'Others', 'Dairy Free',
       'Gluten Free', 'Low Carb', 'Low Fat', 'Low Sodium'],
      dtype='object')

In [5]:
df['Vegetarian&Desserts'].value_counts()

Vegetarian&Desserts
0    6670
1    3330
Name: count, dtype: int64

In [6]:
# Keep only the columns we need
columns = ['cooking_method', 'ingredients', 'Vegetarian&Desserts']
df = df[columns]

## Mantaining Veg distribution (33%)

We restrict even more the dataset: keep only 666 Veg and 1.334 Non-Veg recipes.

In [10]:
# take 2.000 samples keeping same distribution of feature 'Vegetarian&Desserts'
df_v = df[df['Vegetarian&Desserts'] == 1].sample(n=666)
df_nv = df[df['Vegetarian&Desserts'] == 0].sample(n=1334)

df = pd.concat([df_v, df_nv])
df = df.sample(frac=1)
df.head()

Unnamed: 0,cooking_method,ingredients,Vegetarian&Desserts
6734,['Heat a nonstick or cast-iron skillet over me...,['2 slices sourdough sandwich bread or crusty ...,1
6512,"['Blend 15 to 20 seconds, use a spatula to rem...","['8 ounces crushed ice', '1 ounce strawberry j...",0
7273,['Cut off about 1 inch from both ends of all 4...,"['4 medium navel oranges (2 to 2 1/4 pounds), ...",1
98,"['In a large, heavy bottom saucepan over mediu...","['4 tablespoons unsalted butter', '1 medium le...",0
5802,"['Directions', 'Bring a large pot of salted wa...","['Kosher salt', '1/2 pound medium shell pasta'...",0


In [11]:
# get the cooking_method as a list of strings
cooking_methods = df['cooking_method'].values.tolist()
type(cooking_methods)

list

In [12]:
# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [7]:
X = cooking_methods
y = df['Vegetarian&Desserts'].values

# Assuming 'X' contains your input data (cooking methods) and 'y' contains the target labels
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize input data using BERT tokenizer
max_length = 200  
X_train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length, return_tensors='tf')
X_val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=max_length, return_tensors='tf')

# Obtain BERT embeddings
train_outputs = bert_model(X_train_encodings)
val_outputs = bert_model(X_val_encodings)

In [8]:
# Extract BERT embeddings
train_embeddings = train_outputs.last_hidden_state
val_embeddings = val_outputs.last_hidden_state

In [9]:
train_embeddings.shape, val_embeddings.shape

(TensorShape([1600, 200, 768]), TensorShape([400, 200, 768]))

In [12]:
y_train.shape, y_val.shape

((1600,), (400,))

In [13]:
# reshape y_train and y_val
y_train = y_train.reshape(-1, 1)
y_val = y_val.reshape(-1, 1)

y_train.shape, y_val.shape

((1600, 1), (400, 1))

Now, we define our Neural Network classifier using Keras, we train it and test.

In [18]:
# Define feedforward neural network for classification using Keras
input_layer = Input(shape=(max_length, 768))  # BERT embedding size is 768
flatten_layer = Flatten()(input_layer)  # Flatten the BERT embeddings
dense_layer1 = Dense(128, activation='relu')(flatten_layer)
dropout_layer = Dropout(0.2)(dense_layer1)
dense_layer2 = Dense(64, activation='relu')(dropout_layer)
output_layer = Dense(1, activation='sigmoid')(dense_layer2)  # Assuming binary classification

# Compile the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer=Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(train_embeddings.numpy(), y_train, epochs=10, batch_size=10, verbose=True)

Epoch 1/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 29ms/step - accuracy: 0.7047 - loss: 0.6005
Epoch 2/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 29ms/step - accuracy: 0.8287 - loss: 0.3739
Epoch 3/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.9198 - loss: 0.2339
Epoch 4/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.9791 - loss: 0.1296
Epoch 5/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.9858 - loss: 0.0934
Epoch 6/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.9942 - loss: 0.0590
Epoch 7/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 29ms/step - accuracy: 0.9966 - loss: 0.0450
Epoch 8/10
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.9975 - loss: 0.0356
Epoch 9/10
[1m160/160[0m [32m

In [19]:
# predict the output and compare with y_val
predicted_labels = model.predict(val_embeddings.numpy())
predicted_labels = (predicted_labels > 0.5).astype(int)

# classification report
print(classification_report(y_val, predicted_labels))

# confusion matrix
cm = confusion_matrix(y_val, predicted_labels)
print(cm)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
              precision    recall  f1-score   support

           0       0.78      0.92      0.84       251
           1       0.81      0.55      0.66       149

    accuracy                           0.79       400
   macro avg       0.79      0.74      0.75       400
weighted avg       0.79      0.79      0.77       400

[[232  19]
 [ 67  82]]


## Balancing Veg distribution (50%)

In [13]:
df = pd.read_csv('./dataset/recipes_df_r.csv')

# keep only the columns we need
columns = ['cooking_method', 'ingredients', 'Vegetarian&Desserts']
df = df[columns]

# take 3.000 samples balancing the feature 'Vegetarian&Desserts'
df_v = df[df['Vegetarian&Desserts'] == 1].sample(n=1500)
df_nv = df[df['Vegetarian&Desserts'] == 0].sample(n=1500)

df = pd.concat([df_v, df_nv])
df = df.sample(frac=1)
df.head()

Unnamed: 0,cooking_method,ingredients,Vegetarian&Desserts
7498,['In a cocktail shaker mix all the ingredients...,"['2 ounces vanilla vodka', '1-ounce peach schn...",0
6122,"[""Make a cone holder: Find a small, sturdy, cl...","['4 ounces semisweet chocolate, chopped into s...",1
3820,"['In a high-sided skillet over medium heat, ad...","['1 teaspoon canola oil', '1/2 red onion, slic...",0
6723,['Place the garlic in cup of boiling water for...,"['5 cloves garlic, peeled', '2 tablespoons oli...",0
4699,"['Drain the beans and set aside. In a bowl, wh...","['1 pound dried navy beans, soaked overnight i...",0


In [14]:
# get the cooking_method as a list of strings
cooking_methods = df['cooking_method'].values.tolist()

In [15]:
# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
X = cooking_methods
y = df['Vegetarian&Desserts'].values

# Assuming 'X' contains your input data (cooking methods) and 'y' contains the target labels
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [25]:
# Tokenize input data using BERT tokenizer
max_length = 200  
X_train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length, return_tensors='tf')
X_val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=max_length, return_tensors='tf')

# Obtain BERT embeddings
train_outputs = bert_model(X_train_encodings)
val_outputs = bert_model(X_val_encodings)

# Extract BERT embeddings
train_embeddings = train_outputs.last_hidden_state
val_embeddings = val_outputs.last_hidden_state
train_embeddings.shape, val_embeddings.shape

(TensorShape([2400, 200, 768]), TensorShape([600, 200, 768]))

In [None]:
# reshape y_train and y_val
y_train = y_train.reshape(-1, 1)
y_val = y_val.reshape(-1, 1)

Now, we define our Neural Network classifier using Keras, we train it and test.

In [51]:
# Define feedforward neural network for classification using Keras
input_layer = Input(shape=(max_length, 768))  # BERT embedding size is 768
flatten_layer = Flatten()(input_layer)  # Flatten the BERT embeddings
dense_layer1 = Dense(64, activation='relu')(flatten_layer)
dropout_layer = Dropout(0.4)(dense_layer1)
dense_layer2 = Dense(32, activation='relu')(dropout_layer)
dropout_layer2 = Dropout(0.2)(dense_layer2)
dense_layer3 = Dense(16, activation='relu')(dropout_layer2)
dense_layer4 = Dense(8, activation='relu')(dense_layer3)
output_layer = Dense(1, activation='sigmoid')(dense_layer4)  # Assuming binary classification

# Compile the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer=Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(train_embeddings.numpy(), y_train, epochs=30, batch_size=32, verbose=True)
# predict the output and compare with y_val
predicted_labels = model.predict(val_embeddings.numpy())
predicted_labels = (predicted_labels > 0.5).astype(int)

# classification report
print(classification_report(y_val, predicted_labels))

# confusion matrix
cm = confusion_matrix(y_val, predicted_labels)
print(cm)

Epoch 1/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.5146 - loss: 0.6968
Epoch 2/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.5791 - loss: 0.6759
Epoch 3/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.6571 - loss: 0.6279
Epoch 4/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.7152 - loss: 0.5592
Epoch 5/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.7675 - loss: 0.5025
Epoch 6/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.8053 - loss: 0.4781
Epoch 7/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.8154 - loss: 0.4486
Epoch 8/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.8359 - loss: 0.4120
Epoch 9/30
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━