In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [16]:
# Load the dataset
recipes_df = pd.read_csv('../dataset/preprocessed_data.csv')
recipes_df.head()

Unnamed: 0,user_id,recipe_id,date,rating,review,name,minutes,contributor_id,submitted,tags,...,sugar,sodium,protein,saturated_fat,carbohydrates,food_types,negative,neutral,positive,compound
0,7708,60599,2005-09-02,4,very good,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.238,0.762,0.4927
1,27707,60599,2005-12-22,5,better than the real,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.508,0.492,0.4404
2,35308,60599,2006-09-26,5,absolutely awesome i was speechless when i tri...,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.883,0.117,0.659
3,19399,60599,2007-03-09,5,these taste absolutely wonderful my son in law...,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.675,0.325,0.8908
4,43887,60599,2008-02-20,0,made my own buttermilk w vinegar and milk. use...,kfc honey bbq strips,40,166019,2005-08-24,"60-minutes-or-less, time-to-make, main-ingredi...",...,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.929,0.071,0.4588


In [17]:
# Drop irrelevant columns
columns_to_drop = ['user_id', 'date', 'rating', 'review', 'contributor_id', 'submitted', 'n_steps', 'steps', 'description']
recipes_df = recipes_df.drop(columns_to_drop, axis=1)
recipes_df.head()

Unnamed: 0,recipe_id,name,minutes,tags,ingredients,n_ingredients,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates,food_types,negative,neutral,positive,compound
0,60599,kfc honey bbq strips,40,"60-minutes-or-less, time-to-make, main-ingredi...","chicken tenders, flour, garlic powder, salt, g...",12,316.0,4.0,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.238,0.762,0.4927
1,60599,kfc honey bbq strips,40,"60-minutes-or-less, time-to-make, main-ingredi...","chicken tenders, flour, garlic powder, salt, g...",12,316.0,4.0,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.508,0.492,0.4404
2,60599,kfc honey bbq strips,40,"60-minutes-or-less, time-to-make, main-ingredi...","chicken tenders, flour, garlic powder, salt, g...",12,316.0,4.0,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.883,0.117,0.659
3,60599,kfc honey bbq strips,40,"60-minutes-or-less, time-to-make, main-ingredi...","chicken tenders, flour, garlic powder, salt, g...",12,316.0,4.0,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.675,0.325,0.8908
4,60599,kfc honey bbq strips,40,"60-minutes-or-less, time-to-make, main-ingredi...","chicken tenders, flour, garlic powder, salt, g...",12,316.0,4.0,40.0,37.0,78.0,4.0,10.0,Non-veg,0.0,0.929,0.071,0.4588


In [18]:
# Drop null values
recipes_df = recipes_df.dropna()

In [19]:
# Normalize numerical data
numerical_columns = ['minutes', 'n_ingredients', 'calories', 'total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates', 'negative', 'neutral', 'positive', 'compound']
recipes_df[numerical_columns] = (recipes_df[numerical_columns] - recipes_df[numerical_columns].mean()) / recipes_df[numerical_columns].std()

In [20]:
# Extract latent features using LDA
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

recipes_df['tags'] = recipes_df['tags'].astype(str)
tags = recipes_df['tags']
vectorizer = CountVectorizer()
tags_matrix = vectorizer.fit_transform(tags)
lda = LatentDirichletAllocation(n_components=10, random_state=42)
tags_lda = lda.fit_transform(tags_matrix)

In [21]:
tags_lda

array([[0.00344915, 0.00344853, 0.00344878, ..., 0.0034489 , 0.00344874,
        0.00344876],
       [0.00344915, 0.00344853, 0.00344878, ..., 0.0034489 , 0.00344874,
        0.00344876],
       [0.00344915, 0.00344853, 0.00344878, ..., 0.0034489 , 0.00344874,
        0.00344876],
       ...,
       [0.00500121, 0.0050006 , 0.00500086, ..., 0.00500117, 0.00500102,
        0.00500118],
       [0.00500121, 0.0050006 , 0.00500086, ..., 0.00500117, 0.00500102,
        0.00500118],
       [0.00500121, 0.0050006 , 0.00500086, ..., 0.00500117, 0.00500102,
        0.00500118]])

In [None]:
# Split the dataset into training and testing sets
train_df, test_df = train_test_split(recipes_df, test_size=0.2, random_state=42)

# Define the neural network model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(train_df.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
X_train = train_df.drop('recipe_id', axis=1).to_numpy()
y_train = train_df.filter(regex='^lda_feature_', axis=1).to_numpy()
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model on the testing set
X_test = test_df.drop('recipe_id', axis=1).to_numpy()
y_test = test_df.filter(regex='^lda_feature_', axis=1).to_numpy()
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc}')