In [12]:
import pandas as pd
import ast
from sentence_transformers import SentenceTransformer

RANDOM_STATE = 42

LABELLED_COLS = [
    "Savoury",
    "Rough",
    "Hot",
    "Spicy",
    "Acidic",
    "Crunchy",
    "Creamy",
    "Sticky",
    "Liquid",
    "Aromatic",
    "Salty",
    "Citrusy",
    "Herbal",
    "Fluffy",
    "Flaky",
    "Cooling",
    "Chunky",
    "Fishy",
    "Firm",
]

In [17]:
pd.read_csv("trimmed_recipes_labelled.csv", nrows=100).head(1)["RecipeIngredientParts"][
    0
]

'"strawberry", "feta cheese", "avocado", "walnuts", "olive oil", "sugar"'

# Prepare dataset

Import the full recipe dataset:

In [13]:
recipes = pd.read_csv("../data/recipes.csv")

Import the partially-labelled dataset:

In [3]:
df = pd.read_csv("trimmed_recipes_labelled.csv")

df = df[["OriginalIndex", "RecipeId", "Name", "Description"] + LABELLED_COLS]

df["OriginalIndex"] = df["OriginalIndex"].astype("int64")

df = df.set_index("OriginalIndex")

  df = pd.read_csv("trimmed_recipes_labelled.csv")


Drop rows that aren't labelled:

In [4]:
df = df.dropna(subset=LABELLED_COLS)

Join the datasets to get all columns for the labelled rows:

In [5]:
labelled_recipes = df[LABELLED_COLS].join(recipes, how="inner")

We now create different datasets to compare the effect of using title embeddings, ingredient embeddings and description embeddings.

The SBERT model is imported as its state will not change with use:

In [11]:
transformer_model: str = "all-mpnet-base-v2"

model: SentenceTransformer = SentenceTransformer(transformer_model)

# Old

### Title

Create embeddings for title:

In [46]:
# X_title = model.encode(labelled_recipes["Name"].values)
# y_title = labelled_recipes["Savoury"].values
# datasets["title"] = (X_title, y_title)

### Description

In [51]:
# X_description = model.encode(labelled_recipes["Description"].values)

# y_description = labelled_recipes["Savoury"].values

# datasets["description"] = (X_description, y_description)

# Classification

### Ingredients

Convert ingredients to lists:

In [40]:
ingredient_recipes = labelled_recipes.drop(
    labelled_recipes[labelled_recipes["RecipeIngredientParts"].str[:2] != "c("].index
)

ingredient_recipes["RecipeIngredientParts"] = ingredient_recipes[
    "RecipeIngredientParts"
].str[1:]

parseTuple = lambda tupleStr: ast.literal_eval(tupleStr)


def parseTupleFunc(tupleStr):

    try:
        return ast.literal_eval(tupleStr)

    except Exception as e:

        print(tupleStr)


ingredient_recipes["RecipeIngredientParts"] = ingredient_recipes[
    "RecipeIngredientParts"
].apply(parseTupleFunc)

ingredient_recipes = ingredient_recipes[["RecipeIngredientParts", "Savoury"]]

Convert the list into a string:

In [41]:
ingredient_recipes["RecipeIngredientParts"] = ingredient_recipes[
    "RecipeIngredientParts"
].str.join(" ")

Embed the strings:

In [59]:
ingredient_embeds = model.encode(ingredient_recipes["RecipeIngredientParts"].values)

Create dataset:

In [63]:
datasets["ingredient"] = (ingredient_embeds, ingredient_recipes["Savoury"].values)

# Create classifier

In [61]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [64]:
for dataset in datasets:
    classifier_model = RandomForestClassifier()

    X_train, X_test, y_train, y_test = train_test_split(
        datasets[dataset][0],
        datasets[dataset][1],
        test_size=0.30,
        random_state=RANDOM_STATE,
    )

    classifier_model.fit(X_train, y_train)

    y_pred = classifier_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    print(f"{dataset}, accuracy: {accuracy}%")

title, accuracy: 0.885%
ingredient, accuracy: 0.9319727891156463%
description, accuracy: 0.855%
