# Introduction

This notebook is designed to take pre-trained recommender systems and visualise a small sample of the recipes dataset using each system.

The recipe's embeddings are generated, projected down to 2D and plotted on a graph. The graph can be written to disk or observed in the notebook (where it's interactive).

In [1]:
import sys

sys.path.insert(1, "..")

In [2]:
from recipe_rec.data import load_dataset, store
from recipe_rec.systems import (
    Recipe2Vec,
    FeatureGenerationRecommender,
    SBERTRecommender,
    fastRecipeRecommender,
)
from recipe_rec import RANDOM_STATE
from pathlib import Path
import numpy as np
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import plotly.graph_objs as go
import plotly

plotly.io.kaleido.scope.mathjax = None

np.random.seed(RANDOM_STATE + 3)

In [3]:
load_dataset("../data/recipes.csv")

In [4]:
def plot_scatter(
    x, y, labels, colors, sys_name, to_disk=True, index_subset=None, html=False
):

    title = sys_name + " Embedding Space"
    fig = go.Figure()

    if to_disk:

        if not html:
            if index_subset is None:
                index_subset = np.random.randint(low=1, high=len(labels), size=25)
            labels_subset = [
                labels[i] if i in index_subset else "" for i in range(len(labels))
            ]
            mode = "markers+text"
        else:
            labels_subset = labels
            mode = "markers"
        # print([(labels[v],v) for v in index_subset])

    else:
        labels_subset = labels
        mode = "markers"

    fig.add_trace(
        go.Scattergl(
            x=x,
            y=y,
            mode=mode,
            text=labels_subset,
            marker=dict(size=5, color=colors),
            textposition="top center",
            name=title,  # set color equal to a variable
        )
    )

    fig.update_layout(width=2000, height=2000, font={"size": 18})

    if to_disk:
        if html:
            fig.write_html(f"{sys_name}.html")
        else:
            fig.write_image(f"{sys_name}.png", format="png", engine="kaleido")
    else:
        fig.show("notebook")


def dimension_reducer(vecs):

    tsne = TSNE(
        n_components=2, random_state=RANDOM_STATE, init="pca", learning_rate="auto"
    )
    vectors = tsne.fit_transform(vecs)

    x = [v[0] for v in vectors]
    y = [v[1] for v in vectors]
    return x, y


def clustering(x, y):

    xy_zip = list(zip(x, y))

    kmeans = KMeans(n_clusters=15)

    cluster_labels = kmeans.fit_predict(xy_zip)

    return cluster_labels

In [5]:
base_out_path = "_embeddings.png"

# Word2Vec

In [6]:
# rec = Recipe2Vec(model_path=Path("../data/recipe2vec/recipe2vec_06a2136e9ad8459c8ae9e90995b2bd4b.model"),
#                  index_path=Path("../data/recipe2vec/recipe2vec_b8a733c754e64d7db7594c1c2fbb3197.ann"))

In [7]:
# systems = {
#     "Word2Vec" : Recipe2Vec(model_path=Path("../data/recipe2vec/recipe2vec_06a2136e9ad8459c8ae9e90995b2bd4b.model"),
#                  index_path=Path("../data/recipe2vec/recipe2vec_b8a733c754e64d7db7594c1c2fbb3197.ann")),
# #     "FeatureGeneration" : FeatureGenerationRecommender(),
# #     "SBERT":  SBERTRecommender(),
# #     "fastText" : fastRecipeRecommender()

# }

In [8]:
# sample some dataframes
recipes = store["recipes"]

num_samples = 1000

recipes_sample = recipes.sample(num_samples, random_state=RANDOM_STATE)

recipes_sample_ingredients = recipes_sample["RecipeIngredientParts"].values.tolist()
labels = recipes_sample["Name"].values.tolist()

In [22]:
def plot_system_scatter(
    system, recipes_sample_ingredients, system_name, index_subset=None, html=False
):

    # get vectors
    vectors = np.array(
        [system.recipe_vectorizer(recipe) for recipe in recipes_sample_ingredients]
    )

    # call dimension reducing func
    x, y = dimension_reducer(vectors)

    # clustering
    cluster_labels = clustering(x, y)

    # call plotter
    plot_scatter(
        x,
        y,
        labels,
        cluster_labels,
        sys_name=system_name,
        to_disk=True,
        index_subset=index_subset,
        html=html,
    )

In [13]:
import ast

indexes = ast.literal_eval(
    "[('Old Fashioned Spicy Oatmeal Raisin Bars', 5099), ('Quick Chocolate Cake', 9576), ('Carrot-Raisin Quick Bread', 6760), ('Classic English Suet Dumplings', 2252), ('Apple Maple Fool', 2994), ('Fresh Figs Stuffed and Wrapped With Prosciutto', 7848), ('Fresh Herb, Chili and Lemon Spaghettini', 1419), ('Winter Salad', 289), ('Poached Apples in Calvados', 9585), ('Asian Beef Noodle Salad', 3484), ('Plain Cake Jamaican Style', 8728), ('Easy Linguine Del Mar', 241), ('Low Fat Cornbread', 7020), ('Custard Tart With Garibaldi Biscuits by Marcus Wareing', 809), ('Tangy Cranberry Sauce', 7153), ('Havana Cocktail', 5186), ('Strawberry &amp; Cream Pinwheel Appetizers', 9544), ('Half and Half Substitute', 6178), ('Fruit and Caramel Brie', 5484), ('Parmesan Pork Chops', 866), ('Cheesy Baked Fettuccine With Bacon', 6283), ('Chocolate Toast', 7254), ('Prawns Peri-Peri', 2847), ('Maple Syrup Pudding Sauce', 196), ('BLT Chicken Salad With Ranch', 4470)]"
)

In [14]:
indexes_good = [x[1] for x in indexes]

In [15]:
# sbert_model = SBERTRecommender(
#     embeddings_path=Path("../data/sbert/sbert_recipe_embeddingsd05ccab26baf4154b99820d435d5aa9d.pkl"),
#     index_path=Path("../data/sbert/sbert_889c6acca58647418f0df944f261b972.ann"),
# )

# plot_system_scatter(sbert_model, recipes_sample_ingredients, "SBERT",html=False)

In [16]:
# fasttext_model = fastRecipeRecommender(
#     output_dir=Path("../data/fastRecipe/")
# )

In [17]:
# plot_system_scatter(fasttext_model, recipes_sample_ingredients, "fastText",indexes_good)

In [18]:
feat_model = FeatureGenerationRecommender(
    embeddings_path=Path(
        "../data/sbert/sbert_recipe_embeddingsd05ccab26baf4154b99820d435d5aa9d.pkl"
    ),
    classifiers_path=Path(
        "../data/feature_generation/trained_classifiers_3eb9cd75582b41e68b0557f471e522fc.pkl"
    ),
    prelabelled_dataset_path=Path("../data/labelled_data.xlsx"),
    labelled_dataset_path=Path(
        "../data/feature_generation/labelled_dataset_466bf05a687343beb2647be876f6492b.csv"
    ),
)

02:06:13.SentenceTransformer: Load pretrained SentenceTransformer: all-MiniLM-L12-v2
02:06:14.SentenceTransformer: Use pytorch device: cpu


In [23]:
plot_system_scatter(
    feat_model, recipes_sample_ingredients, "Feature_Generation", html=False
)


The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence.



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



Number of distinct clusters (5) found smaller than n_clusters (15). Possibly due to duplicate points in X.



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
