# Introduction

This notebook takes pre-trained recommender systems and visualises a small sample of the recipes dataset. This is performed for the Word2Vec embedding space, but the functions outlined are easily adapted for use with other embedding spaces.

The recipes' embeddings are generated, projected down to 2D and plotted on a graph. The graph can be written to disk or displayed in the notebook (where it is interactive).

In [1]:
import sys

# Add the parent directory to the module search path so that the `recipe_rec`
# package imported in the next cell can be resolved (presumably ".." is the
# repository root -- verify against the project layout).
sys.path.insert(1, "..")

In [2]:
from recipe_rec.data import load_dataset, store
from recipe_rec.models import (
    Recipe2Vec,
    FeatureGenerationRecommender,
    SBERTRecommender,
    fastRecipeRecommender,
)
from recipe_rec import RANDOM_STATE
from pathlib import Path
import numpy as np
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import plotly.graph_objs as go
import plotly

# NOTE(review): clears the MathJax setting on plotly's Kaleido export scope --
# presumably so static image export does not try to load MathJax; confirm why.
plotly.io.kaleido.scope.mathjax = None

# Seed NumPy's global RNG so np.random calls further down (e.g. the random
# choice of labelled points in plot_scatter) are repeatable.
# NOTE(review): the reason for the "+ 3" offset is not visible here -- TODO confirm.
np.random.seed(RANDOM_STATE + 3)

In [3]:
# The return value is discarded; recipes are read later via store["recipes"],
# so this call presumably populates the shared `store` -- verify in recipe_rec.data.
load_dataset("../data/recipes.csv")

In [4]:
def plot_scatter(
    x, y, labels, colors, sys_name, to_disk=True, index_subset=None, html=False
):
    """Draw a 2D scatter plot of embedded recipes and save or display it.

    Parameters
    ----------
    x, y : sequence
        Coordinates of the points, one entry per recipe.
    labels : sequence of str
        Text attached to each point, aligned with ``x`` / ``y``.
    colors : sequence
        Per-point colour values handed to the Plotly marker.
    sys_name : str
        Name of the recommender; used for the trace name and as the stem of
        the output file (``<sys_name>.png`` or ``<sys_name>.html``).
    to_disk : bool, default True
        If True the figure is written to a file, otherwise it is shown in
        the notebook.
    index_subset : iterable of int, optional
        Positions of the points whose label is drawn on the static PNG.
        When omitted, up to 25 distinct positions are drawn at random from
        NumPy's global RNG. Ignored for HTML output and notebook display.
    html : bool, default False
        Only relevant when ``to_disk`` is True: write an HTML file instead
        of a PNG.
    """
    title = sys_name + " Embedding Space"
    fig = go.Figure()

    if to_disk and not html:
        # Static image: only a handful of points get their label drawn.
        if index_subset is None:
            # permutation() gives distinct indices, can select index 0 and
            # copes with fewer than 25 labels; randint(low=1, ...) did none
            # of these.
            max_text_labels = 25
            index_subset = np.random.permutation(len(labels))[:max_text_labels]
        # A set makes the membership test O(1) per point.
        labelled = {int(i) for i in index_subset}
        labels_subset = [
            label if i in labelled else "" for i, label in enumerate(labels)
        ]
        mode = "markers+text"
    else:
        # HTML file or notebook display: pass every label, markers only.
        labels_subset = labels
        mode = "markers"

    fig.add_trace(
        go.Scattergl(
            x=x,
            y=y,
            mode=mode,
            text=labels_subset,
            marker=dict(size=5, color=colors),
            textposition="top center",
            name=title,  # trace name
        )
    )

    fig.update_layout(width=2000, height=2000, font={"size": 18})

    if to_disk:
        if html:
            fig.write_html(f"{sys_name}.html")
        else:
            fig.write_image(f"{sys_name}.png", format="png", engine="kaleido")
    else:
        fig.show("notebook")


def dimension_reducer(vecs):
    """Project embedding vectors down to two dimensions with t-SNE.

    The projection is PCA-initialised and seeded with ``RANDOM_STATE``.
    Returns a pair ``(x, y)`` of lists holding the first and second
    coordinate of every projected point.
    """
    projector = TSNE(
        init="pca",
        learning_rate="auto",
        n_components=2,
        random_state=RANDOM_STATE,
    )
    projected = projector.fit_transform(vecs)

    # Split the (n_points, 2) result into one list per axis.
    x, y = (list(projected[:, axis]) for axis in (0, 1))
    return x, y


def clustering(x, y, n_clusters=15):
    """Assign each 2D point to a k-means cluster.

    Parameters
    ----------
    x, y : sequence
        Coordinates of the points, one entry per point.
    n_clusters : int, default 15
        Requested number of clusters. It is capped at the number of points,
        because k-means cannot form more clusters than there are samples.

    Returns
    -------
    The cluster label of every point, as returned by ``KMeans.fit_predict``.
    """
    points = np.column_stack((x, y))

    # Seed explicitly so the cluster colouring does not depend on how much of
    # NumPy's global random stream earlier cells happened to consume.
    kmeans = KMeans(
        n_clusters=min(n_clusters, len(points)), random_state=RANDOM_STATE
    )

    return kmeans.fit_predict(points)

In [5]:
# NOTE(review): not referenced by any cell visible here -- plot_scatter builds
# its output file names from `sys_name`. TODO confirm whether this is still needed.
base_out_path = "_embeddings.png"

# Word2Vec

Load a pre-built system:

In [6]:
# Build the Word2Vec-based recommender from a saved model file and a saved
# index file (paths are relative to the notebook's directory).
# NOTE(review): the model and index file names carry different hash suffixes --
# confirm that they belong to the same training run.
word2vec_model = Recipe2Vec(
    model_path=Path(
        "../data/recipe2vec/recipe2vec_06a2136e9ad8459c8ae9e90995b2bd4b.model"
    ),
    index_path=Path(
        "../data/recipe2vec/recipe2vec_b8a733c754e64d7db7594c1c2fbb3197.ann"
    ),
)

Sample recipes to display:

In [8]:
# Draw a reproducible random subset of recipes (rows) to visualise.
recipes = store["recipes"]

num_samples = 1000

# Cap the request at the dataset size: sampling without replacement raises a
# ValueError when asked for more rows than exist.
recipes_sample = recipes.sample(
    min(num_samples, len(recipes)), random_state=RANDOM_STATE
)

# Ingredient lists are the input to the vectoriser; names become the plot labels.
recipes_sample_ingredients = recipes_sample["RecipeIngredientParts"].values.tolist()
labels = recipes_sample["Name"].values.tolist()

Create a function to plot the data:

In [22]:
def plot_system_scatter(
    system,
    recipes_sample_ingredients,
    system_name,
    index_subset=None,
    html=False,
    recipe_labels=None,
    to_disk=True,
):
    """Embed, project, cluster and plot a sample of recipes for one system.

    Parameters
    ----------
    system : object
        Recommender exposing ``recipe_vectorizer(recipe)``, which is called
        once per recipe to obtain its embedding vector.
    recipes_sample_ingredients : sequence
        One entry per recipe, passed unchanged to ``recipe_vectorizer``.
    system_name : str
        Forwarded to ``plot_scatter`` as ``sys_name``.
    index_subset : iterable of int, optional
        Forwarded to ``plot_scatter``.
    html : bool, default False
        Forwarded to ``plot_scatter``.
    recipe_labels : sequence of str, optional
        Point labels aligned with ``recipes_sample_ingredients``. When
        omitted, the notebook-level ``labels`` variable is used, which is
        what this function always did before the parameter existed.
    to_disk : bool, default True
        Forwarded to ``plot_scatter``; pass False to display the figure in
        the notebook instead of writing a file.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of recipes.
    """
    if recipe_labels is None:
        # Backward-compatible fallback to the notebook global.
        recipe_labels = labels

    if len(recipe_labels) != len(recipes_sample_ingredients):
        raise ValueError(
            f"got {len(recipe_labels)} labels for "
            f"{len(recipes_sample_ingredients)} recipes"
        )

    # get vectors
    vectors = np.array(
        [system.recipe_vectorizer(recipe) for recipe in recipes_sample_ingredients]
    )

    # call dimension reducing func
    x, y = dimension_reducer(vectors)

    # clustering (cluster ids are used as the point colours)
    cluster_labels = clustering(x, y)

    # call plotter
    plot_scatter(
        x,
        y,
        recipe_labels,
        cluster_labels,
        sys_name=system_name,
        to_disk=to_disk,
        index_subset=index_subset,
        html=html,
    )

In [15]:
# Static export: with html=False the figure is written to "Word2Vec.png" in the
# current working directory.
plot_system_scatter(word2vec_model, recipes_sample_ingredients, "Word2Vec", html=False)