Calculate the minimum necessary vector size for Word2Vec as described by [Patel and Bhattacharyya](https://aclanthology.org/I17-2006/).

Import packages and the dataset:

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the recipes table; the relative path is resolved against the current working directory.
recipes = pd.read_csv("../data/recipes.csv")

Create the ingredient co-occurrence matrix:

In [None]:
# Build the ingredient vocabulary in first-seen order:
#   ingredients[k]           -> ingredient stored at index k
#   ingredient_indexes[name] -> index k of that ingredient
ingredients = []
ingredient_indexes = {}

recipe_ingredients = recipes["RecipeIngredientParts"]

# NOTE(review): assumes every entry of the column is an iterable of ingredient
# names (e.g. a list). If the CSV leaves them as raw strings, the inner loop
# walks single characters instead -- TODO confirm the column is parsed first.
for recipe in recipe_ingredients:

    for ingredient in recipe:

        # assign the next free index to each ingredient the first time it appears
        if ingredient not in ingredient_indexes:

            ingredient_indexes[ingredient] = len(ingredients)
            ingredients.append(ingredient)

In [None]:
# Vocabulary size: the number of distinct ingredients, and the side length of the matrices built from it.
len(ingredients)

3631

In [None]:
# Square co-occurrence matrix with one row and one column per ingredient.
vocabulary_size = len(ingredients)
matrix = np.zeros((vocabulary_size, vocabulary_size))

for recipe in recipe_ingredients:

    # Resolve the recipe's ingredients to matrix indexes once up front.
    recipe_indexes = [ingredient_indexes[name] for name in recipe]

    # Visit every ordered pair (self-pairs included) and bump both mirrored
    # cells. Each unordered pair is reached from both orderings, so one
    # co-occurrence in a recipe adds 2 to each of its two cells.
    for row in recipe_indexes:

        for col in recipe_indexes:

            matrix[row][col] += 1
            matrix[col][row] += 1

Create a word-word cosine similarity matrix using the co-occurrence matrix:

In [None]:
from scipy import spatial
import tqdm

# Pairwise cosine similarity between the rows of the co-occurrence matrix.
# The diagonal is never written and therefore stays at 0.
n_rows = matrix.shape[0]
similarity_matrix = np.zeros((n_rows, n_rows))

for i in tqdm.tqdm(range(n_rows)):

    row_i = matrix[i]

    # Cosine similarity is symmetric, so each unordered pair is computed once
    # (j > i) and mirrored into both cells, rather than being recomputed from
    # the other side.
    for j in range(i + 1, n_rows):

        # scipy returns the cosine *distance*; convert it to a similarity
        similarity = 1.0 - spatial.distance.cosine(row_i, matrix[j])

        similarity_matrix[i][j] = similarity
        similarity_matrix[j][i] = similarity

100%|██████████| 3631/3631 [10:38<00:00,  5.69it/s]


Save the resulting matrix to disk for re-use:

In [None]:
import pickle

# Write the similarity matrix to disk in NumPy's .npy format.
# NOTE(review): the cell that reloads the matrix opens "similarity_matrix.npy",
# not this file name -- confirm which file is meant to be re-used.
with open("similarity_matrix_trimmed.npy", "wb") as output_file:
    np.save(output_file, similarity_matrix, allow_pickle=True)

In [None]:
import tqdm.notebook as tqdm

In [None]:
import numpy as np

# Read a previously saved similarity matrix back from disk so the expensive
# pairwise computation does not have to be repeated.
with open("similarity_matrix.npy", "rb") as input_file:
    similarity_matrix = np.load(input_file)

Create mapping table:

In [None]:
import networkx as nx
from networkx.algorithms import approximation


def lambda_lookup_func(i):
    """Return the tabulated lambda value for ``i``.

    The table covers ``i`` from 3 to 43 inclusive. Any other argument falls
    through every case and yields ``None``.

    NOTE(review): the values are presumably taken from the Patel and
    Bhattacharyya paper this notebook follows -- confirm against the source.
    """
    # Two plateaus where a whole closed interval shares a single value.
    if 7 <= i <= 13:
        return 28
    if 23 <= i <= 41:
        return 276

    # Everything else is an exact-match entry.
    exact_values = {
        3: 6,
        4: 6,
        5: 10,
        6: 16,
        14: 30,
        15: 36,
        16: 42,
        17: 51,
        18: 61,
        19: 76,
        20: 96,
        21: 126,
        22: 176,
        42: 288,
        43: 344,
    }

    # .get() returns None for arguments that are not in the table
    return exact_values.get(i)


# Materialise the table as a dict keyed by the lookup argument (3..43).
# It has to be a mapping rather than a set, because it is indexed by key
# (lambda_lookup[max_clique_size]) further down.
lambda_lookup = {i: lambda_lookup_func(i) for i in range(3, 44)}

In [None]:
# Start from an edgeless graph holding one node per row of the similarity
# matrix; the edges are added and cleared again for each similarity value.
G = nx.Graph()
G.add_nodes_from(range(similarity_matrix.shape[0]))

In [None]:
# Collect the distinct similarity values; np.unique flattens the matrix and
# returns the values in sorted order.
unique_similarities = np.unique(similarity_matrix)

For each unique value in the similarity matrix, build a graph whose edges connect the pairs with exactly that similarity, and identify its maximum clique.

In [None]:
lambdas = []

# Enumerate the lower triangle (diagonal included) once, in row-major order.
# Each pass below is then a single vectorised comparison instead of a
# Python-level scan over every cell of the matrix.
tri_rows, tri_cols = np.tril_indices(similarity_matrix.shape[0])
tri_values = similarity_matrix[tri_rows, tri_cols]

for val in tqdm.tqdm(unique_similarities, desc="outer", position=0):

    # connect every pair of nodes whose similarity is exactly this value;
    # .tolist() converts the indexes to plain Python ints so they match the
    # integer nodes already in the graph
    matches = np.flatnonzero(tri_values == val)
    G.add_edges_from(zip(tri_rows[matches].tolist(), tri_cols[matches].tolist()))

    max_clique = approximation.max_clique(G)

    print("got max clique")
    max_clique_size = len(max_clique)

    # NOTE(review): clique sizes that are absent from the lookup table are not
    # handled here -- confirm how cliques outside the tabulated range should
    # be treated.
    lambda_k = lambda_lookup[max_clique_size]

    lambdas.append(lambda_k)

    # drop this value's edges (nodes are kept) before the next value
    G.clear_edges()

outer:   0%|          | 0/6586847 [00:00<?, ?it/s]

inner:   0%|          | 0/3631 [00:00<?, ?it/s]

: 

: 

In [None]:
# The largest lambda collected over all similarity values is taken as the minimum vector size.
min_vec_size = max(lambdas)