In [1]:
import ast
import pickle

import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [2]:
# Load the ingredient map and the recipe tables.
# NOTE: pickle.load can execute arbitrary code while unpickling; only point
# this at a file you trust.
# The context manager closes the file handle as soon as the pickle is read.
with open('./data/ingr_map.pkl', 'rb') as ingr_map_file:
    ingr_map = pickle.load(ingr_map_file)

recipes = pd.read_csv('./data/PP_recipes.csv')
raw_recipes = pd.read_csv('./data/RAW_recipes.csv')
# One entry per recipe; each entry is parsed later as a list of ingredient ids.
recipe_corpus = recipes['ingredient_ids']

In [19]:
def create_ingr_dict_frame(ingr_map=ingr_map, min_occurence=10):
    """Build a de-duplicated ingredient table with a dense positional index.

    Parameters
    ----------
    ingr_map : DataFrame
        Must provide the columns ``id``, ``replaced`` and ``count``.
    min_occurence : int
        Rows are kept only when ``count`` is strictly greater than this value.

    Returns
    -------
    DataFrame
        Unique ``(id, replaced, count)`` rows sorted by ``id``, with ``id``
        renamed to ``org_id``, ``replaced`` renamed to ``name``, and an extra
        ``idx`` column numbering the rows ``0 .. len - 1`` in that order.
    """
    is_frequent = ingr_map['count'] > min_occurence
    return (
        ingr_map
        .loc[is_frequent, ['id', 'replaced', 'count']]
        .drop_duplicates()
        .sort_values(by='id')
        .rename(columns={'id': 'org_id', 'replaced': 'name'})
        .assign(idx=lambda frame: list(range(len(frame))))
    )

# min_occurence=0 keeps every ingredient whose count is greater than zero.
ingr_frame = create_ingr_dict_frame(min_occurence=0)

In [23]:
# Two-column (org_id, name) copy of the ingredient table with a fresh 0..n-1 index.
mock_table = ingr_frame[['org_id', 'name']].reset_index(drop=True)

In [25]:
# Rebind instead of mutating in place so the cell reads as a plain transformation.
mock_table = mock_table.rename(columns={'org_id': 'ingredient_id'})

In [27]:
# Persist the table to the current working directory.
mock_table.to_pickle('mock_ingredient_table.pkl')

In [4]:
# Lookup tables: original ingredient id -> matrix index, and matrix index -> name.
org_id_to_idx = ingr_frame.set_index('org_id')['idx'].to_dict()
idx_to_name = ingr_frame.set_index('idx')['name'].to_dict()

In [5]:
def org_ids_to_idxs(org_ids, exclude):
    """Translate original ingredient ids into matrix indices.

    Ids equal to ``exclude`` are skipped, and ids missing from the
    module-level ``org_id_to_idx`` mapping are silently dropped. The
    result keeps the order of ``org_ids``.
    """
    return [
        org_id_to_idx[candidate]
        for candidate in org_ids
        if candidate != exclude and candidate in org_id_to_idx
    ]

def mat_size(ingr_frame=ingr_frame):
    """Return ``len(ingr_frame)``; used as the side length of the square matrix."""
    n_ingredients = len(ingr_frame)
    return n_ingredients

In [6]:
def create_occurence_matrix(recipe_corpus=recipe_corpus):
    """Count how often pairs of ingredients appear in the same recipe.

    Parameters
    ----------
    recipe_corpus : iterable of str
        Each element is the textual form of a list of original ingredient
        ids (for example ``"[12, 57, 301]"``).

    Returns
    -------
    numpy.ndarray
        Square float matrix of side ``mat_size()``. Entry ``[m, k]`` is
        incremented whenever the ingredients with indices ``m`` and ``k``
        appear together in a recipe. Ingredients missing from
        ``org_id_to_idx`` are ignored.
    """
    size = mat_size()
    mat = np.zeros((size, size))
    for recipe in tqdm(recipe_corpus):
        # literal_eval only accepts Python literals, so a malformed or
        # malicious cell in the CSV cannot run arbitrary code as eval() would.
        ingr_ids = ast.literal_eval(recipe)
        for ingr in ingr_ids:
            try:
                m = org_id_to_idx[ingr]
            except KeyError:
                # Ingredient was filtered out of the index table; skip it.
                continue
            n = org_ids_to_idxs(ingr_ids, exclude=ingr)
            # Fancy-index increment: each distinct column in `n` gets +1 once.
            # NOTE(review): if a recipe lists the same id twice, row `m` is
            # visited twice while repeated columns collapse, so the matrix can
            # become asymmetric -- confirm whether the corpus has duplicates.
            mat[m, n] += 1
    return mat
            
            
    
def most_similar(idx, co_mat, n=1):
    """Return the ``n`` ingredients most cosine-similar to ingredient ``idx``.

    Parameters
    ----------
    idx : int
        Matrix index of the query ingredient (a key of ``idx_to_name``).
    co_mat : array-like
        NOTE(review): accepted for backward compatibility but never read.
        Similarities are computed on the module-level ``co_mat_reduced``.
        Switching to this argument would change the results of existing calls.
    n : int
        Number of neighbours to return.

    Returns
    -------
    dict
        Maps ingredient name to cosine similarity, in descending order of
        similarity. The query ingredient itself is never included.
    """
    print(f'selected: {idx_to_name[idx]}')
    sims = cosine_similarity(co_mat_reduced[idx].reshape(1, -1), co_mat_reduced).flatten()
    ranked = np.argsort(-sims)
    # Remove the query explicitly instead of assuming it sorts first: with
    # tied scores (duplicate vectors) or an all-zero query row it may not be
    # at position 0, and slicing [1:n+1] would then return the query itself.
    neighbours = ranked[ranked != idx][:n]
    return {idx_to_name[i]: sims[i] for i in neighbours}
    
    
    

In [7]:
# Build the ingredient-by-ingredient co-occurrence counts from the default recipe corpus.
co_mat = create_occurence_matrix()

100%|██████████| 178265/178265 [00:13<00:00, 12741.33it/s]


In [8]:
# TruncatedSVD uses a randomized solver by default; fixing random_state makes
# the reduced matrix (and every similarity derived from it) reproducible.
svd = TruncatedSVD(n_components=20, random_state=0)

In [9]:
# Project each ingredient's co-occurrence row onto the fitted SVD components.
co_mat_reduced = svd.fit_transform(co_mat)

In [10]:
# NOTE(review): most_similar does not read its second argument; the scores
# below are computed from the global co_mat_reduced.
most_similar(57, co_mat, n=5)

selected: american cheese


{'cheddar': 0.9834412936919918,
 'processed cheese': 0.9775793597547966,
 'velveeta cheese': 0.9738401287584965,
 'cooked ham': 0.9719391219484539,
 'medium noodle': 0.9718757272123286}

In [11]:
# Two components for plotting. random_state is fixed so the layout is the
# same on every run (the default randomized solver is otherwise unseeded).
svd_2 = TruncatedSVD(n_components=2, random_state=0)
co_mat_2d = svd_2.fit_transform(co_mat)

In [12]:
# Ingredient names in row order of co_mat_2d.
names = [idx_to_name[row] for row in range(len(co_mat_2d))]

In [13]:
# One row per ingredient; the two SVD components become the plot coordinates.
df = pd.DataFrame({'x': co_mat_2d[:, 0], 'y': co_mat_2d[:, 1]})

In [14]:
# Attach the ingredient names as a third column.
df = df.assign(name=names)

In [36]:
# Preview the first rows of the 2-D coordinates with their ingredient names.
df.head()

Unnamed: 0,x,y,name
0,8.520132,-5.8889,'s chocolate chip
1,10.866503,-7.546211,'s syrup
2,5.044401,-1.435586,7-up
3,15.519834,-5.186499,7-up soda
4,78.60577,43.099998,a.1. sauce


In [41]:
# Interactive scatter of the 2-component projection; hovering shows the
# ingredient name. Title and axis labels let the figure stand on its own.
fig = px.scatter(
    df,
    x="x",
    y="y",
    hover_data=['name'],
    title="Ingredient co-occurrence (2-component truncated SVD)",
    labels={"x": "SVD component 1", "y": "SVD component 2"},
)
fig.show()