In [97]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import ast
import json
import re
from collections import defaultdict
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import silhouette_score

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
import requests
import time

from bs4 import BeautifulSoup
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder

In [16]:
# Load the recipe table (one row per recipe, one column per ingredient flag).
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapping is needed.
ingredients_df = pd.read_csv('data/recipes.csv')

In [92]:
# Locations of the ingredient/compound tables of the flavor network data set
comp_path = 'data/flavor_network_data/ingr_comp/comp_info.tsv'
ingr_path = 'data/flavor_network_data/ingr_comp/ingr_info.tsv'
ingr_comp_pathh = 'data/flavor_network_data/ingr_comp/ingr_comp.tsv'

# Read the three tab-separated files
comp_tsv = pd.read_csv(comp_path, sep='\t')
ingr_tsv = pd.read_csv(ingr_path, sep='\t')
ingr_comp_tsv = pd.read_csv(ingr_comp_pathh, sep='\t')

# Compound table: replace the header names from the file with snake_case names
comp_columns = ['compound_id', 'compound_name', 'CAS_number']
comp_df = pd.DataFrame(comp_tsv)
comp_df.columns = comp_columns

# Ingredient table: same treatment
ingr_columns = ['ingredient_id', 'ingredient_name', 'ingredient_category']
ingr_df = pd.DataFrame(ingr_tsv)
ingr_df.columns = ingr_columns


In [19]:
ingr_comp_df = pd.DataFrame(data = ingr_comp_tsv)

In [20]:
ingr_comp_df.columns

Index(['# ingredient id', 'compound id'], dtype='object')

In [21]:
# Give the ingredient/compound link table the same snake_case column names as the
# other tables.  Reassigning instead of inplace=True keeps the cell free of in-place
# mutation and lets the call be chained.
ingr_comp_df = ingr_comp_df.rename(columns={
    '# ingredient id': 'ingredient_id',
    'compound id': 'compound_id'
})

In [22]:
# Backbone of the flavor network: one file with the edges, one with the nodes
flav_edges_path = 'data/flavor_network_data/flavor_network_backbone/flavor_network_backbone_edges.tsv'
flav_nodes_path = 'data/flavor_network_data/flavor_network_backbone/flavor_network_backbone_nodes.tsv'

flav_edges_tsv = pd.read_csv(flav_edges_path, sep='\t')
flav_nodes_tsv = pd.read_csv(flav_nodes_path, sep='\t')

# Edge table: a pair of ingredients and how many compounds they share
flav_edges_columns = ['ingredient_1', 'ingredient_2', 'number_of_shared_compounds']
flav_edges_df = pd.DataFrame(flav_edges_tsv)
flav_edges_df.columns = flav_edges_columns

# Node table: ingredient name, layout coordinates, prevalence and an RGB colour
flav_nodes_columns = ['ingredient_name', 'x_coordinate', 'y_coordinate', 'prevalence', 'r', 'g', 'b']
flav_nodes_df = pd.DataFrame(flav_nodes_tsv)
flav_nodes_df.columns = flav_nodes_columns

In [23]:
# Function to replace spaces with underscores
def replace_spaces(value):
    """Return ``value`` with every space replaced by an underscore.

    Anything that is not a ``str`` (numbers, NaN, None, ...) is returned unchanged.
    """
    return value.replace(' ', '_') if isinstance(value, str) else value

In [24]:
# Normalise every free-text name column (spaces -> underscores) in the same order
# as before: ingredient table, edge table, node table.
for frame, text_column in (
    (ingr_df, 'ingredient_name'),
    (ingr_df, 'ingredient_category'),
    (flav_edges_df, 'ingredient_1'),
    (flav_edges_df, 'ingredient_2'),
    (flav_nodes_df, 'ingredient_name'),
):
    frame[text_column] = frame[text_column].apply(replace_spaces)

In [25]:
# Read the country -> region table (tab separated, no header row in the file)
file_path = 'data/flavor_network_data/scirep-cuisines-detail/map.txt'
mapping = pd.read_csv(file_path, sep='\t', header=None, names=['country', 'region'])

# Strip surrounding whitespace from both text columns
for text_column in ('country', 'region'):
    mapping[text_column] = mapping[text_column].str.strip()

# Distinct values of each column, in order of first appearance
unique_countries = mapping['country'].unique()
unique_regions = mapping['region'].unique()

# One row per region holding the list of its countries
countries_by_region = mapping.groupby('region')['country']
regions_countries = countries_by_region.apply(list).reset_index()

In [26]:
# Define the cuisine mapping with adjectival forms and variations.
# Keys are the canonical cuisine names; each value lists the aliases (lower-case,
# as they are compared after lower-casing and space -> underscore conversion)
# that should be folded into that canonical name.
#
# NOTE(review): 'israel' and 'iran' are listed under 'middle_eastern' AND under
# 'israeli' / 'iranian'.  An alias -> cuisine dictionary built by iterating this
# mapping in order keeps the LAST occurrence, so those two aliases resolve to
# 'israeli' / 'iranian' and their 'middle_eastern' entries have no effect.
# Confirm which of the two was intended.
cuisine_mapping = {
    'vietnamese': ['vietnamese', 'vietnam'],
    'indian': ['indian', 'india'],
    'spanish_portuguese': ['spanish_portuguese'],
    'jewish': ['jewish'],
    'french': ['french', 'france'],
    'central_south_american': ['central_southamerican'],
    'cajun_creole': ['cajun_creole'],
    'thai': ['thai', 'thailand'],
    'scandinavian': ['scandinavian', 'scandinavia'],
    'greek': ['greek'],
    'american': ['american'],
    'african': ['african'],
    # 'iran' and 'israel' below are shadowed by the 'iranian' / 'israeli' entries further down
    'middle_eastern': ['middleeastern', 'middle_eastern', 'turkey', 'iran', 'israel', 'lebanon'],
    'eastern_european_russian': ['easterneuropean_russian', 'eastern-europe', 'russia'],
    'italian': ['italian', 'italy'],
    'irish': ['irish', 'ireland'],
    'mexican': ['mexican', 'mexico'],
    'chinese': ['chinese', 'china'],
    'german': ['german', 'germany'],
    'mediterranean': ['mediterranean'],
    'japanese': ['japanese', 'japan'],
    'moroccan': ['moroccan'],
    'southern_soul_food': ['southern_soulfood'],
    'english_scottish': ['english_scottish', 'uk-and-ireland', 'england', 'scotland'],
    'asian': ['asian'],
    'southwestern': ['southwestern'],
    'east_asian': ['east_asian'],
    'western': ['western'],
    'korean': ['korean', 'korea'],
    'canadian': ['canada'],
    'caribbean': ['caribbean'],
    'bangladeshi': ['bangladesh'],
    # duplicates of aliases already listed under 'middle_eastern' (these win in a reverse lookup)
    'israeli': ['israel'],
    'iranian': ['iran'],
    'south_african': ['south-african'],
    'belgian': ['belgium'],
    'spanish': ['spain'],
    'dutch': ['netherlands'],
    'filipino': ['philippines'],
    'indonesian': ['indonesia'],
    'east_african': ['east-african'],
    'swiss': ['switzerland'],
    'west_african': ['west-african'],
    'north_african': ['north-african'],
    'pakistani': ['pakistan'],
    'portuguese': ['portugal'],
    'malaysian': ['malaysia'],
    'austrian': ['austria']
}

# Invert cuisine_mapping so that every alias points back at its canonical cuisine
# (when an alias is listed twice, the later cuisine wins)
cuisine_lookup = dict(
    (alias, cuisine)
    for cuisine, aliases in cuisine_mapping.items()
    for alias in aliases
)

# Lower-cased, underscore-separated country -> region dictionary from the mapping table
normalized_countries = mapping['country'].str.lower().str.replace(' ', '_')
normalized_regions = mapping['region'].str.lower().str.replace(' ', '_')
country_to_region = dict(zip(normalized_countries, normalized_regions))

# Run-together region names and their underscore-separated form.  Order matters:
# 'southeastasian' must be handled before 'southasian' and 'eastasian'.
_REGION_NAME_REPLACEMENTS = (
    ('southeastasian', 'south_east_asian'),
    ('southasian', 'south_asian'),
    ('southerneuropean', 'southern_european'),
    ('middleeastern', 'middle_eastern'),
    ('westerneuropean', 'western_european'),
    ('latinamerican', 'latin_american'),
    ('northamerican', 'north_american'),
    ('northerneuropean', 'northern_european'),
    ('easterneuropean', 'eastern_european'),
    ('eastasian', 'east_asian'),
)


# Explicitly format region names
def format_region_name(region):
    """Insert underscores into a run-together region name.

    Parameters
    ----------
    region : str or missing value
        Lower-case region name such as ``'southeastasian'``.

    Returns
    -------
    str
        The underscore-separated name, or ``'unknown'`` when ``region`` is
        missing (``None`` / NaN).  Names without a known pattern are returned
        unchanged.
    """
    if pd.isna(region):
        return 'unknown'
    formatted_region = region
    for compact, separated in _REGION_NAME_REPLACEMENTS:
        formatted_region = formatted_region.replace(compact, separated)
    return formatted_region


# Function to standardize country names and map to region
def standardize_and_map_region(country, alias_lookup=None, region_lookup=None):
    """Standardise a country/cuisine label and find the region it belongs to.

    Parameters
    ----------
    country : str or missing value
        Raw label from the recipe table.
    alias_lookup : dict, optional
        alias -> canonical cuisine name.  Defaults to the notebook-level
        ``cuisine_lookup``.
    region_lookup : dict, optional
        normalised country name -> region.  Defaults to the notebook-level
        ``country_to_region``.

    Returns
    -------
    tuple of (str, str)
        ``(standardized_country, region)``; both are ``'unknown'`` for a missing
        label, and the region is ``'unknown'`` when no lookup key matches.
    """
    if pd.isna(country):
        return 'unknown', 'unknown'
    if alias_lookup is None:
        alias_lookup = cuisine_lookup
    if region_lookup is None:
        region_lookup = country_to_region

    country_lower = str(country).lower().replace(' ', '_')
    standardized_country = alias_lookup.get(country_lower, country_lower)

    # The region table is keyed by the names found in the mapping file, which may
    # be the raw alias (e.g. 'canada') rather than the canonical cuisine name
    # ('canadian').  Try the standardized name first, exactly as before, and only
    # fall back to the raw name when that lookup finds nothing.
    region = region_lookup.get(standardized_country)
    if pd.isna(region):
        region = region_lookup.get(country_lower, 'unknown')
    return standardized_country, format_region_name(region)


# Standardise every country label once and derive its region.  The results are
# collected in plain lists rather than returning a pd.Series per row from apply(),
# which creates one Series object per recipe and is very slow on a large table.
# NOTE: this cell overwrites 'country' with the standardized name, so running it a
# second time feeds already-standardized names back into the lookup.
standardized_pairs = [standardize_and_map_region(country) for country in ingredients_df['country']]
ingredients_df['country'] = [country for country, _ in standardized_pairs]
ingredients_df['region'] = [region for _, region in standardized_pairs]

# Move the 'region' column to the beginning
columns = ['region'] + [col for col in ingredients_df if col != 'region']
ingredients_df = ingredients_df[columns]


In [27]:
# Encode the 'Yes'/'No' ingredient flags as 1/0.
# - Reassigning instead of inplace=True avoids mutating the frame in place.
# - infer_objects() converts the former object columns to integer dtype explicitly,
#   so the result no longer depends on the implicit downcasting inside `replace`
#   that pandas has deprecated.
ingredients_df = ingredients_df.replace({'Yes': 1, 'No': 0}).infer_objects()

  ingredients_df.replace({'Yes': 1, 'No': 0}, inplace=True)


In [28]:
ingr_small = ingredients_df.columns.copy()

In [29]:
ingr_small = pd.DataFrame(data=ingr_small)

In [30]:
# Assign column names if necessary
ingr_small.columns = ['ingredient_name']

# Drop rows that contain 'region' or 'country'
ingr_small = ingr_small[~ingr_small['ingredient_name'].isin(['region', 'country'])]

# Add ingredient_id column
ingr_small.insert(0, 'ingredient_id', range(1, 1 + len(ingr_small)))

# Print the cleaned DataFrame
print(ingr_small)

     ingredient_id ingredient_name
2                1          almond
3                2        angelica
4                3           anise
5                4      anise_seed
6                5           apple
..             ...             ...
380            379            wood
381            380             yam
382            381           yeast
383            382          yogurt
384            383        zucchini

[383 rows x 2 columns]


In [31]:
ingr_small['ingredient_name'] = ingr_small['ingredient_name'].apply(replace_spaces)

In [32]:
# Function to replace ingredient IDs
def replace_ingredient_id(ingr_small, ingr_df):
    """Replace the ids in ``ingr_small`` with the ids ``ingr_df`` uses for the same names.

    Parameters
    ----------
    ingr_small : pandas.DataFrame
        Needs the columns ``ingredient_id`` and ``ingredient_name``.
    ingr_df : pandas.DataFrame
        Reference table with ``ingredient_id`` and ``ingredient_name``.  When a
        name occurs more than once, the last row wins.

    Returns
    -------
    pandas.DataFrame
        A copy of ``ingr_small`` (same index and columns) in which every row whose
        name is found in ``ingr_df`` carries the reference id.  The input frame is
        not modified.

    Notes
    -----
    Rows whose name is NOT found keep their existing ``ingredient_id``.
    NOTE(review): if those existing ids are sequential placeholders they can
    coincide with real ids of other ingredients in the reference table — confirm
    whether unmatched rows should be dropped or flagged instead.
    """
    # name -> reference id dictionary for constant-time lookups
    ingr_dict = dict(zip(ingr_df['ingredient_name'], ingr_df['ingredient_id']))

    # Build the new id column directly instead of mutating each row inside a
    # row-wise DataFrame.apply; the other columns keep their values and dtypes.
    updated = ingr_small.copy()
    updated['ingredient_id'] = [
        ingr_dict.get(name, current_id)
        for name, current_id in zip(updated['ingredient_name'], updated['ingredient_id'])
    ]
    return updated

# Apply the function
ingr_small = replace_ingredient_id(ingr_small, ingr_df)

# Verify the changes
print(ingr_small.head())

   ingredient_id ingredient_name
2             18          almond
3            725        angelica
4            707           anise
5            395      anise_seed
6           1197           apple


In [33]:
ingr_small_def = ingr_small.copy()

In [34]:
comp_df

Unnamed: 0,compound_id,compound_name,CAS_number
0,0,jasmone,488-10-8
1,1,5-methylhexanoic_acid,628-46-6
2,2,l-glutamine,56-85-9
3,3,1-methyl-3-methoxy-4-isopropylbenzene,1076-56-8
4,4,methyl-3-phenylpropionate,103-25-3
...,...,...,...
1102,1102,2-heptanol,543-49-7
1103,1103,1-octen-3-yl_butyrate,16491-54-6
1104,1104,guaiacol,90-05-1
1105,1105,(+/?)-methyl_5-acetoxyhexanoate,35234-22-1


In [35]:
ingr_comp_merge = ingr_comp_df.copy()

In [36]:
ingr_comp_merge.compound_id = ingr_comp_merge.compound_id.astype(str)

In [37]:
# Use string ids on both sides so ingredient ids compare equal across tables and
# compound ids can be used as column labels.
ingr_small['ingredient_id'] = ingr_small['ingredient_id'].astype(str)
ingr_comp_df['ingredient_id'] = ingr_comp_df['ingredient_id'].astype(str)
ingr_comp_df['compound_id'] = ingr_comp_df['compound_id'].astype(str)

# Convert column names in ingr_small to strings
ingr_small.columns = ingr_small.columns.astype(str)

# Get the unique compound_ids in ingr_comp_df (in order of first appearance)
compound_id_list = ingr_comp_df['compound_id'].unique()

# Build the ingredient x compound 0/1 table in one vectorised step.  This replaces
# a Python loop over every (ingredient, compound) pair that re-scanned ingr_small
# with a boolean mask on each iteration.
compound_presence = (
    pd.crosstab(ingr_comp_df['ingredient_id'], ingr_comp_df['compound_id'])
    .gt(0)
    .astype('int64')
)

# Align the table with ingr_small: one row per ingr_small row (looked up by its
# ingredient_id, all zeros when the id has no compounds), columns in
# compound_id_list order, and ingr_small's own index so concat lines up row by row.
new_columns_df = (
    compound_presence
    .reindex(index=ingr_small['ingredient_id'].to_numpy(), columns=compound_id_list, fill_value=0)
    .set_axis(ingr_small.index, axis=0)
    .rename_axis(columns=None)
)

# Concatenate the new columns to ingr_small
ingr_small = pd.concat([ingr_small, new_columns_df], axis=1)

# Verify the result
print(ingr_small.head())

  ingredient_id ingredient_name  906  861  673  278  171  387  165  1099  ...  \
2            18          almond    0    0    0    0    0    0    0     0  ...   
3           725        angelica    0    0    1    0    0    0    0     0  ...   
4           707           anise    0    0    1    0    0    0    1     0  ...   
5           395      anise_seed    0    0    0    0    0    0    0     0  ...   
6          1197           apple    0    0    0    1    0    0    0     0  ...   

   722  237  169  23  310  653  966  752  497  754  
2    0    0    0   0    0    0    0    0    0    0  
3    0    0    0   0    0    0    0    0    0    0  
4    0    0    0   0    0    0    0    0    0    0  
5    0    0    0   0    0    0    0    0    0    0  
6    0    0    0   0    0    0    0    0    0    0  

[5 rows x 1109 columns]


In [None]:
ingr_small.shape

In [40]:
compound_id_cols = [col for col in ingr_small.columns if col not in ['ingredient_id', 'ingredient_name']]

In [41]:
ingr_small['count_ones'] = ingr_small[compound_id_cols].sum(axis=1)

In [42]:
ingr_comp_df

Unnamed: 0,ingredient_id,compound_id
0,1392,906
1,1259,861
2,1079,673
3,22,906
4,103,906
...,...,...
36776,876,657
36777,637,461
36778,689,650
36779,689,297


In [43]:
# Group by 'ingredient_id' and aggregate 'compound_id' into a list
comp_per_ingr = ingr_comp_df.groupby('ingredient_id')['compound_id'].apply(list).reset_index()
ingr_per_comp = ingr_comp_df.groupby('compound_id')['ingredient_id'].apply(list).reset_index()

In [44]:
# Sort by compound_id, descending (reassigned rather than sorted in place).
# NOTE(review): compound_id appears to hold strings at this point (it is cast with
# astype(str) in an earlier cell), so this order is lexicographic — '99' sorts ahead
# of '1000' — not numeric.  Confirm that is what is wanted.
ingr_per_comp = ingr_per_comp.sort_values(by='compound_id', ascending=False)

In [45]:
print(ingr_comp_df.compound_id.unique())

['906' '861' '673' ... '752' '497' '754']


In [46]:
ingr_per_comp.shape

(1107, 2)

In [47]:
comp_per_ingr

Unnamed: 0,ingredient_id,compound_id
0,0,[995]
1,1,[921]
2,10,[715]
3,100,[1011]
4,1000,"[764, 275, 630, 1046, 285, 84, 482, 772, 686, ..."
...,...,...
1520,995,[996]
1521,996,[894]
1522,997,[828]
1523,998,"[25, 317, 86]"


In [48]:
ingr_small

Unnamed: 0,ingredient_id,ingredient_name,906,861,673,278,171,387,165,1099,...,237,169,23,310,653,966,752,497,754,count_ones
2,18,almond,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
3,725,angelica,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,707,anise,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,17
5,395,anise_seed,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
6,1197,apple,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,1396,wood,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
381,705,yam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
382,407,yeast,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,3
383,230,yogurt,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,37


In [49]:
ingr_small['ingredient_name'] = ingr_small['ingredient_name'].astype(str)

In [50]:
ingredient_cols = ingredients_df.columns.copy()

In [51]:
ingredient_cols

Index(['region', 'country', 'almond', 'angelica', 'anise', 'anise_seed',
       'apple', 'apple_brandy', 'apricot', 'armagnac',
       ...
       'whiskey', 'white_bread', 'white_wine', 'whole_grain_wheat_flour',
       'wine', 'wood', 'yam', 'yeast', 'yogurt', 'zucchini'],
      dtype='object', length=385)

In [52]:
ingr_small

Unnamed: 0,ingredient_id,ingredient_name,906,861,673,278,171,387,165,1099,...,237,169,23,310,653,966,752,497,754,count_ones
2,18,almond,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
3,725,angelica,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,707,anise,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,17
5,395,anise_seed,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
6,1197,apple,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,1396,wood,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
381,705,yam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
382,407,yeast,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,3
383,230,yogurt,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,37


In [53]:
ingr_small["ingredient_name"]

2          almond
3        angelica
4           anise
5      anise_seed
6           apple
          ...    
380          wood
381           yam
382         yeast
383        yogurt
384      zucchini
Name: ingredient_name, Length: 383, dtype: object

In [54]:
ingr_small_mapping = ingr_small['ingredient_name']

In [55]:
ingr_small_mapping

2          almond
3        angelica
4           anise
5      anise_seed
6           apple
          ...    
380          wood
381           yam
382         yeast
383        yogurt
384      zucchini
Name: ingredient_name, Length: 383, dtype: object

In [56]:
ingr_small_mapping = pd.concat([ingr_small_mapping, ingr_small['ingredient_id']], axis=1)

In [57]:
ingredients_df.columns = ingredients_df.columns.astype(str)

In [58]:
ingr_small_mapping

Unnamed: 0,ingredient_name,ingredient_id
2,almond,18
3,angelica,725
4,anise,707
5,anise_seed,395
6,apple,1197
...,...,...
380,wood,1396
381,yam,705
382,yeast,407
383,yogurt,230


In [59]:
ingredient_name_to_id = dict(zip(ingr_small_mapping['ingredient_name'], ingr_small_mapping['ingredient_id']))

In [60]:
ingredient_name_to_id

{'almond': '18',
 'angelica': '725',
 'anise': '707',
 'anise_seed': '395',
 'apple': '1197',
 'apple_brandy': '761',
 'apricot': '1120',
 'armagnac': '190',
 'artemisia': '1504',
 'artichoke': '1251',
 'asparagus': '1174',
 'avocado': '94',
 'bacon': '19',
 'baked_potato': '261',
 'balm': '1416',
 'banana': '918',
 'barley': '1281',
 'bartlett_pear': '330',
 'basil': '256',
 'bay': '215',
 'bean': '1348',
 'beech': '357',
 'beef': '248',
 'beef_broth': '512',
 'beef_liver': '1115',
 'beer': '1495',
 'beet': '255',
 'bell_pepper': '1292',
 'bergamot': '778',
 'berry': '609',
 'bitter_orange': '92',
 'black_bean': '1478',
 'black_currant': '1026',
 'black_mustard_seed_oil': '15',
 'black_pepper': '7',
 'black_raspberry': '1375',
 'black_sesame_seed': '118',
 'black_tea': '908',
 'blackberry': '676',
 'blackberry_brandy': '924',
 'blue_cheese': '994',
 'blueberry': '46',
 'bone_oil': '1322',
 'bourbon_whiskey': '1351',
 'brandy': '737',
 'brassica': '598',
 'bread': '105',
 'broccoli': '

In [61]:
new_columns = []

# Iterate over the current columns and replace ingredient names with IDs, formatted as ingr_{ingredient_id}
for col in ingredients_df.columns:
    # Keep 'region'/'country' as is.  Columns that already carry the 'ingr_' prefix
    # are kept too, so re-running this cell does not produce 'ingr_ingr_<id>'.
    if col in ['region', 'country'] or str(col).startswith('ingr_'):
        new_columns.append(col)
    else:
        # Replace ingredient names with their corresponding IDs and format as ingr_{ingredient_id};
        # names without a known id keep their name after the prefix.
        ingredient_id = ingredient_name_to_id.get(col, col)
        new_columns.append(f'ingr_{ingredient_id}')


In [62]:
ingredients_df.columns = new_columns

In [63]:
ingredients_df.columns = ingredients_df.columns.astype(str)

In [64]:
ingredients_df

Unnamed: 0,region,country,ingr_18,ingr_725,ingr_707,ingr_395,ingr_1197,ingr_761,ingr_1120,ingr_190,...,ingr_361,ingr_703,ingr_1428,ingr_1278,ingr_84,ingr_1396,ingr_705,ingr_407,ingr_230,ingr_1080
0,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57686,east_asian,japanese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57687,east_asian,japanese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57688,east_asian,japanese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57689,east_asian,japanese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
comp_df['compound_name'] = comp_df['compound_name'].astype(str)

In [95]:
comp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1107 entries, 0 to 1106
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   compound_id     1107 non-null   int64 
 1   compound_name   1107 non-null   object
 2   CAS_number      1107 non-null   object
 3   smiles          502 non-null    object
 4   parsed_formula  1107 non-null   object
dtypes: int64(1), object(4)
memory usage: 43.4+ KB


In [99]:
sparse_matrix

<1525x1107 sparse matrix of type '<class 'numpy.float64'>'
	with 36781 stored elements in Compressed Sparse Row format>

In [128]:
# Convert to dense vectors
# NOTE(review): `sparse_matrix` is not created in any cell shown in this part of the
# notebook (its displayed repr reports a 1525x1107 float64 CSR matrix).  Confirm
# where it is built so the notebook survives Restart & Run All.
# NOTE(review): for SciPy sparse matrices `.todense()` returns a numpy.matrix rather
# than a plain ndarray — presumably fine here because it is only wrapped in a
# DataFrame below; verify any other use of `dense_vectors`.
dense_vectors = sparse_matrix.todense()

# Create a DataFrame for easy handling (default integer row and column labels)
dense_df = pd.DataFrame(dense_vectors)

In [137]:
# Function to get vectors from the dense matrix
def get_ingredient_vectors(dense_matrix):
    """Return the contents of ``dense_matrix`` as a NumPy array.

    ``dense_matrix`` is expected to expose a ``.values`` attribute (e.g. a pandas
    DataFrame); that attribute is returned as is.
    """
    return dense_matrix.values

# Get ingredient vectors
ingredient_vectors = get_ingredient_vectors(dense_df)

ValueError: Expected a 1D array, got an array with shape (1525, 1107)

In [146]:
dense_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1097,1098,1099,1100,1101,1102,1103,1104,1105,1106
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1522,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [147]:
dense_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1515,1516,1517,1518,1519,1520,1521,1522,1523,1524
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1102,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [148]:
mother_matrix = dense_df.dot(dense_df.T)

In [150]:
mother_matrix = mother_matrix.astype(int)

In [151]:
mother_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1515,1516,1517,1518,1519,1520,1521,1522,1523,1524
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,41,1,0,29,1,0,...,1,0,0,0,28,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
1521,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1522,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1523,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,2,0,0,0,3,0


In [142]:
dense_df.dot(query_vector)

0       1.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
1520    0.0
1521    0.0
1522    0.0
1523    0.0
1524    0.0
Length: 1525, dtype: float64

In [107]:
ingredients_df.to_csv('data/ingredients_df.csv', index=False)

In [113]:
# Function to get the ingredient ids of the columns where the row has 1s
def get_ingredients(row):
    """List the ingredient ids flagged with 1 in one recipe row.

    Parameters
    ----------
    row : pandas.Series
        One recipe; its labels are the column names of the recipe table.

    Returns
    -------
    list of str
        For every ``'ingr_<id>'`` label whose value equals 1, the part after the
        ``'ingr_'`` prefix, in label order.  Labels without the prefix (such as
        'region' or 'country') are ignored.

    Notes
    -----
    The labels are read from ``row`` itself, not from the notebook-level recipe
    table, so the function works on any row.  Stripping the prefix instead of
    taking the first run of digits means a column that kept its ingredient name
    (no id was found for it) yields that name rather than raising IndexError or
    returning stray digits from inside the name.
    """
    prefix = 'ingr_'
    return [
        label[len(prefix):]
        for label, value in row.items()
        if isinstance(label, str) and label.startswith(prefix) and value == 1
    ]

# Apply the function to each row
ingredients_df['ingredient_list'] = ingredients_df.apply(get_ingredients, axis=1)

In [114]:
print(ingredients_df['ingredient_list'])

0        [256, 1216, 663, 658, 1044, 878, 136, 141, 243...
1                            [7, 663, 878, 136, 1447, 165]
2                                    [136, 243, 1183, 406]
3        [256, 1348, 512, 663, 658, 878, 164, 1199, 243...
4        [663, 1267, 1044, 878, 136, 1132, 243, 166, 11...
                               ...                        
57686    [698, 724, 1117, 1044, 848, 396, 1140, 1447, 2...
57687                [1338, 848, 221, 417, 915, 1336, 127]
57688    [1281, 427, 878, 136, 164, 1471, 205, 455, 130...
57689                     [19, 1237, 848, 164, 1012, 1179]
57690    [1281, 848, 396, 1447, 1471, 205, 1412, 1183, ...
Name: ingredient_list, Length: 57691, dtype: object


In [126]:
ingredients_df.head()

Unnamed: 0,region,country,ingr_18,ingr_725,ingr_707,ingr_395,ingr_1197,ingr_761,ingr_1120,ingr_190,...,ingr_703,ingr_1428,ingr_1278,ingr_84,ingr_1396,ingr_705,ingr_407,ingr_230,ingr_1080,ingredient_list
0,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[256, 1216, 663, 658, 1044, 878, 136, 141, 243..."
1,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[7, 663, 878, 136, 1447, 165]"
2,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[136, 243, 1183, 406]"
3,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[256, 1348, 512, 663, 658, 878, 164, 1199, 243..."
4,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[663, 1267, 1044, 878, 136, 1132, 243, 166, 11..."


In [121]:
ingredients_slised =ingredients_df[['country', 'region', 'ingredient_list']].copy()

In [116]:
ingredients_slised.head()

Unnamed: 0,country,region,ingredient_list
0,vietnamese,south_east_asian,"[256, 1216, 663, 658, 1044, 878, 136, 141, 243..."
1,vietnamese,south_east_asian,"[7, 663, 878, 136, 1447, 165]"
2,vietnamese,south_east_asian,"[136, 243, 1183, 406]"
3,vietnamese,south_east_asian,"[256, 1348, 512, 663, 658, 878, 164, 1199, 243..."
4,vietnamese,south_east_asian,"[663, 1267, 1044, 878, 136, 1132, 243, 166, 11..."


In [117]:
comp_per_ingr

Unnamed: 0,ingredient_id,compound_id
0,0,[995]
1,1,[921]
2,10,[715]
3,100,[1011]
4,1000,"[764, 275, 630, 1046, 285, 84, 482, 772, 686, ..."
...,...,...
1520,995,[996]
1521,996,[894]
1522,997,[828]
1523,998,"[25, 317, 86]"


In [122]:
def _compounds_by_ingredient(comp_table):
    """Build an ingredient_id -> compound list dictionary from ``comp_table``.

    ``comp_table`` needs the columns ``ingredient_id`` and ``compound_id``; when an
    id occurs more than once, the first row is kept.
    """
    lookup = {}
    for ingredient_id, compounds in zip(comp_table['ingredient_id'], comp_table['compound_id']):
        lookup.setdefault(ingredient_id, compounds)
    return lookup


#Function to get compound list for each ingredient in the ingredient list
def add_compound_to_recipe(row, compound_lookup=None):
    """Collect the compounds of every ingredient in one recipe.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row['ingredient_list']``, an iterable of ingredient ids.
    compound_lookup : dict, optional
        ingredient_id -> list of compound ids.  When omitted it is built from the
        notebook-level ``comp_per_ingr`` table on every call; pass a prebuilt
        dictionary when applying the function to many rows.

    Returns
    -------
    list
        The compound ids of all ingredients, concatenated in ingredient order
        (duplicates are kept).  Ingredients without an entry contribute nothing.
    """
    if compound_lookup is None:
        compound_lookup = _compounds_by_ingredient(comp_per_ingr)
    compound_list = []
    for ingredient in row['ingredient_list']:
        compound_list.extend(compound_lookup.get(ingredient, ()))
    return compound_list

# Build the lookup once, then apply the function to each row.  This replaces a
# boolean-mask scan of the whole comp_per_ingr table for every ingredient of
# every recipe with a dictionary lookup.
compound_lookup = _compounds_by_ingredient(comp_per_ingr)
ingredients_slised['compound_list'] = ingredients_slised.apply(
    add_compound_to_recipe, axis=1, compound_lookup=compound_lookup
)

In [123]:
ingredients_slised

Unnamed: 0,country,region,ingredient_list,compound_list
0,vietnamese,south_east_asian,"[256, 1216, 663, 658, 1044, 878, 136, 141, 243...","[347, 847, 700, 75, 278, 767, 283, 295, 442, 7..."
1,vietnamese,south_east_asian,"[7, 663, 878, 136, 1447, 165]","[273, 971, 348, 628, 767, 704, 79, 965, 361, 4..."
2,vietnamese,south_east_asian,"[136, 243, 1183, 406]","[273, 827, 388, 175, 798, 392, 599, 1035, 429,..."
3,vietnamese,south_east_asian,"[256, 1348, 512, 663, 658, 878, 164, 1199, 243...","[347, 847, 700, 75, 278, 767, 283, 295, 442, 7..."
4,vietnamese,south_east_asian,"[663, 1267, 1044, 878, 136, 1132, 243, 166, 11...","[273, 971, 348, 628, 767, 704, 620, 79, 965, 3..."
...,...,...,...,...
57686,japanese,east_asian,"[698, 724, 1117, 1044, 848, 396, 1140, 1447, 2...","[558, 423, 766, 282, 426, 1035, 633, 976, 96, ..."
57687,japanese,east_asian,"[1338, 848, 221, 417, 915, 1336, 127]","[272, 965, 827, 275, 424, 400, 704, 427, 284, ..."
57688,japanese,east_asian,"[1281, 427, 878, 136, 164, 1471, 205, 455, 130...","[772, 526, 157, 72, 423, 195, 1025, 282, 990, ..."
57689,japanese,east_asian,"[19, 1237, 848, 164, 1012, 1179]","[272, 275, 284, 285, 568, 292, 837, 841, 1, 19..."


In [145]:
ingredients_df.head()

Unnamed: 0,region,country,ingr_18,ingr_725,ingr_707,ingr_395,ingr_1197,ingr_761,ingr_1120,ingr_190,...,ingr_703,ingr_1428,ingr_1278,ingr_84,ingr_1396,ingr_705,ingr_407,ingr_230,ingr_1080,ingredient_list
0,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[256, 1216, 663, 658, 1044, 878, 136, 141, 243, 166, 998, 205, 165, 249, 694, 1012]"
1,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[7, 663, 878, 136, 1447, 165]"
2,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[136, 243, 1183, 406]"
3,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[256, 1348, 512, 663, 658, 878, 164, 1199, 243, 166, 1088, 205, 258, 1305, 1113]"
4,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[663, 1267, 1044, 878, 136, 1132, 243, 166, 1160, 205, 915, 694, 1012]"
