In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import ast
import json
import re
from collections import defaultdict
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import silhouette_score

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
import requests
import time

from bs4 import BeautifulSoup
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder

In [97]:
# Load the recipe table. read_csv already returns a DataFrame, so it is used
# directly instead of being wrapped in pd.DataFrame a second time.
ingredients_df = pd.read_csv('data/recipes.csv')

In [98]:
# Load the three ingredient/compound tables (tab-separated files).
# For each file: read it, wrap the result in a DataFrame, then fix the column names.

# Compound table: whatever header read_csv picked up is replaced positionally
# by the three names in comp_columns.
comp_path = 'data/flavor_network_data/ingr_comp/comp_info.tsv'
comp_tsv = pd.read_csv(comp_path, delimiter='\t')

comp_df = pd.DataFrame(data = comp_tsv)
comp_columns = ['compound_id', 'compound_name', 'CAS_number']
comp_df.columns = comp_columns

# Ingredient table: same pattern, three positional column names.
ingr_path = 'data/flavor_network_data/ingr_comp/ingr_info.tsv'
ingr_tsv = pd.read_csv(ingr_path, delimiter='\t')

ingr_df = pd.DataFrame(data = ingr_tsv)
ingr_columns = ['ingredient_id', 'ingredient_name', 'ingredient_category']
ingr_df.columns = ingr_columns

# Ingredient <-> compound link table. Here the columns are renamed by label
# rather than by position, so the rename only takes effect if the file's
# header really contains '# ingredient id' and 'compound id'.
# (The variable name `ingr_comp_pathh` is spelled with a double "h".)
ingr_comp_pathh = 'data/flavor_network_data/ingr_comp/ingr_comp.tsv'
ingr_comp_tsv = pd.read_csv(ingr_comp_pathh, delimiter='\t')

ingr_comp_df = pd.DataFrame(data = ingr_comp_tsv)
ingr_comp_df.rename(columns={
    '# ingredient id': 'ingredient_id',
    'compound id': 'compound_id'
}, inplace=True)

In [99]:
# Load the flavor-network backbone (tab-separated files): one table of edges
# and one table of nodes. In both cases the header picked up by read_csv is
# replaced positionally by the column names listed below.

# Edges: a pair of ingredients plus their number of shared compounds.
flav_edges_path = 'data/flavor_network_data/flavor_network_backbone/flavor_network_backbone_edges.tsv'
flav_edges_tsv = pd.read_csv(flav_edges_path, delimiter='\t')

flav_edges_df = pd.DataFrame(data = flav_edges_tsv)
flav_edges_columns = ['ingredient_1', 'ingredient_2', 'number_of_shared_compounds']
flav_edges_df.columns = flav_edges_columns

# Nodes: ingredient name, x/y coordinates, prevalence and an r/g/b triple.
flav_nodes_path = 'data/flavor_network_data/flavor_network_backbone/flavor_network_backbone_nodes.tsv'
flav_nodes_tsv = pd.read_csv(flav_nodes_path, delimiter='\t')

flav_nodes_df = pd.DataFrame(data = flav_nodes_tsv)
flav_nodes_columns = ['ingredient_name', 'x_coordinate', 'y_coordinate', 'prevalence', 'r', 'g', 'b']
flav_nodes_df.columns = flav_nodes_columns

In [87]:
# Swap spaces for underscores in string values; anything else passes through
def replace_spaces(value):
    """Return ``value`` with every space replaced by an underscore.

    Non-string inputs (numbers, None, NaN, ...) are returned unchanged.
    """
    return value.replace(' ', '_') if isinstance(value, str) else value

In [100]:
# Replace spaces with underscores in the name/category columns of the
# ingredient table and of the flavor-network edge and node tables.
for _frame, _column in (
    (ingr_df, 'ingredient_name'),
    (ingr_df, 'ingredient_category'),
    (flav_edges_df, 'ingredient_1'),
    (flav_edges_df, 'ingredient_2'),
    (flav_nodes_df, 'ingredient_name'),
):
    _frame[_column] = _frame[_column].apply(replace_spaces)

In [101]:
# Country / cuisine label -> coarse world region.
# Keys mix capitalised, lower-case, hyphenated and underscored spellings.
data = {
    'Canada': 'NorthAmerican',
    'Turkey': 'MiddleEastern',
    'east_asian': 'EastAsian',
    'Caribbean': 'LatinAmerican',
    'Bangladesh': 'SouthAsian',
    'chinese': 'EastAsian',
    'mexico': 'LatinAmerican',
    'Lebanon': 'MiddleEastern',
    'japanese': 'EastAsian',
    'North-African': 'African',
    'MiddleEastern': 'MiddleEastern',
    'Indian': 'SouthAsian',
    'asian': 'EastAsian',
    'Italy': 'SouthernEuropean',
    'EasternEuropean_Russian': 'EasternEuropean',
    'Israel': 'MiddleEastern',
    'Korea': 'EastAsian',
    'Iran': 'MiddleEastern',
    'Eastern-Europe': 'EasternEuropean',
    'Jewish': 'MiddleEastern',
    'South-African': 'African',
    'Vietnamese': 'SoutheastAsian',
    'UK-and-Ireland': 'WesternEuropean',
    'French': 'WesternEuropean',
    'Mediterranean': 'SouthernEuropean',
    'Central_SouthAmerican': 'LatinAmerican',
    'Cajun_Creole': 'NorthAmerican',
    'Belgium': 'WesternEuropean',
    'China': 'EastAsian',
    'korean': 'EastAsian',
    'Germany': 'WesternEuropean',
    'South-America': 'LatinAmerican',
    'Spain': 'SouthernEuropean',
    'Netherlands': 'WesternEuropean',
    'Scandinavia': 'NorthernEuropean',
    'Philippines': 'SoutheastAsian',
    'Indonesia': 'SoutheastAsian',
    'East-African': 'African',
    'Scandinavian': 'NorthernEuropean',
    'Greek': 'SouthernEuropean',
    'American': 'NorthAmerican',
    'Vietnam': 'SoutheastAsian',
    'western': 'WesternEuropean',
    'African': 'African',
    'Switzerland': 'WesternEuropean',
    'West-African': 'African',
    'France': 'WesternEuropean',
    'Thai': 'SoutheastAsian',
    'Thailand': 'SoutheastAsian',
    'Italian': 'SouthernEuropean',
    'Pakistan': 'SouthAsian',
    'Irish': 'WesternEuropean',
    'Mexican': 'LatinAmerican',
    'Portugal': 'SouthernEuropean',
    'Chinese': 'EastAsian',
    'Mexico': 'LatinAmerican',
    'German': 'WesternEuropean',
    'Spanish_Portuguese': 'SouthernEuropean',
    'India': 'SouthAsian',
    'Japanese': 'EastAsian',
    'Moroccan': 'African',
    'Southern_SoulFood': 'NorthAmerican',
    'Malaysia': 'SoutheastAsian',
    'Austria': 'WesternEuropean',
    'English_Scottish': 'WesternEuropean',
    'Asian': 'EastAsian',
    'Southwestern': 'NorthAmerican',
    'Japan': 'EastAsian',
    'italian': 'SouthernEuropean',
    'canadian': 'NorthAmerican',
    'eastern_european_russian': 'EasternEuropean',
    'southern_soul_food': 'NorthAmerican',
    'middle_eastern': 'MiddleEastern',
    'central_south_american': 'LatinAmerican',
    'spanish': 'SouthernEuropean',
    'north_african': 'African',
    'portuguese': 'SouthernEuropean',
    'filipino': 'SoutheastAsian',
    'dutch': 'WesternEuropean',
    'iranian': 'MiddleEastern',
    'austrian': 'WesternEuropean',
    'swiss': 'WesternEuropean',
    'pakistani': 'SouthAsian',
    'malaysian': 'SoutheastAsian',
    'south_african': 'African',
    'west_african': 'African',
    'indonesian': 'SoutheastAsian',
    'belgian': 'WesternEuropean',
    'east_african': 'African',
    'israeli': 'MiddleEastern',
    'bangladeshi': 'SouthAsian'
}

# Two-column lookup table, one row per label, in dict order
mapping = pd.DataFrame({
    'country': list(data.keys()),
    'region': list(data.values()),
})

# Trim surrounding whitespace on both columns
mapping = mapping.assign(
    country=mapping['country'].str.strip(),
    region=mapping['region'].str.strip(),
)

# Distinct values on each side of the mapping
unique_countries = mapping['country'].unique()
unique_regions = mapping['region'].unique()

# One row per region, holding the list of labels assigned to it
regions_countries = (
    mapping.groupby('region')['country']
    .apply(list)
    .reset_index()
)

In [78]:
regions_countries.head(10)

Unnamed: 0,region,country
0,African,"[North-African, South-African, East-African, A..."
1,EastAsian,"[east_asian, chinese, japanese, asian, Korea, ..."
2,EasternEuropean,"[EasternEuropean_Russian, Eastern-Europe, east..."
3,LatinAmerican,"[Caribbean, mexico, Central_SouthAmerican, Sou..."
4,MiddleEastern,"[Turkey, Lebanon, MiddleEastern, Israel, Iran,..."
5,NorthAmerican,"[Canada, Cajun_Creole, American, Southern_Soul..."
6,NorthernEuropean,"[Scandinavia, Scandinavian]"
7,SouthAsian,"[Bangladesh, Indian, Pakistan, India, pakistan..."
8,SoutheastAsian,"[Vietnamese, Philippines, Indonesia, Vietnam, ..."
9,SouthernEuropean,"[Italy, Mediterranean, Spain, Greek, Italian, ..."


In [102]:
# Define the cuisine mapping with adjectival forms and variations.
# Each alias must appear under exactly one canonical cuisine, because the
# reversed lookup built below can hold only one target per alias.
cuisine_mapping = {
    'vietnamese': ['vietnamese', 'vietnam'],
    'indian': ['indian', 'india'],
    'spanish_portuguese': ['spanish_portuguese'],
    'jewish': ['jewish'],
    'french': ['french', 'france'],
    'central_south_american': ['central_southamerican'],
    'cajun_creole': ['cajun_creole'],
    'thai': ['thai', 'thailand'],
    'scandinavian': ['scandinavian', 'scandinavia'],
    'greek': ['greek'],
    'american': ['american'],
    'african': ['african'],
    # 'iran' and 'israel' are deliberately NOT listed here: they belong to
    # 'iranian' / 'israeli' below. When they were listed in both places the
    # reversed lookup silently kept the later entry, so the effective lookup
    # is unchanged by dropping them from this list.
    'middle_eastern': ['middleeastern', 'middle_eastern', 'turkey', 'lebanon'],
    'eastern_european_russian': ['easterneuropean_russian', 'eastern-europe', 'russia'],
    'italian': ['italian', 'italy'],
    'irish': ['irish', 'ireland'],
    'mexican': ['mexican', 'mexico'],
    'chinese': ['chinese', 'china'],
    'german': ['german', 'germany'],
    'mediterranean': ['mediterranean'],
    'japanese': ['japanese', 'japan'],
    'moroccan': ['moroccan'],
    'southern_soul_food': ['southern_soulfood'],
    'english_scottish': ['english_scottish', 'uk-and-ireland', 'england', 'scotland'],
    'asian': ['asian'],
    'southwestern': ['southwestern'],
    'east_asian': ['east_asian'],
    'western': ['western'],
    'korean': ['korean', 'korea'],
    'canadian': ['canada'],
    'caribbean': ['caribbean'],
    'bangladeshi': ['bangladesh'],
    'israeli': ['israel'],
    'iranian': ['iran'],
    'south_african': ['south-african'],
    'belgian': ['belgium'],
    'spanish': ['spain'],
    'dutch': ['netherlands'],
    'filipino': ['philippines'],
    'indonesian': ['indonesia'],
    'east_african': ['east-african'],
    'swiss': ['switzerland'],
    'west_african': ['west-african'],
    'north_african': ['north-african'],
    'pakistani': ['pakistan'],
    'portuguese': ['portugal'],
    'malaysian': ['malaysia'],
    'austrian': ['austria']
}

# Reverse the mapping for easier lookup: alias -> canonical cuisine
cuisine_lookup = {alias: cuisine for cuisine, aliases in cuisine_mapping.items() for alias in aliases}

# Fail loudly if an alias is listed under two cuisines; otherwise one of the
# two targets would be dropped without any warning.
if len(cuisine_lookup) != sum(len(aliases) for aliases in cuisine_mapping.values()):
    raise ValueError('cuisine_mapping lists the same alias under more than one cuisine')

# Create a dictionary from the mapping DataFrame.
# Both keys and values are lower-cased with spaces turned into underscores.
country_to_region = dict(zip(mapping['country'].str.lower().str.replace(' ', '_'), mapping['region'].str.lower().str.replace(' ', '_')))

# Put the underscores back into squashed, lower-case region names
def format_region_name(region):
    """Return ``region`` with known squashed region names expanded.

    For example ``'southeastasian'`` becomes ``'south_east_asian'``. Missing
    values (None / NaN) give ``'unknown'``; strings containing none of the
    known names are returned unchanged.
    """
    if pd.isna(region):
        return 'unknown'

    # Plain substring replacements, applied in exactly this order.
    substitutions = (
        ('southeastasian', 'south_east_asian'),
        ('southasian', 'south_asian'),
        ('southerneuropean', 'southern_european'),
        ('middleeastern', 'middle_eastern'),
        ('westerneuropean', 'western_european'),
        ('latinamerican', 'latin_american'),
        ('northamerican', 'north_american'),
        ('northerneuropean', 'northern_european'),
        ('easterneuropean', 'eastern_european'),
        ('eastasian', 'east_asian'),
    )
    formatted_region = region
    for squashed, expanded in substitutions:
        formatted_region = formatted_region.replace(squashed, expanded)
    return formatted_region

# Canonicalise a raw country/cuisine label and look up its region
def standardize_and_map_region(country):
    """Return ``(standardized_country, region)`` for a raw label.

    The label is lower-cased and its spaces become underscores. It is then
    translated through ``cuisine_lookup`` (kept unchanged when it has no entry)
    and its region is read from ``country_to_region`` (``'unknown'`` when
    absent) and passed through ``format_region_name``. Missing values give
    ``('unknown', 'unknown')``.
    """
    if pd.isna(country):
        return 'unknown', 'unknown'

    key = str(country).lower().replace(' ', '_')
    canonical = cuisine_lookup[key] if key in cuisine_lookup else key
    raw_region = country_to_region[canonical] if canonical in country_to_region else 'unknown'
    return canonical, format_region_name(raw_region)


# Standardise each recipe's country label and derive its region.
# The (country, region) tuples are collected in a list and turned into one
# DataFrame, instead of constructing a separate pd.Series for every row.
standardized_pairs = [standardize_and_map_region(raw) for raw in ingredients_df['country']]
ingredients_df[['country', 'region']] = pd.DataFrame(
    standardized_pairs,
    index=ingredients_df.index,  # keep row alignment with ingredients_df
    columns=['country', 'region'],
)

# Move the 'region' column to the beginning
columns = ['region'] + [col for col in ingredients_df if col != 'region']
ingredients_df = ingredients_df[columns]

In [80]:
ingredients_df['region'].unique()

array(['south_east_asian', 'south_asian', 'southern_european',
       'middle_eastern', 'western_european', 'latin_american',
       'north_american', 'northern_european', 'african',
       'eastern_european', 'east_asian'], dtype=object)

In [104]:
ingredients_df.to_csv('data/ingredients_df.csv', index=False)

In [11]:
# (Removed) This cell contained a verbatim copy of the cuisine-standardisation
# code wrapped in a triple-quoted string literal. It executed nothing and only
# echoed the string as cell output; the live version of that code is the
# earlier cell that defines cuisine_mapping and standardize_and_map_region.


In [103]:
ingredients_df.replace({'Yes': 1, 'No': 0}, inplace=True)

  ingredients_df.replace({'Yes': 1, 'No': 0}, inplace=True)


In [65]:
ingredients_df

Unnamed: 0,region,country,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,south_east_asian,vietnamese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
1,south_east_asian,vietnamese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
2,south_east_asian,vietnamese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
3,south_east_asian,vietnamese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
4,south_east_asian,vietnamese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57686,east_asian,japanese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
57687,east_asian,japanese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
57688,east_asian,japanese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
57689,east_asian,japanese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No


In [13]:
ingr_small = ingredients_df.columns.copy()

In [14]:
ingr_small = pd.DataFrame(data=ingr_small)

In [15]:
# ingr_small holds a single column at this point; give it a proper name
ingr_small.columns = ['ingredient_name']

# 'region' and 'country' are not ingredients, so filter those rows out
is_metadata = ingr_small['ingredient_name'].isin(['region', 'country'])
ingr_small = ingr_small[~is_metadata]

# Number the remaining rows 1..N in a new leading 'ingredient_id' column
ingr_small.insert(0, 'ingredient_id', range(1, len(ingr_small) + 1))

# Print the cleaned DataFrame
print(ingr_small)

     ingredient_id ingredient_name
2                1          almond
3                2        angelica
4                3           anise
5                4      anise_seed
6                5           apple
..             ...             ...
380            379            wood
381            380             yam
382            381           yeast
383            382          yogurt
384            383        zucchini

[383 rows x 2 columns]


In [16]:
ingr_small['ingredient_name'] = ingr_small['ingredient_name'].apply(replace_spaces)

In [66]:
# Function to replace ingredient IDs
def replace_ingredient_id(ingr_small, ingr_df):
    """Return a copy of ``ingr_small`` with ``ingredient_id`` taken from ``ingr_df``.

    Parameters
    ----------
    ingr_small : DataFrame with ``ingredient_id`` and ``ingredient_name`` columns.
    ingr_df : DataFrame with ``ingredient_id`` and ``ingredient_name`` columns,
        used as the reference name -> id table.

    Returns
    -------
    DataFrame
        Same index and columns as ``ingr_small``. Rows whose name occurs in
        ``ingr_df`` get that table's id; all other rows keep the id they had.
        The input frame is not modified.

    NOTE(review): rows without a match keep their previous id. If those ids are
    placeholders they may coincide with real ids from ``ingr_df`` -- confirm
    that callers account for this.
    """
    # Create a dictionary for quick lookup of ingredient_id by ingredient_name
    # (when a name occurs several times in ingr_df, the last occurrence wins)
    ingr_dict = pd.Series(ingr_df.ingredient_id.values, index=ingr_df.ingredient_name).to_dict()

    # Work on a copy and resolve every id with a plain dict lookup. This avoids
    # mutating the row object handed out by DataFrame.apply, which pandas
    # documents as unsupported, and avoids building a Series per row.
    updated = ingr_small.copy()
    updated['ingredient_id'] = [
        ingr_dict.get(name, current_id)
        for name, current_id in zip(updated['ingredient_name'], updated['ingredient_id'])
    ]
    return updated

# Apply the function
ingr_small = replace_ingredient_id(ingr_small, ingr_df)

# Verify the changes
print(ingr_small.head())

   ingredient_id ingredient_name  906  861  673  278  171  387  165  1099  \
2             18          almond    0    0    0    0    0    0    0     0   
3            725        angelica    0    0    1    0    0    0    0     0   
4            707           anise    0    0    1    0    0    0    1     0   
5            395      anise_seed    0    0    0    0    0    0    0     0   
6           1197           apple    0    0    0    1    0    0    0     0   

   ...  237  169  23  310  653  966  752  497  754  count_ones  
2  ...    0    0   0    0    0    0    0    0    0           9  
3  ...    0    0   0    0    0    0    0    0    0           4  
4  ...    0    0   0    0    0    0    0    0    0          17  
5  ...    0    0   0    0    0    0    0    0    0           2  
6  ...    0    0   0    0    0    0    0    0    0         195  

[5 rows x 1110 columns]


In [18]:
ingr_small_def = ingr_small.copy()

In [19]:
comp_df

Unnamed: 0,compound_id,compound_name,CAS_number
0,0,jasmone,488-10-8
1,1,5-methylhexanoic_acid,628-46-6
2,2,l-glutamine,56-85-9
3,3,1-methyl-3-methoxy-4-isopropylbenzene,1076-56-8
4,4,methyl-3-phenylpropionate,103-25-3
...,...,...,...
1102,1102,2-heptanol,543-49-7
1103,1103,1-octen-3-yl_butyrate,16491-54-6
1104,1104,guaiacol,90-05-1
1105,1105,(+/?)-methyl_5-acetoxyhexanoate,35234-22-1


In [20]:
ingr_comp_merge = ingr_comp_df.copy()

In [21]:
ingr_comp_merge.compound_id = ingr_comp_merge.compound_id.astype(str)

In [22]:
# Use string ids everywhere so that compound ids can double as column labels
ingr_small['ingredient_id'] = ingr_small['ingredient_id'].astype(str)
ingr_comp_df['ingredient_id'] = ingr_comp_df['ingredient_id'].astype(str)
ingr_comp_df['compound_id'] = ingr_comp_df['compound_id'].astype(str)

# Convert column names in ingr_small to strings
ingr_small.columns = ingr_small.columns.astype(str)

# Get the unique compound_ids in ingr_comp_df
compound_id_list = ingr_comp_df['compound_id'].unique()

# Create a DataFrame to hold the new columns with initial values set to 0
new_columns_df = pd.DataFrame(0, index=ingr_small.index, columns=compound_id_list)

# Fill a copy, so new_columns_df itself stays the all-zero template
compound_flags_df = new_columns_df.copy()

# Only links whose ingredient actually occurs in ingr_small can set a flag
known_links = ingr_comp_df[ingr_comp_df['ingredient_id'].isin(ingr_small['ingredient_id'])]

# Set the flags one ingredient at a time (one pass per distinct ingredient)
# instead of re-scanning ingr_small for every single ingredient-compound row.
for ingredient_id, compound_ids in known_links.groupby('ingredient_id')['compound_id']:
    # Positional mask: compound_flags_df has the same row order as ingr_small
    row_mask = (ingr_small['ingredient_id'] == ingredient_id).to_numpy()
    compound_flags_df.loc[row_mask, compound_ids.unique()] = 1

# Concatenate the new columns to ingr_small
ingr_small = pd.concat([ingr_small, compound_flags_df], axis=1)

# Verify the result
print(ingr_small.head())

  ingredient_id ingredient_name  906  861  673  278  171  387  165  1099  ...  \
2            18          almond    0    0    0    0    0    0    0     0  ...   
3           725        angelica    0    0    1    0    0    0    0     0  ...   
4           707           anise    0    0    1    0    0    0    1     0  ...   
5           395      anise_seed    0    0    0    0    0    0    0     0  ...   
6          1197           apple    0    0    0    1    0    0    0     0  ...   

   722  237  169  23  310  653  966  752  497  754  
2    0    0    0   0    0    0    0    0    0    0  
3    0    0    0   0    0    0    0    0    0    0  
4    0    0    0   0    0    0    0    0    0    0  
5    0    0    0   0    0    0    0    0    0    0  
6    0    0    0   0    0    0    0    0    0    0  

[5 rows x 1109 columns]


In [23]:
ingr_small.shape

(383, 1109)

In [24]:
compound_id_cols = [col for col in ingr_small.columns if col not in ['ingredient_id', 'ingredient_name']]

In [25]:
ingr_small['count_ones'] = ingr_small[compound_id_cols].sum(axis=1)

In [26]:
ingr_comp_df

Unnamed: 0,ingredient_id,compound_id
0,1392,906
1,1259,861
2,1079,673
3,22,906
4,103,906
...,...,...
36776,876,657
36777,637,461
36778,689,650
36779,689,297


In [27]:
# Group by 'ingredient_id' and aggregate 'compound_id' into a list
comp_per_ingr = ingr_comp_df.groupby('ingredient_id')['compound_id'].apply(list).reset_index()
ingr_per_comp = ingr_comp_df.groupby('compound_id')['ingredient_id'].apply(list).reset_index()

In [28]:
ingr_per_comp.sort_values(by='compound_id', ascending=False, inplace=True)

In [29]:
print(ingr_comp_df.compound_id.unique())

['906' '861' '673' ... '752' '497' '754']


In [30]:
ingr_per_comp.shape

(1107, 2)

In [31]:
comp_per_ingr

Unnamed: 0,ingredient_id,compound_id
0,0,[995]
1,1,[921]
2,10,[715]
3,100,[1011]
4,1000,"[764, 275, 630, 1046, 285, 84, 482, 772, 686, ..."
...,...,...
1520,995,[996]
1521,996,[894]
1522,997,[828]
1523,998,"[25, 317, 86]"


In [32]:
ingr_small

Unnamed: 0,ingredient_id,ingredient_name,906,861,673,278,171,387,165,1099,...,237,169,23,310,653,966,752,497,754,count_ones
2,18,almond,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
3,725,angelica,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,707,anise,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,17
5,395,anise_seed,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
6,1197,apple,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,1396,wood,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
381,705,yam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
382,407,yeast,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,3
383,230,yogurt,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,37


In [33]:
ingr_small['ingredient_name'] = ingr_small['ingredient_name'].astype(str)

In [34]:
ingredient_cols = ingredients_df.columns.copy()

In [35]:
ingredient_cols

Index(['region', 'country', 'almond', 'angelica', 'anise', 'anise_seed',
       'apple', 'apple_brandy', 'apricot', 'armagnac',
       ...
       'whiskey', 'white_bread', 'white_wine', 'whole_grain_wheat_flour',
       'wine', 'wood', 'yam', 'yeast', 'yogurt', 'zucchini'],
      dtype='object', length=385)

In [36]:
ingr_small

Unnamed: 0,ingredient_id,ingredient_name,906,861,673,278,171,387,165,1099,...,237,169,23,310,653,966,752,497,754,count_ones
2,18,almond,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
3,725,angelica,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,707,anise,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,17
5,395,anise_seed,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
6,1197,apple,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,1396,wood,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
381,705,yam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
382,407,yeast,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,3
383,230,yogurt,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,37


In [37]:
ingr_small["ingredient_name"]

2          almond
3        angelica
4           anise
5      anise_seed
6           apple
          ...    
380          wood
381           yam
382         yeast
383        yogurt
384      zucchini
Name: ingredient_name, Length: 383, dtype: object

In [38]:
ingr_small_mapping = ingr_small['ingredient_name']

In [39]:
ingr_small_mapping

2          almond
3        angelica
4           anise
5      anise_seed
6           apple
          ...    
380          wood
381           yam
382         yeast
383        yogurt
384      zucchini
Name: ingredient_name, Length: 383, dtype: object

In [40]:
ingr_small_mapping = pd.concat([ingr_small_mapping, ingr_small['ingredient_id']], axis=1)

In [41]:
ingredients_df.columns = ingredients_df.columns.astype(str)

In [42]:
ingr_small_mapping

Unnamed: 0,ingredient_name,ingredient_id
2,almond,18
3,angelica,725
4,anise,707
5,anise_seed,395
6,apple,1197
...,...,...
380,wood,1396
381,yam,705
382,yeast,407
383,yogurt,230


In [43]:
ingredient_name_to_id = dict(zip(ingr_small_mapping['ingredient_name'], ingr_small_mapping['ingredient_id']))

In [44]:
ingredient_name_to_id

{'almond': '18',
 'angelica': '725',
 'anise': '707',
 'anise_seed': '395',
 'apple': '1197',
 'apple_brandy': '761',
 'apricot': '1120',
 'armagnac': '190',
 'artemisia': '1504',
 'artichoke': '1251',
 'asparagus': '1174',
 'avocado': '94',
 'bacon': '19',
 'baked_potato': '261',
 'balm': '1416',
 'banana': '918',
 'barley': '1281',
 'bartlett_pear': '330',
 'basil': '256',
 'bay': '215',
 'bean': '1348',
 'beech': '357',
 'beef': '248',
 'beef_broth': '512',
 'beef_liver': '1115',
 'beer': '1495',
 'beet': '255',
 'bell_pepper': '1292',
 'bergamot': '778',
 'berry': '609',
 'bitter_orange': '92',
 'black_bean': '1478',
 'black_currant': '1026',
 'black_mustard_seed_oil': '15',
 'black_pepper': '7',
 'black_raspberry': '1375',
 'black_sesame_seed': '118',
 'black_tea': '908',
 'blackberry': '676',
 'blackberry_brandy': '924',
 'blue_cheese': '994',
 'blueberry': '46',
 'bone_oil': '1322',
 'bourbon_whiskey': '1351',
 'brandy': '737',
 'brassica': '598',
 'bread': '105',
 'broccoli': '

In [91]:
# New header for ingredients_df: 'region' and 'country' are kept as they are,
# every other column name is replaced by its id from ingredient_name_to_id,
# formatted as a plain string. Names without an entry are kept unchanged.
new_columns = [
    col if col in ['region', 'country'] else f'{ingredient_name_to_id.get(col, col)}'
    for col in ingredients_df.columns
]


In [92]:
ingredients_df.columns = new_columns

In [93]:
ingredients_df

Unnamed: 0,region,country,18,725,707,395,1197,761,1120,190,...,361,703,1428,1278,84,1396,705,407,230,1080
0,south_east_asian,vietnamese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
1,south_east_asian,vietnamese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
2,south_east_asian,vietnamese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
3,south_east_asian,vietnamese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
4,south_east_asian,vietnamese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57686,east_asian,japanese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
57687,east_asian,japanese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
57688,east_asian,japanese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No
57689,east_asian,japanese,No,No,No,No,No,No,No,No,...,No,No,No,No,No,No,No,No,No,No


In [96]:
ingredients_df.to_csv('data/ingredients_df_n.csv', index=False)

In [71]:
ingredients_df

TypeError: unhashable type: 'set'

TypeError: unhashable type: 'set'

In [49]:
comp_df['compound_name'] = comp_df['compound_name'].astype(str)

In [50]:
comp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1107 entries, 0 to 1106
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   compound_id    1107 non-null   int64 
 1   compound_name  1107 non-null   object
 2   CAS_number     1107 non-null   object
dtypes: int64(1), object(2)
memory usage: 26.1+ KB


In [51]:
sparse_matrix

NameError: name 'sparse_matrix' is not defined

In [None]:
# Convert to dense vectors
# NOTE(review): `sparse_matrix` is not defined in any cell visible in this
# notebook (the preceding cell fails with NameError) -- it has to be created or
# loaded before this cell can run.
dense_vectors = sparse_matrix.todense()

# Create a DataFrame for easy handling
dense_df = pd.DataFrame(dense_vectors)

In [None]:
# Expose the contents of the dense DataFrame as an array
def get_ingredient_vectors(dense_matrix):
    """Return the ``.values`` array of ``dense_matrix`` (a DataFrame)."""
    return dense_matrix.values

# Get ingredient vectors
ingredient_vectors = get_ingredient_vectors(dense_df)

In [None]:
dense_df

In [None]:
dense_df.T

In [None]:
mother_matrix = dense_df.dot(dense_df.T)

In [None]:
mother_matrix = mother_matrix.astype(int)

In [None]:
mother_matrix

In [None]:
dense_df.dot(query_vector)

In [None]:
ingredients_df.to_csv('data/ingredients_df.csv', index=False)

In [None]:
# Function to get column names where the row has 1s
def get_ingredients(row):
    """Return the ingredient ids of all columns flagged with 1 in ``row``.

    ``row`` is one row of the recipe table (a Series indexed by column label).
    For each column whose value equals 1, the first run of digits in the label
    is returned as a string. A flagged column whose label has no digits is
    returned by its label instead of raising IndexError.

    The columns are taken from ``row.index`` rather than from the global
    ``ingredients_df``, so the function works on any row it is given.
    """
    ingredient_ids = []
    for col in row.index:
        if row[col] == 1:
            digits = re.search(r'\d+', str(col))
            ingredient_ids.append(digits.group(0) if digits else col)
    return ingredient_ids

# Apply the function to each row
ingredients_df['ingredient_list'] = ingredients_df.apply(get_ingredients, axis=1)

In [None]:
print(ingredients_df['ingredient_list'])

In [None]:
ingredients_df.head()

In [None]:
ingredients_slised =ingredients_df[['country', 'region', 'ingredient_list']].copy()

In [None]:
ingredients_slised.head()

In [None]:
comp_per_ingr

In [None]:
# Build an ingredient_id -> compound list dictionary from a two-column table
def _build_compound_lookup(comp_table):
    """Map each ``ingredient_id`` of ``comp_table`` to its ``compound_id`` entry.

    If an ingredient id occurs more than once, the first row is kept.
    """
    lookup = {}
    for ingredient_id, compounds in zip(comp_table['ingredient_id'], comp_table['compound_id']):
        lookup.setdefault(ingredient_id, compounds)
    return lookup


#Function to get compound list for each ingredient in the ingredient list
def add_compound_to_recipe(row, compounds_by_ingredient=None):
    """Return the concatenated compound ids of every ingredient in a recipe.

    Parameters
    ----------
    row : a row with an ``ingredient_list`` entry (iterable of ingredient ids).
    compounds_by_ingredient : optional dict mapping ingredient id -> list of
        compound ids. When omitted it is built from the global
        ``comp_per_ingr`` on every call; pass a prebuilt dict when applying
        the function to many rows.

    Returns
    -------
    list
        Compound ids in ingredient order; ingredients without an entry
        contribute nothing.
    """
    if compounds_by_ingredient is None:
        compounds_by_ingredient = _build_compound_lookup(comp_per_ingr)

    compound_list = []
    for ingredient in row['ingredient_list']:
        compound_list.extend(compounds_by_ingredient.get(ingredient, []))
    return compound_list

# Build the lookup once, then reuse it for every recipe instead of scanning
# comp_per_ingr with a boolean mask for each ingredient of each row.
compound_lookup = _build_compound_lookup(comp_per_ingr)

# Apply the function to each row
ingredients_slised['compound_list'] = ingredients_slised.apply(
    add_compound_to_recipe, axis=1, compounds_by_ingredient=compound_lookup
)

In [None]:
ingredients_slised

In [None]:
ingredients_df.head()