In [1]:
# try to combine all the datasets to create a pdf with ingredients, aromatic

import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import ast
import json
import re
from collections import defaultdict
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import silhouette_score

#from rdkit import Chem
#from rdkit.Chem import Descriptors
#from rdkit.Chem import rdMolDescriptors
import requests
import time

from bs4 import BeautifulSoup
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder

In [2]:
ingredients_df = pd.read_csv('data/recipes.csv')
ingredients_df = pd.DataFrame(ingredients_df)

In [3]:
# Load the data
comp_path = 'data/flavor_network_data/ingr_comp/comp_info.tsv'
comp_tsv = pd.read_csv(comp_path, delimiter='\t')

comp_df = pd.DataFrame(data = comp_tsv)
comp_columns = ['compound_id', 'compound_name', 'CAS_number']
comp_df.columns = comp_columns

ingr_path = 'data/flavor_network_data/ingr_comp/ingr_info.tsv'
ingr_tsv = pd.read_csv(ingr_path, delimiter='\t')

ingr_df = pd.DataFrame(data = ingr_tsv)
ingr_columns = ['ingredient_id', 'ingredient_name', 'ingredient_category']
ingr_df.columns = ingr_columns

ingr_comp_pathh = 'data/flavor_network_data/ingr_comp/ingr_comp.tsv'
ingr_comp_tsv = pd.read_csv(ingr_comp_pathh, delimiter='\t')


In [4]:
ingr_comp_df = pd.DataFrame(data = ingr_comp_tsv)

In [5]:
ingr_comp_df.columns

Index(['# ingredient id', 'compound id'], dtype='object')

In [6]:
ingr_comp_df.rename(columns={
    '# ingredient id': 'ingredient_id',
    'compound id': 'compound_id'
}, inplace=True)

In [7]:
flav_edges_path = 'data/flavor_network_data/flavor_network_backbone/flavor_network_backbone_edges.tsv'
flav_edges_tsv = pd.read_csv(flav_edges_path, delimiter='\t')

flav_edges_df = pd.DataFrame(data = flav_edges_tsv)
flav_edges_columns = ['ingredient_1', 'ingredient_2', 'number_of_shared_compounds']
flav_edges_df.columns = flav_edges_columns

flav_nodes_path = 'data/flavor_network_data/flavor_network_backbone/flavor_network_backbone_nodes.tsv'
flav_nodes_tsv = pd.read_csv(flav_nodes_path, delimiter='\t')

flav_nodes_df = pd.DataFrame(data = flav_nodes_tsv)
flav_nodes_columns = ['ingredient_name', 'x_coordinate', 'y_coordinate', 'prevalence', 'r', 'g', 'b']
flav_nodes_df.columns = flav_nodes_columns

In [8]:
# Function to replace spaces with underscores
def replace_spaces(value):
    if isinstance(value, str):
        return value.replace(' ', '_')
    return value

In [9]:
ingr_df['ingredient_name'] = ingr_df['ingredient_name'].apply(replace_spaces)
ingr_df['ingredient_category'] = ingr_df['ingredient_category'].apply(replace_spaces)
flav_edges_df['ingredient_1'] = flav_edges_df['ingredient_1'].apply(replace_spaces)
flav_edges_df['ingredient_2'] = flav_edges_df['ingredient_2'].apply(replace_spaces)
flav_nodes_df['ingredient_name'] = flav_nodes_df['ingredient_name'].apply(replace_spaces)

In [10]:

data = {
    'Canada': 'NorthAmerican',
    'Turkey': 'MiddleEastern',
    'east_asian': 'EastAsian',
    'Caribbean': 'LatinAmerican',
    'Bangladesh': 'SouthAsian',
    'chinese': 'EastAsian',
    'mexico': 'LatinAmerican',
    'Lebanon': 'MiddleEastern',
    'japanese': 'EastAsian',
    'North-African': 'African',
    'MiddleEastern': 'MiddleEastern',
    'Indian': 'SouthAsian',
    'asian': 'EastAsian',
    'Italy': 'SouthernEuropean',
    'EasternEuropean_Russian': 'EasternEuropean',
    'Israel': 'MiddleEastern',
    'Korea': 'EastAsian',
    'Iran': 'MiddleEastern',
    'Eastern-Europe': 'EasternEuropean',
    'Jewish': 'MiddleEastern',
    'South-African': 'African',
    'Vietnamese': 'SoutheastAsian',
    'UK-and-Ireland': 'WesternEuropean',
    'French': 'WesternEuropean',
    'Mediterranean': 'SouthernEuropean',
    'Central_SouthAmerican': 'LatinAmerican',
    'Cajun_Creole': 'NorthAmerican',
    'Belgium': 'WesternEuropean',
    'China': 'EastAsian',
    'korean': 'EastAsian',
    'Germany': 'WesternEuropean',
    'South-America': 'LatinAmerican',
    'Spain': 'SouthernEuropean',
    'Netherlands': 'WesternEuropean',
    'Scandinavia': 'NorthernEuropean',
    'Philippines': 'SoutheastAsian',
    'Indonesia': 'SoutheastAsian',
    'East-African': 'African',
    'Scandinavian': 'NorthernEuropean',
    'Greek': 'SouthernEuropean',
    'American': 'NorthAmerican',
    'Vietnam': 'SoutheastAsian',
    'western': 'WesternEuropean',
    'African': 'African',
    'Switzerland': 'WesternEuropean',
    'West-African': 'African',
    'France': 'WesternEuropean',
    'Thai': 'SoutheastAsian',
    'Thailand': 'SoutheastAsian',
    'Italian': 'SouthernEuropean',
    'Pakistan': 'SouthAsian',
    'Irish': 'WesternEuropean',
    'Mexican': 'LatinAmerican',
    'Portugal': 'SouthernEuropean',
    'Chinese': 'EastAsian',
    'Mexico': 'LatinAmerican',
    'German': 'WesternEuropean',
    'Spanish_Portuguese': 'SouthernEuropean',
    'India': 'SouthAsian',
    'Japanese': 'EastAsian',
    'Moroccan': 'African',
    'Southern_SoulFood': 'NorthAmerican',
    'Malaysia': 'SoutheastAsian',
    'Austria': 'WesternEuropean',
    'English_Scottish': 'WesternEuropean',
    'Asian': 'EastAsian',
    'Southwestern': 'NorthAmerican',
    'Japan': 'EastAsian',
    'italian': 'SouthernEuropean',
    'canadian': 'NorthAmerican',
    'eastern_european_russian': 'EasternEuropean',
    'southern_soul_food': 'NorthAmerican',
    'middle_eastern': 'MiddleEastern',
    'central_south_american': 'LatinAmerican',
    'spanish': 'SouthernEuropean',
    'north_african': 'African',
    'portuguese': 'SouthernEuropean',
    'filipino': 'SoutheastAsian',
    'dutch': 'WesternEuropean',
    'iranian': 'MiddleEastern',
    'austrian': 'WesternEuropean',
    'swiss': 'WesternEuropean',
    'pakistani': 'SouthAsian',
    'malaysian': 'SoutheastAsian',
    'south_african': 'African',
    'west_african': 'African',
    'indonesian': 'SoutheastAsian',
    'belgian': 'WesternEuropean',
    'east_african': 'African',
    'israeli': 'MiddleEastern',
    'bangladeshi': 'SouthAsian'
}

mapping = pd.DataFrame(list(data.items()), columns=['country', 'region'])

# Clean the data
mapping['country'] = mapping['country'].str.strip()
mapping['region'] = mapping['region'].str.strip()

# Get unique regions and countries
unique_regions = mapping['region'].unique()
unique_countries = mapping['country'].unique()

regions_countries = mapping.groupby('region')['country'].apply(list).reset_index()






In [11]:
regions_countries.head(10)

Unnamed: 0,region,country
0,African,"[North-African, South-African, East-African, A..."
1,EastAsian,"[east_asian, chinese, japanese, asian, Korea, ..."
2,EasternEuropean,"[EasternEuropean_Russian, Eastern-Europe, east..."
3,LatinAmerican,"[Caribbean, mexico, Central_SouthAmerican, Sou..."
4,MiddleEastern,"[Turkey, Lebanon, MiddleEastern, Israel, Iran,..."
5,NorthAmerican,"[Canada, Cajun_Creole, American, Southern_Soul..."
6,NorthernEuropean,"[Scandinavia, Scandinavian]"
7,SouthAsian,"[Bangladesh, Indian, Pakistan, India, pakistan..."
8,SoutheastAsian,"[Vietnamese, Philippines, Indonesia, Vietnam, ..."
9,SouthernEuropean,"[Italy, Mediterranean, Spain, Greek, Italian, ..."


In [12]:
# Define the cuisine mapping with adjectival forms and variations
cuisine_mapping = {
    'vietnamese': ['vietnamese', 'vietnam'],
    'indian': ['indian', 'india'],
    'spanish_portuguese': ['spanish_portuguese'],
    'jewish': ['jewish'],
    'french': ['french', 'france'],
    'central_south_american': ['central_southamerican'],
    'cajun_creole': ['cajun_creole'],
    'thai': ['thai', 'thailand'],
    'scandinavian': ['scandinavian', 'scandinavia'],
    'greek': ['greek'],
    'american': ['american'],
    'african': ['african'],
    'middle_eastern': ['middleeastern', 'middle_eastern', 'turkey', 'iran', 'israel', 'lebanon'],
    'eastern_european_russian': ['easterneuropean_russian', 'eastern-europe', 'russia'],
    'italian': ['italian', 'italy'],
    'irish': ['irish', 'ireland'],
    'mexican': ['mexican', 'mexico'],
    'chinese': ['chinese', 'china'],
    'german': ['german', 'germany'],
    'mediterranean': ['mediterranean'],
    'japanese': ['japanese', 'japan'],
    'moroccan': ['moroccan'],
    'southern_soul_food': ['southern_soulfood'],
    'english_scottish': ['english_scottish', 'uk-and-ireland', 'england', 'scotland'],
    'asian': ['asian'],
    'southwestern': ['southwestern'],
    'east_asian': ['east_asian'],
    'western': ['western'],
    'korean': ['korean', 'korea'],
    'canadian': ['canada'],
    'caribbean': ['caribbean'],
    'bangladeshi': ['bangladesh'],
    'israeli': ['israel'],
    'iranian': ['iran'],
    'south_african': ['south-african'],
    'belgian': ['belgium'],
    'spanish': ['spain'],
    'dutch': ['netherlands'],
    'filipino': ['philippines'],
    'indonesian': ['indonesia'],
    'east_african': ['east-african'],
    'swiss': ['switzerland'],
    'west_african': ['west-african'],
    'north_african': ['north-african'],
    'pakistani': ['pakistan'],
    'portuguese': ['portugal'],
    'malaysian': ['malaysia'],
    'austrian': ['austria']
}

# Reverse the mapping for easier lookup
cuisine_lookup = {alias: cuisine for cuisine, aliases in cuisine_mapping.items() for alias in aliases}



# Create a dictionary from the mapping DataFrame
country_to_region = dict(zip(mapping['country'].str.lower().str.replace(' ', '_'), mapping['region'].str.lower().str.replace(' ', '_')))

# Explicitly format region names
def format_region_name(region):
    if pd.isna(region):
        return 'unknown'
    formatted_region = region.replace('southeastasian', 'south_east_asian') \
        .replace('southasian', 'south_asian') \
        .replace('southerneuropean', 'southern_european') \
        .replace('middleeastern', 'middle_eastern') \
        .replace('westerneuropean', 'western_european') \
        .replace('latinamerican', 'latin_american') \
        .replace('northamerican', 'north_american') \
        .replace('northerneuropean', 'northern_european') \
        .replace('easterneuropean', 'eastern_european') \
        .replace('eastasian', 'east_asian')
    return formatted_region

# Function to standardize country names and map to region
def standardize_and_map_region(country):
    if pd.isna(country):
        return 'unknown', 'unknown'
    country_lower = str(country).lower().replace(' ', '_')
    standardized_country = cuisine_lookup.get(country_lower, country_lower)
    region = country_to_region.get(standardized_country, 'unknown')
    return standardized_country, format_region_name(region)


# Apply the function to create new columns
ingredients_df[['country', 'region']] = ingredients_df['country'].apply(lambda x: pd.Series(standardize_and_map_region(x)))

# Move the 'region' column to the beginning
columns = ['region'] + [col for col in ingredients_df if col != 'region']
ingredients_df = ingredients_df[columns]


In [13]:
ingredients_df.replace({'Yes': 1, 'No': 0}, inplace=True)

In [14]:
ingr_small = ingredients_df.columns.copy()

In [15]:
ingr_small = pd.DataFrame(data=ingr_small)

In [16]:
# Assign column names if necessary
ingr_small.columns = ['ingredient_name']

# Drop rows that contain 'region' or 'country'
ingr_small = ingr_small[~ingr_small['ingredient_name'].isin(['region', 'country'])]

# Add ingredient_id column
ingr_small.insert(0, 'ingredient_id', range(1, 1 + len(ingr_small)))

# Print the cleaned DataFrame
print(ingr_small)

     ingredient_id ingredient_name
2                1          almond
3                2        angelica
4                3           anise
5                4      anise_seed
6                5           apple
..             ...             ...
380            379            wood
381            380             yam
382            381           yeast
383            382          yogurt
384            383        zucchini

[383 rows x 2 columns]


In [17]:
ingr_small['ingredient_name'] = ingr_small['ingredient_name'].apply(replace_spaces)

In [18]:
# Function to replace ingredient IDs
def replace_ingredient_id(ingr_small, ingr_df):
    # Create a dictionary for quick lookup of ingredient_id by ingredient_name
    ingr_dict = pd.Series(ingr_df.ingredient_id.values, index=ingr_df.ingredient_name).to_dict()

    # Function to replace ingredient_id if ingredient_name matches
    def replace_id(row):
        ingredient_name = row['ingredient_name']
        if ingredient_name in ingr_dict:
            row['ingredient_id'] = ingr_dict[ingredient_name]
        return row

    # Apply the function to each row in ingr_small
    ingr_small = ingr_small.apply(replace_id, axis=1)

    return ingr_small

# Apply the function
ingr_small = replace_ingredient_id(ingr_small, ingr_df)

# Verify the changes
print(ingr_small.head())

   ingredient_id ingredient_name
2             18          almond
3            725        angelica
4            707           anise
5            395      anise_seed
6           1197           apple


In [19]:
ingr_small_def = ingr_small.copy()

In [20]:
comp_df

Unnamed: 0,compound_id,compound_name,CAS_number
0,0,jasmone,488-10-8
1,1,5-methylhexanoic_acid,628-46-6
2,2,l-glutamine,56-85-9
3,3,1-methyl-3-methoxy-4-isopropylbenzene,1076-56-8
4,4,methyl-3-phenylpropionate,103-25-3
...,...,...,...
1102,1102,2-heptanol,543-49-7
1103,1103,1-octen-3-yl_butyrate,16491-54-6
1104,1104,guaiacol,90-05-1
1105,1105,(+/?)-methyl_5-acetoxyhexanoate,35234-22-1


In [21]:
ingr_comp_merge = ingr_comp_df.copy()

In [22]:
ingr_comp_merge.compound_id = ingr_comp_merge.compound_id.astype(str)

In [23]:
ingr_small['ingredient_id'] = ingr_small['ingredient_id'].astype(str)
ingr_comp_df['ingredient_id'] = ingr_comp_df['ingredient_id'].astype(str)
ingr_comp_df['compound_id'] = ingr_comp_df['compound_id'].astype(str)

# Convert column names in ingr_small to strings
ingr_small.columns = ingr_small.columns.astype(str)

# Get the unique compound_ids in ingr_comp_df
compound_id_list = ingr_comp_df['compound_id'].unique()

# Create a DataFrame to hold the new columns with initial values set to 0
new_columns_df = pd.DataFrame(0, index=ingr_small.index, columns=compound_id_list)

# Concatenate the new columns to ingr_small
ingr_small = pd.concat([ingr_small, new_columns_df], axis=1)

# Iterate through ingr_comp_df and update the values in ingr_small
for index, row in ingr_comp_df.iterrows():
    ingredient_id = row['ingredient_id']
    compound_id = row['compound_id']
    if ingredient_id in ingr_small['ingredient_id'].values:
        ingr_small.loc[ingr_small['ingredient_id'] == ingredient_id, compound_id] = 1

# Verify the result
print(ingr_small.head())

  ingredient_id ingredient_name  906  861  673  278  171  387  165  1099  ...   
2            18          almond    0    0    0    0    0    0    0     0  ...  \
3           725        angelica    0    0    1    0    0    0    0     0  ...   
4           707           anise    0    0    1    0    0    0    1     0  ...   
5           395      anise_seed    0    0    0    0    0    0    0     0  ...   
6          1197           apple    0    0    0    1    0    0    0     0  ...   

   722  237  169  23  310  653  966  752  497  754  
2    0    0    0   0    0    0    0    0    0    0  
3    0    0    0   0    0    0    0    0    0    0  
4    0    0    0   0    0    0    0    0    0    0  
5    0    0    0   0    0    0    0    0    0    0  
6    0    0    0   0    0    0    0    0    0    0  

[5 rows x 1109 columns]


In [24]:
ingr_small.shape

(383, 1109)

In [25]:
compound_id_cols = [col for col in ingr_small.columns if col not in ['ingredient_id', 'ingredient_name']]

In [26]:
ingr_small['count_ones'] = ingr_small[compound_id_cols].sum(axis=1)

In [27]:
ingr_comp_df

Unnamed: 0,ingredient_id,compound_id
0,1392,906
1,1259,861
2,1079,673
3,22,906
4,103,906
...,...,...
36776,876,657
36777,637,461
36778,689,650
36779,689,297


In [28]:
# Group by 'ingredient_id' and aggregate 'compound_id' into a list
comp_per_ingr = ingr_comp_df.groupby('ingredient_id')['compound_id'].apply(list).reset_index()
ingr_per_comp = ingr_comp_df.groupby('compound_id')['ingredient_id'].apply(list).reset_index()

In [29]:
ingr_per_comp.sort_values(by='compound_id', ascending=False, inplace=True)

In [30]:
print(ingr_comp_df.compound_id.unique())

['906' '861' '673' ... '752' '497' '754']


In [31]:
ingr_per_comp.shape

(1107, 2)

In [32]:
comp_per_ingr

Unnamed: 0,ingredient_id,compound_id
0,0,[995]
1,1,[921]
2,10,[715]
3,100,[1011]
4,1000,"[764, 275, 630, 1046, 285, 84, 482, 772, 686, ..."
...,...,...
1520,995,[996]
1521,996,[894]
1522,997,[828]
1523,998,"[25, 317, 86]"


In [33]:
ingr_small

Unnamed: 0,ingredient_id,ingredient_name,906,861,673,278,171,387,165,1099,...,237,169,23,310,653,966,752,497,754,count_ones
2,18,almond,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
3,725,angelica,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,707,anise,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,17
5,395,anise_seed,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
6,1197,apple,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,1396,wood,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
381,705,yam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
382,407,yeast,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,3
383,230,yogurt,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,37


In [34]:
ingr_small['ingredient_name'] = ingr_small['ingredient_name'].astype(str)

In [35]:
ingredient_cols = ingredients_df.columns.copy()

In [36]:
ingredient_cols

Index(['region', 'country', 'almond', 'angelica', 'anise', 'anise_seed',
       'apple', 'apple_brandy', 'apricot', 'armagnac',
       ...
       'whiskey', 'white_bread', 'white_wine', 'whole_grain_wheat_flour',
       'wine', 'wood', 'yam', 'yeast', 'yogurt', 'zucchini'],
      dtype='object', length=385)

In [37]:
ingr_small

Unnamed: 0,ingredient_id,ingredient_name,906,861,673,278,171,387,165,1099,...,237,169,23,310,653,966,752,497,754,count_ones
2,18,almond,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
3,725,angelica,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,707,anise,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,17
5,395,anise_seed,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
6,1197,apple,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380,1396,wood,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
381,705,yam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
382,407,yeast,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,3
383,230,yogurt,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,37


In [38]:
ingr_small["ingredient_name"]

2          almond
3        angelica
4           anise
5      anise_seed
6           apple
          ...    
380          wood
381           yam
382         yeast
383        yogurt
384      zucchini
Name: ingredient_name, Length: 383, dtype: object

In [39]:
ingr_small_mapping = ingr_small['ingredient_name']

In [40]:
ingr_small_mapping

2          almond
3        angelica
4           anise
5      anise_seed
6           apple
          ...    
380          wood
381           yam
382         yeast
383        yogurt
384      zucchini
Name: ingredient_name, Length: 383, dtype: object

In [41]:
ingr_small_mapping = pd.concat([ingr_small_mapping, ingr_small['ingredient_id']], axis=1)

In [42]:
ingredients_df.columns = ingredients_df.columns.astype(str)

In [43]:
ingr_small_mapping

Unnamed: 0,ingredient_name,ingredient_id
2,almond,18
3,angelica,725
4,anise,707
5,anise_seed,395
6,apple,1197
...,...,...
380,wood,1396
381,yam,705
382,yeast,407
383,yogurt,230


In [44]:
ingredient_name_to_id = dict(zip(ingr_small_mapping['ingredient_name'], ingr_small_mapping['ingredient_id']))

In [45]:
ingredient_name_to_id

{'almond': '18',
 'angelica': '725',
 'anise': '707',
 'anise_seed': '395',
 'apple': '1197',
 'apple_brandy': '761',
 'apricot': '1120',
 'armagnac': '190',
 'artemisia': '1504',
 'artichoke': '1251',
 'asparagus': '1174',
 'avocado': '94',
 'bacon': '19',
 'baked_potato': '261',
 'balm': '1416',
 'banana': '918',
 'barley': '1281',
 'bartlett_pear': '330',
 'basil': '256',
 'bay': '215',
 'bean': '1348',
 'beech': '357',
 'beef': '248',
 'beef_broth': '512',
 'beef_liver': '1115',
 'beer': '1495',
 'beet': '255',
 'bell_pepper': '1292',
 'bergamot': '778',
 'berry': '609',
 'bitter_orange': '92',
 'black_bean': '1478',
 'black_currant': '1026',
 'black_mustard_seed_oil': '15',
 'black_pepper': '7',
 'black_raspberry': '1375',
 'black_sesame_seed': '118',
 'black_tea': '908',
 'blackberry': '676',
 'blackberry_brandy': '924',
 'blue_cheese': '994',
 'blueberry': '46',
 'bone_oil': '1322',
 'bourbon_whiskey': '1351',
 'brandy': '737',
 'brassica': '598',
 'bread': '105',
 'broccoli': '

In [46]:
new_columns = []

# Iterate over the current columns and replace ingredient names with IDs, formatted as ingr_{ingredient_id}
for col in ingredients_df.columns:
    if col in ['region', 'country']:  # Keep these columns as is
        new_columns.append(col)
    else:
        # Replace ingredient names with their corresponding IDs and format as ingr_{ingredient_id}
        ingredient_id = ingredient_name_to_id.get(col, col)
        new_columns.append(f'ingr_{ingredient_id}')


In [47]:
ingredients_df.columns = new_columns

In [48]:
ingredients_df.columns = ingredients_df.columns.astype(str)

In [49]:
ingredients_df

Unnamed: 0,region,country,ingr_18,ingr_725,ingr_707,ingr_395,ingr_1197,ingr_761,ingr_1120,ingr_190,...,ingr_361,ingr_703,ingr_1428,ingr_1278,ingr_84,ingr_1396,ingr_705,ingr_407,ingr_230,ingr_1080
0,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57686,east_asian,japanese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57687,east_asian,japanese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57688,east_asian,japanese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
57689,east_asian,japanese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
ingredients_df['region'].unique()

array(['south_east_asian', 'south_asian', 'southern_european',
       'middle_eastern', 'western_european', 'latin_american',
       'north_american', 'northern_european', 'african',
       'eastern_european', 'east_asian'], dtype=object)

In [51]:
unknown_countries = ingredients_df.loc[ingredients_df['region'] == 'unknown', 'country'].value_counts()

# Print the results
print("Countries associated with the 'unknown' region:")
print(unknown_countries)

Countries associated with the 'unknown' region:
Series([], Name: count, dtype: int64)


In [52]:
ingredient_id_to_name = {v: k for k, v in ingredient_name_to_id.items()}

In [53]:
ingredients_df_with_names = ingredients_df.copy()

# Create new column names by replacing ingredient IDs with names using the reverse mapping
new_columns_with_names = []

for col in ingredients_df.columns:
    if col in ['region', 'country']:  # Keep these columns as is
        new_columns_with_names.append(col)
    else:
        # Extract the ID from the column name (e.g., 'ingr_18' -> '18')
        ingredient_id = col.replace('ingr_', '')
        # Get the ingredient name from the ID
        ingredient_name = ingredient_id_to_name.get(ingredient_id, ingredient_id)
        new_columns_with_names.append(ingredient_name)


ingredients_df_with_names.columns = new_columns_with_names


ingredients_df_with_names.head()

Unnamed: 0,region,country,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
ingredients_df_with_names.columns

Index(['region', 'country', 'almond', 'angelica', 'anise', 'anise_seed',
       'apple', 'apple_brandy', 'apricot', 'armagnac',
       ...
       'whiskey', 'white_bread', 'white_wine', 'whole_grain_wheat_flour',
       'wine', 'wood', 'yam', 'yeast', 'yogurt', 'zucchini'],
      dtype='object', length=385)

In [55]:
ingredients_df_with_names['region'].unique()

array(['south_east_asian', 'south_asian', 'southern_european',
       'middle_eastern', 'western_european', 'latin_american',
       'north_american', 'northern_european', 'african',
       'eastern_european', 'east_asian'], dtype=object)

In [56]:
ingr_small.head()

Unnamed: 0,ingredient_id,ingredient_name,906,861,673,278,171,387,165,1099,...,237,169,23,310,653,966,752,497,754,count_ones
2,18,almond,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
3,725,angelica,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
4,707,anise,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,17
5,395,anise_seed,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
6,1197,apple,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,195


In [57]:
ingr_small_copy = ingr_small.copy()

# Estraiamo le colonne che rappresentano i composti (escludendo le prime due colonne)
compound_columns = ingr_small_copy.columns[2:]

# Crea una nuova colonna per contenere le liste di nomi delle colonne con valore 1
ingr_small_copy['compounds_present'] = ingr_small_copy.apply(lambda row: list(compound_columns[row[compound_columns] == 1]), axis=1)

# Ora ingr_small_copy ha una nuova colonna 'compounds_present' che contiene le liste dei nomi delle colonne con valore 1 per ogni riga
ingr_small_copy.head()

Unnamed: 0,ingredient_id,ingredient_name,906,861,673,278,171,387,165,1099,...,169,23,310,653,966,752,497,754,count_ones,compounds_present
2,18,almond,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9,"[541, 72, 107, 195, 669, 871, 1064, 890, 568]"
3,725,angelica,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,"[673, 793, 974, 995]"
4,707,anise,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,17,"[673, 165, 94, 635, 700, 177, 215, 898, 101, 3..."
5,395,anise_seed,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,"[204, 885]"
6,1197,apple,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,195,"[278, 611, 724, 288, 577, 136, 700, 608, 1026,..."


In [58]:
columns_to_keep = ['ingredient_name', 'compounds_present']
ingr_small_copy = ingr_small_copy[columns_to_keep]

# Ora ingr_small_copy contiene solo le colonne ingredient_name e compounds_present
ingr_small_copy.head()

Unnamed: 0,ingredient_name,compounds_present
2,almond,"[541, 72, 107, 195, 669, 871, 1064, 890, 568]"
3,angelica,"[673, 793, 974, 995]"
4,anise,"[673, 165, 94, 635, 700, 177, 215, 898, 101, 3..."
5,anise_seed,"[204, 885]"
6,apple,"[278, 611, 724, 288, 577, 136, 700, 608, 1026,..."


In [59]:
comp_df.head(4)

Unnamed: 0,compound_id,compound_name,CAS_number
0,0,jasmone,488-10-8
1,1,5-methylhexanoic_acid,628-46-6
2,2,l-glutamine,56-85-9
3,3,1-methyl-3-methoxy-4-isopropylbenzene,1076-56-8


In [60]:
import pandas as pd

# Funzione per calcolare la prevalenza di un ingrediente in base al numero di volte che appare
def ingredient_prevalence(ingredient_column, Nc):
    n_i_c = ingredient_column.sum()  # Somma delle occorrenze di 1 nelle colonne degli ingredienti
    Pc_i = n_i_c / Nc
    return Pc_i

# Funzione per calcolare la prevalenza relativa di un ingrediente rispetto alle altre regioni
def relative_prevalence(ingredient_column, Nc, total_recipes, region_mask):
    Pc_i = ingredient_prevalence(ingredient_column, Nc)
    Pc_i_other = (ingredient_column.sum() - ingredient_column[region_mask].sum()) / (total_recipes - Nc)
    relative_prevalence = Pc_i - Pc_i_other
    return relative_prevalence

# Calcolo delle prevalenze per ogni regione
def calculate_prevalences_by_region(df):
    regions = df['region'].unique()
    region_prevalences = {}

    for region in regions:
        region_df = df[df['region'] == region]
        
        # Calcolo di Nc per la regione corrente
        Nc_region = region_df.iloc[:, 2:].any(axis=1).sum()
        
        # Calcolo di total_recipes per la regione corrente
        total_recipes_region = len(region_df)
        
        # Calcoliamo le prevalenze relative per ogni ingrediente nella regione corrente
        prevalences = {}
        for ingredient in region_df.columns[2:]:
            region_mask = (df['region'] == region)
            prevalences[ingredient] = relative_prevalence(region_df[ingredient], Nc_region, total_recipes=len(df), region_mask=region_mask)
        
        # Aggiungiamo le prevalenze della regione al dizionario
        region_prevalences[region] = prevalences
    
    return region_prevalences

# Caricamento del DataFrame (sostituisci con il tuo DataFrame effettivo)
# ingredients_df_with_names = pd.read_csv('nome_del_tuo_file.csv')

# Calcoliamo le prevalenze per ogni regione nel DataFrame fornito
region_prevalences = calculate_prevalences_by_region(ingredients_df_with_names)

# Stampiamo i risultati
for region, prevalences in region_prevalences.items():
    print(f"Region: {region}")
    for ingredient, prevalence in prevalences.items():
        print(f"{ingredient}: {prevalence}")
    print()


Region: south_east_asian
almond: 0.006564551422319475
angelica: 0.0
anise: 0.002188183807439825
anise_seed: 0.0
apple: 0.0087527352297593
apple_brandy: 0.0
apricot: 0.002188183807439825
armagnac: 0.0
artemisia: 0.0
artichoke: 0.0
asparagus: 0.010940919037199124
avocado: 0.002188183807439825
bacon: 0.0
baked_potato: 0.0
balm: 0.0
banana: 0.0175054704595186
barley: 0.0
bartlett_pear: 0.0
basil: 0.16411378555798686
bay: 0.0262582056892779
bean: 0.13566739606126915
beech: 0.0
beef: 0.11597374179431072
beef_broth: 0.01312910284463895
beef_liver: 0.0
beer: 0.0
beet: 0.0
bell_pepper: 0.15317286652078774
bergamot: 0.0
berry: 0.0
bitter_orange: 0.0
black_bean: 0.002188183807439825
black_currant: 0.0
black_mustard_seed_oil: 0.0
black_pepper: 0.1925601750547046
black_raspberry: 0.0
black_sesame_seed: 0.0
black_tea: 0.0
blackberry: 0.0
blackberry_brandy: 0.0
blue_cheese: 0.0
blueberry: 0.0
bone_oil: 0.002188183807439825
bourbon_whiskey: 0.0
brandy: 0.0
brassica: 0.0
bread: 0.0525164113785558
brocc

In [61]:
import pandas as pd

# Funzione per calcolare la prevalenza di un ingrediente in base al numero di volte che appare
def ingredient_prevalence(ingredient_column, Nc):
    n_i_c = ingredient_column.sum()  # Somma delle occorrenze di 1 nelle colonne degli ingredienti
    Pc_i = n_i_c / Nc
    return Pc_i

# Funzione per calcolare la prevalenza relativa di un ingrediente rispetto alle altre regioni
def relative_prevalence(ingredient_column, Nc, total_recipes, region_mask):
    Pc_i = ingredient_prevalence(ingredient_column, Nc)
    Pc_i_other = (ingredient_column.sum() - ingredient_column[region_mask].sum()) / (total_recipes - Nc)
    relative_prevalence = Pc_i - Pc_i_other
    return relative_prevalence

# Calcolo delle prevalenze per ogni regione
def calculate_prevalences_by_region(df):
    regions = df['region'].unique()
    region_prevalences = {}

    for region in regions:
        region_df = df[df['region'] == region]
        
        # Calcolo di Nc per la regione corrente
        Nc_region = region_df.iloc[:, 2:].any(axis=1).sum()
        
        # Calcolo di total_recipes per la regione corrente
        total_recipes_region = len(region_df)
        
        # Calcoliamo le prevalenze relative per ogni ingrediente nella regione corrente
        prevalences = {}
        for ingredient in region_df.columns[2:]:
            region_mask = (df['region'] == region)
            prevalences[ingredient] = relative_prevalence(region_df[ingredient], Nc_region, total_recipes=len(df), region_mask=region_mask)
        
        # Aggiungiamo le prevalenze della regione al dizionario
        region_prevalences[region] = prevalences
    
    return region_prevalences



# Calcoliamo le prevalenze per ogni regione nel DataFrame fornito
region_prevalences = calculate_prevalences_by_region(ingredients_df_with_names)

import pandas as pd

# Funzione per calcolare la prevalenza di un ingrediente in base al numero di volte che appare
def ingredient_prevalence(ingredient_column, Nc):
    n_i_c = ingredient_column.sum()  # Somma delle occorrenze di 1 nelle colonne degli ingredienti
    Pc_i = n_i_c / Nc
    return Pc_i

# Funzione per calcolare la prevalenza relativa di un ingrediente rispetto alle altre regioni
def relative_prevalence(ingredient_column, Nc, total_recipes, region_mask):
    Pc_i = ingredient_prevalence(ingredient_column, Nc)
    Pc_i_other = (ingredient_column.sum() - ingredient_column[region_mask].sum()) / (total_recipes - Nc)
    relative_prevalence = Pc_i - Pc_i_other
    return relative_prevalence

# Calcolo delle prevalenze per ogni regione
def calculate_prevalences_by_region(df):
    regions = df['region'].unique()
    region_prevalences = {}

    for region in regions:
        region_df = df[df['region'] == region]
        
        # Calcolo di Nc per la regione corrente
        Nc_region = region_df.iloc[:, 2:].any(axis=1).sum()
        
        # Calcolo di total_recipes per la regione corrente
        total_recipes_region = len(region_df)
        
        # Calcoliamo le prevalenze relative per ogni ingrediente nella regione corrente
        prevalences = {}
        for ingredient in region_df.columns[2:]:
            region_mask = (df['region'] == region)
            prevalences[ingredient] = relative_prevalence(region_df[ingredient], Nc_region, total_recipes=len(df), region_mask=region_mask)
        
        # Aggiungiamo le prevalenze della regione al dizionario
        region_prevalences[region] = prevalences
    
    return region_prevalences


# Calcoliamo le prevalenze per ogni regione nel DataFrame fornito
region_prevalences = calculate_prevalences_by_region(ingredients_df_with_names)

for region, prevalences in region_prevalences.items():
    sorted_prevalences = sorted(prevalences.items(), key=lambda x: x[1], reverse=True)
    print(f"Region: {region}")
    for ingredient, prevalence in sorted_prevalences:
        print(f"{ingredient}: {prevalence}")
    print()



Region: south_east_asian
garlic: 0.6214442013129103
fish: 0.49452954048140046
cayenne: 0.4179431072210066
vegetable_oil: 0.3938730853391685
cilantro: 0.35667396061269147
rice: 0.35667396061269147
ginger: 0.3347921225382932
soy_sauce: 0.3150984682713348
chicken: 0.2975929978118162
lime_juice: 0.2888402625820569
coconut: 0.28227571115973743
scallion: 0.2800875273522976
onion: 0.27571115973741794
pepper: 0.26914660831509846
coriander: 0.26695842450765866
cumin: 0.2210065645514223
vinegar: 0.21444201312910285
lemongrass: 0.2100656455142232
turmeric: 0.19693654266958424
black_pepper: 0.1925601750547046
lime: 0.1838074398249453
shrimp: 0.1772428884026258
basil: 0.16411378555798686
mint: 0.16192560175054704
chicken_broth: 0.1575492341356674
bell_pepper: 0.15317286652078774
carrot: 0.15317286652078774
fenugreek: 0.15317286652078774
shallot: 0.15098468271334792
bean: 0.13566739606126915
egg: 0.13566739606126915
tomato: 0.11816192560175055
beef: 0.11597374179431072
cane_molasses: 0.1159737417943

In [62]:
"""If the value is positive (e.g., 0.002): It means that the ingredient is more prevalent in the specific region compared to the 
average of the other regions. In other words, the ingredient is considered more characteristic or distinctive for that region.

If the value is zero: It means that the prevalence of the ingredient is the same in both regions or that the ingredient is not present in the region in question.

If the value is negative: It indicates that the ingredient is less prevalent in the specific region compared to the average of the other regions."""

'If the value is positive (e.g., 0.002): It means that the ingredient is more prevalent in the specific region compared to the \naverage of the other regions. In other words, the ingredient is considered more characteristic or distinctive for that region.\n\nIf the value is zero: It means that the prevalence of the ingredient is the same in both regions or that the ingredient is not present in the region in question.\n\nIf the value is negative: It indicates that the ingredient is less prevalent in the specific region compared to the average of the other regions.'

In [63]:
!pip install reportlab



Collecting reportlab
  Downloading reportlab-4.2.2-py3-none-any.whl.metadata (1.4 kB)
Collecting chardet (from reportlab)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading reportlab-4.2.2-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   - -------------------------------------- 0.1/1.9 MB 1.1 MB/s eta 0:00:02
   ----- ---------------------------------- 0.3/1.9 MB 2.6 MB/s eta 0:00:01
   ------------ --------------------------- 0.6/1.9 MB 4.2 MB/s eta 0:00:01
   -------------------------- ------------- 1.3/1.9 MB 6.8 MB/s eta 0:00:01
   ---------------------------------------- 1.9/1.9 MB 8.9 MB/s eta 0:00:00
Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
   ---------------------------------------- 0.0/199.4 kB ? eta -:--:--
   --------------------------------------- 199.4/199.4 kB 11.8 MB/s eta 0:00:00
Installing collected packages: chardet, reportlab
Successfully installed chardet-5.2.0 reportlab-4.2.2


In [64]:
"""import pandas as pd
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib import colors
from reportlab.lib.units import inch

# Function to calculate the relative prevalence of an ingredient compared to other regions
def relative_prevalence(ingredient_column, Nc, total_recipes, region_mask):
    Pc_i = ingredient_column.sum() / Nc
    Pc_i_other = (ingredient_column.sum() - ingredient_column[region_mask].sum()) / (total_recipes - Nc)
    relative_prevalence = Pc_i - Pc_i_other
    return relative_prevalence

# Calculate prevalences for each region
def calculate_prevalences_by_region(df):
    regions = df['cuisine'].unique()
    region_prevalences = {}

    for region in regions:
        region_df = df[df['cuisine'] == region]
        
        # Calculate Nc for the current region
        Nc_region = region_df.iloc[:, 2:].any(axis=1).sum()
        
        # Calculate total_recipes for the current region
        total_recipes_region = len(region_df)
        
        # Calculate the relative prevalences for each ingredient in the current region
        prevalences = {}
        for ingredient in region_df.columns[2:]:
            region_mask = (df['cuisine'] == region)
            prevalences[ingredient] = relative_prevalence(region_df[ingredient], Nc_region, total_recipes=len(df), region_mask=region_mask)
        
        # Sort ingredients by relative prevalence and take the top 10
        sorted_prevalences = sorted(prevalences.items(), key=lambda x: x[1], reverse=True)[:10]
        sorted_prevalences = {item[0]: item[1] for item in sorted_prevalences}
        
        # Add the top 10 ingredient prevalences to the current region
        region_prevalences[region] = sorted_prevalences
    
    return region_prevalences

# Function to create a PDF for a region
def create_pdf(region, prevalences, output_filename):
    c = canvas.Canvas(output_filename, pagesize=letter)
    width, height = letter
    
    # Title
    c.setFont("Helvetica-Bold", 16)
    c.drawString(100, height - 40, f"Top 10 Characteristic Ingredients for {region.capitalize()} Cuisine")
    
    # Content
    c.setFont("Helvetica", 12)
    y = height - 60
    for ingredient, prevalence in prevalences.items():
        text = f"{ingredient}: {prevalence:.4f}"
        c.drawString(100, y, text)
        y -= 20
    
    c.save()

# Rename the column 'region' to 'cuisine'
ingredients_df_with_names = ingredients_df_with_names.rename(columns={'region': 'cuisine'})

# Calculate prevalences for each region in the provided DataFrame
region_prevalences = calculate_prevalences_by_region(ingredients_df_with_names)

# Create a PDF for each region with the top 10 characteristic ingredients
for region, prevalences in region_prevalences.items():
    output_filename = f"{region}_cuisine_top_10_ingredients.pdf"
    create_pdf(region, prevalences, output_filename)
    print(f"PDF created for {region} cuisine: {output_filename}")"""


PDF created for south_east_asian cuisine: south_east_asian_cuisine_top_10_ingredients.pdf
PDF created for south_asian cuisine: south_asian_cuisine_top_10_ingredients.pdf
PDF created for southern_european cuisine: southern_european_cuisine_top_10_ingredients.pdf
PDF created for middle_eastern cuisine: middle_eastern_cuisine_top_10_ingredients.pdf
PDF created for western_european cuisine: western_european_cuisine_top_10_ingredients.pdf
PDF created for latin_american cuisine: latin_american_cuisine_top_10_ingredients.pdf
PDF created for north_american cuisine: north_american_cuisine_top_10_ingredients.pdf
PDF created for northern_european cuisine: northern_european_cuisine_top_10_ingredients.pdf
PDF created for african cuisine: african_cuisine_top_10_ingredients.pdf
PDF created for eastern_european cuisine: eastern_european_cuisine_top_10_ingredients.pdf
PDF created for east_asian cuisine: east_asian_cuisine_top_10_ingredients.pdf


In [66]:
"""import pandas as pd
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib import colors
from reportlab.lib.units import inch

# Function to calculate the relative prevalence of an ingredient compared to other regions
def relative_prevalence(ingredient_column, Nc, total_recipes, region_mask):
    Pc_i = ingredient_column.sum() / Nc
    Pc_i_other = (ingredient_column.sum() - ingredient_column[region_mask].sum()) / (total_recipes - Nc)
    relative_prevalence = Pc_i - Pc_i_other
    return relative_prevalence

# Calculate prevalences for each region
def calculate_prevalences_by_region(df):
    regions = df['cuisine'].unique()
    region_prevalences = {}

    for region in regions:
        region_df = df[df['cuisine'] == region]
        
        # Calculate Nc for the current region
        Nc_region = region_df.iloc[:, 2:].any(axis=1).sum()
        
        # Calculate total_recipes for the current region
        total_recipes_region = len(region_df)
        
        # Calculate the relative prevalences for each ingredient in the current region
        prevalences = {}
        for ingredient in region_df.columns[2:]:
            region_mask = (df['cuisine'] == region)
            prevalences[ingredient] = relative_prevalence(region_df[ingredient], Nc_region, total_recipes=len(df), region_mask=region_mask)
        
        # Sort ingredients by relative prevalence and take the top 10
        sorted_prevalences = sorted(prevalences.items(), key=lambda x: x[1], reverse=True)[:10]
        sorted_prevalences = {item[0]: item[1] for item in sorted_prevalences}
        
        # Add the top 10 ingredient prevalences to the current region
        region_prevalences[region] = sorted_prevalences
    
    return region_prevalences

# Function to create a PDF for a region
def create_pdf(region, prevalences, output_filename):
    c = canvas.Canvas(output_filename, pagesize=letter)
    width, height = letter
    
    # Title
    c.setFont("Helvetica-Bold", 16)
    c.drawString(100, height - 40, f"Top 10 Characteristic Ingredients for {region.capitalize()} Cuisine")
    
    # Content
    c.setFont("Helvetica", 12)
    y = height - 60
    for ingredient in prevalences.keys():
        c.drawString(100, y, ingredient)
        y -= 20
    
    c.save()

# Rename the column 'region' to 'cuisine'
ingredients_df_with_names = ingredients_df_with_names.rename(columns={'region': 'cuisine'})

# Calculate prevalences for each region in the provided DataFrame
region_prevalences = calculate_prevalences_by_region(ingredients_df_with_names)

# Create a PDF for each region with the top 10 characteristic ingredients
for region, prevalences in region_prevalences.items():
    output_filename = f"{region}_cuisine_top_10_ingredients.pdf"
    create_pdf(region, prevalences, output_filename)
    print(f"PDF created for {region} cuisine: {output_filename}")"""


PDF created for south_east_asian cuisine: south_east_asian_cuisine_top_10_ingredients.pdf
PDF created for south_asian cuisine: south_asian_cuisine_top_10_ingredients.pdf
PDF created for southern_european cuisine: southern_european_cuisine_top_10_ingredients.pdf
PDF created for middle_eastern cuisine: middle_eastern_cuisine_top_10_ingredients.pdf
PDF created for western_european cuisine: western_european_cuisine_top_10_ingredients.pdf
PDF created for latin_american cuisine: latin_american_cuisine_top_10_ingredients.pdf
PDF created for north_american cuisine: north_american_cuisine_top_10_ingredients.pdf
PDF created for northern_european cuisine: northern_european_cuisine_top_10_ingredients.pdf
PDF created for african cuisine: african_cuisine_top_10_ingredients.pdf
PDF created for eastern_european cuisine: eastern_european_cuisine_top_10_ingredients.pdf
PDF created for east_asian cuisine: east_asian_cuisine_top_10_ingredients.pdf


In [76]:
import pandas as pd
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.units import inch

# Function to calculate the percentage prevalence of an ingredient in a region
def percentage_prevalence(ingredient_column, Nc):
    Pc_i = ingredient_column.sum() / Nc
    return Pc_i

# Calculate prevalences for each region
def calculate_prevalences_by_region(df):
    regions = df['cuisine'].unique()
    region_prevalences = {}

    for region in regions:
        region_df = df[df['cuisine'] == region]
        
        # Calculate Nc for the current region
        Nc_region = len(region_df)
        
        # Calculate the percentage prevalences for each ingredient in the current region
        prevalences = {}
        for ingredient in region_df.columns[2:]:
            prevalences[ingredient] = percentage_prevalence(region_df[ingredient], Nc_region)
        
        # Sort ingredients by percentage prevalence and take the top 10
        sorted_prevalences = sorted(prevalences.items(), key=lambda x: x[1], reverse=True)[:10]
        sorted_prevalences = {item[0]: item[1] for item in sorted_prevalences}
        
        # Add the top 10 ingredient prevalences to the current region
        region_prevalences[region] = sorted_prevalences
    
    return region_prevalences

# Function to create a PDF for a region
def create_pdf(region, prevalences, output_filename):
    c = canvas.Canvas(output_filename, pagesize=letter)
    width, height = letter
    
    # Title
    c.setFont("Helvetica-Bold", 16)
    c.drawString(100, height - 40, f"Top 10 Characteristic Ingredients for {region.capitalize()} Cuisine")
    
    # Content
    c.setFont("Helvetica", 12)
    y = height - 60
    for ingredient, prevalence in prevalences.items():
        text = f"{ingredient}: {prevalence*100:.2f}%"
        c.drawString(100, y, text)
        y -= 20
    
    c.save()

# Rename the column 'region' to 'cuisine'
ingredients_df_with_names = ingredients_df_with_names.rename(columns={'region': 'cuisine'})

# Calculate prevalences for each region in the provided DataFrame
region_prevalences = calculate_prevalences_by_region(ingredients_df_with_names)

# Create a PDF for each region with the top 10 characteristic ingredients
for region, prevalences in region_prevalences.items():
    output_filename = f"{region}_cuisine_top_10_ingredients.pdf"
    create_pdf(region, prevalences, output_filename)
    print(f"PDF created for {region} cuisine: {output_filename}")


PDF created for south_east_asian cuisine: south_east_asian_cuisine_top_10_ingredients.pdf
PDF created for south_asian cuisine: south_asian_cuisine_top_10_ingredients.pdf
PDF created for southern_european cuisine: southern_european_cuisine_top_10_ingredients.pdf
PDF created for middle_eastern cuisine: middle_eastern_cuisine_top_10_ingredients.pdf
PDF created for western_european cuisine: western_european_cuisine_top_10_ingredients.pdf
PDF created for latin_american cuisine: latin_american_cuisine_top_10_ingredients.pdf
PDF created for north_american cuisine: north_american_cuisine_top_10_ingredients.pdf
PDF created for northern_european cuisine: northern_european_cuisine_top_10_ingredients.pdf
PDF created for african cuisine: african_cuisine_top_10_ingredients.pdf
PDF created for eastern_european cuisine: eastern_european_cuisine_top_10_ingredients.pdf
PDF created for east_asian cuisine: east_asian_cuisine_top_10_ingredients.pdf


In [67]:
ingr_small_copy.head(4)

Unnamed: 0,ingredient_name,compounds_present
2,almond,"[541, 72, 107, 195, 669, 871, 1064, 890, 568]"
3,angelica,"[673, 793, 974, 995]"
4,anise,"[673, 165, 94, 635, 700, 177, 215, 898, 101, 3..."
5,anise_seed,"[204, 885]"


In [68]:
comp_df.head(4)

Unnamed: 0,compound_id,compound_name,CAS_number
0,0,jasmone,488-10-8
1,1,5-methylhexanoic_acid,628-46-6
2,2,l-glutamine,56-85-9
3,3,1-methyl-3-methoxy-4-isopropylbenzene,1076-56-8


In [73]:
comp_df["compound_id"].nunique

<bound method IndexOpsMixin.nunique of 0          0
1          1
2          2
3          3
4          4
        ... 
1102    1102
1103    1103
1104    1104
1105    1105
1106    1106
Name: compound_id, Length: 1107, dtype: int64>

In [75]:


# Creare un dizionario di mappatura da compound_id a compound_name
comp_dict = dict(zip(comp_df['compound_id'], comp_df['compound_name']))

# Funzione per sostituire i numeri con i nomi
def replace_compounds(compound_list):
    return [comp_dict.get(compound, compound) for compound in compound_list]

# Applicare la funzione alla colonna compounds_present
ingr_small_copy['compounds_present'] = ingr_small_copy['compounds_present'].apply(replace_compounds)

# Visualizzare il DataFrame aggiornato
print(ingr_small_copy)



    ingredient_name                                  compounds_present
2            almond      [541, 72, 107, 195, 669, 871, 1064, 890, 568]
3          angelica                               [673, 793, 974, 995]
4             anise  [673, 165, 94, 635, 700, 177, 215, 898, 101, 3...
5        anise_seed                                         [204, 885]
6             apple  [278, 611, 724, 288, 577, 136, 700, 608, 1026,...
..              ...                                                ...
380            wood           [861, 568, 649, 37, 249, 1033, 51, 1086]
381             yam                                  [918, count_ones]
382           yeast                                    [151, 864, 966]
383          yogurt  [519, 599, 890, 918, 524, 995, 424, 684, 275, ...
384        zucchini                                   [13, count_ones]

[383 rows x 2 columns]


In [65]:
ingredients_by_region = ingredients_df_with_names.drop('country', axis=1).groupby('region').sum()



KeyError: 'region'

In [None]:
!pip install networkx

"pip" non � riconosciuto come comando interno o esterno,
 un programma eseguibile o un file batch.


In [None]:
ingredients_by_region.head(3)

Unnamed: 0_level_0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
african,44,0,2,1,6,0,23,0,0,1,...,0,2,12,4,4,0,2,9,7,25
east_asian,34,0,1,7,34,0,10,0,12,0,...,1,3,76,3,299,0,2,46,16,18
eastern_european,15,0,2,0,18,0,18,0,0,0,...,4,5,16,1,1,0,0,48,6,1


In [None]:
comp_df['compound_name'] = comp_df['compound_name'].astype(str)

In [None]:
comp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1107 entries, 0 to 1106
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   compound_id    1107 non-null   int64 
 1   compound_name  1107 non-null   object
 2   CAS_number     1107 non-null   object
dtypes: int64(1), object(2)
memory usage: 26.1+ KB


In [None]:
sparse_matrix

NameError: name 'sparse_matrix' is not defined

In [None]:
# Convert to dense vectors
dense_vectors = sparse_matrix.todense()

# Create a DataFrame for easy handling
dense_df = pd.DataFrame(dense_vectors)

In [None]:
# Function to get vectors from the dense matrix
def get_ingredient_vectors(dense_matrix):
    ingredient_vectors = dense_matrix.values  # Convert DataFrame to numpy array
    return ingredient_vectors

# Get ingredient vectors
ingredient_vectors = get_ingredient_vectors(dense_df)

ValueError: Expected a 1D array, got an array with shape (1525, 1107)

In [None]:
dense_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1097,1098,1099,1100,1101,1102,1103,1104,1105,1106
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1522,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
dense_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1515,1516,1517,1518,1519,1520,1521,1522,1523,1524
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1102,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
mother_matrix = dense_df.dot(dense_df.T)

In [None]:
mother_matrix = mother_matrix.astype(int)

In [None]:
mother_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1515,1516,1517,1518,1519,1520,1521,1522,1523,1524
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,41,1,0,29,1,0,...,1,0,0,0,28,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
1521,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1522,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1523,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,2,0,0,0,3,0


In [None]:
dense_df.dot(query_vector)

0       1.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
1520    0.0
1521    0.0
1522    0.0
1523    0.0
1524    0.0
Length: 1525, dtype: float64

In [None]:
ingredients_df.to_csv('data/ingredients_df.csv', index=False)

In [None]:
# Function to get column names where the row has 1s
def get_ingredients(row):
    return [re.findall(r'\d+', col)[0] for col in ingredients_df.columns if row[col] == 1]

# Apply the function to each row
ingredients_df['ingredient_list'] = ingredients_df.apply(get_ingredients, axis=1)

In [None]:
print(ingredients_df['ingredient_list'])

0        [256, 1216, 663, 658, 1044, 878, 136, 141, 243...
1                            [7, 663, 878, 136, 1447, 165]
2                                    [136, 243, 1183, 406]
3        [256, 1348, 512, 663, 658, 878, 164, 1199, 243...
4        [663, 1267, 1044, 878, 136, 1132, 243, 166, 11...
                               ...                        
57686    [698, 724, 1117, 1044, 848, 396, 1140, 1447, 2...
57687                [1338, 848, 221, 417, 915, 1336, 127]
57688    [1281, 427, 878, 136, 164, 1471, 205, 455, 130...
57689                     [19, 1237, 848, 164, 1012, 1179]
57690    [1281, 848, 396, 1447, 1471, 205, 1412, 1183, ...
Name: ingredient_list, Length: 57691, dtype: object


In [None]:
ingredients_df.head()

Unnamed: 0,region,country,ingr_18,ingr_725,ingr_707,ingr_395,ingr_1197,ingr_761,ingr_1120,ingr_190,...,ingr_703,ingr_1428,ingr_1278,ingr_84,ingr_1396,ingr_705,ingr_407,ingr_230,ingr_1080,ingredient_list
0,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[256, 1216, 663, 658, 1044, 878, 136, 141, 243..."
1,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[7, 663, 878, 136, 1447, 165]"
2,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[136, 243, 1183, 406]"
3,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[256, 1348, 512, 663, 658, 878, 164, 1199, 243..."
4,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[663, 1267, 1044, 878, 136, 1132, 243, 166, 11..."


In [None]:
ingredients_slised =ingredients_df[['country', 'region', 'ingredient_list']].copy()

In [None]:
ingredients_slised.head()

Unnamed: 0,country,region,ingredient_list
0,vietnamese,south_east_asian,"[256, 1216, 663, 658, 1044, 878, 136, 141, 243..."
1,vietnamese,south_east_asian,"[7, 663, 878, 136, 1447, 165]"
2,vietnamese,south_east_asian,"[136, 243, 1183, 406]"
3,vietnamese,south_east_asian,"[256, 1348, 512, 663, 658, 878, 164, 1199, 243..."
4,vietnamese,south_east_asian,"[663, 1267, 1044, 878, 136, 1132, 243, 166, 11..."


In [None]:
comp_per_ingr

Unnamed: 0,ingredient_id,compound_id
0,0,[995]
1,1,[921]
2,10,[715]
3,100,[1011]
4,1000,"[764, 275, 630, 1046, 285, 84, 482, 772, 686, ..."
...,...,...
1520,995,[996]
1521,996,[894]
1522,997,[828]
1523,998,"[25, 317, 86]"


In [None]:
#Function to get compound list for each ingredient in the ingredient list
def add_compound_to_recipe(row):
    compound_list = []
    for ingredient in row['ingredient_list']:
        compounds = comp_per_ingr.loc[comp_per_ingr['ingredient_id'] == ingredient, 'compound_id']
        if not compounds.empty:
            compound_list.extend(compounds.values[0])
    return compound_list

# Apply the function to each row
ingredients_slised['compound_list'] = ingredients_slised.apply(add_compound_to_recipe, axis=1)

In [None]:
ingredients_slised

Unnamed: 0,country,region,ingredient_list,compound_list
0,vietnamese,south_east_asian,"[256, 1216, 663, 658, 1044, 878, 136, 141, 243...","[347, 847, 700, 75, 278, 767, 283, 295, 442, 7..."
1,vietnamese,south_east_asian,"[7, 663, 878, 136, 1447, 165]","[273, 971, 348, 628, 767, 704, 79, 965, 361, 4..."
2,vietnamese,south_east_asian,"[136, 243, 1183, 406]","[273, 827, 388, 175, 798, 392, 599, 1035, 429,..."
3,vietnamese,south_east_asian,"[256, 1348, 512, 663, 658, 878, 164, 1199, 243...","[347, 847, 700, 75, 278, 767, 283, 295, 442, 7..."
4,vietnamese,south_east_asian,"[663, 1267, 1044, 878, 136, 1132, 243, 166, 11...","[273, 971, 348, 628, 767, 704, 620, 79, 965, 3..."
...,...,...,...,...
57686,japanese,east_asian,"[698, 724, 1117, 1044, 848, 396, 1140, 1447, 2...","[558, 423, 766, 282, 426, 1035, 633, 976, 96, ..."
57687,japanese,east_asian,"[1338, 848, 221, 417, 915, 1336, 127]","[272, 965, 827, 275, 424, 400, 704, 427, 284, ..."
57688,japanese,east_asian,"[1281, 427, 878, 136, 164, 1471, 205, 455, 130...","[772, 526, 157, 72, 423, 195, 1025, 282, 990, ..."
57689,japanese,east_asian,"[19, 1237, 848, 164, 1012, 1179]","[272, 275, 284, 285, 568, 292, 837, 841, 1, 19..."


In [None]:
ingredients_df.head()

Unnamed: 0,region,country,ingr_18,ingr_725,ingr_707,ingr_395,ingr_1197,ingr_761,ingr_1120,ingr_190,...,ingr_703,ingr_1428,ingr_1278,ingr_84,ingr_1396,ingr_705,ingr_407,ingr_230,ingr_1080,ingredient_list
0,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[256, 1216, 663, 658, 1044, 878, 136, 141, 243, 166, 998, 205, 165, 249, 694, 1012]"
1,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[7, 663, 878, 136, 1447, 165]"
2,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[136, 243, 1183, 406]"
3,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[256, 1348, 512, 663, 658, 878, 164, 1199, 243, 166, 1088, 205, 258, 1305, 1113]"
4,south_east_asian,vietnamese,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[663, 1267, 1044, 878, 136, 1132, 243, 166, 1160, 205, 915, 694, 1012]"
