In [46]:
# Combine the recipe and flavor-network datasets to build a per-cuisine PDF report of ingredients and their aromatic compounds

import ast
import json
import re
import time
from collections import defaultdict
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

In [47]:
# Recipe table; read_csv already returns a DataFrame, so it is used directly.
ingredients_df = pd.read_csv('data/recipes.csv')

In [48]:
# Locations of the three tab-separated ingredient/compound tables
comp_path = 'data/flavor_network_data/ingr_comp/comp_info.tsv'
ingr_path = 'data/flavor_network_data/ingr_comp/ingr_info.tsv'
ingr_comp_pathh = 'data/flavor_network_data/ingr_comp/ingr_comp.tsv'

# Column labels that replace the headers found in the files
comp_columns = ['compound_id', 'compound_name', 'CAS_number']
ingr_columns = ['ingredient_id', 'ingredient_name', 'ingredient_category']

# Compound table
comp_tsv = pd.read_csv(comp_path, sep='\t')
comp_df = pd.DataFrame(comp_tsv)
comp_df.columns = comp_columns

# Ingredient table
ingr_tsv = pd.read_csv(ingr_path, sep='\t')
ingr_df = pd.DataFrame(ingr_tsv)
ingr_df.columns = ingr_columns

# Ingredient-to-compound link table (wrapped and renamed in the following cells)
ingr_comp_tsv = pd.read_csv(ingr_comp_pathh, sep='\t')


In [49]:
# Working frame for the ingredient-to-compound links
ingr_comp_df = pd.DataFrame(ingr_comp_tsv)

In [50]:
# Inspect the raw column labels of the link table before they are renamed
ingr_comp_df.columns

Index(['# ingredient id', 'compound id'], dtype='object')

In [51]:
# Replace the file's header labels with snake_case names used by the rest of the notebook
_link_column_names = {
    '# ingredient id': 'ingredient_id',
    'compound id': 'compound_id'
}
ingr_comp_df = ingr_comp_df.rename(columns=_link_column_names)

In [52]:
# Locations of the backbone edge and node tables (tab-separated)
flav_edges_path = 'data/flavor_network_data/flavor_network_backbone/flavor_network_backbone_edges.tsv'
flav_nodes_path = 'data/flavor_network_data/flavor_network_backbone/flavor_network_backbone_nodes.tsv'

# Column labels that replace the headers found in the files
flav_edges_columns = ['ingredient_1', 'ingredient_2', 'number_of_shared_compounds']
flav_nodes_columns = ['ingredient_name', 'x_coordinate', 'y_coordinate', 'prevalence', 'r', 'g', 'b']

# Edges: one row per ingredient pair
flav_edges_tsv = pd.read_csv(flav_edges_path, sep='\t')
flav_edges_df = pd.DataFrame(flav_edges_tsv)
flav_edges_df.columns = flav_edges_columns

# Nodes: one row per ingredient
flav_nodes_tsv = pd.read_csv(flav_nodes_path, sep='\t')
flav_nodes_df = pd.DataFrame(flav_nodes_tsv)
flav_nodes_df.columns = flav_nodes_columns

In [53]:
def replace_spaces(value):
    """Return `value` with every space replaced by an underscore.

    Anything that is not a `str` (numbers, None, NaN, ...) is returned unchanged.
    """
    return value.replace(' ', '_') if isinstance(value, str) else value

In [54]:
# Apply the underscore convention to every name/category column of the flavor-network tables
for _frame, _columns in (
    (ingr_df, ('ingredient_name', 'ingredient_category')),
    (flav_edges_df, ('ingredient_1', 'ingredient_2')),
    (flav_nodes_df, ('ingredient_name',)),
):
    for _column in _columns:
        _frame[_column] = _frame[_column].apply(replace_spaces)

In [55]:

# Raw country/cuisine label -> region name (keys are case-sensitive here)
data = {
    'Canada': 'NorthAmerican',
    'Turkey': 'MiddleEastern',
    'east_asian': 'EastAsian',
    'Caribbean': 'LatinAmerican',
    'Bangladesh': 'SouthAsian',
    'chinese': 'EastAsian',
    'mexico': 'LatinAmerican',
    'Lebanon': 'MiddleEastern',
    'japanese': 'EastAsian',
    'North-African': 'African',
    'MiddleEastern': 'MiddleEastern',
    'Indian': 'SouthAsian',
    'asian': 'EastAsian',
    'Italy': 'SouthernEuropean',
    'EasternEuropean_Russian': 'EasternEuropean',
    'Israel': 'MiddleEastern',
    'Korea': 'EastAsian',
    'Iran': 'MiddleEastern',
    'Eastern-Europe': 'EasternEuropean',
    'Jewish': 'MiddleEastern',
    'South-African': 'African',
    'Vietnamese': 'SoutheastAsian',
    'UK-and-Ireland': 'WesternEuropean',
    'French': 'WesternEuropean',
    'Mediterranean': 'SouthernEuropean',
    'Central_SouthAmerican': 'LatinAmerican',
    'Cajun_Creole': 'NorthAmerican',
    'Belgium': 'WesternEuropean',
    'China': 'EastAsian',
    'korean': 'EastAsian',
    'Germany': 'WesternEuropean',
    'South-America': 'LatinAmerican',
    'Spain': 'SouthernEuropean',
    'Netherlands': 'WesternEuropean',
    'Scandinavia': 'NorthernEuropean',
    'Philippines': 'SoutheastAsian',
    'Indonesia': 'SoutheastAsian',
    'East-African': 'African',
    'Scandinavian': 'NorthernEuropean',
    'Greek': 'SouthernEuropean',
    'American': 'NorthAmerican',
    'Vietnam': 'SoutheastAsian',
    'western': 'WesternEuropean',
    'African': 'African',
    'Switzerland': 'WesternEuropean',
    'West-African': 'African',
    'France': 'WesternEuropean',
    'Thai': 'SoutheastAsian',
    'Thailand': 'SoutheastAsian',
    'Italian': 'SouthernEuropean',
    'Pakistan': 'SouthAsian',
    'Irish': 'WesternEuropean',
    'Mexican': 'LatinAmerican',
    'Portugal': 'SouthernEuropean',
    'Chinese': 'EastAsian',
    'Mexico': 'LatinAmerican',
    'German': 'WesternEuropean',
    'Spanish_Portuguese': 'SouthernEuropean',
    'India': 'SouthAsian',
    'Japanese': 'EastAsian',
    'Moroccan': 'African',
    'Southern_SoulFood': 'NorthAmerican',
    'Malaysia': 'SoutheastAsian',
    'Austria': 'WesternEuropean',
    'English_Scottish': 'WesternEuropean',
    'Asian': 'EastAsian',
    'Southwestern': 'NorthAmerican',
    'Japan': 'EastAsian',
    'italian': 'SouthernEuropean',
    'canadian': 'NorthAmerican',
    'eastern_european_russian': 'EasternEuropean',
    'southern_soul_food': 'NorthAmerican',
    'middle_eastern': 'MiddleEastern',
    'central_south_american': 'LatinAmerican',
    'spanish': 'SouthernEuropean',
    'north_african': 'African',
    'portuguese': 'SouthernEuropean',
    'filipino': 'SoutheastAsian',
    'dutch': 'WesternEuropean',
    'iranian': 'MiddleEastern',
    'austrian': 'WesternEuropean',
    'swiss': 'WesternEuropean',
    'pakistani': 'SouthAsian',
    'malaysian': 'SoutheastAsian',
    'south_african': 'African',
    'west_african': 'African',
    'indonesian': 'SoutheastAsian',
    'belgian': 'WesternEuropean',
    'east_african': 'African',
    'israeli': 'MiddleEastern',
    'bangladeshi': 'SouthAsian'
}

# Two-column table (country, region) in the same order as the dictionary
mapping = pd.DataFrame({'country': list(data.keys()), 'region': list(data.values())})

# Trim stray whitespace from both columns
for _label in ('country', 'region'):
    mapping[_label] = mapping[_label].str.strip()

# Distinct values of each column, in order of first appearance
unique_regions = mapping['region'].unique()
unique_countries = mapping['country'].unique()

# One row per region with the list of country labels assigned to it
_countries_by_region = mapping.groupby('region')['country']
regions_countries = _countries_by_region.apply(list).reset_index()






In [56]:
# Canonical cuisine name -> accepted spellings (adjectival forms, country names, variants)
cuisine_mapping = {
    'vietnamese': ['vietnamese', 'vietnam'],
    'indian': ['indian', 'india'],
    'spanish_portuguese': ['spanish_portuguese'],
    'jewish': ['jewish'],
    'french': ['french', 'france'],
    'central_south_american': ['central_southamerican'],
    'cajun_creole': ['cajun_creole'],
    'thai': ['thai', 'thailand'],
    'scandinavian': ['scandinavian', 'scandinavia'],
    'greek': ['greek'],
    'american': ['american'],
    'african': ['african'],
    'middle_eastern': ['middleeastern', 'middle_eastern', 'turkey', 'iran', 'israel', 'lebanon'],
    'eastern_european_russian': ['easterneuropean_russian', 'eastern-europe', 'russia'],
    'italian': ['italian', 'italy'],
    'irish': ['irish', 'ireland'],
    'mexican': ['mexican', 'mexico'],
    'chinese': ['chinese', 'china'],
    'german': ['german', 'germany'],
    'mediterranean': ['mediterranean'],
    'japanese': ['japanese', 'japan'],
    'moroccan': ['moroccan'],
    'southern_soul_food': ['southern_soulfood'],
    'english_scottish': ['english_scottish', 'uk-and-ireland', 'england', 'scotland'],
    'asian': ['asian'],
    'southwestern': ['southwestern'],
    'east_asian': ['east_asian'],
    'western': ['western'],
    'korean': ['korean', 'korea'],
    'canadian': ['canada'],
    'caribbean': ['caribbean'],
    'bangladeshi': ['bangladesh'],
    'israeli': ['israel'],
    'iranian': ['iran'],
    'south_african': ['south-african'],
    'belgian': ['belgium'],
    'spanish': ['spain'],
    'dutch': ['netherlands'],
    'filipino': ['philippines'],
    'indonesian': ['indonesia'],
    'east_african': ['east-african'],
    'swiss': ['switzerland'],
    'west_african': ['west-african'],
    'north_african': ['north-african'],
    'pakistani': ['pakistan'],
    'portuguese': ['portugal'],
    'malaysian': ['malaysia'],
    'austrian': ['austria']
}

# Inverted view: spelling -> canonical cuisine.
# 'iran' and 'israel' are listed under two cuisines; the later entry ('iranian' / 'israeli') wins.
cuisine_lookup = dict(
    (alias, canonical)
    for canonical, alias_list in cuisine_mapping.items()
    for alias in alias_list
)



# Lower-cased, underscore-separated country label -> lower-cased region name
_country_keys = mapping['country'].str.lower().str.replace(' ', '_')
_region_values = mapping['region'].str.lower().str.replace(' ', '_')
country_to_region = dict(zip(_country_keys, _region_values))

def format_region_name(region):
    """Turn a run-together region name into its underscore-separated form.

    Missing values (as judged by `pd.isna`) become 'unknown'. Otherwise each known
    run-together fragment is replaced, in the order listed, wherever it occurs in
    the string; text that matches no fragment is returned unchanged.
    """
    if pd.isna(region):
        return 'unknown'

    # Order matters: 'southeastasian' must be handled before 'eastasian'.
    replacements = (
        ('southeastasian', 'south_east_asian'),
        ('southasian', 'south_asian'),
        ('southerneuropean', 'southern_european'),
        ('middleeastern', 'middle_eastern'),
        ('westerneuropean', 'western_european'),
        ('latinamerican', 'latin_american'),
        ('northamerican', 'north_american'),
        ('northerneuropean', 'northern_european'),
        ('easterneuropean', 'eastern_european'),
        ('eastasian', 'east_asian'),
    )
    formatted_region = region
    for compact, separated in replacements:
        formatted_region = formatted_region.replace(compact, separated)
    return formatted_region

def standardize_and_map_region(country):
    """Return `(standardized_country, region)` for one raw country/cuisine label.

    Missing labels give ('unknown', 'unknown'). Otherwise the label is lower-cased
    and its spaces become underscores, it is translated through `cuisine_lookup`
    (falling back to the normalised label itself), and the region is read from
    `country_to_region` ('unknown' when absent) and passed through
    `format_region_name`.
    """
    if pd.isna(country):
        return 'unknown', 'unknown'

    normalised = str(country).lower().replace(' ', '_')
    canonical = cuisine_lookup.get(normalised, normalised)
    return canonical, format_region_name(country_to_region.get(canonical, 'unknown'))


# Standardize every country label and look up its region, then store both as columns
_standardized = pd.DataFrame(
    [standardize_and_map_region(raw_label) for raw_label in ingredients_df['country']],
    index=ingredients_df.index,
    columns=['country', 'region'],
)
ingredients_df[['country', 'region']] = _standardized

# Put 'region' first and keep every other column in its existing order
columns = ['region'] + [col for col in ingredients_df if col != 'region']
ingredients_df = ingredients_df[columns]


In [57]:
# Encode the 'Yes'/'No' cells as 1/0
ingredients_df = ingredients_df.replace({'Yes': 1, 'No': 0})

In [58]:
# Start the ingredient lookup from a copy of the recipe table's column labels
# ('region' and 'country' are still included here and are dropped further down)
ingr_small = ingredients_df.columns.copy()

In [59]:
# One-column DataFrame holding the labels collected above
ingr_small = pd.DataFrame(ingr_small)

In [60]:
# Name the single column that holds the former column labels
ingr_small.columns = ['ingredient_name']

# Drop the two metadata labels. The explicit .copy() makes the result an independent
# frame, so adding and overwriting columns below and in later cells does not raise
# SettingWithCopyWarning.
ingr_small = ingr_small[~ingr_small['ingredient_name'].isin(['region', 'country'])].copy()

# Placeholder IDs 1..n; a later cell swaps in the flavor-network IDs where names match
ingr_small.insert(0, 'ingredient_id', range(1, 1 + len(ingr_small)))

# Print the cleaned DataFrame
print(ingr_small)

     ingredient_id ingredient_name
2                1          almond
3                2        angelica
4                3           anise
5                4      anise_seed
6                5           apple
..             ...             ...
380            379            wood
381            380             yam
382            381           yeast
383            382          yogurt
384            383        zucchini

[383 rows x 2 columns]


In [61]:
# Same underscore convention as the flavor-network tables, so names can be matched
ingr_small['ingredient_name'] = ingr_small['ingredient_name'].map(replace_spaces)

In [62]:
def replace_ingredient_id(ingr_small, ingr_df):
    """Return `ingr_small` with IDs taken from `ingr_df` wherever the names match.

    Args:
        ingr_small: frame with 'ingredient_id' and 'ingredient_name' columns.
        ingr_df: reference frame with 'ingredient_id' and 'ingredient_name' columns.

    Returns:
        The row-wise transformed frame. Rows whose name is absent from `ingr_df`
        keep the ID they already had.
    """
    # name -> reference ID; a name repeated in ingr_df maps to its last ID
    id_by_name = pd.Series(ingr_df.ingredient_id.values, index=ingr_df.ingredient_name).to_dict()

    def _with_reference_id(record):
        # Fall back to the record's current ID when the name is unknown
        record['ingredient_id'] = id_by_name.get(record['ingredient_name'], record['ingredient_id'])
        return record

    return ingr_small.apply(_with_reference_id, axis=1)

# Swap the placeholder IDs for the IDs found in ingr_df wherever the ingredient name matches
ingr_small = replace_ingredient_id(ingr_small, ingr_df)



In [63]:
# Keep a copy of the name/ID table as it stands here, before compound columns are added to ingr_small
ingr_small_def = ingr_small.copy()

In [64]:
# Independent copy of the ingredient-to-compound link table
ingr_comp_merge = ingr_comp_df.copy()

In [65]:
# Store compound IDs as strings in the copy
ingr_comp_merge['compound_id'] = ingr_comp_merge['compound_id'].astype(str)

In [66]:
# Use string IDs throughout so ingredient IDs compare equal across tables and
# compound IDs can serve as column labels
ingr_small['ingredient_id'] = ingr_small['ingredient_id'].astype(str)
ingr_comp_df['ingredient_id'] = ingr_comp_df['ingredient_id'].astype(str)
ingr_comp_df['compound_id'] = ingr_comp_df['compound_id'].astype(str)

# Convert column names in ingr_small to strings
ingr_small.columns = ingr_small.columns.astype(str)

# Get the unique compound_ids in ingr_comp_df (order of first appearance)
compound_id_list = ingr_comp_df['compound_id'].unique()

# Keep only links whose ingredient occurs in ingr_small, then pivot them into an
# ingredient x compound table in one step. This replaces a per-link loop that
# rescanned ingr_small for every row of ingr_comp_df.
known_links = ingr_comp_df[ingr_comp_df['ingredient_id'].isin(ingr_small['ingredient_id'])]
presence = (
    pd.crosstab(known_links['ingredient_id'], known_links['compound_id'])
    .clip(upper=1)  # repeated links still count as a single 1
    .reindex(index=ingr_small['ingredient_id'], columns=compound_id_list, fill_value=0)
    .astype('int64')
)

# One 0/1 column per compound, aligned row-by-row with ingr_small
new_columns_df = pd.DataFrame(presence.to_numpy(), index=ingr_small.index, columns=compound_id_list)

# Concatenate the new columns to ingr_small
ingr_small = pd.concat([ingr_small, new_columns_df], axis=1)



In [67]:
# Every column of ingr_small except the two identifier columns
compound_id_cols = [
    label
    for label in ingr_small.columns
    if label != 'ingredient_id' and label != 'ingredient_name'
]

In [68]:
# Row-wise total over the columns listed in compound_id_cols, i.e. how many of them hold a 1 per ingredient
ingr_small['count_ones'] = ingr_small[compound_id_cols].sum(axis=1)

In [69]:
# Collect the links into lists in both directions:
# compounds per ingredient, and ingredients per compound
_compounds_grouped = ingr_comp_df.groupby('ingredient_id')['compound_id']
comp_per_ingr = _compounds_grouped.apply(list).reset_index()

_ingredients_grouped = ingr_comp_df.groupby('compound_id')['ingredient_id']
ingr_per_comp = _ingredients_grouped.apply(list).reset_index()

In [70]:
# Order by compound_id, descending
ingr_per_comp = ingr_per_comp.sort_values(by='compound_id', ascending=False)

In [71]:
# Make sure ingredient names are plain strings before they are used as lookup keys
ingr_small['ingredient_name'] = ingr_small['ingredient_name'].astype(str)

In [72]:
# Copy of the recipe table's current column labels
ingredient_cols = ingredients_df.columns.copy()

In [73]:
# Begin the name/ID mapping table with the ingredient names
ingr_small_mapping = ingr_small['ingredient_name']

In [74]:
# Add the ingredient_id column next to the names, giving a two-column name/ID table
ingr_small_mapping = pd.concat([ingr_small_mapping, ingr_small['ingredient_id']], axis=1)

In [75]:
# Ensure the recipe column labels are strings so they can be looked up by name below
ingredients_df.columns = ingredients_df.columns.astype(str)

In [76]:
# Name -> ID lookup; if a name occurs more than once, the last row wins
ingredient_name_to_id = dict(zip(ingr_small_mapping['ingredient_name'], ingr_small_mapping['ingredient_id']))

In [77]:
# Build the new labels: 'region' and 'country' stay as they are, every other column
# becomes 'ingr_<id>' (or 'ingr_<name>' when the name has no ID in the lookup)
new_columns = [
    col if col in ('region', 'country') else f'ingr_{ingredient_name_to_id.get(col, col)}'
    for col in ingredients_df.columns
]


In [78]:
# Apply the 'ingr_<id>' labels built in new_columns
ingredients_df.columns = new_columns

In [79]:
# Cast the labels to str once more as a safeguard
ingredients_df.columns = ingredients_df.columns.astype(str)

In [80]:
# Sanity check: which country labels failed to map to a region?
_is_unknown_region = ingredients_df['region'] == 'unknown'
unknown_countries = ingredients_df.loc[_is_unknown_region, 'country'].value_counts()

# Print the results
print("Countries associated with the 'unknown' region:")
print(unknown_countries)

Countries associated with the 'unknown' region:
Series([], Name: count, dtype: int64)


In [81]:
# Reverse lookup ID -> name; if two names share an ID, the name seen last wins
ingredient_id_to_name = {v: k for k, v in ingredient_name_to_id.items()}

In [82]:
# Copy of the recipe table whose ingredient columns carry names instead of 'ingr_<id>' labels
ingredients_df_with_names = ingredients_df.copy()

new_columns_with_names = []
for label in ingredients_df.columns:
    if label not in ('region', 'country'):
        # 'ingr_18' -> '18' -> ingredient name (the bare ID is kept when it has no name)
        bare_id = label.replace('ingr_', '')
        label = ingredient_id_to_name.get(bare_id, bare_id)
    new_columns_with_names.append(label)

ingredients_df_with_names.columns = new_columns_with_names


In [83]:
ingr_small_copy = ingr_small.copy()

# Select the compound columns by name rather than by position. A plain "everything
# after the first two columns" slice also picks up the derived 'count_ones' total,
# which would then be reported as a compound for any ingredient with exactly one compound.
_is_compound_column = ~ingr_small_copy.columns.isin(['ingredient_id', 'ingredient_name', 'count_ones'])
compound_columns = ingr_small_copy.columns[_is_compound_column]

# For every ingredient, list the labels of the compound columns that hold a 1
_presence_mask = ingr_small_copy.loc[:, _is_compound_column].to_numpy() == 1
ingr_small_copy['compounds_present'] = pd.Series(
    [list(compound_columns[row_mask]) for row_mask in _presence_mask],
    index=ingr_small_copy.index,
    dtype=object,
)



In [84]:
# Keep only the name and its compound list. The explicit .copy() makes this an
# independent frame, so assigning to 'compounds_present' in later cells does not
# raise SettingWithCopyWarning.
columns_to_keep = ['ingredient_name', 'compounds_present']
ingr_small_copy = ingr_small_copy[columns_to_keep].copy()



In [85]:
import pandas as pd
from collections import defaultdict
import ast

def safe_literal_eval(val):
    """Parse `val` as a Python literal with `ast.literal_eval`.

    If parsing raises ValueError or SyntaxError, `val` itself is returned unchanged.
    """
    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError):
        return val
    return parsed

# Parse any string-encoded compound lists; values that cannot be parsed are left as they are
ingr_small_copy['compounds_present'] = ingr_small_copy['compounds_present'].apply(safe_literal_eval)

# Step 1: a single pass over the recipes collects, per region,
#   - the number of recipes,
#   - how many recipes use each ingredient,
#   - how many recipes contain each ingredient pair and triplet.
recipes_per_region = defaultdict(int)
ingredient_prevalence = defaultdict(lambda: defaultdict(int))
pair_prevalence = defaultdict(lambda: defaultdict(int))
triplet_prevalence = defaultdict(lambda: defaultdict(int))

for index, row in ingredients_df_with_names.iterrows():
    region = row['region']
    ingredients = row.drop(['region', 'country'])
    present_ingredients = ingredients.index[ingredients == 1]

    recipes_per_region[region] += 1
    for ingredient in present_ingredients:
        ingredient_prevalence[region][ingredient] += 1
    # combinations() yields the tuples in column order, each unordered combination once
    for pair in combinations(present_ingredients, 2):
        pair_prevalence[region][pair] += 1
    for triplet in combinations(present_ingredients, 3):
        triplet_prevalence[region][triplet] += 1

# Convert ingredient prevalence to DataFrame (rows: ingredients, columns: regions)
ingredient_prevalence_df = pd.DataFrame(ingredient_prevalence).fillna(0)

# Step 2: Calculate relative prevalence (authenticity)
# Nc is the number of recipes per cuisine. Summing the ingredient counts instead
# would count a recipe once for every ingredient it contains.
Nc = pd.Series(dict(recipes_per_region)).reindex(ingredient_prevalence_df.columns)

# Share of each cuisine's recipes that use each ingredient
prevalence_by_cuisine = ingredient_prevalence_df.div(Nc, axis='columns')

# Mean share of each ingredient across all cuisines. This is a fraction, so it is on the
# same scale as the per-cuisine shares it is subtracted from.
Pc0 = prevalence_by_cuisine.mean(axis=1)

relative_prevalence = prevalence_by_cuisine.sub(Pc0, axis='index')

# Step 3: Convert pair and triplet counts to DataFrames
pair_prevalence_df = pd.DataFrame(pair_prevalence).fillna(0)
triplet_prevalence_df = pd.DataFrame(triplet_prevalence).fillna(0)

# Show all three previews; a bare expression would only render the last one
display(relative_prevalence.head(), pair_prevalence_df.head(), triplet_prevalence_df.head())




Unnamed: 0,Unnamed: 1,Unnamed: 2,south_east_asian,south_asian,southern_european,middle_eastern,western_european,latin_american,north_american,northern_european,african,eastern_european,east_asian
basil,carrot,cayenne,9.0,0.0,4.0,0.0,2.0,5.0,32.0,0.0,0.0,1.0,10.0
basil,carrot,cilantro,9.0,1.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,10.0
basil,carrot,cucumber,6.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,5.0
basil,carrot,fish,9.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,9.0
basil,carrot,garlic,10.0,2.0,52.0,0.0,16.0,7.0,173.0,0.0,1.0,1.0,11.0


In [86]:
# Five most frequent ingredient pairs per region
top_pairs = {
    cuisine: pair_prevalence_df[cuisine].sort_values(ascending=False).head(5)
    for cuisine in pair_prevalence_df.columns
}

# Print one "pair: count" line per entry, with a blank line after each region
for cuisine, pairs in top_pairs.items():
    print(f"Top 5 pairs for {cuisine}:")
    for pair, count in pairs.items():
        print(f"{pair}: {count}")
    print()


Top 5 pairs for south_east_asian:
('fish', 'garlic'): 148.0
('cayenne', 'garlic'): 135.0
('garlic', 'vegetable_oil'): 132.0
('garlic', 'soy_sauce'): 122.0
('cayenne', 'fish'): 121.0

Top 5 pairs for south_asian:
('cumin', 'turmeric'): 271.0
('coriander', 'cumin'): 265.0
('coriander', 'turmeric'): 246.0
('cumin', 'onion'): 225.0
('onion', 'turmeric'): 218.0

Top 5 pairs for southern_european:
('garlic', 'olive_oil'): 1760.0
('olive_oil', 'tomato'): 1250.0
('garlic', 'tomato'): 1196.0
('olive_oil', 'onion'): 1063.0
('garlic', 'onion'): 962.0

Top 5 pairs for middle_eastern:
('egg', 'wheat'): 170.0
('garlic', 'olive_oil'): 146.0
('olive_oil', 'onion'): 128.0
('butter', 'wheat'): 105.0
('garlic', 'onion'): 105.0

Top 5 pairs for western_european:
('butter', 'wheat'): 947.0
('egg', 'wheat'): 932.0
('butter', 'egg'): 817.0
('egg', 'milk'): 509.0
('milk', 'wheat'): 503.0

Top 5 pairs for latin_american:
('cayenne', 'onion'): 1496.0
('garlic', 'onion'): 1456.0
('cayenne', 'garlic'): 1413.0
('o

In [87]:
# Five most frequent ingredient triplets per region
top_triplets = {
    cuisine: triplet_prevalence_df[cuisine].sort_values(ascending=False).head(5)
    for cuisine in triplet_prevalence_df.columns
}

# Print one "triplet: count" line per entry, with a blank line after each region
for cuisine, triplets in top_triplets.items():
    print(f"Top 5 triplets for {cuisine}:")
    for triplet, count in triplets.items():
        print(f"{triplet}: {count}")
    print()

Top 5 triplets for south_east_asian:
('cayenne', 'fish', 'garlic'): 86.0
('coconut', 'coriander', 'cumin'): 76.0
('coriander', 'cumin', 'turmeric'): 73.0
('coriander', 'cumin', 'pepper'): 72.0
('cilantro', 'fish', 'garlic'): 72.0

Top 5 triplets for south_asian:
('coriander', 'cumin', 'turmeric'): 231.0
('cumin', 'onion', 'turmeric'): 184.0
('coriander', 'cumin', 'onion'): 174.0
('cumin', 'pepper', 'turmeric'): 170.0
('coriander', 'fenugreek', 'turmeric'): 169.0

Top 5 triplets for southern_european:
('garlic', 'olive_oil', 'tomato'): 982.0
('garlic', 'olive_oil', 'onion'): 736.0
('basil', 'garlic', 'olive_oil'): 688.0
('garlic', 'onion', 'tomato'): 669.0
('olive_oil', 'onion', 'tomato'): 664.0

Top 5 triplets for middle_eastern:
('garlic', 'olive_oil', 'onion'): 77.0
('butter', 'egg', 'wheat'): 76.0
('egg', 'vegetable_oil', 'wheat'): 68.0
('garlic', 'lemon_juice', 'olive_oil'): 56.0
('cumin', 'garlic', 'olive_oil'): 55.0

Top 5 triplets for western_european:
('butter', 'egg', 'wheat')

In [88]:
def _compounds_by_ingredient(ingredient_table):
    """Map each ingredient name to its compound list (the first row wins for repeated names)."""
    lookup = {}
    for name, compounds in zip(ingredient_table['ingredient_name'], ingredient_table['compounds_present']):
        lookup.setdefault(name, compounds)
    return lookup


def _count_compounds_in_combinations(prevalence_df, compounds_lookup):
    """Count, per cuisine, how often each compound appears across observed combinations.

    Args:
        prevalence_df: rows are ingredient tuples (pairs or triplets), columns are cuisines,
            values are how many recipes contain that combination.
        compounds_lookup: ingredient name -> iterable of compounds.

    Returns:
        Nested defaultdict `counts[cuisine][compound]`. Every combination with a positive
        value contributes 1 per compound of each of its ingredients, regardless of how
        large the value is. Ingredients missing from `compounds_lookup` contribute nothing.
    """
    counts = defaultdict(lambda: defaultdict(int))
    for cuisine in prevalence_df.columns:
        # Only the combinations that occur at least once in this cuisine
        observed = prevalence_df.index[(prevalence_df[cuisine] > 0).to_numpy()]
        for combination in observed:
            for ingredient in combination:
                for compound in compounds_lookup.get(ingredient, ()):
                    counts[cuisine][compound] += 1
    return counts


# Build the name -> compounds lookup once instead of scanning ingr_small_copy for every ingredient
_compound_lookup = _compounds_by_ingredient(ingr_small_copy)

# Compound prevalence in pairs and triplets per region or cuisine
pair_compound_prevalence = _count_compounds_in_combinations(pair_prevalence_df, _compound_lookup)
triplet_compound_prevalence = _count_compounds_in_combinations(triplet_prevalence_df, _compound_lookup)

# Convert defaultdicts to DataFrames
pair_compound_prevalence_df = pd.DataFrame(pair_compound_prevalence).fillna(0)
triplet_compound_prevalence_df = pd.DataFrame(triplet_compound_prevalence).fillna(0)

# Find top compounds_present for each region or cuisine in pairs and triplets
top_compounds_per_pair = {
    cuisine: pair_compound_prevalence_df[cuisine].sort_values(ascending=False).head(5)
    for cuisine in pair_compound_prevalence_df.columns
}
top_compounds_per_triplet = {
    cuisine: triplet_compound_prevalence_df[cuisine].sort_values(ascending=False).head(5)
    for cuisine in triplet_compound_prevalence_df.columns
}

# Display or further analyze the results
for cuisine, compounds_pair in top_compounds_per_pair.items():
    print(f"Top 5 compounds in pairs for {cuisine}:")
    print(compounds_pair)
    print()

for cuisine, compounds_triplet in top_compounds_per_triplet.items():
    print(f"Top 5 compounds in triplets for {cuisine}:")
    print(compounds_triplet)
    print()

Top 5 compounds in pairs for south_east_asian:
704    4325.0
798    4273.0
554    4221.0
524    4107.0
292    4027.0
Name: south_east_asian, dtype: float64

Top 5 compounds in pairs for south_asian:
704    5045.0
798    4653.0
554    4447.0
292    4361.0
524    4171.0
Name: south_asian, dtype: float64

Top 5 compounds in pairs for southern_european:
554    11176.0
798    11099.0
292    10777.0
136    10106.0
734     9990.0
Name: southern_european, dtype: float64

Top 5 compounds in pairs for middle_eastern:
704    4867.0
798    4650.0
292    4552.0
554    4517.0
734    4137.0
Name: middle_eastern, dtype: float64

Top 5 compounds in pairs for western_european:
798    11063.0
554    10790.0
292    10518.0
136    10199.0
524     9800.0
Name: western_european, dtype: float64

Top 5 compounds in pairs for latin_american:
798    8780.0
554    8467.0
292    8446.0
524    7961.0
734    7850.0
Name: latin_american, dtype: float64

Top 5 compounds in pairs for north_american:
798    23999.0
292 

In [89]:
!pip install fpdf




In [90]:
import pandas as pd
from collections import defaultdict
import ast
from fpdf import FPDF

def safe_literal_eval(val):
    """Evaluate `val` as a Python literal; return `val` as-is if it cannot be parsed.

    Only ValueError and SyntaxError from `ast.literal_eval` are treated as "cannot be parsed".
    """
    try:
        result = ast.literal_eval(val)
    except (ValueError, SyntaxError):
        result = val
    return result

# Parse any string-encoded compound lists; values that cannot be parsed are left as they are
ingr_small_copy['compounds_present'] = ingr_small_copy['compounds_present'].map(safe_literal_eval)

class PDF(FPDF):
    """FPDF document for one cuisine report; the cuisine name appears in the header line."""

    def __init__(self, cuisine_name):
        FPDF.__init__(self)
        # Read by header() when it writes the title line
        self.cuisine_name = cuisine_name

    def header(self):
        """Write the centred, bold '<cuisine> Cuisine Report' line.

        NOTE(review): presumably invoked by FPDF itself for each new page — confirm in the FPDF docs.
        """
        report_title = f'{self.cuisine_name} Cuisine Report'
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, report_title, 0, 1, 'C')

    def chapter_title(self, title):
        """Write `title` as a bold, left-aligned line followed by a 4-unit break."""
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, title, 0, 1, 'L')
        self.ln(4)

    def chapter_body(self, body):
        """Write `body` in the regular font as a multi-line cell, then a line break."""
        self.set_font('Arial', '', 12)
        self.multi_cell(0, 10, body)
        self.ln()

def calculate_ingredients_with_value_1(df):
    """Collect, per region, the ingredients that have the value 1 in at least one row.

    Args:
        df: recipe table with a 'region' column, optionally a 'country' column, and one
            column per ingredient.

    Returns:
        dict mapping each region (in order of first appearance) to a dict
        `{ingredient: 1}` whose keys are in alphabetical order. Sorting the final
        per-region set is what makes the order alphabetical; sorting each row before
        merging does not.
    """
    # True where an ingredient cell equals 1; the metadata columns are left out
    used = df.drop(columns=['region', 'country'], errors='ignore').eq(1)

    # One row per region: was the ingredient used in any of its recipes?
    used_by_region = used.groupby(df['region'].to_numpy(), sort=False, dropna=False).any()

    ingredients_with_1 = {}
    for region, flags in used_by_region.iterrows():
        names = sorted(flags.index[flags.to_numpy(dtype=bool)])
        ingredients_with_1[region] = {name: 1 for name in names}
    return ingredients_with_1

def percentage_prevalence(ingredient_column, Nc):
    """Return the sum of `ingredient_column` divided by `Nc`.

    The result is a fraction, not a value scaled to 0-100.
    """
    return ingredient_column.sum() / Nc

def calculate_prevalences_by_region(df):
    """Return, per region, the ten ingredients used by the largest share of its recipes.

    Args:
        df: recipe table with a 'region' column, optionally a 'country' column, and one
            column per ingredient.

    Returns:
        dict mapping each region (in order of first appearance) to a dict
        `{ingredient: share}` holding at most ten entries, ordered from highest to
        lowest share. `share` is the column sum divided by the number of rows of the
        region, the same formula as `percentage_prevalence`.
    """
    # Pick the ingredient columns by name so the result does not depend on where
    # 'region' and 'country' sit in the frame.
    ingredient_columns = [col for col in df.columns if col not in ('region', 'country')]

    region_prevalences = {}
    for region in df['region'].unique():
        region_df = df[df['region'] == region]

        # Nc for the current region: its number of recipes
        Nc_region = len(region_df)

        prevalences = {
            ingredient: region_df[ingredient].sum() / Nc_region
            for ingredient in ingredient_columns
        }

        # Highest share first; ties keep column order because sorted() is stable
        top_ten = sorted(prevalences.items(), key=lambda item: item[1], reverse=True)[:10]
        region_prevalences[region] = dict(top_ten)

    return region_prevalences

# Per region: the ingredients that have the value 1 in at least one recipe (used for the PDF ingredient list)
ingredients_with_1 = calculate_ingredients_with_value_1(ingredients_df_with_names)

# Per region: the ten ingredients with the highest prevalence (used for the PDF prevalence section)
region_prevalences = calculate_prevalences_by_region(ingredients_df_with_names)


# Write one PDF report per cuisine into the working directory
for cuisine, pairs in top_pairs.items():
    # The cuisine name is passed in so the page header can show it
    pdf = PDF(cuisine)
    pdf.add_page()

    # The four "top 5" sections share one layout: heading, one line per entry, blank line
    ranked_sections = (
        (f"Top 5 pairs ingredients for {cuisine} cuisine:", pairs),
        (f"Top 5 triplets ingredients for {cuisine} cuisine:", top_triplets[cuisine]),
        (f"Top 5 compounds in pairs for {cuisine} cuisine:", top_compounds_per_pair[cuisine]),
        (f"Top 5 compounds in triplets for {cuisine} cuisine:", top_compounds_per_triplet[cuisine]),
    )
    for heading, ranking in ranked_sections:
        pdf.chapter_title(heading)
        for entry in ranking.keys():
            pdf.chapter_body(f"{entry}")
        pdf.ln()

    # Comma-separated list of the ingredients that have the value 1 in this cuisine
    cuisine_ingredients = ingredients_with_1[cuisine]
    pdf.chapter_title(f"Ingredients for {cuisine} cuisine:")
    ingredients_list = ', '.join(cuisine_ingredients.keys())
    pdf.chapter_body(ingredients_list)
    pdf.ln()

    # Prevalence of the top-10 ingredients, formatted as percentages
    pdf.chapter_title(f"Top 10 Ingredient Prevalences for {cuisine} cuisine:")
    for ingredient, prevalence in region_prevalences[cuisine].items():
        pdf.chapter_body(f"{ingredient}: {prevalence:.2%}")
    pdf.ln()

    # Compound list for every ingredient of the cuisine that appears in ingr_small_copy
    pdf.chapter_title(f"Compounds for each ingredient for {cuisine}:")
    known_names = ingr_small_copy['ingredient_name'].values
    for ingredient in cuisine_ingredients.keys():
        if ingredient in known_names:
            row = ingr_small_copy[ingr_small_copy['ingredient_name'] == ingredient].iloc[0]
            compounds_list = ', '.join(map(str, row['compounds_present']))
            pdf.chapter_body(f"Ingredient: {ingredient}")
            pdf.chapter_body(f"Compounds Present: {compounds_list}")
    pdf.ln()

    # Save the PDF with cuisine-specific name
    pdf_file_name = f'{cuisine}_analysis.pdf'
    pdf.output(pdf_file_name)
