<h3><strong>Data Modelling</strong></h3>


In [1]:
import os

# Resolve the dataset location relative to the notebook's working directory:
#   <parent of cwd>/01_DataPreprocessing/current_dataset.xlsx
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
file_path = os.path.join(parent_dir, '01_DataPreprocessing', 'current_dataset.xlsx')

# Report the resolved locations and whether the dataset is actually present.
for message in (
    f"Parent directory: {parent_dir}",
    f"Full file path: {file_path}",
    f"File exists: {os.path.exists(file_path)}",
):
    print(message)

Parent directory: c:\Users\Helena\Desktop\APU\Semester 5\INV\IR - PROPOSAL\scraping-code\baby-food-recom-data-ai
Full file path: c:\Users\Helena\Desktop\APU\Semester 5\INV\IR - PROPOSAL\scraping-code\baby-food-recom-data-ai\01_DataPreprocessing\current_dataset.xlsx
File exists: True


**Feature Engineering**

In [2]:
# Load the preprocessed dataset into the working DataFrame `df`.
import pandas as pd

# Fail fast with a clear message. Previously the existence check only guarded
# an openpyxl load whose result was never used, while pd.read_excel ran
# unconditionally afterwards and would fail on its own for a missing file.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Dataset not found: {file_path}")

# pandas parses the workbook itself, so no separate openpyxl.load_workbook()
# pass over the same file is needed.
df = pd.read_excel(file_path)
print("Successfully opened the Excel file!")

# 'choking_hazard' duplicates 'choking_hazards'; keep only the latter.
# The membership check keeps the cell working if the duplicate is already gone.
if 'choking_hazard' in df.columns:
    df = df.drop(columns=['choking_hazard'])

Successfully opened the Excel file!


In [3]:
# Completeness check for the columns the rest of the notebook relies on.
# This only reports null counts; no rows are dropped here.
important_columns = ['food_name', 'ingredient', 'instructions', 'ner_ingredient', 'recipe_link']

# Walk the columns in DataFrame order so the report follows the sheet layout.
for col in (name for name in df.columns if name in important_columns):
    null_count = df[col].isnull().sum()
    if null_count == 0:
        print(f"Column '{col}' is complete.")
        continue
    print(f"Column '{col}' has {null_count} null values.")


Column 'food_name' is complete.
Column 'ingredient' is complete.
Column 'ner_ingredient' is complete.
Column 'instructions' is complete.
Column 'recipe_link' is complete.


In [4]:
print("DataFrame info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line after the summary.
df.info()

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 25 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   food_name        520 non-null    object 
 1   ingredient       520 non-null    object 
 2   ner_ingredient   520 non-null    object 
 3   instructions     520 non-null    object 
 4   min_age_group    520 non-null    int64  
 5   max_age_group    520 non-null    int64  
 6   texture          520 non-null    object 
 7   prep_time        388 non-null    float64
 8   cook_time        363 non-null    float64
 9   serving          439 non-null    object 
 10  origin           466 non-null    object 
 11  recipe_link      520 non-null    object 
 12  credibility      246 non-null    object 
 13  image_link       42 non-null     object 
 14  region           466 non-null    object 
 15  flag_code        466 non-null    object 
 16  difficulty       520 non-null    object 
 17  

In [5]:
# Summarise the distinct values of every column.
#   - more than 10 distinct values: print only how many there are
#   - 10 or fewer: print the values themselves and, when there is more than
#     one, the frequency of each
divider = "-" * 50

print("Unique values in each column:")
print(divider)

for column in df.columns:
    unique_values = df[column].unique()
    unique_count = len(unique_values)

    if unique_count > 10:
        print(f"{column}: {unique_count} unique values")
    else:
        print(f"{column}: {unique_values}")
        if unique_count > 1:
            print(f"Value counts for {column}:")
            print(df[column].value_counts())

    print(divider)

Unique values in each column:
--------------------------------------------------
food_name: 518 unique values
--------------------------------------------------
ingredient: 514 unique values
--------------------------------------------------
ner_ingredient: 508 unique values
--------------------------------------------------
instructions: 518 unique values
--------------------------------------------------
min_age_group: [ 7 12  6  9 10  8]
Value counts for min_age_group:
min_age_group
6     235
12    106
9     104
7      40
10     23
8      12
Name: count, dtype: int64
--------------------------------------------------
max_age_group: [ 8 12 24  9  6 36  7 11 23]
Value counts for max_age_group:
max_age_group
12    103
24    101
9      80
8      70
6      70
11     61
36     25
7       6
23      4
Name: count, dtype: int64
--------------------------------------------------
texture: ['puree' 'NONE' 'lumpy texture' 'family food' 'soft finger food']
Value counts for texture:
texture
puree 

*Cosine*

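Using TF-IDF: candidate text fields are the NER ingredients, the raw ingredients and the instructions. The cells below vectorise the NER ingredients combined with the texture label.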

In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [38]:
import ast


def _allergens_to_text(allergens):
    """Join allergen names with single spaces; an empty list gives ''."""
    if not allergens:
        return ''
    return ' '.join(allergens)


# 'allergen' holds list literals stored as text, e.g. "['milk', 'soy', 'gluten']".
# Parse every cell into a real Python list ...
df['allergen_list'] = df['allergen'].apply(ast.literal_eval)

# ... and keep a flat, space-separated text version alongside it.
df['allergen_str'] = df['allergen_list'].apply(_allergens_to_text)

print("allergen_list:")
print(df['allergen_list'].head())
print("allergen_str:")
print(df['allergen_str'].head())


allergen_list:
0                  [soy]
1                     []
2    [milk, soy, gluten]
3           [milk, fish]
4                  [soy]
Name: allergen_list, dtype: object
allergen_str:
0                soy
1                   
2    milk soy gluten
3          milk fish
4                soy
Name: allergen_str, dtype: object


In [39]:
# 'dietary_tags' holds list literals stored as text; parse each cell into a list.
dietary_parsed = df['dietary_tags'].apply(ast.literal_eval)
df['dietary_list'] = dietary_parsed

# Space-separated text version of the same tags ('' when a recipe has none).
df['dietary_str'] = dietary_parsed.apply(
    lambda tags: ' '.join(tags) if tags else ''
)

print("dietary_list:")
print(df['dietary_list'].head())
print("dietary_str:")
print(df['dietary_str'].head())

dietary_list:
0    [vegan, vegetarian, pescetarian, dairy_free, e...
1    [pescetarian, dairy_free, egg_free, soy_free, ...
2    [vegetarian, pescetarian, egg_free, nut_free, ...
3    [pescetarian, egg_free, soy_free, nut_free, gl...
4    [vegan, vegetarian, pescetarian, dairy_free, e...
Name: dietary_list, dtype: object
dietary_str:
0    vegan vegetarian pescetarian dairy_free egg_fr...
1    pescetarian dairy_free egg_free soy_free nut_f...
2    vegetarian pescetarian egg_free nut_free halal...
3    pescetarian egg_free soy_free nut_free gluten_...
4    vegan vegetarian pescetarian dairy_free egg_fr...
Name: dietary_str, dtype: object


In [55]:
# Multi-hot encode the two multi-label columns: one 0/1 column per distinct label.

# Dietary tags. Column headers use spaces instead of underscores
# ("dairy_free" -> "dairy free") for readability.
mlb_tags = MultiLabelBinarizer()
tags_matrix = mlb_tags.fit_transform(df['dietary_list'])
tags_df = pd.DataFrame(
    tags_matrix,
    columns=[label.replace('_', ' ') for label in mlb_tags.classes_],
)
print("Updated dietary tag columns:", list(tags_df.columns))

# Allergens keep the label text as it appears in the data.
mlb_allergen = MultiLabelBinarizer()
allergen_matrix = mlb_allergen.fit_transform(df['allergen_list'])
allergen_df = pd.DataFrame(allergen_matrix, columns=mlb_allergen.classes_)

# Shapes and categories, to verify the encoding.
print(f"Dietary tags shape: {tags_df.shape}")
print(f"Dietary tag categories: {list(tags_df.columns)}")
print(f"Allergen shape: {allergen_df.shape}")
print(f"Allergen categories: {list(allergen_df.columns)}")

# A few encoded rows from each frame.
print("\nSample dietary tags data:")
print(tags_df.head())
print("\nSample allergen data:")
print(allergen_df.head())

Updated dietary tag columns: ['dairy free', 'egg free', 'gluten free', 'halal', 'non halal', 'non veg', 'nut free', 'pescetarian', 'soy free', 'vegan', 'vegetarian']
Dietary tags shape: (520, 11)
Dietary tag categories: ['dairy free', 'egg free', 'gluten free', 'halal', 'non halal', 'non veg', 'nut free', 'pescetarian', 'soy free', 'vegan', 'vegetarian']
Allergen shape: (520, 8)
Allergen categories: ['egg', 'fish', 'gluten', 'milk', 'nuts', 'peanuts', 'shellfish', 'soy']

Sample dietary tags data:
   dairy free  egg free  gluten free  halal  non halal  non veg  nut free  \
0           1         1            1      1          0        1         1   
1           1         1            1      1          0        1         1   
2           0         1            0      1          0        1         1   
3           0         1            1      1          0        1         1   
4           1         1            1      1          0        1         1   

   pescetarian  soy free  vegan  v

In [33]:
# Preview the first rows of the choking-hazard and hypoallergenic columns.
df.loc[:, ['choking_hazards', 'hypoallergenic']].head()

Unnamed: 0,choking_hazards,hypoallergenic
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [37]:
# Inspect the raw Yes/No flag columns in the source file, then rebuild `df`
# from the raw data with both flags encoded as 0.0 / 1.0 floats.
import pandas as pd
import os


def _yes_no_to_float(series, column_name):
    """Encode a Yes/No column as 1.0 / 0.0.

    Labels are compared case-insensitively and with surrounding whitespace
    removed, so 'Yes', 'yes' and ' YES ' all become 1.0. Missing values and
    unrecognised labels become 0.0, as before, but unrecognised labels are
    reported instead of being zeroed silently.

    Parameters
    ----------
    series : pandas.Series
        Raw column values.
    column_name : str
        Column name, used only in the warning message.

    Returns
    -------
    pandas.Series
        Float series containing only 0.0 and 1.0.
    """
    normalised = series.astype(str).str.strip().str.lower()
    encoded = normalised.map({'yes': 1.0, 'no': 0.0})

    # Non-null inputs that did not map are unexpected labels, not missing data.
    unexpected = series[encoded.isna() & series.notna()]
    if not unexpected.empty:
        labels = sorted(set(unexpected.astype(str)))
        print(f"WARNING: {len(unexpected)} unexpected value(s) in '{column_name}' encoded as 0: {labels}")

    return encoded.fillna(0).astype(float)


# Locate the dataset again so this cell does not depend on earlier variables.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
file_path = os.path.join(parent_dir, '01_DataPreprocessing', 'current_dataset.xlsx')

# Read fresh data
original_df = pd.read_excel(file_path)

print("Original data - Unique values in choking_hazards:")
if 'choking_hazards' in original_df.columns:
    print(original_df['choking_hazards'].unique())
    print("\nChoking hazards value counts:")
    print(original_df['choking_hazards'].value_counts(dropna=False))
else:
    print("choking_hazards column not found")

print("\nOriginal data - Unique values in hypoallergenic:")
if 'hypoallergenic' in original_df.columns:
    print(original_df['hypoallergenic'].unique())
    print("\nHypoallergenic value counts:")
    print(original_df['hypoallergenic'].value_counts(dropna=False))
else:
    print("hypoallergenic column not found")

# Check if there are other similar columns
print("\nAll columns containing 'chok' or 'hazard':")
chok_cols = [col for col in original_df.columns if 'chok' in col.lower() or 'hazard' in col.lower()]
print(chok_cols)

print("\nAll columns containing 'hypo' or 'allerg':")
hypo_cols = [col for col in original_df.columns if 'hypo' in col.lower() or 'allerg' in col.lower()]
print(hypo_cols)

# Show sample of the original data
print("\nSample of original data:")
if chok_cols or hypo_cols:
    relevant_cols = chok_cols + hypo_cols
    print(original_df[relevant_cols].head(10))
else:
    print("No relevant columns found")

# Restart `df` from the raw values. Copying the frame loaded above gives the
# same data as reading the file a second time, without the extra disk read.
# NOTE: this replaces `df`, so columns that earlier cells added to it
# (allergen_list, allergen_str, dietary_list, dietary_str) are no longer
# present on `df` after this cell.
df = original_df.copy()

# Drop the duplicate choking_hazard column
if 'choking_hazard' in df.columns:
    df = df.drop(columns=['choking_hazard'])

print("Original values before mapping:")
print("Unique values in choking_hazards:", df['choking_hazards'].unique())
print("Unique values in hypoallergenic:", df['hypoallergenic'].unique())

# Encode the flags. An exact-case mapping combined with fillna(0) turns every
# label that differs only in case or spacing into 0 without any signal.
df['choking_hazards'] = _yes_no_to_float(df['choking_hazards'], 'choking_hazards')
df['hypoallergenic'] = _yes_no_to_float(df['hypoallergenic'], 'hypoallergenic')

print("\nAfter mapping:")
print("Unique values in choking_hazards:", df['choking_hazards'].unique())
print("Unique values in hypoallergenic:", df['hypoallergenic'].unique())

print("\nValue counts after mapping:")
print("Choking hazards:")
print(df['choking_hazards'].value_counts())
print("\nHypoallergenic:")
print(df['hypoallergenic'].value_counts())

print("\nSample of mapped data:")
print(df[['hypoallergenic', 'choking_hazards']].head(10))


Original data - Unique values in choking_hazards:
['No' 'Yes']

Choking hazards value counts:
choking_hazards
No     517
Yes      3
Name: count, dtype: int64

Original data - Unique values in hypoallergenic:
['No' 'Yes']

Hypoallergenic value counts:
hypoallergenic
No     336
Yes    184
Name: count, dtype: int64

All columns containing 'chok' or 'hazard':
['choking_hazard', 'choking_hazards']

All columns containing 'hypo' or 'allerg':
['allergen', 'hypoallergenic']

Sample of original data:
  choking_hazard choking_hazards                   allergen hypoallergenic
0             No              No                    ['soy']             No
1             No              No                         []            Yes
2             No              No  ['milk', 'soy', 'gluten']             No
3            Yes             Yes           ['milk', 'fish']             No
4             No              No                    ['soy']             No
5             No              No                     

In [46]:
import ast

# 'ner_ingredient' holds list literals stored as text; parse each cell into a
# list of ingredient names.
ner_parsed = df['ner_ingredient'].apply(ast.literal_eval)
df['ner_ingredient_list'] = ner_parsed

# One space-separated string per recipe, built from the parsed list.
df['ner_ingredient_str'] = ner_parsed.apply(lambda names: ' '.join(names))

print("ner_ingredient_list:")
print(df['ner_ingredient_list'].head())
print("ner_ingredient_str:")
print(df['ner_ingredient_str'].head())


ner_ingredient_list:
0           [sweet potato, edamame, cornstarch, water]
1    [onion, chicken breast, udon thin, dashi, clea...
2                              [yogurt, soybean flour]
3    [japanese rice, natto, long onion, egg, butter...
4                            [dashi, tofu, miso paste]
Name: ner_ingredient_list, dtype: object
ner_ingredient_str:
0                sweet potato edamame cornstarch water
1    onion chicken breast udon thin dashi clear bro...
2                                 yogurt soybean flour
3    japanese rice natto long onion egg butter fill...
4                                dashi tofu miso paste
Name: ner_ingredient_str, dtype: object


In [47]:
# Text fed to TF-IDF: the space-joined NER ingredients followed by the texture label.
df['combined_text'] = df['ner_ingredient_str'] + ' ' + df['texture']

# ENGLISH_STOP_WORDS is a frozenset; pass it to the vectorizer as a list
# (the built-in 'english' option is an alternative).
stop_words_list = [*ENGLISH_STOP_WORDS]
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words_list)

# Learn the vocabulary and produce one TF-IDF row per recipe.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_text'])

In [48]:
# Stack all feature groups side by side into a single sparse matrix.
# Column order: [ TF-IDF | dietary tags | allergens | binary flags ]
from scipy.sparse import hstack, csr_matrix

# The two 0/1 flag columns as a sparse block.
binary_features = csr_matrix(df[['choking_hazards', 'hypoallergenic']].values)

# Convert the multi-hot DataFrames to sparse blocks explicitly rather than
# handing pandas objects to hstack and relying on implicit coercion.
tags_features = csr_matrix(tags_df.values)
allergen_features = csr_matrix(allergen_df.values)

feature_blocks = [tfidf_matrix, tags_features, allergen_features, binary_features]

# Every block must have one row per recipe. The blocks are built in different
# cells, so verify this here instead of letting hstack fail with a generic
# shape error (or, worse, stacking stale data of a matching size unnoticed).
block_rows = [block.shape[0] for block in feature_blocks]
if len(set(block_rows)) != 1:
    raise ValueError(
        "Feature blocks have different row counts "
        f"(tfidf, tags, allergen, binary) = {block_rows}; "
        "re-run the cells that build them from the same DataFrame."
    )

# format='coo' pins the result type regardless of the input block formats;
# later cells convert to CSR themselves before slicing.
final_features = hstack(feature_blocks, format='coo')

In [49]:
# Summary statistics for the stacked sparse feature matrix.
print(f"Matrix shape: {final_features.shape}")
print(f"Data type: {final_features.dtype}")

# Matrices exposing `indptr` are sized from data + indices + indptr;
# otherwise the data + row + col arrays are used.
if hasattr(final_features, 'indptr'):
    memory_bytes = (
        final_features.data.nbytes
        + final_features.indices.nbytes
        + final_features.indptr.nbytes
    )
else:
    memory_bytes = (
        final_features.data.nbytes
        + final_features.row.nbytes
        + final_features.col.nbytes
    )
print(f"Memory usage (bytes): {memory_bytes}")

# Share of stored non-zero entries among all rows x columns cells.
n_rows, n_cols = final_features.shape
density_pct = 100 * final_features.nnz / (n_rows * n_cols)
print(f"Percentage of non-zero elements: {density_pct:.2f}%")

Matrix shape: (520, 421)
Data type: float64
Memory usage (bytes): 152624
Percentage of non-zero elements: 4.36%


In [50]:
# Split the first recipe's row of the stacked matrix back into its feature groups.
# CSR is used because it supports efficient slicing.
final_features_csr = final_features.tocsr()

# Column boundaries, in stacking order: TF-IDF, dietary tags, allergens.
tfidf_end = tfidf_matrix.shape[1]
tags_shape = tags_df.shape[1]
tags_end = tfidf_end + tags_shape
allergen_end = tags_end + allergen_df.shape[1]

print("TF-IDF part for first recipe:",
      final_features_csr[0, :tfidf_end].toarray()[0])
print("Dietary tags part for first recipe:",
      final_features_csr[0, tfidf_end:tags_end].toarray()[0])
print("Allergen part for first recipe:",
      final_features_csr[0, tags_end:allergen_end].toarray()[0])
# The binary block is taken as the last two columns of the matrix.
print("Binary features for first recipe:",
      final_features_csr[0, -2:].toarray()[0])

TF-IDF part for first recipe: [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0. 

In [51]:
# Map the columns of the stacked feature matrix back to readable names and
# spot-check the values stored for individual recipes.
import numpy as np

# CSR supports the row/column slicing used below.
final_features_csr = final_features.tocsr()


def _dense_slice(row, start, stop):
    """Return columns [start:stop] of one matrix row as a 1-D dense array."""
    return final_features_csr[row, start:stop].toarray()[0]


# ----- Feature names per group -----
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"\nTF-IDF feature names (first 10 of {len(tfidf_feature_names)}):")
print(tfidf_feature_names[:10])

dietary_feature_names = list(tags_df.columns)
print(f"\nDietary tag features ({len(dietary_feature_names)}):")
print(dietary_feature_names)

allergen_feature_names = list(allergen_df.columns)
print(f"\nAllergen features ({len(allergen_feature_names)}):")
print(allergen_feature_names)

binary_feature_names = ['choking_hazards', 'hypoallergenic']
print("\nBinary features:")
print(binary_feature_names)

# ----- Column layout: [ TF-IDF | dietary tags | allergens | binary flags ] -----
tfidf_shape = tfidf_matrix.shape[1]
tags_shape = tags_df.shape[1]
allergen_shape = allergen_df.shape[1]
binary_shape = 2

tags_start = tfidf_shape
tags_end = tags_start + tags_shape
allergen_start = tags_end
allergen_end = allergen_start + allergen_shape

# ----- First recipe, group by group -----
print("\n----- SAMPLE VALUES FOR FIRST RECIPE -----")
print("TF-IDF part (showing non-zero values only):")
tfidf_values = _dense_slice(0, None, tfidf_shape)
non_zero_indices = np.flatnonzero(tfidf_values)
for idx in non_zero_indices:
    print(f"  {tfidf_feature_names[idx]}: {tfidf_values[idx]:.4f}")

print("\nDietary tags:")
tags_values = _dense_slice(0, tags_start, tags_end)
for name, val in zip(dietary_feature_names, tags_values):
    if val > 0:
        print(f"  {name}: {val}")

print("\nAllergen part:")
allergen_values = _dense_slice(0, allergen_start, allergen_end)
for name, val in zip(allergen_feature_names, allergen_values):
    if val > 0:
        print(f"  {name}: {val}")

print("\nBinary features:")
binary_values = _dense_slice(0, -binary_shape, None)
for name, val in zip(binary_feature_names, binary_values):
    print(f"  {name}: {val}")

# ----- Cross-check: the matrix must agree with `df` for flagged recipes -----
print("\n----- VERIFICATION: CHECKING RECIPES WITH DIFFERENT BINARY VALUES -----")

# First recipe flagged as a choking hazard, if any.
choking_hazard_indices = df.loc[df['choking_hazards'] == 1].index
print(f"\nRecipes with choking hazards (found {len(choking_hazard_indices)}):")
if len(choking_hazard_indices):
    idx = choking_hazard_indices[0]
    binary_vals = _dense_slice(idx, -binary_shape, None)
    print(f"Recipe {idx}: choking_hazards={binary_vals[0]}, hypoallergenic={binary_vals[1]}")

# First recipe flagged as hypoallergenic, if any.
hypoallergenic_indices = df.loc[df['hypoallergenic'] == 1].index
print(f"\nRecipes with hypoallergenic (found {len(hypoallergenic_indices)}):")
if len(hypoallergenic_indices):
    idx = hypoallergenic_indices[0]
    binary_vals = _dense_slice(idx, -binary_shape, None)
    print(f"Recipe {idx}: choking_hazards={binary_vals[0]}, hypoallergenic={binary_vals[1]}")

# ----- Dimension bookkeeping -----
print(f"\nTotal feature dimensions: TF-IDF={tfidf_shape}, Dietary={tags_shape}, Allergen={allergen_shape}, Binary={binary_shape}")
print(f"Total features: {tfidf_shape + tags_shape + allergen_shape + binary_shape}")
print(f"Final matrix shape: {final_features_csr.shape}")

# ----- How many recipes carry each flag -----
print("\n----- BINARY FEATURE DISTRIBUTION -----")
print(f"Choking hazards: {int(df['choking_hazards'].sum())} recipes with hazards out of {len(df)}")
print(f"Hypoallergenic: {int(df['hypoallergenic'].sum())} recipes that are hypoallergenic out of {len(df)}")


TF-IDF feature names (first 10 of 400):
['abon' 'agar' 'almond' 'ambon' 'anchovy' 'apple' 'applesauce' 'apricot'
 'aron' 'artichoke']

Dietary tag features (11):
['dairy_free', 'egg_free', 'gluten_free', 'halal', 'non_halal', 'non_veg', 'nut_free', 'pescetarian', 'soy_free', 'vegan', 'vegetarian']

Allergen features (8):
['egg', 'fish', 'gluten', 'milk', 'nuts', 'peanuts', 'shellfish', 'soy']

Binary features:
['choking_hazards', 'hypoallergenic']

----- SAMPLE VALUES FOR FIRST RECIPE -----
TF-IDF part (showing non-zero values only):
  cornstarch: 0.4330
  edamame: 0.6897
  potato: 0.2818
  puree: 0.1831
  sweet: 0.4016
  water: 0.2502

Dietary tags:
  dairy_free: 1.0
  egg_free: 1.0
  gluten_free: 1.0
  halal: 1.0
  non_veg: 1.0
  nut_free: 1.0
  pescetarian: 1.0
  vegan: 1.0
  vegetarian: 1.0

Allergen part:
  soy: 1.0

Binary features:
  choking_hazards: 0.0
  hypoallergenic: 0.0

----- VERIFICATION: CHECKING RECIPES WITH DIFFERENT BINARY VALUES -----

Recipes with choking hazards 

In [52]:
# Column labels for the stacked matrix, in stacking order:
# TF-IDF terms, dietary tags, allergens, then the two binary flags.
all_feature_names = [
    *tfidf_vectorizer.get_feature_names_out(),
    *tags_df.columns,
    *allergen_df.columns,
    'choking_hazards',
    'hypoallergenic',
]

print(f"Total number of features: {len(all_feature_names)}")
print(f"Feature names: {all_feature_names[:10]}...")  # first 10 names only

# Dense, labelled copy of the sparse matrix for inspection.
# Caution: densifying can be large for big matrices.
dense_features = final_features.toarray()
final_features_df = pd.DataFrame(dense_features, columns=all_feature_names)

print("\nFeature matrix shape:", final_features_df.shape)
print("\nFirst few rows of feature matrix:")
print(final_features_df.head())

Total number of features: 421
Feature names: ['abon', 'agar', 'almond', 'ambon', 'anchovy', 'apple', 'applesauce', 'apricot', 'aron', 'artichoke']...

Feature matrix shape: (520, 421)

First few rows of feature matrix:
   abon  agar  almond  ambon  anchovy  apple  applesauce  apricot  aron  \
0   0.0   0.0     0.0    0.0      0.0    0.0         0.0      0.0   0.0   
1   0.0   0.0     0.0    0.0      0.0    0.0         0.0      0.0   0.0   
2   0.0   0.0     0.0    0.0      0.0    0.0         0.0      0.0   0.0   
3   0.0   0.0     0.0    0.0      0.0    0.0         0.0      0.0   0.0   
4   0.0   0.0     0.0    0.0      0.0    0.0         0.0      0.0   0.0   

   artichoke  ...  egg  fish  gluten  milk  nuts  peanuts  shellfish  soy  \
0        0.0  ...  0.0   0.0     0.0   0.0   0.0      0.0        0.0  1.0   
1        0.0  ...  0.0   0.0     0.0   0.0   0.0      0.0        0.0  0.0   
2        0.0  ...  0.0   0.0     1.0   1.0   0.0      0.0        0.0  1.0   
3        0.0  ...  0.0

In [53]:
# Build a labelled dense copy of the feature matrix and compute recipe-to-recipe
# cosine similarity, once on all features and once on the TF-IDF block alone.
import numpy as np  # imported here so the cell does not rely on another cell's import
from sklearn.metrics.pairwise import cosine_similarity

# Column labels in stacking order: TF-IDF terms, dietary tags, allergens, flags.
all_feature_names = (
    list(tfidf_vectorizer.get_feature_names_out()) +
    list(tags_df.columns) +
    list(allergen_df.columns) +
    ['choking_hazards', 'hypoallergenic']
)

print(f"Total number of features: {len(all_feature_names)}")
print(f"Feature names: {all_feature_names[:10]}...")  # Show first 10 feature names

final_features_df = pd.DataFrame(
    final_features.toarray(),  # Convert to dense array
    columns=all_feature_names  # Use the combined feature names list
)

# Print the first few rows
print(final_features_df.head())

# Similarity over all features (not just TF-IDF).
# NOTE(review): the TF-IDF values printed for the first recipe appear to form a
# unit-length vector, while the 0/1 tag and allergen columns are unscaled, so
# those columns may dominate this score - consider weighting the blocks.
cosine_sim_complete = cosine_similarity(final_features, final_features)
print(f"Complete cosine similarity shape: {cosine_sim_complete.shape}")

# Similarity over the TF-IDF block only, for comparison.
cosine_sim_tfidf = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(f"TF-IDF only cosine similarity shape: {cosine_sim_tfidf.shape}")

# Show sample similarity scores for first recipe
print(f"\nComplete feature similarity for first recipe (first 10): {cosine_sim_complete[0][:10]}")
print(f"TF-IDF only similarity for first recipe (first 10): {cosine_sim_tfidf[0][:10]}")

# Rank every recipe by similarity to the first one, most similar first.
query_idx = 0
first_recipe_similarities = cosine_sim_complete[query_idx]
sorted_indices = np.argsort(first_recipe_similarities)[::-1]  # Sort in descending order

# A recipe always matches itself with similarity 1.0, so leave the query row
# out of its own neighbour list. Slicing (rather than indexing 0..4) also
# copes with datasets that have fewer than five other recipes.
neighbour_indices = sorted_indices[sorted_indices != query_idx][:5]

print("\nTop 5 most similar recipes to the first recipe (using all features):")
for idx in neighbour_indices:
    similarity = first_recipe_similarities[idx]
    print(f"Recipe {idx}: Similarity = {similarity:.4f}")
    if 'food_name' in df.columns:
        print(f"  Food name: {df.iloc[idx]['food_name']}")
    print()

Total number of features: 421
Feature names: ['abon', 'agar', 'almond', 'ambon', 'anchovy', 'apple', 'applesauce', 'apricot', 'aron', 'artichoke']...
   abon  agar  almond  ambon  anchovy  apple  applesauce  apricot  aron  \
0   0.0   0.0     0.0    0.0      0.0    0.0         0.0      0.0   0.0   
1   0.0   0.0     0.0    0.0      0.0    0.0         0.0      0.0   0.0   
2   0.0   0.0     0.0    0.0      0.0    0.0         0.0      0.0   0.0   
3   0.0   0.0     0.0    0.0      0.0    0.0         0.0      0.0   0.0   
4   0.0   0.0     0.0    0.0      0.0    0.0         0.0      0.0   0.0   

   artichoke  ...  egg  fish  gluten  milk  nuts  peanuts  shellfish  soy  \
0        0.0  ...  0.0   0.0     0.0   0.0   0.0      0.0        0.0  1.0   
1        0.0  ...  0.0   0.0     0.0   0.0   0.0      0.0        0.0  0.0   
2        0.0  ...  0.0   0.0     1.0   1.0   0.0      0.0        0.0  1.0   
3        0.0  ...  0.0   1.0     0.0   1.0   0.0      0.0        0.0  0.0   
4        0.0  

In [54]:
# Pairwise cosine similarity between recipes, based on the TF-IDF rows only.
from sklearn.metrics.pairwise import cosine_similarity

# With a single argument the matrix is compared against itself.
cosine_sim_tfidf = cosine_similarity(tfidf_matrix)
print(f"TF-IDF cosine similarity shape: {cosine_sim_tfidf.shape}")
print(f"TF-IDF cosine similarity for first recipe (first 5 values): {cosine_sim_tfidf[0][:5]}")

# Note: this score reflects the TF-IDF text features only.
# The similarity that also includes dietary tags, allergens and the binary
# flags is computed from the complete feature matrix (`final_features`).

TF-IDF cosine similarity shape: (520, 520)
TF-IDF cosine similarity for first recipe (first 5 values): [1.         0.1338791  0.03657197 0.         0.        ]


<h4><strong>Jaccard</strong></h4>


<p><strong>Evaluation Metrics</strong></p>

<p><strong>Confidence Level of Certain Recipe</strong></p>