Ce notebook contient deux fonctions, "search_matching_food" et "search_top_n_matching_food", qui permettent de trouver l'ingrédient (ou les ingrédients) de la base de données fournie le(s) plus proche(s) d'une chaîne de caractères.

Exemple : search_matching_food("apple", ingredients_db, index) renvoie l'ingrédient de la base de données qui correspond le mieux à "apple".

# Imports et outils

In [11]:
import re
from functools import cache

import faiss
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the ingredients database from a semicolon-separated CSV file.
# The path is relative, so it is resolved against the current working directory.
ingredients_db = pd.read_csv("data/ingredients_db.csv", sep=';')


# Fonctions

In [3]:
def init_model(data_frame: pd.DataFrame):
    """
    Creates a FAISS index from the ingredient names in the DataFrame.

    Args:
        data_frame (pd.DataFrame): DataFrame containing ingredient information with columns: "EnglishFoodName" (Name), "Groupe" (Group), "Sous-groupe" (Subgroup), "Id_CIQUAL".

    Returns:
        tuple: A tuple containing the FAISS index and the embeddings, or None if the DataFrame is empty.
    """
    ingredients_names = data_frame['EnglishFoodName'].apply(preprocess_text).tolist()
    
    if len(ingredients_names) == 0:
        return None
    
    embeddings = embeddings_model.encode(ingredients_names, convert_to_tensor=True).cpu().numpy()
    vector_dimension = embeddings.shape[1]
    
    index = faiss.IndexFlatL2(vector_dimension)
    faiss.normalize_L2(embeddings)
    index.add(embeddings)
    
    return (index, embeddings)

@cache
def _text_tools():
    """
    Builds the English stop-word set and the lemmatizer once and reuses them.

    preprocess_text is applied to every row of the ingredients DataFrame, so
    re-reading the stop-word list and re-creating the lemmatizer on each call
    is wasted work.

    Returns:
        tuple: ``(stop_words, lemmatizer)``.
    """
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess_text(text):
    """
    Preprocesses the input text for improved search performance.

    This function performs the following steps:
    - Converts text to lowercase.
    - Removes punctuation.
    - Tokenizes the text into words.
    - Removes English stop words.
    - Lemmatizes the tokens.

    Args:
        text (str): The text to preprocess. Missing values (None, NaN) are
            treated as empty text; other non-string values are converted
            with ``str``.

    Returns:
        str: The preprocessed text as a single string. An empty string is
        returned for missing, empty or punctuation-only input.
    """
    if not isinstance(text, str):
        # Empty CSV cells are loaded by pandas as NaN (a float). Returning ''
        # instead of raising keeps one output per input row, which keeps
        # row positions aligned with anything built from the column.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    text = re.sub(r'[^\w\s]', '', text.lower())
    if not text.strip():
        # Nothing left to tokenize.
        return ''

    stop_words, lemmatizer = _text_tools()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    )


# Shared sentence-embedding model, instantiated once at module level.
# init_model uses it to embed the ingredient names and both search functions
# use it to embed the query, so index and queries share one vector space.
embeddings_model = SentenceTransformer("all-mpnet-base-v2", tokenizer_kwargs={"clean_up_tokenization_spaces": True})


In [9]:
def search_matching_food(
    aliment: str,
    data_frame: pd.DataFrame,
    index: faiss.IndexFlatL2
) -> dict | None:
    """
    Searches for a food item in the FAISS index using a hybrid approach (semantic and TF-IDF).

    The query is preprocessed and embedded, the 5 nearest neighbours are
    fetched from the FAISS index, and each candidate is scored as
    ``0.7 * semantic similarity + 0.3 * TF-IDF cosine similarity``. Only those
    5 candidates are re-ranked, so the result can differ from the first entry
    of search_top_n_matching_food, which re-ranks a larger candidate set.

    Args:
        aliment (str): The food item to search for.
        data_frame (pd.DataFrame): DataFrame containing ingredient information with columns:
            'FoodID', 'FoodName', 'EnglishFoodName', 'FoodGroupName', 'FoodSubGroup'.
            Rows must be in the same order as when the index was built,
            because index labels are used as positional row numbers.
        index (faiss.IndexFlatL2): Precomputed FAISS index containing the
            L2-normalised embeddings of the ingredient names.

    Returns:
        (dict | None): A dictionary containing:
            'FoodID', 'FoodName', 'EnglishFoodName', 'FoodGroupName', 'FoodSubGroup', 'Score'
        or None if the DataFrame is empty, the query is empty after
        preprocessing, or the best combined score is below 0.5.
    """
    if data_frame.empty:
        return None

    aliment_processed = preprocess_text(aliment)
    if not aliment_processed:
        # A query made only of stop words/punctuation carries no signal.
        return None

    aliment_vector = embeddings_model.encode(aliment_processed, convert_to_tensor=True).cpu().numpy().reshape(1, -1)
    faiss.normalize_L2(aliment_vector)

    k = min(5, len(data_frame))
    distances, indices = index.search(aliment_vector, k)
    distances_flat = distances.flatten()
    indices_flat = indices.flatten()

    # FAISS pads with label -1 when the index holds fewer than k vectors;
    # used as a position, -1 would silently select the last row.
    found = indices_flat >= 0
    if not found.any():
        return None
    distances_flat = distances_flat[found]
    indices_flat = indices_flat[found]

    # IDF weights need the whole corpus, but similarities are only needed
    # for the candidate rows.
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(data_frame['EnglishFoodName'].apply(preprocess_text))
    aliment_tfidf = tfidf.transform([aliment_processed])
    tfidf_similarities = cosine_similarity(aliment_tfidf, tfidf_matrix[indices_flat]).flatten()

    # For unit vectors the squared L2 distance d satisfies cos = 1 - d / 2.
    combined_scores = (1 - distances_flat / 2) * 0.7 + tfidf_similarities * 0.3
    best_position = combined_scores.argmax()
    best_score = combined_scores[best_position]

    if best_score < 0.5:
        return None

    best_match_index = indices_flat[best_position]
    columns = ('FoodID', 'FoodName', 'EnglishFoodName', 'FoodGroupName', 'FoodSubGroup')
    # Column-wise access keeps each value in its own column dtype.
    result = {column: data_frame[column].iloc[best_match_index] for column in columns}
    result['Score'] = best_score

    return result


def search_top_n_matching_food(
    aliment: str,
    data_frame: pd.DataFrame,
    index: faiss.IndexFlatL2,
    topn: int = 1
) -> list[dict] | None:
    """
    Searches for the top N food items in the FAISS index using a hybrid approach (semantic and TF-IDF).

    The query is preprocessed and embedded, the ``max(topn, 15)`` nearest
    neighbours are fetched from the FAISS index, and each candidate is scored
    as ``0.7 * semantic similarity + 0.3 * TF-IDF cosine similarity``. The
    best ``topn`` candidates with a score of at least 0.3 are returned.

    Args:
        aliment (str): The food item to search for.
        data_frame (pd.DataFrame): DataFrame containing ingredient information with columns:
            'FoodID', 'FoodName', 'EnglishFoodName', 'FoodGroupName', 'FoodSubGroup'.
            Rows must be in the same order as when the index was built,
            because index labels are used as positional row numbers.
        index (faiss.IndexFlatL2): Precomputed FAISS index containing the
            L2-normalised embeddings of the ingredient names.
        topn (int, optional): The number of top matches to return. Defaults to 1.
            Values below 1 yield None.

    Returns:
        (list[dict] | None): A list of dictionaries sorted by decreasing score, each containing:
            'FoodID', 'FoodName', 'EnglishFoodName', 'FoodGroupName', 'FoodSubGroup', 'Score'
        or None if no suitable matches are found.
    """
    # A negative topn would turn the final slice into "all but the last
    # |topn| candidates" instead of returning nothing.
    if topn < 1 or data_frame.empty:
        return None

    aliment_processed = preprocess_text(aliment)
    if not aliment_processed:
        # A query made only of stop words/punctuation carries no signal.
        return None

    aliment_vector = embeddings_model.encode(aliment_processed, convert_to_tensor=True).cpu().numpy().reshape(1, -1)
    faiss.normalize_L2(aliment_vector)

    # Fetch more candidates than requested so TF-IDF re-ranking has room to
    # promote items that were not among the closest semantically.
    k = min(max(topn, 15), len(data_frame))
    distances, indices = index.search(aliment_vector, k)
    distances_flat = distances.flatten()
    indices_flat = indices.flatten()

    # FAISS pads with label -1 when the index holds fewer than k vectors;
    # used as a position, -1 would silently select the last row.
    found = indices_flat >= 0
    if not found.any():
        return None
    distances_flat = distances_flat[found]
    indices_flat = indices_flat[found]

    # IDF weights need the whole corpus, but similarities are only needed
    # for the candidate rows.
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(data_frame['EnglishFoodName'].apply(preprocess_text))
    aliment_tfidf = tfidf.transform([aliment_processed])
    tfidf_similarities = cosine_similarity(aliment_tfidf, tfidf_matrix[indices_flat]).flatten()

    # For unit vectors the squared L2 distance d satisfies cos = 1 - d / 2.
    combined_scores = (1 - distances_flat / 2) * 0.7 + tfidf_similarities * 0.3

    columns = ('FoodID', 'FoodName', 'EnglishFoodName', 'FoodGroupName', 'FoodSubGroup')
    results = []
    for position in np.argsort(combined_scores)[::-1][:topn]:
        score = combined_scores[position]
        if score < 0.3:
            # Candidates are visited by decreasing score: none of the
            # remaining ones can pass the threshold.
            break

        match_index = indices_flat[position]
        # Column-wise access keeps each value in its own column dtype.
        result = {column: data_frame[column].iloc[match_index] for column in columns}
        result['Score'] = score
        results.append(result)

    return results if results else None


# Exemples

Initialiser le modèle

In [5]:
index, embeddings = init_model(ingredients_db)

Chercher un ingrédient

In [None]:
search_matching_food("apple", ingredients_db, index)

{'FoodID': 13038,
 'FoodName': 'Compote de pomme',
 'EnglishFoodName': 'Apple compote',
 'FoodGroupName': 'Fruits et légumes',
 'FoodSubGroup': 'fruits',
 'Score': 0.5686411427708987}

Chercher les top n ingrédients correspondants

In [13]:
search_top_n_matching_food("apple", ingredients_db, index, 5)

[{'FoodID': 13111,
  'FoodName': 'Pomme, sèche',
  'EnglishFoodName': 'Apple, dried',
  'FoodGroupName': 'Fruits et légumes',
  'FoodSubGroup': 'fruits',
  'Score': 0.5777404961097088},
 {'FoodID': 13038,
  'FoodName': 'Compote de pomme',
  'EnglishFoodName': 'Apple compote',
  'FoodGroupName': 'Fruits et légumes',
  'FoodSubGroup': 'fruits',
  'Score': 0.5686411427708987},
 {'FoodID': 23490,
  'FoodName': 'Tarte ou tartelette aux pommes',
  'EnglishFoodName': 'Apple tart',
  'FoodGroupName': 'Produits sucrés et desserts',
  'FoodSubGroup': 'gâteaux et pâtisseries',
  'Score': 0.5490563004222749},
 {'FoodID': 23493,
  'FoodName': 'Crumble aux pommes',
  'EnglishFoodName': 'Apple crumble',
  'FoodGroupName': 'Produits sucrés et desserts',
  'FoodSubGroup': 'gâteaux et pâtisseries',
  'Score': 0.5462155410135974},
 {'FoodID': 23480,
  'FoodName': 'Chausson aux pommes',
  'EnglishFoodName': 'Apple turnover',
  'FoodGroupName': 'Produits sucrés et desserts',
  'FoodSubGroup': 'viennoiserie