In [None]:
import pandas as pd # pour importer les dataset
import numpy as np # pour manipuler les tableaux
from sklearn import preprocessing # pour transformer les donnees
from sklearn.feature_extraction.text import TfidfVectorizer # pour créer un vecteur if idf
from sklearn.metrics.pairwise import linear_kernel # pour faire le produit matriciel
from sklearn.metrics import average_precision_score, ndcg_score
import re
import string

# pour le filtrage colaboratif
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors


In [None]:
# Load the dish catalogue from data.csv and preview its first rows.
food = pd.read_csv('data.csv')
print(food.head())

   Food_ID                  Name        C_Type  Veg_Non  \
0        1   summer squash salad  Healthy Food      veg   
1        2  chicken minced salad  Healthy Food  non-veg   
2        3  sweet chilli almonds         Snack      veg   
3        4       tricolour salad  Healthy Food      veg   
4        5        christmas cake       Dessert      veg   

                                            Describe  
0  white balsamic vinegar, lemon juice, lemon rin...  
1  olive oil, chicken mince, garlic (minced), oni...  
2  almonds whole, egg white, curry leaves, salt, ...  
3  vinegar, honey/sugar, soy sauce, salt, garlic ...  
4  christmas dry fruits (pre-soaked), orange zest...  


# Système de recommandation content based
Ce système a pour objectif de suggérer à un client des plats similaires, en se basant sur deux critères principaux : les ingrédients et la catégorie des plats.

* On commence par la préparation des données : nous avons d'abord renommé la
colonne 'Describe' en 'Ingredient', puis nettoyé le texte en éliminant toute ponctuation. Ensuite, nous avons créé une colonne 'features' combinant  la catégorie du plat (C_Type) et ses ingrédients, permettant ainsi au système de considérer simultanément ces deux caractéristiques.


* On a ensuite utilisé TfidfVectorizer pour convertir les caractéristiques textuelles en une matrice numérique (tfidf_matrix). Cette matrice représente l'importance des mots dans chaque plat par rapport à l'ensemble des plats. Pourquoi TF-IDF ? Parce que TF (Term Frequency) mesure la fréquence d'un mot dans un plat, tandis que IDF (Inverse Document Frequency) réduit le poids des mots trop courants ;
leur produit donne l'importance d'un mot dans un plat par rapport à tous les autres plats.


* Nous avons ensuite calculé la similarité cosinus entre tous les plats à l'aide de linear_kernel, qui effectue un produit matriciel entre la matrice TF-IDF et sa transposée. Comme TfidfVectorizer normalise par défaut chaque vecteur (norme L2 égale à 1), ce produit scalaire est exactement la similarité cosinus. Pourquoi ce choix ? Parce que le produit matriciel permet de calculer les similarités entre toutes les paires de plats en une seule opération rapide.

* Nous avons fait ensuite la fonction get_recommendations() qui prend un nom de plat en entrée, trouve les plats les plus similaires (basés sur la similarité cosinus) et retourne les 2 meilleures recommandations.


Renommer la colonne 'Describe' en 'Ingredient'

In [None]:
# Rename 'Describe' to 'Ingredient'. The result is rebound to `food` instead of
# using inplace=True, so the cell produces a new frame rather than mutating one.
food = food.rename(columns={'Describe': 'Ingredient'})

Enlever la ponctuation

In [None]:
def text_cleaning(text):
    """Replace punctuation with spaces and normalise whitespace.

    Every character of ``string.punctuation`` is turned into a space instead
    of being deleted, so words that are separated only by punctuation remain
    separate tokens ("honey/sugar" -> "honey sugar", not "honeysugar").
    Runs of whitespace are then collapsed to a single space and leading or
    trailing whitespace is removed.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The text without punctuation, words separated by single spaces.
    """
    # One-to-one translation table: each punctuation character -> a space.
    punctuation_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    # split()/join collapses the extra spaces introduced by the translation.
    return " ".join(text.translate(punctuation_to_space).split())
# Clean every ingredient list element-wise, then preview the result.
food['Ingredient'] = food['Ingredient'].map(text_cleaning)
print(food.head())

   Food_ID                  Name        C_Type  Veg_Non  \
0        1   summer squash salad  Healthy Food      veg   
1        2  chicken minced salad  Healthy Food  non-veg   
2        3  sweet chilli almonds         Snack      veg   
3        4       tricolour salad  Healthy Food      veg   
4        5        christmas cake       Dessert      veg   

                                          Ingredient  
0  white balsamic vinegar lemon juice lemon rind ...  
1  olive oil chicken mince garlic minced onion sa...  
2  almonds whole egg white curry leaves salt suga...  
3  vinegar honeysugar soy sauce salt garlic clove...  
4  christmas dry fruits presoaked orange zest lem...  


In [None]:
food.duplicated().sum() # count rows that are exact duplicates of an earlier row

np.int64(0)

In [None]:
food.isnull().sum()   # count missing values per column

Unnamed: 0,0
Food_ID,0
Name,0
C_Type,0
Veg_Non,0
Ingredient,0


Création d'une nouvelle colonne qui regroupe toutes les features

In [None]:
def creat_features(x):
    """Concatenate the category and the ingredients into one text feature.

    ``x`` is anything indexable by the keys 'C_Type' and 'Ingredient' whose
    values support ``+`` with a string; the two values are joined with a
    single space, category first.
    """
    category = x['C_Type']
    ingredients = x['Ingredient']
    return category + " " + ingredients

# creat_features only indexes two columns and concatenates them with `+`, so it
# can be applied to the whole frame at once (vectorised string concatenation)
# instead of being called once per row through apply(axis=1).
food['features'] = creat_features(food)
print(food.head())


   Food_ID                  Name        C_Type  Veg_Non  \
0        1   summer squash salad  Healthy Food      veg   
1        2  chicken minced salad  Healthy Food  non-veg   
2        3  sweet chilli almonds         Snack      veg   
3        4       tricolour salad  Healthy Food      veg   
4        5        christmas cake       Dessert      veg   

                                          Ingredient  \
0  white balsamic vinegar lemon juice lemon rind ...   
1  olive oil chicken mince garlic minced onion sa...   
2  almonds whole egg white curry leaves salt suga...   
3  vinegar honeysugar soy sauce salt garlic clove...   
4  christmas dry fruits presoaked orange zest lem...   

                                            features  
0  Healthy Food white balsamic vinegar lemon juic...  
1  Healthy Food olive oil chicken mince garlic mi...  
2  Snack almonds whole egg white curry leaves sal...  
3  Healthy Food vinegar honeysugar soy sauce salt...  
4  Dessert christmas dry fruits p

Création de la matrice TF-IDF

In [None]:
# Vectorise the combined text feature of every dish with TF-IDF.
tfidf = TfidfVectorizer(stop_words='english') # drop uninformative English stop words (and, or, ...)
tfidf_matrix = tfidf.fit_transform(food['features'])
# One row per dish, one column per vocabulary term.
print(tfidf_matrix.shape)

(400, 1268)


In [None]:
# (intentionally empty: this cell held a stray bare name `p`, which is never defined and raised NameError on "Run All")

Calculer la similarité entre les plats en utilisant linear_kernel

In [None]:
# Pairwise dot products between all dish vectors. TfidfVectorizer L2-normalises
# each row by default, so these dot products are cosine similarities (note the
# 1.0 diagonal in the output).
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.19909508 0.12084918 ... 0.12254895 0.04046384 0.15698963]
 [0.19909508 1.         0.06334794 ... 0.1420757  0.05394112 0.2188066 ]
 [0.12084918 0.06334794 1.         ... 0.03056255 0.10875053 0.01667124]
 ...
 [0.12254895 0.1420757  0.03056255 ... 1.         0.         0.09397769]
 [0.04046384 0.05394112 0.10875053 ... 0.         1.         0.        ]
 [0.15698963 0.2188066  0.01667124 ... 0.09397769 0.         1.        ]]


Afficher la matrice résultat

In [None]:
# Same matrix as a DataFrame; rows and columns are both dish row positions.
pd.DataFrame(cosine_sim)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
0,1.000000,0.199095,0.120849,0.178993,0.042875,0.039454,0.000000,0.102396,0.069888,0.110289,...,0.142678,0.043105,0.00000,0.000000,0.143308,0.030749,0.061974,0.122549,0.040464,0.156990
1,0.199095,1.000000,0.063348,0.301922,0.017618,0.055993,0.021054,0.173781,0.135859,0.123202,...,0.057127,0.068383,0.00000,0.019949,0.057420,0.066721,0.031635,0.142076,0.053941,0.218807
2,0.120849,0.063348,1.000000,0.076815,0.000000,0.076369,0.163670,0.009176,0.061120,0.074159,...,0.018276,0.021877,0.00000,0.046399,0.017161,0.047024,0.030035,0.030563,0.108751,0.016671
3,0.178993,0.301922,0.076815,1.000000,0.000000,0.030653,0.000000,0.092781,0.201652,0.165776,...,0.036894,0.044164,0.00000,0.000000,0.029170,0.031504,0.013959,0.068125,0.000000,0.168570
4,0.042875,0.017618,0.000000,0.000000,1.000000,0.010158,0.101445,0.000000,0.132351,0.000000,...,0.032433,0.000000,0.00000,0.199743,0.000000,0.000000,0.000000,0.000000,0.093665,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,0.030749,0.066721,0.047024,0.031504,0.000000,0.022498,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.210428,0.11363,0.075019,0.080053,1.000000,0.000000,0.000000,0.000000,0.056627
396,0.061974,0.031635,0.030035,0.013959,0.000000,0.069863,0.000000,0.010350,0.000000,0.000000,...,0.043435,0.280731,0.00000,0.000000,0.019356,0.000000,1.000000,0.037983,0.045405,0.018804
397,0.122549,0.142076,0.030563,0.068125,0.000000,0.020914,0.000000,0.051725,0.061198,0.021990,...,0.091485,0.032885,0.00000,0.000000,0.116235,0.000000,0.037983,1.000000,0.000000,0.093978
398,0.040464,0.053941,0.108751,0.000000,0.093665,0.016831,0.318039,0.019767,0.095713,0.044503,...,0.061219,0.000000,0.00000,0.123293,0.000000,0.000000,0.045405,0.000000,1.000000,0.000000


Avoir les indices des plats

In [None]:
# Map each dish name to its row label in `food`, which is also used as its
# row/column in `cosine_sim`. Series.drop_duplicates() compares the *values*
# (the row labels, already unique), so it never removes a repeated name; the
# index mask below keeps only the first row of every name so that
# `indices[name]` always yields a single scalar.
indices = pd.Series(food.index, index=food['Name'])
indices = indices[~indices.index.duplicated(keep='first')]
print(indices)

Name
summer squash salad                                          0
chicken minced salad                                         1
sweet chilli almonds                                         2
tricolour salad                                              3
christmas cake                                               4
                                                          ... 
Kimchi Toast                                               395
Tacos de Gobernador (Shrimp, Poblano, and Cheese Tacos)    396
Melted Broccoli Pasta With Capers and Anchovies            397
Lemon-Ginger Cake with Pistachios                          398
Rosemary Roasted Vegetables                                399
Length: 400, dtype: int64


Créer la fonction de recommandation

In [None]:
def get_recommendations(title, top_n=2):
    """Return the names of the dishes most similar to ``title``.

    Reads the notebook-level ``indices`` (dish name -> row), ``cosine_sim``
    (pairwise similarity matrix) and ``food`` (dish table).

    Parameters
    ----------
    title : str
        Name of the query dish, exactly as it appears in ``food['Name']``.
    top_n : int, default 2
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Names of the ``top_n`` most similar dishes, most similar first,
        indexed by their row label in ``food``. The query dish itself is
        never part of the result.

    Raises
    ------
    KeyError
        If ``title`` is not a key of ``indices``.
    """
    idx = indices[title]  # row of the query dish
    if not np.isscalar(idx):
        # A repeated name yields several rows; use the first occurrence.
        idx = idx.iloc[0]

    scores = np.asarray(cosine_sim[idx])

    # Stable descending sort: tied scores keep ascending row order, exactly
    # like sorted(..., reverse=True) on the enumerated scores.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query explicitly instead of assuming it is ranked first:
    # a dish with identical features also scores 1.0 and can precede it.
    ranked = ranked[ranked != idx]

    return food['Name'].iloc[ranked[:top_n]]

In [None]:
# Example query: the two dishes most similar to 'tricolour salad'.
get_recommendations('tricolour salad')

Unnamed: 0,Name
1,chicken minced salad
103,chilli chicken


Calculer la précision

In [None]:
def cross_validation(food, cosine_sim, indices, label_col='C_Type'):
    """Leave-one-out mean average precision of the similarity ranking.

    For every dish, the dish itself is removed from the candidates, the
    remaining dishes are ranked by their similarity to it, and the average
    precision of that ranking is computed, where a candidate counts as
    relevant when it has the same ``label_col`` value as the query dish.

    The previous implementation marked only the query dish as relevant while
    keeping its self-similarity (1.0) among the scores, so the score was close
    to 1 by construction and did not measure recommendation quality.

    Caveat: if ``label_col`` is also part of the text the similarity was
    computed from, the score is optimistic; treat it as a sanity check.

    Parameters
    ----------
    food : pandas.DataFrame
        Dish table; row ``i`` corresponds to row/column ``i`` of
        ``cosine_sim``. Must contain ``label_col``.
    cosine_sim : array-like of shape (len(food), len(food))
        Pairwise similarity matrix.
    indices : pandas.Series
        Accepted for backward compatibility and not used: rows are addressed
        by position, which stays unambiguous even when names repeat.
    label_col : str, default 'C_Type'
        Column of ``food`` that defines relevance.

    Returns
    -------
    float
        Mean of the per-dish average precision, taken over the dishes that
        have at least one relevant candidate; NaN if there is none.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``food``.
    """
    labels = food[label_col].to_numpy()
    similarities = np.asarray(cosine_sim)
    n_items = len(food)
    positions = np.arange(n_items)

    scores = []
    for i in range(n_items):
        candidates = positions != i  # leave the query dish out

        y_true = (labels[candidates] == labels[i]).astype(int)
        if y_true.sum() == 0:
            # Average precision is undefined without any relevant candidate.
            continue

        y_scores = similarities[i, candidates]
        scores.append(average_precision_score(y_true, y_scores))

    return np.mean(scores) if scores else np.nan

In [None]:
# Run the evaluation over every dish and report the mean score.
mean_precision = cross_validation(food, cosine_sim, indices)
print(f'Mean Average Precision: {mean_precision}')

Mean Average Precision: 0.9925


# Système de recommandation avec un filtrage collaborative
* On a chargé le jeu de données et on l'a examiné, ce qui nous a permis de repérer puis de supprimer les valeurs manquantes ; on s'est donc concentré sur 511 évaluations valides.

* On a ensuite transformé ce jeu de données en une matrice utilisateur-plat où chaque case contient la note donnée par un utilisateur à un plat : les lignes représentent les utilisateurs et les colonnes représentent les plats. On a ensuite utilisé csr_matrix pour convertir cette matrice en matrice creuse (une représentation qui ne stocke que les valeurs non nulles).

* On a ensuite utilisé un modèle de ML, NearestNeighbors (les plus proches voisins), avec la distance cosinus pour trouver les utilisateurs ayant des préférences similaires ; on a entraîné ce modèle avec la méthode fit.

* On construit ensuite notre fonction de recommandation, qui prend en paramètre l'identifiant de l'utilisateur. Première étape : on retrouve l'utilisateur pour récupérer ses notes dans la matrice. Ensuite, on cherche les utilisateurs similaires avec NearestNeighbors (les 16 plus proches voisins), puis on ne garde que les 2 plus proches. Enfin, parmi les plats bien notés par ces voisins, on ne retient que ceux que l'utilisateur cible n'a pas encore notés.

On retourne enfin les plats recommendés


In [None]:
# Load the user ratings from ratings.csv; preview the first rows and print the (rows, columns) shape.
ratting = pd.read_csv('ratings.csv')
print(ratting.head())
print (ratting.shape)

   User_ID  Food_ID  Rating
0      1.0     88.0     4.0
1      1.0     46.0     3.0
2      1.0     24.0     5.0
3      1.0     25.0     4.0
4      2.0     49.0     1.0
(512, 3)


In [None]:
# Count missing values per column.
ratting.isnull().sum()

Unnamed: 0,0
User_ID,1
Food_ID,1
Rating,1


Supprimer les valeurs null

In [None]:
# Drop incomplete rows by content rather than with a hard-coded row count
# (`ratting[:511]`), which would silently keep NaNs or discard valid ratings as
# soon as the CSV changes. The cell also stays safe to re-run.
ratting = ratting.dropna()
ratting.tail()

Unnamed: 0,User_ID,Food_ID,Rating
506,99.0,65.0,7.0
507,99.0,22.0,1.0
508,100.0,24.0,10.0
509,100.0,233.0,10.0
510,100.0,29.0,7.0


Vérifier qu'il n'y a plus de valeurs nulles

In [None]:
# Count missing values per column again, after the clean-up.
ratting.isnull().sum()

Unnamed: 0,0
User_ID,0
Food_ID,0
Rating,0


Creation de la matrice user - food

In [None]:
# User x food matrix of ratings. pivot_table aggregates repeated
# (User_ID, Food_ID) pairs with its default aggregation (mean); fillna(0)
# encodes "not rated" as 0.
rating_matrix = ratting.pivot_table(index='User_ID',columns='Food_ID',values='Rating').fillna(0)
rating_matrix.head()

Food_ID,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,300.0,301.0,302.0,303.0,304.0,305.0,306.0,307.0,308.0,309.0
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# (number of users, number of rated dishes)
rating_matrix.shape

(100, 309)

Conversion en matrice creuse

In [None]:
# Store the matrix in compressed sparse row form so that the zeros
# (unrated dishes) are not stored explicitly.
csr_rating_matrix =  csr_matrix(rating_matrix.values)
print(csr_rating_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 508 stored elements and shape (100, 309)>
  Coords	Values
  (0, 23)	5.0
  (0, 24)	4.0
  (0, 45)	3.0
  (0, 87)	4.0
  (1, 32)	8.0
  (1, 48)	1.0
  (1, 70)	8.0
  (1, 105)	9.0
  (2, 45)	2.0
  (2, 64)	3.0
  (2, 72)	9.0
  (2, 109)	10.0
  (2, 167)	1.0
  (2, 200)	8.0
  (2, 208)	6.0
  (2, 291)	8.0
  (2, 298)	1.0
  (3, 13)	5.0
  (3, 20)	1.0
  (3, 127)	6.0
  (3, 140)	5.0
  (3, 169)	1.0
  (3, 211)	10.0
  (4, 7)	6.0
  (4, 26)	6.0
  :	:
  (95, 270)	7.0
  (95, 292)	5.0
  (96, 19)	1.0
  (96, 30)	3.0
  (96, 45)	7.0
  (96, 63)	4.0
  (96, 143)	6.0
  (96, 214)	9.0
  (96, 307)	3.0
  (97, 4)	7.0
  (97, 34)	4.0
  (97, 39)	10.0
  (97, 46)	2.0
  (97, 81)	2.0
  (97, 193)	2.0
  (97, 216)	3.0
  (98, 21)	1.0
  (98, 48)	3.0
  (98, 64)	7.0
  (98, 68)	8.0
  (98, 92)	7.0
  (98, 151)	2.0
  (99, 23)	10.0
  (99, 28)	7.0
  (99, 232)	10.0


In [None]:
# Nearest-neighbour model over the rows of the rating matrix (one row per
# user), using cosine distance between rating vectors.
recommender = NearestNeighbors(metric='cosine')
recommender.fit(csr_rating_matrix)

In [None]:
def Get_User_Recommendations(user_id, n_neighbors=16, n_similar_users=2, min_rating=6):
    """Recommend dishes liked by the users most similar to ``user_id``.

    Reads the notebook-level ``rating_matrix`` (user x food ratings, 0 meaning
    "not rated"), ``recommender`` (fitted NearestNeighbors model) and ``food``
    (dish table).

    For each of the closest users, the best-rated dish among those rated
    strictly above ``min_rating`` and not yet rated by the target user is
    selected; the matching rows of ``food`` are returned.

    Parameters
    ----------
    user_id : int or float
        Identifier to look up in ``rating_matrix.index``.
    n_neighbors : int, default 16
        Number of neighbours requested from ``recommender`` (capped at the
        number of users in ``rating_matrix``).
    n_similar_users : int, default 2
        Number of closest users whose ratings are used.
    min_rating : float, default 6
        A dish is a candidate only if a neighbour rated it strictly higher.

    Returns
    -------
    pandas.DataFrame
        Rows of ``food`` for the recommended dishes; an empty DataFrame when
        the user is unknown or nothing can be recommended.
    """
    matches = np.flatnonzero(rating_matrix.index == user_id)
    if matches.size == 0:
        # Unknown user: same "nothing to recommend" result as below, instead
        # of an IndexError raised from the lookup.
        return pd.DataFrame()
    user_pos = matches[0]
    target_ratings = rating_matrix.iloc[user_pos]

    # kneighbors cannot return more neighbours than there are fitted users.
    k = min(n_neighbors, rating_matrix.shape[0])
    _, neighbor_positions = recommender.kneighbors(
        target_ratings.values.reshape(1, -1), n_neighbors=k
    )

    # kneighbors returns neighbours ordered by increasing distance. Drop the
    # target user by position rather than assuming it comes first: another
    # user at distance 0 could be listed before it.
    neighbor_positions = neighbor_positions[0]
    neighbor_positions = neighbor_positions[neighbor_positions != user_pos]
    closest_users = rating_matrix.index[neighbor_positions[:n_similar_users]]

    already_rated = target_ratings[target_ratings > 0].index
    top_rated_food_ids = []

    for neighbor_id in closest_users:
        neighbor_ratings = rating_matrix.loc[neighbor_id]

        candidates = neighbor_ratings[
            (neighbor_ratings > min_rating)
            & ~neighbor_ratings.index.isin(already_rated)
        ]
        if not candidates.empty:
            # Keep this neighbour's single best-rated new dish.
            top_rated_food_ids.append(candidates.idxmax())

    if not top_rated_food_ids:
        return pd.DataFrame()
    return food[food['Food_ID'].isin(top_rated_food_ids)]




In [None]:
# Example: recommendations for the user whose User_ID is 17.
Get_User_Recommendations(17)

Unnamed: 0,Food_ID,Name,C_Type,Veg_Non,Ingredient,features
24,25,cashew nut cookies,Dessert,veg,cashew paste ghee khaand a sweetening agent an...,Dessert cashew paste ghee khaand a sweetening ...
86,87,roasted spring chicken with root veggies,Healthy Food,non-veg,whole chicken thyme garlic lemon orange salt b...,Healthy Food whole chicken thyme garlic lemon ...
