In [None]:
import os
import sys

# Ajouter le répertoire parent au sys.path
sys.path.append(os.path.abspath('..'))

import re
from collections import Counter

import en_core_web_sm
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from spacy.lang.en.stop_words import STOP_WORDS

from training.classes.language_tools import TextProcessing

pd.set_option('display.max_columns', None)

## Introduction et contexte

Ce notebook a pour but de faire une première exploration des données mises à disposition par openfoodfacts pour répondre à notre problème qui est:

quelle alternative à un produit alimentaire pourrait nous être proposé ?

Dans ces alternatives, quels sont les produits qui ne contiennent pas de produit susceptible de provoquer une allergie alimentaire ?

Le volume de données est important, nous allons nous concentrer sur les produits où la liste des ingrédients est renseignés et ou les produits sont vendus en france.

## Chargement des Données

In [2]:
rawdata = None
def extract_raw_data():
    file_path = '../data/en.openfoodfacts.org.products.csv.gz'
    chunk_size = 10000

    filtered_chunks_list = []

    for chunk in pd.read_csv(file_path, chunksize=chunk_size, compression='gzip', sep='\t', engine='python', quoting=3):
        filtered_chunks = chunk[(chunk['countries_tags'] == 'en:france') & (chunk['ingredients_tags'].notna())]
        filtered_chunks_list.append(filtered_chunks)

    if filtered_chunks_list:
        rawdata = pd.concat(filtered_chunks_list, axis=0)
    return rawdata

rawdata = extract_raw_data()
# rawdata = pd.read_csv('filtered_dataset_openfoodfacts_raw_en.csv')
display(rawdata)

Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,last_modified_by,last_updated_t,last_updated_datetime,product_name,abbreviated_product_name,generic_name,quantity,packaging,packaging_tags,packaging_en,packaging_text,brands,brands_tags,categories,categories_tags,categories_en,origins,origins_tags,origins_en,manufacturing_places,manufacturing_places_tags,labels,labels_tags,labels_en,emb_codes,emb_codes_tags,first_packaging_code_geo,cities,cities_tags,purchase_places,stores,countries,countries_tags,countries_en,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,allergens_en,traces,traces_tags,traces_en,serving_size,serving_quantity,no_nutrition_data,additives_n,additives,additives_tags,additives_en,nutriscore_score,nutriscore_grade,nova_group,pnns_groups_1,pnns_groups_2,food_groups,food_groups_tags,food_groups_en,states,states_tags,states_en,brand_owner,ecoscore_score,ecoscore_grade,nutrient_levels_tags,product_quantity,owner,data_quality_errors_tags,unique_scans_n,popularity_tags,completeness,last_image_t,last_image_datetime,main_category,main_category_en,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kj_100g,energy-kcal_100g,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,butyric-acid_100g,caproic-acid_100g,caprylic-acid_100g,capric-acid_100g,lauric-acid_100g,myristic-acid_100g,palmitic-acid_100g,stearic-acid_100g,arachidic-acid_100g,behenic-acid_100g,lignoceric-acid_100g,cerotic-acid_100g,montanic-acid_100g,melissic-acid_100g,unsaturated-fat_100g,monounsaturated-fat_100g,omega-9-fat_100g,polyunsaturated-fat_100g,omega-3-fat_100g,omega-6-fat_100g,alpha-linolenic-acid_100g,eicosapentaenoic-acid_100g,docosahexaenoic-acid_100g,linoleic-acid_100g,arachidonic-acid_100g,gamma-linolenic-acid_100g,dihomo-gamma-linolenic-acid_100g,oleic-acid_100g,elaidic-acid_100g,gondoic-acid_100g,mead-acid_100g,erucic-acid_100g,nervonic-acid_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,added-sugars_100g,sucrose_100g,glucose_100g,fructose_100g,lactose_100g,maltose_100g,maltodextrins_100g,starch_100g,polyols_100g,erythritol_100g,fiber_100g,soluble-fiber_100g,insoluble-fiber_100g,proteins_100g,casein_100g,serum-proteins_100g,nucleotides_100g,salt_100g,added-salt_100g,sodium_100g,alcohol_100g,vitamin-a_100g,beta-carotene_100g,vitamin-d_100g,vitamin-e_100g,vitamin-k_100g,vitamin-c_100g,vitamin-b1_100g,vitamin-b2_100g,vitamin-pp_100g,vitamin-b6_100g,vitamin-b9_100g,folates_100g,vitamin-b12_100g,biotin_100g,pantothenic-acid_100g,silica_100g,bicarbonate_100g,potassium_100g,chloride_100g,calcium_100g,phosphorus_100g,iron_100g,magnesium_100g,zinc_100g,copper_100g,manganese_100g,fluoride_100g,selenium_100g,chromium_100g,molybdenum_100g,iodine_100g,caffeine_100g,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-dried_100g,fruits-vegetables-nuts-estimate_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,carbon-footprint-from-meat-or-fish_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g,sulphate_100g,nitrate_100g,acidity_100g
4,5,http://world-en.openfoodfacts.org/product/0000...,touchette,1605337720,2020-11-14T07:08:40Z,1729432954,2024-10-20T14:02:34Z,roboto-app,1.729433e+09,2024-10-20T14:02:34Z,Bio inulin,,,550g,Glass,en:glass,Glass,,EWL,ewl,Gemüse,"en:plant-based-foods-and-beverages,en:plant-ba...","Plant-based foods and beverages,Plant-based fo...",,,,bénivay-ollon,benivay-ollon,en:no-lactose,en:no-lactose,No lactose,13089c,13089c,,,,France,,France,en:france,France,"Jus et purée d'abricots (50%), eau, sucre.","en:apricot-juice-and-puree,en:fruit,en:prunus-...","en:palm-oil-free,en:vegan,en:vegetarian",,,,,,,,,0.0,,,,-2.0,a,3.0,Fruits and vegetables,Vegetables,en:vegetables,"en:fruits-and-vegetables,en:vegetables","Fruits and vegetables,Vegetables","en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...",550.0,,,,,0.8875,1.729432e+09,2024-10-20T13:51:45Z,en:vegetables,Vegetables,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,840.0,208.0,840.0,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.0,8.0,,,,,,,,,,,88.0,,,0.0,,,,0.23,,0.092,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,50.000000,,,,,,-2.0,,,,,,,,,,,
9,10,http://world-en.openfoodfacts.org/product/0000...,jeanbono,1476947941,2016-10-20T07:19:01Z,1728034727,2024-10-04T09:38:47Z,fix-code-bot,1.729852e+09,2024-10-25T10:25:56Z,Madeleines nature,,,880 g,"Plastic,Cardboard,fr:Boîte en carton,fr:Film e...","en:plastic,en:cardboard,fr:boite-en-carton,fr:...","Plastic,Cardboard,fr:boite-en-carton,fr:film-e...",,Bijou,bijou,"Snacks, Desserts, Snacks sucrés, Biscuits et g...","en:snacks,en:desserts,en:sweet-snacks,en:biscu...","Snacks,Desserts,Sweet snacks,Biscuits and cake...",,,,France,france,"Point Vert, Sans colorants, Sans huile de palme","en:green-dot,en:no-colorings,en:no-palm-oil","Green Dot,No colorings,No palm oil",,,,,,"Lyon,France,Limoges","M2I,Bijou",France,en:france,France,"Farine de blé 33%, sucre, huile de colza, œufs...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...","en:eggs,en:gluten,en:milk",,"en:nuts,en:soybeans","en:nuts,en:soybeans","Nuts,Soybeans","17,6g",17.6,,3.0,,"en:e331,en:e422,en:e503","E331 - Sodium citrates,E422 - Glycerol,E503 - ...",13.0,d,4.0,Sugary snacks,Biscuits and cakes,en:biscuits-and-cakes,"en:sugary-snacks,en:biscuits-and-cakes","Sugary snacks,Biscuits and cakes","en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,53.0,c,"en:fat-in-high-quantity,en:saturated-fat-in-mo...",880.0,,,17.0,"top-50000-scans-2019,top-100000-scans-2019,at-...",0.8875,1.636213e+09,2021-11-06T15:34:48Z,en:plain-madeleines,Plain madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,1852.0,442.0,1852.0,,22.0,2.6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,54.0,25.0,,,,,,,,,,,1.4,,,6.4,,,,0.53,,0.212,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,22.666667,,,,,,13.0,,,,,,,,,,,
14,15,http://world-en.openfoodfacts.org/product/0000...,openfoodfacts-contributors,1523810594,2018-04-15T16:43:14Z,1728034730,2024-10-04T09:38:50Z,fix-code-bot,1.729852e+09,2024-10-25T10:25:56Z,Madeleines ChocoLait,,,1080 g,"Plastique, Carton","en:plastic,en:cardboard","Plastic,Cardboard",,Bijou,bijou,"Snacks, Snacks sucrés, Biscuits et gâteaux, Gâ...","en:snacks,en:sweet-snacks,en:biscuits-and-cake...","Snacks,Sweet snacks,Biscuits and cakes,Cakes,C...",,,,,,,,,,,,,,,,France,en:france,France,"Farine de blé 27%, chocolat au lait 18% (sucre...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:maybe-vegetarian",,,en:nuts,en:nuts,Nuts,"21,6g",21.6,,5.0,,"en:e322,en:e322i,en:e331,en:e422,en:e500,en:e503","E322 - Lecithins,E322i - Lecithin,E331 - Sodiu...",17.0,d,4.0,Sugary snacks,Biscuits and cakes,en:biscuits-and-cakes,"en:sugary-snacks,en:biscuits-and-cakes","Sugary snacks,Biscuits and cakes","en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,32.0,d,"en:fat-in-high-quantity,en:saturated-fat-in-hi...",1080.0,,,14.0,"top-50000-scans-2019,top-100000-scans-2019,at-...",0.7875,1.636214e+09,2021-11-06T15:46:13Z,en:chocolate-madeleines,Chocolate madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,1926.0,460.0,1926.0,,24.0,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,54.0,31.0,,,,,,,,,,,1.4,,,6.4,,,,0.48,,0.192,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,16.250000,,,,,,17.0,,,,,,,,,,,
18,20,http://world-en.openfoodfacts.org/product/0000...,openfoodfacts-contributors,1536930846,2018-09-14T13:14:06Z,1728034733,2024-10-04T09:38:53Z,fix-code-bot,1.729852e+09,2024-10-25T10:25:56Z,Madeleines Choco Noir,,,1080 g / 50 madeleines,1 boîte en carton à recycler 50 sachets indivi...,fr:1-boite-en-carton-a-recycler-50-sachets-ind...,fr:1-boite-en-carton-a-recycler-50-sachets-ind...,,BIJOU,bijou,"Snacks, Snacks sucrés, Biscuits et gâteaux, Gâ...","en:snacks,en:sweet-snacks,en:biscuits-and-cake...","Snacks,Sweet snacks,Biscuits and cakes,Cakes,C...","fr:Blé origine France,fr:Œufs origine France","fr:ble-origine-france,fr:oeufs-origine-france","fr:ble-origine-france,fr:oeufs-origine-france","Saint-Yrieix,France","saint-yrieix,france","Plein air, Sans conservateurs, Œufs de poules ...","en:free-range,en:no-preservatives,en:free-rang...","Free range,No preservatives,Free range eggs,Gr...",,,,,,France,"magasin d'usine,magasin Bijou bordeaux,magasin...",France,en:france,France,"Farine de _blé_ 27%, chocolat noir 18% (pâte d...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...","en:eggs,en:gluten,en:milk,en:soybeans",,en:nuts,en:nuts,Nuts,21.6g,21.6,,5.0,,"en:e322,en:e331,en:e422,en:e500,en:e503","E322 - Lecithins,E331 - Sodium citrates,E422 -...",16.0,d,4.0,Sugary snacks,Biscuits and cakes,en:biscuits-and-cakes,"en:sugary-snacks,en:biscuits-and-cakes","Sugary snacks,Biscuits and cakes","en:to-be-checked, en:complete, en:nutrition-fa...","en:to-be-checked,en:complete,en:nutrition-fact...","To be checked,Complete,Nutrition facts complet...",,39.0,d,"en:fat-in-high-quantity,en:saturated-fat-in-hi...",1080.0,,,21.0,"top-50000-scans-2019,top-100000-scans-2019,at-...",0.9000,1.687002e+09,2023-06-17T11:42:30Z,en:chocolate-madeleines,Chocolate madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,1953.0,467.0,1953.0,,25.0,6.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,53.0,29.0,,,,,,,,,,,2.5,,,6.3,,,,0.45,,0.180,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,16.250000,,,,,,16.0,,,,,,,,,,,
20,22,http://world-en.openfoodfacts.org/product/0000...,openfoodfacts-contributors,1614525537,2021-02-28T15:18:57Z,1728034734,2024-10-04T09:38:54Z,fix-code-bot,1.729852e+09,2024-10-25T10:25:55Z,Farandole de madeleine,,,590 g,"Boîte en carton, Film en plastique","fr:boite-en-carton,fr:film-en-plastique","fr:boite-en-carton,fr:film-en-plastique",,Bijou,bijou,"Snacks, Snacks sucrés, Biscuits et gâteaux, Gâ...","en:snacks,en:sweet-snacks,en:biscuits-and-cake...","Snacks,Sweet snacks,Biscuits and cakes,Cakes,C...",,,,,,,,,,,,,,,,en:France,en:france,France,Madeleines ChocoNoir - Madeleines nappées de c...,"fr:madeleines-choconoir,fr:madeleines-nappees-...","en:palm-oil-content-unknown,en:non-vegan,en:ve...",,,,en:nuts,Nuts,,,off,5.0,,"en:e322,en:e322i,en:e331,en:e422,en:e500,en:e503","E322 - Lecithins,E322i - Lecithin,E331 - Sodiu...",,unknown,4.0,Sugary snacks,Biscuits and cakes,en:biscuits-and-cakes,"en:sugary-snacks,en:biscuits-and-cakes","Sugary snacks,Biscuits and cakes","en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,41.0,d,,590.0,,,2.0,"top-75-percent-scans-2020,top-80-percent-scans...",0.8000,1.614526e+09,2021-02-28T15:20:00Z,en:long-madeleines,Long madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.750000,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3498741,9999864004549,http://world-en.openfoodfacts.org/product/9999...,openfoodfacts-contributors,1564657971,2019-08-01T11:12:51Z,1564658722,2019-08-01T11:25:22Z,roboto-app,1.729988e+09,2024-10-27T00:08:44Z,Boudin à l’ancienne,,,240g,,,,,Biocoop Bordeaux lac,biocoop-bordeaux-lac,"Viandes et dérivés, Viandes, Charcuteries, Poi...","en:meats-and-their-products,en:meats,en:prepar...","Meats and their products,Meats,Prepared meats,...",,,,,,"Bio, Bio européen, FR-BIO-01","en:organic,en:eu-organic,en:fr-bio-01","Organic,EU Organic,FR-BIO-01",,,,,,,Biocoop,en:France,en:france,France,BIOCOOP BORDEAUX LAC distribue par les eleveur...,fr:biocoop-bordeaux-lac-distribue-par-les-elev...,"en:may-contain-palm-oil,en:non-vegan,en:non-ve...",,,,,,,,on,0.0,,,,,unknown,3.0,Fish Meat Eggs,Offals,en:offals,"en:fish-meat-eggs,en:offals","Fish‚ Meat‚ Eggs,Offals","en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,86.0,a,,240.0,,,1.0,"bottom-25-percent-scans-2019,bottom-20-percent...",0.7750,1.564658e+09,2019-08-01T11:12:52Z,en:sauteed-black-pudding,Sautéed black pudding,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.357143,,,,,,,,,,,,,,,,,
3498753,9999900401301,http://world-en.openfoodfacts.org/product/9999...,kiliweb,1515171187,2018-01-05T16:53:07Z,1515171193,2018-01-05T16:53:13Z,,1.707599e+09,2024-02-10T21:06:07Z,Kabanos,,,,,,,,Tarczynski,tarczynski,,,,,,,,,,,,,,,,,,,France,en:france,France,"KABANOSÈRÇ Sktadn' wiep -zEkrobia, Guszcz 'o.e...","fr:kabanoserc-sktadn-wiep,fr:zekrobia,fr:guszc...","en:palm-oil-content-unknown,en:vegan-status-un...",,,,,,,,,0.0,,,,,unknown,,unknown,unknown,,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,unknown,,,,,,,0.4750,1.515171e+09,2018-01-05T16:53:12Z,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,549.0,2297.0,,48.0,18.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.3,1.9,,,,,,,,,,,,,,24.0,,,,3.10,,1.240,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,,,,,,,,,,,,,,,,,
3498810,9999991042704,http://world-en.openfoodfacts.org/product/9999...,kiliweb,1525862784,2018-05-09T10:46:24Z,1718572213,2024-06-16T21:10:13Z,geodata,1.729873e+09,2024-10-25T16:12:01Z,Yaourt vanille,,,120 g,,,,,Patapain,patapain,Yaourts vanille,"en:dairies,en:fermented-foods,en:fermented-mil...","Dairies,Fermented foods,Fermented milk product...",,,,,,,,,FR 13.005.028 EC,fr-13-005-028-ec,"43.283333,5.566667",,aubagne-bouches-du-rhone-france,,,France,en:france,France,"Lait entier 77%, crème, sucre 7,5%, ferments l...","en:whole-milk,en:dairy,en:milk,en:cream,en:sug...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...",en:milk,,,"en:eggs,en:gluten,en:nuts","Eggs,Gluten,Nuts",,,,0.0,,,,7.0,c,3.0,Milk and dairy products,Milk and yogurt,en:milk-and-yogurt,"en:milk-and-dairy-products,en:milk-and-yogurt","Milk and dairy products,Milk and yogurt","en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,67.0,b,"en:fat-in-moderate-quantity,en:saturated-fat-i...",120.0,,,1.0,"bottom-25-percent-scans-2019,bottom-20-percent...",0.7750,1.718572e+09,2024-06-16T21:08:23Z,en:vanilla-yogurt,Vanilla yogurt,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,128.0,536.0,,7.6,5.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12.0,11.0,,,,,,,,,,,,,,2.7,,,,0.09,,0.036,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,,,,,,7.0,,,,,,,,,,,
3498834,9999999004360,http://world-en.openfoodfacts.org/product/9999...,kiliweb,1548086277,2019-01-21T15:57:57Z,1682697806,2023-04-28T16:03:26Z,roboto-app,1.707611e+09,2024-02-11T00:22:00Z,Minis beignets,,"Minis beignets Beignet au sucre, surgelé, prêt...",9 x 15 g,,,,,,,"Snacks, Snacks sucrés, Beignets sucrés","en:snacks,en:sweet-snacks,en:sweet-fritters","Snacks,Sweet snacks,Sweet Fritters",,,,,,,,,,,,,,,,France,en:france,France,"MATIÈRE GRASSE DU LAIT BABEURRE (34%), FARINE ...","fr:matiere-grasse-du-lait-babeurre,fr:farine-d...","en:palm-oil,en:non-vegan,en:vegetarian-status-...",,,,en:sesame-seeds,Sesame seeds,,,,5.0,,"en:e322,en:e322i,en:e412,en:e450,en:e471,en:e5...","E322 - Lecithins,E322i - Lecithin,E412 - Guar ...",15.0,d,4.0,unknown,unknown,,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,unknown,"en:fat-in-high-quantity,en:saturated-fat-in-hi...",135.0,,,,,0.5750,1.548086e+09,2019-01-21T15:58:00Z,en:sweet-fritters,Sweet Fritters,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,333.0,1393.0,,20.2,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,30.9,10.4,,,,,,,,,,,,,,7.4,,,,1.10,,0.440,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,,,,,,15.0,,,,,,,,,,,


## Inspection des Données Brutes

In [3]:
# Basic stats
print("Taille du dataset:")
print("Number of rows : {}".format(rawdata.shape[0]))
print("Number of columns : {}".format(rawdata.shape[1]))
print()
print("---------------------------")
print()

print("Basics infos:")
print()
display(rawdata.info())
print()
print("---------------------------")
print()

print("Basics statistics: ")
print()
data_desc = rawdata.describe(include='all')
display(data_desc)
print()
print("---------------------------")
print()

print("Unique elements by feature: ")
print()
display(rawdata.nunique().sort_values())
print()
print("---------------------------")
print()

print("Percentage of missing values: ")
print()
null_percentage = 100 * rawdata.isnull().mean()
null_percentage_df = null_percentage.to_frame(name='Null Percentage').T
display(null_percentage_df) 

Taille du dataset:
Number of rows : 262446
Number of columns : 206

---------------------------

Basics infos:

<class 'pandas.core.frame.DataFrame'>
Index: 262446 entries, 4 to 3498839
Columns: 206 entries, code to acidity_100g
dtypes: float64(131), int64(2), object(73)
memory usage: 414.5+ MB


None


---------------------------

Basics statistics: 



Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,last_modified_by,last_updated_t,last_updated_datetime,product_name,abbreviated_product_name,generic_name,quantity,packaging,packaging_tags,packaging_en,packaging_text,brands,brands_tags,categories,categories_tags,categories_en,origins,origins_tags,origins_en,manufacturing_places,manufacturing_places_tags,labels,labels_tags,labels_en,emb_codes,emb_codes_tags,first_packaging_code_geo,cities,cities_tags,purchase_places,stores,countries,countries_tags,countries_en,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,allergens_en,traces,traces_tags,traces_en,serving_size,serving_quantity,no_nutrition_data,additives_n,additives,additives_tags,additives_en,nutriscore_score,nutriscore_grade,nova_group,pnns_groups_1,pnns_groups_2,food_groups,food_groups_tags,food_groups_en,states,states_tags,states_en,brand_owner,ecoscore_score,ecoscore_grade,nutrient_levels_tags,product_quantity,owner,data_quality_errors_tags,unique_scans_n,popularity_tags,completeness,last_image_t,last_image_datetime,main_category,main_category_en,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kj_100g,energy-kcal_100g,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,butyric-acid_100g,caproic-acid_100g,caprylic-acid_100g,capric-acid_100g,lauric-acid_100g,myristic-acid_100g,palmitic-acid_100g,stearic-acid_100g,arachidic-acid_100g,behenic-acid_100g,lignoceric-acid_100g,cerotic-acid_100g,montanic-acid_100g,melissic-acid_100g,unsaturated-fat_100g,monounsaturated-fat_100g,omega-9-fat_100g,polyunsaturated-fat_100g,omega-3-fat_100g,omega-6-fat_100g,alpha-linolenic-acid_100g,eicosapentaenoic-acid_100g,docosahexaenoic-acid_100g,linoleic-acid_100g,arachidonic-acid_100g,gamma-linolenic-acid_100g,dihomo-gamma-linolenic-acid_100g,oleic-acid_100g,elaidic-acid_100g,gondoic-acid_100g,mead-acid_100g,erucic-acid_100g,nervonic-acid_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,added-sugars_100g,sucrose_100g,glucose_100g,fructose_100g,lactose_100g,maltose_100g,maltodextrins_100g,starch_100g,polyols_100g,erythritol_100g,fiber_100g,soluble-fiber_100g,insoluble-fiber_100g,proteins_100g,casein_100g,serum-proteins_100g,nucleotides_100g,salt_100g,added-salt_100g,sodium_100g,alcohol_100g,vitamin-a_100g,beta-carotene_100g,vitamin-d_100g,vitamin-e_100g,vitamin-k_100g,vitamin-c_100g,vitamin-b1_100g,vitamin-b2_100g,vitamin-pp_100g,vitamin-b6_100g,vitamin-b9_100g,folates_100g,vitamin-b12_100g,biotin_100g,pantothenic-acid_100g,silica_100g,bicarbonate_100g,potassium_100g,chloride_100g,calcium_100g,phosphorus_100g,iron_100g,magnesium_100g,zinc_100g,copper_100g,manganese_100g,fluoride_100g,selenium_100g,chromium_100g,molybdenum_100g,iodine_100g,caffeine_100g,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-dried_100g,fruits-vegetables-nuts-estimate_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,carbon-footprint-from-meat-or-fish_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g,sulphate_100g,nitrate_100g,acidity_100g
count,262446.0,262446,262445,262446.0,262446,262446.0,262446,255918,262384.0,262384,259542,14365.0,72756,193203,131260,131260,131260,12133,247022,247013,224425,224425,224425,50696,50660,50659,62115,62091,162683,162683,162683,62206,62195,48498,0.0,51890,77227,113823,262446,262446,262446,262445,262446,262444,101898,0.0,60167,78640,77682,65262,63302.0,13393,262446.0,2,134324,134324,193651.0,262428,226888.0,262445,262446,202214,202214,202214,262446,262446,262446,3884,161513.0,262438,195942,191720.0,50955,8612,191012.0,187639,262446.0,256179.0,256179,224425,224425,243009,243009,228118,228118,185463,185463,85997.0,217907.0,235753.0,28.0,235497.0,233782.0,4.0,4.0,3.0,3.0,4.0,2.0,3.0,2.0,22.0,8.0,1.0,1.0,3.0,7.0,23.0,1767.0,38.0,1777.0,1297.0,272.0,559.0,57.0,129.0,423.0,52.0,1.0,1.0,27.0,0.0,1.0,2.0,0.0,2.0,561.0,644.0,235376.0,234382.0,134.0,19.0,16.0,32.0,180.0,11.0,83.0,170.0,933.0,9.0,131150.0,4.0,2.0,235344.0,48.0,39.0,19.0,232179.0,0.0,232179.0,7645.0,998.0,35.0,1443.0,1558.0,276.0,2156.0,1389.0,1031.0,1021.0,1282.0,1013.0,18.0,981.0,413.0,743.0,51.0,207.0,1170.0,387.0,4829.0,1120.0,2098.0,1724.0,642.0,364.0,376.0,225.0,310.0,45.0,43.0,379.0,75.0,55.0,193.0,8369.0,796.0,8127.0,262444.0,279.0,3121.0,2.0,245.0,11471.0,193653.0,2.0,4.0,0.0,48.0,6.0,10.0,39.0,32.0,17.0,27.0,3.0
unique,261662.0,261668,4907,,257802,,230022,6587,,154956,200193,13357.0,52181,17499,22398,18531,18516,5722,57908,42433,63700,49862,49855,8397,6562,6502,12943,11536,46308,39319,39319,17632,15280,2868,,5780,5382,8035,60,1,1,237469,200778,37,3280,,5692,6096,6095,8421,,4,,2,33903,33903,,7,,11,40,45,45,45,2408,2408,2408,74,,9,225,,219,173,,86978,,,254506,17144,17142,242255,242255,227393,227393,184881,184881,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
top,3017800180426.0,http://world-en.openfoodfacts.org/product/3017...,kiliweb,,2021-04-20T10:35:38Z,,2023-10-26T17:32:26Z,packbot,,2024-10-26T19:22:56Z,Comté,150.0,Pâtes alimentaires de qualité supérieure,500 g,"Plastique, Sachet","en:plastic,en:bag","Plastic,Bag",1 sachet plastique à jeter,Carrefour,carrefour,Boissons,en:beverages,Beverages,France,en:france,France,France,france,Point Vert,en:green-dot,Green Dot,EMB 56251E,emb-56251e,"47.833333,-0.333333",,sable-sur-sarthe-sarthe-france,France,Carrefour,France,en:france,France,Poulet,"en:superior-quality-durum-wheat-semolina,en:ce...","en:palm-oil-free,en:vegan,en:vegetarian",en:milk,,en:nuts,en:nuts,Nuts,100g,,on,,[ farine-de-ble -> fr:farine-de-ble ] [ far...,"en:e322,en:e322i","E322 - Lecithins,E322i - Lecithin",,unknown,,unknown,unknown,en:biscuits-and-cakes,"en:sugary-snacks,en:biscuits-and-cakes","Sugary snacks,Biscuits and cakes","en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",Casino,,unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...",,org-carrefour,en:energy-value-in-kcal-does-not-match-value-c...,,"bottom-25-percent-scans-2019,bottom-20-percent...",,,2021-07-06T13:01:18Z,en:groceries,Groceries,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
freq,2.0,2,164342,,13,,23,48909,,34,204,18.0,275,8667,6996,7464,7464,521,5706,5909,1668,1739,1739,20889,21334,21336,22922,23766,10079,10872,10872,346,350,756,,536,38289,8446,229398,262446,262446,518,785,45476,23942,,5372,7498,7498,5560,,10679,,1,6771,6771,,57958,,61029,61029,17610,17610,17610,20867,20867,20867,2313,,96770,22330,,8370,3323,,6297,,,4,8801,8801,2,2,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
mean,,,,1527818000.0,,1668274000.0,,,1722693000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,59402290.0,,1.534716,,,,8.424036,,3.332873,,,,,,,,,,55.292809,,,454.639577,,,4.643771,,0.754019,1589367000.0,,,,,,,,,,1061.092244,65165410000.0,252382800000.0,763.9,13.539812,5.254001,6.84075,0.231506,3.314367,2.621,36.7925,9.51,3.886,32.47,0.292605,2.597562,0.003,0.0,20.760333,0.040721,12.865217,24.670276,84.320439,12.470353,4.237112,15.981691,1.348583,1.839791,0.841817,2.19903,0.055554,0.6,5.35,38.494037,,0.0023,0.0,,7.0071,0.191689,0.128027,27.396272,13.686511,15.921403,12.939023,14.118931,33.346875,35.192159,7.758274,4.470129,30.338441,60.326955,11.407778,2.957124,2.25,2.95,8.364608,3.643506,5.602054,5.954821,1.151902,,0.460968,4.523002,2.224969,1.135413,0.0264909,0.10945,0.013057,0.24251,0.043429,0.07843,0.504083,0.161866,2.596676,5e-05,0.2667345,0.429925,0.033273,1.779756,0.118012,1.284519,4.148063,0.552229,0.434461,0.074108,0.672424,0.072437,0.013241,0.144039,0.008291,1.258499,0.04168811,0.843189,0.450522,0.497122,1.528764,6.727772,32.711966,10.630528,47.395401,18.953396,14.916487,50.560818,1.879,236.850844,617.556837,8.424088,9.5,37.175,,0.104835,1.966691,4.09,0.039503,0.010034,0.137547,0.079228,4.463333
std,,,,72937050.0,,47905280.0,,,10191810.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,14945520000.0,,2.334788,,,,9.006878,,1.024788,,,,,,,,,,28.200915,,,2617.209263,,,12.18971,,0.176202,81047550.0,,,,,,,,,,753.672078,30419550000000.0,122543000000000.0,943.452202,16.981526,7.936301,13.439585,0.445831,5.729307,3.31831,23.818779,13.279465,3.584855,41.761726,0.842357,7.033452,,,34.850428,0.052201,30.47787,26.016964,278.682433,16.999863,39.550699,29.057123,5.833911,7.177955,2.815056,6.189702,0.068529,,,28.291801,,,0.0,,9.889454,1.031835,1.438917,27.364661,22.842566,20.051772,27.13252,18.557303,29.308173,232.626984,14.854779,17.07452,25.017773,54.082527,23.133451,13.100406,1.93477,2.899138,9.225805,3.117017,15.342642,11.161286,7.316836,,2.928417,8.363204,34.440498,6.415153,0.3994638,1.676563,0.198817,3.040437,0.560016,1.736091,12.553398,2.376406,78.608404,3.6e-05,6.456718,5.434498,0.335736,11.194348,0.459,15.547818,79.293585,7.439983,3.913648,0.701343,12.827831,0.711181,0.062906,2.340551,0.049028,17.245658,0.2623974,3.770973,5.474969,1.148727,5.279059,1.143104,37.50216,27.715552,30.211995,48.108579,3.238563,22.90616,0.429921,339.512034,6184.440843,9.006858,12.020815,10.889865,,0.141583,4.817318,1.890003,0.02777,0.00494,0.346106,0.384357,3.83328
min,,,,1328021000.0,,1362683000.0,,,1706029000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,0.0,,,,-20.0,,1.0,,,,,,,,,,-30.0,,,0.0,,,1.0,,0.1,1328987000.0,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.063,0.0,0.0042,0.583,1.07,0.12,0.108,2.94,0.0,0.001,0.003,0.0,0.281,0.00034,0.0,0.0,0.0055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,5.35,0.027,,0.0023,0.0,,0.0142,0.0,0.0,0.0,-0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.9,0.0,0.0014,0.0003,0.0025,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4e-07,0.0,0.0,0.0,0.0,0.87,0.0,0.0,0.0,-65.887256,3.7,0.0,1.575,0.0,0.049,-20.0,1.0,26.0,,0.0,3e-07,0.5,0.0033,0.0008,2e-05,0.0,0.39
25%,,,,1494173000.0,,1644552000.0,,,1707650000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,30.0,,0.0,,,,1.0,,3.0,,,,,,,,,,35.0,,,160.0,,,1.0,,0.5875,1519127000.0,,,,,,,,,,426.0,111.7584,464.0,94.25,1.1,0.3,0.10575,1.7e-05,0.00655,0.7065,36.3425,4.815,2.209,17.705,6.6e-05,0.003325,0.003,0.0,0.6405,0.006455,0.0,3.0,27.025,2.0,0.41,1.45,0.052,0.1,0.059,0.354,0.01,0.6,5.35,10.375,,0.0023,0.0,,3.51065,0.0,0.0,3.4,0.8,0.0,0.15,0.325,2.375,0.01,0.005006,0.099,8.625,27.0,0.0,0.2,0.95,1.925,1.7,1.055,0.3,0.022,0.051,,0.020474,0.0,6.8e-05,2.9e-05,7.5e-07,0.0018,5e-06,0.012,0.00021,0.00023,0.0028,0.00025,3e-05,1.6e-05,4e-07,5e-06,0.0009,0.0015,0.01731,0.073,0.002,0.115,0.138,0.002,0.03,0.0008,0.0001,4e-05,1.5e-05,5e-06,1e-05,4e-06,1.5e-05,0.02,0.035,6.0,0.0,0.0,17.0,0.0,12.0,32.0,1.727,0.0,106.465,1.0,5.25,29.0,,0.06,3.15e-05,3.1,0.025,0.006875,0.0014,0.0001,2.695
50%,,,,1517339000.0,,1682667000.0,,,1729883000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100.0,,1.0,,,,8.0,,4.0,,,,,,,,,,56.0,,,290.0,,,2.0,,0.775,1600427000.0,,,,,,,,,,952.0,252.0,1042.0,471.5,7.2,2.0,0.15,0.013011,0.0089,0.83,48.55,9.51,4.31,32.47,0.00603,0.0245,0.003,0.0,1.0,0.025,0.0,15.0,42.0,6.0,1.8,8.0,0.087,0.2,0.12,0.486,0.0455,0.6,5.35,33.0,,0.0023,0.0,,7.0071,0.0,0.0,15.0,3.6,6.03,0.3,5.1,28.5,5.95,0.231,0.12,29.1,66.0,0.0,1.6,1.6,2.95,6.1,2.9,1.9,0.024,0.5,,0.2,0.0,0.00014,0.0001,1.3e-06,0.006,1.3e-05,0.02,0.0005,0.00078,0.0064,0.000623,9.3e-05,3.9e-05,1.3e-06,1e-05,0.0023,0.00327,0.034,0.2305,0.0322,0.12,0.225,0.004,0.08734,0.002,0.000371,0.00032,5e-05,1.7e-05,1.3e-05,1.5e-05,4.6e-05,0.032,0.039,7.0,13.69,0.0,50.0,0.03,15.0,51.0,1.879,126.0,325.0,8.0,9.5,37.0,,0.07,3.4e-05,4.0,0.029,0.00905,0.005,0.00043,5.0
75%,,,,1556192000.0,,1701680000.0,,,1729944000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,150.0,,2.0,,,,15.0,,4.0,,,,,,,,,,76.0,,,500.0,,,4.0,,0.8875,1652449000.0,,,,,,,,,,1569.0,393.0,1637.0,1147.5,21.0,7.3,6.885,0.2445,4.96945,3.64,49.0,14.205,5.775,47.235,0.265,0.35475,0.003,0.0,31.0,0.0484,1.9,39.0,62.55,13.3,3.21,21.125,0.397,0.7,0.5,0.6965,0.082,0.6,5.35,67.0,,0.0023,0.0,,10.50355,0.1,0.016875,52.0,19.1,31.275,4.92,21.55,55.35,41.925,3.9,0.905,49.75,94.0,6.67,3.5,2.9,3.975,12.0,5.4,5.3,7.25,1.2,,0.48,6.0,0.00045,0.0725,5e-06,0.013,3.6e-05,0.033,0.00091,0.0012,0.0133,0.0012,0.000166,8.8e-05,2.1e-06,1.8e-05,0.005,0.03385,0.1115,0.57925,0.1615,0.27,0.35,0.0078,0.15925,0.0042,0.00068,0.0015,0.00017,2.8e-05,4e-05,5e-05,9.5e-05,0.1,0.4,7.5,60.0,0.0,65.0,22.219062,15.0,70.0,2.031,335.0,615.0,15.0,13.75,45.175,,0.1565,4.25e-05,5.275,0.051,0.015,0.0174,0.001,6.5



---------------------------

Unique elements by feature: 



allergens_en                0
water-hardness_100g         0
cities                      0
added-salt_100g             0
elaidic-acid_100g           0
                        ...  
last_image_t           254506
created_datetime       257802
created_t              257802
code                   261662
url                    261668
Length: 206, dtype: int64


---------------------------

Percentage of missing values: 



Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,last_modified_by,last_updated_t,last_updated_datetime,product_name,abbreviated_product_name,generic_name,quantity,packaging,packaging_tags,packaging_en,packaging_text,brands,brands_tags,categories,categories_tags,categories_en,origins,origins_tags,origins_en,manufacturing_places,manufacturing_places_tags,labels,labels_tags,labels_en,emb_codes,emb_codes_tags,first_packaging_code_geo,cities,cities_tags,purchase_places,stores,countries,countries_tags,countries_en,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,allergens_en,traces,traces_tags,traces_en,serving_size,serving_quantity,no_nutrition_data,additives_n,additives,additives_tags,additives_en,nutriscore_score,nutriscore_grade,nova_group,pnns_groups_1,pnns_groups_2,food_groups,food_groups_tags,food_groups_en,states,states_tags,states_en,brand_owner,ecoscore_score,ecoscore_grade,nutrient_levels_tags,product_quantity,owner,data_quality_errors_tags,unique_scans_n,popularity_tags,completeness,last_image_t,last_image_datetime,main_category,main_category_en,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kj_100g,energy-kcal_100g,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,butyric-acid_100g,caproic-acid_100g,caprylic-acid_100g,capric-acid_100g,lauric-acid_100g,myristic-acid_100g,palmitic-acid_100g,stearic-acid_100g,arachidic-acid_100g,behenic-acid_100g,lignoceric-acid_100g,cerotic-acid_100g,montanic-acid_100g,melissic-acid_100g,unsaturated-fat_100g,monounsaturated-fat_100g,omega-9-fat_100g,polyunsaturated-fat_100g,omega-3-fat_100g,omega-6-fat_100g,alpha-linolenic-acid_100g,eicosapentaenoic-acid_100g,docosahexaenoic-acid_100g,linoleic-acid_100g,arachidonic-acid_100g,gamma-linolenic-acid_100g,dihomo-gamma-linolenic-acid_100g,oleic-acid_100g,elaidic-acid_100g,gondoic-acid_100g,mead-acid_100g,erucic-acid_100g,nervonic-acid_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,added-sugars_100g,sucrose_100g,glucose_100g,fructose_100g,lactose_100g,maltose_100g,maltodextrins_100g,starch_100g,polyols_100g,erythritol_100g,fiber_100g,soluble-fiber_100g,insoluble-fiber_100g,proteins_100g,casein_100g,serum-proteins_100g,nucleotides_100g,salt_100g,added-salt_100g,sodium_100g,alcohol_100g,vitamin-a_100g,beta-carotene_100g,vitamin-d_100g,vitamin-e_100g,vitamin-k_100g,vitamin-c_100g,vitamin-b1_100g,vitamin-b2_100g,vitamin-pp_100g,vitamin-b6_100g,vitamin-b9_100g,folates_100g,vitamin-b12_100g,biotin_100g,pantothenic-acid_100g,silica_100g,bicarbonate_100g,potassium_100g,chloride_100g,calcium_100g,phosphorus_100g,iron_100g,magnesium_100g,zinc_100g,copper_100g,manganese_100g,fluoride_100g,selenium_100g,chromium_100g,molybdenum_100g,iodine_100g,caffeine_100g,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-dried_100g,fruits-vegetables-nuts-estimate_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,carbon-footprint-from-meat-or-fish_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g,sulphate_100g,nitrate_100g,acidity_100g
Null Percentage,0.0,0.0,0.000381,0.0,0.0,0.0,0.0,2.487369,0.023624,0.023624,1.106513,94.526493,72.277726,26.383713,49.985902,49.985902,49.985902,95.376954,5.877019,5.880448,14.487171,14.487171,14.487171,80.683264,80.696981,80.697363,76.332274,76.341419,38.012772,38.012772,38.012772,76.2976,76.301792,81.52077,100.0,80.228314,70.574137,56.629935,0.0,0.0,0.0,0.000381,0.0,0.000762,61.173727,100.0,77.074522,70.035741,70.400768,75.13317,75.879991,94.896855,0.0,99.999238,48.818424,48.818424,26.213011,0.006859,13.548692,0.000381,0.0,22.950245,22.950245,22.950245,0.0,0.0,0.0,98.520077,38.458578,0.003048,25.34007,26.948782,80.584577,96.718563,27.218552,28.503768,0.0,2.38792,2.38792,14.487171,14.487171,7.406095,7.406095,13.080024,13.080024,29.332891,29.332891,67.232497,16.970729,10.170854,99.989331,10.268398,10.921866,99.998476,99.998476,99.998857,99.998857,99.998476,99.999238,99.998857,99.999238,99.991617,99.996952,99.999619,99.999619,99.998857,99.997333,99.991236,99.326719,99.985521,99.322908,99.505803,99.89636,99.787004,99.978281,99.950847,99.838824,99.980186,99.999619,99.999619,99.989712,100.0,99.999619,99.999238,100.0,99.999238,99.786242,99.754616,10.314503,10.693247,99.948942,99.99276,99.993904,99.987807,99.931414,99.995809,99.968374,99.935225,99.644498,99.996571,50.027815,99.998476,99.999238,10.326696,99.981711,99.98514,99.99276,11.532658,100.0,11.532658,97.08702,99.619731,99.986664,99.450173,99.406354,99.894836,99.178498,99.470748,99.607157,99.610968,99.511519,99.614016,99.993141,99.626209,99.842634,99.716894,99.980567,99.921127,99.554194,99.852541,98.160002,99.573246,99.200597,99.343103,99.755378,99.861305,99.856732,99.914268,99.88188,99.982854,99.983616,99.855589,99.971423,99.979043,99.926461,96.811154,99.6967,96.903363,0.000762,99.893692,98.810803,99.999238,99.906647,95.629196,26.212249,99.999238,99.998476,100.0,99.981711,99.997714,99.99619,99.98514,99.987807,99.993522,99.989712,99.998857


**Notes :**

Il y a beaucoup de données manquantes.

Nous allons dans un premier temps nous concentrer sur les données des produits vendus en France.

## Analyse Exploratoire des Données

In [4]:
# Find different states
unique_values = set()
rawdata['states_tags'].str.split(',').apply(unique_values.update)

states_df = pd.DataFrame(data=unique_values, columns=['states'])
display(states_df)

Unnamed: 0,states
0,en:photos-uploaded
1,en:characteristics-completed
2,en:packaging-photo-to-be-selected
3,en:ingredients-photo-to-be-selected
4,en:characteristics-to-be-completed
5,en:categories-to-be-completed
6,en:ingredients-completed
7,en:front-photo-to-be-selected
8,en:expiration-date-completed
9,en:categories-completed


In [5]:
# Vérifier que la colonne 'state' existe
if 'state' in rawdata.columns:
    # Compter les occurrences de chaque état
    states_count = rawdata['state'].value_counts().reset_index()
    states_count.columns = ['state', 'count']
    
    # Afficher le tableau
    print(states_count)
else:
    print("La colonne 'state' n'existe pas dans le dataset.")

La colonne 'state' n'existe pas dans le dataset.


**Notes :**

Il y a beaucoup de colonnes, nous allons sélectionner celles qui potentiellement pourront aider nos futurs modèles à apprendre et à répondre à notre problématique.

In [6]:
columns_to_keep = [
    'code',
    'url',
    'created_datetime',
    'last_modified_datetime',
    'product_name',
    'packaging_tags',
    'categories_tags',
    'ingredients_tags',
    'ingredients_analysis_tags',
    'allergens',
    'traces_tags',
    'additives_n',
    'additives_tags',
    'nutriscore_grade',
    'food_groups_tags',
    'states_tags',
    'ecoscore_score',
    'ecoscore_grade',
    'nutrient_levels_tags',
    'popularity_tags',
    'main_category',
    'image_url',
    'image_small_url',
    'image_ingredients_url',
    'image_ingredients_small_url',
    'image_nutrition_url',
    'image_nutrition_small_url',
    'energy-kcal_100g',
    'energy_100g',
    'fat_100g',
    'saturated-fat_100g',
    'cholesterol_100g',
    'sugars_100g',
    'proteins_100g',
    'salt_100g',
    'fruits-vegetables-nuts-estimate-from-ingredients_100g'
]

data = rawdata[columns_to_keep]
display(data)

Unnamed: 0,code,url,created_datetime,last_modified_datetime,product_name,packaging_tags,categories_tags,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_n,additives_tags,nutriscore_grade,food_groups_tags,states_tags,ecoscore_score,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kcal_100g,energy_100g,fat_100g,saturated-fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g
4,5,http://world-en.openfoodfacts.org/product/0000...,2020-11-14T07:08:40Z,2024-10-20T14:02:34Z,Bio inulin,en:glass,"en:plant-based-foods-and-beverages,en:plant-ba...","en:apricot-juice-and-puree,en:fruit,en:prunus-...","en:palm-oil-free,en:vegan,en:vegetarian",,,0.0,,a,"en:fruits-and-vegetables,en:vegetables","en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...",,en:vegetables,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,208.0,840.0,0.0,0.0,,8.0,0.0,0.23,50.000000
9,10,http://world-en.openfoodfacts.org/product/0000...,2016-10-20T07:19:01Z,2024-10-04T09:38:47Z,Madeleines nature,"en:plastic,en:cardboard,fr:boite-en-carton,fr:...","en:snacks,en:desserts,en:sweet-snacks,en:biscu...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...","en:eggs,en:gluten,en:milk","en:nuts,en:soybeans",3.0,"en:e331,en:e422,en:e503",d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",53.0,c,"en:fat-in-high-quantity,en:saturated-fat-in-mo...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:plain-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,442.0,1852.0,22.0,2.6,,25.0,6.4,0.53,22.666667
14,15,http://world-en.openfoodfacts.org/product/0000...,2018-04-15T16:43:14Z,2024-10-04T09:38:50Z,Madeleines ChocoLait,"en:plastic,en:cardboard","en:snacks,en:sweet-snacks,en:biscuits-and-cake...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:maybe-vegetarian",,en:nuts,5.0,"en:e322,en:e322i,en:e331,en:e422,en:e500,en:e503",d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",32.0,d,"en:fat-in-high-quantity,en:saturated-fat-in-hi...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:chocolate-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,460.0,1926.0,24.0,6.0,,31.0,6.4,0.48,16.250000
18,20,http://world-en.openfoodfacts.org/product/0000...,2018-09-14T13:14:06Z,2024-10-04T09:38:53Z,Madeleines Choco Noir,fr:1-boite-en-carton-a-recycler-50-sachets-ind...,"en:snacks,en:sweet-snacks,en:biscuits-and-cake...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...","en:eggs,en:gluten,en:milk,en:soybeans",en:nuts,5.0,"en:e322,en:e331,en:e422,en:e500,en:e503",d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-checked,en:complete,en:nutrition-fact...",39.0,d,"en:fat-in-high-quantity,en:saturated-fat-in-hi...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:chocolate-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,467.0,1953.0,25.0,6.1,,29.0,6.3,0.45,16.250000
20,22,http://world-en.openfoodfacts.org/product/0000...,2021-02-28T15:18:57Z,2024-10-04T09:38:54Z,Farandole de madeleine,"fr:boite-en-carton,fr:film-en-plastique","en:snacks,en:sweet-snacks,en:biscuits-and-cake...","fr:madeleines-choconoir,fr:madeleines-nappees-...","en:palm-oil-content-unknown,en:non-vegan,en:ve...",,en:nuts,5.0,"en:e322,en:e322i,en:e331,en:e422,en:e500,en:e503",unknown,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",41.0,d,,"top-75-percent-scans-2020,top-80-percent-scans...",en:long-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,1.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3498741,9999864004549,http://world-en.openfoodfacts.org/product/9999...,2019-08-01T11:12:51Z,2019-08-01T11:25:22Z,Boudin à l’ancienne,,"en:meats-and-their-products,en:meats,en:prepar...",fr:biocoop-bordeaux-lac-distribue-par-les-elev...,"en:may-contain-palm-oil,en:non-vegan,en:non-ve...",,,0.0,,unknown,"en:fish-meat-eggs,en:offals","en:to-be-completed,en:nutrition-facts-complete...",86.0,a,,"bottom-25-percent-scans-2019,bottom-20-percent...",en:sauteed-black-pudding,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,,,5.357143
3498753,9999900401301,http://world-en.openfoodfacts.org/product/9999...,2018-01-05T16:53:07Z,2018-01-05T16:53:13Z,Kabanos,,,"fr:kabanoserc-sktadn-wiep,fr:zekrobia,fr:guszc...","en:palm-oil-content-unknown,en:vegan-status-un...",,,0.0,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,549.0,2297.0,48.0,18.0,,1.9,24.0,3.10,0.000000
3498810,9999991042704,http://world-en.openfoodfacts.org/product/9999...,2018-05-09T10:46:24Z,2024-06-16T21:10:13Z,Yaourt vanille,,"en:dairies,en:fermented-foods,en:fermented-mil...","en:whole-milk,en:dairy,en:milk,en:cream,en:sug...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...",en:milk,"en:eggs,en:gluten,en:nuts",0.0,,c,"en:milk-and-dairy-products,en:milk-and-yogurt","en:to-be-completed,en:nutrition-facts-complete...",67.0,b,"en:fat-in-moderate-quantity,en:saturated-fat-i...","bottom-25-percent-scans-2019,bottom-20-percent...",en:vanilla-yogurt,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,128.0,536.0,7.6,5.1,,11.0,2.7,0.09,0.000000
3498834,9999999004360,http://world-en.openfoodfacts.org/product/9999...,2019-01-21T15:57:57Z,2023-04-28T16:03:26Z,Minis beignets,,"en:snacks,en:sweet-snacks,en:sweet-fritters","fr:matiere-grasse-du-lait-babeurre,fr:farine-d...","en:palm-oil,en:non-vegan,en:vegetarian-status-...",,en:sesame-seeds,5.0,"en:e322,en:e322i,en:e412,en:e450,en:e471,en:e5...",d,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-high-quantity,en:saturated-fat-in-hi...",,en:sweet-fritters,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,333.0,1393.0,20.2,6.0,,10.4,7.4,1.10,0.000000


In [7]:
# Liste des états cibles
target_states = ["en:completed", "en:origins-completed", "en:ingredients-completed"]

# Construire une expression régulière pour les états
pattern = '|'.join(target_states)

# Compter les lignes contenant au moins un des états cibles
count = data['states_tags'].str.contains(pattern).sum()

print("Nombre de lignes avec les états spécifiés:", count)


Nombre de lignes avec les états spécifiés: 258361


In [8]:
data = data[data['states_tags'].str.contains(pattern)]

In [9]:
count_data_allergen = data['allergens'].notna().sum()
print("Nombre de lignes avec des allergens:", count_data_allergen)

Nombre de lignes avec des allergens: 100665


**Notes:**

Nous allons remplacer les valeurs de la colonne allergens non renseignées par la valeur `en:none`.

In [10]:
data.loc[:, 'allergens'] = data['allergens'].astype(str)
data['allergens'] = data['allergens'].str.lower()
data['allergens'] = data['allergens'].apply(lambda x: 'en:none' if x == 'nan' else x)
data['allergens'].value_counts()

allergens
en:none                                                                       158457
en:milk                                                                        23667
en:gluten                                                                      12388
en:gluten,en:milk                                                               5871
en:eggs,en:gluten,en:milk                                                       5612
                                                                               ...  
en:eggs,fr:omelette                                                                1
en:celery,en:eggs,en:gluten,en:milk,fr:curry                                       1
en:celery,en:crustaceans,en:eggs,en:gluten,en:milk,en:molluscs,en:soybeans         1
en:eggs,en:gluten,en:milk,fr:thon albacore 45 %,fr:colin d'alaska 30 %             1
en:eggs,en:gluten,en:milk,en:nuts,en:peanuts,fr:biscuit spéculoos                  1
Name: count, Length: 3144, dtype: int64

In [11]:
# Divise chaque entrée par ',' et compile tous les allergènes en une seule liste
allergens_list = data["allergens"].fillna('').str.split(',').sum()

# Compte les occurrences de chaque allergène
allergen_counts = Counter(allergens_list)

# Convertir le résultat en DataFrame
allergens_df = pd.DataFrame(allergen_counts.items(), columns=['allergen', 'count']).sort_values(by='count', ascending=False)
display(allergens_df)

Unnamed: 0,allergen,count
0,en:none,158511
3,en:milk,56302
2,en:gluten,44189
1,en:eggs,20899
4,en:soybeans,15370
...,...,...
725,fr:disufite de sodium,1
726,fr:lait entier crème,1
727,fr:cacahete,1
728,fr:emmental français,1


**Notes :**

On peu se fier seulement aux allergènes contenant plus de 100 produits dans la base de données.

Nous pourronts mettre les autres dans une autre catégories nommée `other`.

In [12]:
display(allergens_df[allergens_df['count'] > 100])

Unnamed: 0,allergen,count
0,en:none,158511
3,en:milk,56302
2,en:gluten,44189
1,en:eggs,20899
4,en:soybeans,15370
6,en:nuts,10530
8,en:fish,8302
10,en:sulphur-dioxide-and-sulphites,7400
7,en:mustard,5597
5,en:celery,4356


**Notes:**

Nous allons considérer que la valeur vide correspond à pas d'allergène : `en:none`.

Nous allons mettre les allergènes qui ne sont pas dans cette liste dans une catégorie `other`.

Nous pouvons fusionner les valeurs `fr:Non` et `en:none` et traduire `fr:avoine` en `en:oats`.

In [13]:
# Liste des allergènes à conserver (ceux dont le count est supérieur à 100)
allergens_to_keep = allergens_df[allergens_df['count'] > 100]['allergen'].tolist()
allergens_to_keep.append('en:none') # Ajoute l'entrée none

data['allergens'] = data['allergens'].apply(lambda x: 'other' if x not in allergens_to_keep  else x)
data['allergens'] = data['allergens'].apply(lambda x: 'en:oats' if x == 'fr:avoine' else x)
data['allergens'].value_counts()

display(data)

Unnamed: 0,code,url,created_datetime,last_modified_datetime,product_name,packaging_tags,categories_tags,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_n,additives_tags,nutriscore_grade,food_groups_tags,states_tags,ecoscore_score,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kcal_100g,energy_100g,fat_100g,saturated-fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g
4,5,http://world-en.openfoodfacts.org/product/0000...,2020-11-14T07:08:40Z,2024-10-20T14:02:34Z,Bio inulin,en:glass,"en:plant-based-foods-and-beverages,en:plant-ba...","en:apricot-juice-and-puree,en:fruit,en:prunus-...","en:palm-oil-free,en:vegan,en:vegetarian",en:none,,0.0,,a,"en:fruits-and-vegetables,en:vegetables","en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...",,en:vegetables,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,208.0,840.0,0.0,0.0,,8.0,0.0,0.23,50.000000
9,10,http://world-en.openfoodfacts.org/product/0000...,2016-10-20T07:19:01Z,2024-10-04T09:38:47Z,Madeleines nature,"en:plastic,en:cardboard,fr:boite-en-carton,fr:...","en:snacks,en:desserts,en:sweet-snacks,en:biscu...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...",other,"en:nuts,en:soybeans",3.0,"en:e331,en:e422,en:e503",d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",53.0,c,"en:fat-in-high-quantity,en:saturated-fat-in-mo...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:plain-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,442.0,1852.0,22.0,2.6,,25.0,6.4,0.53,22.666667
14,15,http://world-en.openfoodfacts.org/product/0000...,2018-04-15T16:43:14Z,2024-10-04T09:38:50Z,Madeleines ChocoLait,"en:plastic,en:cardboard","en:snacks,en:sweet-snacks,en:biscuits-and-cake...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:maybe-vegetarian",en:none,en:nuts,5.0,"en:e322,en:e322i,en:e331,en:e422,en:e500,en:e503",d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",32.0,d,"en:fat-in-high-quantity,en:saturated-fat-in-hi...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:chocolate-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,460.0,1926.0,24.0,6.0,,31.0,6.4,0.48,16.250000
18,20,http://world-en.openfoodfacts.org/product/0000...,2018-09-14T13:14:06Z,2024-10-04T09:38:53Z,Madeleines Choco Noir,fr:1-boite-en-carton-a-recycler-50-sachets-ind...,"en:snacks,en:sweet-snacks,en:biscuits-and-cake...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...",other,en:nuts,5.0,"en:e322,en:e331,en:e422,en:e500,en:e503",d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-checked,en:complete,en:nutrition-fact...",39.0,d,"en:fat-in-high-quantity,en:saturated-fat-in-hi...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:chocolate-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,467.0,1953.0,25.0,6.1,,29.0,6.3,0.45,16.250000
20,22,http://world-en.openfoodfacts.org/product/0000...,2021-02-28T15:18:57Z,2024-10-04T09:38:54Z,Farandole de madeleine,"fr:boite-en-carton,fr:film-en-plastique","en:snacks,en:sweet-snacks,en:biscuits-and-cake...","fr:madeleines-choconoir,fr:madeleines-nappees-...","en:palm-oil-content-unknown,en:non-vegan,en:ve...",en:none,en:nuts,5.0,"en:e322,en:e322i,en:e331,en:e422,en:e500,en:e503",unknown,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",41.0,d,,"top-75-percent-scans-2020,top-80-percent-scans...",en:long-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,1.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3498741,9999864004549,http://world-en.openfoodfacts.org/product/9999...,2019-08-01T11:12:51Z,2019-08-01T11:25:22Z,Boudin à l’ancienne,,"en:meats-and-their-products,en:meats,en:prepar...",fr:biocoop-bordeaux-lac-distribue-par-les-elev...,"en:may-contain-palm-oil,en:non-vegan,en:non-ve...",en:none,,0.0,,unknown,"en:fish-meat-eggs,en:offals","en:to-be-completed,en:nutrition-facts-complete...",86.0,a,,"bottom-25-percent-scans-2019,bottom-20-percent...",en:sauteed-black-pudding,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,,,5.357143
3498753,9999900401301,http://world-en.openfoodfacts.org/product/9999...,2018-01-05T16:53:07Z,2018-01-05T16:53:13Z,Kabanos,,,"fr:kabanoserc-sktadn-wiep,fr:zekrobia,fr:guszc...","en:palm-oil-content-unknown,en:vegan-status-un...",en:none,,0.0,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,549.0,2297.0,48.0,18.0,,1.9,24.0,3.10,0.000000
3498810,9999991042704,http://world-en.openfoodfacts.org/product/9999...,2018-05-09T10:46:24Z,2024-06-16T21:10:13Z,Yaourt vanille,,"en:dairies,en:fermented-foods,en:fermented-mil...","en:whole-milk,en:dairy,en:milk,en:cream,en:sug...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...",en:milk,"en:eggs,en:gluten,en:nuts",0.0,,c,"en:milk-and-dairy-products,en:milk-and-yogurt","en:to-be-completed,en:nutrition-facts-complete...",67.0,b,"en:fat-in-moderate-quantity,en:saturated-fat-i...","bottom-25-percent-scans-2019,bottom-20-percent...",en:vanilla-yogurt,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,128.0,536.0,7.6,5.1,,11.0,2.7,0.09,0.000000
3498834,9999999004360,http://world-en.openfoodfacts.org/product/9999...,2019-01-21T15:57:57Z,2023-04-28T16:03:26Z,Minis beignets,,"en:snacks,en:sweet-snacks,en:sweet-fritters","fr:matiere-grasse-du-lait-babeurre,fr:farine-d...","en:palm-oil,en:non-vegan,en:vegetarian-status-...",en:none,en:sesame-seeds,5.0,"en:e322,en:e322i,en:e412,en:e450,en:e471,en:e5...",d,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-high-quantity,en:saturated-fat-in-hi...",,en:sweet-fritters,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,333.0,1393.0,20.2,6.0,,10.4,7.4,1.10,0.000000


In [14]:
# Séparer les allergènes par ",", et convertir en une liste
allergens_series = data['allergens'].str.split(',').explode().str.strip()
# Compter les occurrences de chaque allergène
allergens_counts = allergens_series.value_counts().reset_index()
allergens_counts.columns = ['allergen', 'count']

# Créer le graphique
fig = px.bar(allergens_counts, x='allergen', y='count', 
             title="Répartition des allergènes dans le dataset",
             labels={'allergen': 'Allergènes', 'count': 'Nombre d\'occurrences'},
             height=500)

fig.show()

Maintenant nous allons nous focaliser sur les traces d'allergènes.

In [15]:
data.loc[:, 'traces_tags'] = data['traces_tags'].astype(str).str.lower()

# Divise chaque entrée par ',' et compile tous les traces d'allergènes en une seule liste
traces_allergens_list = data["traces_tags"].fillna('').str.split(',').sum()

# Compte les occurrences de chaque traces d'allergène
traces_allergen_counts = Counter(traces_allergens_list)

# Convertir le résultat en DataFrame
traces_allergens_df = pd.DataFrame(traces_allergen_counts.items(), columns=['allergen', 'count']).sort_values(by='count', ascending=False)
display(allergens_df)

Unnamed: 0,allergen,count
0,en:none,158511
3,en:milk,56302
2,en:gluten,44189
1,en:eggs,20899
4,en:soybeans,15370
...,...,...
725,fr:disufite de sodium,1
726,fr:lait entier crème,1
727,fr:cacahete,1
728,fr:emmental français,1


In [16]:
display(traces_allergens_df[traces_allergens_df['count'] > 20])

Unnamed: 0,allergen,count
0,,180752
1,en:nuts,41442
2,en:soybeans,28009
11,en:milk,25267
4,en:gluten,21943
6,en:eggs,21711
5,en:sesame-seeds,17072
20,en:mustard,13961
7,en:celery,13398
17,en:peanuts,10851


In [17]:
# Séparer les allergènes par ",", et convertir en une liste
data.loc[:, 'traces_tags'] = data['traces_tags'].astype(str)

traces_allergens_to_keep = traces_allergens_df[traces_allergens_df['count'] > 20]['allergen'].tolist()

data['traces_tags'] = data['traces_tags'].apply(lambda x: 'en:none' if (x == 'nan')  else x)
data['traces_tags'] = data['traces_tags'].apply(lambda x: 'other' if x not in traces_allergens_to_keep  else x)

traces_allergens_series = data['traces_tags'].str.split(',').explode().str.strip()
traces_allergens_counts = traces_allergens_series.value_counts().reset_index()
traces_allergens_counts.columns = ['traces_tags', 'count']

# Créer le graphique
fig = px.bar(traces_allergens_counts, x='traces_tags', y='count', 
             title="Répartition des traces d'allergènes dans le dataset",
             labels={'traces_tags': 'Traces allergènes', 'count': 'Nombre d\'occurrences'},
             height=500)

fig.show()

### Sélections et nettoyage des données

In [18]:
data.describe(include='all')

Unnamed: 0,code,url,created_datetime,last_modified_datetime,product_name,packaging_tags,categories_tags,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_n,additives_tags,nutriscore_grade,food_groups_tags,states_tags,ecoscore_score,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kcal_100g,energy_100g,fat_100g,saturated-fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g
count,258361.0,258361,258361,258361,255537,130790,222035,258361,258359,258361,258361,258361.0,131893,258345,200085,258361,159886.0,258353,193750,185127,222035,239625,239625,224313,224313,183187,183187,213971.0,231800.0,231549.0,229837.0,637.0,230437.0,231393.0,228250.0,258359.0
unique,257585.0,257591,253809,226334,197147,18477,49675,196813,37,19,33,,33170,7,45,2159,,9,225,86223,17077,238878,238878,223596,223596,182612,182612,,,,,,,,,
top,3250390663201.0,http://world-en.openfoodfacts.org/product/3254...,2021-04-20T10:35:38Z,2023-10-26T17:32:26Z,Comté,"en:plastic,en:bag",en:beverages,"en:superior-quality-durum-wheat-semolina,en:ce...","en:palm-oil-free,en:vegan,en:vegetarian",en:none,en:none,,"en:e322,en:e322i",unknown,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...","bottom-25-percent-scans-2019,bottom-20-percent...",en:groceries,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,
freq,2.0,2,13,23,204,7450,1681,784,45406,158457,181708,,6665,56101,17355,20867,,94324,22137,6150,8684,2,2,2,2,2,2,,,,,,,,,
mean,,,,,,,,,,,,1.529871,,,,,55.388873,,,,,,,,,,,66364130000.0,256686800000.0,13.531768,5.250109,0.128933,13.672234,8.337464,1.148384,19.084162
std,,,,,,,,,,,,2.332687,,,,,28.206574,,,,,,,,,,,30698060000000.0,123583400000000.0,17.007197,7.945012,1.44677,22.894757,9.162811,7.355613,48.318683
min,,,,,,,,,,,,0.0,,,,,-30.0,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,-0.1,0.0,0.0,-65.887256
25%,,,,,,,,,,,,0.0,,,,,35.0,,,,,,,,,,,111.0,461.0,1.1,0.26,0.0,0.8,1.7,0.05,0.0
50%,,,,,,,,,,,,1.0,,,,,56.0,,,,,,,,,,,252.0,1040.0,7.2,2.0,0.0,3.6,6.0,0.5,0.044759
75%,,,,,,,,,,,,2.0,,,,,76.0,,,,,,,,,,,393.0,1636.0,21.0,7.2,0.0167,19.0,12.0,1.2,22.52524


In [19]:
data[data['additives_n'] == 1.0].head()

Unnamed: 0,code,url,created_datetime,last_modified_datetime,product_name,packaging_tags,categories_tags,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_n,additives_tags,nutriscore_grade,food_groups_tags,states_tags,ecoscore_score,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kcal_100g,energy_100g,fat_100g,saturated-fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g
49,51,http://world-en.openfoodfacts.org/product/0000...,2016-12-01T19:59:24Z,2024-10-11T08:26:38Z,Fondants Citron,"fr:boite-carton,fr:sachet-plastique","en:plant-based-foods-and-beverages,en:plant-ba...","en:blackberry,en:fruit,en:berries,en:cane-suga...","en:palm-oil-free,en:vegan-status-unknown,en:ve...",other,other,1.0,en:e440,c,"en:sugary-snacks,en:sweets","en:to-be-completed,en:nutrition-facts-complete...",64.0,b,"en:fat-in-low-quantity,en:saturated-fat-in-low...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:jams,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,249.0,1056.0,0.3,0.05,,59.0,0.5,0.05,50.0
120,131,http://world-en.openfoodfacts.org/product/0000...,2020-06-30T12:29:48Z,2024-10-29T14:06:01Z,Confiture de fraise mara des bois,,"en:plant-based-foods-and-beverages,en:plant-ba...","en:e968,en:monk-fruit-extract,en:sweetener","en:palm-oil-free,en:vegan-status-unknown,en:ve...",en:none,en:none,1.0,en:e968,c,"en:sugary-snacks,en:sweets","en:to-be-completed,en:nutrition-facts-complete...",60.0,b,"en:fat-in-low-quantity,en:saturated-fat-in-low...",,en:strawberry-jams,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,0.0,0.0,0.0,0.0,,725.0,0.0,0.0,0.0
170,184,http://world-en.openfoodfacts.org/product/0000...,2023-02-25T11:51:38Z,2024-10-04T09:45:42Z,Whey Protein Vanilla,,en:dietary-supplements,"fr:ingredients,fr:tunique,fr:extrait-de-fruits...","en:palm-oil-content-unknown,en:vegan-status-un...",other,en:none,1.0,en:e464,b,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-moderate-quantity,en:saturated-fat-i...",,en:dietary-supplements,,,,,,,377.0,1580.0,4.33,1.33,,3.33,80.0,0.233,0.0
260,289,http://world-en.openfoodfacts.org/product/0000...,2019-09-14T10:42:53Z,2024-10-04T09:40:01Z,Passiflore BIO,,en:dietary-supplements,fr:complement-alimentair-ingredients-pour-3-co...,"en:palm-oil-content-unknown,en:vegan-status-un...",en:none,en:none,1.0,"en:e553,en:e553b",unknown,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,,"bottom-25-percent-scans-2019,bottom-20-percent...",en:dietary-supplements,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,,,0.0
402,476,http://world-en.openfoodfacts.org/product/0000...,2019-08-19T21:04:21Z,2024-10-11T14:12:41Z,Encore +,flacon,fr:lubrifiant-feminin,"fr:cyclopentasiloxane,en:e900,fr:paraben-free,...","en:palm-oil-content-unknown,en:vegan-status-un...",en:none,en:none,1.0,en:e900,,,"en:to-be-completed,en:nutrition-facts-to-be-co...",,,,,fr:lubrifiant-feminin,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,,,0.0


- url --> ok
- created_datetime --> transform to 1 column for year
- last_modified_datetime --> transform to 1 column for year
- product_name --> delete empty rows
- packaging_tags --> set 'en:unknown' for empty rows
- categories_tags --> set 'en:unknown' for empty rows
- ingredients_tags --> delete empty rows
- ingredients_analysis_tags --> set 'unknown' for empty rows
- allergens --> set 'en:none' for empty rows
- traces_tags --> set 'en:none' for empty rows
- additives_n --> ok
- additives_tags --> set en:none when empty
- nutriscore_grade --> set unknown when empty
- food_groups_tags --> set en:none when empty
- states_tags --> ok
- ecoscore_grade --> set unknown when empty
- nutrient_levels_tags --> set en:unknown when empty
- popularity_tags --> set unknown when empty
- main_category --> set en:none when empty
- image_url --> set 'en:none' for empty rows
- energy-kcal_100g --> set -1 for empty rows
- energy_100g --> set -1 for empty rows
- fat_100g --> set -1 for empty rows
- cholesterol_100g --> set -1 for empty rows
- sugars_100g --> set -1 for empty rows
- proteins_100g --> set -1 for empty rows
- salt_100g --> set -1 for empty rows


In [20]:
columns_to_keep = [
    'code',
    'url',
    'last_modified_datetime',
    'product_name',
    'packaging_tags',
    'categories_tags',
    'ingredients_tags',
    'ingredients_analysis_tags',
    'allergens',
    'traces_tags',
    'additives_tags',
    'nutriscore_grade',
    'food_groups_tags',
    'states_tags',
    'ecoscore_grade',
    'nutrient_levels_tags',
    'popularity_tags',
    'main_category',
    'image_url',
    'energy_100g',
    'fat_100g',
    'saturated-fat_100g',
    'cholesterol_100g',
    'sugars_100g',
    'proteins_100g',
    'salt_100g',
    'fruits-vegetables-nuts-estimate-from-ingredients_100g'
]

clean_data = data[columns_to_keep]
display(clean_data)

Unnamed: 0,code,url,last_modified_datetime,product_name,packaging_tags,categories_tags,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_tags,nutriscore_grade,food_groups_tags,states_tags,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,energy_100g,fat_100g,saturated-fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g
4,5,http://world-en.openfoodfacts.org/product/0000...,2024-10-20T14:02:34Z,Bio inulin,en:glass,"en:plant-based-foods-and-beverages,en:plant-ba...","en:apricot-juice-and-puree,en:fruit,en:prunus-...","en:palm-oil-free,en:vegan,en:vegetarian",en:none,en:none,,a,"en:fruits-and-vegetables,en:vegetables","en:to-be-completed,en:nutrition-facts-complete...",unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...",,en:vegetables,https://images.openfoodfacts.org/images/produc...,840.0,0.0,0.0,,8.0,0.0,0.23,50.000000
9,10,http://world-en.openfoodfacts.org/product/0000...,2024-10-04T09:38:47Z,Madeleines nature,"en:plastic,en:cardboard,fr:boite-en-carton,fr:...","en:snacks,en:desserts,en:sweet-snacks,en:biscu...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...",other,other,"en:e331,en:e422,en:e503",d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",c,"en:fat-in-high-quantity,en:saturated-fat-in-mo...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:plain-madeleines,https://images.openfoodfacts.org/images/produc...,1852.0,22.0,2.6,,25.0,6.4,0.53,22.666667
14,15,http://world-en.openfoodfacts.org/product/0000...,2024-10-04T09:38:50Z,Madeleines ChocoLait,"en:plastic,en:cardboard","en:snacks,en:sweet-snacks,en:biscuits-and-cake...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:maybe-vegetarian",en:none,en:nuts,"en:e322,en:e322i,en:e331,en:e422,en:e500,en:e503",d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",d,"en:fat-in-high-quantity,en:saturated-fat-in-hi...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:chocolate-madeleines,https://images.openfoodfacts.org/images/produc...,1926.0,24.0,6.0,,31.0,6.4,0.48,16.250000
18,20,http://world-en.openfoodfacts.org/product/0000...,2024-10-04T09:38:53Z,Madeleines Choco Noir,fr:1-boite-en-carton-a-recycler-50-sachets-ind...,"en:snacks,en:sweet-snacks,en:biscuits-and-cake...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...",other,en:nuts,"en:e322,en:e331,en:e422,en:e500,en:e503",d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-checked,en:complete,en:nutrition-fact...",d,"en:fat-in-high-quantity,en:saturated-fat-in-hi...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:chocolate-madeleines,https://images.openfoodfacts.org/images/produc...,1953.0,25.0,6.1,,29.0,6.3,0.45,16.250000
20,22,http://world-en.openfoodfacts.org/product/0000...,2024-10-04T09:38:54Z,Farandole de madeleine,"fr:boite-en-carton,fr:film-en-plastique","en:snacks,en:sweet-snacks,en:biscuits-and-cake...","fr:madeleines-choconoir,fr:madeleines-nappees-...","en:palm-oil-content-unknown,en:non-vegan,en:ve...",en:none,en:nuts,"en:e322,en:e322i,en:e331,en:e422,en:e500,en:e503",unknown,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",d,,"top-75-percent-scans-2020,top-80-percent-scans...",en:long-madeleines,https://images.openfoodfacts.org/images/produc...,,,,,,,,1.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3498741,9999864004549,http://world-en.openfoodfacts.org/product/9999...,2019-08-01T11:25:22Z,Boudin à l’ancienne,,"en:meats-and-their-products,en:meats,en:prepar...",fr:biocoop-bordeaux-lac-distribue-par-les-elev...,"en:may-contain-palm-oil,en:non-vegan,en:non-ve...",en:none,en:none,,unknown,"en:fish-meat-eggs,en:offals","en:to-be-completed,en:nutrition-facts-complete...",a,,"bottom-25-percent-scans-2019,bottom-20-percent...",en:sauteed-black-pudding,,,,,,,,,5.357143
3498753,9999900401301,http://world-en.openfoodfacts.org/product/9999...,2018-01-05T16:53:13Z,Kabanos,,,"fr:kabanoserc-sktadn-wiep,fr:zekrobia,fr:guszc...","en:palm-oil-content-unknown,en:vegan-status-un...",en:none,en:none,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",unknown,,,,https://images.openfoodfacts.org/images/produc...,2297.0,48.0,18.0,,1.9,24.0,3.10,0.000000
3498810,9999991042704,http://world-en.openfoodfacts.org/product/9999...,2024-06-16T21:10:13Z,Yaourt vanille,,"en:dairies,en:fermented-foods,en:fermented-mil...","en:whole-milk,en:dairy,en:milk,en:cream,en:sug...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...",en:milk,other,,c,"en:milk-and-dairy-products,en:milk-and-yogurt","en:to-be-completed,en:nutrition-facts-complete...",b,"en:fat-in-moderate-quantity,en:saturated-fat-i...","bottom-25-percent-scans-2019,bottom-20-percent...",en:vanilla-yogurt,https://images.openfoodfacts.org/images/produc...,536.0,7.6,5.1,,11.0,2.7,0.09,0.000000
3498834,9999999004360,http://world-en.openfoodfacts.org/product/9999...,2023-04-28T16:03:26Z,Minis beignets,,"en:snacks,en:sweet-snacks,en:sweet-fritters","fr:matiere-grasse-du-lait-babeurre,fr:farine-d...","en:palm-oil,en:non-vegan,en:vegetarian-status-...",en:none,en:sesame-seeds,"en:e322,en:e322i,en:e412,en:e450,en:e471,en:e5...",d,,"en:to-be-completed,en:nutrition-facts-complete...",unknown,"en:fat-in-high-quantity,en:saturated-fat-in-hi...",,en:sweet-fritters,https://images.openfoodfacts.org/images/produc...,1393.0,20.2,6.0,,10.4,7.4,1.10,0.000000


In [None]:
import pandas as pd


def clean_dataset(df):
    # Transformation des dates en année
    df['last_modified_year'] = pd.to_datetime(df['last_modified_datetime']).dt.year
    
    # Supprimer les lignes vides pour certaines colonnes
    df = df.dropna(subset=['product_name', 'ingredients_tags'])
    
    # Remplacer les valeurs vides dans d'autres colonnes
    fill_values = {
        'packaging_tags': 'en:unknown',
        'categories_tags': 'en:unknown',
        'ingredients_analysis_tags': 'unknown',
        'allergens': 'en:none',
        'traces_tags': 'en:none',
        'additives_tags': 'en:none',
        'nutriscore_grade': 'unknown',
        'food_groups_tags': 'en:none',
        'states_tags': 'en:unknown',
        'ecoscore_grade': 'unknown',
        'nutrient_levels_tags': 'en:unknown',
        'popularity_tags': 'unknown',
        'main_category': 'en:none',
        'image_url': 'en:none',
        'energy_100g': -1,
        'fat_100g': -1,
        'saturated-fat_100g': -1,
        'cholesterol_100g': -1,
        'sugars_100g': -1,
        'proteins_100g': -1,
        'salt_100g': -1,
        'fruits-vegetables-nuts-estimate-from-ingredients_100g': -1
    }

    df.fillna(value=fill_values, inplace=True)
    
    # Supprimer les colonnes non nécessaires
    df = df.drop(columns=['last_modified_datetime'])
    
    return df

clean_data = clean_dataset(clean_data)
clean_data.head()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,code,url,product_name,packaging_tags,categories_tags,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_tags,nutriscore_grade,food_groups_tags,states_tags,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,energy_100g,fat_100g,saturated-fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,last_modified_year
4,5,http://world-en.openfoodfacts.org/product/0000...,Bio inulin,en:glass,"en:plant-based-foods-and-beverages,en:plant-ba...","en:apricot-juice-and-puree,en:fruit,en:prunus-...","en:palm-oil-free,en:vegan,en:vegetarian",en:none,en:none,en:none,a,"en:fruits-and-vegetables,en:vegetables","en:to-be-completed,en:nutrition-facts-complete...",unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...",unknown,en:vegetables,https://images.openfoodfacts.org/images/produc...,840.0,0.0,0.0,-1.0,8.0,0.0,0.23,50.0,2024
9,10,http://world-en.openfoodfacts.org/product/0000...,Madeleines nature,"en:plastic,en:cardboard,fr:boite-en-carton,fr:...","en:snacks,en:desserts,en:sweet-snacks,en:biscu...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...",other,other,"en:e331,en:e422,en:e503",d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",c,"en:fat-in-high-quantity,en:saturated-fat-in-mo...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:plain-madeleines,https://images.openfoodfacts.org/images/produc...,1852.0,22.0,2.6,-1.0,25.0,6.4,0.53,22.666667,2024
14,15,http://world-en.openfoodfacts.org/product/0000...,Madeleines ChocoLait,"en:plastic,en:cardboard","en:snacks,en:sweet-snacks,en:biscuits-and-cake...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:maybe-vegetarian",en:none,en:nuts,"en:e322,en:e322i,en:e331,en:e422,en:e500,en:e503",d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",d,"en:fat-in-high-quantity,en:saturated-fat-in-hi...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:chocolate-madeleines,https://images.openfoodfacts.org/images/produc...,1926.0,24.0,6.0,-1.0,31.0,6.4,0.48,16.25,2024
18,20,http://world-en.openfoodfacts.org/product/0000...,Madeleines Choco Noir,fr:1-boite-en-carton-a-recycler-50-sachets-ind...,"en:snacks,en:sweet-snacks,en:biscuits-and-cake...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...",other,en:nuts,"en:e322,en:e331,en:e422,en:e500,en:e503",d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-checked,en:complete,en:nutrition-fact...",d,"en:fat-in-high-quantity,en:saturated-fat-in-hi...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:chocolate-madeleines,https://images.openfoodfacts.org/images/produc...,1953.0,25.0,6.1,-1.0,29.0,6.3,0.45,16.25,2024
20,22,http://world-en.openfoodfacts.org/product/0000...,Farandole de madeleine,"fr:boite-en-carton,fr:film-en-plastique","en:snacks,en:sweet-snacks,en:biscuits-and-cake...","fr:madeleines-choconoir,fr:madeleines-nappees-...","en:palm-oil-content-unknown,en:non-vegan,en:ve...",en:none,en:nuts,"en:e322,en:e322i,en:e331,en:e422,en:e500,en:e503",unknown,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",d,en:unknown,"top-75-percent-scans-2020,top-80-percent-scans...",en:long-madeleines,https://images.openfoodfacts.org/images/produc...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.75,2024


In [22]:
dftest= clean_data['main_category'].value_counts().reset_index()
dftest[dftest['count'] > 100]

Unnamed: 0,main_category,count
0,en:none,34985
1,en:groceries,8649
2,en:sweetened-beverages,4210
3,en:beverages,1725
4,en:candies,1474
...,...,...
427,en:smoked-sausages,102
428,en:canned-flageolet-beans,102
429,en:sardines-in-tomato-sauce,101
430,en:blackberry-jams,101


In [23]:
clean_data.describe(include='all')

Unnamed: 0,code,url,product_name,packaging_tags,categories_tags,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_tags,nutriscore_grade,food_groups_tags,states_tags,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,energy_100g,fat_100g,saturated-fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,last_modified_year
count,255537.0,255537,255537,255537,255537,255537,255537,255537,255537,255537,255537,255537,255537,255537,255537,255537,255537,255537,255537.0,255537.0,255537.0,255537.0,255537.0,255537.0,255537.0,255537.0,255537.0
unique,254765.0,254771,197147,18379,49352,194509,38,19,33,32825,7,46,1653,9,224,85393,16911,237020,,,,,,,,,
top,3254560000000.0,http://world-en.openfoodfacts.org/product/3433...,Comté,en:unknown,en:unknown,"en:superior-quality-durum-wheat-semolina,en:ce...","en:palm-oil-free,en:vegan,en:vegetarian",en:none,en:none,en:none,unknown,en:none,"en:to-be-completed,en:nutrition-facts-complete...",unknown,en:unknown,unknown,en:none,en:none,,,,,,,,,
freq,2.0,2,204,125404,34985,784,45120,156416,179620,124985,54476,56622,20867,92392,62937,72025,34985,17773,,,,,,,,,
mean,,,,,,,,,,,,,,,,,,,232843000000.0,12.047864,4.571277,-0.997241,12.075284,7.374202,0.896229,19.172789,2022.430744
std,,,,,,,,,,,,,,,,,,,117703700000000.0,16.715739,7.752517,0.091233,19.215145,9.121442,6.788525,48.506079,1.454487
min,,,,,,,,,,,,,,,,,,,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-65.887256,2013.0
25%,,,,,,,,,,,,,,,,,,,293.0,0.5,0.1,-1.0,0.5,0.6,0.01,0.0,2022.0
50%,,,,,,,,,,,,,,,,,,,908.0,5.1,1.3,-1.0,2.7,5.2,0.275,0.056676,2023.0
75%,,,,,,,,,,,,,,,,,,,1570.0,19.0,6.0,-1.0,15.0,10.77,1.1,22.833333,2023.0


In [24]:
clean_data.to_csv("clean_dataset.csv")

### Text processing

In [25]:
clean_data = pd.read_csv("clean_dataset.csv")


Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.



In [46]:
nlp = en_core_web_sm.load()

text_processing = TextProcessing()

def clean_text_column(text):
    # Suppression des préfixes de langue
    text = re.sub(r'\b\w{2}:\b', ' ', text)  # \b délimite le mot et \w{2} correspond à deux lettres
    # Remplacement des tirets par des espaces
    text = text.replace('-', ' ')
    # Suppression des espaces multiples
    text = re.sub(r'\s+', ' ', text).strip()
    return text

fill_values = {
        'packaging_tags': 'en:unknown',
        'categories_tags': 'en:unknown',
        'ingredients_analysis_tags': 'unknown',
        'allergens': 'en:none',
        'traces_tags': 'en:none',
        'additives_tags': 'en:none',
        'nutriscore_grade': 'unknown',
        'food_groups_tags': 'en:none',
        'states_tags': 'en:unknown',
        'ecoscore_grade': 'unknown',
        'nutrient_levels_tags': 'en:unknown',
        'popularity_tags': 'unknown',
        'main_category': 'en:none',
        'image_url': 'en:none',
        'energy_100g': -1,
        'fat_100g': -1,
        'saturated-fat_100g': -1,
        'cholesterol_100g': -1,
        'sugars_100g': -1,
        'proteins_100g': -1,
        'salt_100g': -1,
        'fruits-vegetables-nuts-estimate-from-ingredients_100g': -1
    }

# Liste des colonnes à traiter
columns_cat = ['product_name', 'packaging_tags', 'categories_tags', 'ingredients_tags', 'ingredients_analysis_tags', 
           'nutrient_levels_tags', 'nutriscore_grade', 'main_category', 'ecoscore_grade', 'popularity_tags']

# Liste pour stocker les noms des nouvelles colonnes créées
new_columns_cat_names = []

# Boucle pour appliquer les transformations et stocker les noms des nouvelles colonnes
for col in columns_cat:
    # Nom des nouvelles colonnes pour chaque transformation
    preprocessed_col = f'preprocessed_{col}'
    # lemmatized_col = f'preprocessed_{col}_lemmatized'
    len_col = f'len_{preprocessed_col}'
    
    # Appliquer le nettoyage
    clean_data[preprocessed_col] = clean_data[col].fillna('').apply(clean_text_column)
    
    # Appliquer la standardisation
    clean_data[preprocessed_col] = clean_data[preprocessed_col].apply(text_processing.standardize)

    # Calculer la longueur des mots dans la colonne lemmatisée
    clean_data[len_col] = clean_data[preprocessed_col].apply(lambda x: len(x))
    
    # Appliquer la lemmatisation
    # clean_data[lemmatized_col] = clean_data[preprocessed_col].apply(lambda x: text_processing.lemmatize(x, nlp))
    
    # Calculer la longueur des mots dans la colonne lemmatisée
    # clean_data[len_col] = clean_data[lemmatized_col].apply(lambda x: len(x))
    
    # Ajouter les nouveaux noms de colonnes dans la liste
    # new_columns_cat_names.extend([preprocessed_col, lemmatized_col, len_col])
    new_columns_cat_names.extend([preprocessed_col, len_col])

# Affichage des noms des nouvelles colonnes créées
print("Noms des nouvelles colonnes créées :")
print(new_columns_cat_names)
# Affichage des premières lignes des nouvelles colonnes
clean_data[new_columns_cat_names].head()

Noms des nouvelles colonnes créées :
['preprocessed_product_name', 'len_preprocessed_product_name', 'preprocessed_packaging_tags', 'len_preprocessed_packaging_tags', 'preprocessed_categories_tags', 'len_preprocessed_categories_tags', 'preprocessed_ingredients_tags', 'len_preprocessed_ingredients_tags', 'preprocessed_ingredients_analysis_tags', 'len_preprocessed_ingredients_analysis_tags', 'preprocessed_nutrient_levels_tags', 'len_preprocessed_nutrient_levels_tags', 'preprocessed_nutriscore_grade', 'len_preprocessed_nutriscore_grade', 'preprocessed_main_category', 'len_preprocessed_main_category', 'preprocessed_ecoscore_grade', 'len_preprocessed_ecoscore_grade', 'preprocessed_popularity_tags', 'len_preprocessed_popularity_tags']


Unnamed: 0,preprocessed_product_name,len_preprocessed_product_name,preprocessed_packaging_tags,len_preprocessed_packaging_tags,preprocessed_categories_tags,len_preprocessed_categories_tags,preprocessed_ingredients_tags,len_preprocessed_ingredients_tags,preprocessed_ingredients_analysis_tags,len_preprocessed_ingredients_analysis_tags,preprocessed_nutrient_levels_tags,len_preprocessed_nutrient_levels_tags,preprocessed_nutriscore_grade,len_preprocessed_nutriscore_grade,preprocessed_main_category,len_preprocessed_main_category,preprocessed_ecoscore_grade,len_preprocessed_ecoscore_grade,preprocessed_popularity_tags,len_preprocessed_popularity_tags
0,bio inulin,10,glass,5,plant based foods and beverages plant based fo...,117,apricot juice and puree fruit prunus species f...,123,palm oil free vegan vegetarian,30,fat in low quantity saturated fat in low quant...,98,a,1,vegetables,10,unknown,7,unknown,7
1,madeleines nature,17,plastic cardboard boite en carton film en plas...,51,snacks desserts sweet snacks biscuits and cake...,81,wheat flour cereal flour wheat cereal flour su...,435,palm oil free non vegan vegetarian status unknown,49,fat in high quantity saturated fat in moderate...,105,d,1,plain madeleines,16,c,1,top 50000 scans 2019top 100000 scans 2019at le...,1572
2,madeleines chocolait,20,plastic cardboard,17,snacks sweet snacks biscuits and cakes cakes c...,92,wheat flour cereal flour wheat cereal flour mi...,564,palm oil free non vegan maybe vegetarian,40,fat in high quantity saturated fat in high qua...,101,d,1,chocolate madeleines,20,d,1,top 50000 scans 2019top 100000 scans 2019at le...,1641
3,madeleines choco noir,21,1 boite en carton a recycler 50 sachets indivi...,62,snacks sweet snacks biscuits and cakes cakes c...,92,wheat flour cereal flour wheat cereal flour da...,557,palm oil free non vegan vegetarian status unknown,49,fat in high quantity saturated fat in high qua...,101,d,1,chocolate madeleines,20,d,1,top 50000 scans 2019top 100000 scans 2019at le...,1571
4,farandole de madeleine,22,boite en carton film en plastique,33,snacks sweet snacks biscuits and cakes cakes c...,108,madeleines choconoir madeleines nappees de cho...,951,palm oil content unknown non vegan vegetarian ...,60,unknown,7,unknown,7,long madeleines,15,d,1,top 75 percent scans 2020top 80 percent scans ...,603


## Training

Essai avec un entrainement d'un modèle simple Kmeans.

Rappel :

**1. CountVectorizer**

- **Principe :** Compte le nombre d’occurrences de chaque mot dans un texte ou une liste de tags.
- **Utilisation :** Représente chaque mot par une colonne dans une matrice, avec une valeur représentant sa fréquence.
- **Avantage :** Simple et rapide à calculer, utile pour des listes ou textes courts où la présence des mots est suffisante.
- **Limite :** Ne capture pas les relations sémantiques (ex. "sucre" ≠ "glucose").

**2. TF-IDF Vectorizer**

- **Principe :** Calcule la fréquence des mots ajustée par leur rareté dans l’ensemble des documents.
- **Utilisation :** Les mots fréquents dans peu de documents reçoivent une valeur plus élevée, permettant de distinguer des termes spécifiques.
- **Avantage :** Diminue l’importance des mots très fréquents (ex. "de", "le") et met en avant les mots distinctifs.
- **Limite :** Ne capture pas la similarité sémantique entre les mots.

**3. Word Embeddings (Word2Vec, GloVe)**

- **Principe :** Apprend à représenter chaque mot dans un espace vectoriel de manière à ce que des mots contextuellement proches soient également proches dans l’espace vectoriel.
- **Utilisation :** Représente chaque mot par un vecteur de plusieurs dimensions, capturant des relations sémantiques (ex. "roi" proche de "reine").
- **Avantage :** Capture les similarités et relations entre mots, utile pour les tâches nécessitant une compréhension sémantique.
- **Limite :** Ne prend pas en compte le contexte spécifique d'une phrase.

**4. FastText**

- **Principe :** Apprend également des embeddings mais utilise des sous-mots (n-grammes de caractères), ce qui permet de mieux comprendre les mots avec des similitudes orthographiques.
- **Exemple :** FastText repère des similarités entre "glucose" et "sucre" en utilisant des sous-parties communes dans les mots. Si des variantes de mots sont dans le dataset (ex. "sucreries", "sucre"), FastText peut capturer ces relations plus finement.
- **Avantage :** Idéal pour des données contenant des variantes ou des erreurs typographiques, tout en offrant des similarités proches de Word2Vec et BERT pour les termes proches.
- **Limite :** FastText est limité par son incapacité à capter le contexte global, la syntaxe et la structure grammaticale, ce qui le rend moins performant pour comprendre le sens des mots dans des phrases complexes ou ambiguës

**5. BERT (Contextual Embeddings)**

- **Principe :** Génère des représentations contextuelles pour chaque mot dans une phrase, tenant compte du contexte global.
- **Utilisation :** Représente chaque mot différemment selon le contexte de la phrase (ex. "batterie" dans "batterie de cuisine" vs. "batterie de voiture").
- **Avantage :** Idéal pour les phrases et textes complexes nécessitant une compréhension fine du contexte.
- **Limite :** Modèle très coûteux en calcul, moins utile pour des listes simples de mots isolés.

--- 

**En résumé :**

CountVectorizer et TF-IDF sont légers et efficaces pour des listes de tags ou des mots courts.

Word Embeddings ajoutent la dimension sémantique.

BERT excelle dans des phrases contextuelles, mais est plus lourd et parfois excessif pour des mots isolés.

In [None]:
# Colonnes numériques et catégorielles
numeric_columns = [
    'energy_100g', 'fat_100g', 'saturated-fat_100g', 'cholesterol_100g', 
    'sugars_100g', 'proteins_100g', 'salt_100g', 'fruits-vegetables-nuts-estimate-from-ingredients_100g'
]
categorial_columns = [
    'preprocessed_product_name', 'preprocessed_packaging_tags', 'preprocessed_categories_tags', 
    'preprocessed_ingredients_tags', 'preprocessed_ingredients_analysis_tags', 
    'preprocessed_nutriscore_grade', 'preprocessed_main_category'
]

# Fonction d'imputation pour les valeurs numériques
def impute_numeric_data(df, columns):
    imputer = SimpleImputer(strategy='mean')
    df[columns] = imputer.fit_transform(df[columns])
    return df

# Fonction de normalisation des colonnes numériques
def scale_numeric_data(df, columns):
    scaler = StandardScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df

# Fonction de transformation de texte pour les colonnes catégorielles
def encode_categorical_data(df, columns, max_features=500, min_df=4, n_components=50):
    vectorizer = CountVectorizer(max_features=max_features, min_df=min_df)
    encoded_columns = []
    
    for col in columns:
        print('column', col)
        # Encodage du texte
        encoded = vectorizer.fit_transform(df[col].fillna(''))
        print('encoded.shape', encoded.shape[1])
        
        # Ajustement de n_components pour TruncatedSVD si nécessaire
        adjusted_n_components = min(n_components, encoded.shape[1])
        print('adjusted_n_components:', adjusted_n_components)
        # Réduction de dimensions avec TruncatedSVD plus optimisé avec les sparses matrices que PCA
        svd = TruncatedSVD(n_components=adjusted_n_components, random_state=42)
        
        # Réduction de dimensions
        reduced = svd.fit_transform(encoded)
        encoded_columns.append(pd.DataFrame(reduced))
    
    vectorizer_nb_features = vectorizer.get_feature_names_out()
    print("[INFO][CountVectorizer] : Nombre de features dans le vocabulaire : ", len(vectorizer_nb_features))
    return pd.concat(encoded_columns, axis=1)

# Fonction de clustering et calcul des scores
def cluster_data(features, cluster_range, random_state=42):
    silhouette_scores = []
    inertia = []
    
    for k in cluster_range:
        kmeans = KMeans(n_clusters=k, random_state=random_state)
        labels = kmeans.fit_predict(features)
        inertia.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(features, labels))
    
    return labels, silhouette_scores, inertia

# Fonction principale de préparation des données et de clustering
def prepare_and_cluster(df, cluster_range, random_state=42):
    # Imputation des valeurs numériques
    df = impute_numeric_data(df, numeric_columns)
    
    # Encodage des données catégorielles
    categorical_features = encode_categorical_data(df, categorial_columns)
    
    # Normalisation des données numériques
    df = scale_numeric_data(df, numeric_columns)
    
    # Assemblage des features
    features = pd.concat([categorical_features, df[numeric_columns].reset_index(drop=True)], axis=1)
    features.columns = features.columns.astype(str)
    
    # Clustering et calcul des scores
    labels, silhouette_scores, inertia = cluster_data(features, cluster_range=cluster_range, random_state=random_state)
    
    # Ajout des labels de clusters au DataFrame
    df['cluster'] = labels
    return df, silhouette_scores, inertia

def train_kmeans_with_manual_clusters(df, n_clusters, random_state=42):
    """
    Applique un KMeans avec un nombre de clusters spécifié manuellement.
    
    Parameters:
        df (DataFrame): Le DataFrame contenant les données sur lesquelles appliquer le KMeans.
        n_clusters (int): Le nombre de clusters à utiliser.
        random_state (int): L'état aléatoire pour assurer la reproductibilité.
    
    Returns:
        DataFrame: Le DataFrame avec les labels de clusters ajoutés.
    """
    # Imputation des valeurs numériques
    df = impute_numeric_data(df, numeric_columns)
    
    # Encodage des données catégorielles
    categorical_features = encode_categorical_data(df, categorial_columns)
    
    # Normalisation des données numériques
    df = scale_numeric_data(df, numeric_columns)
    
    # Assemblage des features
    features = pd.concat([categorical_features, df[numeric_columns].reset_index(drop=True)], axis=1)
    features.columns = features.columns.astype(str)
    
    # Apprentissage du modèle KMeans avec le nombre de clusters spécifié
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    df['cluster'] = kmeans.fit_predict(features)
    
    return df


def inertia_elbow_plotly(cluster_range, inertia, silhouette_scores):
    data = pd.DataFrame({
        'Number of clusters': cluster_range,
        'Inertia': inertia,
        'Silhouette Score': silhouette_scores
    })
    
    # Graphique Elbow (Inertia)
    fig_inertia = px.line(
        data,
        x='Number of clusters',
        y='Inertia',
        markers=True,
        title="Elbow Method"
    )
    fig_inertia.update_layout(
        xaxis_title="Number of clusters",
        yaxis_title="Inertia"
    )
    
    # Graphique Silhouette
    fig_silhouette = px.line(
        data,
        x='Number of clusters',
        y='Silhouette Score',
        markers=True,
        title="Silhouette Score"
    )
    fig_silhouette.update_layout(
        xaxis_title="Number of clusters",
        yaxis_title="Silhouette Score"
    )
    
    fig_inertia.show()
    fig_silhouette.show()


cluster_range=range(50, 1000, 25)

df_kmeans, silhouette_scores, inertia = prepare_and_cluster(df=clean_data.sample(2000), cluster_range=cluster_range)

# Affichage des résultats
df_kmeans.head()
print("Silhouette Scores:", silhouette_scores)
print("Inertia:", inertia)

inertia_elbow_plotly(cluster_range=cluster_range, inertia=inertia, silhouette_scores=silhouette_scores)


column preprocessed_product_name
encoded.shape 210
adjusted_n_components: 50
column preprocessed_packaging_tags
encoded.shape 74
adjusted_n_components: 50
column preprocessed_categories_tags
encoded.shape 210
adjusted_n_components: 50
column preprocessed_ingredients_tags
encoded.shape 210
adjusted_n_components: 50
column preprocessed_ingredients_analysis_tags
encoded.shape 12
adjusted_n_components: 12
column preprocessed_nutriscore_grade
encoded.shape 3
adjusted_n_components: 3
column preprocessed_main_category
encoded.shape 210
adjusted_n_components: 50
[INFO][CountVectorizer] : Nombre de features dans le vocabulaire :  210
Silhouette Scores: [0.1005999983899091, 0.10768130300298916, 0.10812062844501151, 0.09925294562462046, 0.10564067098305037, 0.10282925076192623, 0.10037636686015056, 0.10149909564400263, 0.10002394028950201, 0.10038431164257546, 0.10279484775593742, 0.10494296374715295, 0.10605293312265436, 0.10945479785378254, 0.10927545292749946, 0.10933242903762484, 0.1108142818

In [32]:
 # Appliquer KMeans avec le meilleur nombre de clusters
df_result= train_kmeans_with_manual_clusters(df=clean_data, n_clusters=625)

column preprocessed_product_name
encoded.shape 210
adjusted_n_components: 50
column preprocessed_packaging_tags
encoded.shape 210
adjusted_n_components: 50
column preprocessed_categories_tags
encoded.shape 210
adjusted_n_components: 50
column preprocessed_ingredients_tags
encoded.shape 210
adjusted_n_components: 50
column preprocessed_ingredients_analysis_tags
encoded.shape 12
adjusted_n_components: 12
column preprocessed_nutriscore_grade
encoded.shape 3
adjusted_n_components: 3
column preprocessed_main_category
encoded.shape 210
adjusted_n_components: 50
[INFO][CountVectorizer] : Nombre de features dans le vocabulaire :  210


In [33]:
df_result.shape

(255537, 49)

In [34]:
sample_df = df_result.sample(1000)
sample_df.to_csv("sample.csv")

In [35]:
cluster_count_df =df_result["cluster"].value_counts().reset_index()
cluster_count_df

Unnamed: 0,cluster,count
0,44,4132
1,548,2820
2,132,1719
3,32,1684
4,310,1584
...,...,...
620,19,1
621,21,1
622,82,1
623,97,1


In [36]:
cluster_count_df[cluster_count_df["count"] > 10]

Unnamed: 0,cluster,count
0,44,4132
1,548,2820
2,132,1719
3,32,1684
4,310,1584
...,...,...
612,476,55
613,263,47
614,127,45
615,497,41


In [37]:
df_result["cluster"].value_counts()


df_result[["product_name"]][df_result["cluster"] == 100].head(20)

Unnamed: 0,product_name
4682,Pate croute richelieu canard
4709,Roti de veau à l'indienne
5722,10 Saucisses de Strasbourg
5727,Mousse de foie forestière
6985,Metka Petite
13962,Prinskorv - Saucisses de porc et de bœuf
14213,Cervelas de Strasbourg
14406,Knack filière
16261,Saucisse reunionnaise fumee
16362,Saucisse fumée réunionnaise


In [38]:
df_result[["product_name"]][df_result["cluster"] == 20].head(20)

Unnamed: 0,product_name
6803,Nespresso Origin Brazil
46590,Café du Pérou bio et équitable capsules x10
46591,Classico lungo
46595,Capsules de café expresso
61355,Forza café Espresso
70254,Espresso Forte
77350,Café expresso classico
77351,Café espresso fortissimo
77352,Café espresso decaffeinato
77355,Café espresso leggero


In [39]:
df_result[["product_name"]][df_result["cluster"] == 169].head(20)

Unnamed: 0,product_name
166,Ice cream sticks Vanilla
862,macadamia but brittle
2856,New York Super Fudge Chunk Ice Cream
2858,Ben & Jerry's Dessert Glacé MENTHE CHOCOLAT 45...
9395,Adelis glace
11333,Favorite selection mini cups
15503,Glace NOUGAT de Montélimar Chabert & Guillot
15504,Glace RÉGLISSE Intense
15505,"Glace RHUM RAISINS, raisins gloden macérés"
15507,Glace à la VERVEINE DU VELAY


In [40]:
df_result[["product_name"]][df_result["cluster"] == 165].head(20)

Unnamed: 0,product_name
10662,Sauce bourguignonne
11199,Sauce Pommes Frites
14977,Dips Sauce Creamy Curry
14996,Sauce Creamy Deluxe
15099,Sauce Tartare
15514,Sauce Curry
19506,Sauce bourguignonne
19508,Sauce Tartare
20190,Sauce Pommes Frites
20911,Sauce crudités caesar
