In [1]:
import pandas as pd
import polars as pl

import plotly.express as px

from collections import Counter


pd.set_option('display.max_columns', None)

## Introduction et contexte

Ce notebook a pour but de faire une première exploration des données mises à disposition par openfoodfacts pour répondre à notre problème qui est:

quelle alternative à un produit alimentaire pourrait nous être proposé ?

Dans ces alternatives, quels sont les produits qui ne contiennent pas de produit susceptible de provoquer une allergie alimentaire ?

Le volume de données est important, nous allons nous concentrer sur les produits où la liste des ingrédients est renseignés et ou les produits sont vendus en france.

## Chargement des Données

In [None]:
rawdata = None
def extract_raw_data():
    file_path = '../data/en.openfoodfacts.org.products.csv.gz'
    chunk_size = 10000

    filtered_chunks_list = []

    for chunk in pd.read_csv(file_path, chunksize=chunk_size, compression='gzip', sep='\t', engine='python', quoting=3):
        filtered_chunks = chunk[(chunk['countries_tags'] == 'en:france') & (chunk['ingredients_tags'].notna())]
        filtered_chunks_list.append(filtered_chunks)

    if filtered_chunks_list:
        rawdata = pd.concat(filtered_chunks_list, axis=0)
    return rawdata

# rawdata = extract_raw_data()
rawdata = pd.read_csv('filtered_dataset_openfoodfacts_raw_en.csv')
display(rawdata)

  rawdata = pd.read_csv('filtered_dataset_openfoodfacts_raw_en.csv')


Unnamed: 0.1,Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,last_modified_by,last_updated_t,last_updated_datetime,product_name,abbreviated_product_name,generic_name,quantity,packaging,packaging_tags,packaging_en,packaging_text,brands,brands_tags,categories,categories_tags,categories_en,origins,origins_tags,origins_en,manufacturing_places,manufacturing_places_tags,labels,labels_tags,labels_en,emb_codes,emb_codes_tags,first_packaging_code_geo,cities,cities_tags,purchase_places,stores,countries,countries_tags,countries_en,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,allergens_en,traces,traces_tags,traces_en,serving_size,serving_quantity,no_nutrition_data,additives_n,additives,additives_tags,additives_en,nutriscore_score,nutriscore_grade,nova_group,pnns_groups_1,pnns_groups_2,food_groups,food_groups_tags,food_groups_en,states,states_tags,states_en,brand_owner,ecoscore_score,ecoscore_grade,nutrient_levels_tags,product_quantity,owner,data_quality_errors_tags,unique_scans_n,popularity_tags,completeness,last_image_t,last_image_datetime,main_category,main_category_en,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kj_100g,energy-kcal_100g,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,butyric-acid_100g,caproic-acid_100g,caprylic-acid_100g,capric-acid_100g,lauric-acid_100g,myristic-acid_100g,palmitic-acid_100g,stearic-acid_100g,arachidic-acid_100g,behenic-acid_100g,lignoceric-acid_100g,cerotic-acid_100g,montanic-acid_100g,melissic-acid_100g,unsaturated-fat_100g,monounsaturated-fat_100g,omega-9-fat_100g,polyunsaturated-fat_100g,omega-3-fat_100g,omega-6-fat_100g,alpha-linolenic-acid_100g,eicosapentaenoic-acid_100g,docosahexaenoic-acid_100g,linoleic-acid_100g,arachidonic-acid_100g,gamma-linolenic-acid_100g,dihomo-gamma-linolenic-acid_100g,oleic-acid_100g,elaidic-acid_100g,gondoic-acid_100g,mead-acid_100g,erucic-acid_100g,nervonic-acid_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,added-sugars_100g,sucrose_100g,glucose_100g,fructose_100g,lactose_100g,maltose_100g,maltodextrins_100g,starch_100g,polyols_100g,erythritol_100g,fiber_100g,soluble-fiber_100g,insoluble-fiber_100g,proteins_100g,casein_100g,serum-proteins_100g,nucleotides_100g,salt_100g,added-salt_100g,sodium_100g,alcohol_100g,vitamin-a_100g,beta-carotene_100g,vitamin-d_100g,vitamin-e_100g,vitamin-k_100g,vitamin-c_100g,vitamin-b1_100g,vitamin-b2_100g,vitamin-pp_100g,vitamin-b6_100g,vitamin-b9_100g,folates_100g,vitamin-b12_100g,biotin_100g,pantothenic-acid_100g,silica_100g,bicarbonate_100g,potassium_100g,chloride_100g,calcium_100g,phosphorus_100g,iron_100g,magnesium_100g,zinc_100g,copper_100g,manganese_100g,fluoride_100g,selenium_100g,chromium_100g,molybdenum_100g,iodine_100g,caffeine_100g,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-dried_100g,fruits-vegetables-nuts-estimate_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,carbon-footprint-from-meat-or-fish_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g,sulphate_100g,nitrate_100g,acidity_100g
0,0,1,http://world-en.openfoodfacts.org/product/0000...,inf,1634745456,2021-10-20T15:57:36Z,1730771746,2024-11-05T01:55:46Z,smoothie-app,1.730772e+09,2024-11-05T01:55:46Z,Purée Mix Tropical Harmony + Aloe,,,1 kg,,,,,Punuts,punuts,syrups,en:syrups,Syrups,Spanien,en:spain,Spain,,,"No gluten, Vegetarian, No artificial flavors, ...","en:no-gluten,en:vegetarian,en:no-artificial-fl...","No gluten,Vegetarian,No artificial flavors,Veg...",,,,,,,,"Vereinigte Staaten von Amerika, Germany","en:germany,en:united-states","Germany,United States","fruit juice blend, guava and papaya purees, pa...","en:fruit-juice-blend,en:guava,en:fruit,en:papa...","en:may-contain-palm-oil,en:vegan,en:vegetarian",,,,,,,,,4.0,,"en:e160a,en:e160ai,en:e330,en:e415,en:e440","E160a - Carotene,E160ai - Beta-carotene,E330 -...",-5.0,a,4.0,unknown,unknown,,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,unknown,"en:fat-in-moderate-quantity,en:saturated-fat-i...",1000.0,,,,,0.9000,1.730772e+09,2024-11-05T01:55:26Z,en:syrups,Syrups,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,1530.0,360.000000,1530.0,,10.000000,1.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.000,9.000000,8.000000,,,,,,,,,,,13.000000,,,49.000000,,,,0.020000,,0.008000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,32.987467,,,,,,-5.0,,,,,,,,,,,
1,1,2,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1722606455,2024-08-02T13:47:35Z,1728914612,2024-10-14T14:03:32Z,ulate,1.728915e+09,2024-10-14T14:03:32Z,Poudre de grillon,,,100 g,,,,,naak,naak,Bebidas instantáneas,"en:beverages-and-beverages-preparations,en:bev...","Beverages and beverages preparations,Beverage ...",Canada,en:canada,Canada,,,"Organic, EU Organic, FR-BIO-01, en:nutriscore","en:organic,en:eu-organic,en:fr-bio-01,en:nutri...","Organic,EU Organic,FR-BIO-01,Nutriscore",EMB 35068G,emb-35068g,"48.116667,-1.4",,chateaubourg-ille-et-vilaine-france,,Amazon.ca,Canada,en:canada,Canada,,,,,,,,,45.0g,45.0,,,,,,,unknown,,unknown,unknown,,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,unknown,"en:fat-in-moderate-quantity,en:saturated-fat-i...",100.0,,,,,0.7500,1.727995e+09,2024-10-03T22:37:06Z,en:instant-beverages,Instant beverages,,,,,,,,400.000000,1674.0,,16.444445,4.666666,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,44.666668,4.888889,,,,,,,,,,,17.111111,,,10.000000,,,,0.013123,,0.005249,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,3,http://world-en.openfoodfacts.org/product/0000...,prepperapp,1716818343,2024-05-27T13:59:03Z,1729862624,2024-10-25T13:23:44Z,insectproductadd,1.729863e+09,2024-10-25T13:23:44Z,Feuchtes Toilettentuch - Kamille,,,70pcs,,,,,3 alfa,3-alfa,,,,,,,,,en:no-alcohol,en:no-alcohol,No alcohol,,,,,,,,GR,en:greece,Greece,Kein Rock gleicht den anderen Wir haben nachge...,en:kein-rock-gleicht-den-anderen-wir-haben-nac...,"en:palm-oil-content-unknown,en:vegan-status-un...",,,,,,120.0g,120.0,,0.0,,,,,unknown,,unknown,unknown,,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,unknown,,0.0,,,,,0.5875,1.729863e+09,2024-10-25T13:23:39Z,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,160.000000,669.0,,3.200000,0.500000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.002,19.800000,1.400000,,,,,,,,,,,8.500000,,,8.700000,,,,0.807087,,0.322835,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,,,,,,,,,,,,,,,,,
3,3,4,http://world-en.openfoodfacts.org/product/0000...,elcoco,1560176426,2019-06-10T14:20:26Z,1730316925,2024-10-30T19:35:25Z,foodvisor,1.730317e+09,2024-10-30T19:35:25Z,Marinara,,,100g,,,,,Newman's own,newman-s-own,Dark chocolate bar,"en:snacks,en:sweet-snacks,en:cocoa-and-its-pro...","Snacks,Sweet snacks,Cocoa and its products,Cho...","Peru,Netherlands","en:netherlands,en:peru","Netherlands,Peru",,,"FSC Mix,CH-BIO-006,Certified B Corporation,Soi...","en:fair-trade,en:organic,en:eu-organic,en:cert...","Fair trade,Organic,EU Organic,Certified B Corp...",,,,,,United Kingdom,Amazon,CZ,en:czech-republic,Czech Republic,"Organic cocoa mass 60%, organic cane sugar, or...","en:cocoa-paste,en:plant,en:cocoa,en:cane-sugar...","en:palm-oil-free,en:vegan,en:vegetarian",,,,,,124.0g,124.0,,0.0,,,,23.0,e,3.0,Sugary snacks,Chocolate products,en:chocolate-products,"en:sugary-snacks,en:chocolate-products","Sugary snacks,Chocolate products","en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,41.0,d,"en:fat-in-low-quantity,en:saturated-fat-in-hig...",100.0,,en:energy-value-in-kcal-does-not-match-value-i...,,,0.7750,1.730317e+09,2024-10-30T19:35:25Z,en:dark-chocolate-bar,Dark chocolate bar,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,2401.0,56.451613,2401.0,,2.016129,21.500000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.064516,35.000000,,,,,,,,,,,1.612903,,,1.612903,,,,0.000000,,0.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12.000000,,,,,,23.0,,,,,,,,,,,
4,4,5,http://world-en.openfoodfacts.org/product/0000...,touchette,1605337720,2020-11-14T07:08:40Z,1729432954,2024-10-20T14:02:34Z,roboto-app,1.729433e+09,2024-10-20T14:02:34Z,Bio inulin,,,550g,Glass,en:glass,Glass,,EWL,ewl,Gemüse,"en:plant-based-foods-and-beverages,en:plant-ba...","Plant-based foods and beverages,Plant-based fo...",,,,bénivay-ollon,benivay-ollon,en:no-lactose,en:no-lactose,No lactose,13089c,13089c,,,,France,,France,en:france,France,"Jus et purée d'abricots (50%), eau, sucre.","en:apricot-juice-and-puree,en:fruit,en:prunus-...","en:palm-oil-free,en:vegan,en:vegetarian",,,,,,,,,0.0,,,,-2.0,a,3.0,Fruits and vegetables,Vegetables,en:vegetables,"en:fruits-and-vegetables,en:vegetables","Fruits and vegetables,Vegetables","en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...",550.0,,,,,0.8875,1.729432e+09,2024-10-20T13:51:45Z,en:vegetables,Vegetables,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,840.0,208.000000,840.0,,0.000000,0.000000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.000000,8.000000,,,,,,,,,,,88.000000,,,0.000000,,,,0.230000,,0.092000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,50.000000,,,,,,-2.0,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
567862,567862,2663923,http://world-en.openfoodfacts.org/product/0266...,openfoodfacts-contributors,1575795217,2019-12-08T08:53:37Z,1727971835,2024-10-03T16:10:35Z,fix-code-bot,1.727972e+09,2024-10-03T16:10:35Z,,,,,,,,,Coop,coop,,,,,,,,,,,,,,,,,,,en:France,en:france,France,,,,,,,,,,,,,,,,,unknown,,unknown,unknown,,,,"en:to-be-completed, en:nutrition-facts-to-be-c...","en:to-be-completed,en:nutrition-facts-to-be-co...","To be completed,Nutrition facts to be complete...",,,unknown,,,,,,,0.1500,1.575795e+09,2019-12-08T08:53:38Z,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
567863,567863,266392301259,http://world-en.openfoodfacts.org/product/0266...,kiliweb,1659914688,2022-08-07T23:24:48Z,1659915340,2022-08-07T23:35:40Z,roboto-app,1.707866e+09,2024-02-13T23:17:08Z,Chocolatine,,,4 x 68 g,,,,,,,,,,,,,,,,,,,,,,,,,en:ca,en:canada,Canada,,,,,,,,,,,,,,,,,unknown,,unknown,unknown,,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,unknown,,272.0,,,,,0.3750,1.659915e+09,2022-08-07T23:24:51Z,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,340.909091,1426.0,,19.318182,11.363636,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,36.363636,11.363636,,,,,,,,,,,2.272727,,,6.818182,,,,0.795455,,0.318182,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
567864,567864,266392804996,http://world-en.openfoodfacts.org/product/0266...,kiliweb,1655594246,2022-06-18T23:17:26Z,1655594910,2022-06-18T23:28:30Z,roboto-app,1.707863e+09,2024-02-13T22:28:42Z,Chocolatine,,,,,,,,,,,,,,,,,,,,,,,,,,,,en:ca,en:canada,Canada,,,,,,,,,,,,,,,,,unknown,,unknown,unknown,,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,unknown,,,,,,,0.3750,1.655594e+09,2022-06-18T23:17:28Z,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,340.909091,1426.0,,19.318182,11.363636,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,36.363636,11.363636,,,,,,,,,,,2.272727,,,6.818182,,,,0.795455,,0.318182,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
567865,567865,266405008991,http://world-en.openfoodfacts.org/product/0266...,kiliweb,1636494345,2021-11-09T21:45:45Z,1636494346,2021-11-09T21:45:46Z,kiliweb,1.707848e+09,2024-02-13T18:13:54Z,Jinga,,,,,,,,,,,,,,,,,,,,,,,,,,,,en:us,en:united-states,United States,,,,,,,,,,,,,,,,,unknown,,unknown,unknown,,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,unknown,,,,,,,0.2750,1.636494e+09,2021-11-09T21:45:46Z,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,157.142857,657.0,,7.142857,1.428571,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15.714286,1.428571,,,,,,,,,,,2.857143,,,7.142857,,,,1.285714,,0.514286,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Inspection des Données Brutes

In [3]:
# Basic stats
print("Taille du dataset:")
print("Number of rows : {}".format(rawdata.shape[0]))
print("Number of columns : {}".format(rawdata.shape[1]))
print()
print("---------------------------")
print()

print("Basics infos:")
print()
display(rawdata.info())
print()
print("---------------------------")
print()

print("Basics statistics: ")
print()
data_desc = rawdata.describe(include='all')
display(data_desc)
print()
print("---------------------------")
print()

print("Unique elements by feature: ")
print()
display(rawdata.nunique().sort_values())
print()
print("---------------------------")
print()

print("Percentage of missing values: ")
print()
null_percentage = 100 * rawdata.isnull().mean()
null_percentage_df = null_percentage.to_frame(name='Null Percentage').T
display(null_percentage_df) 

Taille du dataset:
Number of rows : 567867
Number of columns : 207

---------------------------

Basics infos:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 567867 entries, 0 to 567866
Columns: 207 entries, Unnamed: 0 to acidity_100g
dtypes: float64(131), int64(4), object(72)
memory usage: 896.8+ MB


None


---------------------------

Basics statistics: 



Unnamed: 0.1,Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,last_modified_by,last_updated_t,last_updated_datetime,product_name,abbreviated_product_name,generic_name,quantity,packaging,packaging_tags,packaging_en,packaging_text,brands,brands_tags,categories,categories_tags,categories_en,origins,origins_tags,origins_en,manufacturing_places,manufacturing_places_tags,labels,labels_tags,labels_en,emb_codes,emb_codes_tags,first_packaging_code_geo,cities,cities_tags,purchase_places,stores,countries,countries_tags,countries_en,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,allergens_en,traces,traces_tags,traces_en,serving_size,serving_quantity,no_nutrition_data,additives_n,additives,additives_tags,additives_en,nutriscore_score,nutriscore_grade,nova_group,pnns_groups_1,pnns_groups_2,food_groups,food_groups_tags,food_groups_en,states,states_tags,states_en,brand_owner,ecoscore_score,ecoscore_grade,nutrient_levels_tags,product_quantity,owner,data_quality_errors_tags,unique_scans_n,popularity_tags,completeness,last_image_t,last_image_datetime,main_category,main_category_en,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kj_100g,energy-kcal_100g,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,butyric-acid_100g,caproic-acid_100g,caprylic-acid_100g,capric-acid_100g,lauric-acid_100g,myristic-acid_100g,palmitic-acid_100g,stearic-acid_100g,arachidic-acid_100g,behenic-acid_100g,lignoceric-acid_100g,cerotic-acid_100g,montanic-acid_100g,melissic-acid_100g,unsaturated-fat_100g,monounsaturated-fat_100g,omega-9-fat_100g,polyunsaturated-fat_100g,omega-3-fat_100g,omega-6-fat_100g,alpha-linolenic-acid_100g,eicosapentaenoic-acid_100g,docosahexaenoic-acid_100g,linoleic-acid_100g,arachidonic-acid_100g,gamma-linolenic-acid_100g,dihomo-gamma-linolenic-acid_100g,oleic-acid_100g,elaidic-acid_100g,gondoic-acid_100g,mead-acid_100g,erucic-acid_100g,nervonic-acid_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,added-sugars_100g,sucrose_100g,glucose_100g,fructose_100g,lactose_100g,maltose_100g,maltodextrins_100g,starch_100g,polyols_100g,erythritol_100g,fiber_100g,soluble-fiber_100g,insoluble-fiber_100g,proteins_100g,casein_100g,serum-proteins_100g,nucleotides_100g,salt_100g,added-salt_100g,sodium_100g,alcohol_100g,vitamin-a_100g,beta-carotene_100g,vitamin-d_100g,vitamin-e_100g,vitamin-k_100g,vitamin-c_100g,vitamin-b1_100g,vitamin-b2_100g,vitamin-pp_100g,vitamin-b6_100g,vitamin-b9_100g,folates_100g,vitamin-b12_100g,biotin_100g,pantothenic-acid_100g,silica_100g,bicarbonate_100g,potassium_100g,chloride_100g,calcium_100g,phosphorus_100g,iron_100g,magnesium_100g,zinc_100g,copper_100g,manganese_100g,fluoride_100g,selenium_100g,chromium_100g,molybdenum_100g,iodine_100g,caffeine_100g,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-dried_100g,fruits-vegetables-nuts-estimate_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,carbon-footprint-from-meat-or-fish_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g,sulphate_100g,nitrate_100g,acidity_100g
count,567867.0,567867.0,567867,567867,567867.0,567867,567867.0,567867,556437,567865.0,567865,550590,30,7029,86369,19712,19714,19712,926,301849,301824,281911,281911,281911,6828,6824,6822,7176,7173,104024,104030,104027,7238,7240,4476,0.0,5040,12399,35860,565147,565147,565147,258004,257917,262799,61530,0.0,5886,8544,8494,283372,281016.0,3515,258005.0,2,170433,170433,244767.0,566007,243393.0,566034,566034,219415,219415,219415,567867,567867,567867,196037,103508.0,566006,256349,83727.0,2657,24446,66221.0,65859,567867.0,336459.0,336459,281911,281911,318197,318197,76896,76896,225019,225019,15902.0,471685.0,472810.0,732.0,469329.0,432183.0,6.0,0.0,1.0,2.0,3.0,0.0,3.0,3.0,99.0,39.0,1.0,1.0,1.0,0.0,0.0,35454.0,7.0,35427.0,416.0,176.0,133.0,7.0,7.0,9.0,86.0,1.0,0.0,2.0,0.0,28.0,7.0,0.0,2.0,197789.0,202432.0,469564.0,447667.0,16968.0,11.0,5.0,2.0,21.0,3.0,14.0,240.0,1447.0,12.0,322222.0,3065.0,2435.0,469971.0,1.0,13.0,3.0,421304.0,0.0,421304.0,1783.0,137903.0,5.0,18449.0,599.0,476.0,141021.0,16774.0,16198.0,16811.0,9268.0,7389.0,6530.0,6703.0,218.0,2571.0,16.0,26.0,87598.0,48.0,192881.0,8129.0,193010.0,7351.0,6249.0,2458.0,2255.0,5.0,1077.0,68.0,110.0,644.0,259.0,33.0,18.0,74.0,15.0,198.0,257915.0,5.0,98.0,1.0,7.0,230.0,244769.0,2.0,1.0,0.0,5.0,757.0,0.0,8.0,1.0,1.0,7.0,1.0
unique,,,567835,4362,,418300,,405983,5759,,236124,360330,30,5354,11241,4856,3767,3758,687,42363,34790,23902,16094,16094,1525,1220,1214,1847,1657,10399,7416,7415,2399,2305,691,,883,1307,2060,2532,999,999,193984,174508,37,846,,1181,1339,1338,37143,,3,,2,43509,43509,,7,,11,41,45,45,45,2864,2864,2864,9520,,9,233,,26,411,,14814,,,336032,9078,9078,318170,318170,76871,76871,225004,225004,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
top,,,http://world-en.openfoodfacts.org/product/0219...,kiliweb,,2020-04-23T18:23:21Z,,2022-08-01T12:48:51Z,org-database-usda,,2024-10-21T09:05:35Z,Filet de poulet,Purée de Noix de Cajou grillées DDM : 11/22 Bi...,Ice cream,100g,Plastic,en:plastic,Plastic,1 metal can to recycle,Kroger,kroger,Snacks,en:snacks,Snacks,France,en:united-states,United States,Canada,canada,"No GMOs, Non GMO project","en:no-gmos,en:non-gmo-project","No GMOs,Non GMO project",FR 79.195.003 EC,fr-79-195-003-ec,"47.833333,-0.333333",,nueil-les-aubiers-deux-sevres-france,"Calgary,Alberta,Canada",Sainsbury's,United States,en:united-states,United States,"Carbonated water, natural flavor.","en:carbonated-water,en:water,en:natural-flavou...","en:palm-oil-free,en:vegan-status-unknown,en:ve...",en:milk,,en:nuts,en:nuts,Nuts,1 ONZ (28 g),,on,,[ farine-de-ble -> fr:farine-de-ble ] [ far...,en:e330,E330 - Citric acid,,unknown,,unknown,unknown,en:sweets,"en:sugary-snacks,en:sweets","Sugary snacks,Sweets","en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",Target Stores,,unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...",,org-carrefour,en:energy-value-in-kcal-does-not-match-value-c...,,"top-75-percent-scans-2023,top-80-percent-scans...",,,2019-06-26T17:41:18Z,en:groceries,Groceries,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
freq,,,2,228056,,27,,91,142498,,53,666,1,146,5448,993,2191,2191,9,4007,4026,19348,19388,19388,974,1466,1467,839,899,13807,14330,14330,344,368,310,,390,2530,4898,207467,363117,363117,431,775,48274,15836,,574,855,855,12527,,1709,,1,11833,11833,,317047,,349623,349623,21448,21448,21448,82978,82978,82978,4722,,453870,19222,,1088,17612,,9850,,,6,21847,21847,2,2,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
mean,283933.0,74650780000.0,,,1602006000.0,,1652305000.0,,,1715033000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1598363.0,,2.68577,,,,8.681587,,3.471809,,,,,,,,,,48.227499,,,713.8722,,,1.68584,,0.400535,1641893000.0,,,,,,,,,,1021.929885,492.7096,2058.613,332.839331,3635.589,1416.383,3.3463,,7.7,27.5,0.143723,,3.408433,0.033333,0.987631,8.252564,0.025,1.0,2.4,,,7.790952,2.425857,5.234665,2.238755,2.854832,4.971594,0.542571,0.357143,2.589678,2.085701,0.0,,1.500003,,0.223229,0.01893,,0.0365,505.6412,0.034913,215093200.0,14.007278,9.614848,6.441894,3.74036,10.0,0.770476,0.074667,0.194632,19.205486,29.390286,3.8805,2068970.0,2.402627,4.337557,141852700.0,0.1,6.648561,21.667567,189886600.0,,75954660.0,1.850088,0.166637,0.003539,0.01196,0.271707,0.33641,8.527151,0.13802,0.018595,0.012388,0.056688,0.008007,0.033214,0.04077809,0.465989,0.188655,25.044097,3.365377,0.530696,0.110063,0.136518,0.260023,0.005832,0.182884,0.026611,-0.000531,0.002684,2.2e-05,0.001036,0.006878,0.032823,0.155872,1.872369,0.336259,6.348111,35.421622,3.393333,47.846111,18.398473,15.2,55.762959,0.982,142.857143,504.379043,8.681655,17.0,54.0,,0.11546,0.000149,,9.381322,0.04,3.1e-05,0.001399,5.0
std,163929.226994,67832830000.0,,,75882570.0,,57097200.0,,,9682070.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,832306600.0,,3.389489,,,,8.93907,,0.961691,,,,,,,,,,21.718283,,,40804.6,,,2.710167,,0.146048,51385320.0,,,,,,,,,,720.886707,145610.3,608508.3,471.498732,2340064.0,912804.3,8.15865,,,31.819805,0.182937,,5.70957,0.057735,4.245946,49.595797,,,,,,13.48313,4.143628,9.651096,18.12597,5.779338,33.480759,0.507383,0.265386,5.165142,9.452997,,,2.121316,,0.556768,0.022722,,0.051336,224853.1,0.895133,145940000000.0,28.33945,16.264093,13.965524,6.888732,14.142136,1.837004,0.062011,0.522178,19.369214,33.556407,4.635183,1174441000.0,3.680523,5.689009,97246330000.0,,8.310745,37.526988,123251500000.0,,49300600000.0,6.612294,17.356487,0.006994,0.912495,5.129693,5.328487,3195.504,3.116757,0.151757,0.146104,2.643482,0.402934,2.117865,1.846539,4.018227,6.537711,93.441318,12.163935,13.998331,0.243866,4.523296,2.440791,0.347062,4.351519,1.557208,0.150129,0.033874,2.6e-05,0.018434,0.049022,0.264775,2.949043,13.759019,0.556253,3.93335,36.435491,9.879594,34.98685,30.829171,3.271085,24.678748,,161.834719,631.773532,8.939065,1.414214,,,0.112471,0.000759,,26.513951,,,0.002924,
min,0.0,1.0,,,1332445000.0,,1415653000.0,,,1707490000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,0.0,,,,-15.0,,1.0,,,,,,,,,,-30.0,,,0.0,,,1.0,,0.05,1332445000.0,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,7.7,5.0,0.00117,,0.0003,0.0,0.0,0.0,0.025,1.0,2.4,,,0.0,0.0,0.0,0.0,0.0,0.0,0.147,0.016,0.0001,0.0,0.0,,6e-06,,0.0,0.0,,0.0002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004,2e-05,0.0,0.0,0.423,0.0,0.0,0.0,0.0,0.1,0.00129,0.0002,0.0,,0.0,0.0,-0.00034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.896552,0.0,0.0,-2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.878101,10.0,3.0,0.982,0.0,8.88,-15.0,16.0,54.0,,0.01,0.0,,0.00135,0.04,3.1e-05,0.0,5.0
25%,141966.5,30800000000.0,,,1545333000.0,,1587662000.0,,,1707733000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,28.0,,0.0,,,,1.0,,3.0,,,,,,,,,,35.0,,,150.0,,,1.0,,0.2875,1619749000.0,,,,,,,,,,448.0,98.0,410.0,42.0,0.0,0.0,0.000725,,7.7,16.25,0.040585,,0.11265,0.0,0.0,0.049,0.025,1.0,2.4,,,0.0,0.0,0.0,0.074,0.4,0.0,0.2,0.1735,0.426,0.0,0.0,,0.750004,,0.0,5e-06,,0.01835,0.0,0.0,5.0,0.6,0.0,0.000417,0.0018,5.0,0.0,0.052,0.017,2.1,3.0,0.8,0.0,0.0,0.0,1.0,0.1,0.25,0.00135,0.121,,0.0485,0.0,0.0,4.7e-05,0.0,0.001075,1e-05,0.0,0.0,0.000212,0.003158,0.00017,4e-05,4e-05,3.8e-07,1e-06,0.000417,0.00263,0.029125,0.083,0.0022,0.0,0.088,0.0,0.017,0.000778,0.000208,0.0,4e-06,7e-06,6e-06,7e-06,1.2e-05,0.00945,0.0053,4.375,0.0,0.0,17.175,0.0,15.0,32.25,0.982,100.0,111.425,1.0,16.5,54.0,,0.0613,8e-06,,0.002083,0.04,3.1e-05,0.000127,5.0
50%,283933.0,57413170000.0,,,1599227000.0,,1661643000.0,,,1707865000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,50.0,,1.0,,,,10.0,,4.0,,,,,,,,,,46.0,,,330.0,,,1.0,,0.4,1650479000.0,,,,,,,,,,896.0,250.0,1046.0,167.0,5.8,1.72,0.0069,,7.7,27.5,0.08,,0.225,0.0,0.003,0.107,0.025,1.0,2.4,,,2.94,1e-06,1.79,0.159,1.09,0.0635,0.4,0.337,0.75,0.00182,0.0,,1.500003,,0.0005,0.02,,0.0365,0.0,0.0,19.76744,4.1,0.847,0.15,1.1,10.0,0.0,0.1,0.03865,12.5,11.5,2.665,1.25,2.0,2.0,5.6,0.1,3.0,0.0025,0.7083333,,0.2833333,0.0,0.0,0.000147,0.0,0.004,4e-05,0.0,0.000693,0.00034,0.004923,0.000533,7e-05,0.000112,1.71e-06,2e-05,0.00119,0.0816,0.089,0.17,0.0296,0.035,0.164,0.00092,0.057,0.00273,0.000481,0.001,7e-06,2.4e-05,1.4e-05,1.6e-05,1.6e-05,0.0285,0.112,7.0,27.0,0.0,40.0,0.284091,15.0,55.0,0.982,100.0,360.3,10.0,17.0,54.0,,0.07,2.3e-05,,0.00606,0.04,3.1e-05,0.00026,5.0
75%,425899.5,80432400000.0,,,1659396000.0,,1701695000.0,,,1728034000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,113.0,,4.0,,,,15.0,,4.0,,,,,,,,,,67.0,,,500.0,,,2.0,,0.5,1673040000.0,,,,,,,,,,1520.0,392.8571,1644.0,418.0,19.35,6.72,0.050725,,7.7,38.75,0.215,,5.1125,0.05,0.05095,0.205,0.025,1.0,2.4,,,8.33,4.1655,6.25,0.7155,2.3275,0.707,0.6255,0.5,2.18,0.024215,0.0,,2.250001,,0.003885,0.025,,0.05465,0.0,0.028,55.0,19.0,13.025,6.0,1.6,15.0,0.1,0.11,0.09705,30.05,55.0,3.33,3.53,3.0,7.0,12.0,0.1,8.0,32.50125,1.5,,0.6,0.0,0.000101,0.0015,1e-06,0.0127,0.00012,0.0033,0.001,0.000755,0.008,0.001282,0.00014,0.000214,4.84e-06,7.3e-05,0.002381,0.4,0.2455,0.297,0.12025,0.103,0.312,0.00235,0.122,0.00577,0.001071,0.002,5e-05,6.2e-05,6.7e-05,3.9e-05,0.000115,0.055,0.4,7.55,50.75,0.0,93.0,21.875,18.0,72.0,0.982,100.0,643.8,15.0,17.5,54.0,,0.136,0.0001,,0.01325,0.04,3.1e-05,0.00064,5.0



---------------------------

Unique elements by feature: 



erucic-acid_100g                         0
cities                                   0
elaidic-acid_100g                        0
dihomo-gamma-linolenic-acid_100g         0
unsaturated-fat_100g                     0
                                     ...  
created_t                           418300
created_datetime                    418300
code                                567834
url                                 567835
Unnamed: 0                          567867
Length: 207, dtype: int64


---------------------------

Percentage of missing values: 



Unnamed: 0.1,Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,last_modified_by,last_updated_t,last_updated_datetime,product_name,abbreviated_product_name,generic_name,quantity,packaging,packaging_tags,packaging_en,packaging_text,brands,brands_tags,categories,categories_tags,categories_en,origins,origins_tags,origins_en,manufacturing_places,manufacturing_places_tags,labels,labels_tags,labels_en,emb_codes,emb_codes_tags,first_packaging_code_geo,cities,cities_tags,purchase_places,stores,countries,countries_tags,countries_en,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,allergens_en,traces,traces_tags,traces_en,serving_size,serving_quantity,no_nutrition_data,additives_n,additives,additives_tags,additives_en,nutriscore_score,nutriscore_grade,nova_group,pnns_groups_1,pnns_groups_2,food_groups,food_groups_tags,food_groups_en,states,states_tags,states_en,brand_owner,ecoscore_score,ecoscore_grade,nutrient_levels_tags,product_quantity,owner,data_quality_errors_tags,unique_scans_n,popularity_tags,completeness,last_image_t,last_image_datetime,main_category,main_category_en,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kj_100g,energy-kcal_100g,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,butyric-acid_100g,caproic-acid_100g,caprylic-acid_100g,capric-acid_100g,lauric-acid_100g,myristic-acid_100g,palmitic-acid_100g,stearic-acid_100g,arachidic-acid_100g,behenic-acid_100g,lignoceric-acid_100g,cerotic-acid_100g,montanic-acid_100g,melissic-acid_100g,unsaturated-fat_100g,monounsaturated-fat_100g,omega-9-fat_100g,polyunsaturated-fat_100g,omega-3-fat_100g,omega-6-fat_100g,alpha-linolenic-acid_100g,eicosapentaenoic-acid_100g,docosahexaenoic-acid_100g,linoleic-acid_100g,arachidonic-acid_100g,gamma-linolenic-acid_100g,dihomo-gamma-linolenic-acid_100g,oleic-acid_100g,elaidic-acid_100g,gondoic-acid_100g,mead-acid_100g,erucic-acid_100g,nervonic-acid_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,added-sugars_100g,sucrose_100g,glucose_100g,fructose_100g,lactose_100g,maltose_100g,maltodextrins_100g,starch_100g,polyols_100g,erythritol_100g,fiber_100g,soluble-fiber_100g,insoluble-fiber_100g,proteins_100g,casein_100g,serum-proteins_100g,nucleotides_100g,salt_100g,added-salt_100g,sodium_100g,alcohol_100g,vitamin-a_100g,beta-carotene_100g,vitamin-d_100g,vitamin-e_100g,vitamin-k_100g,vitamin-c_100g,vitamin-b1_100g,vitamin-b2_100g,vitamin-pp_100g,vitamin-b6_100g,vitamin-b9_100g,folates_100g,vitamin-b12_100g,biotin_100g,pantothenic-acid_100g,silica_100g,bicarbonate_100g,potassium_100g,chloride_100g,calcium_100g,phosphorus_100g,iron_100g,magnesium_100g,zinc_100g,copper_100g,manganese_100g,fluoride_100g,selenium_100g,chromium_100g,molybdenum_100g,iodine_100g,caffeine_100g,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-dried_100g,fruits-vegetables-nuts-estimate_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,carbon-footprint-from-meat-or-fish_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g,sulphate_100g,nitrate_100g,acidity_100g
Null Percentage,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.012795,0.000352,0.000352,3.042438,99.994717,98.76221,84.790629,96.528765,96.528412,96.528765,99.836934,46.845124,46.849526,50.356157,50.356157,50.356157,98.797606,98.79831,98.798662,98.736324,98.736852,81.681626,81.68057,81.681098,98.725406,98.725054,99.211787,100.0,99.112468,97.816566,93.685141,0.478985,0.478985,0.478985,54.566122,54.581442,53.721734,89.164716,100.0,98.96349,98.495422,98.504227,50.098879,50.513765,99.381017,54.565946,99.999648,69.987162,69.987162,56.897126,0.327541,57.139084,0.322787,0.322787,61.361551,61.361551,61.361551,0.0,0.0,0.0,65.47836,81.772493,0.327718,54.857563,85.255879,99.532109,95.695119,88.338643,88.40239,0.0,40.750387,40.750387,50.356157,50.356157,43.966281,43.966281,86.458801,86.458801,60.3747,60.3747,97.199696,16.937417,16.739307,99.871097,17.352303,23.893623,99.998943,100.0,99.999824,99.999648,99.999472,100.0,99.999472,99.999472,99.982566,99.993132,99.999824,99.999824,99.999824,100.0,100.0,93.756637,99.998767,93.761391,99.926743,99.969007,99.976579,99.998767,99.998767,99.998415,99.984856,99.999824,100.0,99.999648,100.0,99.995069,99.998767,100.0,99.999648,65.169837,64.352216,17.31092,21.166928,97.011976,99.998063,99.99912,99.999648,99.996302,99.999472,99.997535,99.957737,99.745187,99.997887,43.257488,99.460261,99.571202,17.239248,99.999824,99.997711,99.999472,25.809388,100.0,25.809388,99.686018,75.715617,99.99912,96.751176,99.894518,99.916178,75.166544,97.046139,97.147572,97.039624,98.367928,98.698815,98.850083,98.819618,99.961611,99.547253,99.997182,99.995421,84.574205,99.991547,66.034124,98.568503,66.011408,98.705507,98.899566,99.567152,99.6029,99.99912,99.810343,99.988025,99.980629,99.886593,99.954391,99.994189,99.99683,99.986969,99.997359,99.965133,54.581795,99.99912,99.982742,99.999824,99.998767,99.959498,56.896773,99.999648,99.999824,100.0,99.99912,99.866694,100.0,99.998591,99.999824,99.999824,99.998767,99.999824


**Notes :**

Il y a beaucoup de données manquantes.

Nous allons dans un premier temps nous concentrer sur les données des produits vendus en France.

## Analyse Exploratoire des Données

In [4]:
# Find different states
unique_values = set()
rawdata['states_tags'].str.split(',').apply(unique_values.update)

states_df = pd.DataFrame(data=unique_values, columns=['states'])
display(states_df)

Unnamed: 0,states
0,en:photos-uploaded
1,en:nutrition-facts-completed
2,en:front-photo-to-be-selected
3,en:to-be-checked
4,en:packaging-code-completed
5,en:packaging-code-to-be-completed
6,en:origins-to-be-completed
7,en:nutrition-facts-to-be-completed
8,en:product-name-completed
9,en:categories-to-be-completed


In [5]:
# Vérifier que la colonne 'state' existe
if 'state' in rawdata.columns:
    # Compter les occurrences de chaque état
    states_count = rawdata['state'].value_counts().reset_index()
    states_count.columns = ['state', 'count']
    
    # Afficher le tableau
    print(states_count)
else:
    print("La colonne 'state' n'existe pas dans le dataset.")

La colonne 'state' n'existe pas dans le dataset.


**Notes :**

Il y a beaucoup de colonnes, nous allons sélectionner celles qui potentiellement pourront aider nos futurs modèles à apprendre et à répondre à notre problématique.

In [None]:
columns_to_keep = [
    'code',
    'url',
    'created_datetime',
    'last_modified_datetime',
    'product_name',
    'packaging',
    'categories_tags',
    'ingredients_tags',
    'ingredients_analysis_tags',
    'allergens',
    'traces_tags',
    'additives_n',
    'additives_tags',
    'nutriscore_grade',
    'food_groups_tags',
    'states_tags',
    'ecoscore_score',
    'ecoscore_grade',
    'nutrient_levels_tags',
    'popularity_tags',
    'main_category',
    'image_url',
    'image_small_url',
    'image_ingredients_url',
    'image_ingredients_small_url',
    'image_nutrition_url',
    'image_nutrition_small_url',
    'energy-kcal_100g',
    'energy_100g',
    'fat_100g',
    'cholesterol_100g',
    'sugars_100g',
    'proteins_100g',
    'salt_100g',
    'alcohol_100g'
]

data = rawdata[columns_to_keep]
display(data)

Unnamed: 0,code,url,created_datetime,last_modified_datetime,product_name,packaging,categories_tags,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_n,additives_tags,nutriscore_score,nutriscore_grade,food_groups_tags,states_tags,ecoscore_score,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kcal_100g,energy_100g,fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,alcohol_100g
0,1,http://world-en.openfoodfacts.org/product/0000...,2021-10-20T15:57:36Z,2024-11-05T01:55:46Z,Purée Mix Tropical Harmony + Aloe,,en:syrups,"fruit juice blend, guava and papaya purees, pa...","en:fruit-juice-blend,en:guava,en:fruit,en:papa...","en:may-contain-palm-oil,en:vegan,en:vegetarian",,,4.0,"en:e160a,en:e160ai,en:e330,en:e415,en:e440",-5.0,a,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-moderate-quantity,en:saturated-fat-i...",,en:syrups,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,360.000000,1530.0,10.000000,0.000,8.000000,49.000000,0.020000,
1,2,http://world-en.openfoodfacts.org/product/0000...,2024-08-02T13:47:35Z,2024-10-14T14:03:32Z,Poudre de grillon,,"en:beverages-and-beverages-preparations,en:bev...",,,,,,,,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-moderate-quantity,en:saturated-fat-i...",,en:instant-beverages,,,,,,,400.000000,1674.0,16.444445,,4.888889,10.000000,0.013123,
2,3,http://world-en.openfoodfacts.org/product/0000...,2024-05-27T13:59:03Z,2024-10-25T13:23:44Z,Feuchtes Toilettentuch - Kamille,,,Kein Rock gleicht den anderen Wir haben nachge...,en:kein-rock-gleicht-den-anderen-wir-haben-nac...,"en:palm-oil-content-unknown,en:vegan-status-un...",,,0.0,,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,160.000000,669.0,3.200000,0.002,1.400000,8.700000,0.807087,
3,4,http://world-en.openfoodfacts.org/product/0000...,2019-06-10T14:20:26Z,2024-10-30T19:35:25Z,Marinara,,"en:snacks,en:sweet-snacks,en:cocoa-and-its-pro...","Organic cocoa mass 60%, organic cane sugar, or...","en:cocoa-paste,en:plant,en:cocoa,en:cane-sugar...","en:palm-oil-free,en:vegan,en:vegetarian",,,0.0,,23.0,e,"en:sugary-snacks,en:chocolate-products","en:to-be-completed,en:nutrition-facts-complete...",41.0,d,"en:fat-in-low-quantity,en:saturated-fat-in-hig...",,en:dark-chocolate-bar,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,56.451613,2401.0,2.016129,,35.000000,1.612903,0.000000,
4,5,http://world-en.openfoodfacts.org/product/0000...,2020-11-14T07:08:40Z,2024-10-20T14:02:34Z,Bio inulin,Glass,"en:plant-based-foods-and-beverages,en:plant-ba...","Jus et purée d'abricots (50%), eau, sucre.","en:apricot-juice-and-puree,en:fruit,en:prunus-...","en:palm-oil-free,en:vegan,en:vegetarian",,,0.0,,-2.0,a,"en:fruits-and-vegetables,en:vegetables","en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...",,en:vegetables,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,208.000000,840.0,0.000000,,8.000000,0.000000,0.230000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
567862,2663923,http://world-en.openfoodfacts.org/product/0266...,2019-12-08T08:53:37Z,2024-10-03T16:10:35Z,,,,,,,,,,,,unknown,,"en:to-be-completed,en:nutrition-facts-to-be-co...",,unknown,,,,,,,,,,,,,,,,,
567863,266392301259,http://world-en.openfoodfacts.org/product/0266...,2022-08-07T23:24:48Z,2022-08-07T23:35:40Z,Chocolatine,,,,,,,,,,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,340.909091,1426.0,19.318182,,11.363636,6.818182,0.795455,
567864,266392804996,http://world-en.openfoodfacts.org/product/0266...,2022-06-18T23:17:26Z,2022-06-18T23:28:30Z,Chocolatine,,,,,,,,,,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,340.909091,1426.0,19.318182,,11.363636,6.818182,0.795455,
567865,266405008991,http://world-en.openfoodfacts.org/product/0266...,2021-11-09T21:45:45Z,2021-11-09T21:45:46Z,Jinga,,,,,,,,,,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,157.142857,657.0,7.142857,,1.428571,7.142857,1.285714,


In [7]:
# Liste des états cibles
target_states = ["en:completed", "en:origins-completed", "en:ingredients-completed"]

# Construire une expression régulière pour les états
pattern = '|'.join(target_states)

# Compter les lignes contenant au moins un des états cibles
count = data['states_tags'].str.contains(pattern).sum()

print("Nombre de lignes avec les états spécifiés:", count)


Nombre de lignes avec les états spécifiés: 258229


In [8]:
data = data[data['states_tags'].str.contains(pattern)]

In [9]:
count_data_allergen = data['allergens'].notna().sum()
print("Nombre de lignes avec des allergens:", count_data_allergen)

Nombre de lignes avec des allergens: 61186


**Notes:**

Nous allons remplacer les valeurs de la colonne allergens non renseignées par la valeur `en:none`.

In [10]:
data.loc[:, 'allergens'] = data['allergens'].astype(str)
data['allergens'] = data['allergens'].str.lower()
data['allergens'] = data['allergens'].apply(lambda x: 'en:none' if x == 'nan' else x)
data['allergens'].value_counts()

allergens
en:none                                                                       197056
en:milk                                                                        15769
en:gluten                                                                       9401
en:gluten,en:milk                                                               4692
en:nuts                                                                         2931
                                                                               ...  
en:milk,en:palm oil                                                                1
en:crustaceans,en:gluten,en:milk,en:molluscs,en:soybeans                           1
en:celery,en:eggs,en:gluten,en:milk,en:sesame-seeds                                1
en:celery,en:crustaceans,en:gluten,en:milk,en:sesame-seeds                         1
en:eggs,en:gluten,en:milk,en:sesame-seeds,en:sulphur-dioxide-and-sulphites         1
Name: count, Length: 816, dtype: int64

In [11]:
# Divise chaque entrée par ',' et compile tous les allergènes en une seule liste
allergens_list = data["allergens"].fillna('').str.split(',').sum()

# Compte les occurrences de chaque allergène
allergen_counts = Counter(allergens_list)

# Convertir le résultat en DataFrame
allergens_df = pd.DataFrame(allergen_counts.items(), columns=['allergen', 'count']).sort_values(by='count', ascending=False)
display(allergens_df)

Unnamed: 0,allergen,count
0,en:none,197061
3,en:milk,31839
2,en:gluten,24703
4,en:soybeans,11187
1,en:eggs,6832
...,...,...
156,en:nonfat milk,1
157,en:egg. sulfate,1
158,en:thiamine,1
159,fr:wheat gluten,1


**Notes :**

On peu se fier seulement aux allergènes contenant plus de 100 produits dans la base de données.

Nous pourronts mettre les autres dans une autre catégories nommée `other`.

In [12]:
display(allergens_df[allergens_df['count'] > 100])

Unnamed: 0,allergen,count
0,en:none,197061
3,en:milk,31839
2,en:gluten,24703
4,en:soybeans,11187
1,en:eggs,6832
6,en:nuts,6562
9,en:peanuts,4225
5,en:celery,2849
10,en:sesame-seeds,1698
13,en:sulphur-dioxide-and-sulphites,1435


**Notes:**

Nous allons considérer que la valeur vide correspond à pas d'allergène : `en:none`.

Nous allons mettre les allergènes qui ne sont pas dans cette liste dans une catégorie `other`.

Nous pouvons fusionner les valeurs `fr:Non` et `en:none` et traduire `fr:avoine` en `en:oats`.

In [13]:
# Liste des allergènes à conserver (ceux dont le count est supérieur à 100)
allergens_to_keep = allergens_df[allergens_df['count'] > 100]['allergen'].tolist()
allergens_to_keep.append('en:none') # Ajoute l'entrée none

data['allergens'] = data['allergens'].apply(lambda x: 'other' if x not in allergens_to_keep  else x)
data['allergens'] = data['allergens'].apply(lambda x: 'en:oats' if x == 'fr:avoine' else x)
data['allergens'].value_counts()

display(data)

Unnamed: 0,code,url,created_datetime,last_modified_datetime,product_name,packaging,categories_tags,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_n,additives_tags,nutriscore_score,nutriscore_grade,food_groups_tags,states_tags,ecoscore_score,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kcal_100g,energy_100g,fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,alcohol_100g
0,1,http://world-en.openfoodfacts.org/product/0000...,2021-10-20T15:57:36Z,2024-11-05T01:55:46Z,Purée Mix Tropical Harmony + Aloe,,en:syrups,"fruit juice blend, guava and papaya purees, pa...","en:fruit-juice-blend,en:guava,en:fruit,en:papa...","en:may-contain-palm-oil,en:vegan,en:vegetarian",en:none,,4.0,"en:e160a,en:e160ai,en:e330,en:e415,en:e440",-5.0,a,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-moderate-quantity,en:saturated-fat-i...",,en:syrups,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,360.000000,1530.0,10.000000,0.000,8.000000,49.000000,0.020000,
1,2,http://world-en.openfoodfacts.org/product/0000...,2024-08-02T13:47:35Z,2024-10-14T14:03:32Z,Poudre de grillon,,"en:beverages-and-beverages-preparations,en:bev...",,,,en:none,,,,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-moderate-quantity,en:saturated-fat-i...",,en:instant-beverages,,,,,,,400.000000,1674.0,16.444445,,4.888889,10.000000,0.013123,
2,3,http://world-en.openfoodfacts.org/product/0000...,2024-05-27T13:59:03Z,2024-10-25T13:23:44Z,Feuchtes Toilettentuch - Kamille,,,Kein Rock gleicht den anderen Wir haben nachge...,en:kein-rock-gleicht-den-anderen-wir-haben-nac...,"en:palm-oil-content-unknown,en:vegan-status-un...",en:none,,0.0,,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,160.000000,669.0,3.200000,0.002,1.400000,8.700000,0.807087,
3,4,http://world-en.openfoodfacts.org/product/0000...,2019-06-10T14:20:26Z,2024-10-30T19:35:25Z,Marinara,,"en:snacks,en:sweet-snacks,en:cocoa-and-its-pro...","Organic cocoa mass 60%, organic cane sugar, or...","en:cocoa-paste,en:plant,en:cocoa,en:cane-sugar...","en:palm-oil-free,en:vegan,en:vegetarian",en:none,,0.0,,23.0,e,"en:sugary-snacks,en:chocolate-products","en:to-be-completed,en:nutrition-facts-complete...",41.0,d,"en:fat-in-low-quantity,en:saturated-fat-in-hig...",,en:dark-chocolate-bar,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,56.451613,2401.0,2.016129,,35.000000,1.612903,0.000000,
4,5,http://world-en.openfoodfacts.org/product/0000...,2020-11-14T07:08:40Z,2024-10-20T14:02:34Z,Bio inulin,Glass,"en:plant-based-foods-and-beverages,en:plant-ba...","Jus et purée d'abricots (50%), eau, sucre.","en:apricot-juice-and-puree,en:fruit,en:prunus-...","en:palm-oil-free,en:vegan,en:vegetarian",en:none,,0.0,,-2.0,a,"en:fruits-and-vegetables,en:vegetables","en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...",,en:vegetables,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,208.000000,840.0,0.000000,,8.000000,0.000000,0.230000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
567796,2661427,http://world-en.openfoodfacts.org/product/0266...,2017-03-09T14:40:34Z,2022-06-21T16:28:13Z,Sunflower Seeds,,"en:plant-based-foods-and-beverages,en:plant-ba...","Sunflower seeds, salt.","en:sunflower-seed,en:plant,en:seed,en:sunflowe...","en:palm-oil-free,en:vegan,en:vegetarian",en:none,,0.0,,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",43.0,d,"en:fat-in-high-quantity,en:saturated-fat-in-mo...","bottom-25-percent-scans-2019,bottom-20-percent...",en:sunflower-seeds,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,615.000000,2573.0,53.850000,0.000,0.000000,30.770000,,
567806,266147009836,http://world-en.openfoodfacts.org/product/0266...,2024-07-06T12:25:38Z,2024-07-06T12:27:16Z,Roast beef,,,"rewards savings! $3,28 gredients: beef, water,...","en:rewards-savings-3-28-gredients,en:water,en:...","en:palm-oil-content-unknown,en:non-vegan,en:no...",en:none,,0.0,,,unknown,,"en:to-be-completed,en:nutrition-facts-to-be-co...",,unknown,,,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,,
567815,266191085548,http://world-en.openfoodfacts.org/product/0266...,2014-06-16T08:31:09Z,2024-10-04T13:16:52Z,P'tit plaisir au Chocolat Noir,"Plastique, Carton","en:snacks,en:desserts,en:sweet-snacks,en:biscu...","Chocolat noir 48% (Sucre de Canne, pâte de Cac...","en:dark-chocolate,en:chocolate,en:wheat-flour,...","en:palm-oil-free,en:non-vegan,en:maybe-vegetarian",en:none,"en:eggs,en:nuts,en:sesame-seeds,en:soybeans",2.0,"en:e334,en:e500,en:e500i",22.0,e,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-high-quantity,en:saturated-fat-in-hi...",,en:cakes,,,,,,,508.000000,2125.0,25.300000,,34.000000,7.100000,0.551180,
567821,2662060,http://world-en.openfoodfacts.org/product/0266...,2021-10-08T23:05:21Z,2021-10-08T23:06:33Z,black and gold soy sauce,,,water sat saw sace bo mate arotein iasses sat ...,en:water-sat-saw-sace-bo-mate-arotein-iasses-s...,"en:palm-oil-content-unknown,en:vegan-status-un...",en:none,,0.0,,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,,,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,,


In [14]:
# Séparer les allergènes par ",", et convertir en une liste
allergens_series = data['allergens'].str.split(',').explode().str.strip()
# Compter les occurrences de chaque allergène
allergens_counts = allergens_series.value_counts().reset_index()
allergens_counts.columns = ['allergen', 'count']

# Créer le graphique
fig = px.bar(allergens_counts, x='allergen', y='count', 
             title="Répartition des allergènes dans le dataset",
             labels={'allergen': 'Allergènes', 'count': 'Nombre d\'occurrences'},
             height=500)

fig.show()

Maintenant nous allons nous focaliser sur les traces d'allergènes.

In [15]:
data.loc[:, 'traces_tags'] = data['traces_tags'].astype(str).str.lower()

# Divise chaque entrée par ',' et compile tous les traces d'allergènes en une seule liste
traces_allergens_list = data["traces_tags"].fillna('').str.split(',').sum()

# Compte les occurrences de chaque traces d'allergène
traces_allergen_counts = Counter(traces_allergens_list)

# Convertir le résultat en DataFrame
traces_allergens_df = pd.DataFrame(traces_allergen_counts.items(), columns=['allergen', 'count']).sort_values(by='count', ascending=False)
display(allergens_df)

Unnamed: 0,allergen,count
0,en:none,197061
3,en:milk,31839
2,en:gluten,24703
4,en:soybeans,11187
1,en:eggs,6832
...,...,...
156,en:nonfat milk,1
157,en:egg. sulfate,1
158,en:thiamine,1
159,fr:wheat gluten,1


In [16]:
display(traces_allergens_df[traces_allergens_df['count'] > 20])

Unnamed: 0,allergen,count
0,,249861
1,en:nuts,3669
2,en:soybeans,2993
3,en:milk,2664
6,en:gluten,2213
9,en:peanuts,1884
16,en:eggs,1785
10,en:sesame-seeds,1022
8,en:mustard,449
38,en:sulphur-dioxide-and-sulphites,367


In [17]:
# Séparer les allergènes par ",", et convertir en une liste
data.loc[:, 'traces_tags'] = data['traces_tags'].astype(str)

traces_allergens_to_keep = traces_allergens_df[traces_allergens_df['count'] > 20]['allergen'].tolist()

data['traces_tags'] = data['traces_tags'].apply(lambda x: 'en:none' if (x == 'nan')  else x)
data['traces_tags'] = data['traces_tags'].apply(lambda x: 'other' if x not in traces_allergens_to_keep  else x)

traces_allergens_series = data['traces_tags'].str.split(',').explode().str.strip()
traces_allergens_counts = traces_allergens_series.value_counts().reset_index()
traces_allergens_counts.columns = ['traces_tags', 'count']

# Créer le graphique
fig = px.bar(traces_allergens_counts, x='traces_tags', y='count', 
             title="Répartition des traces d'allergènes dans le dataset",
             labels={'traces_tags': 'Traces allergènes', 'count': 'Nombre d\'occurrences'},
             height=500)

fig.show()

### Sélections et nettoyage des données

In [19]:
data.describe(include='all')

Unnamed: 0,code,url,created_datetime,last_modified_datetime,product_name,packaging,categories_tags,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_n,additives_tags,nutriscore_score,nutriscore_grade,food_groups_tags,states_tags,ecoscore_score,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kcal_100g,energy_100g,fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,alcohol_100g
count,258229.0,258229,258229,258229,255822,13965,225564,257208,257122,257147,258229,258229,257209.0,169892,209047.0,258218,173654,258229,66229.0,258216,215402,39090,225564,76981,76981,47127,47127,55942,55942,243743.0,244633.0,244964.0,183157.0,235393.0,244657.0,241743.0,1504.0
unique,,258207,116513,151277,177247,3860,12364,193195,173732,36,15,18,,43298,,7,44,1613,,9,222,11533,6695,76962,76962,47107,47107,55932,55932,,,,,,,,
top,,http://world-en.openfoodfacts.org/product/0202...,2020-04-23T18:23:21Z,2022-08-01T12:48:51Z,Ice cream,Plastic,en:snacks,"Carbonated water, natural flavor.","en:carbonated-water,en:water,en:natural-flavou...","en:palm-oil-free,en:vegan-status-unknown,en:ve...",en:none,en:none,,en:e330,,d,"en:sugary-snacks,en:sweets","en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...","top-75-percent-scans-2023,top-80-percent-scans...",en:groceries,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,
freq,,2,27,90,495,826,18931,431,775,48253,197056,249911,,11811,,63867,18459,82978,,184349,17113,6006,20470,2,2,2,2,2,2,,,,,,,,
mean,52602230000.0,,,,,,,,,,,,2.686092,,8.627342,,,,49.762294,,,,,,,,,,,263.769653,1102.914212,12.315812,0.032443,15.574527,6.998107,330930000.0,1.195133
std,38879960000.0,,,,,,,,,,,,3.390585,,8.990085,,,,20.844618,,,,,,,,,,,210.901112,880.613925,206.133901,0.894173,21.243031,13.766187,162709500000.0,5.633088
min,1.0,,,,,,,,,,,,0.0,,-15.0,,,,-30.0,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,28000360000.0,,,,,,,,,,,,0.0,,1.0,,,,35.0,,,,,,,,,,,88.0,368.0,0.0,0.0,1.18,0.01,0.085,0.0
50%,42400220000.0,,,,,,,,,,,,1.0,,10.0,,,,47.0,,,,,,,,,,,261.0,1092.0,4.41,0.0,5.36,4.35,0.675,0.0
75%,73296280000.0,,,,,,,,,,,,4.0,,15.0,,,,68.0,,,,,,,,,,,393.0,1644.0,18.58,0.022,22.86,10.0,1.4825,0.0


In [23]:
data[data['additives_n'] == 1.0].head()

Unnamed: 0,code,url,created_datetime,last_modified_datetime,product_name,packaging,categories_tags,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_n,additives_tags,nutriscore_score,nutriscore_grade,food_groups_tags,states_tags,ecoscore_score,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kcal_100g,energy_100g,fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,alcohol_100g
49,51,http://world-en.openfoodfacts.org/product/0000...,2016-12-01T19:59:24Z,2024-10-11T08:26:38Z,Fondants Citron,"Boîte carton, Sachet Plastique","en:plant-based-foods-and-beverages,en:plant-ba...","mûre bio 50% (fruit 27%, purée 23%) sucre de c...","en:blackberry,en:fruit,en:berries,en:cane-suga...","en:palm-oil-free,en:vegan-status-unknown,en:ve...",other,other,1.0,en:e440,10.0,c,"en:sugary-snacks,en:sweets","en:to-be-completed,en:nutrition-facts-complete...",64.0,b,"en:fat-in-low-quantity,en:saturated-fat-in-low...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:jams,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,249.0,1056.0,0.3,,59.0,0.5,0.05,
50,52,http://world-en.openfoodfacts.org/product/0000...,2023-09-06T09:13:41Z,2024-10-11T22:17:20Z,Organic Erythritol Granulated,,fr:croquettes-chat,100% certified organic granulated erythritol,en:e968,"en:palm-oil-free,en:vegan,en:vegetarian",en:none,en:none,1.0,en:e968,0.0,b,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...","top-75-percent-scans-2023,top-80-percent-scans...",fr:croquettes-chat,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,0.0,0.0,0.0,,0.0,0.0,0.0,
69,75,http://world-en.openfoodfacts.org/product/0000...,2020-04-08T13:01:34Z,2024-10-04T09:59:29Z,,,,"potato, high oleic sunflower oil (31%) flavour...","en:potato,en:vegetable,en:root-vegetable,en:tu...","en:palm-oil-free,en:vegan-status-unknown,en:ve...",en:none,en:none,1.0,en:e330,,unknown,,"en:to-be-completed,en:nutrition-facts-to-be-co...",,unknown,,,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,,
77,83,http://world-en.openfoodfacts.org/product/0000...,2022-06-06T14:47:37Z,2024-10-04T09:41:25Z,Xanthan gum,,"en:cooking-helpers,en:xanthan-gum",xanthan gum,en:e415,"en:palm-oil-free,en:vegan,en:vegetarian",en:none,en:none,1.0,en:e415,-6.0,a,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...",,en:xanthan-gum,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,183.0,766.0,0.0,,0.1,5.8,0.05,
102,112,http://world-en.openfoodfacts.org/product/0000...,2021-05-10T11:26:28Z,2024-10-04T09:43:04Z,Plus,,,wate NUTRITIONAL INFORMATION for 250 ml of pro...,it:wate-nutritional-information-for-250-ml-of-...,"en:may-contain-palm-oil,en:vegan,en:vegetarian",en:none,en:none,1.0,en:e101,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,215.13,900.0,6.79,,19.77,16.7,0.614,


- url --> ok
- created_datetime --> transform to 1 column for year
- last_modified_datetime --> transform to 1 column for year
- product_name --> delete empty rows
- packaging --> set 'unknown' for empty rows
- categories_tags --> set 'en:unknown' for empty rows
- ingredients_tags --> delete empty rows
- ingredients_analysis_tags --> set 'unknown' for empty rows
- allergens --> set 'en:none' for empty rows
- traces_tags --> set 'en:none' for empty rows
- additives_n --> ok
- additives_tags --> set en:none when empty
- nutriscore_grade --> set unknown when empty
- food_groups_tags --> set en:none when empty
- states_tags --> ok
- ecoscore_grade --> set unknown when empty
- nutrient_levels_tags --> set en:unknown when empty
- popularity_tags --> set unknown when empty
- main_category --> set en:none when empty
- image_url --> set 'en:none' for empty rows
- energy-kcal_100g --> set -1 for empty rows
- energy_100g --> set -1 for empty rows
- fat_100g --> set -1 for empty rows
- cholesterol_100g --> set -1 for empty rows
- sugars_100g --> set -1 for empty rows
- proteins_100g --> set -1 for empty rows
- salt_100g --> set -1 for empty rows


In [None]:
columns_to_keep = [
    'code',
    'url',
    'last_modified_datetime',
    'product_name',
    'packaging',
    'categories_tags',
    'ingredients_tags',
    'ingredients_analysis_tags',
    'allergens',
    'traces_tags',
    'additives_n',
    'additives_tags',
    'nutriscore_grade',
    'food_groups_tags',
    'states_tags',
    'ecoscore_grade',
    'nutrient_levels_tags',
    'popularity_tags',
    'main_category',
    'image_url',
    'energy_100g',
    'fat_100g',
    'cholesterol_100g',
    'sugars_100g',
    'proteins_100g',
    'salt_100g',
]

clean_data = data[columns_to_keep]
display(clean_data)

Unnamed: 0,code,url,created_datetime,last_modified_datetime,product_name,packaging,categories_tags,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_n,additives_tags,nutriscore_grade,food_groups_tags,states_tags,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,energy-kcal_100g,energy_100g,fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g
0,1,http://world-en.openfoodfacts.org/product/0000...,2021-10-20T15:57:36Z,2024-11-05T01:55:46Z,Purée Mix Tropical Harmony + Aloe,,en:syrups,"en:fruit-juice-blend,en:guava,en:fruit,en:papa...","en:may-contain-palm-oil,en:vegan,en:vegetarian",en:none,en:none,4.0,"en:e160a,en:e160ai,en:e330,en:e415,en:e440",a,,"en:to-be-completed,en:nutrition-facts-complete...",unknown,"en:fat-in-moderate-quantity,en:saturated-fat-i...",,en:syrups,https://images.openfoodfacts.org/images/produc...,360.000000,1530.0,10.000000,0.000,8.000000,49.000000,0.020000
1,2,http://world-en.openfoodfacts.org/product/0000...,2024-08-02T13:47:35Z,2024-10-14T14:03:32Z,Poudre de grillon,,"en:beverages-and-beverages-preparations,en:bev...",,,en:none,en:none,,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",unknown,"en:fat-in-moderate-quantity,en:saturated-fat-i...",,en:instant-beverages,,400.000000,1674.0,16.444445,,4.888889,10.000000,0.013123
2,3,http://world-en.openfoodfacts.org/product/0000...,2024-05-27T13:59:03Z,2024-10-25T13:23:44Z,Feuchtes Toilettentuch - Kamille,,,en:kein-rock-gleicht-den-anderen-wir-haben-nac...,"en:palm-oil-content-unknown,en:vegan-status-un...",en:none,en:none,0.0,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",unknown,,,,https://images.openfoodfacts.org/images/produc...,160.000000,669.0,3.200000,0.002,1.400000,8.700000,0.807087
3,4,http://world-en.openfoodfacts.org/product/0000...,2019-06-10T14:20:26Z,2024-10-30T19:35:25Z,Marinara,,"en:snacks,en:sweet-snacks,en:cocoa-and-its-pro...","en:cocoa-paste,en:plant,en:cocoa,en:cane-sugar...","en:palm-oil-free,en:vegan,en:vegetarian",en:none,en:none,0.0,,e,"en:sugary-snacks,en:chocolate-products","en:to-be-completed,en:nutrition-facts-complete...",d,"en:fat-in-low-quantity,en:saturated-fat-in-hig...",,en:dark-chocolate-bar,https://images.openfoodfacts.org/images/produc...,56.451613,2401.0,2.016129,,35.000000,1.612903,0.000000
4,5,http://world-en.openfoodfacts.org/product/0000...,2020-11-14T07:08:40Z,2024-10-20T14:02:34Z,Bio inulin,Glass,"en:plant-based-foods-and-beverages,en:plant-ba...","en:apricot-juice-and-puree,en:fruit,en:prunus-...","en:palm-oil-free,en:vegan,en:vegetarian",en:none,en:none,0.0,,a,"en:fruits-and-vegetables,en:vegetables","en:to-be-completed,en:nutrition-facts-complete...",unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...",,en:vegetables,https://images.openfoodfacts.org/images/produc...,208.000000,840.0,0.000000,,8.000000,0.000000,0.230000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
567796,2661427,http://world-en.openfoodfacts.org/product/0266...,2017-03-09T14:40:34Z,2022-06-21T16:28:13Z,Sunflower Seeds,,"en:plant-based-foods-and-beverages,en:plant-ba...","en:sunflower-seed,en:plant,en:seed,en:sunflowe...","en:palm-oil-free,en:vegan,en:vegetarian",en:none,en:none,0.0,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",d,"en:fat-in-high-quantity,en:saturated-fat-in-mo...","bottom-25-percent-scans-2019,bottom-20-percent...",en:sunflower-seeds,https://images.openfoodfacts.org/images/produc...,615.000000,2573.0,53.850000,0.000,0.000000,30.770000,
567806,266147009836,http://world-en.openfoodfacts.org/product/0266...,2024-07-06T12:25:38Z,2024-07-06T12:27:16Z,Roast beef,,,"en:rewards-savings-3-28-gredients,en:water,en:...","en:palm-oil-content-unknown,en:non-vegan,en:no...",en:none,en:none,0.0,,unknown,,"en:to-be-completed,en:nutrition-facts-to-be-co...",unknown,,,,,,,,,,,
567815,266191085548,http://world-en.openfoodfacts.org/product/0266...,2014-06-16T08:31:09Z,2024-10-04T13:16:52Z,P'tit plaisir au Chocolat Noir,"Plastique, Carton","en:snacks,en:desserts,en:sweet-snacks,en:biscu...","en:dark-chocolate,en:chocolate,en:wheat-flour,...","en:palm-oil-free,en:non-vegan,en:maybe-vegetarian",en:none,other,2.0,"en:e334,en:e500,en:e500i",e,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",unknown,"en:fat-in-high-quantity,en:saturated-fat-in-hi...",,en:cakes,,508.000000,2125.0,25.300000,,34.000000,7.100000,0.551180
567821,2662060,http://world-en.openfoodfacts.org/product/0266...,2021-10-08T23:05:21Z,2021-10-08T23:06:33Z,black and gold soy sauce,,,en:water-sat-saw-sace-bo-mate-arotein-iasses-s...,"en:palm-oil-content-unknown,en:vegan-status-un...",en:none,en:none,0.0,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",unknown,,,,,,,,,,,


In [None]:
import pandas as pd

def clean_dataset(df):
    # Convert 'created_datetime' et 'last_modified_datetime' en années
    df['created_year'] = pd.to_datetime(df['created_datetime'], errors='coerce').dt.year
    df['last_modified_year'] = pd.to_datetime(df['last_modified_datetime'], errors='coerce').dt.year
    df.drop(['last_modified_datetime'], axis=1, inplace=True)

    # Supprimer les lignes où 'product_name' ou 'ingredients_tags' sont vides
    df.dropna(subset=['product_name', 'ingredients_tags'], inplace=True)

    # Remplacer les valeurs manquantes par des valeurs par défaut
    df['packaging'].fillna('unknown', inplace=True)
    df['categories_tags'].fillna('en:unknown', inplace=True)
    df['ingredients_analysis_tags'].fillna('unknown', inplace=True)
    df['allergens'].fillna('en:none', inplace=True)
    df['traces_tags'].fillna('en:none', inplace=True)
    df['additives_tags'].fillna('en:none', inplace=True)
    df['nutriscore_grade'].fillna('unknown', inplace=True)
    df['food_groups_tags'].fillna('en:none', inplace=True)
    df['ecoscore_grade'].fillna('unknown', inplace=True)
    df['nutrient_levels_tags'].fillna('en:unknown', inplace=True)
    df['popularity_tags'].fillna('unknown', inplace=True)
    df['main_category'].fillna('en:none', inplace=True)
    df['image_url'].fillna('en:none', inplace=True)

    return df

clean_data = clean_dataset(clean_data)
clean_data.head()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


A value is trying to be set on a copy of a DataFr

Unnamed: 0,code,url,product_name,packaging,categories_tags,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_n,additives_tags,nutriscore_grade,food_groups_tags,states_tags,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,energy-kcal_100g,energy_100g,fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,created_year,last_modified_year
0,1,http://world-en.openfoodfacts.org/product/0000...,Purée Mix Tropical Harmony + Aloe,unknown,en:syrups,"en:fruit-juice-blend,en:guava,en:fruit,en:papa...","en:may-contain-palm-oil,en:vegan,en:vegetarian",en:none,en:none,4.0,"en:e160a,en:e160ai,en:e330,en:e415,en:e440",a,en:none,"en:to-be-completed,en:nutrition-facts-complete...",unknown,"en:fat-in-moderate-quantity,en:saturated-fat-i...",unknown,en:syrups,https://images.openfoodfacts.org/images/produc...,360.0,1530.0,10.0,0.0,8.0,49.0,0.02,2021,2024
2,3,http://world-en.openfoodfacts.org/product/0000...,Feuchtes Toilettentuch - Kamille,unknown,en:unknown,en:kein-rock-gleicht-den-anderen-wir-haben-nac...,"en:palm-oil-content-unknown,en:vegan-status-un...",en:none,en:none,0.0,en:none,unknown,en:none,"en:to-be-completed,en:nutrition-facts-complete...",unknown,en:unknown,unknown,en:none,https://images.openfoodfacts.org/images/produc...,160.0,669.0,3.2,0.002,1.4,8.7,0.807087,2024,2024
3,4,http://world-en.openfoodfacts.org/product/0000...,Marinara,unknown,"en:snacks,en:sweet-snacks,en:cocoa-and-its-pro...","en:cocoa-paste,en:plant,en:cocoa,en:cane-sugar...","en:palm-oil-free,en:vegan,en:vegetarian",en:none,en:none,0.0,en:none,e,"en:sugary-snacks,en:chocolate-products","en:to-be-completed,en:nutrition-facts-complete...",d,"en:fat-in-low-quantity,en:saturated-fat-in-hig...",unknown,en:dark-chocolate-bar,https://images.openfoodfacts.org/images/produc...,56.451613,2401.0,2.016129,-1.0,35.0,1.612903,0.0,2019,2024
4,5,http://world-en.openfoodfacts.org/product/0000...,Bio inulin,Glass,"en:plant-based-foods-and-beverages,en:plant-ba...","en:apricot-juice-and-puree,en:fruit,en:prunus-...","en:palm-oil-free,en:vegan,en:vegetarian",en:none,en:none,0.0,en:none,a,"en:fruits-and-vegetables,en:vegetables","en:to-be-completed,en:nutrition-facts-complete...",unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...",unknown,en:vegetables,https://images.openfoodfacts.org/images/produc...,208.0,840.0,0.0,-1.0,8.0,0.0,0.23,2020,2024
5,7,http://world-en.openfoodfacts.org/product/0000...,Banquet mega bowls,unknown,en:butfalo-mac-and-cheese,"es:honig-stillende-frauen-nicht-geeignet,es:d-...","en:palm-oil-content-unknown,en:vegan-status-un...",en:none,en:none,0.0,en:none,unknown,en:none,"en:to-be-completed,en:nutrition-facts-to-be-co...",unknown,en:unknown,"top-75-percent-scans-2023,top-80-percent-scans...",en:butfalo-mac-and-cheese,en:none,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2023,2024


In [27]:
clean_data['nutriscore_grade'].value_counts()

nutriscore_grade
d                 63579
unknown           44631
c                 40347
a                 37358
e                 37233
b                 29669
not-applicable     1960
Name: count, dtype: int64