In [1]:
import pandas as pd
from collections import Counter
import numpy as np
pd.options.display.max_columns=None
pd.options.display.max_colwidth=None
pd.options.display.max_rows=None
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [2]:
file_path = "../data/en.openfoodfacts.org.products.csv.gz"

In [3]:
chunk_size = 10000

filtered_chunks = []

for chunk in pd.read_csv(
    file_path,
    chunksize=chunk_size,
    compression="gzip",
    sep="\t",
    engine="python",
    quoting=3,
):
    filtered_chunks_en = chunk[
        (chunk["countries_tags"] == "en:france") & (chunk["ingredients_tags"].notnull() & chunk["product_name"].notnull())
    ]
    filtered_chunks.append(filtered_chunks_en)

if filtered_chunks:
    data_en = pd.concat(filtered_chunks, axis=0)

In [4]:
# Find the different state of each rows
states_unique_values = set()
data_en["states_tags"].str.split(",").apply(states_unique_values.update)

states_df = pd.DataFrame(data=states_unique_values, columns=["states"])
display(states_df)

Unnamed: 0,states
0,en:quantity-completed
1,en:nutrition-facts-to-be-completed
2,en:expiration-date-to-be-completed
3,en:photos-to-be-validated
4,en:photos-uploaded
5,en:packaging-code-to-be-completed
6,en:ingredients-completed
7,en:packaging-to-be-completed
8,en:packaging-photo-to-be-selected
9,en:origins-to-be-completed


In [5]:
# Target sta
target_states = [
    "en:completed",
    "en:origins-completed",
    "en:ingredients-completed",
    "en:characteristics-completed",
]

# Construire une expression régulière pour les états
pattern = "|".join(target_states)

# Compter les lignes contenant au moins un des états cibles
count = data_en["states_tags"].str.contains(pattern).sum()

print("Nombre de lignes avec les états spécifiés:", count)

data_en_filtered = data_en[data_en["states_tags"].str.contains(pattern)]

Nombre de lignes avec les états spécifiés: 255537


In [6]:
len(data_en) - 258361

1181

In [7]:
describe_sample = data_en_filtered.describe(include="all").drop("top").T

In [8]:
describe_sample.T

Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,last_modified_by,last_updated_t,last_updated_datetime,product_name,abbreviated_product_name,generic_name,quantity,packaging,packaging_tags,packaging_en,packaging_text,brands,brands_tags,categories,categories_tags,categories_en,origins,origins_tags,origins_en,manufacturing_places,manufacturing_places_tags,labels,labels_tags,labels_en,emb_codes,emb_codes_tags,first_packaging_code_geo,cities,cities_tags,purchase_places,stores,countries,countries_tags,countries_en,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,allergens_en,traces,traces_tags,traces_en,serving_size,serving_quantity,no_nutrition_data,additives_n,additives,additives_tags,additives_en,nutriscore_score,nutriscore_grade,nova_group,pnns_groups_1,pnns_groups_2,food_groups,food_groups_tags,food_groups_en,states,states_tags,states_en,brand_owner,ecoscore_score,ecoscore_grade,nutrient_levels_tags,product_quantity,owner,data_quality_errors_tags,unique_scans_n,popularity_tags,completeness,last_image_t,last_image_datetime,main_category,main_category_en,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kj_100g,energy-kcal_100g,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,butyric-acid_100g,caproic-acid_100g,caprylic-acid_100g,capric-acid_100g,lauric-acid_100g,myristic-acid_100g,palmitic-acid_100g,stearic-acid_100g,arachidic-acid_100g,behenic-acid_100g,lignoceric-acid_100g,cerotic-acid_100g,montanic-acid_100g,melissic-acid_100g,unsaturated-fat_100g,monounsaturated-fat_100g,omega-9-fat_100g,polyunsaturated-fat_100g,omega-3-fat_100g,omega-6-fat_100g,alpha-linolenic-acid_100g,eicosapentaenoic-acid_100g,docosahexaenoic-acid_100g,linoleic-acid_100g,arachidonic-acid_100g,gamma-linolenic-acid_100g,dihomo-gamma-linolenic-acid_100g,oleic-acid_100g,elaidic-acid_100g,gondoic-acid_100g,mead-acid_100g,erucic-acid_100g,nervonic-acid_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,added-sugars_100g,sucrose_100g,glucose_100g,fructose_100g,lactose_100g,maltose_100g,maltodextrins_100g,starch_100g,polyols_100g,erythritol_100g,fiber_100g,soluble-fiber_100g,insoluble-fiber_100g,proteins_100g,casein_100g,serum-proteins_100g,nucleotides_100g,salt_100g,added-salt_100g,sodium_100g,alcohol_100g,vitamin-a_100g,beta-carotene_100g,vitamin-d_100g,vitamin-e_100g,vitamin-k_100g,vitamin-c_100g,vitamin-b1_100g,vitamin-b2_100g,vitamin-pp_100g,vitamin-b6_100g,vitamin-b9_100g,folates_100g,vitamin-b12_100g,biotin_100g,pantothenic-acid_100g,silica_100g,bicarbonate_100g,potassium_100g,chloride_100g,calcium_100g,phosphorus_100g,iron_100g,magnesium_100g,zinc_100g,copper_100g,manganese_100g,fluoride_100g,selenium_100g,chromium_100g,molybdenum_100g,iodine_100g,caffeine_100g,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-dried_100g,fruits-vegetables-nuts-estimate_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,carbon-footprint-from-meat-or-fish_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g,sulphate_100g,nitrate_100g,acidity_100g
count,255537.0,255537.0,255536.0,255537.0,255537.0,255537.0,255537.0,249517.0,255477.0,255477.0,255537.0,13865.0,72394.0,190388.0,130135.0,130135.0,130135.0,12100.0,241099.0,241092.0,220552.0,220552.0,220552.0,50434.0,50398.0,50397.0,61678.0,61655.0,159399.0,159399.0,159399.0,61551.0,61540.0,48041.0,0.0,51396.0,76756.0,112432.0,255537.0,255537.0,255537.0,255536.0,255537.0,255535.0,99877.0,0.0,59420.0,76867.0,75917.0,64571.0,62647.0,13222.0,255537.0,2.0,130552.0,130552.0,190433.0,255522.0,222182.0,255536.0,255537.0,198915.0,198915.0,198915.0,255537.0,255537.0,255537.0,3875.0,159029.0,255530.0,192600.0,188947.0,50071.0,8343.0,186866.0,183512.0,255537.0,249827.0,249827.0,220552.0,220552.0,237764.0,237764.0,222736.0,222736.0,181955.0,181955.0,84818.0,211997.0,229760.0,27.0,229518.0,227845.0,4.0,4.0,3.0,3.0,4.0,2.0,3.0,2.0,21.0,8.0,1.0,1.0,3.0,7.0,23.0,1750.0,38.0,1760.0,1292.0,271.0,555.0,57.0,129.0,422.0,52.0,1.0,1.0,27.0,0.0,1.0,2.0,0.0,2.0,544.0,623.0,229404.0,228426.0,125.0,19.0,16.0,32.0,177.0,11.0,82.0,167.0,916.0,9.0,128035.0,4.0,2.0,229377.0,48.0,39.0,19.0,226290.0,0.0,226290.0,7533.0,985.0,35.0,1429.0,1549.0,274.0,2137.0,1376.0,1021.0,1011.0,1274.0,1004.0,18.0,971.0,410.0,737.0,50.0,207.0,1153.0,384.0,4759.0,1110.0,2069.0,1708.0,635.0,361.0,373.0,223.0,308.0,44.0,42.0,375.0,73.0,55.0,193.0,8311.0,773.0,8095.0,255535.0,279.0,3097.0,2.0,245.0,11313.0,190435.0,2.0,4.0,0.0,48.0,6.0,10.0,39.0,32.0,17.0,27.0,3.0
unique,254765.0,254771.0,4760.0,,251188.0,,223847.0,6445.0,,150892.0,197147.0,12874.0,51932.0,17339.0,22193.0,18379.0,18364.0,5706.0,56810.0,41646.0,62753.0,49351.0,49344.0,8336.0,6518.0,6458.0,12826.0,11439.0,45709.0,38920.0,38920.0,17475.0,15144.0,2858.0,,5752.0,5347.0,7964.0,58.0,1.0,1.0,230914.0,194509.0,37.0,3229.0,,5628.0,5998.0,5997.0,8346.0,,4.0,,2.0,32824.0,32824.0,,7.0,,11.0,40.0,45.0,45.0,45.0,1653.0,1653.0,1653.0,73.0,,9.0,223.0,,218.0,169.0,,85392.0,,,248191.0,16910.0,16908.0,237019.0,237019.0,222021.0,222021.0,181381.0,181381.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
freq,2.0,2.0,159554.0,,13.0,,23.0,48625.0,,34.0,204.0,18.0,275.0,8577.0,6984.0,7430.0,7430.0,521.0,5597.0,5797.0,1608.0,1673.0,1673.0,20803.0,21245.0,21247.0,22825.0,23659.0,9927.0,10689.0,10689.0,345.0,349.0,743.0,,523.0,38005.0,8318.0,223914.0,255537.0,255537.0,516.0,784.0,45120.0,23475.0,,5301.0,7307.0,7307.0,5460.0,,10584.0,,1.0,6615.0,6615.0,,54461.0,,57407.0,57407.0,17229.0,17229.0,17229.0,20867.0,20867.0,20867.0,2312.0,,92385.0,22032.0,,8226.0,3182.0,,6056.0,,,4.0,8649.0,8649.0,2.0,2.0,2.0,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
mean,,,,1527304103.720948,,1668634890.138974,,,1722833773.189978,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,60023367.885525,,1.530373,,,,8.387764,,3.322924,,,,,,,,,,55.399967,,,454.511439,,,4.666119,,0.761083,1589385948.754882,,,,,,,,,,1060.292935,66982080208.446014,258965878560.86115,738.118519,13.527018,5.248403,6.84075,0.231506,3.314367,2.621,36.7925,9.51,3.886,32.47,0.116063,2.597562,0.003,0.0,20.760333,0.040721,12.865217,24.820513,84.320439,12.547839,4.24303,15.992693,1.343708,1.839791,0.841817,2.20308,0.055554,0.6,5.35,38.494037,,0.0023,0.0,,7.0071,0.196133,0.131685,27.325645,13.627139,15.293344,12.939023,14.118931,33.346875,35.750783,7.758274,4.522204,30.757096,60.271287,11.407778,2.952188,2.25,2.95,8.329264,3.643506,5.602054,5.954821,1.141308,,0.456737,4.512725,2.254331,1.135413,0.02675,0.110057,0.013152,0.24448,0.043807,0.07919,0.509009,0.162173,2.619952,5e-05,0.269481,0.43307,0.033527,1.810351,0.118012,1.29542,4.18016,0.556256,0.431859,0.075081,0.67744,0.073154,0.013351,0.145197,0.008365,1.266671,0.042636,0.863265,0.455328,0.507318,1.528764,6.727772,32.84932,10.684218,47.426233,19.172947,14.916487,50.525125,1.879,236.850844,617.129755,8.387817,9.5,37.175,,0.104835,1.966691,4.09,0.039503,0.010034,0.137547,0.079228,4.463333
std,,,,73025791.69258,,47423951.449729,,,10139414.283437,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15023446243.26374,,2.332051,,,,9.005885,,1.029884,,,,,,,,,,28.217601,,,2634.558903,,,12.2181,,0.171523,81141568.625038,,,,,,,,,,754.144011,30840647448999.633,124130857192479.4,951.320011,17.017752,7.948285,13.439585,0.445831,5.729307,3.31831,23.818779,13.279465,3.584855,41.761726,0.158363,7.033452,,,34.850428,0.052201,30.47787,26.07531,278.682433,17.056458,39.626893,29.110315,5.849538,7.177955,2.815056,6.196488,0.068529,,,28.291801,,,0.0,,9.889454,1.047265,1.462842,27.341646,19.75713,19.249787,27.13252,18.557303,29.308173,234.5608,14.854779,17.172962,25.034378,54.461962,23.133451,13.230077,1.93477,2.899138,9.153117,3.117017,15.342642,11.161286,7.177428,,2.87286,8.27851,34.6663,6.415153,0.401409,1.681411,0.199541,3.053857,0.562643,1.744564,12.615292,2.383742,78.95991,3.6e-05,6.489857,5.454268,0.33709,11.305845,0.459,15.661216,79.602692,7.494377,3.928758,0.706195,12.887696,0.715062,0.063156,2.349934,0.049242,17.301441,0.265353,3.814357,5.50397,1.162688,5.279059,1.143104,37.522228,27.722586,30.199189,48.506236,3.238563,22.902226,0.429921,339.512034,6226.408484,9.005864,12.020815,10.889865,,0.141583,4.817318,1.890003,0.02777,0.00494,0.346106,0.384357,3.83328
min,,,,1328021038.0,,1362683313.0,,,1706029284.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,0.0,,,,-20.0,,1.0,,,,,,,,,,-30.0,,,0.0,,,1.0,,0.2,1328986727.0,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.063,0.0,0.0042,0.583,1.07,0.12,0.108,2.94,0.0,0.001,0.003,0.0,0.281,0.00034,0.0,0.0,0.0055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,5.35,0.027,,0.0023,0.0,,0.0142,0.0,0.0,0.0,-0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.9,0.0,0.0014,0.0003,0.0025,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2e-06,0.0,0.0,0.0,0.0,0.87,0.0,0.0,0.0,-65.887256,3.7,0.0,1.575,0.0,0.049,-20.0,1.0,26.0,,0.0,0.0,0.5,0.0033,0.0008,2e-05,0.0,0.39
25%,,,,1493456912.0,,1644552349.0,,,1707737449.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,30.0,,0.0,,,,1.0,,3.0,,,,,,,,,,35.0,,,160.0,,,1.0,,0.6,1519927009.0,,,,,,,,,,424.0,111.0,460.0,79.5,1.1,0.25,0.10575,1.7e-05,0.00655,0.7065,36.3425,4.815,2.209,17.705,5.5e-05,0.003325,0.003,0.0,0.6405,0.006455,0.0,3.1,27.025,2.0,0.41,1.4,0.052,0.1,0.059,0.354,0.01,0.6,5.35,10.375,,0.0023,0.0,,3.51065,0.0,0.0,3.4,0.8,0.0,0.15,0.325,2.375,0.01,0.005006,0.0985,8.65,27.0,0.0,0.24,0.95,1.925,1.7,1.055,0.3,0.022,0.05,,0.02,0.0,7e-05,2.9e-05,1e-06,0.0018,6e-06,0.012,0.00021,0.00023,0.0029,0.00025,3e-05,1.6e-05,0.0,5e-06,0.0009,0.0015,0.01731,0.071,0.001988,0.115,0.13865,0.002,0.03,0.0008,0.00014,4e-05,1.5e-05,5e-06,1e-05,5e-06,1.5e-05,0.02,0.035,6.0,0.0,0.0,17.0,0.0,12.0,32.0,1.727,0.0,106.56,1.0,5.25,29.0,,0.06,3.1e-05,3.1,0.025,0.006875,0.0014,0.0001,2.695
50%,,,,1517336484.0,,1682576444.0,,,1729884983.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100.0,,1.0,,,,8.0,,4.0,,,,,,,,,,56.0,,,290.0,,,2.0,,0.7875,1600427974.0,,,,,,,,,,951.0,251.0,1038.0,420.0,7.1,1.98,0.15,0.013011,0.0089,0.83,48.55,9.51,4.31,32.47,0.004,0.0245,0.003,0.0,1.0,0.025,0.0,15.05,42.0,6.0,1.8,8.0,0.086,0.2,0.12,0.485,0.0455,0.6,5.35,33.0,,0.0023,0.0,,7.0071,0.0,0.0,14.8,3.6,6.0,0.3,5.1,28.5,6.0,0.231,0.12,31.0,66.0,0.0,1.6,1.6,2.95,6.0,2.9,1.9,0.024,0.5,,0.2,0.0,0.00014,0.0001,1e-06,0.006,1.3e-05,0.02,0.0005,0.00078,0.00649,0.000623,9.3e-05,3.9e-05,1e-06,1e-05,0.0023,0.00327,0.034,0.233,0.0322,0.12,0.225,0.004,0.087,0.002,0.000371,0.00032,5e-05,1.7e-05,1.3e-05,1.5e-05,4.7e-05,0.032,0.039,7.0,14.0,0.0,50.0,0.056676,15.0,51.0,1.879,126.0,325.6,8.0,9.5,37.0,,0.07,3.4e-05,4.0,0.029,0.00905,0.005,0.00043,5.0
75%,,,,1555696445.0,,1701685693.0,,,1729945528.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,150.0,,2.0,,,,15.0,,4.0,,,,,,,,,,76.0,,,500.0,,,4.0,,0.8875,1652448637.5,,,,,,,,,,1568.0,393.0,1636.0,971.5,21.0,7.2,6.885,0.2445,4.96945,3.64,49.0,14.205,5.775,47.235,0.259,0.35475,0.003,0.0,31.0,0.0484,1.9,40.0,62.55,13.625,3.2025,21.25,0.392,0.7,0.5,0.69825,0.082,0.6,5.35,67.0,,0.0023,0.0,,10.50355,0.1,0.01745,52.0,19.0,29.6,4.92,21.55,55.35,42.0,3.9,0.9275,50.0,94.12,6.67,3.5,2.9,3.975,11.9,5.4,5.3,7.25,1.2,,0.48,6.0,0.00045,0.0725,5e-06,0.013,3.6e-05,0.033,0.00091,0.0012,0.0133,0.0012,0.000166,8.8e-05,2e-06,1.8e-05,0.005,0.032525,0.1115,0.583,0.1655,0.269,0.35,0.0078,0.16,0.0042,0.00068,0.0015,0.00017,2.9e-05,4e-05,5e-05,9.5e-05,0.1,0.4,7.5,60.0,0.0,65.0,22.833333,15.0,70.0,2.031,335.0,615.0,15.0,13.75,45.175,,0.1565,4.3e-05,5.275,0.051,0.015,0.0174,0.001,6.5
max,,,,1730868771.0,,1730869030.0,,,1730869030.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3760275860057.0,,35.0,,,,40.0,,4.0,,,,,,,,,,125.0,,,1000000.0,,,842.0,,1.1,1730868895.0,,,,,,,,,,23200.0,1.42e+16,5.95e+16,3630.0,231.0,163.0,27.0,0.9,9.93,6.45,49.0,18.9,7.24,62.0,0.4,20.0,0.003,0.0,61.0,0.15,92.1,86.0,1750.0,77.0,1410.0,404.23,81.0,39.64,19.0,68.0,0.428,0.6,5.35,76.0,,0.0023,0.0,,14.0,19.0,24.0,620.0,725.0,76.9,92.8,55.0,101.4,3125.0,39.2,132.0,87.5,1370.0,68.0,4380.0,5.0,5.0,579.0,10.7,95.0,41.0,2500.0,,1000.0,100.0,800.0,38.0,9.7,50.0,3.3,87.0,15.0,55.0,400.0,75.0,2500.0,9.9e-05,200.0,100.0,7.5,80.0,6.3,412.0,1560.0,390.0,127.0,17.0,506.0,13.0,0.4,45.0,0.504,300.0,1.76,18.8,100.0,5.882,33.0,8.4,100.0,100.0,250.0,13461.0,25.0,100.0,2.183,2520.0,656298.6,40.0,18.0,48.7,,1.0,11.8,7.3,0.15,0.02,1.337,2.0,8.0


In [9]:
threshold = 0.85
fill_threshold_percent = describe_sample[describe_sample["count"] > len(data_en_filtered) * threshold].T.columns
fill_threshold_percent

Index(['code', 'url', 'creator', 'created_t', 'created_datetime',
       'last_modified_t', 'last_modified_datetime', 'last_modified_by',
       'last_updated_t', 'last_updated_datetime', 'product_name', 'brands',
       'brands_tags', 'categories', 'categories_tags', 'categories_en',
       'countries', 'countries_tags', 'countries_en', 'ingredients_text',
       'ingredients_tags', 'ingredients_analysis_tags', 'additives_n',
       'nutriscore_grade', 'nova_group', 'pnns_groups_1', 'pnns_groups_2',
       'states', 'states_tags', 'states_en', 'ecoscore_grade', 'completeness',
       'last_image_t', 'last_image_datetime', 'main_category',
       'main_category_en', 'image_url', 'image_small_url',
       'image_ingredients_url', 'image_ingredients_small_url', 'energy_100g',
       'fat_100g', 'saturated-fat_100g', 'carbohydrates_100g', 'sugars_100g',
       'proteins_100g', 'salt_100g', 'sodium_100g',
       'fruits-vegetables-nuts-estimate-from-ingredients_100g'],
      dtype='object'

In [10]:
columns_to_keep = [
    "code",
    "url",
    "last_modified_datetime",
    "product_name",
    "packaging_tags",
    "categories",
    "categories_tags",
    "ingredients_tags",
    "ingredients_analysis_tags",
    "food_groups_tags",
    "main_category",
    "allergens",
    "additives_tags",
    "traces_tags",
    "nutriscore_grade",
    "ecoscore_grade",
    "nutrient_levels_tags",
    "image_url",
    "energy_100g",
    "fat_100g",
    "saturated-fat_100g",
    "carbohydrates_100g",
    "sugars_100g",
    "proteins_100g",
    "salt_100g",
    "fruits-vegetables-nuts-estimate-from-ingredients_100g",
]
data_en_filtered = data_en_filtered[columns_to_keep]

In [11]:
print(len(data_en_filtered))
# Remove deplicates lines from the dataset
data_en_filtered = data_en_filtered.drop_duplicates()
print(len(data_en_filtered))

255537
255059


In [12]:
print(len(data_en_filtered))
# Remove the duplicates code in the dataset
data_en_filtered = data_en_filtered.drop_duplicates(subset="code")
print(len(data_en_filtered))

255059
254765


In [13]:
# Save the data_en_filtered
data_en_filtered.to_csv("../data/filtered_dataset_openfoodfacts_en.csv", index=False)

### Data Cleaning

In [13]:
data_en_filtered = pd.read_csv("../data/filtered_dataset_openfoodfacts_en.csv", dtype={'code': str})

In [14]:
data_en_filtered.head(2)

Unnamed: 0,code,url,last_modified_datetime,product_name,packaging_tags,categories,categories_tags,ingredients_tags,ingredients_analysis_tags,food_groups_tags,main_category,allergens,additives_tags,traces_tags,nutriscore_grade,ecoscore_grade,nutrient_levels_tags,image_url,energy_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g
0,5,http://world-en.openfoodfacts.org/product/00000005/bio-inulin-ewl,2024-10-20T14:02:34Z,Bio inulin,en:glass,Gemüse,"en:plant-based-foods-and-beverages,en:plant-based-foods,en:fruits-and-vegetables-based-foods,en:vegetables-based-foods,en:vegetables","en:apricot-juice-and-puree,en:fruit,en:prunus-species-fruit,en:apricot,en:apricot-juice,en:apricot-puree,en:water,en:sugar,en:added-sugar,en:disaccharide","en:palm-oil-free,en:vegan,en:vegetarian","en:fruits-and-vegetables,en:vegetables",en:vegetables,,,,a,unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low-quantity,en:sugars-in-moderate-quantity,en:salt-in-low-quantity",https://images.openfoodfacts.org/images/products/000/000/000/0005/front_de.34.400.jpg,840.0,0.0,0.0,8.0,8.0,0.0,0.23,50.0
1,10,http://world-en.openfoodfacts.org/product/00000010/madeleines-nature-bijou,2024-10-04T09:38:47Z,Madeleines nature,"en:plastic,en:cardboard,fr:boite-en-carton,fr:film-en-plastique","Snacks, Desserts, Snacks sucrés, Biscuits et gâteaux, Gâteaux, Madeleines, Madeleines natures","en:snacks,en:desserts,en:sweet-snacks,en:biscuits-and-cakes,en:cakes,en:madeleines,en:plain-madeleines","en:wheat-flour,en:cereal,en:flour,en:wheat,en:cereal-flour,en:sugar,en:added-sugar,en:disaccharide,en:colza-oil,en:oil-and-fat,en:vegetable-oil-and-fat,en:rapeseed-oil,en:free-range-chicken-eggs,en:egg,en:chicken-egg,en:free-range-eggs,en:glucose-fructose-syrup,en:monosaccharide,en:fructose,en:glucose,en:stabiliser,en:raising-agent,en:e331,en:salt,en:wheat-gluten,en:gluten,en:skimmed-milk-powder,en:dairy,en:milk-powder,en:vegetable-fiber,en:fiber,en:natural-flavouring,en:flavouring,en:e422,fr:carbonates-d-ammonium-carbonates-de-sodium","en:palm-oil-free,en:non-vegan,en:vegetarian-status-unknown","en:sugary-snacks,en:biscuits-and-cakes",en:plain-madeleines,"en:eggs,en:gluten,en:milk","en:e331,en:e422,en:e503","en:nuts,en:soybeans",d,c,"en:fat-in-high-quantity,en:saturated-fat-in-moderate-quantity,en:sugars-in-high-quantity,en:salt-in-moderate-quantity",https://images.openfoodfacts.org/images/products/000/000/000/0010/front_fr.37.400.jpg,1852.0,22.0,2.6,54.0,25.0,6.4,0.53,22.666667


In [6]:
def catetegorial_and_numerical_features(df):
    categorical_features = df.select_dtypes(include=['object', 'bool']).columns
    numerical_features = df.select_dtypes(include=['number']).columns
    return categorical_features, numerical_features

In [17]:
categorical_feature, numerical_feature = catetegorial_and_numerical_features(data_en_filtered)

In [18]:
categorical_feature

Index(['code', 'url', 'last_modified_datetime', 'product_name',
       'packaging_tags', 'categories', 'categories_tags', 'ingredients_tags',
       'ingredients_analysis_tags', 'food_groups_tags', 'main_category',
       'allergens', 'additives_tags', 'traces_tags', 'nutriscore_grade',
       'ecoscore_grade', 'nutrient_levels_tags', 'image_url'],
      dtype='object')

In [15]:
def clean_dataset(df):
    # Convertir les colonnes de date en années et gérer les erreurs
    df['last_modified_year'] = pd.to_datetime(df['last_modified_datetime'], errors='coerce').dt.year
    df.drop(['last_modified_datetime'], axis=1, inplace=True)

    # Liste des colonnes à remplir avec une valeur par défaut
    fill_unknown = [
        "packaging_tags",
        "categories_tags",
        "ingredients_tags",
        "ingredients_analysis_tags",
        "nutriscore_grade",
        "ecoscore_grade",
        "nutrient_levels_tags",
        "categories",
    ]
    fill_none = [
        "allergens",
        "traces_tags",
        "additives_tags",
        "food_groups_tags",
        "main_category",
        "image_url",
    ]

    # Remplacer les valeurs manquantes
    df[fill_unknown] = df[fill_unknown].fillna("unknown")
    df[fill_none] = df[fill_none].fillna("none")

    return df


# Appliquer la fonction
data_en_filtered_clean = clean_dataset(data_en_filtered)

In [20]:
to_split = [
    "packaging_tags",
    "categories_tags",
    "ingredients_tags",
    "ingredients_analysis_tags",
    "allergens",
    "additives_tags",
    "nutrient_levels_tags",
    "traces_tags",
    "main_category",
    "food_groups_tags",
    "categories",
]

In [21]:
for feature in to_split:
    data_en_filtered_clean[feature] = data_en_filtered_clean[feature].fillna("").str.replace("en:", "").str.split(",")

In [22]:
def select_frequent_tags(data, column_name, min_count=100, min_for_various_tags=100):
    all_tags = [item for items in data[column_name].dropna() for item in items]
    tag_counts = Counter(all_tags)
    frequent_tags = [tag for tag, count in tag_counts.items() if count >= min_count]
    if len(frequent_tags) > min_for_various_tags:
        frequent_tags = [tag for tag, _ in tag_counts.most_common(min_for_various_tags)]
    frequent_tags.append("other")
    return frequent_tags

In [23]:
# Applique cette fonction aux colonnes de to_split
frequent_tags = {
    col: select_frequent_tags(data_en_filtered_clean, col) for col in to_split
}

In [24]:
data_en_filtered_clean["food_groups_tags"].value_counts().reset_index().head()

Unnamed: 0,food_groups_tags,count
0,[none],56514
1,"[sugary-snacks, biscuits-and-cakes]",17164
2,"[sugary-snacks, sweets]",16469
3,"[composite-foods, one-dish-meals]",15040
4,"[fish-meat-eggs, processed-meat]",10615


In [25]:
len(frequent_tags["food_groups_tags"])

51

#### First dataset using boolean labels

In [26]:
#  Transformation en colonnes booléennes seulement pour les tags fréquents
new_columns = {}

for col, tags in frequent_tags.items():
    for tag in tags:
        new_columns[f"{col}_{tag}"] = data_en_filtered_clean[col].apply(
            lambda x: tag in x if isinstance(x, list) else False
        )

    # Créer une colonne booléenne pour les tags non fréquents
    new_columns[f"{col}_other"] = data_en_filtered[col].apply(
        lambda x: any(item not in tags for item in x) if isinstance(x, list) else False
    )

# Convert the dictionary to a DataFrame
new_columns_df = pd.DataFrame(new_columns)

In [27]:
print(data_en_filtered.shape)

print(new_columns_df.shape)

(254765, 26)
(254765, 723)


In [28]:
frequent_tags_list = list(frequent_tags.keys())

In [29]:
frequent_tags_list

['packaging_tags',
 'categories_tags',
 'ingredients_tags',
 'ingredients_analysis_tags',
 'allergens',
 'additives_tags',
 'nutrient_levels_tags',
 'traces_tags',
 'main_category',
 'food_groups_tags',
 'categories']

In [30]:
data_en_filtered_clean.head(2)

Unnamed: 0,code,url,product_name,packaging_tags,categories,categories_tags,ingredients_tags,ingredients_analysis_tags,food_groups_tags,main_category,allergens,additives_tags,traces_tags,nutriscore_grade,ecoscore_grade,nutrient_levels_tags,image_url,energy_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,last_modified_year
0,5,http://world-en.openfoodfacts.org/product/00000005/bio-inulin-ewl,Bio inulin,[glass],[Gemüse],"[plant-based-foods-and-beverages, plant-based-foods, fruits-and-vegetables-based-foods, vegetables-based-foods, vegetables]","[apricot-juice-and-puree, fruit, prunus-species-fruit, apricot, apricot-juice, apricot-puree, water, sugar, added-sugar, disaccharide]","[palm-oil-free, vegan, vegetarian]","[fruits-and-vegetables, vegetables]",[vegetables],[none],[none],[none],a,unknown,"[fat-in-low-quantity, saturated-fat-in-low-quantity, sugars-in-moderate-quantity, salt-in-low-quantity]",https://images.openfoodfacts.org/images/products/000/000/000/0005/front_de.34.400.jpg,840.0,0.0,0.0,8.0,8.0,0.0,0.23,50.0,2024
1,10,http://world-en.openfoodfacts.org/product/00000010/madeleines-nature-bijou,Madeleines nature,"[plastic, cardboard, fr:boite-en-carton, fr:film-en-plastique]","[Snacks, Desserts, Snacks sucrés, Biscuits et gâteaux, Gâteaux, Madeleines, Madeleines natures]","[snacks, desserts, sweet-snacks, biscuits-and-cakes, cakes, madeleines, plain-madeleines]","[wheat-flour, cereal, flour, wheat, cereal-flour, sugar, added-sugar, disaccharide, colza-oil, oil-and-fat, vegetable-oil-and-fat, rapeseed-oil, free-range-chicken-eggs, egg, chicken-egg, free-range-eggs, glucose-fructose-syrup, monosaccharide, fructose, glucose, stabiliser, raising-agent, e331, salt, wheat-gluten, gluten, skimmed-milk-powder, dairy, milk-powder, vegetable-fiber, fiber, natural-flavouring, flavouring, e422, fr:carbonates-d-ammonium-carbonates-de-sodium]","[palm-oil-free, non-vegan, vegetarian-status-unknown]","[sugary-snacks, biscuits-and-cakes]",[plain-madeleines],"[eggs, gluten, milk]","[e331, e422, e503]","[nuts, soybeans]",d,c,"[fat-in-high-quantity, saturated-fat-in-moderate-quantity, sugars-in-high-quantity, salt-in-moderate-quantity]",https://images.openfoodfacts.org/images/products/000/000/000/0010/front_fr.37.400.jpg,1852.0,22.0,2.6,54.0,25.0,6.4,0.53,22.666667,2024


In [31]:
categorical_feature, numerical_feature = catetegorial_and_numerical_features(data_en_filtered_clean)

In [32]:
categorical_feature

Index(['code', 'url', 'product_name', 'packaging_tags', 'categories',
       'categories_tags', 'ingredients_tags', 'ingredients_analysis_tags',
       'food_groups_tags', 'main_category', 'allergens', 'additives_tags',
       'traces_tags', 'nutriscore_grade', 'ecoscore_grade',
       'nutrient_levels_tags', 'image_url'],
      dtype='object')

In [33]:
# Drop columns listed in `frequent_tags_list` and display the first few rows
dataset_with_bool = data_en_filtered_clean.drop(columns=frequent_tags_list)

In [34]:
dataset_with_bool.head()

Unnamed: 0,code,url,product_name,nutriscore_grade,ecoscore_grade,image_url,energy_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,last_modified_year
0,5,http://world-en.openfoodfacts.org/product/00000005/bio-inulin-ewl,Bio inulin,a,unknown,https://images.openfoodfacts.org/images/products/000/000/000/0005/front_de.34.400.jpg,840.0,0.0,0.0,8.0,8.0,0.0,0.23,50.0,2024
1,10,http://world-en.openfoodfacts.org/product/00000010/madeleines-nature-bijou,Madeleines nature,d,c,https://images.openfoodfacts.org/images/products/000/000/000/0010/front_fr.37.400.jpg,1852.0,22.0,2.6,54.0,25.0,6.4,0.53,22.666667,2024
2,15,http://world-en.openfoodfacts.org/product/00000015/madeleines-chocolait-bijou,Madeleines ChocoLait,d,d,https://images.openfoodfacts.org/images/products/000/000/000/0015/front_fr.22.400.jpg,1926.0,24.0,6.0,54.0,31.0,6.4,0.48,16.25,2024
3,20,http://world-en.openfoodfacts.org/product/00000020/madeleines-choco-noir-bijou,Madeleines Choco Noir,d,d,https://images.openfoodfacts.org/images/products/000/000/000/0020/front_fr.39.400.jpg,1953.0,25.0,6.1,53.0,29.0,6.3,0.45,16.25,2024
4,22,http://world-en.openfoodfacts.org/product/00000022/farandole-de-madeleine-bijou,Farandole de madeleine,unknown,d,https://images.openfoodfacts.org/images/products/000/000/000/0022/front_fr.3.400.jpg,,,,,,,,1.75,2024


In [35]:
dataset_with_bool = pd.concat([dataset_with_bool, new_columns_df], axis=1)

In [36]:
# Save the dataset cleaned
dataset_with_bool.to_csv("../data/dataset_openfoodfacts_bool_cleaned.csv", index=False)

#### Second dataset using NLP TFIDF

In [37]:
data_en_filtered_clean.head(2)

Unnamed: 0,code,url,product_name,packaging_tags,categories,categories_tags,ingredients_tags,ingredients_analysis_tags,food_groups_tags,main_category,allergens,additives_tags,traces_tags,nutriscore_grade,ecoscore_grade,nutrient_levels_tags,image_url,energy_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,last_modified_year
0,5,http://world-en.openfoodfacts.org/product/00000005/bio-inulin-ewl,Bio inulin,[glass],[Gemüse],"[plant-based-foods-and-beverages, plant-based-foods, fruits-and-vegetables-based-foods, vegetables-based-foods, vegetables]","[apricot-juice-and-puree, fruit, prunus-species-fruit, apricot, apricot-juice, apricot-puree, water, sugar, added-sugar, disaccharide]","[palm-oil-free, vegan, vegetarian]","[fruits-and-vegetables, vegetables]",[vegetables],[none],[none],[none],a,unknown,"[fat-in-low-quantity, saturated-fat-in-low-quantity, sugars-in-moderate-quantity, salt-in-low-quantity]",https://images.openfoodfacts.org/images/products/000/000/000/0005/front_de.34.400.jpg,840.0,0.0,0.0,8.0,8.0,0.0,0.23,50.0,2024
1,10,http://world-en.openfoodfacts.org/product/00000010/madeleines-nature-bijou,Madeleines nature,"[plastic, cardboard, fr:boite-en-carton, fr:film-en-plastique]","[Snacks, Desserts, Snacks sucrés, Biscuits et gâteaux, Gâteaux, Madeleines, Madeleines natures]","[snacks, desserts, sweet-snacks, biscuits-and-cakes, cakes, madeleines, plain-madeleines]","[wheat-flour, cereal, flour, wheat, cereal-flour, sugar, added-sugar, disaccharide, colza-oil, oil-and-fat, vegetable-oil-and-fat, rapeseed-oil, free-range-chicken-eggs, egg, chicken-egg, free-range-eggs, glucose-fructose-syrup, monosaccharide, fructose, glucose, stabiliser, raising-agent, e331, salt, wheat-gluten, gluten, skimmed-milk-powder, dairy, milk-powder, vegetable-fiber, fiber, natural-flavouring, flavouring, e422, fr:carbonates-d-ammonium-carbonates-de-sodium]","[palm-oil-free, non-vegan, vegetarian-status-unknown]","[sugary-snacks, biscuits-and-cakes]",[plain-madeleines],"[eggs, gluten, milk]","[e331, e422, e503]","[nuts, soybeans]",d,c,"[fat-in-high-quantity, saturated-fat-in-moderate-quantity, sugars-in-high-quantity, salt-in-moderate-quantity]",https://images.openfoodfacts.org/images/products/000/000/000/0010/front_fr.37.400.jpg,1852.0,22.0,2.6,54.0,25.0,6.4,0.53,22.666667,2024


In [38]:
categorical_feature, numerical_feature = catetegorial_and_numerical_features(data_en_filtered_clean)

In [39]:
# Get the from the categorical features the feature which have list as object
text_features = [
    col for col in categorical_feature if isinstance(data_en_filtered_clean[col].iloc[0], list)
]
text_features

['packaging_tags',
 'categories',
 'categories_tags',
 'ingredients_tags',
 'ingredients_analysis_tags',
 'food_groups_tags',
 'main_category',
 'allergens',
 'additives_tags',
 'traces_tags',
 'nutrient_levels_tags']

In [40]:
data_en_filtered_clean.head(2)

Unnamed: 0,code,url,product_name,packaging_tags,categories,categories_tags,ingredients_tags,ingredients_analysis_tags,food_groups_tags,main_category,allergens,additives_tags,traces_tags,nutriscore_grade,ecoscore_grade,nutrient_levels_tags,image_url,energy_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,last_modified_year
0,5,http://world-en.openfoodfacts.org/product/00000005/bio-inulin-ewl,Bio inulin,[glass],[Gemüse],"[plant-based-foods-and-beverages, plant-based-foods, fruits-and-vegetables-based-foods, vegetables-based-foods, vegetables]","[apricot-juice-and-puree, fruit, prunus-species-fruit, apricot, apricot-juice, apricot-puree, water, sugar, added-sugar, disaccharide]","[palm-oil-free, vegan, vegetarian]","[fruits-and-vegetables, vegetables]",[vegetables],[none],[none],[none],a,unknown,"[fat-in-low-quantity, saturated-fat-in-low-quantity, sugars-in-moderate-quantity, salt-in-low-quantity]",https://images.openfoodfacts.org/images/products/000/000/000/0005/front_de.34.400.jpg,840.0,0.0,0.0,8.0,8.0,0.0,0.23,50.0,2024
1,10,http://world-en.openfoodfacts.org/product/00000010/madeleines-nature-bijou,Madeleines nature,"[plastic, cardboard, fr:boite-en-carton, fr:film-en-plastique]","[Snacks, Desserts, Snacks sucrés, Biscuits et gâteaux, Gâteaux, Madeleines, Madeleines natures]","[snacks, desserts, sweet-snacks, biscuits-and-cakes, cakes, madeleines, plain-madeleines]","[wheat-flour, cereal, flour, wheat, cereal-flour, sugar, added-sugar, disaccharide, colza-oil, oil-and-fat, vegetable-oil-and-fat, rapeseed-oil, free-range-chicken-eggs, egg, chicken-egg, free-range-eggs, glucose-fructose-syrup, monosaccharide, fructose, glucose, stabiliser, raising-agent, e331, salt, wheat-gluten, gluten, skimmed-milk-powder, dairy, milk-powder, vegetable-fiber, fiber, natural-flavouring, flavouring, e422, fr:carbonates-d-ammonium-carbonates-de-sodium]","[palm-oil-free, non-vegan, vegetarian-status-unknown]","[sugary-snacks, biscuits-and-cakes]",[plain-madeleines],"[eggs, gluten, milk]","[e331, e422, e503]","[nuts, soybeans]",d,c,"[fat-in-high-quantity, saturated-fat-in-moderate-quantity, sugars-in-high-quantity, salt-in-moderate-quantity]",https://images.openfoodfacts.org/images/products/000/000/000/0010/front_fr.37.400.jpg,1852.0,22.0,2.6,54.0,25.0,6.4,0.53,22.666667,2024


In [41]:
# Définir les paramètres pour la taille maximale du vocabulaire
max_vocab = 100
min_vocab = 50

# Initialiser un DataFrame vide pour stocker les résultats TF-IDF
tfidf_results = pd.DataFrame(index=data_en_filtered_clean.index)

# Boucle sur chaque colonne de `text_features` (qui contient des listes de tags)
for col in text_features:
    print(col)
    # Convertir les listes en chaînes de caractères séparées par des espaces pour compatibilité avec TF-IDF
    data_en_filtered_clean[col] = data_en_filtered_clean[col].apply(
        lambda x: " ".join(x) if isinstance(x, list) else ""
    )

    # Déterminer la taille du vocabulaire pour la colonne actuelle
    vectorizer = TfidfVectorizer()
    vectorizer.fit(data_en_filtered_clean[col])
    vocab_size = len(vectorizer.vocabulary_)

    # Définir `max_features` selon la taille du vocabulaire
    max_features = min(max_vocab, vocab_size) if vocab_size >= min_vocab else vocab_size

    # Reconfigurer TfidfVectorizer avec le nombre maximal de termes si nécessaire
    vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(data_en_filtered_clean[col])

    # Convertir la matrice TF-IDF en DataFrame avec noms de colonnes appropriés
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=[f"{col}_{term}" for term in vectorizer.get_feature_names_out()],
        index=data_en_filtered_clean.index,
    )

    # Ajouter le DataFrame TF-IDF aux résultats
    tfidf_results = pd.concat([tfidf_results, tfidf_df], axis=1)
    
    print('Done')

packaging_tags


Done
categories
Done
categories_tags
Done
ingredients_tags
Done
ingredients_analysis_tags
Done
food_groups_tags
Done
main_category
Done
allergens
Done
additives_tags
Done
traces_tags
Done
nutrient_levels_tags
Done


In [42]:
dataset_with_tfidf = data_en_filtered_clean.drop(columns=text_features)

In [43]:
dataset_with_tfidf.head()

Unnamed: 0,code,url,product_name,nutriscore_grade,ecoscore_grade,image_url,energy_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,last_modified_year
0,5,http://world-en.openfoodfacts.org/product/00000005/bio-inulin-ewl,Bio inulin,a,unknown,https://images.openfoodfacts.org/images/products/000/000/000/0005/front_de.34.400.jpg,840.0,0.0,0.0,8.0,8.0,0.0,0.23,50.0,2024
1,10,http://world-en.openfoodfacts.org/product/00000010/madeleines-nature-bijou,Madeleines nature,d,c,https://images.openfoodfacts.org/images/products/000/000/000/0010/front_fr.37.400.jpg,1852.0,22.0,2.6,54.0,25.0,6.4,0.53,22.666667,2024
2,15,http://world-en.openfoodfacts.org/product/00000015/madeleines-chocolait-bijou,Madeleines ChocoLait,d,d,https://images.openfoodfacts.org/images/products/000/000/000/0015/front_fr.22.400.jpg,1926.0,24.0,6.0,54.0,31.0,6.4,0.48,16.25,2024
3,20,http://world-en.openfoodfacts.org/product/00000020/madeleines-choco-noir-bijou,Madeleines Choco Noir,d,d,https://images.openfoodfacts.org/images/products/000/000/000/0020/front_fr.39.400.jpg,1953.0,25.0,6.1,53.0,29.0,6.3,0.45,16.25,2024
4,22,http://world-en.openfoodfacts.org/product/00000022/farandole-de-madeleine-bijou,Farandole de madeleine,unknown,d,https://images.openfoodfacts.org/images/products/000/000/000/0022/front_fr.3.400.jpg,,,,,,,,1.75,2024


In [44]:
dataset_with_tfidf = pd.concat([dataset_with_tfidf, tfidf_results], axis=1)

In [45]:
# Save the dataset
dataset_with_tfidf.to_csv("../data/dataset_openfoodfacts_tfidf_cleaned.csv", index=False)

In [46]:
dataset_with_tfidf.head()

Unnamed: 0,code,url,product_name,nutriscore_grade,ecoscore_grade,image_url,energy_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,last_modified_year,packaging_tags_aluminium,packaging_tags_au,packaging_tags_bac,packaging_tags_bag,packaging_tags_barquette,packaging_tags_blister,packaging_tags_bocal,packaging_tags_boite,packaging_tags_bottle,packaging_tags_bouchon,packaging_tags_bouteille,packaging_tags_box,packaging_tags_brick,packaging_tags_brique,packaging_tags_can,packaging_tags_canned,packaging_tags_cap,packaging_tags_capsule,packaging_tags_card,packaging_tags_cardboard,packaging_tags_carton,packaging_tags_container,packaging_tags_conteneur,packaging_tags_cork,packaging_tags_couvercle,packaging_tags_de,packaging_tags_dot,packaging_tags_drink,packaging_tags_dry,packaging_tags_emballage,packaging_tags_en,packaging_tags_envelope,packaging_tags_et,packaging_tags_etui,packaging_tags_facile,packaging_tags_feuille,packaging_tags_film,packaging_tags_flacon,packaging_tags_flask,packaging_tags_food,packaging_tags_fr,packaging_tags_fresh,packaging_tags_frozen,packaging_tags_gas,packaging_tags_glass,packaging_tags_green,packaging_tags_individual,packaging_tags_jar,packaging_tags_jeter,packaging_tags_lid,packaging_tags_metal,packaging_tags_metals,packaging_tags_mixed,packaging_tags_opercule,packaging_tags_or,packaging_tags_ouverture,packaging_tags_pack,packaging_tags_packed,packaging_tags_packet,packaging_tags_pak,packaging_tags_paper,packaging_tags_papier,packaging_tags_pensez,packaging_tags_pet,packaging_tags_plastic,packaging_tags_plastique,packaging_tags_point,packaging_tags_polyethylene,packaging_tags_polypropylene,packaging_tags_pot,packaging_tags_pots,packaging_tags_pouch,packaging_tags_pp,packaging_tags_protective,packaging_tags_recyclable,packaging_tags_recycle,packaging_tags_recycler,packaging_tags_refrigerated,packaging_tags_sac,packaging_tags_sachet,packaging_tags_sachets,packaging_tags_seal,packaging_tags_sheet,packaging_tags_sleeve,packaging_tags_sous,packaging_tags_steel,packaging_tags_terephthalate,packaging_tags_tetra,packaging_tags_tray,packaging_tags_tri,packaging_tags_triman,packaging_tags_tube,packaging_tags_unknown,packaging_tags_vacuum,packaging_tags_verre,packaging_tags_vert,packaging_tags_vial,packaging_tags_wine,packaging_tags_wood,packaging_tags_wrapper,categories_ajouté,categories_alimentaires,categories_aliments,categories_and,categories_au,categories_aux,categories_avec,categories_base,categories_based,categories_beverages,categories_biscuits,categories_bières,categories_blancs,categories_boissons,categories_bonbons,categories_bœuf,categories_cacao,categories_charcuteries,categories_chips,categories_chocolat,categories_chocolats,categories_compotes,categories_condiments,categories_confiseries,categories_confitures,categories_conserve,categories_conserves,categories_coques,categories_crèmes,categories_céréales,categories_de,categories_desserts,categories_déjeuners,categories_dérivés,categories_déshydratés,categories_en,categories_et,categories_fermentés,categories_foies,categories_foods,categories_frais,categories_fromages,categories_fruits,categories_graines,categories_gras,categories_grasses,categories_groceries,categories_gâteaux,categories_huiles,categories_jambons,categories_jus,categories_la,categories_lactés,categories_lait,categories_laitiers,categories_laits,categories_légumes,categories_légumineuses,categories_matières,categories_mer,categories_nectars,categories_origine,categories_pains,categories_petit,categories_pizzas,categories_plant,categories_plantes,categories_plats,categories_poissons,categories_pommes,categories_porc,categories_poulet,categories_pour,categories_products,categories_produits,categories_préparations,categories_préparés,categories_pâtes,categories_riz,categories_salés,categories_sans,categories_sauces,categories_secs,categories_snacks,categories_soupes,categories_sucre,categories_sucrés,categories_surgelés,categories_tartes,categories_tartiner,categories_terre,categories_unknown,categories_vache,categories_viande,categories_viandes,categories_volailles,categories_végétale,categories_végétales,categories_végétaux,categories_yaourts,categories_tags_alcoholic,categories_tags_and,categories_tags_appetizers,categories_tags_based,categories_tags_beef,categories_tags_beers,categories_tags_beverages,categories_tags_biscuits,categories_tags_breads,categories_tags_breakfasts,categories_tags_cakes,categories_tags_candies,categories_tags_canned,categories_tags_cereal,categories_tags_cereals,categories_tags_cheese,categories_tags_cheeses,categories_tags_chicken,categories_tags_chocolate,categories_tags_chocolates,categories_tags_cocoa,categories_tags_compotes,categories_tags_condiments,categories_tags_confectioneries,categories_tags_cow,categories_tags_cream,categories_tags_creams,categories_tags_crisps,categories_tags_dairies,categories_tags_dairy,categories_tags_de,categories_tags_desserts,categories_tags_dishes,categories_tags_dried,categories_tags_drinks,categories_tags_eggs,categories_tags_fats,categories_tags_fermented,categories_tags_fish,categories_tags_fishes,categories_tags_foods,categories_tags_fr,categories_tags_french,categories_tags_fresh,categories_tags_from,categories_tags_frozen,categories_tags_fruit,categories_tags_fruits,categories_tags_gras,categories_tags_groceries,categories_tags_hams,categories_tags_ice,categories_tags_in,categories_tags_its,categories_tags_jams,categories_tags_juices,categories_tags_legumes,categories_tags_meals,categories_tags_meat,categories_tags_meats,categories_tags_milk,categories_tags_milks,categories_tags_nectars,categories_tags_nuts,categories_tags_oils,categories_tags_olive,categories_tags_pasta,categories_tags_pastas,categories_tags_pastries,categories_tags_pies,categories_tags_pizzas,categories_tags_plant,categories_tags_pork,categories_tags_potatoes,categories_tags_poultries,categories_tags_preparations,categories_tags_prepared,categories_tags_products,categories_tags_rices,categories_tags_salads,categories_tags_salted,categories_tags_salty,categories_tags_sauces,categories_tags_sausages,categories_tags_seafood,categories_tags_seeds,categories_tags_snacks,categories_tags_soups,categories_tags_spreads,categories_tags_sweet,categories_tags_sweetened,categories_tags_teas,categories_tags_their,categories_tags_unknown,categories_tags_vegetable,categories_tags_vegetables,categories_tags_wheat,categories_tags_white,categories_tags_with,categories_tags_yogurts,ingredients_tags_acid,ingredients_tags_added,ingredients_tags_agent,ingredients_tags_alcohol,ingredients_tags_and,ingredients_tags_animal,ingredients_tags_antioxidant,ingredients_tags_apple,ingredients_tags_bean,ingredients_tags_butter,ingredients_tags_cane,ingredients_tags_carrot,ingredients_tags_cereal,ingredients_tags_cheese,ingredients_tags_chicken,ingredients_tags_chocolate,ingredients_tags_cocoa,ingredients_tags_colour,ingredients_tags_colza,ingredients_tags_condiment,ingredients_tags_corn,ingredients_tags_cream,ingredients_tags_dairy,ingredients_tags_de,ingredients_tags_dextrose,ingredients_tags_disaccharide,ingredients_tags_durum,ingredients_tags_e322,ingredients_tags_e322i,ingredients_tags_e330,ingredients_tags_egg,ingredients_tags_emulsifier,ingredients_tags_en,ingredients_tags_et,ingredients_tags_extract,ingredients_tags_family,ingredients_tags_fat,ingredients_tags_ferment,ingredients_tags_fiber,ingredients_tags_fish,ingredients_tags_flavouring,ingredients_tags_flour,ingredients_tags_fr,ingredients_tags_fructose,ingredients_tags_fruit,ingredients_tags_garlic,ingredients_tags_glucose,ingredients_tags_gluten,ingredients_tags_herb,ingredients_tags_juice,ingredients_tags_lecithin,ingredients_tags_legume,ingredients_tags_lemon,ingredients_tags_meat,ingredients_tags_milk,ingredients_tags_modified,ingredients_tags_monosaccharide,ingredients_tags_natural,ingredients_tags_nut,ingredients_tags_oil,ingredients_tags_olive,ingredients_tags_onion,ingredients_tags_palm,ingredients_tags_paste,ingredients_tags_pepper,ingredients_tags_plant,ingredients_tags_pork,ingredients_tags_potato,ingredients_tags_poultry,ingredients_tags_powder,ingredients_tags_preservative,ingredients_tags_protein,ingredients_tags_puree,ingredients_tags_rapeseed,ingredients_tags_rice,ingredients_tags_root,ingredients_tags_salt,ingredients_tags_seed,ingredients_tags_semolina,ingredients_tags_skimmed,ingredients_tags_soya,ingredients_tags_spice,ingredients_tags_stabiliser,ingredients_tags_starch,ingredients_tags_sugar,ingredients_tags_sunflower,ingredients_tags_syrup,ingredients_tags_taproot,ingredients_tags_thickener,ingredients_tags_tomato,ingredients_tags_tree,ingredients_tags_vanilla,ingredients_tags_vegetable,ingredients_tags_vinegar,ingredients_tags_water,ingredients_tags_wheat,ingredients_tags_whey,ingredients_tags_white,ingredients_tags_whole,ingredients_tags_yeast,ingredients_analysis_tags_contain,ingredients_analysis_tags_content,ingredients_analysis_tags_free,ingredients_analysis_tags_may,ingredients_analysis_tags_maybe,ingredients_analysis_tags_non,ingredients_analysis_tags_oil,ingredients_analysis_tags_palm,ingredients_analysis_tags_status,ingredients_analysis_tags_unknown,ingredients_analysis_tags_vegan,ingredients_analysis_tags_vegetarian,food_groups_tags_alcoholic,food_groups_tags_and,food_groups_tags_appetizers,food_groups_tags_artificially,food_groups_tags_based,food_groups_tags_beverages,food_groups_tags_biscuits,food_groups_tags_bread,food_groups_tags_breakfast,food_groups_tags_cakes,food_groups_tags_cereals,food_groups_tags_cheese,food_groups_tags_chocolate,food_groups_tags_coffees,food_groups_tags_composite,food_groups_tags_cream,food_groups_tags_dairy,food_groups_tags_desserts,food_groups_tags_dish,food_groups_tags_dressings,food_groups_tags_dried,food_groups_tags_eggs,food_groups_tags_fats,food_groups_tags_fatty,food_groups_tags_fish,food_groups_tags_flavored,food_groups_tags_foods,food_groups_tags_fruit,food_groups_tags_fruits,food_groups_tags_herbal,food_groups_tags_ice,food_groups_tags_juices,food_groups_tags_lean,food_groups_tags_legumes,food_groups_tags_meals,food_groups_tags_meat,food_groups_tags_milk,food_groups_tags_nectars,food_groups_tags_none,food_groups_tags_nuts,food_groups_tags_offals,food_groups_tags_one,food_groups_tags_other,food_groups_tags_pastries,food_groups_tags_pies,food_groups_tags_pizza,food_groups_tags_plant,food_groups_tags_potatoes,food_groups_tags_poultry,food_groups_tags_processed,food_groups_tags_products,food_groups_tags_quiches,food_groups_tags_salty,food_groups_tags_sandwiches,food_groups_tags_sauces,food_groups_tags_seafood,food_groups_tags_snacks,food_groups_tags_soups,food_groups_tags_substitutes,food_groups_tags_sugary,food_groups_tags_sweetened,food_groups_tags_sweets,food_groups_tags_teas,food_groups_tags_than,food_groups_tags_unsweetened,food_groups_tags_vegetables,food_groups_tags_waters,food_groups_tags_yogurt,main_category_and,main_category_apple,main_category_au,main_category_aux,main_category_bars,main_category_based,main_category_beans,main_category_beef,main_category_beers,main_category_beverages,main_category_biscuits,main_category_breads,main_category_cakes,main_category_candies,main_category_canned,main_category_cereals,main_category_cheese,main_category_cheeses,main_category_chicken,main_category_chocolate,main_category_chocolates,main_category_compotes,main_category_cooked,main_category_cookies,main_category_cream,main_category_crisps,main_category_dark,main_category_de,main_category_desserts,main_category_dried,main_category_dry,main_category_duck,main_category_eggs,main_category_filled,main_category_fish,main_category_foies,main_category_foods,main_category_fr,main_category_fresh,main_category_from,main_category_fromages,main_category_frozen,main_category_fruit,main_category_fruits,main_category_gras,main_category_green,main_category_groceries,main_category_ham,main_category_hams,main_category_hazelnuts,main_category_honeys,main_category_ice,main_category_in,main_category_jams,main_category_juices,main_category_la,main_category_made,main_category_meals,main_category_meat,main_category_milk,main_category_milks,main_category_mixed,main_category_none,main_category_of,main_category_oil,main_category_oils,main_category_olive,main_category_orange,main_category_pates,main_category_plain,main_category_pork,main_category_potato,main_category_preparations,main_category_raw,main_category_rice,main_category_rices,main_category_salads,main_category_salmons,main_category_salted,main_category_sausages,main_category_sliced,main_category_smoked,main_category_snacks,main_category_soups,main_category_spreads,main_category_squeezed,main_category_strawberry,main_category_sugar,main_category_sunflower,main_category_sweet,main_category_sweetened,main_category_teas,main_category_terrines,main_category_vegetable,main_category_vegetables,main_category_wheat,main_category_white,main_category_whole,main_category_with,main_category_yogurts,allergens_abricot,allergens_albacore,allergens_amandes,allergens_amidon,allergens_and,allergens_atlantique,allergens_au,allergens_avoine,allergens_base,allergens_beurre,allergens_blanc,allergens_blé,allergens_cacao,allergens_cajou,allergens_celery,allergens_ces,allergens_chapelure,allergens_chocolat,allergens_chèvre,allergens_citron,allergens_coco,allergens_crustaceans,allergens_crustacés,allergens_crème,allergens_céréales,allergens_de,allergens_dioxide,allergens_disulfite,allergens_du,allergens_eggs,allergens_en,allergens_entier,allergens_et,allergens_farine,allergens_ferments,allergens_fish,allergens_flocon,allergens_flocons,allergens_fr,allergens_france,allergens_fromage,allergens_fromagère,allergens_fruits,allergens_gluten,allergens_graines,allergens_grasse,allergens_gs1,allergens_iait,allergens_ingrédients,allergens_la,allergens_lait,allergens_levain,allergens_listao,allergens_lupin,allergens_maasdam,allergens_malt,allergens_matière,allergens_milk,allergens_molluscs,allergens_moutarde,allergens_mustard,allergens_noir,allergens_noisettes,allergens_noix,allergens_non,allergens_none,allergens_nuts,allergens_néant,allergens_orange,allergens_ou,allergens_peanuts,allergens_pecorino,allergens_potassium,allergens_poudre,allergens_produits,allergens_protéines,allergens_renseigne,allergens_rum,allergens_saint,allergens_sans,allergens_sarrasin,allergens_sauce,allergens_saumon,allergens_seeds,allergens_sesame,allergens_sodium,allergens_soja,allergens_soybeans,allergens_sulfureux,allergens_sulphites,allergens_sulphur,allergens_t4078,allergens_tamari,allergens_thon,allergens_transparence,allergens_vache,allergens_vin,allergens_écrémé,allergens_épeautre,allergens_œufs,additives_tags_e100,additives_tags_e101,additives_tags_e120,additives_tags_e133,additives_tags_e1400,additives_tags_e141,additives_tags_e14xx,additives_tags_e150,additives_tags_e150a,additives_tags_e150c,additives_tags_e150d,additives_tags_e153,additives_tags_e160,additives_tags_e160a,additives_tags_e160ai,additives_tags_e160b,additives_tags_e160c,additives_tags_e161b,additives_tags_e162,additives_tags_e163,additives_tags_e170,additives_tags_e171,additives_tags_e200,additives_tags_e202,additives_tags_e211,additives_tags_e220,additives_tags_e223,additives_tags_e224,additives_tags_e235,additives_tags_e250,additives_tags_e252,additives_tags_e260,additives_tags_e262,additives_tags_e262i,additives_tags_e270,additives_tags_e282,additives_tags_e290,additives_tags_e296,additives_tags_e300,additives_tags_e301,additives_tags_e306,additives_tags_e316,additives_tags_e322,additives_tags_e322i,additives_tags_e325,additives_tags_e326,additives_tags_e327,additives_tags_e330,additives_tags_e331,additives_tags_e331iii,additives_tags_e333,additives_tags_e334,additives_tags_e336,additives_tags_e339,additives_tags_e341,additives_tags_e392,additives_tags_e401,additives_tags_e406,additives_tags_e407,additives_tags_e410,additives_tags_e412,additives_tags_e414,additives_tags_e415,additives_tags_e420,additives_tags_e422,additives_tags_e428,additives_tags_e440,additives_tags_e445,additives_tags_e450,additives_tags_e450i,additives_tags_e451,additives_tags_e452,additives_tags_e460,additives_tags_e461,additives_tags_e466,additives_tags_e471,additives_tags_e472b,additives_tags_e472e,additives_tags_e476,additives_tags_e481,additives_tags_e500,additives_tags_e500i,additives_tags_e500ii,additives_tags_e501,additives_tags_e503,additives_tags_e503ii,additives_tags_e509,additives_tags_e551,additives_tags_e621,additives_tags_e627,additives_tags_e631,additives_tags_e901,additives_tags_e903,additives_tags_e904,additives_tags_e950,additives_tags_e951,additives_tags_e955,additives_tags_e960,additives_tags_e965,additives_tags_none,traces_tags_2020,traces_tags_alcool,traces_tags_allergenes,traces_tags_amandes,traces_tags_and,traces_tags_arachide,traces_tags_arachides,traces_tags_arete,traces_tags_aretes,traces_tags_atelier,traces_tags_autres,traces_tags_avoine,traces_tags_ble,traces_tags_cacao,traces_tags_celeri,traces_tags_celery,traces_tags_cereales,traces_tags_chevre,traces_tags_contenant,traces_tags_contenir,traces_tags_contient,traces_tags_coque,traces_tags_coques,traces_tags_crustaceans,traces_tags_crustaces,traces_tags_dans,traces_tags_ddm,traces_tags_de,traces_tags_des,traces_tags_dioxide,traces_tags_dioxyde,traces_tags_dont,traces_tags_du,traces_tags_eggs,traces_tags_en,traces_tags_est,traces_tags_et,traces_tags_eventuelles,traces_tags_fabrique,traces_tags_fish,traces_tags_fr,traces_tags_fruit,traces_tags_fruits,traces_tags_gluten,traces_tags_graines,traces_tags_gras,traces_tags_ingredients,traces_tags_la,traces_tags_lactose,traces_tags_lait,traces_tags_le,traces_tags_les,traces_tags_lettre,traces_tags_lot,traces_tags_lupin,traces_tags_milk,traces_tags_molluscs,traces_tags_mollusques,traces_tags_morceaux,traces_tags_moutarde,traces_tags_mustard,traces_tags_neant,traces_tags_noisettes,traces_tags_noix,traces_tags_non,traces_tags_none,traces_tags_noyau,traces_tags_noyaux,traces_tags_numero,traces_tags_nutritionnel,traces_tags_nuts,traces_tags_oeuf,traces_tags_oeufs,traces_tags_os,traces_tags_ou,traces_tags_pas,traces_tags_peanuts,traces_tags_peut,traces_tags_phenylalanine,traces_tags_poisson,traces_tags_poissons,traces_tags_possible,traces_tags_presence,traces_tags_produit,traces_tags_produits,traces_tags_proteines,traces_tags_sans,traces_tags_seeds,traces_tags_sel,traces_tags_sesame,traces_tags_si,traces_tags_soja,traces_tags_soufre,traces_tags_soybeans,traces_tags_sulfites,traces_tags_sulphites,traces_tags_sulphur,traces_tags_traces,traces_tags_un,traces_tags_valeur,nutrient_levels_tags_fat,nutrient_levels_tags_high,nutrient_levels_tags_in,nutrient_levels_tags_low,nutrient_levels_tags_moderate,nutrient_levels_tags_quantity,nutrient_levels_tags_salt,nutrient_levels_tags_saturated,nutrient_levels_tags_sugars,nutrient_levels_tags_unknown
0,5,http://world-en.openfoodfacts.org/product/00000005/bio-inulin-ewl,Bio inulin,a,unknown,https://images.openfoodfacts.org/images/products/000/000/000/0005/front_de.34.400.jpg,840.0,0.0,0.0,8.0,8.0,0.0,0.23,50.0,2024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.184902,0.0,0.535275,0.0,0.0,0.123711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.436984,0.0,0.0,0.0,0.0,0.0,0.0,0.195257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.268933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.602148,0.0,0.0,0.0,0.0,0.0,0.142265,0.0,0.0,0.165124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.155458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.349251,0.0,0.0,0.0,0.0,0.521446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.652509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.283455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168321,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.593487,0.0,0.0,0.0,0.402422,0.402422,0.0,0.0,0.402422,0.402422,0.0,0.216853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.436572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.873144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.27885,0.0,0.556395,0.457502,0.185754,0.556395,0.141078,0.140453,0.140171,0.0
1,10,http://world-en.openfoodfacts.org/product/00000010/madeleines-nature-bijou,Madeleines nature,d,c,https://images.openfoodfacts.org/images/products/000/000/000/0010/front_fr.37.400.jpg,1852.0,22.0,2.6,54.0,25.0,6.4,0.53,22.666667,2024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.383913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.247469,0.321221,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.55483,0.0,0.0,0.0,0.0,0.0,0.314612,0.0,0.0,0.0,0.420927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191205,0.260059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.322259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.316347,0.0,0.0,0.0,0.0,0.150884,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.674292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.503374,0.0,0.0,0.255446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141556,0.0,0.0,0.0,0.0,0.0,0.334617,0.0,0.0,0.668125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.323285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.505589,0.0,0.0,0.247929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062291,0.105417,0.0,0.1446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.164273,0.0,0.309451,0.0,0.0,0.0,0.117396,0.0,0.0,0.0,0.079307,0.07183,0.0,0.068068,0.0,0.0,0.0,0.0,0.216034,0.0,0.0,0.0,0.0,0.0,0.148681,0.0,0.317861,0.0,0.153137,0.264789,0.056,0.297616,0.0,0.0,0.189462,0.283268,0.0,0.0,0.0,0.0,0.0,0.0,0.17351,0.0,0.093452,0.094064,0.0,0.299633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.211534,0.0,0.0,0.0,0.114474,0.0,0.0,0.061292,0.0,0.0,0.119244,0.0,0.0,0.128858,0.0,0.124112,0.0,0.106833,0.0,0.0,0.0,0.0,0.0,0.128856,0.0,0.0,0.258737,0.0,0.0,0.0,0.0,0.0,0.0,0.376483,0.0,0.0,0.431594,0.25528,0.25528,0.46436,0.442365,0.25528,0.25528,0.0,0.280228,0.0,0.0,0.0,0.0,0.549753,0.0,0.0,0.549753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.377295,0.0,0.0,0.417927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.68429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.538841,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.491323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.559255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.591416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.580914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.659539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75167,0.0,0.0,0.0,0.0,0.0,0.0,0.274052,0.369195,0.546821,0.0,0.365116,0.546821,0.13865,0.138036,0.137759,0.0
2,15,http://world-en.openfoodfacts.org/product/00000015/madeleines-chocolait-bijou,Madeleines ChocoLait,d,d,https://images.openfoodfacts.org/images/products/000/000/000/0015/front_fr.22.400.jpg,1926.0,24.0,6.0,54.0,31.0,6.4,0.48,16.25,2024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.791318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.611405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.367553,0.0,0.0,0.0,0.0,0.0,0.210657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.487347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.098631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.661163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.329049,0.0,0.0,0.166982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.100738,0.0,0.0,0.0,0.0,0.0,0.238129,0.0,0.0,0.713204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.513712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.359801,0.0,0.0,0.176438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.048848,0.082667,0.0,0.113395,0.0,0.0,0.0,0.0,0.084261,0.0,0.0,0.128822,0.0,0.242669,0.223036,0.271678,0.0,0.092061,0.0,0.0,0.0,0.062192,0.0,0.0,0.053379,0.0,0.093187,0.096759,0.0,0.169412,0.084604,0.0,0.0,0.0,0.0,0.116595,0.0,0.249264,0.0,0.240178,0.207645,0.0,0.233388,0.0,0.0,0.148574,0.222137,0.0,0.0,0.096998,0.0,0.0,0.0,0.27213,0.0,0.073285,0.147528,0.0,0.23497,0.0,0.0,0.0,0.09875,0.0,0.074466,0.0,0.0,0.0,0.248825,0.0,0.0,0.0,0.08977,0.0,0.0,0.048065,0.0,0.0,0.09351,0.094524,0.0,0.10105,0.0,0.097328,0.0,0.083778,0.0,0.0,0.0,0.0,0.205759,0.101048,0.0,0.0,0.2029,0.0,0.0,0.093717,0.0,0.0,0.0,0.32708,0.0,0.745437,0.374959,0.221781,0.221781,0.0,0.0,0.221781,0.221781,0.0,0.280228,0.0,0.0,0.0,0.0,0.549753,0.0,0.0,0.549753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.377295,0.0,0.0,0.417927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.323267,0.336728,0.0,0.0,0.0,0.0,0.451045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.476983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.362841,0.0,0.0,0.0,0.468513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.264887,0.535271,0.528533,0.0,0.176452,0.528533,0.134013,0.13342,0.133152,0.0
3,20,http://world-en.openfoodfacts.org/product/00000020/madeleines-choco-noir-bijou,Madeleines Choco Noir,d,d,https://images.openfoodfacts.org/images/products/000/000/000/0020/front_fr.39.400.jpg,1953.0,25.0,6.1,53.0,29.0,6.3,0.45,16.25,2024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.335664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.280851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.24255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.184013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.722187,0.0,0.0,0.0,0.44069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.367553,0.0,0.0,0.0,0.0,0.0,0.210657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.487347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.098631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.661163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.329049,0.0,0.0,0.166982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.100738,0.0,0.0,0.0,0.0,0.0,0.238129,0.0,0.0,0.713204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.513712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.359801,0.0,0.0,0.176438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052112,0.08819,0.0,0.12097,0.0,0.0,0.0,0.0,0.08989,0.0,0.0,0.137428,0.0,0.258881,0.237936,0.289828,0.0,0.098211,0.0,0.0,0.0,0.066347,0.060092,0.0,0.056945,0.0,0.099413,0.0,0.0,0.18073,0.090256,0.0,0.0,0.0,0.0,0.124384,0.0,0.265917,0.0,0.256223,0.221517,0.046849,0.24898,0.0,0.0,0.1585,0.236977,0.0,0.0,0.0,0.0,0.0,0.0,0.145155,0.0,0.078181,0.157384,0.0,0.250668,0.0,0.0,0.0,0.105347,0.0,0.07944,0.0,0.0,0.0,0.176965,0.0,0.0,0.0,0.095767,0.0,0.0,0.051276,0.0,0.0,0.099757,0.0,0.0,0.107801,0.0,0.10383,0.0,0.089375,0.0,0.0,0.0,0.0,0.219505,0.107798,0.0,0.0,0.216455,0.0,0.0,0.0,0.0,0.0,0.0,0.376483,0.0,0.0,0.431594,0.25528,0.25528,0.46436,0.442365,0.25528,0.25528,0.0,0.280228,0.0,0.0,0.0,0.0,0.549753,0.0,0.0,0.549753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.377295,0.0,0.0,0.417927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.54876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.432118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.394012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.597403,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.343316,0.0,0.0,0.0,0.0,0.0,0.479019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.506566,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.385344,0.0,0.0,0.0,0.49757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.264887,0.535271,0.528533,0.0,0.176452,0.528533,0.134013,0.13342,0.133152,0.0
4,22,http://world-en.openfoodfacts.org/product/00000022/farandole-de-madeleine-bijou,Farandole de madeleine,unknown,d,https://images.openfoodfacts.org/images/products/000/000/000/0022/front_fr.3.400.jpg,,,,,,,,1.75,2024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.404187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.338184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.584128,0.0,0.0,0.0,0.0,0.0,0.331226,0.0,0.0,0.0,0.443154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.273792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.367553,0.0,0.0,0.0,0.0,0.0,0.210657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.487347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.098631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.661163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.329049,0.0,0.0,0.166982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.100738,0.0,0.0,0.0,0.0,0.0,0.238129,0.0,0.0,0.713204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.513712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.359801,0.0,0.0,0.176438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043954,0.074384,0.0,0.102033,0.0,0.0,0.0,0.0,0.075819,0.0,0.0,0.115915,0.0,0.218355,0.200689,0.244458,0.0,0.082837,0.0,0.0,0.0,0.055961,0.304109,0.0,0.04803,0.0,0.08385,0.087065,0.0,0.152438,0.076127,0.091593,0.263583,0.0,0.0,0.104912,0.0,0.224289,0.0,0.216113,0.18684,0.316119,0.210004,0.0,0.0,0.133688,0.19988,0.0,0.0,0.087279,0.0,0.0,0.0,0.122432,0.0,0.065942,0.132746,0.0,0.211427,0.0,0.0,0.0,0.088856,0.0,0.067005,0.0,0.0,0.0,0.149263,0.0,0.0,0.0,0.080775,0.0,0.0,0.043249,0.0,0.0,0.084141,0.085053,0.0,0.090925,0.0,0.087576,0.0,0.075384,0.0,0.0,0.0,0.0,0.185143,0.090923,0.0,0.0,0.182571,0.0,0.0,0.0,0.0,0.0,0.459268,0.0,0.0,0.0,0.318898,0.188622,0.188622,0.343108,0.653713,0.188622,0.188622,0.0,0.280228,0.0,0.0,0.0,0.0,0.549753,0.0,0.0,0.549753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.377295,0.0,0.0,0.417927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.323267,0.336728,0.0,0.0,0.0,0.0,0.451045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.476983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.362841,0.0,0.0,0.0,0.468513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


#### Preprocess the dataset_openfoodfacts_bool_cleaned and apply kmeans

In [2]:
dataset_with_bool = pd.read_csv("../data/dataset_openfoodfacts_bool_cleaned.csv", dtype={'code': str})

In [3]:
dataset_with_bool.head(1)

Unnamed: 0,code,url,product_name,nutriscore_grade,ecoscore_grade,image_url,energy_100g,fat_100g,saturated-fat_100g,carbohydrates_100g,sugars_100g,proteins_100g,salt_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,last_modified_year,packaging_tags_unknown,packaging_tags_plastic,packaging_tags_cardboard,packaging_tags_bag,packaging_tags_fresh,packaging_tags_glass,packaging_tags_tray,packaging_tags_box,packaging_tags_bottle,packaging_tags_metal,packaging_tags_frozen,packaging_tags_pot,packaging_tags_jar,packaging_tags_canned,packaging_tags_fr:film-en-plastique,packaging_tags_protective-gas,packaging_tags_fr:sachet-plastique,packaging_tags_fr:etui-en-carton,packaging_tags_film,packaging_tags_paper,packaging_tags_brick,packaging_tags_sleeve,packaging_tags_vacuum-packed,packaging_tags_lid,packaging_tags_recyclable-metals,packaging_tags_aluminium,packaging_tags_fr:film-plastique,packaging_tags_seal,packaging_tags_fr:sachet-en-plastique,packaging_tags_bottle-cap,packaging_tags_wine-cork,packaging_tags_fr:barquette-plastique,packaging_tags_fr:bocal-en-verre,packaging_tags_vial,packaging_tags_can,packaging_tags_fr:pot-en-verre,packaging_tags_fr:0,packaging_tags_fr:point-vert,packaging_tags_fr:bouteille-en-verre,packaging_tags_pack,packaging_tags_dry,packaging_tags_fr:pensez-au-tri,packaging_tags_fr:boite-en-carton,packaging_tags_0,packaging_tags_fr:boite-carton,packaging_tags_fr:conteneur,packaging_tags_fr:pot-en-plastique,packaging_tags_pouch-flask,packaging_tags_fr:couvercle-en-metal,packaging_tags_fr:etui-carton-a-recycler,packaging_tags_drink-can,packaging_tags_fr:etui-carton,packaging_tags_fr:bocal-verre,packaging_tags_fr:pot-plastique,packaging_tags_fr:bouteille-plastique,packaging_tags_fr:bouteille-en-plastique,packaging_tags_fr:bouteille-verre,packaging_tags_capsule,packaging_tags_tetra-pak,packaging_tags_container,packaging_tags_envelope,packaging_tags_fr:couvercle-metal,packaging_tags_fr:opercule-en-plastique,packaging_tags_fr:opercule-en-metal,packaging_tags_fr:capsule-en-metal,packaging_tags_wood,packaging_tags_fr:barquette-en-plastique,packaging_tags_fr:triman,packaging_tags_fr:brique-en-carton,packaging_tags_fr:sachet-plastique-a-jeter,packaging_tags_fr:bouchon-en-plastique,packaging_tags_fr:film-plastique-a-jeter,packaging_tags_green-dot,packaging_tags_individual-bag,packaging_tags_recycle,packaging_tags_bottle-or-vial,packaging_tags_refrigerated,packaging_tags_wrapper,packaging_tags_fr:boite-plastique,packaging_tags_fr:boite-en-metal,packaging_tags_sheet,packaging_tags_blister,packaging_tags_fr:barquette-et-film-plastique-a-jeter,packaging_tags_food-can,packaging_tags_unfrozen,packaging_tags_discard,packaging_tags_fr:pot-verre,packaging_tags_fr:bouchon-plastique,packaging_tags_pet-polyethylene-terephthalate,packaging_tags_steel,packaging_tags_net,packaging_tags_fr:boite-metal,packaging_tags_fr:couvercle-plastique,packaging_tags_tube,packaging_tags_fr:sachet-papier,packaging_tags_plastique,packaging_tags_fr:couvercle-ouverture-facile,packaging_tags_card-box,packaging_tags_jug,packaging_tags_fr:flacon-verre,packaging_tags_other,categories_tags_plant-based-foods-and-beverages,categories_tags_plant-based-foods,categories_tags_snacks,categories_tags_unknown,categories_tags_sweet-snacks,categories_tags_meats-and-their-products,categories_tags_beverages,categories_tags_meals,categories_tags_dairies,categories_tags_fruits-and-vegetables-based-foods,categories_tags_meats,categories_tags_cereals-and-potatoes,categories_tags_desserts,categories_tags_fermented-foods,categories_tags_biscuits-and-cakes,categories_tags_fermented-milk-products,categories_tags_frozen-foods,categories_tags_spreads,categories_tags_prepared-meats,categories_tags_cereals-and-their-products,categories_tags_condiments,categories_tags_breakfasts,categories_tags_fruits-based-foods,categories_tags_cheeses,categories_tags_plant-based-beverages,categories_tags_groceries,categories_tags_seafood,categories_tags_confectioneries,categories_tags_vegetables-based-foods,categories_tags_cocoa-and-its-products,categories_tags_dairy-desserts,categories_tags_canned-foods,categories_tags_biscuits,categories_tags_sauces,categories_tags_sweet-spreads,categories_tags_fishes,categories_tags_fishes-and-their-products,categories_tags_plant-based-spreads,categories_tags_fermented-dairy-desserts,categories_tags_salty-snacks,categories_tags_fruit-based-beverages,categories_tags_meals-with-meat,categories_tags_chocolates,categories_tags_cakes,categories_tags_appetizers,categories_tags_poultries,categories_tags_pastas,categories_tags_juices-and-nectars,categories_tags_fatty-fishes,categories_tags_fresh-foods,categories_tags_breads,categories_tags_sweetened-beverages,categories_tags_alcoholic-beverages,categories_tags_dried-products,categories_tags_yogurts,categories_tags_fruit-and-vegetable-preserves,categories_tags_seeds,categories_tags_legumes-and-their-products,categories_tags_canned-plant-based-foods,categories_tags_vegetables,categories_tags_jams,categories_tags_fats,categories_tags_cow-cheeses,categories_tags_salted-spreads,categories_tags_frozen-desserts,categories_tags_fruit-juices,categories_tags_french-cheeses,categories_tags_nuts-and-their-products,categories_tags_legumes,categories_tags_hams,categories_tags_hot-beverages,categories_tags_chicken-and-its-products,categories_tags_candies,categories_tags_ice-creams-and-sorbets,categories_tags_pasta-dishes,categories_tags_chickens,categories_tags_chocolate-candies,categories_tags_sweeteners,categories_tags_pastries,categories_tags_dried-plant-based-foods,categories_tags_canned-vegetables,categories_tags_breakfast-cereals,categories_tags_unsweetened-beverages,categories_tags_fruits,categories_tags_soups,categories_tags_vegetable-fats,categories_tags_farming-products,categories_tags_dark-chocolates,categories_tags_compotes,categories_tags_chips-and-fries,categories_tags_cereal-grains,categories_tags_fish-and-meat-and-eggs,categories_tags_microwave-meals,categories_tags_chocolate-biscuits,categories_tags_viennoiseries,categories_tags_poultry-meals,categories_tags_ice-creams,categories_tags_sweet-pastries-and-pies,categories_tags_beers,categories_tags_beverages-and-beverages-preparations,categories_tags_other,ingredients_tags_salt,ingredients_tags_added-sugar,ingredients_tags_disaccharide,ingredients_tags_sugar,ingredients_tags_water,ingredients_tags_oil-and-fat,ingredients_tags_flavouring,ingredients_tags_vegetable,ingredients_tags_dairy,ingredients_tags_vegetable-oil-and-fat,ingredients_tags_cereal,ingredients_tags_wheat,ingredients_tags_flour,ingredients_tags_fruit,ingredients_tags_root-vegetable,ingredients_tags_monosaccharide,ingredients_tags_vegetable-oil,ingredients_tags_cereal-flour,ingredients_tags_glucose,ingredients_tags_natural-flavouring,ingredients_tags_starch,ingredients_tags_wheat-flour,ingredients_tags_plant,ingredients_tags_condiment,ingredients_tags_spice,ingredients_tags_onion-family-vegetable,ingredients_tags_milk,ingredients_tags_herb,ingredients_tags_emulsifier,ingredients_tags_egg,ingredients_tags_seed,ingredients_tags_e330,ingredients_tags_fruit-vegetable,ingredients_tags_sunflower-oil,ingredients_tags_preservative,ingredients_tags_colour,ingredients_tags_acid,ingredients_tags_onion,ingredients_tags_cocoa,ingredients_tags_antioxidant,ingredients_tags_rapeseed-oil,ingredients_tags_milk-powder,ingredients_tags_e322,ingredients_tags_animal,ingredients_tags_colza-oil,ingredients_tags_garlic,ingredients_tags_dextrose,ingredients_tags_pepper,ingredients_tags_juice,ingredients_tags_fruit-juice,ingredients_tags_glucose-syrup,ingredients_tags_e322i,ingredients_tags_yeast,ingredients_tags_meat,ingredients_tags_nut,ingredients_tags_thickener,ingredients_tags_taproot-vegetable,ingredients_tags_cream,ingredients_tags_tomato,ingredients_tags_ferment,ingredients_tags_cocoa-butter,ingredients_tags_modified-starch,ingredients_tags_stabiliser,ingredients_tags_pork,ingredients_tags_e300,ingredients_tags_tree-nut,ingredients_tags_protein,ingredients_tags_legume,ingredients_tags_cocoa-paste,ingredients_tags_carrot,ingredients_tags_butter,ingredients_tags_raising-agent,ingredients_tags_e500,ingredients_tags_skimmed-milk-powder,ingredients_tags_vinegar,ingredients_tags_pork-meat,ingredients_tags_corn-starch,ingredients_tags_berries,ingredients_tags_soya-lecithin,ingredients_tags_lemon-juice,ingredients_tags_fat,ingredients_tags_rice,ingredients_tags_pulse,ingredients_tags_acidity-regulator,ingredients_tags_microbial-culture,ingredients_tags_cane-sugar,ingredients_tags_palm-oil-and-fat,ingredients_tags_lactic-ferments,ingredients_tags_e250,ingredients_tags_cheese,ingredients_tags_e415,ingredients_tags_citrus-fruit,ingredients_tags_animal-protein,ingredients_tags_milk-proteins,ingredients_tags_leaf-vegetable,ingredients_tags_olive-oil,ingredients_tags_poultry,ingredients_tags_fish,ingredients_tags_chocolate,ingredients_tags_sea-salt,ingredients_tags_other,ingredients_analysis_tags_palm-oil-free,ingredients_analysis_tags_vegan,ingredients_analysis_tags_vegetarian,ingredients_analysis_tags_non-vegan,ingredients_analysis_tags_vegetarian-status-unknown,ingredients_analysis_tags_maybe-vegetarian,ingredients_analysis_tags_palm-oil-content-unknown,ingredients_analysis_tags_may-contain-palm-oil,ingredients_analysis_tags_vegan-status-unknown,ingredients_analysis_tags_maybe-vegan,ingredients_analysis_tags_palm-oil,ingredients_analysis_tags_non-vegetarian,ingredients_analysis_tags_other,allergens_none,allergens_eggs,allergens_gluten,allergens_milk,allergens_soybeans,allergens_celery,allergens_nuts,allergens_mustard,allergens_fish,allergens_sesame-seeds,allergens_sulphur-dioxide-and-sulphites,allergens_fr:avoine,allergens_peanuts,allergens_crustaceans,allergens_molluscs,allergens_lupin,allergens_orange,allergens_fr:Avoine,allergens_fr:Non,allergens_other,additives_tags_none,additives_tags_e330,additives_tags_e322,additives_tags_e322i,additives_tags_e14xx,additives_tags_e500,additives_tags_e300,additives_tags_e250,additives_tags_e415,additives_tags_e471,additives_tags_e440,additives_tags_e450,additives_tags_e412,additives_tags_e202,additives_tags_e407,additives_tags_e301,additives_tags_e331,additives_tags_e500ii,additives_tags_e160c,additives_tags_e503,additives_tags_e422,additives_tags_e316,additives_tags_e428,additives_tags_e160a,additives_tags_e420,additives_tags_e270,additives_tags_e120,additives_tags_e450i,additives_tags_e100,additives_tags_e451,additives_tags_e621,additives_tags_e262,additives_tags_e414,additives_tags_e150a,additives_tags_e252,additives_tags_e160,additives_tags_e955,additives_tags_e392,additives_tags_e296,additives_tags_e950,additives_tags_e160b,additives_tags_e160ai,additives_tags_e452,additives_tags_e401,additives_tags_e163,additives_tags_e224,additives_tags_e326,additives_tags_e903,additives_tags_e503ii,additives_tags_e282,additives_tags_e306,additives_tags_e223,additives_tags_e466,additives_tags_e211,additives_tags_e220,additives_tags_e341,additives_tags_e150d,additives_tags_e472e,additives_tags_e161b,additives_tags_e133,additives_tags_e200,additives_tags_e476,additives_tags_e410,additives_tags_e965,additives_tags_e162,additives_tags_e325,additives_tags_e262i,additives_tags_e260,additives_tags_e406,additives_tags_e500i,additives_tags_e171,additives_tags_e951,additives_tags_e290,additives_tags_e150c,additives_tags_e960,additives_tags_e141,additives_tags_e481,additives_tags_e1400,additives_tags_e509,additives_tags_e901,additives_tags_e461,additives_tags_e336,additives_tags_e460,additives_tags_e153,additives_tags_e334,additives_tags_e101,additives_tags_e472b,additives_tags_e150,additives_tags_e631,additives_tags_e627,additives_tags_e904,additives_tags_e339,additives_tags_e235,additives_tags_e501,additives_tags_e327,additives_tags_e331iii,additives_tags_e170,additives_tags_e551,additives_tags_e445,additives_tags_e333,additives_tags_other,nutrient_levels_tags_fat-in-low-quantity,nutrient_levels_tags_saturated-fat-in-low-quantity,nutrient_levels_tags_sugars-in-moderate-quantity,nutrient_levels_tags_salt-in-low-quantity,nutrient_levels_tags_fat-in-high-quantity,nutrient_levels_tags_saturated-fat-in-moderate-quantity,nutrient_levels_tags_sugars-in-high-quantity,nutrient_levels_tags_salt-in-moderate-quantity,nutrient_levels_tags_saturated-fat-in-high-quantity,nutrient_levels_tags_unknown,nutrient_levels_tags_fat-in-moderate-quantity,nutrient_levels_tags_sugars-in-low-quantity,nutrient_levels_tags_salt-in-high-quantity,nutrient_levels_tags_other,traces_tags_none,traces_tags_nuts,traces_tags_soybeans,traces_tags_gluten,traces_tags_sesame-seeds,traces_tags_eggs,traces_tags_celery,traces_tags_milk,traces_tags_crustaceans,traces_tags_fish,traces_tags_sulphur-dioxide-and-sulphites,traces_tags_mustard,traces_tags_peanuts,traces_tags_molluscs,traces_tags_lupin,traces_tags_fr:non,traces_tags_fr:noyaux,traces_tags_fr:phenylalanine,traces_tags_other,main_category_none,main_category_groceries,main_category_sweetened-beverages,main_category_beverages,main_category_candies,main_category_frozen-foods,main_category_biscuits,main_category_dark-chocolates,main_category_salads,main_category_white-hams,main_category_crackers,main_category_virgin-olive-oils,main_category_microwave-meals,main_category_milk-chocolates,main_category_potato-crisps-in-sunflower-oil,main_category_chocolate-biscuits,main_category_raw-cured-ham,main_category_chicken-breasts,main_category_bonbons,main_category_unsweetened-beverages,main_category_preparations-made-from-fish-meat,main_category_shortbread-cookies,main_category_strawberry-jams,main_category_cocoa-and-hazelnuts-spreads,main_category_dietary-supplements,main_category_cheeses,main_category_protein-powders,main_category_cakes,main_category_breads,main_category_smoked-salmons-from-farming,main_category_madeleines,main_category_gummi-candies,main_category_pasteurized-cheeses,main_category_squeezed-orange-juices,main_category_labeled-cheeses,main_category_herbal-teas,main_category_craft-beers,main_category_frozen-ready-made-meals,main_category_meals-with-chicken,main_category_gingerbreads,main_category_squeezed-apple-juices,main_category_whole-foies-gras-from-ducks,main_category_fats,main_category_chickens,main_category_assorted-chocolate-candies,main_category_ice-cream-tubs,main_category_dry-sausages,main_category_dry-pastas,main_category_apricot-jams,main_category_apple-compotes,main_category_fr:fromages-blancs-natures,main_category_ice-creams,main_category_artificially-sweetened-beverages,main_category_chocolate-cakes,main_category_chocolates,main_category_sliced-breads,main_category_raspberry-jams,main_category_canned-green-beans,main_category_squeezed-multifruit-juices,main_category_cooked-chicken-breast-slices,main_category_smoked-salmons,main_category_whole-milk-yogurts,main_category_sugar-free-chewing-gum,main_category_breakfast-cereals,main_category_mueslis-with-fruits,main_category_frozen-pizzas,main_category_filled-biscuits,main_category_meals,main_category_compotes,main_category_confectioneries,main_category_protein-bars,main_category_chicken-thighs,main_category_yogurts,main_category_beers,main_category_mueslis,main_category_coffees,main_category_industrial-cheese,main_category_vegetable-soups,main_category_soups,main_category_pastas,main_category_country-style-pates,main_category_sliced-cheeses,main_category_chestnut-spreads,main_category_cream-of-vegetable-soups,main_category_tartlet-biscuits-with-fruit-preparation,main_category_grated-emmentaler,main_category_instant-noodles,main_category_country-terrines,main_category_pork-roasts,main_category_prepared-meats,main_category_fr:produits-labellises,main_category_ice-cream-log,main_category_fish-soups,main_category_baguettes,main_category_dried-fruits,main_category_flavoured-yogurts,main_category_sheep-milk-yogurts,main_category_fr:cremes-fraiches,main_category_bilberries-jams,main_category_turkey-cutlets,main_category_other,food_groups_tags_fruits-and-vegetables,food_groups_tags_vegetables,food_groups_tags_sugary-snacks,food_groups_tags_biscuits-and-cakes,food_groups_tags_cereals-and-potatoes,food_groups_tags_cereals,food_groups_tags_none,food_groups_tags_sweets,food_groups_tags_fats-and-sauces,food_groups_tags_dressings-and-sauces,food_groups_tags_beverages,food_groups_tags_unsweetened-beverages,food_groups_tags_fats,food_groups_tags_fish-meat-eggs,food_groups_tags_processed-meat,food_groups_tags_soups,food_groups_tags_composite-foods,food_groups_tags_one-dish-meals,food_groups_tags_pizza-pies-and-quiches,food_groups_tags_fish-and-seafood,food_groups_tags_pastries,food_groups_tags_salty-snacks,food_groups_tags_salty-and-fatty-products,food_groups_tags_offals,food_groups_tags_chocolate-products,food_groups_tags_eggs,food_groups_tags_alcoholic-beverages,food_groups_tags_bread,food_groups_tags_sweetened-beverages,food_groups_tags_meat,food_groups_tags_poultry,food_groups_tags_dried-fruits,food_groups_tags_nuts,food_groups_tags_fruits,food_groups_tags_milk-and-dairy-products,food_groups_tags_ice-cream,food_groups_tags_sandwiches,food_groups_tags_legumes,food_groups_tags_breakfast-cereals,food_groups_tags_cheese,food_groups_tags_milk-and-yogurt,food_groups_tags_lean-fish,food_groups_tags_meat-other-than-poultry,food_groups_tags_appetizers,food_groups_tags_potatoes,food_groups_tags_dairy-desserts,food_groups_tags_plant-based-milk-substitutes,food_groups_tags_artificially-sweetened-beverages,food_groups_tags_fatty-fish,food_groups_tags_teas-and-herbal-teas-and-coffees,food_groups_tags_other,categories_Aliments et boissons à base de végétaux,categories_ Aliments d'origine végétale,categories_unknown,categories_Snacks,categories_ Snacks sucrés,categories_Viandes et dérivés,categories_ Aliments à base de fruits et de légumes,categories_Produits laitiers,categories_ Viandes,categories_ Céréales et pommes de terre,categories_ Produits fermentés,categories_ Biscuits et gâteaux,categories_ Produits laitiers fermentés,categories_ Desserts,categories_ Plats préparés,categories_Boissons,categories_ Produits à tartiner,categories_ Charcuteries,categories_ Surgelés,categories_ Céréales et dérivés,categories_ Boissons,categories_ Fruits et produits dérivés,categories_ Boissons à base de végétaux,categories_ Fromages,categories_Produits de la mer,categories_ groceries,categories_Condiments,categories_ Légumes et dérivés,categories_Plats préparés,categories_ Confiseries,categories_ Cacao et dérivés,categories_ Desserts lactés,categories_ Biscuits,categories_ Sauces,categories_ Produits à tartiner sucrés,categories_ Petit-déjeuners,categories_ Poissons,categories_ Conserves,categories_ Poissons et dérivés,categories_ Pâtes à tartiner végétales,categories_ Snacks salés,categories_ Desserts lactés fermentés,categories_ Boissons aux fruits,categories_ Plats préparés à la viande,categories_ Gâteaux,categories_ Chocolats,categories_Plant-based foods and beverages,categories_Desserts,categories_ Amuse-gueules,categories_ Frais,categories_ Jus et nectars,categories_ Poissons gras,categories_ Boissons alcoolisées,categories_ Boissons avec sucre ajouté,categories_ Confitures et marmelades,categories_ Volailles,categories_ Pâtes alimentaires,categories_ Confitures,categories_ Yaourts,categories_Surgelés,categories_ Pains,categories_ Aliments à base de plantes en conserve,categories_ Condiments,categories_ Graines,categories_ Fromages de vache,categories_ Snacks,categories_ Produits à tartiner salés,categories_ Légumineuses et dérivés,categories_ Produits déshydratés,categories_ Fromages de France,categories_ Jus de fruits,categories_ Jambons,categories_ Plant-based foods,categories_Petit-déjeuners,categories_ Légumineuses,categories_ Légumes,categories_ Matières grasses,categories_ Fruits à coques et dérivés,categories_ Boissons chaudes,categories_ Desserts glacés,categories_Aliments d'origine végétale,categories_ Bonbons,categories_ Poulet et dérivés,categories_ Plats à base de pâtes,categories_ Pâtisseries,categories_ Poulets,categories_ Boissons sans sucre ajouté,categories_ Légumes en conserve,categories_ Fruits,categories_ Confiseries chocolatées,categories_ Aliments à base de plantes séchées,categories_ Soupes,categories_ Compotes,categories_ Glaces et sorbets,categories_ Chocolats noirs,categories_ Viennoiseries,categories_ Plats préparés à réchauffer au micro-ondes,categories_ Plats à la volaille,categories_ Biscuits au chocolat,categories_ Céréales en grains,categories_other
0,5,http://world-en.openfoodfacts.org/product/00000005/bio-inulin-ewl,Bio inulin,a,unknown,https://images.openfoodfacts.org/images/products/000/000/000/0005/front_de.34.400.jpg,840.0,0.0,0.0,8.0,8.0,0.0,0.23,50.0,2024,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True


In [4]:
dataset_with_bool_ml = dataset_with_bool.drop(columns=["code", "url", "image_url", "product_name"])

In [7]:
# Separate feature types
from sklearn.impute import SimpleImputer


categorical_features, numeric_features = catetegorial_and_numerical_features(dataset_with_bool_ml)

# 1. Preprocessor for Numeric and Categorical Features
preprocessor = ColumnTransformer(
    transformers=[
        # Pipeline for numeric features: impute missing values and scale
        (
            "num",
            Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="mean")),
                    ("scaler", StandardScaler()),
                ]
            ),
            numeric_features,
        ),
        # Pipeline for categorical features: impute missing values and one-hot encode
        (
            "cat",
            Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
                ]
            ),
            categorical_features,
        ),
    ]
)
# 2. Full Pipeline including Preprocessing and KMeans
kmeans_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("pca", PCA(n_components=50)),  # Optional: Reduce dimensionality before clustering
        ("kmeans", KMeans(n_clusters=5, random_state=42)),  # Set the number of clusters
    ]
)

# 3. Fit the pipeline on the data
kmeans_pipeline.fit(dataset_with_bool_ml)

# 4. Get cluster labels
cluster_labels = kmeans_pipeline.predict(dataset_with_bool_ml)

# Append cluster labels to the original DataFrame for analysis
dataset_with_bool_ml["cluster"] = cluster_labels

#### Preprocess the dataset_openfoodfacts_tfidf_cleaned and apply kmeans

In [8]:
dataset_with_tfidf = pd.read_csv("../data/dataset_openfoodfacts_tfidf_cleaned.csv", dtype={'code': str})

In [9]:
dataset_with_tfidf_ml = dataset_with_tfidf.drop(columns=["code", "url", "image_url", "product_name"])

In [10]:
# Separate feature types
categorical_features, numeric_features = catetegorial_and_numerical_features(
    dataset_with_tfidf_ml
)

# 1. Preprocessor for Numeric and Categorical Features
preprocessor = ColumnTransformer(
    transformers=[
        # Pipeline for numeric features: impute missing values and scale
        (
            "num",
            Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="mean")),
                    ("scaler", StandardScaler()),
                ]
            ),
            numeric_features,
        ),
        # Pipeline for categorical features: impute missing values and one-hot encode
        (
            "cat",
            Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
                ]
            ),
            categorical_features,
        ),
    ]
)

# 2. Full Pipeline including Preprocessing and KMeans
kmeans_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("pca", PCA(n_components=50)),
        ("kmeans", KMeans(n_clusters=5, random_state=42)),  # Set the number of clusters
    ]
)

# 3. Fit the pipeline on the data
kmeans_pipeline.fit(dataset_with_tfidf_ml)

# 4. Get cluster labels
cluster_labels = kmeans_pipeline.predict(dataset_with_tfidf_ml)

# Append cluster labels to the original DataFrame for analysis
dataset_with_tfidf_ml["cluster"] = cluster_labels

In [11]:
dataset_with_tfidf_ml["cluster"].value_counts()

dataset_with_tfidf["cluster"] = dataset_with_tfidf_ml["cluster"]

dataset_with_tfidf[["product_name"]][dataset_with_tfidf["cluster"] == 1].head(20)

Unnamed: 0,product_name
0,Bio inulin
7,Organic pea protein powder
11,Fondants Citron
19,Confiture de fraise mara des bois
21,Pan
29,Tisane nerf - sommeil
31,huile végétale
41,pot au feu de légumes aux aromates
42,velouté de 10 légumes
43,Velouté de légumes façon poêlée


In [19]:
datatemp = data_en_filtered_clean["categories"].value_counts().reset_index()
datatemp.head(20)

Unnamed: 0,categories,count
0,unknown,34935
1,Boissons,1607
2,Surgelés,1304
3,"Snacks, Snacks sucrés, Confiseries, Bonbons",1055
4,"Snacks, Snacks sucrés, Cacao et dérivés, Chocolats, Chocolats noirs",1048
5,"Snacks, Snacks sucrés, Biscuits et gâteaux, Biscuits",987
6,"Viandes et dérivés, Viandes, Charcuteries, Jambons, Jambons blancs",941
7,"Boissons, Boissons avec sucre ajouté",873
8,"Snacks, Snacks salés, Amuse-gueules, Biscuits apéritifs",682
9,"Snacks, Snacks sucrés, Cacao et dérivés, Chocolats, Chocolats au lait",510
