In [11]:
import pandas as pd
import polars as pl

from collections import Counter


pd.set_option('display.max_columns', None)

In [12]:
rawdata = pd.read_csv(
    "../data/fr.openfoodfacts.org.products.csv.gz",
    sep='\t',
    engine='python',
    nrows=1000000,  # Limite le nombre de lignes
    on_bad_lines='skip',  # Ignore les lignes mal formées
    quoting=3  # Pour gérer les guillemets mal échappés, cela correspond à CSV.QUOTE_NONE
)
french_data = rawdata[rawdata['countries_tags'] == 'en:france']
print(french_data.shape)

(124005, 206)


In [13]:
french_data = rawdata[rawdata['countries_tags'] == 'en:france']
print(french_data.shape)

(124005, 206)


In [14]:
# Find different states
unique_values = set()
french_data['states_tags'].str.split(',').apply(unique_values.update)

states_df = pd.DataFrame(data=unique_values, columns=['states'])
display(states_df)

Unnamed: 0,states
0,en:origins-completed
1,en:ingredients-photo-selected
2,en:nutrition-facts-completed
3,en:nutrition-photo-selected
4,en:photos-uploaded
5,en:packaging-photo-selected
6,en:packaging-photo-to-be-selected
7,en:ingredients-photo-to-be-selected
8,en:complete
9,en:expiration-date-to-be-completed


In [15]:
# Liste des états cibles
target_states = ["en:completed", "en:origins-completed", "en:ingredients-completed"]

# Construire une expression régulière pour les états
pattern = '|'.join(target_states)

# Compter les lignes contenant au moins un des états cibles
count = french_data['states_tags'].str.contains(pattern).sum()

print("Nombre de lignes avec les états spécifiés:", count)


Nombre de lignes avec les états spécifiés: 17671


In [16]:
columns_to_keep = [
    'code',
    'url',
    'created_datetime',
    'last_modified_datetime',
    'product_name',
    'packaging',
    'packaging_fr',
    'categories_tags',
    'categories_fr',
    'ingredients_text',
    'ingredients_tags',
    'ingredients_analysis_tags',
    'allergens',
    'traces_tags',
    'additives_n',
    'additives_tags',
    'nutriscore_score',
    'nutriscore_grade',
    'food_groups_tags',
    'states_tags',
    'ecoscore_score',
    'ecoscore_grade',
    'nutrient_levels_tags',
    'popularity_tags',
    'main_category',
    'image_url',
    'image_small_url',
    'image_ingredients_url',
    'image_ingredients_small_url',
    'image_nutrition_url',
    'image_nutrition_small_url',
    'energy-kcal_100g',
    'energy_100g',
    'fat_100g',
    'cholesterol_100g',
    'sugars_100g',
    'proteins_100g',
    'salt_100g',
    'alcohol_100g'
]

data = french_data[columns_to_keep]
display(data)

Unnamed: 0,code,url,created_datetime,last_modified_datetime,product_name,packaging,packaging_fr,categories_tags,categories_fr,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_n,additives_tags,nutriscore_score,nutriscore_grade,food_groups_tags,states_tags,ecoscore_score,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kcal_100g,energy_100g,fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,alcohol_100g
4,00000005,http://world-fr.openfoodfacts.org/produit/0000...,2020-11-14T07:08:40Z,2024-10-20T14:02:34Z,Bio inulin,Glass,Verre,"en:plant-based-foods-and-beverages,en:plant-ba...","Aliments et boissons à base de végétaux,Alimen...","Jus et purée d'abricots (50%), eau, sucre.","en:apricot-juice-and-puree,en:fruit,en:prunus-...","en:palm-oil-free,en:vegan,en:vegetarian",,,0.0,,-2.0,a,"en:fruits-and-vegetables,en:vegetables","en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...",,en:vegetables,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,208.0,840.0,0.0,,8.0,0.0,0.23,
9,00000010,http://world-fr.openfoodfacts.org/produit/0000...,2016-10-20T07:19:01Z,2024-10-04T09:38:47Z,Madeleines nature,"Plastic,Cardboard,fr:Boîte en carton,fr:Film e...","Plastique,Carton,Boite-en-carton,Film-en-plast...","en:snacks,en:desserts,en:sweet-snacks,en:biscu...","Snacks,Desserts,Snacks sucrés,Biscuits et gâte...","Farine de blé 33%, sucre, huile de colza, œufs...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...","en:eggs,en:gluten,en:milk","en:nuts,en:soybeans",3.0,"en:e331,en:e422,en:e503",13.0,d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",53.0,c,"en:fat-in-high-quantity,en:saturated-fat-in-mo...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:plain-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,442.0,1852.0,22.0,,25.0,6.4,0.53,
14,00000015,http://world-fr.openfoodfacts.org/produit/0000...,2018-04-15T16:43:14Z,2024-10-04T09:38:50Z,Madeleines ChocoLait,"Plastique, Carton","Plastique,Carton","en:snacks,en:sweet-snacks,en:biscuits-and-cake...","Snacks,Snacks sucrés,Biscuits et gâteaux,Gâtea...","Farine de blé 27%, chocolat au lait 18% (sucre...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:maybe-vegetarian",,en:nuts,5.0,"en:e322,en:e322i,en:e331,en:e422,en:e500,en:e503",17.0,d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",32.0,d,"en:fat-in-high-quantity,en:saturated-fat-in-hi...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:chocolate-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,460.0,1926.0,24.0,,31.0,6.4,0.48,0.0
18,00000020,http://world-fr.openfoodfacts.org/produit/0000...,2018-09-14T13:14:06Z,2024-10-04T09:38:53Z,Madeleines Choco Noir,1 boîte en carton à recycler 50 sachets indivi...,1-boite-en-carton-a-recycler-50-sachets-indivi...,"en:snacks,en:sweet-snacks,en:biscuits-and-cake...","Snacks,Snacks sucrés,Biscuits et gâteaux,Gâtea...","Farine de _blé_ 27%, chocolat noir 18% (pâte d...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...","en:eggs,en:gluten,en:milk,en:soybeans",en:nuts,5.0,"en:e322,en:e331,en:e422,en:e500,en:e503",16.0,d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-checked,en:complete,en:nutrition-fact...",39.0,d,"en:fat-in-high-quantity,en:saturated-fat-in-hi...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:chocolate-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,467.0,1953.0,25.0,,29.0,6.3,0.45,
19,00000021,http://world-fr.openfoodfacts.org/produit/0000...,2019-06-10T12:37:30Z,2024-10-04T09:38:53Z,,,,,,,,,,,,,,unknown,,"en:to-be-completed,en:nutrition-facts-to-be-co...",,unknown,,"bottom-25-percent-scans-2019,top-80-percent-sc...",,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999988,2000000131948,http://world-fr.openfoodfacts.org/produit/2000...,2022-04-30T07:10:40Z,2023-02-25T19:02:12Z,Gourmeline,,,,,,,,,,,,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,621.0,2598.0,25.0,,13.0,7.0,,
999991,2000000131988,http://world-fr.openfoodfacts.org/produit/2000...,2022-01-26T14:46:55Z,2022-01-26T14:47:18Z,forte pharma santarome gummies,,,,,,,,,,,,,unknown,,"en:to-be-completed,en:nutrition-facts-to-be-co...",,unknown,,,,,,,,,,,,,,,,,
999994,2000000132044,http://world-fr.openfoodfacts.org/produit/2000...,2024-04-25T17:05:39Z,2024-04-25T17:59:39Z,Salade de boulgour végétal,,,,,,,,,,,,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,198.0,828.0,7.9,,4.1,6.4,0.90,
999995,20000001321245,http://world-fr.openfoodfacts.org/produit/2000...,2024-03-18T19:30:01Z,2024-03-18T19:30:03Z,Agneau Curry,,,fr:plats-de-restaurant,Plats-de-restaurant,,,,,,,,,unknown,,"en:to-be-completed,en:nutrition-facts-to-be-co...",,unknown,,,fr:plats-de-restaurant,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,,,,


In [17]:
# Basic stats
print("Taille du dataset:")
print("Number of rows : {}".format(data.shape[0]))
print("Number of columns : {}".format(data.shape[1]))
print()
print("---------------------------")
print()

print("Basics infos:")
print()
display(data.info())
print()
print("---------------------------")
print()

print("Basics statistics: ")
print()
data_desc = data.describe(include='all')
display(data_desc)
print()
print("---------------------------")
print()

print("Unique elements by feature: ")
print()
display(data.nunique().sort_values())
print()
print("---------------------------")
print()

print("Percentage of missing values: ")
print()
null_percentage = 100*data.isnull().mean()
null_percentage_df = null_percentage.to_frame(name='Null Percentage').T
display(null_percentage_df) 

Taille du dataset:
Number of rows : 124005
Number of columns : 39

---------------------------

Basics infos:

<class 'pandas.core.frame.DataFrame'>
Index: 124005 entries, 4 to 999998
Data columns (total 39 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   code                         124005 non-null  object 
 1   url                          124005 non-null  object 
 2   created_datetime             124005 non-null  object 
 3   last_modified_datetime       124005 non-null  object 
 4   product_name                 115095 non-null  object 
 5   packaging                    10399 non-null   object 
 6   packaging_fr                 10399 non-null   object 
 7   categories_tags              47423 non-null   object 
 8   categories_fr                47423 non-null   object 
 9   ingredients_text             16553 non-null   object 
 10  ingredients_tags             16524 non-null   object 
 11  ingredients

None


---------------------------

Basics statistics: 



Unnamed: 0,code,url,created_datetime,last_modified_datetime,product_name,packaging,packaging_fr,categories_tags,categories_fr,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_n,additives_tags,nutriscore_score,nutriscore_grade,food_groups_tags,states_tags,ecoscore_score,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kcal_100g,energy_100g,fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,alcohol_100g
count,124005.0,124005,124005,124005,115095,10399.0,10399.0,47423,47423,16553,16524,17790,4509,3067,16553.0,7338,29696.0,123096,41874,124005,35336.0,123096,30845,22177,47423,108283,108283,40284,40284,57596,57596,74428.0,75259.0,74739.0,406.0,74614.0,74773.0,70065.0,791.0
unique,123946.0,123946,121427,105196,76494,1977.0,1652.0,7743,7743,13748,12938,35,387,763,,3697,,7,45,2418,,9,191,5977,5136,108231,108231,40248,40248,57571,57571,,,,,,,,
top,202105021656.0,http://world-fr.openfoodfacts.org/produit/0202...,2021-09-21T15:30:45Z,2021-07-25T07:00:14Z,Filet de poulet,0.0,0.0,"en:meats-and-their-products,en:meats,en:chicke...","Viandes et dérivés,Viandes,Poulet et dérivés,V...",Bœuf,"en:chicken,en:poultry","en:palm-oil-content-unknown,en:vegan-status-un...",en:milk,en:nuts,,"en:e322,en:e322i",,unknown,"en:fish-meat-eggs,en:meat,en:poultry","en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...","bottom-25-percent-scans-2020,bottom-20-percent...",en:chicken-breasts,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,
freq,2.0,2,7,51,627,1822.0,1086.0,2176,2176,295,240,4422,1023,357,,327,,92199,7221,16157,,87305,2867,1832,2186,2,2,2,2,2,2,,,,,,,,
mean,,,,,,,,,,,,,,,1.433154,,8.890288,,,,45.547204,,,,,,,,,,,190788400000.0,790603100000.0,13.776127,0.025996,10.958449,11.395939,1.46418,2.78025
std,,,,,,,,,,,,,,,2.388427,,8.816084,,,,25.222706,,,,,,,,,,,52049930000000.0,216889100000000.0,16.628091,0.053201,19.96893,14.697931,13.417748,6.694116
min,,,,,,,,,,,,,,,0.0,,-15.0,,,,-25.0,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,,,,,,,,,,,,,0.0,,1.0,,,,25.0,,,,,,,,,,,130.0,540.0,1.9,0.0,0.5,3.4,0.15,0.0
50%,,,,,,,,,,,,,,,0.0,,11.0,,,,46.0,,,,,,,,,,,263.0,1096.0,9.5,0.0,2.6,8.0,0.75,0.0
75%,,,,,,,,,,,,,,,2.0,,16.0,,,,64.0,,,,,,,,,,,386.0,1615.0,22.4,0.03,15.0,19.0,1.4,4.05



---------------------------

Unique elements by feature: 



nutriscore_grade                    7
ecoscore_grade                      9
additives_n                        22
ingredients_analysis_tags          35
food_groups_tags                   45
nutriscore_score                   53
alcohol_100g                       65
cholesterol_100g                  135
ecoscore_score                    144
nutrient_levels_tags              191
allergens                         387
traces_tags                       763
packaging_fr                     1652
packaging                        1977
states_tags                      2418
proteins_100g                    2698
energy_100g                      2754
fat_100g                         2952
sugars_100g                      3025
energy-kcal_100g                 3345
salt_100g                        3346
additives_tags                   3697
main_category                    5136
popularity_tags                  5977
categories_fr                    7743
categories_tags                  7743
ingredients_


---------------------------

Percentage of missing values: 



Unnamed: 0,code,url,created_datetime,last_modified_datetime,product_name,packaging,packaging_fr,categories_tags,categories_fr,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_n,additives_tags,nutriscore_score,nutriscore_grade,food_groups_tags,states_tags,ecoscore_score,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kcal_100g,energy_100g,fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,alcohol_100g
Null Percentage,0.0,0.0,0.0,0.0,7.185194,91.614048,91.614048,61.757187,61.757187,86.651345,86.674731,85.653804,96.363856,97.526713,86.651345,94.082497,76.052579,0.733035,66.232007,0.0,71.504375,0.733035,75.126003,82.116044,61.757187,12.678521,12.678521,67.514213,67.514213,53.553486,53.553486,39.97984,39.309705,39.729043,99.672594,39.829846,39.701625,43.498246,99.362122


In [18]:
clean_data = data[data['ingredients_tags'].notnull()]
print(clean_data.shape)

(16524, 39)


In [19]:
file_path = '../data/fr.openfoodfacts.org.products.csv.gz'
chunk_size = 10000

filtered_chunks = []

for chunk in pd.read_csv(file_path, chunksize=chunk_size, compression='gzip', sep='\t', engine='python', on_bad_lines='skip', quoting=3):
    filtered_chunk = chunk[(chunk['countries_tags'] == 'en:france') & (chunk['ingredients_tags'].notnull())]
    filtered_chunks.append(filtered_chunk)

if filtered_chunks:
    data_fr = pd.concat(filtered_chunks, axis=0)

data_fr = data_fr[columns_to_keep]
display(data_fr)
data_fr.to_csv('filtered_dataset_openfoodfacts_fr.csv')

Unnamed: 0,code,url,created_datetime,last_modified_datetime,product_name,packaging,packaging_fr,categories_tags,categories_fr,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_n,additives_tags,nutriscore_score,nutriscore_grade,food_groups_tags,states_tags,ecoscore_score,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kcal_100g,energy_100g,fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,alcohol_100g
4,5,http://world-fr.openfoodfacts.org/produit/0000...,2020-11-14T07:08:40Z,2024-10-20T14:02:34Z,Bio inulin,Glass,Verre,"en:plant-based-foods-and-beverages,en:plant-ba...","Aliments et boissons à base de végétaux,Alimen...","Jus et purée d'abricots (50%), eau, sucre.","en:apricot-juice-and-puree,en:fruit,en:prunus-...","en:palm-oil-free,en:vegan,en:vegetarian",,,0.0,,-2.0,a,"en:fruits-and-vegetables,en:vegetables","en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...",,en:vegetables,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,208.0,840.0,0.0,,8.0,0.0,0.23,
9,10,http://world-fr.openfoodfacts.org/produit/0000...,2016-10-20T07:19:01Z,2024-10-04T09:38:47Z,Madeleines nature,"Plastic,Cardboard,fr:Boîte en carton,fr:Film e...","Plastique,Carton,Boite-en-carton,Film-en-plast...","en:snacks,en:desserts,en:sweet-snacks,en:biscu...","Snacks,Desserts,Snacks sucrés,Biscuits et gâte...","Farine de blé 33%, sucre, huile de colza, œufs...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...","en:eggs,en:gluten,en:milk","en:nuts,en:soybeans",3.0,"en:e331,en:e422,en:e503",13.0,d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",53.0,c,"en:fat-in-high-quantity,en:saturated-fat-in-mo...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:plain-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,442.0,1852.0,22.0,,25.0,6.4,0.53,
14,15,http://world-fr.openfoodfacts.org/produit/0000...,2018-04-15T16:43:14Z,2024-10-04T09:38:50Z,Madeleines ChocoLait,"Plastique, Carton","Plastique,Carton","en:snacks,en:sweet-snacks,en:biscuits-and-cake...","Snacks,Snacks sucrés,Biscuits et gâteaux,Gâtea...","Farine de blé 27%, chocolat au lait 18% (sucre...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:maybe-vegetarian",,en:nuts,5.0,"en:e322,en:e322i,en:e331,en:e422,en:e500,en:e503",17.0,d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",32.0,d,"en:fat-in-high-quantity,en:saturated-fat-in-hi...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:chocolate-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,460.0,1926.0,24.0,,31.0,6.4,0.48,0.0
18,20,http://world-fr.openfoodfacts.org/produit/0000...,2018-09-14T13:14:06Z,2024-10-04T09:38:53Z,Madeleines Choco Noir,1 boîte en carton à recycler 50 sachets indivi...,1-boite-en-carton-a-recycler-50-sachets-indivi...,"en:snacks,en:sweet-snacks,en:biscuits-and-cake...","Snacks,Snacks sucrés,Biscuits et gâteaux,Gâtea...","Farine de _blé_ 27%, chocolat noir 18% (pâte d...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...","en:eggs,en:gluten,en:milk,en:soybeans",en:nuts,5.0,"en:e322,en:e331,en:e422,en:e500,en:e503",16.0,d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-checked,en:complete,en:nutrition-fact...",39.0,d,"en:fat-in-high-quantity,en:saturated-fat-in-hi...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:chocolate-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,467.0,1953.0,25.0,,29.0,6.3,0.45,
20,22,http://world-fr.openfoodfacts.org/produit/0000...,2021-02-28T15:18:57Z,2024-10-04T09:38:54Z,Farandole de madeleine,"Boîte en carton, Film en plastique","Boite-en-carton,Film-en-plastique","en:snacks,en:sweet-snacks,en:biscuits-and-cake...","Snacks,Snacks sucrés,Biscuits et gâteaux,Gâtea...",Madeleines ChocoNoir - Madeleines nappées de c...,"fr:madeleines-choconoir,fr:madeleines-nappees-...","en:palm-oil-content-unknown,en:non-vegan,en:ve...",,en:nuts,5.0,"en:e322,en:e322i,en:e331,en:e422,en:e500,en:e503",,unknown,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",41.0,d,,"top-75-percent-scans-2020,top-80-percent-scans...",en:long-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3498860,9999864004549,http://world-fr.openfoodfacts.org/produit/9999...,2019-08-01T11:12:51Z,2019-08-01T11:25:22Z,Boudin à l’ancienne,,,"en:meats-and-their-products,en:meats,en:prepar...","Viandes et dérivés,Viandes,Charcuteries,Poisso...",BIOCOOP BORDEAUX LAC distribue par les eleveur...,fr:biocoop-bordeaux-lac-distribue-par-les-elev...,"en:may-contain-palm-oil,en:non-vegan,en:non-ve...",,,0.0,,,unknown,"en:fish-meat-eggs,en:offals","en:to-be-completed,en:nutrition-facts-complete...",86.0,a,,"bottom-25-percent-scans-2019,bottom-20-percent...",en:sauteed-black-pudding,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,,
3498872,9999900401301,http://world-fr.openfoodfacts.org/produit/9999...,2018-01-05T16:53:07Z,2018-01-05T16:53:13Z,Kabanos,,,,,"KABANOSÈRÇ Sktadn' wiep -zEkrobia, Guszcz 'o.e...","fr:kabanoserc-sktadn-wiep,fr:zekrobia,fr:guszc...","en:palm-oil-content-unknown,en:vegan-status-un...",,,0.0,,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,549.0,2297.0,48.0,,1.9,24.0,3.10,
3498929,9999991042704,http://world-fr.openfoodfacts.org/produit/9999...,2018-05-09T10:46:24Z,2024-06-16T21:10:13Z,Yaourt vanille,,,"en:dairies,en:fermented-foods,en:fermented-mil...","Produits laitiers,Produits fermentés,Produits ...","Lait entier 77%, crème, sucre 7,5%, ferments l...","en:whole-milk,en:dairy,en:milk,en:cream,en:sug...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...",en:milk,"en:eggs,en:gluten,en:nuts",0.0,,7.0,c,"en:milk-and-dairy-products,en:milk-and-yogurt","en:to-be-completed,en:nutrition-facts-complete...",67.0,b,"en:fat-in-moderate-quantity,en:saturated-fat-i...","bottom-25-percent-scans-2019,bottom-20-percent...",en:vanilla-yogurt,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,128.0,536.0,7.6,,11.0,2.7,0.09,
3498953,9999999004360,http://world-fr.openfoodfacts.org/produit/9999...,2019-01-21T15:57:57Z,2023-04-28T16:03:26Z,Minis beignets,,,"en:snacks,en:sweet-snacks,en:sweet-fritters","Snacks,Snacks sucrés,Beignets sucrés","MATIÈRE GRASSE DU LAIT BABEURRE (34%), FARINE ...","fr:matiere-grasse-du-lait-babeurre,fr:farine-d...","en:palm-oil,en:non-vegan,en:vegetarian-status-...",,en:sesame-seeds,5.0,"en:e322,en:e322i,en:e412,en:e450,en:e471,en:e5...",15.0,d,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-high-quantity,en:saturated-fat-in-hi...",,en:sweet-fritters,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,333.0,1393.0,20.2,,10.4,7.4,1.10,


In [20]:
filtered_chunk.to_csv('filtered_dataset_openfoodfacts.csv')

In [None]:
file_path = '../data/en.openfoodfacts.org.products.csv.gz'
chunk_size = 10000

filtered_chunks_en = []

for chunk in pd.read_csv(file_path, chunksize=chunk_size, compression='gzip', sep='\t', engine='python', quoting=3):
    filtered_chunks_en = chunk[(chunk['countries_tags'] == 'en:france') & (chunk['ingredients_tags'].notnull())]
    filtered_chunks.append(filtered_chunks_en)

if filtered_chunks:
    data_en = pd.concat(filtered_chunks, axis=0)

data_en = data_en[columns_to_keep]
display(data_en)
data_en.to_csv('filtered_dataset_openfoodfacts_en.csv')

Unnamed: 0,code,url,created_datetime,last_modified_datetime,product_name,packaging,packaging_fr,categories_tags,categories_fr,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_n,additives_tags,nutriscore_score,nutriscore_grade,food_groups_tags,states_tags,ecoscore_score,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kcal_100g,energy_100g,fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,alcohol_100g
4,5,http://world-fr.openfoodfacts.org/produit/0000...,2020-11-14T07:08:40Z,2024-10-20T14:02:34Z,Bio inulin,Glass,Verre,"en:plant-based-foods-and-beverages,en:plant-ba...","Aliments et boissons à base de végétaux,Alimen...","Jus et purée d'abricots (50%), eau, sucre.","en:apricot-juice-and-puree,en:fruit,en:prunus-...","en:palm-oil-free,en:vegan,en:vegetarian",,,0.0,,-2.0,a,"en:fruits-and-vegetables,en:vegetables","en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-low-quantity,en:saturated-fat-in-low...",,en:vegetables,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,208.0,840.0,0.0,,8.0,0.0,0.23,
9,10,http://world-fr.openfoodfacts.org/produit/0000...,2016-10-20T07:19:01Z,2024-10-04T09:38:47Z,Madeleines nature,"Plastic,Cardboard,fr:Boîte en carton,fr:Film e...","Plastique,Carton,Boite-en-carton,Film-en-plast...","en:snacks,en:desserts,en:sweet-snacks,en:biscu...","Snacks,Desserts,Snacks sucrés,Biscuits et gâte...","Farine de blé 33%, sucre, huile de colza, œufs...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...","en:eggs,en:gluten,en:milk","en:nuts,en:soybeans",3.0,"en:e331,en:e422,en:e503",13.0,d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",53.0,c,"en:fat-in-high-quantity,en:saturated-fat-in-mo...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:plain-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,442.0,1852.0,22.0,,25.0,6.4,0.53,
14,15,http://world-fr.openfoodfacts.org/produit/0000...,2018-04-15T16:43:14Z,2024-10-04T09:38:50Z,Madeleines ChocoLait,"Plastique, Carton","Plastique,Carton","en:snacks,en:sweet-snacks,en:biscuits-and-cake...","Snacks,Snacks sucrés,Biscuits et gâteaux,Gâtea...","Farine de blé 27%, chocolat au lait 18% (sucre...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:maybe-vegetarian",,en:nuts,5.0,"en:e322,en:e322i,en:e331,en:e422,en:e500,en:e503",17.0,d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",32.0,d,"en:fat-in-high-quantity,en:saturated-fat-in-hi...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:chocolate-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,460.0,1926.0,24.0,,31.0,6.4,0.48,0.0
18,20,http://world-fr.openfoodfacts.org/produit/0000...,2018-09-14T13:14:06Z,2024-10-04T09:38:53Z,Madeleines Choco Noir,1 boîte en carton à recycler 50 sachets indivi...,1-boite-en-carton-a-recycler-50-sachets-indivi...,"en:snacks,en:sweet-snacks,en:biscuits-and-cake...","Snacks,Snacks sucrés,Biscuits et gâteaux,Gâtea...","Farine de _blé_ 27%, chocolat noir 18% (pâte d...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...","en:eggs,en:gluten,en:milk,en:soybeans",en:nuts,5.0,"en:e322,en:e331,en:e422,en:e500,en:e503",16.0,d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-checked,en:complete,en:nutrition-fact...",39.0,d,"en:fat-in-high-quantity,en:saturated-fat-in-hi...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:chocolate-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,467.0,1953.0,25.0,,29.0,6.3,0.45,
20,22,http://world-fr.openfoodfacts.org/produit/0000...,2021-02-28T15:18:57Z,2024-10-04T09:38:54Z,Farandole de madeleine,"Boîte en carton, Film en plastique","Boite-en-carton,Film-en-plastique","en:snacks,en:sweet-snacks,en:biscuits-and-cake...","Snacks,Snacks sucrés,Biscuits et gâteaux,Gâtea...",Madeleines ChocoNoir - Madeleines nappées de c...,"fr:madeleines-choconoir,fr:madeleines-nappees-...","en:palm-oil-content-unknown,en:non-vegan,en:ve...",,en:nuts,5.0,"en:e322,en:e322i,en:e331,en:e422,en:e500,en:e503",,unknown,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",41.0,d,,"top-75-percent-scans-2020,top-80-percent-scans...",en:long-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3498860,9999864004549,http://world-fr.openfoodfacts.org/produit/9999...,2019-08-01T11:12:51Z,2019-08-01T11:25:22Z,Boudin à l’ancienne,,,"en:meats-and-their-products,en:meats,en:prepar...","Viandes et dérivés,Viandes,Charcuteries,Poisso...",BIOCOOP BORDEAUX LAC distribue par les eleveur...,fr:biocoop-bordeaux-lac-distribue-par-les-elev...,"en:may-contain-palm-oil,en:non-vegan,en:non-ve...",,,0.0,,,unknown,"en:fish-meat-eggs,en:offals","en:to-be-completed,en:nutrition-facts-complete...",86.0,a,,"bottom-25-percent-scans-2019,bottom-20-percent...",en:sauteed-black-pudding,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,,
3498872,9999900401301,http://world-fr.openfoodfacts.org/produit/9999...,2018-01-05T16:53:07Z,2018-01-05T16:53:13Z,Kabanos,,,,,"KABANOSÈRÇ Sktadn' wiep -zEkrobia, Guszcz 'o.e...","fr:kabanoserc-sktadn-wiep,fr:zekrobia,fr:guszc...","en:palm-oil-content-unknown,en:vegan-status-un...",,,0.0,,,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,549.0,2297.0,48.0,,1.9,24.0,3.10,
3498929,9999991042704,http://world-fr.openfoodfacts.org/produit/9999...,2018-05-09T10:46:24Z,2024-06-16T21:10:13Z,Yaourt vanille,,,"en:dairies,en:fermented-foods,en:fermented-mil...","Produits laitiers,Produits fermentés,Produits ...","Lait entier 77%, crème, sucre 7,5%, ferments l...","en:whole-milk,en:dairy,en:milk,en:cream,en:sug...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...",en:milk,"en:eggs,en:gluten,en:nuts",0.0,,7.0,c,"en:milk-and-dairy-products,en:milk-and-yogurt","en:to-be-completed,en:nutrition-facts-complete...",67.0,b,"en:fat-in-moderate-quantity,en:saturated-fat-i...","bottom-25-percent-scans-2019,bottom-20-percent...",en:vanilla-yogurt,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,128.0,536.0,7.6,,11.0,2.7,0.09,
3498953,9999999004360,http://world-fr.openfoodfacts.org/produit/9999...,2019-01-21T15:57:57Z,2023-04-28T16:03:26Z,Minis beignets,,,"en:snacks,en:sweet-snacks,en:sweet-fritters","Snacks,Snacks sucrés,Beignets sucrés","MATIÈRE GRASSE DU LAIT BABEURRE (34%), FARINE ...","fr:matiere-grasse-du-lait-babeurre,fr:farine-d...","en:palm-oil,en:non-vegan,en:vegetarian-status-...",,en:sesame-seeds,5.0,"en:e322,en:e322i,en:e412,en:e450,en:e471,en:e5...",15.0,d,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-high-quantity,en:saturated-fat-in-hi...",,en:sweet-fritters,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,333.0,1393.0,20.2,,10.4,7.4,1.10,


In [22]:
data_en_allergen = data_en[data_en['allergens'].notna()]
data_en_allergen.shape

(203796, 39)

In [23]:
display(data_en_allergen)

Unnamed: 0,code,url,created_datetime,last_modified_datetime,product_name,packaging,packaging_fr,categories_tags,categories_fr,ingredients_text,ingredients_tags,ingredients_analysis_tags,allergens,traces_tags,additives_n,additives_tags,nutriscore_score,nutriscore_grade,food_groups_tags,states_tags,ecoscore_score,ecoscore_grade,nutrient_levels_tags,popularity_tags,main_category,image_url,image_small_url,image_ingredients_url,image_ingredients_small_url,image_nutrition_url,image_nutrition_small_url,energy-kcal_100g,energy_100g,fat_100g,cholesterol_100g,sugars_100g,proteins_100g,salt_100g,alcohol_100g
9,10,http://world-fr.openfoodfacts.org/produit/0000...,2016-10-20T07:19:01Z,2024-10-04T09:38:47Z,Madeleines nature,"Plastic,Cardboard,fr:Boîte en carton,fr:Film e...","Plastique,Carton,Boite-en-carton,Film-en-plast...","en:snacks,en:desserts,en:sweet-snacks,en:biscu...","Snacks,Desserts,Snacks sucrés,Biscuits et gâte...","Farine de blé 33%, sucre, huile de colza, œufs...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...","en:eggs,en:gluten,en:milk","en:nuts,en:soybeans",3.0,"en:e331,en:e422,en:e503",13.0,d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",53.0,c,"en:fat-in-high-quantity,en:saturated-fat-in-mo...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:plain-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,442.0,1852.0,22.00,,25.00,6.4,0.5300,
18,20,http://world-fr.openfoodfacts.org/produit/0000...,2018-09-14T13:14:06Z,2024-10-04T09:38:53Z,Madeleines Choco Noir,1 boîte en carton à recycler 50 sachets indivi...,1-boite-en-carton-a-recycler-50-sachets-indivi...,"en:snacks,en:sweet-snacks,en:biscuits-and-cake...","Snacks,Snacks sucrés,Biscuits et gâteaux,Gâtea...","Farine de _blé_ 27%, chocolat noir 18% (pâte d...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:palm-oil-free,en:non-vegan,en:vegetarian-st...","en:eggs,en:gluten,en:milk,en:soybeans",en:nuts,5.0,"en:e322,en:e331,en:e422,en:e500,en:e503",16.0,d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-checked,en:complete,en:nutrition-fact...",39.0,d,"en:fat-in-high-quantity,en:saturated-fat-in-hi...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:chocolate-madeleines,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,467.0,1953.0,25.00,,29.00,6.3,0.4500,
28,30,http://world-fr.openfoodfacts.org/produit/0000...,2016-12-15T22:15:44Z,2024-10-14T05:06:56Z,Confiture d'oranges,"Plastic,Bag,Box,Cardboard","Plastique,Carton,Sachet,Boîte","en:plant-based-foods-and-beverages,en:plant-ba...","Aliments et boissons à base de végétaux,Alimen...","Farine de _blé_, _œufs_ frais, huile de colza,...","en:wheat-flour,en:cereal,en:flour,en:wheat,en:...","en:may-contain-palm-oil,en:non-vegan,en:maybe-...","en:eggs,en:gluten,en:milk","en:nuts,en:soybeans",5.0,"en:e422,en:e450,en:e471,en:e500,en:e503",12.0,d,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",76.0,a,"en:fat-in-moderate-quantity,en:saturated-fat-i...","bottom-25-percent-scans-2019,top-80-percent-sc...",fr:cakes-aux-raisins,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,0.0,1768.0,19.00,,28.00,5.8,0.0000,
48,50,http://world-fr.openfoodfacts.org/produit/0000...,2021-05-08T17:23:51Z,2024-10-04T09:55:23Z,Thermo complete,Flacon plastique,Flacon-plastique,en:dietary-supplements,Compléments alimentaires,"tions melles microcristalline), extrait de thé...","fr:tions-melles-microcristalline,en:green-tea-...","en:palm-oil-content-unknown,en:vegan,en:vegeta...",en:celery,,4.0,"en:e1400,en:e466,en:e551,en:e570",,unknown,,"en:to-be-completed,en:nutrition-facts-complete...",,unknown,,"top-100000-scans-2022,top-75-percent-scans-202...",en:dietary-supplements,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,0.0
49,51,http://world-fr.openfoodfacts.org/produit/0000...,2016-12-01T19:59:24Z,2024-10-11T08:26:38Z,Fondants Citron,"Boîte carton, Sachet Plastique","Boite-carton,Sachet-plastique","en:plant-based-foods-and-beverages,en:plant-ba...","Aliments et boissons à base de végétaux,Alimen...","mûre bio 50% (fruit 27%, purée 23%) sucre de c...","en:blackberry,en:fruit,en:berries,en:cane-suga...","en:palm-oil-free,en:vegan-status-unknown,en:ve...","en:eggs,en:gluten,en:milk,en:nuts","en:nuts,en:soybeans",1.0,en:e440,10.0,c,"en:sugary-snacks,en:sweets","en:to-be-completed,en:nutrition-facts-complete...",64.0,b,"en:fat-in-low-quantity,en:saturated-fat-in-low...","top-50000-scans-2019,top-100000-scans-2019,at-...",en:jams,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,249.0,1056.0,0.30,,59.00,0.5,0.0500,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3497724,9911111141111,http://world-fr.openfoodfacts.org/produit/9911...,2020-11-24T18:35:39Z,2023-09-10T04:17:13Z,Ketchup,,,"en:condiments,en:sauces,en:tomato-sauces,en:ke...","Condiments,Sauces,Sauces tomate,Ketchup",Tomate - oignon - ail-vinaigre de vin (SULFITE...,"en:tomato,en:vegetable,en:fruit-vegetable,en:o...","en:palm-oil-free,en:vegan-status-unknown,en:ve...",en:sulphur-dioxide-and-sulphites,,0.0,,,unknown,"en:fats-and-sauces,en:dressings-and-sauces","en:to-be-completed,en:nutrition-facts-complete...",74.0,b,,,en:ketchup,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,,,,,,,,
3498219,99515874,http://world-fr.openfoodfacts.org/produit/9951...,2017-12-17T13:58:05Z,2023-12-30T14:32:16Z,Mini stollen,,,"en:snacks,en:sweet-snacks,en:festive-foods,en:...","Snacks,Snacks sucrés,Aliments festifs,Biscuits...","45 % massepain (44% amandes, sucre, sirop de g...","en:marzipan,en:wheat-flour,en:cereal,en:flour,...","en:palm-oil,en:non-vegan,en:vegetarian-status-...",en:gluten,"en:nuts,en:peanuts,en:soybeans",0.0,,21.0,e,"en:sugary-snacks,en:biscuits-and-cakes","en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-high-quantity,en:saturated-fat-in-hi...",,en:stollen,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,452.0,1891.0,23.30,,30.50,9.5,0.4800,
3498229,99534592,http://world-fr.openfoodfacts.org/produit/9953...,2018-01-12T12:56:14Z,2024-10-25T05:55:56Z,Saumon à l'oseille,,,"en:meals,en:meals-with-fish,en:meals-with-salm...","Plats préparés,Plats préparés au poisson,Plats...",'1/1/1/ Riz basmati accompagné d'une sauce à l...,"fr:1,fr:riz-basmati-accompagne-d-une-sauce-a-l...","en:palm-oil-content-unknown,en:non-vegan,en:no...","en:fish,en:milk","en:celery,en:crustaceans,en:eggs,en:gluten,en:...",4.0,"en:e14xx,en:e407,en:e412,en:e415",-1.0,a,"en:composite-foods,en:one-dish-meals","en:to-be-completed,en:nutrition-facts-complete...",,unknown,"en:fat-in-moderate-quantity,en:saturated-fat-i...",,en:microwave-meals,,,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,,121.0,506.0,4.00,,0.10,8.2,0.6400,
3498547,99885434,http://world-fr.openfoodfacts.org/produit/9988...,2019-07-08T10:52:02Z,2023-06-06T12:57:53Z,raviolis pékinois surgelés,,,"en:frozen-foods,en:meals,en:pasta-dishes,en:st...","Surgelés,Plats préparés,Plats à base de pâtes,...",farine de blé eau porc 17 % chou barde de por...,fr:farine-de-ble-eau-porc-17-chou-barde-de-por...,"en:palm-oil-content-unknown,en:vegan-status-un...","en:fish,en:gluten,en:molluscs,en:soybeans","en:celery,en:eggs,en:milk",3.0,"en:e150a,en:e621,en:e635",,unknown,"en:composite-foods,en:one-dish-meals","en:to-be-completed,en:nutrition-facts-complete...",44.0,d,"en:fat-in-moderate-quantity,en:saturated-fat-i...","bottom-25-percent-scans-2019,bottom-20-percent...",en:frozen-ravioli,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,https://images.openfoodfacts.org/images/produc...,,928.0,9.73,,2.17,,1.0414,


In [24]:
unique_allergens = set()
data_en_allergen["allergens"].str.split(',').apply(unique_allergens.update)
allergens_list = sorted(unique_allergens)
print(len(allergens_list))
print(allergens_list)


1734
['ca:ordi', 'de:Dinkelweizenflocken', 'de:Dorschleber', 'de:ENTHÄLT EINE PHENYLALANINQUELLE', 'de:Eier und Eierzeugnisse.', 'de:KANN BEI ÜBERMÄSSIGEM VERZEHR ABFÜHREND WIRKEN', 'de:Kefir', 'de:Milcheiweißhydrolysat', 'de:Seehasen', 'de:Weizenstärkesirup', 'de:seigle', 'en:AVOINE', 'en:Almond flour', 'en:Amande', 'en:Amandes', 'en:Anhydride sulfureux et sulfites', 'en:Arachides', 'en:Arachnids', 'en:Avoine', 'en:BARLEY MALT', 'en:Barley malt', 'en:Beurre', 'en:Beurre salé', 'en:Blé', 'en:CREAMER', 'en:Cacahuètes', 'en:Cantal', 'en:Cereals Containing Gluten  (including Barley & Oats)', 'en:Crustacés', 'en:Crème', 'en:Céleri', 'en:E621', 'en:Emmental', 'en:Farine', 'en:Flétan', 'en:Fromage', 'en:Fruits à coque', 'en:Lacotse', 'en:Lacto', 'en:Lactosérum', 'en:Lait', "en:Malt d'orge", 'en:Malt de blé', 'en:Malted wheat', "en:Malts d'orge", 'en:Milch', 'en:Moutarde', 'en:Mozzarella', 'en:Noisette', 'en:Noisettes', 'en:Oat flakes', 'en:Oeufs', 'en:Orge', 'en:PHENYLALANINE', 'en:Phenylala

In [25]:
unique_allergens_tags = set()
data_en_allergen["traces_tags"].dropna().str.split(',').apply(unique_allergens_tags.update)
allergens_tags_list = sorted(unique_allergens_tags)
print(len(allergens_tags_list))
print(allergens_tags_list)


1251
['ca:api', 'de:andere-nusse', 'de:céleri', 'de:céréales-contenant-du-gluten-et-autres-fruits-à-coque-cacao-49-minimum-dans-le-chocolat-noir', 'de:céréales-contenant-du-gluten-et-fruits-à-coque', 'de:enthalt-möglicherwelse-gerste', 'de:erdnusse', 'de:fruits-à-coque', 'de:glucodies-3', 'de:gluten-et-autres-fruits-à-coque-cacao-30-minimum-dans-le-chocolat-au-lait', 'de:gluten-et-autres-fruits-à-coque-cacao-50-minimum-dans-le-chocolat-noir', 'de:gluten-et-autres-fruits-à-coques-cacao-50-minimum-dans-le-chocolat-noir', 'de:gluten-et-fruits-à-coque-cacao-39-minimum', 'de:gluten-toutes-nos-viandes-sont-issues-d-animaux-sacrifiés-selon-un-rite-islamique-et-contrôlées-par-un-organisme-indépendant-pourcentage-de-matières-grasses-inférieur-à-15-rapport-collagène-sur-protéines-de-viande-inférieur-à-25', 'de:graines-de-sésame', 'de:hasselnüsse', 'de:hühnereiweiß-milch-laktose', 'de:milch-und-milcherzeugnisse-einschließlich-lactose', 'de:nüsse-schalenteile', 'de:pekan', 'de:pinda-s', 'de:schale

In [26]:
# Compter les occurrences de chaque allergène
from collections import Counter

# Divise chaque entrée par ',' et compile tous les allergènes en une seule liste
allergens_list = data_en_allergen["allergens"].str.split(',').sum()

# Compte les occurrences de chaque allergène
allergen_counts = Counter(allergens_list)

# Convertir le résultat en DataFrame
allergens_df = pd.DataFrame(allergen_counts.items(), columns=['allergen', 'count']).sort_values(by='count', ascending=False)

print(allergens_df)

             allergen   count
2             en:milk  114032
1           en:gluten   89446
0             en:eggs   42228
3         en:soybeans   31256
5             en:nuts   21340
...               ...     ...
1725  en:WHEAT GLUTEN       2
1724       en:CREAMER       2
1723     fr:romatique       2
1722          fr:anit       2
1721          fr:E224       2

[1734 rows x 2 columns]
