In [4]:
import requests
import pandas as pd
import json
import time
import os
from datetime import datetime
from tqdm.auto import tqdm
import gzip
import urllib.request
from pathlib import Path

In [None]:
os.makedirs('OpenFoodFacts', exist_ok=True)
os.makedirs('OpenFoodFacts/raw', exist_ok=True)
os.makedirs('OpenFoodFacts/processed', exist_ok=True)
os.makedirs('OpenFoodFacts/exports', exist_ok=True)

print("Directory structure created!")

Directory structure created!


In [None]:
def download_complete_export(export_type='jsonl'):
    
    if export_type == 'jsonl':
        url = "https://static.openfoodfacts.org/data/openfoodfacts-products.jsonl.gz"
        output_path = Path("OpenFoodFacts/raw/openfoodfacts-products.jsonl.gz")
    else:
        url = "https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv.gz"
        output_path = Path("OpenFoodFacts/raw/openfoodfacts-products.csv.gz")
    
    print(f"\nDownloading complete Open Food Facts database...")
    print(f"URL: {url}")
    print(f"Output: {output_path}")
    
    try:
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            total_size = int(r.headers.get('content-length', 0))
            
            with open(output_path, 'wb') as f, tqdm(
                desc=output_path.name,
                total=total_size,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for chunk in r.iter_content(chunk_size=8192):
                    size = f.write(chunk)
                    bar.update(size)
                    
        print(f"\n\n Download completed!")
        print(f" File saved to: {output_path}")
        
        file_size_gb = os.path.getsize(output_path) / (1024**3)
        print(f" File size: {file_size_gb:.2f} GB")
        
        return str(output_path)
        
    except Exception as e:
        print(f"\n Error downloading file: {str(e)}")
        return None

In [7]:
export_file = download_complete_export(export_type='jsonl')


Downloading complete Open Food Facts database...
URL: https://static.openfoodfacts.org/data/openfoodfacts-products.jsonl.gz
Output: OpenFoodFacts/raw/openfoodfacts-products.jsonl.gz


openfoodfacts-products.jsonl.gz: 100%|██████████| 9.58G/9.58G [3:51:27<00:00, 741kiB/s]   




 Download completed!
 File saved to: OpenFoodFacts/raw/openfoodfacts-products.jsonl.gz
 File size: 9.58 GB


In [None]:
def process_jsonl_export(file_path, chunk_size=10000, max_products=None):

    print(f"\nProcessing JSONL export...")
    print(f"Reading from: {file_path}")
    
    products = []
    count = 0
    chunk_num = 1
    
    try:
        with gzip.open(file_path, 'rt', encoding='utf-8') as f:
            for line in tqdm(f, desc="Processing products"):
                if max_products and count >= max_products:
                    print(f"\nReached max_products limit of {max_products}.")
                    break
                try:
                    product = json.loads(line)
                    nutriments = product.get('nutriments', {})
                    
                    processed_product = {
                        'code': product.get('code'),
                        'product_name': product.get('product_name'),
                        'generic_name': product.get('generic_name'),
                        'brands': product.get('brands'),
                        'categories': product.get('categories'),
                        'categories_tags': ','.join(product.get('categories_tags', [])),
                        'ingredients_text': product.get('ingredients_text'),
                        'allergens': product.get('allergens'),
                        'traces': product.get('traces'),
                        'serving_size': product.get('serving_size'),
                        'nutrition_grade': product.get('nutrition_grades'),
                        'countries': product.get('countries'),
                        'countries_tags': ','.join(product.get('countries_tags', [])),
                        'image_url': product.get('image_url'),
                        'energy_100g': nutriments.get('energy-kcal_100g'),
                        'fat_100g': nutriments.get('fat_100g'),
                        'carbohydrates_100g': nutriments.get('carbohydrates_100g'),
                        'proteins_100g': nutriments.get('proteins_100g'),
                        'salt_100g': nutriments.get('salt_100g'),
                        'sugars_100g': nutriments.get('sugars_100g'),
                        'fiber_100g': nutriments.get('fiber_100g'),
                    }
                    products.append(processed_product)
                    count += 1
                    
                    if len(products) >= chunk_size:
                        df_chunk = pd.DataFrame(products)
                        chunk_file = f"OpenFoodFacts/processed/chunk_{chunk_num}.csv"
                        df_chunk.to_csv(
                            chunk_file,
                            index=False,
                            encoding='utf-8',
                            escapechar='\\'
                        )
                        products = []
                        chunk_num += 1
                        
                except (json.JSONDecodeError, KeyError) as e:
                    continue

        if products:
            df_chunk = pd.DataFrame(products)
            chunk_file = f"OpenFoodFacts/processed/chunk_{chunk_num}.csv"
            df_chunk.to_csv(chunk_file, index=False, encoding='utf-8')
            print(f"\nSaved final chunk: {chunk_file} ({len(products)} products)")
        
        print(f" Total products processed: {count:,}")
        print(f" Data saved in chunks in: OpenFoodFacts/processed/")
        
        return count
        
    except Exception as e:
        print(f"\n✗ An unexpected error occurred: {str(e)}")
        return 0

In [None]:
file_path = "OpenFoodFacts/raw/openfoodfacts-products.jsonl.gz"
total_processed = process_jsonl_export(
    file_path, 
    chunk_size=50000,
    max_products=None
)


Processing JSONL export...
Reading from: OpenFoodFacts/raw/openfoodfacts-products.jsonl.gz


Processing products: 4168452it [10:16, 6761.88it/s] 



Saved final chunk: OpenFoodFacts/processed/chunk_84.csv (18452 products)
 Total products processed: 4,168,452
 Data saved in chunks in: OpenFoodFacts/processed/


In [None]:
def fetch_all_categories():
    """
    Fetch list of all available categories from Open Food Facts.
    """
    url = "https://world.openfoodfacts.org/categories.json"
    print(f"Fetching all categories from {url}...")
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        
        categories = [
            {'id': tag.get('id'), 'name': tag.get('name'), 'products': tag.get('products', 0)}
            for tag in data.get('tags', []) if tag.get('id')
        ]
        
        df_categories = pd.DataFrame(categories).sort_values('products', ascending=False).reset_index(drop=True)
        print(f"✓ Found {len(df_categories):,} categories")
        print(f"✓ Total products across all categories: {df_categories['products'].sum():,}")
        
        return df_categories
        
    except Exception as e:
        print(f"✗ Error fetching categories: {str(e)}")
        return pd.DataFrame()

In [None]:
df_categories = fetch_all_categories()
if not df_categories.empty:
    print("\nTop 20 categories by product count:")
    print(df_categories.head(20))

In [None]:
def fetch_products_comprehensive(
    categories=None, 
    page_size=100, 
    max_pages=None,
    save_interval=1000
):
    
    base_url = "https://world.openfoodfacts.org/cgi/search.pl"
    all_products = []
    total_count = 0
    
    categories_to_process = categories if categories else [None] # Use [None] to fetch all if no category is specified
    
    for category in categories_to_process:
        print(f"\n{'='*60}")
        print(f"Processing category: {category or 'All Categories'}")
        print(f"{'='*60}")
        
        page = 1
        while True:
            if max_pages and page > max_pages:
                print(f"Reached max page limit of {max_pages} for this category.")
                break
            
            params = {
                "action": "process", "json": 1, "page": page, "page_size": page_size,
                "fields": "code,product_name,generic_name,brands,categories,ingredients_text,nutrition_grades,countries,image_url,nutriments"
            }
            if category: params.update({"tagtype_0": "categories", "tag_contains_0": "contains", "tag_0": category})
            
            try:
                response = requests.get(base_url, params=params, timeout=30)
                response.raise_for_status()
                data = response.json()
                products = data.get('products', [])
                
                if not products:
                    print(f"  No more products found for this category.")
                    break
                
                all_products.extend(products)
                total_count += len(products)
                print(f"  Page {page}: Fetched {len(products)} products (Total: {total_count:,})")
                
                if total_count // save_interval > (total_count - len(products)) // save_interval:
                    df_temp = pd.DataFrame(all_products)
                    temp_file = f"OpenFoodFacts/processed/api_fetch_backup.csv"
                    df_temp.to_csv(temp_file, index=False, encoding='utf-8')
                
                page += 1
                time.sleep(0.5)
                
            except requests.RequestException as e:
                print(f"  ✗ Network error on page {page}: {e}. Retrying after delay...")
                time.sleep(5)
    
    print(f"\n{'='*60}\nAPI Fetching Complete! Total products fetched: {total_count:,}\n{'='*60}")
    return pd.DataFrame(all_products)

In [None]:
if not df_categories.empty:
    top_categories = df_categories.head(5)['id'].tolist()
    df_api_top = fetch_products_comprehensive(
        categories=top_categories,
        max_pages=10,
        save_interval=500
    )
else:
    print("Skipping API fetch because category list is empty.")
    df_api_top = pd.DataFrame()

In [None]:
if not df_api_top.empty:
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = f"OpenFoodFacts/exports/openfoodfacts_api_{timestamp}.csv"
    df_api_top.to_csv(output_file, index=False, encoding='utf-8')
    print(f"✓ API data saved to: {output_file}")
    print(f"✓ Total products: {len(df_api_top):,}")

## Method 3: Merge All Chunks into a Single Dataset

In [None]:
def merge_all_chunks(chunk_directory='OpenFoodFacts/processed'):
    print(f"\nMerging all chunks from: {chunk_directory}")
    chunk_dir = Path(chunk_directory)
    chunk_files = sorted(chunk_dir.glob('chunk_*.csv'))
    
    if not chunk_files:
        print("No chunk files found to merge!")
        return None
    
    print(f"Found {len(chunk_files)} chunk files to merge.")
    
    df_list = [pd.read_csv(file) for file in tqdm(chunk_files, desc="Loading chunks")]
    
    if not df_list:
        return None

    df_merged = pd.concat(df_list, ignore_index=True)
    initial_count = len(df_merged)
    df_merged.drop_duplicates(subset=['code'], keep='first', inplace=True)
    duplicates_removed = initial_count - len(df_merged)
    
    print(f"Total unique products: {len(df_merged):,}")
    print(f"Duplicates removed: {duplicates_removed:,}")
    
    return df_merged

In [None]:
df_complete = merge_all_chunks()


Merging all chunks from: OpenFoodFacts/processed
Found 84 chunk files to merge.


  df_list = [pd.read_csv(file) for file in tqdm(chunk_files, desc="Loading chunks")]
  df_list = [pd.read_csv(file) for file in tqdm(chunk_files, desc="Loading chunks")]
  df_list = [pd.read_csv(file) for file in tqdm(chunk_files, desc="Loading chunks")]
  df_list = [pd.read_csv(file) for file in tqdm(chunk_files, desc="Loading chunks")]
  df_list = [pd.read_csv(file) for file in tqdm(chunk_files, desc="Loading chunks")]
Loading chunks: 100%|██████████| 84/84 [00:12<00:00,  6.76it/s]


Total unique products: 4,168,397
Duplicates removed: 57


In [None]:
csv_file = f"OpenFoodFacts/exports/openfoodfacts_complete_{timestamp}.csv.gz"
df_complete.to_csv(csv_file, index=False, encoding='utf-8', compression='gzip')

In [None]:
df_complete['code'] = df_complete['code'].astype(str)

for col in df_complete.columns:
    if df_complete[col].dtype == object:
        df_complete[col] = df_complete[col].astype(str)

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

parquet_file = f"OpenFoodFacts/exports/openfoodfacts_complete_{timestamp}.parquet"
df_complete.to_parquet(parquet_file, compression='gzip', index=False)

print(f"Complete dataset saved as Parquet: {parquet_file}")


Complete dataset saved as Parquet: OpenFoodFacts/exports/openfoodfacts_complete_20251206_190555.parquet


## Data Statistics and Summary

In [13]:
if df_complete is not None:
    print("OPEN FOOD FACTS - COMPLETE DATASET SUMMARY")
   
    
    print(f"\nTotal Products: {len(df_complete):,}")
    print(f"Dataset Shape: {df_complete.shape}")
    
    print(f"\nData Completeness (Top 15 columns):")
    completeness = (df_complete.notna().sum() / len(df_complete) * 100).sort_values(ascending=False)
    print(completeness.head(15).to_string())
    
    if 'categories' in df_complete.columns:
        print(f"\nTop 10 Categories:")
        print(df_complete['categories'].value_counts().head(10).to_string())
    
    if 'countries' in df_complete.columns:
        print(f"\nTop 10 Countries:")
        print(df_complete['countries'].value_counts().head(10).to_string())
    
    if 'nutrition_grade' in df_complete.columns:
        print(f"\nNutrition Grade Distribution:")
        print(df_complete['nutrition_grade'].value_counts().sort_index().to_string())
    
    
    print("\nSample Products:")
    display(df_complete[['product_name', 'brands', 'categories', 'nutrition_grade']].head(10))

OPEN FOOD FACTS - COMPLETE DATASET SUMMARY

Total Products: 4,168,397
Dataset Shape: (4168397, 21)

Data Completeness (Top 15 columns):
code                  100.000000
allergens             100.000000
countries_tags        100.000000
countries             100.000000
product_name          100.000000
serving_size          100.000000
traces                100.000000
nutrition_grade       100.000000
ingredients_text      100.000000
categories_tags       100.000000
categories            100.000000
brands                100.000000
generic_name          100.000000
proteins_100g          72.212412
carbohydrates_100g     72.179353

Top 10 Categories:
categories
nan                                                                                 2408565
undefined                                                                             34854
Snacks                                                                                33158
Beverages                                                     

Unnamed: 0,product_name,brands,categories,nutrition_grade
0,Véritable pâte à tartiner noisettes chocolat noir,Bovetti,"Petit-déjeuners,Produits à tartiner,Produits à...",e
1,Chamomile Herbal Tea,Lagg's,,unknown
2,"Lagg's, herbal tea, peppermint",Lagg's,"Plant-based foods and beverages, Beverages, Ho...",unknown
3,Linden Flowers Tea,Lagg's,"Beverages and beverages preparations, Plant-ba...",unknown
4,"Herbal Tea, Hibiscus",Lagg's,,unknown
5,Apple & Cinnamon Tea,Lagg's,,unknown
6,Green Tea,Lagg's,,unknown
7,Shave Grass Herbal Tea,Lagg's,,unknown
8,"Lagg's, herbal tea, chamomile * mint",Lagg's,"Plant-based foods and beverages, Beverages, Ho...",unknown
9,Artichoke Herbal Tea,Lagg's,,unknown
