# Processor

I'll use this processor to extract only the data I am interested in.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from typing import Dict, List

In [None]:
class FoundationFoodProcessor:
    """
    Build per-food nutrient profiles from a USDA FoodData Central CSV export.

    The processor reads ``food.csv``, ``nutrient.csv``, ``food_category.csv``
    and ``food_nutrient.csv`` from ``data_path``, keeps the rows of ``food.csv``
    whose ``data_type`` is ``foundation_food``, and produces one row per food
    holding the nutrients listed in ``essential_nutrients`` plus a handful of
    derived meal-planning metrics.
    """

    def __init__(self, data_path: str):
        """Remember the dataset directory and set up nutrient selection and logging.

        Args:
            data_path: Directory that contains the CSV files listed in the
                class docstring.
        """
        self.data_path = Path(data_path)

        # Essential nutrients for meal planning and diabetes management.
        # Maps the nutrient name looked up in nutrient.csv to the output column.
        # The column suffix (_kcal, _g, _mg) is also used as the preferred unit
        # when several nutrient rows match the same name.
        self.essential_nutrients = {
            'Energy': 'calories_kcal',
            'Protein': 'protein_g',
            'Carbohydrate, by difference': 'carbs_g',
            'Total lipid (fat)': 'fat_g',
            'Fiber, total dietary': 'fiber_g',
            'Sugars, total including NLEA': 'sugars_g',
            'Sodium, Na': 'sodium_mg',
            'Calcium, Ca': 'calcium_mg',
            'Iron, Fe': 'iron_mg',
            'Potassium, K': 'potassium_mg',
            'Vitamin C, total ascorbic acid': 'vitamin_c_mg',
            'Cholesterol': 'cholesterol_mg'
        }

        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def load_foundation_foods(self) -> pd.DataFrame:
        """Load ``food.csv`` and return only the ``foundation_food`` rows.

        Returns:
            A copy of the rows whose ``data_type`` equals ``'foundation_food'``.
        """
        self.logger.info("Loading foundation foods from food.csv...")

        # Load main food file
        food_df = pd.read_csv(self.data_path / 'food.csv')

        # Filter to foundation foods only
        foundation_foods = food_df[food_df['data_type'] == 'foundation_food'].copy()

        self.logger.info(f"Found {len(foundation_foods)} foundation foods")
        return foundation_foods

    def load_nutrients(self) -> Dict:
        """Resolve each essential nutrient name to a nutrient ID from ``nutrient.csv``.

        Returns:
            A dict keyed by nutrient ID. Each value is a dict with ``name``
            (the name from ``essential_nutrients``), ``column_name`` (the
            output column) and ``unit`` (the ``unit_name`` of the chosen row).
            Nutrients with no matching row are logged and left out.
        """
        self.logger.info("Loading nutrient definitions...")

        nutrients_df = pd.read_csv(self.data_path / 'nutrient.csv')

        # Create mapping of nutrient IDs to their output column
        nutrient_mapping = {}
        for nutrient_name, column_name in self.essential_nutrients.items():
            match = self._find_nutrient_row(nutrients_df, nutrient_name, column_name)
            if match is None:
                self.logger.warning(f"No nutrient definition found for '{nutrient_name}'")
                continue

            nutrient_id = match['id']
            nutrient_mapping[nutrient_id] = {
                'name': nutrient_name,
                'column_name': column_name,
                'unit': match['unit_name']
            }
            self.logger.info(f"Mapped '{nutrient_name}' -> ID {nutrient_id}")

        self.logger.info(f"Successfully mapped {len(nutrient_mapping)} nutrients")
        return nutrient_mapping

    @staticmethod
    def _find_nutrient_row(nutrients_df: pd.DataFrame, nutrient_name: str, column_name: str):
        """Pick the best ``nutrient.csv`` row for ``nutrient_name``, or ``None``.

        Candidates are tried from most to least specific so that a short name
        such as ``'Energy'`` does not silently resolve to a longer, different
        nutrient or to a row expressed in another unit:

        1. exact name (case-insensitive) with the unit implied by the column suffix
        2. partial name with that unit
        3. exact name, any unit
        4. partial name, any unit (the only rule used previously)
        """
        names = nutrients_df['name']
        partial = names.str.contains(nutrient_name, case=False, na=False, regex=False)
        # Missing names become NaN and therefore compare as False.
        exact = names.str.strip().str.lower() == nutrient_name.lower()

        # 'calories_kcal' -> 'kcal', 'protein_g' -> 'g', 'sodium_mg' -> 'mg'
        expected_unit = column_name.rsplit('_', 1)[-1].lower()
        unit_ok = nutrients_df['unit_name'].astype(str).str.lower() == expected_unit

        for mask in (exact & unit_ok, partial & unit_ok, exact, partial):
            candidates = nutrients_df[mask]
            if not candidates.empty:
                return candidates.iloc[0]
        return None

    def load_food_categories(self) -> pd.DataFrame:
        """Load ``food_category.csv`` unchanged."""
        self.logger.info("Loading food categories...")
        categories_df = pd.read_csv(self.data_path / 'food_category.csv')
        return categories_df

    def get_nutrient_data_for_foods(self, foundation_foods_df, nutrient_mapping,
                                    chunk_size: int = 100000) -> pd.DataFrame:
        """Collect the ``food_nutrient.csv`` rows for the given foods and nutrients.

        The file is streamed in chunks so that it never has to fit in memory
        as a whole.

        Args:
            foundation_foods_df: Frame with an ``fdc_id`` column naming the foods to keep.
            nutrient_mapping: Mapping as returned by ``load_nutrients``; its keys
                are the nutrient IDs to keep.
            chunk_size: Number of CSV rows read per chunk.

        Returns:
            The matching rows with all columns of ``food_nutrient.csv``. When
            nothing matches, an empty frame that still carries the columns, so
            callers can index ``fdc_id`` / ``nutrient_id`` / ``amount`` safely.
        """
        self.logger.info("Loading nutrient data (this may take a while due to large file size)...")

        foundation_food_ids = set(foundation_foods_df['fdc_id'].tolist())
        essential_nutrient_ids = set(nutrient_mapping.keys())

        nutrient_data = []
        empty_template = None  # zero-row frame carrying the file's columns

        self.logger.info("Processing food_nutrient.csv in chunks...")
        chunk_count = 0

        for chunk in pd.read_csv(self.data_path / 'food_nutrient.csv', chunksize=chunk_size):
            chunk_count += 1
            if chunk_count % 50 == 0:
                self.logger.info(f"Processed {chunk_count * chunk_size:,} rows...")

            if empty_template is None:
                empty_template = chunk.iloc[0:0]

            # Filter chunk to only foundation foods and essential nutrients
            relevant_chunk = chunk[
                (chunk['fdc_id'].isin(foundation_food_ids)) &
                (chunk['nutrient_id'].isin(essential_nutrient_ids))
            ]

            if not relevant_chunk.empty:
                nutrient_data.append(relevant_chunk)

        # Combine all relevant chunks
        if nutrient_data:
            combined_nutrient_data = pd.concat(nutrient_data, ignore_index=True)
            self.logger.info(f"Found {len(combined_nutrient_data)} nutrient values for foundation foods")
            return combined_nutrient_data

        self.logger.warning("No nutrient data found for foundation foods!")
        if empty_template is None:
            # The reader produced no chunk at all; fall back to the columns
            # the rest of the pipeline reads.
            empty_template = pd.DataFrame(columns=['fdc_id', 'nutrient_id', 'amount'])
        return empty_template.copy()

    def create_food_profiles(self, foundation_foods_df, nutrient_data, nutrient_mapping,
                             categories_df) -> pd.DataFrame:
        """Create one row per food with its nutrient amounts and category name.

        Args:
            foundation_foods_df: Foods to profile; needs ``fdc_id`` and
                ``description``, while ``food_category_id`` and
                ``publication_date`` are optional.
            nutrient_data: Rows with ``fdc_id``, ``nutrient_id`` and ``amount``.
                An empty frame (with or without columns) is accepted.
            nutrient_mapping: Mapping as returned by ``load_nutrients``.
            categories_df: Frame with ``id`` and ``description`` columns.

        Returns:
            A frame with the identifying columns, one column per mapped
            nutrient (0.0 when the food has no value for it) and
            ``category_name``. If a food has several rows for the same
            nutrient, the last one wins.
        """
        self.logger.info("Creating food profiles...")

        column_by_id = {
            nutrient_id: info['column_name'] for nutrient_id, info in nutrient_mapping.items()
        }
        # dict.fromkeys keeps order and drops accidental duplicates
        nutrient_columns = list(dict.fromkeys(column_by_id.values()))

        # Index the amounts once instead of re-filtering nutrient_data per food.
        amounts_by_food = {}
        if not nutrient_data.empty:
            relevant = nutrient_data[nutrient_data['nutrient_id'].isin(list(column_by_id))]
            for fdc_id, nutrient_id, amount in zip(
                relevant['fdc_id'], relevant['nutrient_id'], relevant['amount']
            ):
                amounts_by_food.setdefault(fdc_id, {})[column_by_id[nutrient_id]] = amount

        food_profiles = []
        for _, food in foundation_foods_df.iterrows():
            profile = {
                'fdc_id': food['fdc_id'],
                'description': food['description'],
                'food_category_id': food.get('food_category_id'),
                'publication_date': food.get('publication_date')
            }

            # Add nutrient values, filling missing nutrients with 0
            amounts = amounts_by_food.get(food['fdc_id'], {})
            for column_name in nutrient_columns:
                profile[column_name] = amounts.get(column_name, 0.0)

            food_profiles.append(profile)

        base_columns = ['fdc_id', 'description', 'food_category_id', 'publication_date']
        # Passing the columns explicitly keeps them present (and in a fixed
        # order) even when there are no foods at all.
        profiles_df = pd.DataFrame(food_profiles, columns=base_columns + nutrient_columns)

        # Add category names.
        # NOTE(review): the category id read from food.csv may arrive as text
        # or mixed types depending on how pandas parsed the column - confirm
        # against the dataset. Coercing both sides to numbers keeps the join
        # from failing or silently missing on a dtype mismatch.
        profiles_df = profiles_df.assign(
            food_category_id=pd.to_numeric(profiles_df['food_category_id'], errors='coerce')
        )
        category_lookup = categories_df[['id', 'description']].rename(
            columns={'id': 'food_category_id', 'description': 'category_name'}
        )
        category_lookup = category_lookup.assign(
            food_category_id=pd.to_numeric(category_lookup['food_category_id'], errors='coerce')
        ).dropna(subset=['food_category_id'])  # NaN keys would match foods without a category

        profiles_df = profiles_df.merge(category_lookup, on='food_category_id', how='left')

        self.logger.info(f"Created {len(profiles_df)} complete food profiles")
        return profiles_df

    def clean_and_enhance_data(self, profiles_df, min_calories: float = 1,
                               max_calories: float = 900) -> pd.DataFrame:
        """Drop implausible foods, tidy descriptions and add derived metrics.

        Args:
            profiles_df: Frame as produced by ``create_food_profiles``.
            min_calories: Smallest ``calories_kcal`` value kept (inclusive).
            max_calories: Largest ``calories_kcal`` value kept (inclusive).

        Returns:
            A new frame containing only foods with positive calories inside
            the given range, with missing nutrient values set to 0 and these
            extra columns: ``calorie_density``, ``protein_pct_calories``,
            ``carb_pct_calories``, ``fat_pct_calories``, ``fiber_per_calorie``,
            ``net_carbs_g`` and ``satiety_score``. Non-nutrient columns such as
            ``category_name`` keep their missing values.
        """
        self.logger.info("Cleaning and enhancing data...")

        # Keep only foods with meaningful calorie data and remove extreme
        # outliers. One mask plus one copy gives an independent frame that is
        # safe to assign into below.
        calories = profiles_df['calories_kcal']
        keep = (calories > 0) & calories.between(min_calories, max_calories)
        valid_foods = profiles_df[keep].copy()

        # Fill missing nutrients with 0
        nutrient_cols = [col for col in valid_foods.columns if col.endswith(('_g', '_mg', '_kcal'))]
        valid_foods[nutrient_cols] = valid_foods[nutrient_cols].fillna(0)

        # Clean food descriptions
        valid_foods['description'] = valid_foods['description'].str.strip().str.title()

        # Add calculated metrics for meal planning.
        # The factors 4 / 4 / 9 are the kcal-per-gram weights applied to
        # protein / carbohydrate / fat.
        valid_foods['calorie_density'] = valid_foods['calories_kcal'] / 100
        valid_foods['protein_pct_calories'] = (valid_foods['protein_g'] * 4) / valid_foods['calories_kcal'] * 100
        valid_foods['carb_pct_calories'] = (valid_foods['carbs_g'] * 4) / valid_foods['calories_kcal'] * 100
        valid_foods['fat_pct_calories'] = (valid_foods['fat_g'] * 9) / valid_foods['calories_kcal'] * 100
        valid_foods['fiber_per_calorie'] = valid_foods['fiber_g'] / valid_foods['calories_kcal']
        valid_foods['net_carbs_g'] = valid_foods['carbs_g'] - valid_foods['fiber_g']

        # Satiety score (protein + fiber for feeling full)
        valid_foods['satiety_score'] = (valid_foods['protein_g'] * 0.4 + valid_foods['fiber_g'] * 0.6)

        # Handle any inf/nan values, but only in nutrient and metric columns:
        # blanket-filling the frame would turn a missing category name or
        # publication date into the number 0.
        derived_cols = [
            'calorie_density', 'protein_pct_calories', 'carb_pct_calories',
            'fat_pct_calories', 'fiber_per_calorie', 'net_carbs_g', 'satiety_score',
        ]
        numeric_cols = list(dict.fromkeys(nutrient_cols + derived_cols))
        valid_foods[numeric_cols] = (
            valid_foods[numeric_cols].replace([np.inf, -np.inf], 0).fillna(0)
        )

        self.logger.info(f"Cleaned data: {len(valid_foods)} valid foods")
        return valid_foods

    def save_results(self, profiles_df, output_path: str) -> None:
        """Write the processed foods plus two companion files.

        Files written (the latter two next to ``output_path``):
            * ``output_path`` - the full frame as CSV
            * ``foundation_foods_summary.txt`` - a plain-text summary report
            * ``high_satiety_foods.csv`` - the 20 rows with the highest ``satiety_score``

        Args:
            profiles_df: Frame as returned by ``clean_and_enhance_data``; must
                also contain ``category_name``.
            output_path: Destination of the main CSV. Missing parent
                directories are created.
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Save main data
        profiles_df.to_csv(output_path, index=False)
        self.logger.info(f"Saved {len(profiles_df)} foundation foods to {output_path}")

        # Save summary report. The encoding is explicit so that category names
        # with non-ASCII characters are written the same way on every platform.
        summary_path = output_path.parent / "foundation_foods_summary.txt"
        with open(summary_path, 'w', encoding='utf-8') as f:
            f.write("USDA Foundation Foods Processing Summary\n")
            f.write("=" * 50 + "\n\n")
            f.write(f"Total Foods: {len(profiles_df)}\n")
            f.write(f"Average Calories per 100g: {profiles_df['calories_kcal'].mean():.1f}\n")
            f.write(f"Food Categories: {profiles_df['category_name'].nunique()}\n\n")

            # Top categories
            f.write("Top Food Categories:\n")
            for category, count in profiles_df['category_name'].value_counts().head(10).items():
                f.write(f"  {category}: {count} foods\n")

            # Nutritional ranges
            f.write("\nNutritional Ranges (per 100g):\n")
            f.write(f"  Calories: {profiles_df['calories_kcal'].min():.0f} - {profiles_df['calories_kcal'].max():.0f}\n")
            f.write(f"  Protein: {profiles_df['protein_g'].min():.1f} - {profiles_df['protein_g'].max():.1f}g\n")
            f.write(f"  Carbs: {profiles_df['carbs_g'].min():.1f} - {profiles_df['carbs_g'].max():.1f}g\n")
            f.write(f"  Fat: {profiles_df['fat_g'].min():.1f} - {profiles_df['fat_g'].max():.1f}g\n")
            f.write(f"  Fiber: {profiles_df['fiber_g'].min():.1f} - {profiles_df['fiber_g'].max():.1f}g\n")

        # Save sample of high-satiety foods (good for your 1900 calorie goal)
        high_satiety = profiles_df.nlargest(20, 'satiety_score')[
            ['description', 'calories_kcal', 'protein_g', 'fiber_g', 'satiety_score']
        ]
        high_satiety.to_csv(output_path.parent / "high_satiety_foods.csv", index=False)

        self.logger.info(f"Saved summary to {summary_path}")

    def process_all(self, output_path: str = "processed_data/foundation_foods.csv") -> pd.DataFrame:
        """Run the complete pipeline: load, profile, clean and save.

        Args:
            output_path: Destination of the main CSV (see ``save_results``).

        Returns:
            The cleaned and enhanced profiles frame.

        Raises:
            Exception: Any error from a pipeline step is logged and re-raised
                unchanged.
        """
        self.logger.info("Starting foundation foods processing...")

        try:
            # Load all data
            foundation_foods_df = self.load_foundation_foods()
            nutrient_mapping = self.load_nutrients()
            categories_df = self.load_food_categories()

            # Get nutrient data (this will take some time due to large file)
            nutrient_data = self.get_nutrient_data_for_foods(foundation_foods_df, nutrient_mapping)

            # Create profiles
            profiles_df = self.create_food_profiles(
                foundation_foods_df, nutrient_data, nutrient_mapping, categories_df
            )

            # Clean and enhance
            profiles_df = self.clean_and_enhance_data(profiles_df)

            # Save results
            self.save_results(profiles_df, output_path)

            self.logger.info("Processing complete!")
            return profiles_df

        except Exception as e:
            self.logger.error(f"Error during processing: {e}")
            raise

# Run the processor
if __name__ == "__main__":
    # Directory holding the extracted FoodData Central CSV release
    data_path = r"C:\Users\Usuario\Documents\FoodData_Central_csv_2025-04-24\FoodData_Central_csv_2025-04-24"

    # Build the processor and run the whole pipeline in one go
    processor = FoundationFoodProcessor(data_path)
    foundation_foods_df = processor.process_all("output/foundation_foods.csv")

    # Quick look at what came out
    print(f"\nProcessed {len(foundation_foods_df)} foundation foods")

    print("\nTop 10 foods by satiety score (good for feeling full):")
    satiety_columns = ['description', 'calories_kcal', 'protein_g', 'fiber_g', 'satiety_score']
    top_satiety = foundation_foods_df.nlargest(10, 'satiety_score')
    print(top_satiety[satiety_columns].to_string())

    print("\nSample of low-calorie, high-nutrition foods:")
    # Under 100 kcal while still providing more than 5 g of protein
    is_low_calorie = foundation_foods_df['calories_kcal'] < 100
    is_protein_rich = foundation_foods_df['protein_g'] > 5
    low_cal_nutritious = foundation_foods_df[is_low_calorie & is_protein_rich].head(10)
    preview_columns = ['description', 'calories_kcal', 'protein_g', 'carbs_g']
    print(low_cal_nutritious[preview_columns].to_string())