In [1]:
pip install PyMuPDF pandas


Note: you may need to restart the kernel to use updated packages.


In [2]:
import fitz  
import re
import pandas as pd

# Extract Text From Allergy Report PDF File

In [3]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        One string containing each page's text, in page order, with nothing
        inserted between pages.
    """
    # The context manager closes the document (and its file handle) even when
    # extraction fails part-way through; the original never closed it.
    with fitz.open(pdf_path) as doc:
        # join() builds the result once instead of re-copying it per page.
        return "".join(page.get_text() for page in doc)

# Function to process extracted text
def process_text(text):
    """Parse report text into level / food type / ingredient records.

    The text is scanned line by line. A line starting with ``LEVEL <digits>``
    sets the current level; every following line that contains ``"- "`` is
    split at the last ``"- "`` into a food type (left part) and an ingredient
    (right part), both stripped of surrounding whitespace. Lines that appear
    before the first level marker are ignored.

    Args:
        text: Full text of the report, lines separated by ``"\\n"``.

    Returns:
        A list of dicts with the keys ``"Level"``, ``"Food Type"`` and
        ``"Ingredient"``, in the order the lines were encountered. Either
        field may be an empty string when the line has nothing on that side
        of the separator.
    """
    results = []
    current_level = None

    for line in text.split("\n"):
        # Check for level markers
        level_match = re.match(r"(LEVEL \d+)", line)
        if level_match:
            current_level = level_match.group(1)
            continue

        # Entries are only meaningful once a level has been seen.
        if current_level is None:
            continue

        # Extract food type and ingredient
        match = re.match(r"(.*)- (.*)", line)
        if match:
            # No write to a global LEVELS here: that name is only created in a
            # later cell, so touching it made this function fail on a fresh
            # kernel. The returned records carry all the information.
            results.append({
                "Level": current_level,
                "Food Type": match.group(1).strip(),
                "Ingredient": match.group(2).strip(),
            })
    return results

# Main script
if __name__ == "__main__":
    # Replace with the path to your PDF file
    pdf_path = "/Users/ireneliu/Desktop/Level info.pdf"
    print("Extracting text from PDF...")
    pdf_text = extract_text_from_pdf(pdf_path)

    print("Processing text...")
    extracted_data = process_text(pdf_text)

    # Convert to DataFrame for easy analysis. Naming the columns explicitly
    # keeps "Level" present even when nothing was parsed, so the count below
    # yields an empty result instead of raising KeyError.
    df = pd.DataFrame(extracted_data, columns=["Level", "Food Type", "Ingredient"])

    # Count ingredients per level
    level_counts = df["Level"].value_counts()

    print("\nIngredient Counts by Level:")
    print(level_counts)

    print("\nExtracted Data:")
    print(df)

    # Save the results to a CSV file (read back by the level-loading cell)
    output_path = "/Users/ireneliu/Desktop/ingredients_analysis.csv"
    df.to_csv(output_path, index=False)
    print(f"Results saved to {output_path}.")

Extracting text from PDF...
Processing text...

Ingredient Counts by Level:
Level
LEVEL 3    168
LEVEL 1    143
LEVEL 2    137
Name: count, dtype: int64

Extracted Data:
       Level Food Type        Ingredient
0    LEVEL 1       Fat     Sunflower Oil
1    LEVEL 2     Dairy  Cheese (cheddar)
2    LEVEL 3       Fat     Vegetable Oil
3    LEVEL 3         -                  
4    LEVEL 3     Dairy  Cheese (cottage)
..       ...       ...               ...
443  LEVEL 1         -                  
444  LEVEL 3         -                  
445  LEVEL 2         -                  
446  LEVEL 1         -                  
447  LEVEL 1         -                  

[448 rows x 3 columns]
Results saved to /Users/ireneliu/Desktop/ingredients_analysis.csv.


# Load Ingredient Data From The CSV File

In [4]:
import pandas as pd

def load_levels_from_csv(csv_path):
    """Build a mapping of level name to its distinct ingredients from a CSV.

    Args:
        csv_path: Location of a CSV file that has ``Level`` and
            ``Ingredient`` columns (anything ``pandas.read_csv`` accepts).

    Returns:
        A dict keyed by each distinct ``Level`` value, in order of first
        appearance. Each value is the list of that level's distinct,
        non-missing ``Ingredient`` values, also in order of first appearance.
    """
    table = pd.read_csv(csv_path)
    level_column = table["Level"]

    # One entry per distinct level: select that level's rows, discard missing
    # ingredients, then de-duplicate.
    return {
        label: list(table[level_column == label]["Ingredient"].dropna().unique())
        for label in level_column.unique()
    }

# Function to analyze ingredients
def analyze_ingredients(data, levels):
    """Count, per product, how often each level's keywords occur.

    Matching is a case-insensitive literal substring search; each keyword is
    counted independently, so overlapping keywords (e.g. "Apple" inside
    "Pineapples") each contribute their own hits.

    Args:
        data: Iterable of dicts with the keys ``"name"``, ``"brand"`` and
            ``"ingredients"`` (the ingredient text of one product).
        levels: Dict mapping a level name to a list of keyword strings.

    Returns:
        A list with one dict per product containing ``"brand"``, ``"name"``
        and one integer count per level name.

    Raises:
        KeyError: If a product dict lacks one of the required keys.
    """
    # Compile every keyword once instead of once per product. Keywords that
    # are empty or not strings are skipped: an empty pattern would match at
    # every position and inflate the count.
    patterns_by_level = {
        level: [
            re.compile(re.escape(keyword), re.IGNORECASE)
            for keyword in keywords
            if isinstance(keyword, str) and keyword
        ]
        for level, keywords in levels.items()
    }

    results = []
    for item in data:
        name = item["name"]
        brand = item["brand"]
        ingredients = item["ingredients"]

        # Missing ingredient text (None, or NaN after a CSV round trip) is
        # treated as empty so the product gets zero counts instead of
        # aborting the whole run with a TypeError.
        if not isinstance(ingredients, str):
            ingredients = ""

        level_counts = {
            level: sum(len(pattern.findall(ingredients)) for pattern in patterns)
            for level, patterns in patterns_by_level.items()
        }

        results.append({
            "brand": brand,
            "name": name,
            **level_counts
        })
    return results

if __name__ == "__main__":
    # CSV written by the PDF-extraction cell
    csv_path = "/Users/ireneliu/Desktop/ingredients_analysis.csv"

    print("Loading ingredient levels from CSV...")
    # LEVELS maps each level name to the list of its ingredient keywords
    LEVELS = load_levels_from_csv(csv_path)
    print("Loaded ingredient levels:", LEVELS, sep="\n")

Loading ingredient levels from CSV...
Loaded ingredient levels:
{'LEVEL 1': ['Sunflower Oil', 'Blue #1', 'Cheese (swiss)', 'Blue #2', 'Yogurt (plain Greek)', 'Yellow #5', 'Yogurt (plain)', 'Yellow #6', 'Almond Oil', 'Apple', 'Beef Fat', 'Canola Oil', 'Blueberries', 'Hemp Seed Oil', 'Mango', 'Oranges', 'Salmon Oil', 'Sardine Oil', 'Pears', 'Sesame Oil', 'Pineapples', 'Soybean Oil', 'Raspberries', 'Pet Standard', 'Ground Yellow Corn', 'Pearled Barley', 'Gelatin', 'Quinoa', 'Hemp', 'Quinoa Powder', 'Rice (brown)', 'Lactobacillus Acidophilus', 'Rice (white)', 'Soy Grits', 'Yeast', 'Bison', 'Canola Meal', 'Chicken Heart', 'Duck Gizzard', 'Bacon', 'Duck Liver', 'Duck Meal', 'Beef Broth', 'Beef Hide', 'Turkey Broth', 'Pork Skins', 'Crab', 'Rabbit Liver', 'Herring', 'Herring Meal', 'Lobster', 'Parsley', 'Sage', 'Oyster', 'Shrimp', 'Agar Gum', 'Calcium Chloride', 'Basil', 'Calcium Pantothenate', 'Calcium Sulfate', 'Fenugreek Seed', 'Carnitine', 'Zinc Sulfate', 'Copper Sulfate', 'E 306 Natural T

In [5]:
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.


# Fetch Dog Food Products From The Open Pet Food Facts API

In [6]:
import requests
import pandas as pd

api_url = "https://us.openpetfoodfacts.org/cgi/search.pl"

# Parameters for the API request
params = {
    'search_terms': 'dog food',
    'search_simple': 1,
    'action': 'process',
    'json': 1,
    'page_size': 100  
}

# Send GET request to the API. Without a timeout, requests waits forever on a
# stalled connection and the kernel hangs; 30 s is generous for one page.
response = requests.get(api_url, params=params, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    data = response.json()
    products = data.get('products', [])

    # Extract brand names and ingredients; 'N/A' marks products where the
    # field is absent from the API record
    product_info = []
    for product in products:
        brand = product.get('brands', 'N/A')  # Extract the brand name
        ingredients = product.get('ingredients_text', 'N/A')  # Extract the ingredients
        product_info.append({'Brand': brand, 'Ingredients': ingredients})

    # Convert to DataFrame for better readability
    df = pd.DataFrame(product_info)
    print(df)

    # Save to CSV (relative to the notebook's working directory)
    df.to_csv('dog_food_ingredients.csv', index=False)
    print("Data saved to 'dog_food_ingredients.csv'")
else:
    print(f"Failed to retrieve data. Status code: {response.status_code}")

                       Brand  \
0                Royal Canin   
1                              
2          Canine Carry Outs   
3          Canine Carry Outs   
4     Instinct the raw brand   
..                       ...   
78  Heart to Tail,Pure Being   
79     Fromm Family Pet Food   
80  Pure Balance Small Breed   
81                Farm Table   
82                    purina   

                                          Ingredients  
0                                                 N/A  
1                                                      
2                                                      
3                                                      
4   chicken(including ground chicken bone),beef li...  
..                                                ...  
78  deboned salmon, chicken meal, sweet potatoes, ...  
79  Chicken, Chicken Meal, Chicken Broth, Oat Groa...  
80  chicken, chicken meal, dried ground peas, tapi...  
81                                                N/A  

In [7]:
import requests
import pandas as pd

api_url = "https://us.openpetfoodfacts.org/cgi/search.pl"

# Same search as the previous cell; this cell inspects the raw record schema
params = {
    'search_terms': 'dog food',
    'search_simple': 1,
    'action': 'process',
    'json': 1,
    'page_size': 100  
}

# Timeout prevents a stalled connection from hanging the kernel indefinitely
response = requests.get(api_url, params=params, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    data = response.json()
    products = data.get('products', [])
    
    # Convert the raw data to a DataFrame
    df = pd.DataFrame(products)
    
    # Display the column names
    print("Column Names:")
    print(df.columns)
else:
    # Report the failure instead of finishing silently with no output
    print(f"Failed to retrieve data. Status code: {response.status_code}")

Column Names:
Index(['_id', '_keywords', 'added_countries_tags', 'additives_old_tags',
       'additives_original_tags', 'additives_tags', 'allergens',
       'allergens_from_ingredients', 'allergens_from_user',
       'allergens_hierarchy',
       ...
       'nutrition_data_per_debug_tags', 'origins_debug_tags',
       'packaging_debug_tags', 'product_name_debug_tags',
       'product_name_fr_debug_tags', 'purchase_places_debug_tags',
       'serving_size_debug_tags', 'stores_debug_tags', 'traces_debug_tags',
       'quality_tags'],
      dtype='object', length=278)


# Analyze Ingredients