# Libraries

In [1]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
from sentence_transformers import SentenceTransformer
import numpy as np
from multiprocessing import Pool
import torch
from tqdm import tqdm
import json
import ast
from typing import Any, List, Dict
from datetime import datetime
import sys 
import easyocr

# load files

In [12]:
def safe_parse_list(value: Any) -> List:
    """Safely parse string representations of lists"""
    # If already a list, return as is
    if isinstance(value, list):
        return value
        
    # If numpy array or pandas series, convert to list
    if hasattr(value, 'tolist'):
        return value.tolist()
        
    # Handle missing values
    if pd.isna(value):
        return []
    
    # Handle string representations
    if isinstance(value, str):
        cleaned_value = value.replace('\x00', '').strip()
        try:
            return ast.literal_eval(cleaned_value)
        except:
            try:
                return json.loads(cleaned_value)
            except:
                return [cleaned_value]
    
    return [value]

def load_and_prep_data(file_path: str) -> pd.DataFrame:
    """Load and prepare recipe data with robust error handling"""
    df = pd.read_parquet(file_path)
    
    list_columns = ['instructions_list', 'keywords', 'ingredients']
    
    for col in list_columns:
        if col in df.columns:
            print(f"Processing column: {col}")
            # Process column values directly
            df[col] = df[col].apply(safe_parse_list)
            
            # Validate results
            invalid_rows = df[col].apply(lambda x: not isinstance(x, list))
            if invalid_rows.any():
                print(f"Found {invalid_rows.sum()} invalid rows in {col}")
    
    return df

### data validation

In [13]:
def validate_and_inspect_data(df: pd.DataFrame) -> tuple[bool, dict]:
    """
    Validates recipe data and returns detailed inspection results
    Returns: (is_valid: bool, inspection_results: dict)
    """
    inspection_results = {}
    is_valid = True

    # Check for empty dataframe
    if df.empty:
        print("DataFrame is empty")
        return False, {}

    # Required columns check
    required_columns = ['title', 'description', 'instructions_list']
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        print(f"Missing required columns: {missing_cols}")
        is_valid = False
    inspection_results['missing_columns'] = missing_cols

    # Data types validation
    expected_types = {
        'title': 'object',
        'description': 'object',
        'instructions_list': 'object'
    }
    type_mismatches = []
    for col, expected_type in expected_types.items():
        if col in df.columns and not df[col].dtype == expected_type:
            type_mismatches.append(f"{col}: expected {expected_type}, got {df[col].dtype}")
            is_valid = False
    inspection_results['type_mismatches'] = type_mismatches

    # List columns inspection
    list_columns = ['instructions_list', 'keywords', 'ingredients']
    list_column_samples = {}
    for col in list_columns:
        if col in df.columns:
            sample = df[col].iloc[0]
            list_column_samples[col] = {
                'type': str(type(sample)),
                'sample': sample[:2] if hasattr(sample, '__getitem__') else sample
            }
    inspection_results['list_column_samples'] = list_column_samples

    # Missing values analysis
    inspection_results['null_counts'] = df.isnull().sum().to_dict()

    # Print formatted results
    print("\nData Structure Analysis:")
    print("-" * 50)
    print(f"\nValidation passed: {is_valid}")
    
    if inspection_results['missing_columns']:
        print("\nMissing columns:", inspection_results['missing_columns'])
    
    if inspection_results['type_mismatches']:
        print("\nType mismatches:", inspection_results['type_mismatches'])
    
    print("\nNull value counts:")
    for col, count in inspection_results['null_counts'].items():
        if count > 0:
            print(f"{col}: {count}")

    return is_valid, inspection_results

In [14]:
def clean_recipe_data(df):
    """Clean missing values from recipe data"""
    # Remove recipes missing critical information
    critical_columns = ['title', 'description', 'instructions']
    df_clean = df.dropna(subset=critical_columns)
    
    # Fill missing ratings with sensible defaults
    df_clean['ratings'] = df_clean['ratings'].fillna(0)
    df_clean['ratings_count'] = df_clean['ratings_count'].fillna(0)
    
    return df_clean

# preprocessing

## translation

In [15]:
def translate_text_batch(texts, model, tokenizer, batch_size=8):
    """Translate a batch of texts"""
    translations = []
    
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        # Clean and prepare texts
        batch = [str(text)[:512] for text in batch]  # Truncate to max length
        
        try:
            encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
            translated = model.generate(**encoded)
            translated_texts = tokenizer.batch_decode(translated, skip_special_tokens=True)
            translations.extend(translated_texts)
        except Exception as e:
            print(f"Error in batch translation: {e}")
            translations.extend(batch)  # Use original text as fallback
            
    return translations

def translate_recipe_data(df):
    """Translate recipe data to German"""
    print("Loading translation model...")
    model_name = 'Helsinki-NLP/opus-mt-en-de'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    
    translated_df = df.copy()
    
    # Translate text columns
    text_cols = {
        'title': 'title_de',
        'description': 'description_de'
    }
    
    for col, new_col in text_cols.items():
        print(f"Translating {col}...")
        translated_df[new_col] = translate_text_batch(
            df[col].tolist(), 
            model, 
            tokenizer
        )
    
    # Translate instructions list
    print("Translating instructions...")
    translated_df['instructions_list_de'] = translated_df['instructions_list'].apply(
        lambda x: translate_text_batch(x, model, tokenizer)
    )
    
    # Translate ingredients
    print("Translating ingredients...")
    translated_df['ingredients_de'] = translated_df['ingredients'].apply(
        lambda x: translate_text_batch(x, model, tokenizer)
    )
    
    return translated_df

# Embeddings

## create embeddings

In [16]:
def create_embeddings(texts, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
    """Create embeddings using a multilingual model"""
    model = SentenceTransformer(model_name)
    embeddings = model.encode(texts, show_progress_bar=True)
    return embeddings

## run embeddings

### bon appetit

In [None]:
def process_recipes(file_path: str):
    """Main processing function with enhanced validation"""
    try:
        print("Loading data...")
        df = load_and_prep_data(file_path)
        
        # Replace separate validation functions with combined approach
        is_valid, inspection_results = validate_and_inspect_data(df)
        
        if not is_valid:
            # Store validation results for debugging
            with open('validation_report.json', 'w') as f:
                json.dump(inspection_results, f, indent=2)
            raise ValueError("Data validation failed. See validation_report.json for details")
            
        # Continue only if validation passed
        if inspection_results['null_counts'].get('title', 0) > 0:
            print("Warning: Some recipes are missing titles")
        
        # clean the data by removing parts with missing crucial information
        df = clean_recipe_data(df)

        # Translate
        print("Starting translation...")
        df_translated = translate_recipe_data(df)
        
        # Validate translated data
        is_valid_translated, translated_inspection = validate_and_inspect_data(df_translated)
        if not is_valid_translated:
            raise ValueError("Translation validation failed")
            
        # Create embeddings for both languages
        print("Creating embeddings...")
        embeddings = {
            'title_orig': create_embeddings(df['title'].tolist()),
            'title_de': create_embeddings(df_translated['title_de'].tolist()),
            'description_orig': create_embeddings(df['description'].tolist()),
            'description_de': create_embeddings(df_translated['description_de'].tolist())
        }
        
        # Save all results
        print("Saving results...")
        df_translated.to_parquet('recipes_translated.parquet')
        np.savez('recipe_embeddings.npz', **embeddings)
        
        # Save processing report
        processing_report = {
            'original_data_validation': inspection_results,
            'translated_data_validation': translated_inspection,
            'embedding_shapes': {k: v.shape for k, v in embeddings.items()},
            'timestamp': datetime.now().isoformat()
        }
        with open('processing_report.json', 'w') as f:
            json.dump(processing_report, f, indent=2)
            
        return df_translated, embeddings, processing_report
        
    except Exception as e:
        print(f"Error processing recipes: {str(e)}")
        import traceback
        print("Full error trace:")
        print(traceback.format_exc())
        raise

# Usage
if __name__ == "__main__":
    file_path = "../data/recipes.parquet"
    try:
        df, embeddings, report = process_recipes(file_path)
        print("\nProcessing completed successfully!")
        print(f"Processed {len(df)} recipes")
        print(f"Created {len(embeddings)} embedding sets")
    except Exception as e:
        print("Processing failed. Check error messages above.")
        sys.exit(1)

### chefkoch

In [None]:
def process_recipes(file_path: str):
    """Main processing function with enhanced validation"""
    try:
        print("Loading data...")
        df = load_and_prep_data(file_path)
        
        # Replace separate validation functions with combined approach
        is_valid, inspection_results = validate_and_inspect_data(df)
        
        if not is_valid:
            # Store validation results for debugging
            with open('validation_report.json', 'w') as f:
                json.dump(inspection_results, f, indent=2)
            raise ValueError("Data validation failed. See validation_report.json for details")
            
        # Continue only if validation passed
        if inspection_results['null_counts'].get('title', 0) > 0:
            print("Warning: Some recipes are missing titles")
        
        # clean the data by removing parts with missing crucial information
        df = clean_recipe_data(df)

        # Create embeddings for both languages
        print("Creating embeddings...")
        embeddings = {
            'title_orig': create_embeddings(df['title'].tolist()),
            'description_orig': create_embeddings(df['description'].tolist())
        }
        
        # Save all results
        print("Saving results...")
        df_translated.to_parquet('recipes_translated.parquet')
        np.savez('recipe_embeddings.npz', **embeddings)
        
        # Save processing report
        processing_report = {
            'original_data_validation': inspection_results,
            'translated_data_validation': translated_inspection,
            'embedding_shapes': {k: v.shape for k, v in embeddings.items()},
            'timestamp': datetime.now().isoformat()
        }
        with open('processing_report.json', 'w') as f:
            json.dump(processing_report, f, indent=2)
            
        return df_translated, embeddings, processing_report
        
    except Exception as e:
        print(f"Error processing recipes: {str(e)}")
        import traceback
        print("Full error trace:")
        print(traceback.format_exc())
        raise

# Usage
if __name__ == "__main__":
    file_path = "../data/recipes.parquet"
    try:
        df, embeddings, report = process_recipes(file_path)
        print("\nProcessing completed successfully!")
        print(f"Processed {len(df)} recipes")
        print(f"Created {len(embeddings)} embedding sets")
    except Exception as e:
        print("Processing failed. Check error messages above.")
        sys.exit(1)

# text recognition

## easyOCR

In [3]:
reader = easyocr.Reader(['de', 'en'], gpu = False)

Using CPU. Note: This module is much faster with a GPU.
Downloading detection model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

## run textextraction

In [9]:
from pathlib import Path
import cv2
import numpy as np
from dataclasses import dataclass
from typing import List, Tuple, Dict, Optional
import json
from datetime import datetime

@dataclass
class ReceiptTemplate:
    store_name: str
    header_region: Tuple[float, float]  # Relative y-coordinates
    items_region: Tuple[float, float]
    total_region: Tuple[float, float]
    price_pattern: str
    date_pattern: str
    
@dataclass
class ProcessedReceipt:
    store_name: str
    date: datetime
    items: List[Tuple[str, float]]
    total: float
    address: Optional[str] = None
    
class ReceiptProcessor:
    def __init__(self, template_path: Optional[Path] = None):
        self.templates: Dict[str, ReceiptTemplate] = {}
        if template_path:
            self._load_templates(template_path)
    
    def _load_templates(self, path: Path):
        """Load store templates from JSON configuration"""
        with open(path) as f:
            template_data = json.load(f)
            for store, config in template_data.items():
                self.templates[store] = ReceiptTemplate(**config)
    
    def process_image(self, image_path: Path) -> np.ndarray:
        """Load and preprocess image"""
        # Read image
        img = cv2.imread(str(image_path))
        if img is None:
            raise ValueError(f"Could not read image: {image_path}")
            
        # Convert to grayscale
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        
        # Determine if deskewing is needed by checking edges
        edges = cv2.Canny(gray, 50, 150)
        lines = cv2.HoughLines(edges, 1, np.pi/180, 100)
        if lines is not None:
            angle = self._calculate_skew_angle(lines)
            if abs(angle) > 0.5:  # Only rotate if skew is significant
                gray = self._rotate_image(gray, angle)
        
        # Enhance contrast
        gray = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8)).apply(gray)
        
        # Apply adaptive thresholding
        binary = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, 
            cv2.THRESH_BINARY, 11, 2
        )
        
        return binary
    
    def identify_store(self, ocr_results: List[Tuple]) -> Optional[str]:
        """Identify store from OCR results to select template"""
        # Group text by vertical position
        top_section = self._group_by_vertical_position(
            ocr_results, max_y=0.2
        )
        
        # Look for store names in top section
        for store in self.templates.keys():
            if any(store.lower() in text.lower() 
                  for _, text, _ in top_section):
                return store
        return None
    
    def extract_info(self, ocr_results: List[Tuple], 
                    store_template: Optional[ReceiptTemplate] = None) -> ProcessedReceipt:
        """Extract information using template if available"""
        if store_template:
            return self._template_extraction(ocr_results, store_template)
        else:
            return self._generic_extraction(ocr_results)
    
    def _template_extraction(self, ocr_results: List[Tuple], 
                           template: ReceiptTemplate) -> ProcessedReceipt:
        """Extract info using store-specific template"""
        # Implementation here
        pass
    
    def _generic_extraction(self, ocr_results: List[Tuple]) -> ProcessedReceipt:
        """Fallback extraction without template"""
        # Implementation here
        pass

In [10]:
reader.readtext(process_image('../data/test_receipt.png'))

NameError: name 'process_image' is not defined

# test area

In [6]:
test.head()

Unnamed: 0,canonical_url,description,ingredient_groups,ingredients,instructions,instructions_list,keywords,ratings,ratings_count,site_name,title
asparagus-with-parmesan-fonduta,https://www.bonappetit.com/recipe/asparagus-wi...,"Snappy asparagus gets tucked under a rich, vel...","[{'ingredients': ['1 large bunch asparagus, wo...","[1 large bunch asparagus, woody ends trimmed, ...",Cook asparagus in a large pot of boiling gener...,[Cook asparagus in a large pot of boiling gene...,"[dinner, side, 5 ingredients or fewer, easy, q...",4.2,8.0,Bon Appétit,Asparagus With Parmesan Fonduta
green-panzanella-with-pickled-shallot,https://www.bonappetit.com/recipe/green-panzan...,We love the shades of green you get from using...,"[{'ingredients': ['1 large shallot, thinly sli...","[1 large shallot, thinly sliced, ½ cup red win...",Combine shallot and vinegar in a small bowl; s...,[Combine shallot and vinegar in a small bowl; ...,"[salad, italian, tomato, shallot, summer, quic...",4.0,2.0,Bon Appétit,Green Panzanella with Pickled Shallot
pan-fried-noodles-coconut-shrimp-vegetables,https://www.bonappetit.com/recipe/pan-fried-no...,Crispy Hong Kong–style pan-fried noodles make ...,[{'ingredients': ['1 lb. Hong Kong–style pan-f...,"[1 lb. Hong Kong–style pan-fry noodles, Vegeta...",Spread 1 lb. Hong Kong–style pan-fry noodles f...,[Spread 1 lb. Hong Kong–style pan-fry noodles ...,"[dinner, main, dairy-free, nut-free, shrimp, c...",5.0,4.0,Bon Appétit,Pan-Fried Noodles With Coconut Shrimp and Vege...
oniony-brothy-beans-recipe,https://www.bonappetit.com/recipe/oniony-broth...,Cozy beans are just what you need to kickstart...,[{'ingredients': ['2 Tbsp. extra-virgin olive ...,[2 Tbsp. extra-virgin olive oil or vegetable o...,Heat 2 Tbsp. extra-virgin olive oil or vegetab...,[Heat 2 Tbsp. extra-virgin olive oil or vegeta...,"[dinner, main, side, easy, weeknight meals, ca...",3.2,17.0,Bon Appétit,Oniony Brothy Beans
tandoori-carrots-with-vadouvan-spice-and-yogurt,https://www.bonappetit.com/recipe/tandoori-car...,"What’s vadouvan, you ask? It’s just one of our...","[{'ingredients': ['2 tablespoons vadouvan', '2...","[2 tablespoons vadouvan, 2 garlic cloves finel...","Preheat oven to 425°. Mix vadouvan, half of ga...","[Preheat oven to 425°. Mix vadouvan, half of g...","[carrot, turmeric, porcini mushroom, easy, wee...",3.8,44.0,Bon Appétit,“Tandoori” Carrots with Vadouvan Spice and Yogurt


In [2]:
import numpy as np
from sklearn.decomposition import PCA
import plotly.express as px
import pandas as pd

# Load the embeddings
embeddings_file = np.load('../data/bonappetit_recipe_embeddings.npz')

# Inspect the contents
print("Available arrays:", embeddings_file.files)
print("\nShapes of embeddings:")
for key in embeddings_file.files:
    print(f"{key}: {embeddings_file[key].shape}")



Available arrays: ['title_orig', 'title_de', 'description_orig', 'description_de']

Shapes of embeddings:
title_orig: (1192, 384)
title_de: (1192, 384)
description_orig: (1192, 384)
description_de: (1192, 384)


In [7]:
import numpy as np
from umap import UMAP
from sklearn.manifold import TSNE
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import multiprocessing

# Get number of CPU cores
n_cores = multiprocessing.cpu_count()

# Load embeddings and recipes
embeddings_file = np.load('../data/bonappetit_recipe_embeddings.npz')
embeddings_to_plot = embeddings_file['title_orig']
recipes_df = pd.read_parquet('../data/bonappetit_recipes_translated.parquet')

# Print first few titles to verify data
titles = recipes_df['title'].values
print("First 5 titles in array:", titles[:5])

# UMAP and t-SNE processing remains the same...
umap_reducer = UMAP(
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    n_jobs=n_cores,
    random_state=None
)
umap_embeddings = umap_reducer.fit_transform(embeddings_to_plot)

tsne = TSNE(
    n_components=2,
    perplexity=30,
    max_iter=250,
    method='barnes_hut',
    n_jobs=n_cores,
    random_state=None
)
tsne_embeddings = tsne.fit_transform(embeddings_to_plot)

# Create DataFrames with titles
df_umap = pd.DataFrame({
    'UMAP1': umap_embeddings[:, 0],
    'UMAP2': umap_embeddings[:, 1],
    'title': titles
})

df_tsne = pd.DataFrame({
    'tSNE1': tsne_embeddings[:, 0],
    'tSNE2': tsne_embeddings[:, 1],
    'title': titles
})

# Verify DataFrames have titles
print("\nUMAP DataFrame sample:")
print(df_umap[['title']].head())
print("\ntSNE DataFrame sample:")
print(df_tsne[['title']].head())

# Create figure
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('UMAP Projection', 't-SNE Projection')
)

# Add traces with hover data
fig.add_trace(
    go.Scatter(
        x=df_umap['UMAP1'],
        y=df_umap['UMAP2'],
        mode='markers',
        marker=dict(size=5, opacity=0.6),
        name='UMAP',
        hovertext=df_umap['title'],  # Changed to hovertext
        hoverinfo='text'
    ),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(
        x=df_tsne['tSNE1'],
        y=df_tsne['tSNE2'],
        mode='markers',
        marker=dict(size=5, opacity=0.6),
        name='t-SNE',
        hovertext=df_tsne['title'],  # Changed to hovertext
        hoverinfo='text'
    ),
    row=1, col=2
)

# Update layout
fig.update_layout(
    height=600,
    width=1200,
    title_text="Comparison of UMAP and t-SNE Embeddings",
    showlegend=False
)

fig.show()

First 5 titles in array: ['Asparagus With Parmesan Fonduta' 'Green Panzanella with Pickled Shallot'
 'Pan-Fried Noodles With Coconut Shrimp and Vegetables'
 'Oniony Brothy Beans' '“Tandoori” Carrots with Vadouvan Spice and Yogurt']



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.




UMAP DataFrame sample:
                                               title
0                    Asparagus With Parmesan Fonduta
1              Green Panzanella with Pickled Shallot
2  Pan-Fried Noodles With Coconut Shrimp and Vege...
3                                Oniony Brothy Beans
4  “Tandoori” Carrots with Vadouvan Spice and Yogurt

tSNE DataFrame sample:
                                               title
0                    Asparagus With Parmesan Fonduta
1              Green Panzanella with Pickled Shallot
2  Pan-Fried Noodles With Coconut Shrimp and Vege...
3                                Oniony Brothy Beans
4  “Tandoori” Carrots with Vadouvan Spice and Yogurt
