In [None]:
# Import required libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
from datetime import datetime
import re

In [None]:
# Read the ingredient workbook into `df`; later cells work on a copy of this
# frame and read its 'URL' column to decide which pages to scrape.
df = pd.read_excel(io='../meal_planning/Ingredients.xlsx')
print(df)

In [None]:
# Work on an independent (deep) copy so the scraping steps below never alter
# the frame that was loaded from disk.
scrape_df = df.copy(deep=True)

# Function to fetch product info with normal and bonus price

# CSS class names of the three <span> parts that together render one price.
_PRICE_INTEGER_CLASS = "price-amount_integer__+e2XO"
_PRICE_DOT_CLASS = "price-amount_dot__MV39M"
_PRICE_FRACTIONAL_CLASS = "price-amount_fractional__kjJ7u"


def _parse_price_amount(container):
    """Read a price from an element holding integer/dot/fractional spans.

    Returns the price as a float, or None when `container` is missing or any
    of the three spans cannot be found. Raises ValueError when the combined
    span text is not a valid number.
    """
    if not container:
        return None
    integer = container.find("span", class_=_PRICE_INTEGER_CLASS)
    dot = container.find("span", class_=_PRICE_DOT_CLASS)
    fractional = container.find("span", class_=_PRICE_FRACTIONAL_CLASS)
    if not (integer and dot and fractional):
        return None
    return float(f"{integer.text}{dot.text}{fractional.text}".replace(',', '.'))


def fetch_product_info(url):
    """Download one product page and extract its name, unit and prices.

    Args:
        url: Address of the product page to request.

    Returns:
        A tuple that always has nine fields, in this order:
        (fullname, unit, price, normal_price, bonus_price, date, bonus_desc,
        in_bonus, smart_label).
        - On a non-200 response the text fields are 'Not found', the prices
          and smart label are None and in_bonus is False.
        - On any unexpected exception the text fields are 'Error' instead.
        - Otherwise `price` is the bonus price when one was found, else the
          normal price, else the first price element on the page; `date` is
          today's date in ISO format.

    Never raises: every failure is printed and reported through the tuple.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"Failed to fetch {url}: {response.status_code}")
            # Must have the same nine fields as every other return value,
            # otherwise callers that unpack nine columns reject the row.
            return ('Not found', 'Not found', None, None, None, 'Not found', None, False, None)
        soup = BeautifulSoup(response.text, "html.parser")
        # Full name
        name_span = soup.find("span", class_="line-clamp_root__7DevG")
        fullname = name_span.text.strip() if name_span else 'Not found'
        # Unit size
        unit_span = soup.find("span", class_="product-card-hero-price_unitSize__ReamD")
        unit = unit_span.text.strip() if unit_span else 'Not found'
        price = None
        bonus_price = None
        normal_price = None
        bonus_desc = None
        in_bonus = False
        # Bonus price
        try:
            bonus_div = soup.find("div", class_="price-amount_root__Sa88q price-amount_bonus__zeyvy product-card-hero-price_now__zHsgu")
            bonus_price = _parse_price_amount(bonus_div)
            if bonus_price is not None:
                in_bonus = True
        except Exception as e:
            print(f"Bonus price error for {url}: {e}")
            bonus_price = None
        # Bonus sticker/description (a sticker alone also marks the product as in bonus)
        try:
            sticker_div = soup.find("div", class_="promo-sticker_root__0ogs6 promo-sticker_bonus__UCZRY product-card-hero_promoSticker__iDm1j")
            if sticker_div:
                bonus_desc = ' '.join(sticker_div.stripped_strings)
                in_bonus = True
        except Exception as e:
            print(f"Bonus sticker error for {url}: {e}")
            bonus_desc = None
        # Normal price (was price)
        try:
            was_div = soup.find("div", class_="product-card-hero-price_wasContainer__wBs7t")
            if was_div:
                was_price_div = was_div.find("div", class_="price-amount_root__Sa88q price-amount_was__ecJVB product-card-hero-price_was__4jz19")
                normal_price = _parse_price_amount(was_price_div)
            # Fallback: if no was price, try hero price (non-bonus)
            if normal_price is None:
                hero_div = soup.find("div", class_="price-amount_root__Sa88q product-card-hero-price_now__zHsgu")
                if hero_div and 'price-amount_bonus__zeyvy' not in hero_div.get('class', []):
                    normal_price = _parse_price_amount(hero_div)
        except Exception as e:
            print(f"Normal price error for {url}: {e}")
            normal_price = None
        # Current price: bonus if available, else normal, else the first price element found
        if bonus_price is not None:
            price = bonus_price
        elif normal_price is not None:
            price = normal_price
        else:
            try:
                price = _parse_price_amount(soup.find("div", class_="price-amount_root__Sa88q"))
            except Exception as e:
                print(f"Fallback price error for {url}: {e}")
                price = None
        date = datetime.today().date().isoformat()
        # Smart label (e.g., 'vanaf')
        smart_label = None
        try:
            smart_label_p = soup.find("p", {"data-testhook": "product-smart-label"})
            if smart_label_p:
                smart_label = smart_label_p.get_text(strip=True)
        except Exception as e:
            print(f"Smart label error for {url}: {e}")
            smart_label = None
        return (fullname, unit, price, normal_price, bonus_price, date, bonus_desc, in_bonus, smart_label)
    except Exception as e:
        print(f"General error for {url}: {e}")
        return ('Error', 'Error', None, None, None, 'Error', None, False, None)

# Column names, in the same order as the fields of the tuple returned by
# fetch_product_info.
RESULT_COLUMNS = [
    'FullName', 'Unit', 'Latest price', 'Normal price', 'Bonus price',
    'Latest price date', 'Bonus description', 'In bonus', 'Smart label',
]

urls = scrape_df['URL'].tolist()

# Fetch all pages concurrently. executor.map yields results in input order,
# so results[i] always belongs to urls[i].
with ThreadPoolExecutor() as executor:
    results = list(executor.map(fetch_product_info, urls))

# Keep only results that have exactly one value per result column.
valid_indices = []
valid_results = []
for i, r in enumerate(results):
    if isinstance(r, (list, tuple)) and len(r) == len(RESULT_COLUMNS):
        valid_indices.append(i)
        valid_results.append(r)
    else:
        print(f"Skipping malformed result at index {i}: {r}")

results = valid_results

# Reduce the DataFrame to only the valid rows
scrape_df = scrape_df.iloc[valid_indices].reset_index(drop=True)

# Transpose rows into per-column value tuples. zip(*[]) yields nothing, so
# with no valid results fall back to one empty tuple per column instead of
# failing while unpacking.
column_values = list(zip(*results)) if results else [()] * len(RESULT_COLUMNS)
for column_name, values in zip(RESULT_COLUMNS, column_values):
    scrape_df[column_name] = list(values)

In [None]:
# Load history for price correction if needed
if os.path.exists('../meal_planning/Ingredients_history.xlsx'):
    history_df = pd.read_excel('../meal_planning/Ingredients_history.xlsx')

    def adjust_vanaf(row):
        """Return the price/bonus fields for `row`, corrected for 'vanaf' labels.

        Rows whose smart label does not contain 'vanaf' keep their values.
        For 'vanaf' rows the bonus and savings fields are cleared and
        'Latest price' becomes the 'Normal price' of the history row with the
        highest 'Latest price date' for the same URL that was not in bonus
        (None when no such row exists).
        """
        label_text = str(row.get('Smart label', '')).lower()
        if 'vanaf' not in label_text:
            # Nothing to correct: pass the current values through unchanged.
            return pd.Series({
                'Latest price': row['Latest price'],
                'Bonus price': row['Bonus price'],
                'In bonus': row['In bonus'],
                'Savings abs': row.get('Savings abs', None),
                'Savings %': row.get('Savings %', None)
            })

        # History rows for this URL that were recorded outside a bonus period.
        same_url = history_df['URL'] == row['URL']
        outside_bonus = history_df['In bonus'] == False
        candidates = history_df[same_url & outside_bonus]
        # Keep only rows that carry a usable normal price.
        usable = (
            candidates['Normal price'].notnull()
            & (candidates['Normal price'] != 'Not found')
            & (candidates['Normal price'] != 'Error')
        )
        candidates = candidates[usable]

        if candidates.empty:
            # No last known normal price, set blank
            replacement_price = None
        else:
            newest_first = candidates.sort_values('Latest price date', ascending=False)
            replacement_price = newest_first.iloc[0]['Normal price']

        return pd.Series({
            'Latest price': replacement_price,
            'Bonus price': None,
            'In bonus': False,
            'Savings abs': None,
            'Savings %': None
        })

    scrape_df[['Latest price', 'Bonus price', 'In bonus', 'Savings abs', 'Savings %']] = scrape_df.apply(adjust_vanaf, axis=1)

In [None]:
# Calculate savings columns
def parse_bonus_price(row):
    """Work out the per-item bonus price for one scraped row.

    Args:
        row: Mapping-like row (dict or pandas Series) with the keys
            'In bonus', 'Bonus price', 'Bonus description' and 'Normal price'.

    Returns:
        None when the row is not in bonus. Otherwise the scraped 'Bonus price'
        when it holds a usable value; else a price derived from the bonus
        description and the normal price:
        - '1+1 ... gratis'  -> half the normal price
        - '2e halve prijs'  -> 75% of the normal price
        - '2 [stuks] voor X' -> X / 2
        None when the normal price is unusable or no pattern matches.
    """
    # Only parse bonus price if in bonus is True
    if not row.get('In bonus', False):
        return None
    if row['Bonus price'] not in [None, 'Not found', 'Error'] and pd.notnull(row['Bonus price']):
        return row['Bonus price']
    desc = str(row.get('Bonus description', '')).lower()
    normal = row['Normal price']
    if normal in [None, 'Not found', 'Error'] or not pd.notnull(normal):
        return None
    # 1+1 gratis
    if '1+1' in desc and 'gratis' in desc:
        return round(normal / 2, 2)
    # 2e halve prijs
    if '2e halve prijs' in desc:
        return round(normal * 0.75, 2)
    # '2 voor' or '2 stuks voor', optionally followed by a euro sign (or 'e').
    # - The euro sign is written as \u20ac so the pattern survives any
    #   re-encoding of this file.
    # - (?<!\d) stops the pattern from matching the tail of a larger count
    #   such as '12 voor'.
    match = re.search(r'(?<!\d)2\s*(?:stuks)?\s*voor\s*[\u20ace]?\s*([\d,.]+)', desc)
    if match:
        try:
            # Drop sentence punctuation that directly follows the amount
            # ('5.00.' -> '5.00') before converting.
            amount_text = match.group(1).rstrip('.,').replace(',', '.')
            total = float(amount_text)
            return round(total / 2, 2)
        except Exception:
            return None
    return None

# Apply bonus price logic
scrape_df['Bonus price (parsed)'] = scrape_df.apply(parse_bonus_price, axis=1)


def _pick_final_bonus_price(row):
    """Prefer the parsed bonus price; otherwise keep the scraped 'Bonus price'."""
    parsed = row['Bonus price (parsed)']
    if parsed not in [None, 'Not found', 'Error'] and pd.notnull(parsed):
        return parsed
    return row['Bonus price']


def _savings_fraction(row):
    """Saving as a fraction of the normal price, or None when it cannot be computed."""
    if not row['In bonus']:
        return None
    if row['Normal price'] in [None, 0, 'Not found', 'Error']:
        return None
    if row['Bonus price final'] in [None, 0, 'Not found', 'Error']:
        return None
    if not row['Normal price'] > 0:
        return None
    return round(((row['Normal price'] - row['Bonus price final']) / row['Normal price']), 2)


def _savings_absolute(row):
    """Saving in currency units (normal minus bonus), or None when it cannot be computed."""
    if not row['In bonus']:
        return None
    if row['Normal price'] in [None, 'Not found', 'Error']:
        return None
    if row['Bonus price final'] in [None, 'Not found', 'Error']:
        return None
    return round((row['Normal price'] - row['Bonus price final']), 2)


# Use parsed bonus price if available, else original
scrape_df['Bonus price final'] = scrape_df.apply(_pick_final_bonus_price, axis=1)

# Calculate savings columns using the final bonus price; rows not in bonus get None
scrape_df['Savings %'] = scrape_df.apply(_savings_fraction, axis=1)
scrape_df['Savings abs'] = scrape_df.apply(_savings_absolute, axis=1)

# The two helper columns were only needed for the calculation above.
scrape_df.drop(columns=['Bonus price (parsed)','Bonus price final'], inplace=True)

print(scrape_df.head())
# Save the updated DataFrame (this path is the same workbook the notebook loads)
output_path = '../meal_planning/Ingredients.xlsx'
scrape_df.to_excel(output_path, index=False)

In [None]:
# --- History tracking ---
history_path = '../meal_planning/Ingredients_history.xlsx'
today = datetime.now()
scrape_date = today.date().isoformat()
scrape_df['Date'] = scrape_date
scrape_df['WeekNr'] = today.isocalendar()[1]


def _as_price(value):
    """Return `value` as a float, or None when it is not a usable price.

    None, the 'Not found' / 'Error' markers, other unparseable text and NaN
    all count as unusable. A decimal comma is accepted.
    """
    try:
        number = float(str(value).replace(',', '.'))
    except (TypeError, ValueError):
        return None
    # float('nan') parses fine but compares False to everything, which would
    # make a missing price look identical to an unchanged one.
    return None if pd.isna(number) else number


# Load history if exists, else create new
if os.path.exists(history_path):
    history_df = pd.read_excel(history_path)
    # Drop any rows with the same date as the current scrape to avoid duplicates.
    # Compare against the date string itself so an empty scrape_df cannot fail here.
    history_df = history_df[history_df['Date'] != scrape_date].copy()
    # Previous price per URL. GroupBy.last() takes the last non-null value of
    # each column within the group, in the stored row order.
    last_scrape = history_df.groupby('URL').last().reset_index()
    price_map = dict(zip(last_scrape['URL'], last_scrape['Latest price']))

    def get_change(row):
        """Classify the price movement of `row` against the previous scrape.

        Returns 'New' when the URL has no previous price, 'Error' when the
        previous or the current price is unusable, and otherwise 'Up',
        'Down' or 'Same'.
        """
        prev = price_map.get(row['URL'])
        if prev is None:
            return 'New'
        prev_val = _as_price(prev)
        curr_val = _as_price(row['Latest price'])
        if prev_val is None or curr_val is None:
            return 'Error'
        if curr_val > prev_val:
            return 'Up'
        if curr_val < prev_val:
            return 'Down'
        return 'Same'

    # Built row by row so that an empty scrape_df simply yields an empty column.
    scrape_df['Change'] = [get_change(row) for _, row in scrape_df.iterrows()]
    # Drop any all-NA columns from scrape_df before concatenation
    scrape_df = scrape_df.dropna(axis=1, how='all')
    history_df = pd.concat([history_df, scrape_df], ignore_index=True)
else:
    scrape_df['Change'] = 'New'
    history_df = scrape_df.copy()

history_df.to_excel(history_path, index=False)
print('History updated:', history_path)