# LEGO Nowości Scraper 2026

Nowoczesny scraper nowości LEGO z użyciem Playwright.
Działa na Google Colab i lokalnie.

In [None]:
# Installation - run this cell once per environment.
# Lines starting with "!" are shell commands executed by the notebook
# (IPython shell escapes), not Python statements.
# 1) Python packages used by the cells below.
!pip install -q playwright pandas nest_asyncio matplotlib
# 2) The Chromium build that Playwright drives.
!playwright install chromium
# 3) System-level dependencies Chromium needs.
!playwright install-deps chromium
print("Instalacja zakonczona!")

In [None]:
import asyncio
import pandas as pd
import re
from datetime import datetime
from playwright.async_api import async_playwright

# Selectors tried, in order, to dismiss the age-gate overlay.
_AGE_GATE_SELECTORS = (
    "button[data-test='age-gate-grown-up-cta']",
    "#age-gate-grown-up-cta",
    "button:has-text('18+')",
    "button:has-text('dorosły')",
    "[data-test='age-gate-grown-up-cta']",
)

# Selectors tried, in order, to accept the cookie banner.
_COOKIE_SELECTORS = (
    "button[data-test='cookie-accept-all']",
    "button:has-text('Akceptuj wszystkie')",
    "[id*='cookie'] button:has-text('Akceptuj')",
    "button:has-text('Accept All')",
)

# Selectors whose match counts are printed in debug mode.
_DEBUG_SELECTORS = (
    '[data-test="product-item"]',
    '[data-test="product-leaf"]',
    'article[class*="Product"]',
    'li[class*="Product"]',
    'a[href*="/product/"]',
    '[data-test*="product"]',
    'a[href*="/pl-pl/product/"]',
)

# Lower-case fragments marking a text line as UI chrome rather than a product
# name (used by the plain-text fallback parser).
_TEXT_SKIP_WORDS = (
    'zł', 'koszyk', 'dostępn', 'przedsprze', 'nowość',
    'wkrótce', 'niedostępn', 'dodaj', 'wishlist', 'więcej',
    'zobacz', 'smart play', 'polecane', 'sortuj', 'filtru',
)

# (lower-case marker, label) pairs checked in order; first hit wins.
_AVAILABILITY_MARKERS = (
    ("dodaj do koszyka", "Dostepny"),
    ("niedostępn", "Niedostepny"),
    ("przedsprzedaż", "Przedsprzedaz"),
    ("wkrótce", "Wkrotce dostepne"),
)

# A price such as "1 299,99 zł". Thousands separators are limited to
# horizontal spaces (space, NBSP, narrow NBSP) so that a number on the
# previous line (e.g. a piece count) is never glued onto the price.
_TEXT_PRICE_RE = re.compile(r'([0-9][0-9 \xa0\u202f]*[,\.][0-9]+)\s*zł')

# An age label such as "9+". A trailing \b would be wrong here: "+" is not a
# word character, so \b after it only matches when a word character follows.
_TEXT_AGE_RE = re.compile(r'\b([0-9]+)\+(?!\w)')

_TEXT_PIECES_RE = re.compile(r'\b([0-9]{3,5})\b')
_TEXT_RATING_RE = re.compile(r'\b([1-5][,\.][0-9])\b')
_NUMERIC_LINE_RE = re.compile(r'^[0-9]+[\+,\.]?[0-9]*$')

# JavaScript run in the page (method 1). For every distinct /product/ link it
# walks up the DOM looking for the enclosing product card and extracts name,
# price, age, piece count, rating and availability from the card's text.
_EXTRACT_PRODUCTS_JS = """
() => {
    const products = [];
    const seen = new Set();

    const allLinks = document.querySelectorAll('a[href*="/product/"]');

    allLinks.forEach(link => {
        const href = link.href;
        if (!href || seen.has(href)) return;
        if (!href.includes('/product/')) return;

        let container = link;
        for (let i = 0; i < 15; i++) {
            if (!container.parentElement) break;
            container = container.parentElement;

            const text = container.innerText || "";
            const hasPrice = /[0-9]+[,\\.][0-9]+\\s*zł/i.test(text);
            const hasAge = /[0-9]+\\+/.test(text);
            const childProducts = container.querySelectorAll('a[href*="/product/"]');

            if (hasPrice && hasAge && childProducts.length <= 1) {
                break;
            }
            if (childProducts.length > 1) {
                container = link.parentElement?.parentElement?.parentElement?.parentElement || link;
                break;
            }
        }

        seen.add(href);

        const text = container.innerText || "";
        const lines = text.split("\\n").map(l => l.trim()).filter(l => l && l.length < 100);

        let nazwa = "", cena = "", wiek = "", elementy = "", ocena = "";

        // Cena
        for (const line of lines) {
            if (/^[0-9\\s]+[,\\.][0-9]+\\s*zł$/i.test(line.replace(/\\s/g, ''))) {
                cena = line; break;
            }
        }
        if (!cena) {
            const m = text.match(/([0-9][0-9\\s]*[,\\.][0-9]+)\\s*zł/);
            if (m) cena = m[0].trim();
        }

        // Wiek
        for (const line of lines) {
            if (/^[0-9]+\\+$/.test(line.trim())) { wiek = line.trim(); break; }
        }

        // Elementy
        for (const line of lines) {
            if (/^[0-9]{3,5}$/.test(line.trim())) { elementy = line.trim(); break; }
        }

        // Ocena
        for (const line of lines) {
            if (/^[1-5][,\\.][0-9]$/.test(line.trim())) { ocena = line.trim(); break; }
        }

        // Nazwa
        const skip = ['zł', 'koszyk', 'dostępn', 'przedsprze', 'nowość', 'wkrótce', 'niedostępn', 'dodaj', 'wishlist', 'więcej', 'zobacz'];
        for (const line of lines) {
            const t = line.trim();
            if (t.length < 4 || t.length > 80) continue;
            if (/^[0-9]+[\\+,\\.]?[0-9]*$/.test(t)) continue;
            if (t.includes('zł')) continue;
            let ok = true;
            for (const w of skip) { if (t.toLowerCase().includes(w)) { ok = false; break; } }
            if (ok) { nazwa = t; break; }
        }

        // Dostepnosc
        let dost = "Nieznana";
        const tl = text.toLowerCase();
        if (tl.includes("dodaj do koszyka")) dost = "Dostepny";
        else if (tl.includes("niedostępn")) dost = "Niedostepny";
        else if (tl.includes("przedsprzedaż")) dost = "Przedsprzedaz";
        else if (tl.includes("wkrótce")) dost = "Wkrotce dostepne";

        if (nazwa || cena) {
            products.push({nazwa: nazwa || "Nieznana", cena, wiek, elementy, ocena, dostepnosc: dost, url: href});
        }
    });

    return products;
}
"""


def _detect_availability(text):
    """Map free text to an availability label; "Nieznana" when nothing matches."""
    lowered = text.lower()
    for marker, label in _AVAILABILITY_MARKERS:
        if marker in lowered:
            return label
    return "Nieznana"


def _guess_name(lines):
    """Return the last line that looks like a product name, or "" if none does."""
    for line in reversed(lines):
        candidate = line.strip()
        if len(candidate) < 4 or len(candidate) > 80:
            continue
        if _NUMERIC_LINE_RE.match(candidate):
            continue
        lowered = candidate.lower()
        if any(word in lowered for word in _TEXT_SKIP_WORDS):
            continue
        return candidate
    return ""


def _products_from_price_matches(page_text, price_matches):
    """Build product dicts from price matches found in the page's plain text.

    For each price the 500 characters preceding it are inspected for a name,
    age label, piece count, rating and availability. Only the first product
    seen for a given name is kept.
    """
    from urllib.parse import quote_plus

    found = []
    seen_names = set()
    for price_match in price_matches:
        window_start = max(0, price_match.start() - 500)
        snippet = page_text[window_start:price_match.end()]
        lines = [ln.strip() for ln in snippet.split('\n') if ln.strip()]

        # The last line holds the price itself, so it cannot be the name.
        nazwa = _guess_name(lines[:-1])
        if not nazwa or nazwa in seen_names:
            continue
        seen_names.add(nazwa)

        # NOTE(review): these take the FIRST hit in the window, which may
        # belong to a neighbouring product - heuristic kept from the original.
        age_match = _TEXT_AGE_RE.search(snippet)
        wiek = age_match.group(0) if age_match else ""

        elementy = ""
        pieces_match = _TEXT_PIECES_RE.search(snippet)
        price_digits = re.sub(r'[\s,\.]', '', price_match.group(1))
        if pieces_match and pieces_match.group(1) != price_digits:
            elementy = pieces_match.group(1)

        rating_match = _TEXT_RATING_RE.search(snippet)
        ocena = rating_match.group(1) if rating_match else ""

        found.append({
            'nazwa': nazwa,
            'cena': price_match.group(0),
            'wiek': wiek,
            'elementy': elementy,
            'ocena': ocena,
            'dostepnosc': _detect_availability(snippet),
            # No product link is known here; fall back to a search URL.
            # quote_plus keeps names containing &, # or non-ASCII valid.
            'url': f"https://www.lego.com/pl-pl/search?q={quote_plus(nazwa)}",
        })
    return found


def _parse_price(raw):
    """Convert a price string such as "1 299,99 zł" to a float, or None."""
    if not raw:
        return None
    compact = re.sub(r'\s+', '', raw)  # \s also covers NBSP / narrow NBSP
    match = re.search(r'[0-9]+(?:[.,][0-9]+)*', compact)
    if not match:
        return None
    number = match.group(0)
    if ',' in number:
        # Comma is the decimal separator, so any dots are thousands separators.
        number = number.replace('.', '').replace(',', '.')
    try:
        return float(number)
    except ValueError:
        return None


def _to_int(raw):
    """Return int(raw), or None when raw is empty or not an integer."""
    if not raw:
        return None
    try:
        return int(raw)
    except (TypeError, ValueError):
        return None


def _to_float(raw):
    """Return raw (decimal comma allowed) as float, or None when not parseable."""
    if not raw:
        return None
    try:
        return float(str(raw).replace(",", "."))
    except ValueError:
        return None


async def _dismiss_overlay(page, selectors, label, click_timeout, settle_ms):
    """Click the first visible element matching one of *selectors*, if any."""
    for selector in selectors:
        try:
            button = page.locator(selector)
            if await button.count() > 0 and await button.first.is_visible():
                await button.first.click(timeout=click_timeout)
                print(f"  Zamknieto {label}: {selector}")
                await page.wait_for_timeout(settle_ms)
                return
        except Exception:
            # Best effort: a selector that fails just means "try the next one".
            # `Exception` (not a bare except) lets Ctrl+C / task cancellation
            # propagate.
            continue


async def _print_debug_info(page):
    """Print selector match counts, URL, title and whether prices are visible."""
    print("\n=== DEBUG ===")
    for sel in _DEBUG_SELECTORS:
        try:
            count = await page.locator(sel).count()
        except Exception:
            print(f"{sel}: ERROR")
            continue
        marker = " <<<" if count > 0 else ""
        print(f"{sel}: {count}{marker}")

    print(f"\nURL: {page.url}")
    print(f"Tytul: {await page.title()}")

    page_text = await page.evaluate("document.body.innerText")
    has_prices = bool(re.search(r'\d+[,\.]\d+\s*zł', page_text))
    print(f"Strona zawiera ceny (zl): {has_prices}")
    print("=== END DEBUG ===\n")


async def _dump_debug_artifacts(page):
    """Write a screenshot, the HTML and the visible text to the working dir."""
    await page.screenshot(path="debug_screenshot.png", full_page=True)
    print("  Zapisano: debug_screenshot.png")

    html = await page.content()
    with open("debug_page.html", "w", encoding="utf-8") as f:
        f.write(html)
    print("  Zapisano: debug_page.html")

    page_text = await page.evaluate("document.body.innerText")
    with open("debug_text.txt", "w", encoding="utf-8") as f:
        f.write(page_text)
    print("  Zapisano: debug_text.txt")


async def _collect_raw_products(browser, debug):
    """Open the new-products page in *browser* and return raw product dicts."""
    browser_context = await browser.new_context(
        viewport={"width": 1920, "height": 1080},
        locale="pl-PL",
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        java_script_enabled=True
    )

    # Hide the navigator.webdriver flag from page scripts.
    await browser_context.add_init_script("""
        Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
    """)

    page = await browser_context.new_page()

    print("="*50)
    print("LEGO SCRAPER 2026")
    print("="*50)
    print("Otwieram LEGO...")

    response = await page.goto(
        "https://www.lego.com/pl-pl/categories/new-sets-and-products",
        timeout=60000,
        wait_until="domcontentloaded"
    )

    if debug:
        # goto() may return None, so do not dereference it blindly.
        status = response.status if response is not None else None
        print(f"Status odpowiedzi: {status}")

    # Give the page time to render.
    await page.wait_for_timeout(5000)

    print("Sprawdzam popupy...")
    await _dismiss_overlay(page, _AGE_GATE_SELECTORS, "age gate",
                           click_timeout=5000, settle_ms=3000)
    await _dismiss_overlay(page, _COOKIE_SELECTORS, "cookies",
                           click_timeout=3000, settle_ms=2000)

    print("Czekam na zaladowanie produktow...")
    await page.wait_for_timeout(5000)

    try:
        await page.wait_for_selector('a[href*="/product/"]', timeout=15000)
        print("  Znaleziono linki do produktow")
    except Exception:
        print("  Nie znaleziono linkow /product/, czekam dluzej...")
        await page.wait_for_timeout(10000)

    # Scroll to the bottom repeatedly so lazily loaded products appear; stop
    # as soon as the page height no longer grows.
    print("Laduje produkty przez scrollowanie...")
    prev_height = 0
    for _ in range(12):
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(2500)
        curr_height = await page.evaluate("document.body.scrollHeight")
        if curr_height == prev_height:
            break
        prev_height = curr_height

    # Back to the top.
    await page.evaluate("window.scrollTo(0, 0)")
    await page.wait_for_timeout(2000)

    if debug:
        await _print_debug_info(page)

    # METHOD 1: extract from the DOM around /product/ links.
    print("Pobieram dane produktow...")
    products = await page.evaluate(_EXTRACT_PRODUCTS_JS)
    print(f"  Metoda 1 (linki): {len(products)} produktow")

    # METHOD 2: nothing found - parse the page's visible text around prices.
    if not products:
        print("Probuje metode 2 (parsowanie tekstu)...")
        page_text = await page.evaluate("document.body.innerText")
        price_matches = list(_TEXT_PRICE_RE.finditer(page_text))
        if price_matches:
            print(f"  Znaleziono {len(price_matches)} cen na stronie")
            products = _products_from_price_matches(page_text, price_matches)
            print(f"  Metoda 2 (tekst): {len(products)} produktow")

    # METHOD 3: still nothing - save artifacts for manual inspection.
    if not products:
        print("\nNie znaleziono produktow. Zapisuje dane debugowania...")
        await _dump_debug_artifacts(page)

    return products


async def scrape_lego_nowosci(debug=False):
    """Scrape the "new sets and products" listing of the Polish LEGO shop.

    Launches headless Chromium via Playwright, dismisses the age gate and
    cookie banner when present, scrolls to trigger lazy loading and extracts
    products from the DOM. If that yields nothing, the page's visible text is
    parsed instead; if that also yields nothing, a screenshot, the HTML and
    the text are written to the working directory for inspection.

    Args:
        debug: When true, print the response status, selector match counts,
            the final URL/title and whether any price is visible.

    Returns:
        A list of dicts, de-duplicated by URL (or by name when the URL is
        empty), with keys ``nazwa``, ``cena``, ``wiek``, ``dostepnosc`` and
        ``url`` (strings), ``elementy`` (int or None), ``ocena`` (float or
        None) and ``cena_zl`` (float or None).
    """
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(
            headless=True,
            args=[
                '--disable-blink-features=AutomationControlled',
                '--disable-dev-shm-usage',
                '--no-sandbox'
            ]
        )
        try:
            products = await _collect_raw_products(browser, debug)
        finally:
            # Close the browser even when scraping raises.
            await browser.close()

    # Drop duplicates, keeping the first occurrence.
    unique = {}
    for product in products:
        key = product.get('url', '') or product.get('nazwa', '')
        if key and key not in unique:
            unique[key] = product
    products = list(unique.values())

    print(f"\nKoncowy wynik: {len(products)} produktow")

    # Convert the textual fields to numbers (None when not parseable).
    for product in products:
        product["cena_zl"] = _parse_price(product.get("cena"))
        product["elementy"] = _to_int(product.get("elementy"))
        product["ocena"] = _to_float(product.get("ocena"))

    return products

print("Gotowe! Uruchom nastepna komorke.")

In [None]:
# RUN THE SCRAPER
# Pass debug=True to print detailed diagnostics.

# Jupyter/Colab already run an event loop; nest_asyncio patches asyncio so
# that it can be re-entered from inside that loop.
import nest_asyncio
nest_asyncio.apply()

# asyncio.run is the documented high-level entry point and replaces the
# deprecated get_event_loop().run_until_complete(...) pattern.
produkty = asyncio.run(scrape_lego_nowosci(debug=True))

print(f"\n{'='*50}")
print(f">>> POBRANO: {len(produkty)} produktow <<<")
print(f"{'='*50}")

if not produkty:
    print("Brak produktow - sprawdz debug powyzej")
else:
    print("\nPrzykladowe produkty:")
    for p in produkty[:5]:
        print(f"  - {p['nazwa'][:50]} | {p['cena']} | {p['dostepnosc']}")

In [None]:
# Build the DataFrame.
# The columns are listed explicitly (in the same order the scraper produces
# them) so the frame has them even when nothing was scraped; otherwise every
# later cell that reads df['cena_zl'] etc. would raise KeyError.
KOLUMNY = ['nazwa', 'cena', 'wiek', 'elementy', 'ocena', 'dostepnosc', 'url', 'cena_zl']
df = pd.DataFrame(produkty, columns=KOLUMNY)
df['data_pobrania'] = datetime.now().strftime('%Y-%m-%d %H:%M')

print(f"Produktow: {len(df)}")
df

In [None]:
# STATISTICS
print("\n" + "="*40)
print("STATYSTYKI")
print("="*40)

# Each section checks that its column exists first, so an empty scrape
# (a DataFrame without columns) prints nothing instead of raising KeyError.
if 'cena_zl' in df.columns and df['cena_zl'].notna().any():
    ceny = df['cena_zl']
    print("\nCENY:")
    print(f"  Najtanszy: {ceny.min():.2f} zl")
    print(f"  Najdrozszy: {ceny.max():.2f} zl")
    print(f"  Srednia: {ceny.mean():.2f} zl")

if 'elementy' in df.columns and df['elementy'].notna().any():
    elementy = df['elementy']
    print("\nELEMENTY:")
    # Missing values turn the column into floats; show whole piece counts.
    print(f"  Min: {int(elementy.min())}")
    print(f"  Max: {int(elementy.max())}")

if 'dostepnosc' in df.columns:
    print("\nDOSTEPNOSC:")
    print(df['dostepnosc'].value_counts().to_string())

In [None]:
# SAVE TO CSV
# The file name carries a minute-resolution timestamp; the utf-8-sig codec
# prepends a byte-order mark to the file.
plik = f"lego_nowosci_{datetime.now().strftime('%Y%m%d_%H%M')}.csv"
df.to_csv(plik, index=False, encoding='utf-8-sig')
print(f"Zapisano: {plik}")

# Offer the file as a browser download when running on Google Colab.
try:
    from google.colab import files
except ImportError:
    # Not on Colab - the file stays in the working directory.
    files = None

if files is not None:
    try:
        files.download(plik)
    except Exception as exc:
        # Still best effort, but report the problem instead of hiding it.
        print(f"Nie udalo sie pobrac pliku w Colab: {exc}")

In [None]:
# TOP 10 MOST EXPENSIVE
print("\nTOP 10 NAJDROZSZYCH:")
print("-"*40)
if 'cena_zl' in df.columns and df['cena_zl'].notna().any():
    # Rows without a parsed price cannot be ranked, so leave them out.
    wycenione = df.dropna(subset=['cena_zl'])
    top10 = wycenione.nlargest(10, 'cena_zl')[['nazwa', 'cena', 'elementy']]
    for _, row in top10.iterrows():
        # Piece counts may be missing (NaN) or stored as floats.
        elementy = '-' if pd.isna(row['elementy']) else int(row['elementy'])
        print(f"{row['nazwa'][:40]:40} | {row['cena']:>12} | {elementy} elem.")
else:
    print("Brak danych o cenach.")

In [None]:
# PRICE CHARTS
import matplotlib.pyplot as plt

# One fixed colour per availability status, so that a colour always means the
# same status no matter how frequent it is in this run.
KOLORY_DOSTEPNOSCI = {
    'Dostepny': '#4CAF50',
    'Niedostepny': '#f44336',
    'Przedsprzedaz': '#FF9800',
    'Wkrotce dostepne': '#2196F3',
    'Nieznana': '#9E9E9E',
}

if 'cena_zl' in df.columns and df['cena_zl'].notna().any():
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left: histogram of prices.
    axes[0].hist(df['cena_zl'].dropna(), bins=25, color='#FFD700', edgecolor='black')
    axes[0].set_xlabel('Cena (zl)')
    axes[0].set_ylabel('Liczba produktow')
    axes[0].set_title('Rozklad cen nowosci LEGO')
    axes[0].grid(True, alpha=0.3)

    # Right: share of each availability status.
    liczniki = df['dostepnosc'].value_counts()
    kolory = [KOLORY_DOSTEPNOSCI.get(status, '#9E9E9E') for status in liczniki.index]
    liczniki.plot(kind='pie', ax=axes[1], autopct='%1.1f%%', colors=kolory)
    axes[1].set_title('Dostepnosc produktow')
    axes[1].set_ylabel('')

    plt.tight_layout()
    plt.show()