# LEGO Nowości Scraper 2026

Nowoczesny scraper nowości LEGO z użyciem Playwright.
Działa na Google Colab i lokalnie.

In [None]:
# Installation - run this cell once per environment
!pip install -q playwright pandas nest_asyncio matplotlib
!playwright install chromium
!playwright install-deps chromium

# IMPORTANT: install playwright-stealth so the automated browser is harder to detect
!pip install -q playwright-stealth

print("Instalacja zakonczona!")

In [None]:
import asyncio
import pandas as pd
import re
from datetime import datetime
from playwright.async_api import async_playwright

# Try to load playwright-stealth. Whichever release is installed, the rest of
# the notebook only relies on two names: the HAS_STEALTH flag and an awaitable
# ``stealth_async(page)``.
try:
    # playwright-stealth 1.x exposes the coroutine directly.
    from playwright_stealth import stealth_async
    HAS_STEALTH = True
except ImportError:
    try:
        # Newer releases expose a Stealth class instead of the function;
        # wrap it so callers keep using ``await stealth_async(page)``.
        from playwright_stealth import Stealth

        async def stealth_async(page):
            """Apply the default playwright-stealth evasions to *page*."""
            await Stealth().apply_stealth_async(page)

        HAS_STEALTH = True
    except ImportError:
        # Package missing (or unknown API): the scraper uses its manual fallback.
        HAS_STEALTH = False

if HAS_STEALTH:
    print("playwright-stealth zaladowany")
else:
    print("UWAGA: playwright-stealth nie jest zainstalowany - scraper moze byc wykryty")

# Listing page with new LEGO sets (Polish storefront).
_LISTING_URL = "https://www.lego.com/pl-pl/categories/new-sets-and-products"

# Chromium switches passed at launch.
# The original list also carried '--ignore-certifcate-errors' and
# '--ignore-certifcate-errors-spki-list'. Both are misspelled ("certifcate"),
# so Chromium never recognised them. They are dropped rather than corrected,
# because the corrected switches would disable TLS certificate validation.
_CHROMIUM_ARGS = [
    '--disable-blink-features=AutomationControlled',
    '--disable-dev-shm-usage',
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-infobars',
    '--window-position=0,0',
]

# Init script used when playwright-stealth is unavailable: hides
# navigator.webdriver and fakes plugins, languages and window.chrome.
_WEBDRIVER_PATCH_JS = """
                Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
                Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
                Object.defineProperty(navigator, 'languages', {get: () => ['pl-PL', 'pl', 'en-US', 'en']});
                window.chrome = {runtime: {}};
            """

# Click snippets tried in order to dismiss the age gate.
_AGE_GATE_SCRIPTS = [
    "document.querySelector('[data-test=\"age-gate-grown-up-cta\"]')?.click()",
    "document.querySelector('#age-gate-grown-up-cta')?.click()",
    "document.querySelector('button[class*=\"AgeGate\"]')?.click()",
    "[...document.querySelectorAll('button')].find(b => b.innerText.includes('18+'))?.click()",
]

# Click snippets tried in order to accept the cookie banner.
_COOKIE_SCRIPTS = [
    "document.querySelector('[data-test=\"cookie-accept-all\"]')?.click()",
    "document.querySelector('button[class*=\"cookie\"]')?.click()",
    "[...document.querySelectorAll('button')].find(b => b.innerText.includes('Akceptuj'))?.click()",
]

# Page-side snippet returning basic facts about the loaded document (debug only).
_DEBUG_INFO_JS = """
                () => {
                    return {
                        url: window.location.href,
                        title: document.title,
                        bodyLength: document.body.innerHTML.length,
                        allLinks: document.querySelectorAll('a').length,
                        productLinks: document.querySelectorAll('a[href*="/product/"]').length,
                        hasPrice: /[0-9]+[,\\.][0-9]+\\s*zł/.test(document.body.innerText),
                        hasPriceCount: (document.body.innerText.match(/[0-9]+[,\\.][0-9]+\\s*zł/g) || []).length,
                    }
                }
            """

# CSS selectors whose match counts are printed in debug mode.
_DEBUG_SELECTORS = [
    '[data-test="product-item"]',
    '[data-test="product-leaf"]',
    'a[href*="/product/"]',
    'article',
    'li[class*="Product"]',
]

# Page-side extraction: for every distinct /product/ link, climb the DOM to a
# container holding a price and an age label, then pull the fields from the
# container's visible text lines.
_EXTRACT_PRODUCTS_JS = """
            () => {
                const products = [];
                const seenUrls = new Set();

                // Znajdz wszystkie linki do produktow
                const productLinks = document.querySelectorAll('a[href*="/product/"]');

                productLinks.forEach(link => {
                    const href = link.href;

                    // Sprawdz czy to link do produktu i czy go juz nie mamy
                    if (!href || !href.includes('/product/') || seenUrls.has(href)) return;
                    seenUrls.add(href);

                    // Znajdz kontener produktu (idz w gore DOM)
                    let container = link;
                    for (let i = 0; i < 10; i++) {
                        if (!container.parentElement) break;
                        container = container.parentElement;

                        const text = container.innerText || "";
                        const hasPrice = /[0-9]+[,\\.][0-9]+\\s*zł/i.test(text);
                        const hasAge = /[0-9]+\\+/.test(text);

                        // Sprawdz czy mamy dobry kontener (cena + wiek, ale nie za duzo produktow)
                        const childLinks = container.querySelectorAll('a[href*="/product/"]').length;
                        if (hasPrice && hasAge && childLinks === 1) {
                            break;
                        }
                        // Jesli za duzo produktow, cofnij sie
                        if (childLinks > 2) {
                            container = link.parentElement?.parentElement?.parentElement || link;
                            break;
                        }
                    }

                    const text = container.innerText || "";
                    const lines = text.split('\\n').map(l => l.trim()).filter(l => l.length > 0 && l.length < 100);

                    // Parsowanie danych
                    let nazwa = '', cena = '', wiek = '', elementy = '', ocena = '';

                    // Cena
                    for (const line of lines) {
                        if (/[0-9]+[,\\.][0-9]+\\s*zł$/i.test(line)) {
                            cena = line;
                            break;
                        }
                    }

                    // Wiek (np. "18+", "12+")
                    for (const line of lines) {
                        if (/^[0-9]+\\+$/.test(line)) {
                            wiek = line;
                            break;
                        }
                    }

                    // Elementy (liczba 3-5 cyfrowa)
                    for (const line of lines) {
                        if (/^[0-9]{3,5}$/.test(line)) {
                            elementy = line;
                            break;
                        }
                    }

                    // Ocena (np. "4.5", "3,8")
                    for (const line of lines) {
                        if (/^[1-5][,\\.][0-9]$/.test(line)) {
                            ocena = line;
                            break;
                        }
                    }

                    // Nazwa (pierwsza linia ktora nie jest cena/wiekiem/etc)
                    const skipWords = ['zł', 'koszyk', 'dostępn', 'przedsprze', 'nowość', 
                                       'wkrótce', 'niedostępn', 'dodaj', 'wishlist', 'więcej', 
                                       'zobacz', 'smart play'];
                    for (const line of lines) {
                        if (line.length < 4 || line.length > 80) continue;
                        if (/^[0-9]+[\\+,\\.]?[0-9]*$/.test(line)) continue;
                        if (line.includes('zł')) continue;

                        let skip = false;
                        for (const w of skipWords) {
                            if (line.toLowerCase().includes(w)) {
                                skip = true;
                                break;
                            }
                        }
                        if (!skip) {
                            nazwa = line;
                            break;
                        }
                    }

                    // Dostepnosc
                    let dostepnosc = 'Nieznana';
                    const textLower = text.toLowerCase();
                    if (textLower.includes('dodaj do koszyka')) dostepnosc = 'Dostepny';
                    else if (textLower.includes('niedostępn')) dostepnosc = 'Niedostepny';
                    else if (textLower.includes('przedsprzedaż')) dostepnosc = 'Przedsprzedaz';
                    else if (textLower.includes('wkrótce')) dostepnosc = 'Wkrotce';

                    if (nazwa || cena) {
                        products.push({
                            nazwa: nazwa || 'Nieznana nazwa',
                            cena: cena,
                            wiek: wiek,
                            elementy: elementy,
                            ocena: ocena,
                            dostepnosc: dostepnosc,
                            url: href
                        });
                    }
                });

                return products;
            }
        """


async def _apply_stealth(page):
    """Apply playwright-stealth to *page*, or a manual init script without it."""
    if HAS_STEALTH:
        await stealth_async(page)
    else:
        await page.add_init_script(_WEBDRIVER_PATCH_JS)


async def _load_listing(page, debug):
    """Open the listing page; return True when scraping can continue."""
    try:
        response = await page.goto(
            _LISTING_URL,
            timeout=60000,
            wait_until="networkidle",  # wait until network traffic settles
        )
    except Exception as e:
        # Navigation boundary: any failure here (timeout, DNS, closed page)
        # means there is nothing to scrape, so report it and give up.
        print(f"BLAD przy ladowaniu strony: {e}")
        return False

    # page.goto() may return None; do not dereference it blindly.
    status = response.status if response is not None else None

    if debug:
        print(f"Status HTTP: {status}")
        print(f"URL: {page.url}")

    if status == 403:
        print("BLAD: Strona zwrocila 403 Forbidden - wykryto bota")
        print("Sprobuj uruchomic z headless=False lub na innej maszynie")
        return False
    return True


async def _run_click_scripts(page, scripts, pause_ms, debug):
    """Run each click snippet best-effort, pausing *pause_ms* after each success."""
    for script in scripts:
        try:
            await page.evaluate(script)
            await page.wait_for_timeout(pause_ms)
        except Exception as e:
            # A failing snippet must not abort the scrape. Catching Exception
            # (not a bare except) lets task cancellation and Ctrl+C propagate.
            if debug:
                print(f"  Pominieto skrypt popupu: {e}")


async def _scroll_until_stable(page, debug):
    """Scroll to the bottom until the page height stops changing (max 15 rounds)."""
    prev_height = 0
    stable_count = 0
    for i in range(15):
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(2000)

        curr_height = await page.evaluate("document.body.scrollHeight")

        if debug:
            # The product count is only reported, so query it only in debug mode.
            product_count = await page.evaluate(
                "document.querySelectorAll('a[href*=\"/product/\"]').length"
            )
            print(f"  Scroll {i+1}: wysokosc={curr_height}, produkty={product_count}")

        if curr_height == prev_height:
            stable_count += 1
            # Two consecutive unchanged heights: treat the list as fully loaded.
            if stable_count >= 2:
                break
        else:
            stable_count = 0

        prev_height = curr_height


async def _print_debug_info(page):
    """Print page facts and selector match counts for troubleshooting."""
    print("\n" + "="*40)
    print("DEBUG INFO")
    print("="*40)

    info = await page.evaluate(_DEBUG_INFO_JS)

    print(f"URL: {info['url']}")
    print(f"Tytul: {info['title']}")
    print(f"HTML body: {info['bodyLength']} znakow")
    print(f"Wszystkie linki: {info['allLinks']}")
    print(f"Linki /product/: {info['productLinks']}")
    print(f"Strona ma ceny: {info['hasPrice']} ({info['hasPriceCount']} znalezionych)")

    print("\nSelektory:")
    for sel in _DEBUG_SELECTORS:
        count = await page.evaluate(f"document.querySelectorAll('{sel}').length")
        marker = " <<<" if count > 0 else ""
        print(f"  {sel}: {count}{marker}")

    print("="*40 + "\n")


async def _save_diagnostics(page):
    """Write a screenshot, the HTML and the visible text to debug_* files."""
    print("\nBrak produktow! Zapisuje dane diagnostyczne...")

    await page.screenshot(path="debug_screenshot.png", full_page=True)
    print("  -> debug_screenshot.png")

    html = await page.content()
    with open("debug_page.html", "w", encoding="utf-8") as f:
        f.write(html)
    print("  -> debug_page.html")

    text = await page.evaluate("document.body.innerText")
    with open("debug_text.txt", "w", encoding="utf-8") as f:
        f.write(text)
    print("  -> debug_text.txt")


def _parse_price(raw):
    """Return the first number in a price label as float, or None."""
    if not raw:
        return None
    # \s also covers no-break and narrow no-break spaces, which can be used as
    # thousands separators; leaving one in would truncate "1 299,99" to 1.
    compact = re.sub(r'\s+', '', raw)
    match = re.search(r'([0-9]+[,.]?[0-9]*)', compact)
    return float(match.group(1).replace(',', '.')) if match else None


def _to_number(raw, cast):
    """Convert a scraped string with *cast*; None when empty or unparsable."""
    if not raw:
        return None
    try:
        return cast(raw)
    except ValueError:
        return None


def _postprocess_products(products):
    """Drop duplicate URLs (query string ignored) and add numeric fields."""
    unique = {}
    for product in products:
        url = product['url'].split('?')[0]  # ignore URL parameters
        unique.setdefault(url, product)  # keep the first occurrence
    products = list(unique.values())

    print(f"  Po dedupliccji: {len(products)} produktow")

    for product in products:
        product['cena_zl'] = _parse_price(product['cena'])
        product['elementy'] = _to_number(product['elementy'], int)
        product['ocena'] = _to_number((product['ocena'] or '').replace(',', '.'), float)
    return products


async def scrape_lego_nowosci(debug=False):
    """
    Scrape the LEGO "new sets" listing with a headless Chromium browser.

    The page is opened with anti-detection settings, the age gate and cookie
    banner are dismissed best-effort, the page is scrolled to load lazy
    content, and product data is extracted from the DOM.

    Parameters:
        debug: True prints detailed diagnostic information (HTTP status,
            scroll progress, selector counts); False keeps output short.

    Returns:
        A list of dicts with keys ``nazwa``, ``cena``, ``wiek``, ``elementy``
        (int or None), ``ocena`` (float or None), ``dostepnosc``, ``url`` and
        ``cena_zl`` (float or None). An empty list is returned when the page
        cannot be loaded or answers with HTTP 403. When no products are found,
        debug_screenshot.png, debug_page.html and debug_text.txt are written
        to the working directory.
    """
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True, args=_CHROMIUM_ARGS)
        try:
            context = await browser.new_context(
                viewport={"width": 1920, "height": 1080},
                locale="pl-PL",
                timezone_id="Europe/Warsaw",
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
                java_script_enabled=True,
                has_touch=False,
                is_mobile=False,
                color_scheme="light",
            )
            page = await context.new_page()
            await _apply_stealth(page)

            print("="*50)
            print("LEGO SCRAPER 2026")
            print("="*50)
            print("Otwieram LEGO...")

            if not await _load_listing(page, debug):
                return []

            # Give the page time to finish rendering.
            await page.wait_for_timeout(5000)

            print("Zamykam popupy...")
            # Age gate first - LEGO asks visitors to confirm their age.
            await _run_click_scripts(page, _AGE_GATE_SCRIPTS, 1500, debug)
            await page.wait_for_timeout(2000)
            await _run_click_scripts(page, _COOKIE_SCRIPTS, 1000, debug)
            await page.wait_for_timeout(3000)

            print("Laduje produkty...")
            await _scroll_until_stable(page, debug)

            # Back to the top before reading the DOM.
            await page.evaluate("window.scrollTo(0, 0)")
            await page.wait_for_timeout(2000)

            if debug:
                await _print_debug_info(page)

            print("Pobieram produkty...")
            products = await page.evaluate(_EXTRACT_PRODUCTS_JS)
            print(f"  Znaleziono: {len(products)} produktow")

            if not products:
                await _save_diagnostics(page)
        finally:
            # Close the browser on every path: success, early return or error.
            await browser.close()

    return _postprocess_products(products)

# Confirms that the cell defining the scraper ran; the message asks the user to run the next cell.
print("Funkcja zaladowana. Uruchom nastepna komorke.")

In [None]:
# RUN THE SCRAPER
#
# Options:
#   debug=True  - prints detailed diagnostic information
#   debug=False - quiet mode
#
# If the scraper returns 0 products:
#   1. Inspect debug_screenshot.png, debug_page.html and debug_text.txt
#   2. Status 403 means the site detected the bot
#   3. Try running locally with headless=False

import nest_asyncio
nest_asyncio.apply()

# Run with debugging enabled
produkty = asyncio.get_event_loop().run_until_complete(scrape_lego_nowosci(debug=True))

# Summary banner
print("", "="*50, f">>> POBRANO: {len(produkty)} produktow <<<", "="*50, sep="\n")

if produkty:
    # Preview of the first five products, name cut to 45 characters
    print("\nPrzykladowe produkty:")
    for p in produkty[:5]:
        nazwa = p['nazwa'][:45]
        print(f"  - {nazwa:45} | {p['cena']:>12} | {p['dostepnosc']}")
else:
    print("\nBrak produktow!")
    print("Sprawdz pliki debug_*.* w katalogu roboczym")

In [None]:
# Build the DataFrame.
# The column list is given explicitly so that an empty scrape still yields a
# frame with every column the later cells read (otherwise they fail with
# KeyError). Keys not listed here would be dropped - keep this list in sync
# with the dicts returned by scrape_lego_nowosci().
df = pd.DataFrame(
    produkty,
    columns=['nazwa', 'cena', 'wiek', 'elementy', 'ocena', 'dostepnosc', 'url', 'cena_zl'],
)

# With no rows, or with every value missing, these columns come out as object
# dtype, which nlargest() and the histogram reject; force them to float.
for kolumna in ('cena_zl', 'ocena'):
    df[kolumna] = pd.to_numeric(df[kolumna], errors='coerce').astype('float64')

df['data_pobrania'] = datetime.now().strftime('%Y-%m-%d %H:%M')

print(f"Produktow: {len(df)}")
df

In [None]:
# STATISTICS
print("", "="*40, "STATYSTYKI", "="*40, sep="\n")

# Price summary - only when at least one price was parsed
if df['cena_zl'].count() > 0:
    print(
        "\nCENY:",
        f"  Najtanszy: {df['cena_zl'].min():.2f} zl",
        f"  Najdrozszy: {df['cena_zl'].max():.2f} zl",
        f"  Srednia: {df['cena_zl'].mean():.2f} zl",
        sep="\n",
    )

# Piece-count range - only when at least one count is known
if df['elementy'].count() > 0:
    print(
        "\nELEMENTY:",
        f"  Min: {df['elementy'].min()}",
        f"  Max: {df['elementy'].max()}",
        sep="\n",
    )

# Number of products per availability status
print("\nDOSTEPNOSC:")
print(df['dostepnosc'].value_counts().to_string())

In [None]:
# SAVE TO CSV
# utf-8-sig adds a BOM so spreadsheet programs detect the encoding.
plik = f"lego_nowosci_{datetime.now().strftime('%Y%m%d_%H%M')}.csv"
df.to_csv(plik, index=False, encoding='utf-8-sig')
print(f"Zapisano: {plik}")

# Offer the file as a browser download when running on Colab.
try:
    from google.colab import files
except ImportError:
    # Not on Colab: the file simply stays in the working directory.
    pass
else:
    try:
        files.download(plik)
    except Exception as e:
        # The download is a convenience only; report the problem instead of
        # hiding it, but do not fail the cell - the CSV is already on disk.
        print(f"Nie udalo sie pobrac pliku w Colab: {e}")

In [None]:
# TOP 10 MOST EXPENSIVE
print("\nTOP 10 NAJDROZSZYCH:")
print("-"*40)
# Ten highest prices, reduced to the columns that get printed
top10 = df.nlargest(10, 'cena_zl')[['nazwa', 'cena', 'elementy']]
for row in top10.itertuples(index=False):
    # Name cut and padded to 40 characters, price right-aligned to 12
    print(f"{row.nazwa[:40]:40} | {row.cena:>12} | {row.elementy} elem.")

In [None]:
# PRICE CHARTS
import matplotlib.pyplot as plt

# Draw only when at least one price was parsed
if df['cena_zl'].count() > 0:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: histogram of known prices
    axes[0].hist(df['cena_zl'].dropna(), bins=25, color='#FFD700', edgecolor='black')
    axes[0].set(
        xlabel='Cena (zl)',
        ylabel='Liczba produktow',
        title='Rozklad cen nowosci LEGO',
    )
    axes[0].grid(True, alpha=0.3)

    # Right panel: share of each availability status, one colour per status
    colors = ['#4CAF50', '#f44336', '#FF9800', '#2196F3', '#9E9E9E']
    df['dostepnosc'].value_counts().plot.pie(
        ax=axes[1],
        autopct='%1.1f%%',
        colors=colors[:df['dostepnosc'].nunique(dropna=False)],
    )
    axes[1].set(title='Dostepnosc produktow', ylabel='')

    plt.tight_layout()
    plt.show()