# LEGO Nowości Scraper

Ten notebook scrapuje nowości ze strony LEGO Polska (lego.com/pl-pl).

**Funkcjonalności:**
- Instalacja ChromeDriver dla Google Colab
- Scraping produktów z sekcji "Nowości"
- Ekstrakcja: nazwa, cena, liczba elementów, wiek, ocena, dostępność
- Zapis do DataFrame i CSV

## 1. Instalacja i konfiguracja

In [None]:
import subprocess
import sys

def install_chromedriver():
    """Install ChromeDriver on an apt-based host such as Google Colab.

    Runs three commands in order: refresh the package index, install the
    ``chromium-chromedriver`` package, and copy the driver binary into
    ``/usr/bin``.  Execution stops at the first command that fails.

    Returns:
        bool: True when every command succeeded; False when a command exited
        with a non-zero status or could not be started at all (for example
        because ``apt-get`` does not exist on this machine).
    """
    # `apt-get` is used for both package steps: the plain `apt` front end
    # warns that its command-line interface is not stable for scripts.
    commands = [
        ["apt-get", "update"],
        ["apt-get", "install", "-y", "chromium-chromedriver"],
        ["cp", "/usr/lib/chromium-browser/chromedriver", "/usr/bin"],
    ]

    print("Instalowanie chromedriver...")
    try:
        for command in commands:
            subprocess.check_output(command, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        print(f"Blad podczas instalacji chromedriver: {e}")
        # stderr was merged into stdout above, so e.output holds the full
        # diagnostic text of the failed command.
        if e.output:
            print(e.output.decode("utf-8", errors="replace"))
        return False
    except OSError as e:
        # Raised when the executable itself cannot be launched
        # (FileNotFoundError, PermissionError, ...).
        print(f"Blad podczas instalacji chromedriver: {e}")
        return False

    print("Chromedriver zainstalowany pomyslnie.")
    return True

# Sprawdz czy jestesmy na Colab
try:
    import google.colab
    IN_COLAB = True
    print("Wykryto Google Colab")
except ImportError:
    IN_COLAB = False
    print("Lokalne srodowisko")

if IN_COLAB:
    install_chromedriver()
    !pip install -q selenium webdriver-manager

In [None]:
# Importy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd
import time
import re
from datetime import datetime

## 2. Konfiguracja przeglądarki

In [None]:
def create_driver():
    """Build and return a headless Chrome WebDriver.

    The browser is configured with a 1920x1080 window, the pl-PL language
    flag and a fixed desktop user-agent string.  On Colab the driver is
    created without an explicit ``Service``; elsewhere the driver binary is
    obtained through ``webdriver_manager`` and passed in via ``Service``.

    Returns:
        A ``selenium.webdriver.Chrome`` instance; the caller is responsible
        for calling ``quit()`` on it.
    """
    browser_flags = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--lang=pl-PL",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    )

    opts = Options()
    for flag in browser_flags:
        opts.add_argument(flag)

    if not IN_COLAB:
        # Imported lazily so the package is only required outside Colab.
        from webdriver_manager.chrome import ChromeDriverManager
        driver_service = Service(ChromeDriverManager().install())
        return webdriver.Chrome(service=driver_service, options=opts)

    return webdriver.Chrome(options=opts)

print("Funkcja create_driver() gotowa.")

## 3. Funkcje do scrapowania

In [None]:
def _click_when_clickable(driver, locator, timeout=10):
    """Click the element at *locator* once it becomes clickable.

    Args:
        driver: Selenium WebDriver showing the page.
        locator: ``(By.<strategy>, value)`` tuple identifying the element.
        timeout: Maximum number of seconds to wait for the element.

    Returns:
        bool: True when the element was clicked, False when the wait timed
        out (the element never appeared or never became clickable).
    """
    try:
        button = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable(locator)
        )
    except TimeoutException:
        return False

    button.click()
    time.sleep(1)  # short pause so the overlay can close before the next step
    return True


def accept_cookies(driver):
    """Dismiss the overlays shown when the LEGO page first loads.

    Two buttons are tried in order, each for up to 10 seconds: the element
    with id ``age-gate-grown-up-cta`` (presumably the age-gate prompt -
    confirm against the live page) and the "accept all cookies" button.
    A button that never shows up is skipped silently.

    Args:
        driver: Selenium WebDriver with the LEGO page already loaded.
    """
    _click_when_clickable(driver, (By.ID, "age-gate-grown-up-cta"))
    # The attribute value must be closed with a quote: an unterminated quote
    # makes the selector invalid, and Selenium then raises an invalid-selector
    # error instead of the TimeoutException handled above.
    _click_when_clickable(
        driver,
        (By.CSS_SELECTOR, "button[data-test='cookie-accept-all']"),
    )

def scroll_page(driver, scrolls=5, delay=2):
    """Scroll to the bottom of the page repeatedly so more products can load.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).
        scrolls: Number of scroll-to-bottom passes to perform.
        delay: Seconds to sleep after each pass.
    """
    jump_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    for step in range(1, scrolls + 1):
        driver.execute_script(jump_to_bottom)
        time.sleep(delay)
        print(f"Przewinieto {step}/{scrolls}")

In [None]:
# Ordered (keywords, label) rules for classifying the call-to-action button.
# Keywords are plain ASCII because the button text is stripped of diacritics
# before matching; the first rule with a matching keyword wins.
_AVAILABILITY_RULES = (
    (('dodaj', 'koszyk'), 'Dostepny'),
    (('niedostepn',), 'Niedostepny'),
    (('wkrotce',), 'Wkrotce dostepne'),
    (('przedsprzeda',), 'Przedsprzedaz'),
)


def _strip_diacritics(text):
    """Return *text* with combining accents removed ('ó' -> 'o', 'ę' -> 'e')."""
    import unicodedata  # local import: not part of the notebook's import cell

    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))


def _parse_decimal(text):
    """Return the first number in *text* (',' or '.' decimals) as float, or None."""
    match = re.search(r'\d+(?:[.,]\d+)?', text)
    if match is None:
        return None
    return float(match.group(0).replace(',', '.'))


def _parse_price(price_text):
    """Return the numeric value of a price such as '1 299,99 zł', or None."""
    # Remove every kind of whitespace, including the non-breaking spaces used
    # as thousands separators, so '1\xa0299,99' is read as 1299.99.
    return _parse_decimal(re.sub(r'\s+', '', price_text))


def _classify_availability(button_text):
    """Map lower-cased button text to an availability label.

    Text matching none of the known keywords is returned unchanged.
    """
    folded = _strip_diacritics(button_text)
    for keywords, label in _AVAILABILITY_RULES:
        if any(keyword in folded for keyword in keywords):
            return label
    return button_text


def _find_optional(parent, selector):
    """Return the first descendant of *parent* matching *selector*, or None."""
    try:
        return parent.find_element(By.CSS_SELECTOR, selector)
    except NoSuchElementException:
        return None


def extract_product_data(product_element):
    """Extract the fields of one product tile.

    Every field is optional: a sub-element that is missing from the tile
    leaves its field at the default value.

    Args:
        product_element: Selenium element wrapping a single product tile.

    Returns:
        dict with the keys ``nazwa``, ``cena``, ``cena_numeryczna``,
        ``liczba_elementow``, ``wiek``, ``ocena``, ``dostepnosc``, ``url``,
        ``obrazek_url`` (all None when not found) and ``nowosc`` (bool).
    """
    data = {
        'nazwa': None,
        'cena': None,
        'cena_numeryczna': None,
        'liczba_elementow': None,
        'wiek': None,
        'ocena': None,
        'dostepnosc': None,
        'url': None,
        'obrazek_url': None,
        'nowosc': False
    }

    try:
        # Product name: title element first, title link as a fallback.
        name_elem = _find_optional(
            product_element,
            "span[data-test='product-leaf-title'], h3, .ProductTitlestyles__ProductTitleText",
        )
        if name_elem is None:
            name_elem = _find_optional(product_element, "a[data-test='product-leaf-title-link']")
        if name_elem is not None:
            data['nazwa'] = name_elem.text.strip()

        # Price: keep the displayed text and, when parseable, its numeric value.
        price_elem = _find_optional(
            product_element,
            "span[data-test='product-leaf-price'], .ProductPricestyles__PriceText",
        )
        if price_elem is not None:
            price_text = price_elem.text.strip()
            data['cena'] = price_text
            data['cena_numeryczna'] = _parse_price(price_text)

        # Piece count: first run of digits in the label.
        pieces_elem = _find_optional(product_element, "span[data-test='product-leaf-piece-count-label']")
        if pieces_elem is not None:
            pieces_match = re.search(r'(\d+)', pieces_elem.text.strip())
            if pieces_match:
                data['liczba_elementow'] = int(pieces_match.group(1))

        # Age range, stored as displayed.
        age_elem = _find_optional(product_element, "span[data-test='product-leaf-age-range-label']")
        if age_elem is not None:
            data['wiek'] = age_elem.text.strip()

        # Rating: an unparseable value stays None instead of raising and
        # aborting the remaining fields.
        rating_elem = _find_optional(
            product_element,
            "div[data-test='product-leaf-rating'] span, .Ratingstyles__RatingValue",
        )
        if rating_elem is not None:
            data['ocena'] = _parse_decimal(rating_elem.text.strip())

        # Availability, derived from the call-to-action button text.
        button_elem = _find_optional(product_element, "button[data-test='product-leaf-cta'], button")
        if button_elem is not None:
            data['dostepnosc'] = _classify_availability(button_elem.text.strip().lower())

        # Product URL.
        link_elem = _find_optional(
            product_element,
            "a[data-test='product-leaf-title-link'], a[href*='/product/']",
        )
        if link_elem is not None:
            data['url'] = link_elem.get_attribute('href')

        # Image URL.
        img_elem = _find_optional(product_element, "img[data-test='product-leaf-image'], img")
        if img_elem is not None:
            data['obrazek_url'] = img_elem.get_attribute('src')

        # "New" flag: with no badge at all the product is still counted as
        # new, because the scraped listing is the new-products page; a badge
        # with other text leaves the flag False.
        badge_elem = _find_optional(product_element, "span[data-test='product-leaf-badges-new'], .Badge")
        if badge_elem is None:
            data['nowosc'] = True
        elif 'nowo' in badge_elem.text.lower():
            data['nowosc'] = True

    except Exception as e:
        # Best effort per tile: report the problem and return what was
        # collected so far rather than aborting the whole scrape.
        print(f"Blad podczas ekstrakcji danych: {e}")

    return data

In [None]:
def scrape_lego_nowosci(url="https://www.lego.com/pl-pl/categories/new-sets-and-products", max_scrolls=10):
    """Scrape the product tiles of a LEGO listing page.

    Opens *url* in a fresh browser, dismisses the overlays, scrolls the page
    *max_scrolls* times and extracts one record per product tile.  Any error
    during the run is printed and whatever was collected so far is returned;
    the browser is always closed.

    Args:
        url: Listing page to scrape.
        max_scrolls: Number of scroll-to-bottom passes before reading tiles.

    Returns:
        list[dict]: Records produced by ``extract_product_data``; tiles
        without a name are left out.
    """
    print(f"Rozpoczynam scrapowanie: {url}")

    # Tile selectors, tried in order; the first one that matches is used.
    candidate_selectors = (
        "li[data-test='product-leaf']",
        "article[data-test='product-leaf']",
        ".ProductLeafstyles__ProductLeafWrapper",
        "[data-test='product-item']",
    )

    driver = create_driver()
    products = []

    try:
        driver.get(url)
        print("Strona zaladowana")
        time.sleep(3)

        accept_cookies(driver)
        scroll_page(driver, scrolls=max_scrolls, delay=2)

        for css in candidate_selectors:
            cards = driver.find_elements(By.CSS_SELECTOR, css)
            if cards:
                print(f"Znaleziono {len(cards)} produktow uzywajac selektora: {css}")
                break
        else:
            # No selector matched: fall back to a loose class-name match.
            print("Nie znaleziono produktow. Probuje alternatywnej metody...")
            cards = driver.find_elements(By.CSS_SELECTOR, "[class*='ProductLeaf']")
            print(f"Znaleziono {len(cards)} produktow")

        total = len(cards)
        for position, card in enumerate(cards, start=1):
            print(f"Przetwarzanie produktu {position}/{total}")
            row = extract_product_data(card)
            if row['nazwa']:  # keep only tiles that yielded a name
                products.append(row)

        print(f"\nZakończono. Pobrano {len(products)} produktow.")

    except Exception as e:
        print(f"Blad podczas scrapowania: {e}")
    finally:
        driver.quit()

    return products

## 4. Uruchomienie scrapera

In [None]:
# Run the scraper on its default URL with five scroll passes and report how
# many products came back.
products = scrape_lego_nowosci(max_scrolls=5)
product_count = len(products)
print(f"\nPobrano {product_count} produktow")

In [None]:
# Turn the scraped records into a table.
df = pd.DataFrame(products)

# Stamp every row with the time this table was built.
scraped_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
df['data_scrapowania'] = scraped_at

# Print a short summary; the trailing expression renders the first 10 rows.
row_total = len(df)
column_names = list(df.columns)
print(f"Liczba produktow: {row_total}")
print(f"\nKolumny: {column_names}")
df.head(10)

In [None]:
# Summary statistics; each section is skipped when its column is missing or
# holds no values.
print("=== STATYSTYKI ===")
print(f"Liczba produktow: {len(df)}")

if 'cena_numeryczna' in df.columns:
    price_col = df['cena_numeryczna']
    if price_col.notna().any():
        print("\nCeny:")
        print(f"  Min: {price_col.min():.2f} zl")
        print(f"  Max: {price_col.max():.2f} zl")
        print(f"  Srednia: {price_col.mean():.2f} zl")

if 'liczba_elementow' in df.columns:
    pieces_col = df['liczba_elementow']
    if pieces_col.notna().any():
        print("\nLiczba elementow:")
        print(f"  Min: {pieces_col.min()}")
        print(f"  Max: {pieces_col.max()}")
        print(f"  Srednia: {pieces_col.mean():.0f}")

if 'dostepnosc' in df.columns:
    availability_counts = df['dostepnosc'].value_counts()
    print("\nDostepnosc:")
    print(availability_counts)

## 5. Zapis do pliku

In [None]:
# Write the table to a timestamped CSV file. The 'utf-8-sig' codec prepends
# a byte-order mark to the UTF-8 output.
stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"lego_nowosci_{stamp}.csv"
df.to_csv(filename, index=False, encoding='utf-8-sig')
print(f"Zapisano do pliku: {filename}")

In [None]:
# On Google Colab only: hand the CSV file to Colab's download helper.
if IN_COLAB:
    from google.colab import files as colab_files
    colab_files.download(filename)
    print("Plik zostal pobrany.")

## 6. Wizualizacja (opcjonalnie)

In [None]:
import matplotlib.pyplot as plt

# Histogram of numeric prices; skipped when no price could be parsed.
if 'cena_numeryczna' in df.columns and df['cena_numeryczna'].notna().any():
    fig, ax = plt.subplots(figsize=(10, 6))
    df['cena_numeryczna'].hist(bins=20, edgecolor='black', ax=ax)
    ax.set_xlabel('Cena (zl)')
    ax.set_ylabel('Liczba produktow')
    ax.set_title('Rozklad cen nowosci LEGO')
    ax.grid(True, alpha=0.3)
    plt.show()

In [None]:
# Pie chart of availability labels; skipped when no label was collected.
if 'dostepnosc' in df.columns and df['dostepnosc'].notna().any():
    label_counts = df['dostepnosc'].value_counts()
    fig, ax = plt.subplots(figsize=(8, 8))
    label_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax)
    ax.set_title('Dostepnosc produktow')
    ax.set_ylabel('')  # drop the default axis label (the series name)
    plt.show()