In [None]:
%pip install playwright
%pip install nest_asyncio


Note: you may need to restart the kernel to use updated packages.


In [6]:
import csv
import nest_asyncio
from typing import Any
from playwright.async_api import async_playwright

# Apply nest_asyncio to allow nested event loops in Jupyter Notebook
nest_asyncio.apply()

# Define the base Vogue URL
BASE_URL = "https://www.vogue.com/fashion-shows/"

# List of seasons to scrape
SEASONS = ["fall-2024-couture", "spring-2024-couture"]

# CSV File to store the results
CSV_FILE = "vogue_fashion_analysis.csv"

async def analyze_image_with_gpt(image_url: str) -> dict:
    """
    Simulates analyzing an image with GPT.
    Returns structured fashion attributes.
    """
    return {
        "top": "Silk blouse with floral patterns",
        "bottom": "High-waisted trousers",
        "accessories": "Gold chain necklace",
        "other": "Red stiletto heels"
    }

async def scrape_look_details(page, brand_url, brand_name, season_name, writer):
    """
    Scrapes all looks from a brand's fashion collection.
    """
    await page.goto(brand_url)
    await page.wait_for_load_state("networkidle")

    # Find total looks in the collection
    slideshow_selector = "div.Slideshow__Nav span"
    await page.wait_for_selector(slideshow_selector)
    total_looks_text = await page.locator(slideshow_selector).inner_text()
    
    # Extract total number of looks
    total_looks = int(total_looks_text.split("/")[-1].strip())

    # Iterate through each look in the collection
    for look_number in range(1, total_looks + 1):
        look_url = f"{brand_url}/slideshow/collection#{look_number}"
        await page.goto(look_url)

        # Extract image URL
        img_element = await page.query_selector("img")
        image_url = await img_element.get_attribute("src") if img_element else ""

        # Analyze image with GPT
        if image_url:
            analysis = await analyze_image_with_gpt(image_url)
        else:
            analysis = {"top": "", "bottom": "", "accessories": "", "other": ""}

        # Write data to CSV
        writer.writerow({
            "season": season_name,
            "brand": brand_name,
            "look_number": look_number,
            "image_url": image_url,
            "top": analysis["top"],
            "bottom": analysis["bottom"],
            "accessories": analysis["accessories"],
            "other": analysis["other"]
        })
        print(f"Scraped {season_name} - {brand_name} - Look {look_number}/{total_looks}")

async def scrape():
    """
    Scrapes all seasons, brands, and looks from Vogue Fashion Shows.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Open CSV file
        with open(CSV_FILE, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.DictWriter(file, fieldnames=["season", "brand", "look_number", "image_url", "top", "bottom", "accessories", "other"])
            writer.writeheader()

            # Iterate through each season
            for season in SEASONS:
                season_url = f"{BASE_URL}{season}"
                await page.goto(season_url)
                await page.wait_for_load_state("networkidle")

                print(f"Scraping season: {season}")

                # Extract brand links
                brand_elements = await page.query_selector_all("a.NavigationInternalLink-cWEaeo")
                for brand_element in brand_elements:
                    brand_href = await brand_element.get_attribute("href")
                    if brand_href:
                        brand_name = brand_href.split("/")[-1]  # Extract brand name from URL
                        brand_url = f"https://www.vogue.com{brand_href}"

                        # Scrape brand's collection
                        await scrape_look_details(page, brand_url, brand_name, season, writer)

        # Close the browser
        await browser.close()

# ✅ Run the scraper correctly inside Jupyter Notebook
await scrape()


TimeoutError: Page.goto: Timeout 30000ms exceeded.
Call log:
  - navigating to "https://www.vogue.com/fashion-shows/fall-2024-couture", waiting until "load"
