# VNL Scraper Notebook
Scrape Volleyball Nations League (VNL) match results for the men's and women's competitions from Flashscore using Playwright, and export them to CSV/JSON.

In [9]:
from playwright.async_api import async_playwright
import pandas as pd
from datetime import datetime
import os
import asyncio


In [10]:
# Flashscore results-page URL templates, keyed by gender.
# "{year}" is filled in with the season via str.format(year=...).
BASE_URLS = dict(
    women="https://www.flashscore.com/volleyball/world/nations-league-women-{year}/results/",
    men="https://www.flashscore.com/volleyball/world/nations-league-{year}/results/",
)

# Directory for exported results; created up front so later writes cannot
# fail on a missing folder (no error if it already exists).
OUTPUT_DIR = "vnl_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [11]:
async def _parse_int(el):
    """Return the element's text as an int, or None if it cannot be read or parsed."""
    try:
        return int(float((await el.inner_text()).strip()))
    except Exception:
        # Best effort: a blank or non-numeric cell yields None. Unlike a bare
        # ``except``, this lets asyncio.CancelledError and KeyboardInterrupt
        # propagate so the scrape can still be cancelled.
        return None


def _count_sets_won(sets_home, sets_away):
    """Return (home_sets_won, away_sets_won), ignoring sets with a missing score."""
    home_won = 0
    away_won = 0
    for h, a in zip(sets_home, sets_away):
        if h is None or a is None:
            # Comparing None with an int raises TypeError, which used to make
            # the caller discard the whole match; skip just this set instead.
            continue
        if h > a:
            home_won += 1
        elif a > h:
            away_won += 1
    return home_won, away_won


async def _parse_match(match, year, gender):
    """Build the result row (a dict) for one ``div.event__match`` element."""
    home_el = await match.query_selector(".event__participant--home")
    away_el = await match.query_selector(".event__participant--away")
    home_team = (await home_el.inner_text()).strip() if home_el else ""
    away_team = (await away_el.inner_text()).strip() if away_el else ""

    if gender == "women":
        # Drop the trailing " W" marker from women's team names.
        home_team = home_team.removesuffix(" W")
        away_team = away_team.removesuffix(" W")

    # The time cell starts with a day.month token; the year is not part of
    # it, so the season year is appended before parsing.
    time_el = await match.query_selector(".event__time")
    date_text = await time_el.inner_text() if time_el else ""
    day_month = date_text.split(" ")[0]
    clean_day_month = day_month.strip().replace("..", ".").rstrip(".")
    match_date = datetime.strptime(f"{clean_day_month}.{year}", "%d.%m.%Y").date()

    # The match id is the last "_"-separated piece of the element's id.
    match_id_attr = await match.get_attribute("id")
    match_id = match_id_attr.split("_")[-1]
    match_url = f"https://www.flashscore.com/match/{match_id}/#/match-summary"

    sets_home = []
    sets_away = []
    for i in range(1, 6):
        home_part = await match.query_selector(f".event__part--home.event__part--{i}")
        away_part = await match.query_selector(f".event__part--away.event__part--{i}")
        if not (home_part and away_part):
            # Stop at the first set that has no pair of score cells.
            break
        sets_home.append(await _parse_int(home_part))
        sets_away.append(await _parse_int(away_part))

    home_sets_won, away_sets_won = _count_sets_won(sets_home, sets_away)

    row = {
        "season": year,
        "gender": gender,
        "match_date": match_date.isoformat(),
        "home_team": home_team,
        "away_team": away_team,
        "home_score": home_sets_won,
        "away_score": away_sets_won,
        "outcome": 1 if home_sets_won > away_sets_won else 0,
    }

    # Always emit five pairs of set columns; missing sets are None.
    for idx in range(5):
        row[f"set_{idx+1}_home"] = sets_home[idx] if idx < len(sets_home) else None
        row[f"set_{idx+1}_away"] = sets_away[idx] if idx < len(sets_away) else None

    row["match_url"] = match_url
    return row


async def scrape_vnl_season(year, gender):
    """Scrape one VNL season's results page and return one dict per match.

    Args:
        year: Season year; substituted into the URL template and used as the
            year of every match date.
        gender: Key into ``BASE_URLS`` ("women" or "men").

    Returns:
        A list of row dicts with the keys ``season``, ``gender``,
        ``match_date`` (ISO string), ``home_team``, ``away_team``,
        ``home_score``, ``away_score`` (sets won), ``outcome`` (1 when the
        home side won more sets, else 0), ``set_1_home`` .. ``set_5_away``
        and ``match_url``. The list is empty when the page has no match
        elements. A match that fails to parse is reported and skipped.

    Raises:
        KeyError: If ``gender`` is not a key of ``BASE_URLS``.
    """
    url = BASE_URLS[gender].format(year=year)
    print(f"Scraping {gender.upper()} {year}...")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)

            match_elements = await page.query_selector_all("div.event__match")
            if not match_elements:
                print(f"⚠️ No matches found for {gender} {year} — skipping.")
                return []

            match_data = []
            for match in match_elements:
                try:
                    match_data.append(await _parse_match(match, year, gender))
                except Exception as e:
                    # One malformed row must not abort the whole season.
                    print(f"Skipping match due to error: {e}")
            return match_data
        finally:
            # Close the browser even when navigation or querying raises.
            await browser.close()


In [12]:
# Example: scrape a single season and gender for testing.
# Top-level ``await`` only works where cells run inside an event loop (as in
# a notebook); a plain script would need asyncio.run(...) instead.
# NOTE(review): the recorded run of this cell ended in NotImplementedError —
# presumably the kernel's event loop could not launch Playwright's browser
# subprocess (a known limitation of some Windows loops); confirm before
# relying on the output.
results = await scrape_vnl_season(2023, "women")
df = pd.DataFrame(results)  # one row per scraped match
df.head()  # last expression in the cell, so the notebook displays the preview

Scraping WOMEN 2023...


NotImplementedError: 