In [5]:
from bs4 import BeautifulSoup
from datetime import datetime
import re
import pandas as pd
import requests


In [23]:
url = "https://www.battlefielddistrictva.org/public/genie/354/school/8/date/2025-08-21/view/month/"

# Get the page content
response = requests.get(url)
html = response.text

# Now feed the HTML into BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

print(soup.title)  # quick test to see if you got a real page

In [25]:


soup = BeautifulSoup(html, "html.parser")

SCHOOL_NAME = "Spotsylvania High School"

# 1) Month/Year for this grid
month_label = soup.select_one(".nav-month-dropdown .dropdown-toggle")
month_year = month_label.get_text(strip=True) if month_label else None
# Fallback if needed
if not month_year:
    switch = soup.select_one("#datepicker .datepicker-switch")
    month_year = switch.get_text(strip=True) if switch else "August 2025"

# Parse month/year once
month_dt = datetime.strptime(month_year, "%B %Y")
month_num = month_dt.month
year_num = month_dt.year

rows = []

# 2) Loop days in the grid
for day_li in soup.select("#calendar .days > li.day"):
    # Skip other-month cells if they appear
    if "other-month" in day_li.get("class", []):
        continue

    date_div = day_li.select_one(".date")
    if not date_div:
        continue
    day_num_txt = date_div.get_text(strip=True)
    if not day_num_txt.isdigit():
        continue

    # Build full date
    day_num = int(day_num_txt)
    try:
        event_date = datetime(year_num, month_num, day_num).date()
    except ValueError:
        continue

    # 3) Loop events for this day
    for ev in day_li.select(".event"):
        # Time
        time_el = ev.select_one(".event-time")
        time_txt = time_el.get_text(strip=True) if time_el else ""

        # Title (sport-ish)
        title_el = ev.select_one(".event-desc a.show-popup")
        title = title_el.get_text(strip=True) if title_el else ""

        # Location line (to get School)
        loc_el = ev.select_one(".event-desc-sub")
        loc_txt = loc_el.get_text(" ", strip=True) if loc_el else ""

        # --- SPORT normalization ---
        sport_raw = title
        # prefer text before colon if present
        if ":" in sport_raw:
            sport_base = sport_raw.split(":", 1)[0].strip()
        else:
            sport_base = sport_raw

        # Strip common suffix keywords to get base sport
        suffixes = [
            r"\bPractice\b",
            r"\bTryouts?\b",
            r"\bOpen Gym/Conditioning\b",
            r"\bConditioning\b",
            r"\bPictures?\b",
            r"\bMeeting\b",
            r"\bOpen House\b",
        ]
        for pat in suffixes:
            sport_base = re.sub(pat, "", sport_base, flags=re.IGNORECASE).strip()

        # Clean duplicate spaces
        sport_base = re.sub(r"\s{2,}", " ", sport_base)

        
        rows.append({
            "Sport": sport_base,
            "Date": event_date.isoformat(),
            "Time": time_txt,
            "School": SCHOOL_NAME
        })

df = pd.DataFrame(rows, columns=["Sport", "Date", "Time", "School"])


NameError: name 'date' is not defined

In [21]:
df.head()

Unnamed: 0,Sport,Date,Time,School
0,Football,2025-08-01,6:00pm - 9:00pm,Spotsylvania High School
1,Football,2025-08-02,8:00am - 11:00am,Spotsylvania High School
2,Field Hockey,2025-08-04,7:00am - 9:00am,Spotsylvania High School
3,Golf,2025-08-04,3:00pm - 5:00pm,Spotsylvania High School
4,Volleyball,2025-08-04,4:30pm - 6:30pm,Spotsylvania High School


In [29]:
import requests
from bs4 import BeautifulSoup
from datetime import date, datetime
from dateutil.relativedelta import relativedelta
import pandas as pd
import re
import time

BASE = "https://www.battlefielddistrictva.org/public/genie/354/school/8/date/{ymd}/view/month/"
SCHOOL_NAME = "Spotsylvania High School"

# Optional: be polite
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}

def month_urls(start_month: date, end_month: date):
    """Yield first-of-month dates from start → end (inclusive) and their URLs."""
    cur = date(start_month.year, start_month.month, 1)
    end = date(end_month.year, end_month.month, 1)
    while cur <= end:
        yield cur, BASE.format(ymd=cur.strftime("%Y-%m-%d"))
        cur += relativedelta(months=1)

def normalize_sport(title: str) -> str:
    """Reduce event title to base sport."""
    sport = title.split(":", 1)[0].strip() if ":" in title else title.strip()
    # Remove common non-sport suffixes
    suffixes = [
        r"\bPractice\b",
        r"\bTryouts?\b",
        r"\bOpen Gym/Conditioning\b",
        r"\bConditioning\b",
        r"\bPictures?\b",
        r"\bMeeting\b",
        r"\bOpen House\b",
        r"\bAthlete/Parent\b",
    ]
    for pat in suffixes:
        sport = re.sub(pat, "", sport, flags=re.IGNORECASE).strip()
    sport = re.sub(r"\s{2,}", " ", sport)
    return sport

def scrape_month(url: str, target_year: int, target_month: int):
    """Scrape a single month page → list of dict rows."""
    rows = []
    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Iterate visible days in the grid
    for day_li in soup.select("#calendar .days > li.day"):
        classes = day_li.get("class", [])
        if "other-month" in classes:
            continue

        date_div = day_li.select_one(".date")
        if not (date_div and date_div.get_text(strip=True).isdigit()):
            continue

        day_num = int(date_div.get_text(strip=True))
        try:
            event_date = date(target_year, target_month, day_num).isoformat()
        except ValueError:
            continue  # bad day number (shouldn't happen)

        for ev in day_li.select(".event"):
            # Skip cancelled rows
            cancelled = ev.select_one(".table-data-styles-subtext")
            if cancelled and "(Cancelled)" in cancelled.get_text():
                continue

            time_el = ev.select_one(".event-time")
            time_txt = time_el.get_text(strip=True) if time_el else ""

            title_el = ev.select_one(".event-desc a.show-popup")
            title = title_el.get_text(strip=True) if title_el else ""

            sport = normalize_sport(title)

            rows.append({
                "Sport": sport,
                "Date": event_date,
                "Time": time_txt,
                "School": SCHOOL_NAME,
            })

    return rows

# ---- Run for Aug 2024 → May 2025 ----
start_month = date(2024, 8, 1)
end_month   = date(2025, 5, 1)

all_rows = []
for month_dt, url in month_urls(start_month, end_month):
    try:
        month_rows = scrape_month(url, month_dt.year, month_dt.month)
        all_rows.extend(month_rows)
        # polite pause to avoid hammering the server
        time.sleep(0.5)
    except Exception as e:
        print(f"Warning: failed {month_dt.strftime('%Y-%m')} -> {e}")

spotsy_df = pd.DataFrame(all_rows, columns=["Sport", "Date", "Time", "School"])

# Optional: sort for readability
if not spotsy_df.empty:
    spotsy_df["Date"] = pd.to_datetime(spotsy_df["Date"])
    spotsy_df = spotsy_df.sort_values(["Date", "Time", "Sport"]).reset_index(drop=True)
    # Convert Date back to ISO string if you prefer strings
    spotsy_df["Date"] = spotsy_df["Date"].dt.date.astype(str)

spotsy_df


Unnamed: 0,Sport,Date,Time,School
0,Fall Sports Athlete,2024-08-09,2:30pm,Spotsylvania High School
1,Golf,2024-08-12,4:00pm,Spotsylvania High School
2,Fall Sports Parents,2024-08-12,6:00pm,Spotsylvania High School
3,Golf,2024-08-14,4:00pm,Spotsylvania High School
4,Field Hockey,2024-08-14,5:00pm,Spotsylvania High School
...,...,...,...,...
555,Softball,2025-05-28,6:00pm,Spotsylvania High School
556,Softball,2025-05-29,6:00pm,Spotsylvania High School
557,Baseball,2025-05-30,2:00pm,Spotsylvania High School
558,Soccer,2025-05-30,2:00pm,Spotsylvania High School


In [31]:

# ---- Config ----
# We'll target Month view and always use the first of each month in the URL
BASE = "https://www.battlefielddistrictva.org/public/genie/354/school/2/date/{ymd}/view/month/"
SCHOOL_NAME = "Chancellor High School"   # set once for all rows

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

def month_urls(start_month: date, end_month: date):
    """Yield (first-of-month date, URL) pairs from start → end inclusive."""
    cur = date(start_month.year, start_month.month, 1)
    end = date(end_month.year, end_month.month, 1)
    while cur <= end:
        yield cur, BASE.format(ymd=cur.strftime("%Y-%m-%d"))
        cur += relativedelta(months=1)

def normalize_sport(title: str) -> str:
    """Reduce event title to base sport name (strip levels and admin words)."""
    sport = title.split(":", 1)[0].strip() if ":" in title else title.strip()
    # Remove common suffixes (Practice, Tryouts, etc.)
    suffixes = [
        r"\bPractice\b",
        r"\bTryouts?\b",
        r"\bOpen Gym/Conditioning\b",
        r"\bConditioning\b",
        r"\bPictures?\b",
        r"\bMeeting\b",
        r"\bOpen House\b",
        r"\bAthlete/Parent\b",
    ]
    for pat in suffixes:
        sport = re.sub(pat, "", sport, flags=re.IGNORECASE).strip()
    # Clean double spaces
    sport = re.sub(r"\s{2,}", " ", sport)
    return sport

def scrape_month(url: str, y: int, m: int):
    """Scrape a single Month page → list of dict rows."""
    rows = []
    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Loop visible day cells (ignore other-month fillers)
    for day_li in soup.select("#calendar .days > li.day"):
        if "other-month" in (day_li.get("class") or []):
            continue

        date_div = day_li.select_one(".date")
        day_txt = date_div.get_text(strip=True) if date_div else ""
        if not day_txt.isdigit():
            continue

        # Build ISO date for this event's day
        try:
            event_date = date(y, m, int(day_txt)).isoformat()
        except ValueError:
            continue

        # Loop events within this day
        for ev in day_li.select(".event"):
            # Skip cancelled items
            cancelled = ev.select_one(".table-data-styles-subtext")
            if cancelled and "(Cancelled)" in cancelled.get_text():
                continue

            # Time text (can be ranges like "6:00pm - 9:00pm" or "TBD")
            time_el = ev.select_one(".event-time")
            time_txt = time_el.get_text(strip=True) if time_el else ""

            # Title holds the sport descriptor
            title_el = ev.select_one(".event-desc a.show-popup")
            title = title_el.get_text(strip=True) if title_el else ""
            sport = normalize_sport(title)

            rows.append({
                "Sport": sport,
                "Date": event_date,
                "Time": time_txt,
                "School": SCHOOL_NAME,
            })

    return rows

# ---- Run for Aug 2024 → May 2025 ----
start_month = date(2024, 8, 1)
end_month   = date(2025, 5, 1)

all_rows = []
for month_dt, url in month_urls(start_month, end_month):
    try:
        month_rows = scrape_month(url, month_dt.year, month_dt.month)
        all_rows.extend(month_rows)
        time.sleep(0.5)  # polite pause
    except Exception as e:
        print(f"Warning: failed {month_dt.strftime('%Y-%m')} -> {e}")

chancellor_df = pd.DataFrame(all_rows, columns=["Sport", "Date", "Time", "School"])

# Optional: tidy sorting and ISO string dates
if not chancellor_df.empty:
    chancellor_df["Date"] = pd.to_datetime(chancellor_df["Date"], errors="coerce")
    chancellor_df = chancellor_df.sort_values(["Date", "Time", "Sport"]).reset_index(drop=True)
    chancellor_df["Date"] = chancellor_df["Date"].dt.date.astype(str)

chancellor_df

Unnamed: 0,Sport,Date,Time,School
0,Fall Sports,2024-08-01,2:30pm - 8:00pm,Chancellor High School
1,Cheer,2024-08-01,4:00pm - 6:00pm,Chancellor High School
2,Field Hockey,2024-08-01,6:00pm - 8:00pm,Chancellor High School
3,Fall Sports,2024-08-02,2:30pm - 8:00pm,Chancellor High School
4,Fall Sports,2024-08-03,2:30pm - 8:00pm,Chancellor High School
...,...,...,...,...
1984,Soccer,2025-05-30,6:00pm,Chancellor High School
1985,Outside Group,2025-05-30,6:00pm - 10:00pm,Chancellor High School
1986,Soccer,2025-05-30,7:00pm,Chancellor High School
1987,Football,2025-05-30,8:00am - 11:00am,Chancellor High School


In [35]:

# ---- Config ----
BASE = "https://www.battlefielddistrictva.org/public/genie/354/school/3/date/{ymd}/view/month/"
SCHOOL_NAME = "Courtland High School"   # set once for all rows

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

def month_urls(start_month: date, end_month: date):
    """Yield (first-of-month date, URL) pairs from start → end inclusive."""
    cur = date(start_month.year, start_month.month, 1)
    end = date(end_month.year, end_month.month, 1)
    while cur <= end:
        yield cur, BASE.format(ymd=cur.strftime("%Y-%m-%d"))
        cur += relativedelta(months=1)

def normalize_sport(title: str) -> str:
    """Reduce event title to base sport name (strip levels/admin words)."""
    sport = title.split(":", 1)[0].strip() if ":" in title else title.strip()
    suffixes = [
        r"\bPractice\b",
        r"\bTryouts?\b",
        r"\bOpen Gym/Conditioning\b",
        r"\bConditioning\b",
        r"\bPictures?\b",
        r"\bMeeting\b",
        r"\bOpen House\b",
        r"\bAthlete/Parent\b",
    ]
    for pat in suffixes:
        sport = re.sub(pat, "", sport, flags=re.IGNORECASE).strip()
    return re.sub(r"\s{2,}", " ", sport)

def scrape_month(url: str, y: int, m: int):
    """Scrape a single Month page → list of dict rows."""
    rows = []
    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    for day_li in soup.select("#calendar .days > li.day"):
        if "other-month" in (day_li.get("class") or []):
            continue

        date_div = day_li.select_one(".date")
        day_txt = date_div.get_text(strip=True) if date_div else ""
        if not day_txt.isdigit():
            continue

        try:
            event_date = date(y, m, int(day_txt)).isoformat()
        except ValueError:
            continue

        for ev in day_li.select(".event"):
            # Skip cancelled items
            cancelled = ev.select_one(".table-data-styles-subtext")
            if cancelled and "(Cancelled)" in cancelled.get_text():
                continue

            time_el = ev.select_one(".event-time")
            time_txt = time_el.get_text(strip=True) if time_el else ""

            title_el = ev.select_one(".event-desc a.show-popup")
            title = title_el.get_text(strip=True) if title_el else ""
            sport = normalize_sport(title)

            rows.append({
                "Sport": sport,
                "Date": event_date,
                "Time": time_txt,
                "School": SCHOOL_NAME,
            })
    return rows

# ---- Run for Aug 2024 → May 2025 ----
start_month = date(2024, 8, 1)
end_month   = date(2025, 5, 1)

all_rows = []
for month_dt, url in month_urls(start_month, end_month):
    try:
        month_rows = scrape_month(url, month_dt.year, month_dt.month)
        all_rows.extend(month_rows)
        time.sleep(0.5)  # polite pause
    except Exception as e:
        print(f"Warning: failed {month_dt.strftime('%Y-%m')} -> {e}")

courtland_df = pd.DataFrame(all_rows, columns=["Sport", "Date", "Time", "School"])

# Optional: tidy sorting and ISO string dates
if not courtland_df.empty:
    courtland_df["Date"] = pd.to_datetime(courtland_df["Date"], errors="coerce")
    courtland_df = courtland_df.sort_values(["Date", "Time", "Sport"]).reset_index(drop=True)
    courtland_df["Date"] = courtland_df["Date"].dt.date.astype(str)

courtland_df


Unnamed: 0,Sport,Date,Time,School
0,Volleyball Camp,2024-08-01,6:00pm - 8:30pm,Courtland High School
1,American Youth Football (AYF) - Games,2024-08-03,,Courtland High School
2,The Bridge Church - Set up & Movie Night,2024-08-03,10:00am - 8:00pm,Courtland High School
3,American Youth Football (AYF) - Games,2024-08-04,,Courtland High School
4,The Bridge Church,2024-08-04,8:00am - 1:00pm,Courtland High School
...,...,...,...,...
1017,Tennis,2025-05-29,TBD,Courtland High School
1018,Tennis,2025-05-29,TBD,Courtland High School
1019,Soccer,2025-05-30,5:00pm,Courtland High School
1020,Music in the Parks,2025-05-30,7:00am - 2:30pm,Courtland High School


In [37]:


# ---- Config ----
BASE = "https://www.commonwealthdistrictva.org/public/genie/366/school/4/date/{ymd}/view/month/"
SCHOOL_NAME = "Massaponax High School"   # set once for all rows

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

def month_urls(start_month: date, end_month: date):
    """Yield (first-of-month date, URL) pairs from start → end inclusive."""
    cur = date(start_month.year, start_month.month, 1)
    end = date(end_month.year, end_month.month, 1)
    while cur <= end:
        yield cur, BASE.format(ymd=cur.strftime("%Y-%m-%d"))
        cur += relativedelta(months=1)

def normalize_sport(title: str) -> str:
    """Reduce event title to base sport name (strip levels/admin words)."""
    sport = title.split(":", 1)[0].strip() if ":" in title else title.strip()
    suffixes = [
        r"\bPractice\b",
        r"\bTryouts?\b",
        r"\bOpen Gym/Conditioning\b",
        r"\bConditioning\b",
        r"\bPictures?\b",
        r"\bMeeting\b",
        r"\bOpen House\b",
        r"\bAthlete/Parent\b",
    ]
    for pat in suffixes:
        sport = re.sub(pat, "", sport, flags=re.IGNORECASE).strip()
    return re.sub(r"\s{2,}", " ", sport)

def scrape_month(url: str, y: int, m: int):
    """Scrape a single Month page → list of dict rows."""
    rows = []
    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    for day_li in soup.select("#calendar .days > li.day"):
        if "other-month" in (day_li.get("class") or []):
            continue

        date_div = day_li.select_one(".date")
        day_txt = date_div.get_text(strip=True) if date_div else ""
        if not day_txt.isdigit():
            continue

        try:
            event_date = date(y, m, int(day_txt)).isoformat()
        except ValueError:
            continue

        for ev in day_li.select(".event"):
            # Skip cancelled items
            cancelled = ev.select_one(".table-data-styles-subtext")
            if cancelled and "(Cancelled)" in cancelled.get_text():
                continue

            time_el = ev.select_one(".event-time")
            time_txt = time_el.get_text(strip=True) if time_el else ""

            title_el = ev.select_one(".event-desc a.show-popup")
            title = title_el.get_text(strip=True) if title_el else ""
            sport = normalize_sport(title)

            rows.append({
                "Sport": sport,
                "Date": event_date,
                "Time": time_txt,
                "School": SCHOOL_NAME,
            })
    return rows

# ---- Run for Aug 2024 → May 2025 ----
start_month = date(2024, 8, 1)
end_month   = date(2025, 5, 1)

all_rows = []
for month_dt, url in month_urls(start_month, end_month):
    try:
        month_rows = scrape_month(url, month_dt.year, month_dt.month)
        all_rows.extend(month_rows)
        time.sleep(0.5)  # polite pause
    except Exception as e:
        print(f"Warning: failed {month_dt.strftime('%Y-%m')} -> {e}")

mass_df = pd.DataFrame(all_rows, columns=["Sport", "Date", "Time", "School"])

# Optional: tidy sorting and ISO string dates
if not mass_df.empty:
    mass_df["Date"] = pd.to_datetime(mass_df["Date"], errors="coerce")
    mass_df = mass_df.sort_values(["Date", "Time", "Sport"]).reset_index(drop=True)
    mass_df["Date"] = mass_df["Date"].dt.date.astype(str)

mass_df


Unnamed: 0,Sport,Date,Time,School
0,COMPETITION CHEER - Start Date,2024-08-01,5:30pm,Massaponax High School
1,FOOTBALL - Start Date,2024-08-01,6:00am,Massaponax High School
2,Golf,2024-08-05,1:00pm,Massaponax High School
3,VOLLEYBALL - Start Date,2024-08-05,4:15pm,Massaponax High School
4,CROSS COUNTRY - Start Date,2024-08-05,5:30pm,Massaponax High School
...,...,...,...,...
797,Softball,2025-05-29,6:00pm,Massaponax High School
798,Outdoor Track,2025-05-29,TBD,Massaponax High School
799,Outdoor Track,2025-05-29,TBD,Massaponax High School
800,Outdoor Track,2025-05-30,TBD,Massaponax High School


In [41]:
# ---- Config ----
BASE = "https://www.commonwealthdistrictva.org/public/genie/366/school/8/date/{ymd}/view/month/"
SCHOOL_NAME = "Riverbend High School"   # set once for all rows

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

def month_urls(start_month: date, end_month: date):
    """Yield (first-of-month date, URL) pairs from start → end inclusive."""
    cur = date(start_month.year, start_month.month, 1)
    end = date(end_month.year, end_month.month, 1)
    while cur <= end:
        yield cur, BASE.format(ymd=cur.strftime("%Y-%m-%d"))
        cur += relativedelta(months=1)

def normalize_sport(title: str) -> str:
    """Reduce event title to base sport name (strip levels/admin words)."""
    sport = title.split(":", 1)[0].strip() if ":" in title else title.strip()
    suffixes = [
        r"\bPractice\b",
        r"\bTryouts?\b",
        r"\bOpen Gym/Conditioning\b",
        r"\bConditioning\b",
        r"\bPictures?\b",
        r"\bMeeting\b",
        r"\bOpen House\b",
        r"\bAthlete/Parent\b",
    ]
    for pat in suffixes:
        sport = re.sub(pat, "", sport, flags=re.IGNORECASE).strip()
    return re.sub(r"\s{2,}", " ", sport)

def scrape_month(url: str, y: int, m: int):
    """Scrape a single Month page → list of dict rows."""
    rows = []
    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    for day_li in soup.select("#calendar .days > li.day"):
        if "other-month" in (day_li.get("class") or []):
            continue

        date_div = day_li.select_one(".date")
        day_txt = date_div.get_text(strip=True) if date_div else ""
        if not day_txt.isdigit():
            continue

        try:
            event_date = date(y, m, int(day_txt)).isoformat()
        except ValueError:
            continue

        for ev in day_li.select(".event"):
            # Skip cancelled items
            cancelled = ev.select_one(".table-data-styles-subtext")
            if cancelled and "(Cancelled)" in cancelled.get_text():
                continue

            time_el = ev.select_one(".event-time")
            time_txt = time_el.get_text(strip=True) if time_el else ""

            title_el = ev.select_one(".event-desc a.show-popup")
            title = title_el.get_text(strip=True) if title_el else ""
            sport = normalize_sport(title)

            rows.append({
                "Sport": sport,
                "Date": event_date,
                "Time": time_txt,
                "School": SCHOOL_NAME,
            })
    return rows

# ---- Run for Aug 2024 → May 2025 ----
start_month = date(2024, 8, 1)
end_month   = date(2025, 5, 1)

all_rows = []
for month_dt, url in month_urls(start_month, end_month):
    try:
        month_rows = scrape_month(url, month_dt.year, month_dt.month)
        all_rows.extend(month_rows)
        time.sleep(0.5)  # polite pause
    except Exception as e:
        print(f"Warning: failed {month_dt.strftime('%Y-%m')} -> {e}")

riverbend_df = pd.DataFrame(all_rows, columns=["Sport", "Date", "Time", "School"])

# Optional: tidy sorting and ISO string dates
if not riverbend_df.empty:
    riverbend_df["Date"] = pd.to_datetime(riverbend_df["Date"], errors="coerce")
    riverbend_df = riverbend_df.sort_values(["Date", "Time", "Sport"]).reset_index(drop=True)
    riverbend_df["Date"] = riverbend_df["Date"].dt.date.astype(str)

riverbend_df

Unnamed: 0,Sport,Date,Time,School
0,Golf,2024-08-05,TBD,Riverbend High School
1,Golf,2024-08-07,TBD,Riverbend High School
2,Golf,2024-08-14,4:00pm,Riverbend High School
3,Field Hockey,2024-08-14,5:00pm,Riverbend High School
4,Field Hockey,2024-08-14,6:30pm,Riverbend High School
...,...,...,...,...
648,Soccer,2025-05-29,TBD,Riverbend High School
649,Tennis,2025-05-30,8:30am,Riverbend High School
650,Lacrosse,2025-05-30,TBD,Riverbend High School
651,Lacrosse,2025-05-30,TBD,Riverbend High School


In [43]:

# Combine all five dataframes
sport_calendar_df = pd.concat(
    [riverbend_df, mass_df, courtland_df, chancellor_df, spotsy_df],
    ignore_index=True
)

# Optional: check the shape and preview
print(sport_calendar_df.shape)
print(sport_calendar_df.head())


(5026, 4)
          Sport        Date    Time                 School
0          Golf  2024-08-05     TBD  Riverbend High School
1          Golf  2024-08-07     TBD  Riverbend High School
2          Golf  2024-08-14  4:00pm  Riverbend High School
3  Field Hockey  2024-08-14  5:00pm  Riverbend High School
4  Field Hockey  2024-08-14  6:30pm  Riverbend High School


In [47]:
# Make sure the 'date' column is in datetime format
sport_calendar_df['Date'] = pd.to_datetime(sport_calendar_df['Date'])

# Create a new column with the day of the week (e.g., Monday, Tuesday, etc.)
sport_calendar_df['Weekday'] = sport_calendar_df['Date'].dt.day_name()

# Preview
print(sport_calendar_df[['Date', 'Weekday']].head())


        Date    Weekday
0 2024-08-05     Monday
1 2024-08-07  Wednesday
2 2024-08-14  Wednesday
3 2024-08-14  Wednesday
4 2024-08-14  Wednesday


In [49]:
sport_calendar_df['Sport'].unique()

array(['Golf', 'Field Hockey', 'Football', 'Volleyball',
       'Sideline Cheer Fall', 'Cross Country', 'Competition Cheer',
       'Scholastic Bowl', 'Basketball', 'Wrestling', 'Swim & Dive',
       'Indoor Track', 'Speech', 'Lacrosse', 'Tennis', 'Softball',
       'Baseball', 'Soccer', 'Outdoor Track',
       'COMPETITION CHEER - Start Date', 'FOOTBALL - Start Date',
       'VOLLEYBALL - Start Date', 'CROSS COUNTRY - Start Date',
       'FIELD HOCKEY - Start Date', 'FCS', 'SATs', 'Esports Fall',
       'Track Camp', 'All County Chorus', 'Parent/Teacher Conferences',
       'VBODA Marching Band Assessment', 'Fall Concert',
       'Middle School Football Game', 'Regional Orchestra',
       'Boys Basketball -', 'Swim -', 'Indoor Track -',
       'Girls Basketball -', 'Wrestling -',
       'VAYFA - Youth Football Games', 'Indoor Percussion',
       'Fall Band Performance', 'Pics', 'TMS Choir Concert',
       'Virginia Jags - Youth Football Games', 'Mus', 'TMS Band Concert',
       'Orche

In [57]:
# List of allowed sport keywords (substrings)
allowed_sports = [
    "Golf", "Field Hockey", "Football", "Volleyball", "Sideline Cheer Fall", 
    "Cross Country", "Competition Cheer", "Basketball", "Wrestling", 
    "Swim & Dive", "Indoor Track", "Lacrosse", "Tennis", "Softball", 
    "Baseball", "Soccer", "Outdoor Track", "Swim", 
    "Girls Soccer", "Girls Basketball", "Girls Lacrosse", 
    "Boys Soccer", "Boys Lacrosse", "Boys Basketball"
]

# Build regex pattern from keywords
pattern = '|'.join(allowed_sports)

# Keep only rows where Sport contains one of the allowed substrings
sport_calendar_df = sport_calendar_df[sport_calendar_df['Sport'].str.contains(pattern, case=False, na=False)]

# Reset index for cleanliness
sport_calendar_df = sport_calendar_df.reset_index(drop=True)

print(sport_calendar_df['Sport'].unique())



['Golf' 'Field Hockey' 'Football' 'Volleyball' 'Sideline Cheer Fall'
 'Cross Country' 'Competition Cheer' 'Basketball' 'Wrestling'
 'Swim & Dive' 'Indoor Track' 'Lacrosse' 'Tennis' 'Softball' 'Baseball'
 'Soccer' 'Outdoor Track' 'Boys Basketball' 'Boys Soccer'
 'Girls Basketball' 'Swim' 'Girls Soccer']


In [59]:
sport_calendar_df

Unnamed: 0,Sport,Date,Time,School,day_of_week,Weekday
0,Golf,2024-08-05,TBD,Riverbend High School,Monday,Monday
1,Golf,2024-08-07,TBD,Riverbend High School,Wednesday,Wednesday
2,Golf,2024-08-14,4:00pm,Riverbend High School,Wednesday,Wednesday
3,Field Hockey,2024-08-14,5:00pm,Riverbend High School,Wednesday,Wednesday
4,Field Hockey,2024-08-14,6:30pm,Riverbend High School,Wednesday,Wednesday
...,...,...,...,...,...,...
3943,Softball,2025-05-28,6:00pm,Spotsylvania High School,Wednesday,Wednesday
3944,Softball,2025-05-29,6:00pm,Spotsylvania High School,Thursday,Thursday
3945,Baseball,2025-05-30,2:00pm,Spotsylvania High School,Friday,Friday
3946,Soccer,2025-05-30,2:00pm,Spotsylvania High School,Friday,Friday


In [123]:
import re
import numpy as np
import pandas as pd

# --- 1) Average roster sizes (edit if you want different numbers) ---
avg_roster_sizes = {
    "Football Boys": 50,
    "Cheerleading Girls": 20,          # used for Sideline/Competition Cheer too
    "Field Hockey Girls": 20,
    "Soccer Boys": 20, "Soccer Girls": 20,
    "Cross Country Boys": 25, "Cross Country Girls": 25,
    "Swimming Boys": 20, "Swimming Girls": 20,  # swim & dive mapped here
    "Golf Boys": 8, "Golf Girls": 8,
    "Volleyball Boys": 12, "Volleyball Girls": 12,
    "Basketball Boys": 13, "Basketball Girls": 13,
    "Wrestling Boys": 25,
    "Baseball Boys": 16,
    "Softball Girls": 16,
    "Tennis Boys": 10, "Tennis Girls": 10,
    "Track & Field Boys": 40, "Track & Field Girls": 40,
    "Lacrosse Boys": 25, "Lacrosse Girls": 25,
}

# --- 2) Helpers: normalize text & detect gender tokens ---
def _norm(s: str) -> str:
    s = str(s).lower()
    s = s.replace("&", " and ")
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def _detect_gender(norm_text: str):
    if re.search(r"\bboys?\b|\bboy'?s\b", norm_text):
        return "Boys"
    if re.search(r"\bgirls?\b|\bgirl'?s\b", norm_text):
        return "Girls"
    return None  # unspecified/coed

# --- 3) Patterns to recognize sports (add synonyms as needed) ---
SPORT_PATTERNS = [
    (r"\bfootball\b",                        "Football",      "boys"),
    (r"\bfield\s*hockey\b",                  "Field Hockey",  "girls"),
    (r"\bvolley\s*ball\b|\bvolleyball\b",    "Volleyball",    "detect"),
    (r"\bcheer(leading)?\b|sideline\s*cheer|competition\s*cheer", "Cheerleading", "girls"),
    (r"\bwrestling\b|\bwrestle\b",           "Wrestling",     "boys"),
    (r"\bcross\s*country\b|\bxc\b|\bx\s*country\b", "Cross Country", "detect"),
    (r"\bgolf\b",                             "Golf",          "detect"),
    (r"\bbasket\s*ball\b|\bbasketball\b",     "Basketball",    "detect"),
    (r"\bswim(?:ming)?(?:\s*and\s*dive)?\b|\bswim\s*and\s*dive\b|\bdive\b", "Swimming", "detect"),
    (r"\btennis\b",                           "Tennis",        "detect"),
    (r"\blacrosse\b|\blax\b",                 "Lacrosse",      "detect"),
    (r"\bbase\s*ball\b|\bbaseball\b",         "Baseball",      "boys"),
    (r"\bsoft\s*ball\b|\bsoftball\b",         "Softball",      "girls"),
    (r"\bsoc(?:cer)?\b|\bsoccer\b",           "Soccer",        "detect"),
    (r"\bindoor\s*track\b",                   "Track & Field", "detect"),
    (r"\boutdoor\s*track\b",                  "Track & Field", "detect"),
    (r"\btrack\b|\btrack\s*and\s*field\b|\bt\s*&\s*f\b", "Track & Field", "detect"),
]

# --- 4) Canonicalize one row to "Sport Gender" (or BOTH) ---
def infer_canonical_sport(event_name: str) -> str | None:
    s = _norm(event_name)
    gender = _detect_gender(s)  # "Boys" / "Girls" / None

    for pat, base, grule in SPORT_PATTERNS:
        if re.search(pat, s):
            # sports that are inherently gendered by name:
            if base == "Baseball":
                return "Baseball Boys"
            if base == "Softball":
                return "Softball Girls"
            if base == "Field Hockey":
                return "Field Hockey Girls"

            # volleyball sometimes has boys teams; honor explicit gender if present
            if base == "Volleyball" and gender in ("Boys", "Girls"):
                return f"Volleyball {gender}"

            # cheer variants → Cheerleading Girls
            if base == "Cheerleading":
                return "Cheerleading Girls"

            # sports where gender can be detected or coed:
            if grule == "boys":
                return f"{base} Boys"
            if grule == "girls":
                return f"{base} Girls"
            if grule == "detect" and gender in ("Boys", "Girls"):
                return f"{base} {gender}"

            # If still unspecified/coed, count BOTH genders where applicable:
            if base in {"Soccer","Basketball","Lacrosse","Tennis","Golf",
                        "Cross Country","Swimming","Track & Field","Volleyball"}:
                return f"{base} BOTH"

            # default fallbacks for single-gender sports:
            if base in {"Football","Wrestling"}:
                return f"{base} Boys"

            # otherwise give up
            return None

    return None  # no sport match found

# --- 5) Add canonical sport + participants to your dataframe ---
# Assumes your combined df is named `sport_calendar_df` and has a column with event titles.
# If your sport title column is not exactly 'Sport', change the name below.
title_col = "Sport"

sport_calendar_df["canonical_sport"] = sport_calendar_df[title_col].apply(infer_canonical_sport)

def participants_from_canonical(canon: str | None) -> float:
    if not canon:
        return np.nan
    if canon.endswith(" BOTH"):
        base = canon.replace(" BOTH", "")
        m = avg_roster_sizes.get(f"{base} Boys", 0)
        f = avg_roster_sizes.get(f"{base} Girls", 0)
        val = (m if m else 0) + (f if f else 0)
        return np.nan if val == 0 else val
    return avg_roster_sizes.get(canon, np.nan)

sport_calendar_df["participants"] = sport_calendar_df["canonical_sport"].apply(participants_from_canonical)

# --- 6) (Optional) See what didn’t match so you can refine patterns quickly ---
unmatched = (
    sport_calendar_df[sport_calendar_df["participants"].isna()][title_col]
    .value_counts()
    .head(30)
)
print("Top unmatched event titles (sample):\n", unmatched)

# --- 7) (Optional) Get busiest days of week by weighted participants ---
# Make sure you already created the 'day_of_week' column as earlier.
busiest = (
    sport_calendar_df.dropna(subset=["participants"])
    .groupby("Weekday", sort=False)["participants"]
    .sum()
    .sort_values(ascending=False)
)
print("\nBusiest days (weighted by participants):\n", busiest)



Top unmatched event titles (sample):
 Series([], Name: count, dtype: int64)

Busiest days (weighted by participants):
 Weekday
Tuesday      24925
Wednesday    23361
Thursday     22837
Friday       22287
Monday       18991
Saturday     11231
Sunday         200
Name: participants, dtype: int64


In [125]:
# Count events per weekday
weekday_counts = sport_calendar_df['Weekday'].value_counts().sort_index()

print(weekday_counts)


Weekday
Friday       694
Monday       659
Saturday     279
Sunday         5
Thursday     779
Tuesday      875
Wednesday    657
Name: count, dtype: int64


In [127]:
# Count sports on Wednesday
wed_sports = (
    sport_calendar_df[sport_calendar_df['Weekday'] == "Wednesday"]
    ['Sport'].value_counts()
)

print("Sports on Wednesday:\n", wed_sports)


Sports on Wednesday:
 Sport
Football               69
Cross Country          64
Tennis                 54
Outdoor Track          52
Baseball               51
Soccer                 37
Lacrosse               34
Boys Basketball        33
Wrestling              32
Softball               31
Golf                   31
Basketball             29
Field Hockey           27
Volleyball             26
Indoor Track           24
Girls Soccer           17
Boys Soccer            15
Girls Basketball       12
Swim                    9
Competition Cheer       4
Swim & Dive             3
Sideline Cheer Fall     3
Name: count, dtype: int64


In [129]:
# Count sports on Wednesday
thurs_sports = (
    sport_calendar_df[sport_calendar_df['Weekday'] == "Thursday"]
    ['Sport'].value_counts()
)

print("Sports on Thursday:\n", thurs_sports)

Sports on Thursday:
 Sport
Soccer                 116
Basketball              85
Volleyball              83
Field Hockey            79
Baseball                70
Tennis                  69
Football                46
Softball                39
Girls Basketball        31
Lacrosse                28
Golf                    20
Wrestling               20
Indoor Track            16
Cross Country           15
Swim & Dive             12
Outdoor Track           12
Girls Soccer            11
Boys Soccer             10
Swim                     9
Boys Basketball          6
Sideline Cheer Fall      2
Name: count, dtype: int64


In [131]:
# Filter for Lacrosse events
lax_events = sport_calendar_df[sport_calendar_df['Sport'].str.contains("Lacrosse", case=False, na=False)]

# Get counts of which weekdays Lacrosse events fall on
lax_by_day = lax_events['Weekday'].value_counts().reindex(
    ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"],
    fill_value=0
)

print(lax_by_day)


Weekday
Monday       31
Tuesday      19
Wednesday    34
Thursday     28
Friday       11
Saturday      0
Sunday        0
Name: count, dtype: int64


In [133]:
print(sport_calendar_df)

             Sport       Date    Time                    School    Weekday  \
0             Golf 2024-08-05     TBD     Riverbend High School     Monday   
1             Golf 2024-08-07     TBD     Riverbend High School  Wednesday   
2             Golf 2024-08-14  4:00pm     Riverbend High School  Wednesday   
3     Field Hockey 2024-08-14  5:00pm     Riverbend High School  Wednesday   
4     Field Hockey 2024-08-14  6:30pm     Riverbend High School  Wednesday   
...            ...        ...     ...                       ...        ...   
3943      Softball 2025-05-28  6:00pm  Spotsylvania High School  Wednesday   
3944      Softball 2025-05-29  6:00pm  Spotsylvania High School   Thursday   
3945      Baseball 2025-05-30  2:00pm  Spotsylvania High School     Friday   
3946        Soccer 2025-05-30  2:00pm  Spotsylvania High School     Friday   
3947      Softball 2025-05-30  4:00pm  Spotsylvania High School     Friday   

      participants     canonical_sport  
0               16    

In [141]:
import pandas as pd

def keep_evenings_and_tbd(df):
    # Copy so we don't overwrite original
    df = sport_calendar_df.copy()

    # Normalize column to string
    df['Time'] = df['Time'].astype(str).str.strip()

    # Identify TBD rows
    is_tbd = df['Time'].str.contains("TBD", case=False, na=False)

    # Convert to datetime (coerce errors so "TBD" -> NaT)
    times = pd.to_datetime(df.loc[~is_tbd, 'Time'], format='%I:%M %p', errors='coerce')

    # Keep if time > 4:00 PM or if it's TBD
    mask = is_tbd | (times.dt.hour > 16) | ((times.dt.hour == 16) & (times.dt.minute > 0))

    return df[mask].reset_index(drop=True)

# Apply to your combined dataset
evening_df = keep_evenings_and_tbd(sport_calendar_df)

print(evening_df[['Date', 'Time', 'Sport']].head())



        Date Time       Sport
0 2024-08-05  TBD        Golf
1 2024-08-07  TBD        Golf
2 2024-09-07  TBD  Volleyball
3 2024-09-25  TBD        Golf
4 2024-10-02  TBD        Golf


In [143]:
# Count NaNs per column
missing_counts = evening_df.isna().sum()

print(missing_counts)


Sport              0
Date               0
Time               0
School             0
Weekday            0
participants       0
canonical_sport    0
dtype: int64


In [145]:
# Remove any rows where Time contains "TBD"
sport_calendar_df = sport_calendar_df[~sport_calendar_df['Time'].str.contains("TBD", case=False, na=False)]

# Reset index for cleanliness
sport_calendar_df = sport_calendar_df.reset_index(drop=True)

print(sport_calendar_df.head())


          Sport       Date    Time                 School    Weekday  \
0          Golf 2024-08-14  4:00pm  Riverbend High School  Wednesday   
1  Field Hockey 2024-08-14  5:00pm  Riverbend High School  Wednesday   
2  Field Hockey 2024-08-14  6:30pm  Riverbend High School  Wednesday   
3  Field Hockey 2024-08-14  6:30pm  Riverbend High School  Wednesday   
4          Golf 2024-08-15  4:00pm  Riverbend High School   Thursday   

   participants     canonical_sport  
0            16           Golf BOTH  
1            20  Field Hockey Girls  
2            20  Field Hockey Girls  
3            20  Field Hockey Girls  
4            16           Golf BOTH  


In [147]:
sport_calendar_df["participants"] = sport_calendar_df["canonical_sport"].apply(participants_from_canonical)

# --- 6) (Optional) See what didn’t match so you can refine patterns quickly ---
unmatched = (
    sport_calendar_df[sport_calendar_df["participants"].isna()][title_col]
    .value_counts()
    .head(30)
)
print("Top unmatched event titles (sample):\n", unmatched)

# --- 7) (Optional) Get busiest days of week by weighted participants ---
# Make sure you already created the 'day_of_week' column as earlier.
busiest = (
    sport_calendar_df.dropna(subset=["participants"])
    .groupby("Weekday", sort=False)["participants"]
    .sum()
    .sort_values(ascending=False)
)
print("\nBusiest days (weighted by participants):\n", busiest)


Top unmatched event titles (sample):
 Series([], Name: count, dtype: int64)

Busiest days (weighted by participants):
 Weekday
Tuesday      24460
Wednesday    22304
Thursday     22162
Friday       21340
Monday       18622
Saturday     10283
Sunday         200
Name: participants, dtype: int64
