<a href="https://colab.research.google.com/github/joeygl/gaasite/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# GAA.ie Match Scorer Scraper for Google Colab
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Root of the site being scraped; the fixtures/results URL is built from it.
BASE = "https://www.gaa.ie"
# Headers sent with every HTTP request made by this notebook.
# NOTE(review): presumably a browser-like User-Agent is set to avoid the
# default client being rejected -- confirm against the site's behaviour.
HEADERS = {'User-Agent': 'Mozilla/5.0'}

def get_recent_match_urls(limit=30):
    """Collect football match/article URLs linked from the fixtures-results page.

    Downloads ``{BASE}/fixtures-results`` and scans every ``<a href>`` in
    document order, keeping links whose href contains ``/football/`` together
    with either ``/match/`` or ``/article/``.

    Args:
        limit: Maximum number of URLs to return. A value of zero or less
            returns an empty list without making a request.

    Returns:
        A list of absolute URLs, de-duplicated, in the order they first appear
        on the page, containing at most ``limit`` entries.

    Raises:
        requests.RequestException: If the index page cannot be fetched, times
            out, or responds with an HTTP error status.
    """
    if limit <= 0:
        return []

    url = f"{BASE}/fixtures-results"
    # Same timeout as parse_match_page so a stalled connection cannot hang the run.
    r = requests.get(url, headers=HEADERS, timeout=15)
    # Fail loudly on 4xx/5xx rather than reporting "0 matches" for an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    links = []
    seen = set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "/football/" not in href:
            continue
        if "/match/" not in href and "/article/" not in href:
            continue
        # urljoin resolves site-relative, path-relative and protocol-relative
        # hrefs, and leaves already-absolute URLs untouched.
        full_url = urljoin(url, href)
        if full_url in seen:
            continue
        seen.add(full_url)
        links.append(full_url)
        if len(links) >= limit:
            break
    return links

def parse_match_page(url):
    """Scrape one match/article page and return its per-player scoring rows.

    The page title comes from the first ``<h1>`` (falling back to ``url`` when
    it is missing or empty), the date from the first ``<time datetime=...>``
    tag (first 10 characters of the attribute), and the two team names from a
    ``"TeamA v TeamB"`` / ``"TeamA vs TeamB"`` pattern in the title. Scorers
    are found by searching the page's visible text for ``<Name> <goals>-<points>``
    tallies, optionally followed directly by ``f``.

    Args:
        url: Absolute URL of the page to scrape.

    Returns:
        A list of dicts, one per matched tally, with keys ``match_date``,
        ``match_title``, ``teamA``, ``teamB``, ``player``, ``goals``,
        ``points``, ``is_free`` and ``match_url``. Returns an empty list when
        nothing matches or when any error occurs (the error is printed, not
        raised, so one bad page does not abort a batch run).

    Limitations:
        The team pattern only recognises single-word names. The name part of
        the scorer pattern allows spaces, so it can absorb preceding
        capitalised words and will also match team score lines.
    """
    team_pattern = r'([A-Z][a-z]+)\s+v[s]?\s+([A-Z][a-z]+)'
    # The name must start with an upper-case letter. Accented Latin letters
    # (e.g. "Seán", "Ó Sé") and the typographic apostrophe are accepted so
    # Irish names are not truncated or dropped.
    scorer_pattern = (
        r"([A-ZÀ-ÖØ-Þ][A-Za-zÀ-ÖØ-öø-ÿ'’.\- ]+)"
        r"\s+(\d+)[–-](\d+)(f?)"
    )
    try:
        r = requests.get(url, headers=HEADERS, timeout=15)
        # Do not scrape "scorers" out of a 404/500 error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        heading = soup.find("h1")
        title = (heading.get_text(strip=True) if heading is not None else "") or url
        text = soup.get_text(" ", strip=True)

        # First <time> tag carrying a datetime attribute; keep its first 10 characters.
        date = next(
            (tag["datetime"][:10] for tag in soup.find_all("time")
             if tag.has_attr("datetime")),
            "",
        )

        teams = re.findall(team_pattern, title)
        teamA, teamB = teams[0] if teams else ("", "")

        return [
            {
                "match_date": date,
                "match_title": title,
                "teamA": teamA,
                "teamB": teamB,
                "player": player.strip(),
                "goals": int(goals),
                "points": int(points),
                "is_free": bool(is_free),
                "match_url": url,
            }
            for player, goals, points, is_free in re.findall(scorer_pattern, text)
        ]
    except Exception as e:
        # Deliberate best-effort: report the failure and continue with other pages.
        print(f"⚠️ Error in {url}: {e}")
        return []

# ---- MAIN WORKFLOW ----
# Tunable settings for the scrape.
MATCH_LIMIT = 40              # maximum number of match/article pages to visit
REQUEST_DELAY_SECONDS = 0.5   # pause between page requests
# Fixed column order for the export, so the CSV keeps its header row even
# when no scorer rows were extracted.
SCORER_COLUMNS = [
    "match_date", "match_title", "teamA", "teamB",
    "player", "goals", "points", "is_free", "match_url",
]

all_match_rows = []
match_urls = get_recent_match_urls(limit=MATCH_LIMIT)
print(f"🟢 Found {len(match_urls)} recent matches to process.")
for i, url in enumerate(match_urls):
    print(f"  [{i+1}/{len(match_urls)}] Scraping {url}")
    match_rows = parse_match_page(url)
    all_match_rows.extend(match_rows)
    time.sleep(REQUEST_DELAY_SECONDS)

# Build the table once from the accumulated records.
df = pd.DataFrame(all_match_rows, columns=SCORER_COLUMNS)
if df.empty:
    # Make an empty result obvious instead of silently exporting a blank file.
    print("⚠️ No scorer rows were extracted; the exported files contain no data.")
df.to_csv("gaa_match_player_scorers.csv", index=False)
df.to_json("gaa_match_player_scorers.json", orient="records")
print(f"\n✅ Saved {len(df)} player-by-match scoring rows.")

# ---- PROVIDE DOWNLOAD LINK ----
# google.colab exists only inside Colab; importing it lazily and guarding the
# import keeps the notebook runnable in other Jupyter environments.
try:
    from google.colab import files
except ImportError:
    print("ℹ️ Not running in Google Colab; files were saved to the working directory.")
else:
    files.download("gaa_match_player_scorers.csv")


🟢 Found 0 recent matches to process.

✅ Saved 0 player-by-match scoring rows.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>