In [1]:
import requests
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from typing import Dict, Tuple, List

# 0. WorldCup Data Parser (No need to check.)

In [None]:
# -*- coding: utf-8 -*-
import re
import json
import requests
import pandas as pd
from pathlib import Path
from typing import Optional, Tuple

# =========================
# 설정
# =========================
OUT_DIR = Path("../data/soccer/worldcup")
OUT_DIR.mkdir(parents=True, exist_ok=True)

YEAR_MIN, YEAR_MAX = 2006, 2022
GITHUB_API_ROOT = "https://api.github.com/repos/openfootball/worldcup/contents/"
RAW_ROOT = "https://raw.githubusercontent.com/openfootball/worldcup/master"

# =========================
# 점수 블롭 파서
# =========================
score_pair = re.compile(r"(\d+)\s*-\s*(\d+)")                 # e.g., 2-1
ht_paren   = re.compile(r"\(([^)]+)\)")                        # e.g., (1-0) or (2-2, 2-0)
has_pen    = re.compile(r"\bpen\.?\b", re.I)                   # pen. / PEN.
has_aet    = re.compile(r"\ba\.?e\.?t\.?\b", re.I)             # a.e.t. / aet

def parse_score_blob(blob: str) -> dict:
    """
    blob 예:
      - '2-1 (1-0)'
      - '4-2 pen. 3-3 a.e.t. (2-2, 2-0)'
      - '3-0 pen. 0-0 a.e.t. (0-0, 0-0)'
      - '1-3 pen. 1-1 a.e.t (1-1, 1-0)'
    반환: dict (ft_hg, ft_ag, aet_hg/ag, pen_hg/ag, ht_raw)
    """
    out = {
        "ft_home": None, "ft_away": None,
        "aet_home": None, "aet_away": None,
        "pen_home": None, "pen_away": None,
        "ht_raw": None
    }
    b = blob.strip()

    # 괄호 안 상세(전반/추가정보) 원문 보존
    m_ht = ht_paren.search(b)
    if m_ht:
        out["ht_raw"] = m_ht.group(1).strip()
        b = ht_paren.sub(" ", b).strip()

    # 승부차기 & 연장전 여부
    pen_flag = bool(has_pen.search(b))
    aet_flag = bool(has_aet.search(b))

    # 숫자쌍 전부 긁어서 해석 순서 부여
    nums = [tuple(map(int, m.groups())) for m in score_pair.finditer(b)]

    if pen_flag and len(nums) >= 2:
        out["pen_home"], out["pen_away"] = nums[0]
        out["aet_home"], out["aet_away"] = nums[1]
    elif aet_flag and len(nums) >= 1:
        out["aet_home"], out["aet_away"] = nums[0]
    elif len(nums) >= 1:
        out["ft_home"], out["ft_away"] = nums[0]

    # FT가 비어 있고 AET만 있으면 FT= AET로 보완(요약 목적)
    if out["ft_home"] is None and out["aet_home"] is not None:
        out["ft_home"], out["ft_away"] = out["aet_home"], out["aet_away"]

    return out

# =========================
# 매치 라인 파서 (time 선택적)
# =========================
# ▶ time을 옵션으로 변경: (?: ... )?  ← 2006년 이전 포맷 대응
pre_head = re.compile(
    r"^\((?P<no>\d+)\)\s+"
    r"(?P<dow>[A-Za-z]{3})\s+"
    r"(?P<md>[A-Za-z]{3}/\d{1,2})"
    r"(?:\s+(?P<time>\d{1,2}:\d{2}))?\s+"
)

def parse_match_line(line: str) -> Optional[dict]:
    m = pre_head.match(line)
    if not m:
        return None
    info = m.groupdict()
    rest = line[m.end():].rstrip()

    # venue 위치(@) 찾기
    at_idx = rest.rfind(" @ ")
    if at_idx == -1:
        return None
    before_at = rest[:at_idx].rstrip()
    venue_raw = rest[at_idx + 3:].strip()

    # 첫 점수쌍 위치
    m_score = score_pair.search(before_at)
    if not m_score:
        return None
    start = m_score.start()

    home = before_at[:start].strip()
    tail = before_at[start:].strip()

    # 점수블롭/원정 분리(공백 폭 가변 대응)
    parts = re.split(r"\s{2,}", tail)
    if len(parts) == 1:
        last_sc = list(score_pair.finditer(tail))[-1]
        score_blob = tail[:last_sc.end()].strip()
        away = tail[last_sc.end():].strip()
    else:
        score_blob = " ".join(parts[:-1]).strip()
        away = parts[-1].strip()

    # venue를 stadium / city로 분리
    stadium, city = venue_raw, None
    if "," in venue_raw:
        stadium, city = [s.strip() for s in venue_raw.split(",", 1)]

    score = parse_score_blob(score_blob)

    return {
        "match_no": int(info["no"]),
        "dow": info["dow"],
        "month_day": info["md"],                        # 예: Jun/9
        "time_local": info.get("time") or None,         # 시간 없으면 None
        "home": re.sub(r"\s{2,}", " ", home),
        "away": re.sub(r"\s{2,}", " ", away),
        "stadium": stadium,
        "city": city,
        "score_blob": score_blob,
        **score,
    }

# =========================
# TXT 파서(그룹/토너먼트 공용)
# =========================
def parse_worldcup_txt(url: str, section_kind="group", year: int = None) -> pd.DataFrame:
    """
    section_kind:
      'group'    -> 'Group A' ... 섹션 사용
      'knockout' -> 'Round of 16', 'Quarter-finals', 'Semi-finals', ...
    """
    r = requests.get(url, timeout=30, headers={"User-Agent": "Mozilla/5.0"})
    r.raise_for_status()
    lines = r.text.splitlines()

    sec_label = None
    out = []
    for raw in lines:
        line = raw.strip()
        if not line:
            continue

        # 섹션 헤더
        if section_kind == "group" and line.startswith("Group "):
            # "Group A | ..." 라인 → 섹션명은 "Group A"
            sec_label = " ".join(line.split()[:2])
            continue

        if section_kind == "knockout" and line in {
            "Round of 16", "Quarter-finals", "Semi-finals",
            "Match for third place", "Final"
        }:
            sec_label = line
            continue

        # 매치 라인
        row = parse_match_line(line)
        if row:
            row["section"] = sec_label
            row["stage"] = "Group" if section_kind == "group" else "Knockout"
            row["year"] = year
            out.append(row)

    return pd.DataFrame(out)

# =========================
# 디렉토리 목록에서 연도 폴더 수집
# =========================
def list_worldcup_year_dirs() -> list[Tuple[int, str]]:
    resp = requests.get(GITHUB_API_ROOT, timeout=30, headers={"User-Agent": "Mozilla/5.0"})
    resp.raise_for_status()
    items = resp.json()
    years = []
    for it in items:
        if it.get("type") == "dir":
            name = it.get("name", "")
            m = re.match(r"^(\d{4})--", name)
            if m:
                y = int(m.group(1))
                years.append((y, name))
    return sorted(years)  # (year, dir_name)

# =========================
# 날짜 변환 (time optional)
# =========================
def md_to_iso(md: str, year: int, time_local: Optional[str]):
    md2 = md.replace("/", " ")
    t = time_local if time_local else "00:00"  # 시간 없음 → 00:00으로 보완
    return pd.to_datetime(f"{year} {md2} {t}", errors="coerce")

# =========================
# 메인: 1986~2022 전체 파싱
# =========================

year_dirs = list_worldcup_year_dirs()

In [None]:
year_dfs = {y : None for y, dname in year_dirs if y >= YEAR_MIN and y <= YEAR_MAX}

for y, dname in year_dirs:
    all_frames = []
    if y < YEAR_MIN or y > YEAR_MAX:
        continue

    url_group  = f"{RAW_ROOT}/{dname}/cup.txt"
    url_finals = f"{RAW_ROOT}/{dname}/cup_finals.txt"

    # 그룹
    try:
        df_g = parse_worldcup_txt(url_group, section_kind="group", year=y)
        if not df_g.empty:
            all_frames.append(df_g)
            print(f"{y} group parsed: {df_g.shape}")
    except Exception as e:
        print(f"{y} group skip: {e}")

    # 토너먼트
    try:
        df_k = parse_worldcup_txt(url_finals, section_kind="knockout", year=y)
        if not df_k.empty:
            all_frames.append(df_k)
            print(f"{y} finals parsed: {df_k.shape}")
    except Exception as e:
        print(f"{y} finals skip: {e}")

    year_df = pd.concat(all_frames, ignore_index=True)
    year_dfs[y] = year_df

2006 group parsed: (48, 19)


2006 finals parsed: (16, 19)
2010 group parsed: (48, 19)


  year_df = pd.concat(all_frames, ignore_index=True)


2010 finals parsed: (16, 19)
2014 group parsed: (48, 19)
2014 finals parsed: (16, 19)


In [80]:
year_dfs[2022]

Unnamed: 0,match_no,dow,month_day,time_local,home,away,stadium,city,score_blob,ft_home,ft_away,aet_home,aet_away,pen_home,pen_away,ht_raw,section,stage,year
0,1,Fri,Jun/9,,Germany,Costa Rica,Allianz Arena,München,4-2 (2-1),4,2,,,,,2-1,Group A,Group,2006
1,2,Fri,Jun/9,,Poland,Ecuador,Veltins-Arena,Gelsenkirchen,0-2 (0-1),0,2,,,,,0-1,Group A,Group,2006
2,17,Wed,Jun/14,,Germany,Poland,Signal Iduna Park,Dortmund,1-0 (0-0),1,0,,,,,0-0,Group A,Group,2006
3,18,Thu,Jun/15,,Ecuador,Costa Rica,AOL Arena,Hamburg,3-0 (1-0),3,0,,,,,1-0,Group A,Group,2006
4,33,Tue,Jun/20,,Ecuador,Germany,Olympiastadion,Berlin,0-3 (0-2),0,3,,,,,0-2,Group A,Group,2006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,59,Sat,Dec/10,22:00,England,France,Al Bayt Stadium,Al Khor,1-2 (0-1),1,2,,,,,0-1,Quarter-finals,Knockout,2022
316,61,Tue,Dec/13,22:00,Argentina,Croatia,Lusail Iconic Stadium,Lusail,3-0 (2-0),3,0,,,,,2-0,Semi-finals,Knockout,2022
317,62,Wed,Dec/14,22:00,France,Morocco,Al Bayt Stadium,Al Khor,2-0 (1-0),2,0,,,,,1-0,Semi-finals,Knockout,2022
318,63,Sat,Dec/17,18:00,Croatia,Morocco,Khalifa International Stadium,Al Rayyan,2-1 (2-1),2,1,,,,,2-1,Match for third place,Knockout,2022


In [None]:
wc_all = pd.concat(all_frames, ignore_index=True)

# 날짜/시간 파싱
wc_all["datetime_local"] = [
    md_to_iso(md, yr, t) for md, yr, t in zip(
        wc_all["month_day"], wc_all["year"], wc_all["time_local"]
    )
]

wc_all.sort_values(["year", "stage", "section", "match_no"], inplace=True)
wc_all.reset_index(drop=True, inplace=True)

# 저장
wc_all.to_csv(OUT_DIR / "worldcup_1986_2022_matches.csv", index=False)
try:
    wc_all.to_parquet(OUT_DIR / "worldcup_1986_2022_matches.parquet", index=False)
except Exception:
    pass  # pyarrow/fastparquet 미설치시 무시

print("TOTAL:", wc_all.shape)

In [64]:
from parser import parse_worldcup_txt

# Source URL 
URL_GROUP = "https://raw.githubusercontent.com/openfootball/worldcup/master/2006--germany/cup.txt"
URL_FINAL = "https://raw.githubusercontent.com/openfootball/worldcup/master/2006--germany/cup_finals.txt"


# Execution
df_group = parse_worldcup_txt(URL_GROUP, section_kind="group")
df_knock  = parse_worldcup_txt(URL_FINAL, section_kind="knockout")

wc2022 = pd.concat([df_group, df_knock], ignore_index=True).sort_values(["stage", "section", "match_no"])
wc2022.reset_index(drop=True, inplace=True)

print(wc2022.shape)
wc2022.head(12)


KeyError: 'stage'

In [14]:
from pathlib import Path
import requests, json

base_dir = Path("../data/soccer/worldcup")
base_dir.mkdir(parents=True, exist_ok=True)

# ✅ 1) World Cup 2022 JSON (worldcup.json 레포)
url = "https://raw.githubusercontent.com/openfootball/worldcup.json/master/2022/worldcup.json"

# 깃허브가 가끔 HTML 페이지를 돌려줄 때 대비해 UA 설정
r = requests.get(url, timeout=30, headers={"User-Agent": "Mozilla/5.0"})
r.raise_for_status()

# UTF-8 BOM 안전 처리
text = r.text.lstrip("\ufeff")
data = json.loads(text)

# 원본 JSON 저장
dest = base_dir / "world-cup-2022.json"
dest.write_text(text, encoding="utf-8")

print(list(data.keys()))       # ['name', 'rounds', ...]
print(data['name'], data.get('season', ''))
print(len(data['matches']), "rounds")


['name', 'matches']
World Cup 2022 
48 rounds


In [15]:
data

{'name': 'World Cup 2022',
 'matches': [{'round': 'Matchday 1',
   'group': 'Group A',
   'date': '2022-11-20',
   'team1': 'Qatar',
   'team2': 'Ecuador'},
  {'round': 'Matchday 2',
   'group': 'Group A',
   'date': '2022-11-21',
   'team1': 'Senegal',
   'team2': 'Netherlands'},
  {'round': 'Matchday 6',
   'group': 'Group A',
   'date': '2022-11-25',
   'team1': 'Qatar',
   'team2': 'Senegal'},
  {'round': 'Matchday 6',
   'group': 'Group A',
   'date': '2022-11-25',
   'team1': 'Netherlands',
   'team2': 'Ecuador'},
  {'round': 'Matchday 10',
   'group': 'Group A',
   'date': '2022-11-29',
   'team1': 'Ecuador',
   'team2': 'Senegal'},
  {'round': 'Matchday 10',
   'group': 'Group A',
   'date': '2022-11-29',
   'team1': 'Netherlands',
   'team2': 'Qatar'},
  {'round': 'Matchday 2',
   'group': 'Group B',
   'date': '2022-11-21',
   'team1': 'England',
   'team2': 'Iran'},
  {'round': 'Matchday 2',
   'group': 'Group B',
   'date': '2022-11-21',
   'team1': 'United States',
   'tea