In [1]:
import shutil
import time

from bayesball.schema import MatchSummarySchema

"""A module to ingest data from the FBRef website"""
import dataclasses
import logging as log
import os
import polars as pl
from pathlib import Path
from rich.progress import track
from concurrent.futures import ProcessPoolExecutor, as_completed

from bayesball.worldfootballr import call_wf_function, fb_parse_match_data
from bayesball.utils import (
    get_current_season,
    maybe_download_file,
    setup_logging,
    r_to_python,
)
from bayesball.config import (
    ADVANCED_MATCH_STATS,
    COUNTRIES,
    TIERS,
    MIN_SEASON_END_YEAR,
    LEAGUE_STATS,
)
from bayesball.models import AdvancedMatchStats, MatchStats

STAGE_DIR = "data/ingest/stage"

BASE_DIR = "data/ingest/fbref"

SOURCE_SUFFIX = "fbref"
GENDER = "M"

In [4]:
df = pl.read_csv("./data/ingest/fbref/advanced_match_stats/player/defense/ENG_M_1st_wf.csv")

In [5]:
import pandera as pa

MatchSummarySchema = pa.DataFrameSchema(
    {
        "MatchURL": pa.Column(str),
        "League": pa.Column(str),
        "Match_Date": pa.Column(pa.Date),
        "Matchweek": pa.Column(str),
        "Home_Team": pa.Column(str),
        "Home_Formation": pa.Column(str, nullable=True),
        "Home_Score": pa.Column(int, default=0),
        "Home_xG": pa.Column(float, nullable=True),
        "Home_Goals": pa.Column(str, nullable=True),
        "Home_Yellow_Cards": pa.Column(int),
        "Home_Red_Cards": pa.Column(int),
        "Away_Team": pa.Column(str),
        "Away_Formation": pa.Column(str, nullable=True),
        "Away_Score": pa.Column(int, nullable=True, default=0),
        "Away_xG": pa.Column(float, nullable=True),
        "Away_Goals": pa.Column(str, nullable=True),
        "Away_Yellow_Cards": pa.Column(int),
        "Away_Red_Cards": pa.Column(int),
        "Game_URL": pa.Column(str),
        "Team": pa.Column(str),
        "Home_Away": pa.Column(str),
        "Event_Time": pa.Column(str),
        "Is_Pens": pa.Column(str),
        "Event_Half": pa.Column(str),
        "Event_Type": pa.Column(str),
        "Event_Players": pa.Column(str),
        "Score_Progression": pa.Column(str, nullable=True),
        "Penalty_Number": pa.Column(str, nullable=True),
        "Competition_Name": pa.Column(str),
        "Gender": pa.Column(str),
        "Country": pa.Column(str),
        "Tier": pa.Column(str),
        "Season_End_Year": pa.Column(int),
    },
    coerce=True,
    strict=True,
)

In [2]:
    competitions = pl.read_csv(Path(BASE_DIR) / "competitions.csv").rename(
        {
            "country": "Country",
            "tier": "Tier",
            "season_end_year": "Season_End_Year",
            "gender": "Gender",
            "competition_name": "Competition_Name",
        }
    )

In [4]:
match_summary_files = Path(BASE_DIR) / "match_summary"

for f in match_summary_files.glob("*.csv"):
    df = pl.read_csv(f)
    # if "Competition_Name" not in df.columns:
    #     df = df.join(competitions.select("Country", "Gender", "Tier", "Competition_Name"), on=["Country", "Gender", "Tier"], how="left")

    df_renamed = df.select(MatchSummarySchema.columns.keys())
    df_val = MatchSummarySchema.validate(df_renamed.to_pandas())
    if df_renamed.schema != df.schema:
        df_renamed.write_csv(f)

  check_obj[col_name] = check_obj[col_name].fillna(
  check_obj[col_name] = check_obj[col_name].fillna(
  check_obj[col_name] = check_obj[col_name].fillna(
  check_obj[col_name] = check_obj[col_name].fillna(


In [37]:
gender = "M"
match_summaries = pl.read_csv(Path(BASE_DIR) / "match_summary" / "*.csv")
match_shooting = pl.concat(
    [
        pl.read_csv(x)
        .select("Country", "Gender", "Tier", "Season_End_Year", "MatchURL")
        .unique()
        for x in (Path(BASE_DIR) / "match_shooting").glob("*.csv")
    ],
    how="diagonal_relaxed",
)
team_summary_stats = pl.concat(
    [
        pl.read_csv(x).select("MatchURL").unique()
        for x in (Path(BASE_DIR) / "advanced_match_stats" / "team" / "summary").glob(
            "*.csv"
        )
    ],
    how="diagonal_relaxed",
)
team_advanced_stats = pl.concat(
    [
        pl.read_csv(x).select("MatchURL").unique()
        for x in (Path(BASE_DIR) / "advanced_match_stats" / "team" / "possession").glob(
            "*.csv"
        )
    ],
    how="diagonal_relaxed",
)
tier_df = pl.DataFrame(LEAGUE_STATS)
match_results = (
    pl.read_csv(Path(BASE_DIR) / "match_results" / "*.csv")
    .join(tier_df, on=["Country", "Tier"])
    .filter(
        pl.col("Season_End_Year") >= MIN_SEASON_END_YEAR, pl.col("Gender") == gender
    )
)
match_shooting.filter(pl.col("Season_End_Year").is_null())
match_results_filtered = match_results.filter(
    ~pl.col("MatchURL").str.contains("History"),
    ~pl.col("Notes").fill_null("").str.contains("Cancelled"),
    ~pl.col("MatchURL").str.contains("RelegationPromotion-Play-offs"),
)
in_match_summary = (
    match_summaries.select("MatchURL").unique().with_columns(InMatchSummary=True)
)
in_team_match_summary = (
    team_summary_stats.select("MatchURL").unique().with_columns(InTeamSummary=True)
)
in_team_advanced_stats = (
    team_advanced_stats.select("MatchURL").unique().with_columns(InTeamAdvanced=True)
)
match_results_filtered = (
    match_results_filtered.join(in_match_summary, on="MatchURL", how="left")
    .join(in_team_match_summary, on="MatchURL", how="left")
    .join(in_team_advanced_stats, on="MatchURL", how="left")
    .fill_null(False)
)
missing_cond = (
    ~pl.col("InMatchSummary")
    | ~pl.col("InTeamSummary")
    | (
        ~pl.col("InTeamAdvanced")
        & (pl.col("Season_End_Year") >= pl.col("Min_Advanced_Season"))
    ).fill_null(False)
)
missing_matches = match_results_filtered.filter(missing_cond)
missing_matches = missing_matches.with_columns(
    filename=pl.lit(f"{STAGE_DIR}/html/")
    + pl.col("Country")
    + pl.lit("/")
    + pl.col("MatchURL").str.split("/").list.last()
    + pl.lit(".html"),
    match_id=pl.col("MatchURL").str.split("/").list[-2]
)

In [45]:
test = pl.read_csv("/home/jimmy/Code/FantasyFootball/data/ingest/fbref/match_summary/ENG_M_5th_match_summary_fbref*.csv")

ComputeError: schema lengths differ

In [44]:
test.group_by("Season_End_Year").len()

Season_End_Year,len
i64,u32
2023,6475
2020,4992
2024,6810
2021,5281
2018,5886
2019,5938
2025,4549
2022,5889


In [28]:
match_results_filtered = match_results.filter(
    ~pl.col("MatchURL").str.contains("History"),
    ~pl.col("Notes").fill_null("").str.contains("Cancelled"),
    ~pl.col("MatchURL").str.contains("RelegationPromotion-Play-offs"),
)

In [40]:
match_results_filtered.with_columns(missing_cond=missing_cond).filter(pl.col("Season_End_Year") >= 2025).filter(pl.col("missing_cond"))

Competition_Name,Gender,Country,Season_End_Year,Tier,Round,Wk,Day,Date,Time,Home,HomeGoals,Home_xG,Away,AwayGoals,Away_xG,Attendance,Venue,Referee,Notes,MatchURL,Min_Advanced_Season,InMatchSummary,InTeamSummary,InTeamAdvanced,missing_cond
str,str,str,i64,str,str,str,str,str,str,str,f64,str,str,f64,str,str,str,str,str,str,i64,bool,bool,bool,bool
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""19""","""Fri""","""2024-12-06""","""20:00""","""Burnley""",1.0,"""0.6""","""Middlesbrough""",1.0,"""1.1""","""20543.0""","""Turf Moor""","""Andrew Kitchen""",,"""https://fbref.com/en/matches/b…",2019,false,false,false,true
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""19""","""Sat""","""2024-12-07""","""12:30""","""Sheffield Weds""",1.0,"""1.5""","""Preston""",1.0,"""3.0""","""23927.0""","""Hillsborough Stadium""","""Robert Madley""",,"""https://fbref.com/en/matches/6…",2019,false,false,false,true
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""19""","""Sat""","""2024-12-07""","""12:30""","""Sunderland""",2.0,"""2.4""","""Stoke City""",1.0,"""1.0""","""39311.0""","""Stadium of Light""","""Craig Pawson""",,"""https://fbref.com/en/matches/a…",2019,false,false,false,true
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""19""","""Sat""","""2024-12-07""","""12:30""","""Leeds United""",2.0,"""1.2""","""Derby County""",0.0,"""0.2""","""36468.0""","""Elland Road""","""Matt Donohue""",,"""https://fbref.com/en/matches/e…",2019,false,false,false,true
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""19""","""Sat""","""2024-12-07""","""15:00""","""Portsmouth""",3.0,"""2.4""","""Bristol City""",0.0,"""1.2""","""20415.0""","""Fratton Park""","""Sunny Gill""",,"""https://fbref.com/en/matches/0…",2019,false,false,false,true
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Serie A""","""M""","""ITA""",2025,"""1st""",,"""21""","""Sun""","""2025-01-19""","""15:00""","""Cagliari""",4.0,"""2.6""","""Lecce""",1.0,"""0.4""",,"""Unipol Domus""","""Juan Luca Sacchi""",,"""https://fbref.com/en/matches/2…",2018,false,false,false,true
"""Serie A""","""M""","""ITA""",2025,"""1st""",,"""21""","""Sun""","""2025-01-19""","""15:00""","""Parma""",1.0,"""1.6""","""Venezia""",1.0,"""1.1""",,"""Stadio Ennio Tardini""","""Francesco Fourneau""",,"""https://fbref.com/en/matches/f…",2018,false,false,false,true
"""Serie A""","""M""","""ITA""",2025,"""1st""",,"""21""","""Sun""","""2025-01-19""","""18:00""","""Hellas Verona""",0.0,"""1.3""","""Lazio""",3.0,"""1.5""",,"""Stadio Marc'Antonio Bentegodi""","""Michael Fabbri""",,"""https://fbref.com/en/matches/a…",2018,false,false,false,true
"""Serie A""","""M""","""ITA""",2025,"""1st""",,"""21""","""Sun""","""2025-01-19""","""20:45""","""Inter""",3.0,"""1.6""","""Empoli""",1.0,"""0.4""",,"""Stadio Giuseppe Meazza""","""Ermanno Feliciani""",,"""https://fbref.com/en/matches/9…",2018,false,false,false,true


In [34]:
match_summaries.filter(pl.col("Season_End_Year")>=2025).group_by("Country", "Tier", "Season_End_Year").len()

Country,Tier,Season_End_Year,len
str,str,i64,u32
"""FRA""","""1st""",2025,2413
"""ENG""","""1st""",2025,2148
"""ENG""","""3rd""",2025,4517
"""ITA""","""1st""",2025,3241
"""GER""","""1st""",2025,2899
"""ENG""","""2nd""",2025,6029
"""ESP""","""1st""",2025,3348
"""USA""","""1st""",2025,4790


In [26]:
match_results.filter(pl.col("Season_End_Year")>=2025,
    # ~pl.col("MatchURL").str.contains("History"),
    ~pl.col("Notes").fill_null("").str.contains("Cancelled"),
    # ~pl.col("MatchURL").str.contains("RelegationPromotion-Play-offs"),
).filter(pl.col("Date")>"2025-01-10")

Competition_Name,Gender,Country,Season_End_Year,Tier,Round,Wk,Day,Date,Time,Home,HomeGoals,Home_xG,Away,AwayGoals,Away_xG,Attendance,Venue,Referee,Notes,MatchURL,Min_Advanced_Season
str,str,str,i64,str,str,str,str,str,str,str,f64,str,str,f64,str,str,str,str,str,str,i64
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""19""","""Tue""","""2025-01-14""","""19:45""","""Plymouth Argyle""",1.0,"""0.6""","""Oxford United""",1.0,"""0.4""","""15933.0""","""Home Park""","""Samuel Allison""",,"""https://fbref.com/en/matches/2…",2019
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""19""","""Tue""","""2025-01-14""","""19:45""","""Cardiff City""",1.0,"""0.7""","""Watford""",1.0,"""1.0""","""16942.0""","""Cardiff City Stadium""","""Oliver Langford""",,"""https://fbref.com/en/matches/3…",2019
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""16""","""Wed""","""2025-01-15""","""19:45""","""Blackburn""",3.0,"""1.3""","""Portsmouth""",0.0,"""1.1""","""13703.0""","""Ewood Park""","""Farai Hallam""",,"""https://fbref.com/en/matches/0…",2019
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""27""","""Fri""","""2025-01-17""","""20:00""","""Burnley""",0.0,"""0.7""","""Sunderland""",0.0,"""2.8""","""21014.0""","""Turf Moor""","""Anthony Backhouse""",,"""https://fbref.com/en/matches/2…",2019
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""27""","""Sat""","""2025-01-18""","""12:30""","""Millwall""",0.0,"""0.7""","""Hull City""",1.0,"""0.6""","""14579.0""","""The Den""","""Stephen Martin""",,"""https://fbref.com/en/matches/2…",2019
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Major League Soccer""","""M""","""USA""",2025,"""1st""",,,"""Sat""","""2025-10-18""","""18:00""","""Portland Timbers""",,,"""San Diego FC""",,,,"""Providence Park""",,,"""https://fbref.com/en/stathead/…",2018
"""Major League Soccer""","""M""","""USA""",2025,"""1st""",,,"""Sat""","""2025-10-18""","""18:00""","""SJ Earthquakes""",,,"""Austin""",,,,"""PayPal Park""",,,"""https://fbref.com/en/stathead/…",2018
"""Major League Soccer""","""M""","""USA""",2025,"""1st""",,,"""Sat""","""2025-10-18""","""19:00""","""Colorado Rapids""",,,"""LAFC""",,,,"""Dick's Sporting Goods Park""",,,"""https://fbref.com/en/stathead/…",2018
"""Major League Soccer""","""M""","""USA""",2025,"""1st""",,,"""Sat""","""2025-10-18""","""20:00""","""Sporting KC""",,,"""Houston Dynamo""",,,,"""Children's Mercy Park""",,,"""https://fbref.com/en/stathead/…",2018


In [29]:
competitions = pl.read_csv(Path(BASE_DIR) / "competitions.csv").filter(pl.col("season_end_year") == 2025)

In [15]:
match_results = match_results.with_columns(CorrectURL=~pl.col("MatchURL").str.contains("History"))

match_results.filter(pl.col("Season_End_Year") == 2025).with_columns(Date=pl.col("Date").cast(pl.Date)).group_by("Date").agg(pl.col("CorrectURL").mean().alias("CorrectURL")).plot.line("Date","CorrectURL")

In [21]:
match_results.filter(~pl.col("CorrectURL"))

Competition_Name,Gender,Country,Season_End_Year,Tier,Round,Wk,Day,Date,Time,Home,HomeGoals,Home_xG,Away,AwayGoals,Away_xG,Attendance,Venue,Referee,Notes,MatchURL,Min_Advanced_Season,CorrectURL
str,str,str,i64,str,str,str,str,str,str,str,f64,str,str,f64,str,str,str,str,str,str,i64,bool
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""28""","""Tue""","""2025-01-21""","""19:45""","""Swansea City""",,,"""Sheffield Utd""",,,,"""Swansea.com Stadium""",,,"""https://fbref.com/en/stathead/…",2019,false
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""28""","""Tue""","""2025-01-21""","""19:45""","""Blackburn""",,,"""Coventry City""",,,,"""Ewood Park""",,,"""https://fbref.com/en/stathead/…",2019,false
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""28""","""Tue""","""2025-01-21""","""19:45""","""Oxford United""",,,"""Luton Town""",,,,"""The Kassam Stadium""",,,"""https://fbref.com/en/stathead/…",2019,false
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""28""","""Tue""","""2025-01-21""","""19:45""","""Derby County""",,,"""Sunderland""",,,,"""Pride Park Stadium""",,,"""https://fbref.com/en/stathead/…",2019,false
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""28""","""Tue""","""2025-01-21""","""19:45""","""Watford""",,,"""Preston""",,,,"""Vicarage Road Stadium""",,,"""https://fbref.com/en/stathead/…",2019,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Major League Soccer""","""M""","""USA""",2025,"""1st""",,,"""Sat""","""2025-10-18""","""18:00""","""Portland Timbers""",,,"""San Diego FC""",,,,"""Providence Park""",,,"""https://fbref.com/en/stathead/…",2018,false
"""Major League Soccer""","""M""","""USA""",2025,"""1st""",,,"""Sat""","""2025-10-18""","""18:00""","""SJ Earthquakes""",,,"""Austin""",,,,"""PayPal Park""",,,"""https://fbref.com/en/stathead/…",2018,false
"""Major League Soccer""","""M""","""USA""",2025,"""1st""",,,"""Sat""","""2025-10-18""","""19:00""","""Colorado Rapids""",,,"""LAFC""",,,,"""Dick's Sporting Goods Park""",,,"""https://fbref.com/en/stathead/…",2018,false
"""Major League Soccer""","""M""","""USA""",2025,"""1st""",,,"""Sat""","""2025-10-18""","""20:00""","""Sporting KC""",,,"""Houston Dynamo""",,,,"""Children's Mercy Park""",,,"""https://fbref.com/en/stathead/…",2018,false


In [22]:
df = pl.read_csv("/home/jimmy/Code/FantasyFootball/data/ingest/fbref/match_summary/ENG_M_5th_match_summary_fbref_0001.csv")

In [23]:
df

MatchURL,League,Match_Date,Matchweek,Home_Team,Home_Formation,Home_Score,Home_xG,Home_Goals,Home_Yellow_Cards,Home_Red_Cards,Away_Team,Away_Formation,Away_Score,Away_xG,Away_Goals,Away_Yellow_Cards,Away_Red_Cards,Game_URL,Team,Home_Away,Event_Time,Is_Pens,Event_Half,Event_Type,Event_Players,Score_Progression,Penalty_Number,Competition_Name,Gender,Country,Tier,Season_End_Year
str,str,str,str,str,str,f64,str,str,i64,i64,str,str,f64,str,str,i64,i64,str,str,str,f64,bool,f64,str,str,str,str,str,str,str,str,i64
"""https://fbref.com/en/matches/0…","""National League""","""2018-05-05""","""National League (Semi-finals)""","""Tranmere Rovers""","""4-4-2""",4.0,,"""James Norwood · 33&rsquor; Jos…",3,0,"""Ebbsfleet United""","""4-3-2-1""",2.0,,"""Luke Coulson · 16&rsquor; Myle…",1,1,"""https://fbref.com/en/matches/0…","""Ebbsfleet United""","""Away""",16.0,false,1.0,"""Goal""","""Luke Coulson""","""0:1""",,"""National League""","""M""","""ENG""","""5th""",2018
"""https://fbref.com/en/matches/0…","""National League""","""2018-05-05""","""National League (Semi-finals)""","""Tranmere Rovers""","""4-4-2""",4.0,,"""James Norwood · 33&rsquor; Jos…",3,0,"""Ebbsfleet United""","""4-3-2-1""",2.0,,"""Luke Coulson · 16&rsquor; Myle…",1,1,"""https://fbref.com/en/matches/0…","""Ebbsfleet United""","""Away""",16.0,false,1.0,"""Goal""","""Luke Coulson""","""0:1""",,"""National League""","""M""","""ENG""","""5th""",2018
"""https://fbref.com/en/matches/0…","""National League""","""2018-05-05""","""National League (Semi-finals)""","""Tranmere Rovers""","""4-4-2""",4.0,,"""James Norwood · 33&rsquor; Jos…",3,0,"""Ebbsfleet United""","""4-3-2-1""",2.0,,"""Luke Coulson · 16&rsquor; Myle…",1,1,"""https://fbref.com/en/matches/0…","""Ebbsfleet United""","""Away""",16.0,false,1.0,"""Goal""","""Luke Coulson""","""0:1""",,"""National League""","""M""","""ENG""","""5th""",2018
"""https://fbref.com/en/matches/0…","""National League""","""2018-05-05""","""National League (Semi-finals)""","""Tranmere Rovers""","""4-4-2""",4.0,,"""James Norwood · 33&rsquor; Jos…",3,0,"""Ebbsfleet United""","""4-3-2-1""",2.0,,"""Luke Coulson · 16&rsquor; Myle…",1,1,"""https://fbref.com/en/matches/0…","""Ebbsfleet United""","""Away""",16.0,false,1.0,"""Goal""","""Luke Coulson""","""0:1""",,"""National League""","""M""","""ENG""","""5th""",2018
"""https://fbref.com/en/matches/0…","""National League""","""2018-05-05""","""National League (Semi-finals)""","""Tranmere Rovers""","""4-4-2""",4.0,,"""James Norwood · 33&rsquor; Jos…",3,0,"""Ebbsfleet United""","""4-3-2-1""",2.0,,"""Luke Coulson · 16&rsquor; Myle…",1,1,"""https://fbref.com/en/matches/0…","""Ebbsfleet United""","""Away""",16.0,false,1.0,"""Goal""","""Luke Coulson""","""0:1""",,"""National League""","""M""","""ENG""","""5th""",2018
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""https://fbref.com/en/matches/4…","""National League""","""2019-05-05""","""National League (Semi-finals)""","""Salford City""","""5-3-2""",1.0,,"""Carl Piergianni · 43&rsquor;""",0,0,"""Eastleigh""","""5-3-2""",1.0,,"""Paul McCallum · 57&rsquor;""",0,0,"""https://fbref.com/en/matches/4…","""Eastleigh""","""Away""",121.0,true,5.0,"""Penalty Shootout""","""Chris Zebroski""","""4:3""","""10.0""","""National League""","""M""","""ENG""","""5th""",2019
"""https://fbref.com/en/matches/4…","""National League""","""2019-05-05""","""National League (Semi-finals)""","""Salford City""","""5-3-2""",1.0,,"""Carl Piergianni · 43&rsquor;""",0,0,"""Eastleigh""","""5-3-2""",1.0,,"""Paul McCallum · 57&rsquor;""",0,0,"""https://fbref.com/en/matches/4…","""Eastleigh""","""Away""",121.0,true,5.0,"""Penalty Shootout""","""Chris Zebroski""","""4:3""","""10.0""","""National League""","""M""","""ENG""","""5th""",2019
"""https://fbref.com/en/matches/4…","""National League""","""2019-05-05""","""National League (Semi-finals)""","""Salford City""","""5-3-2""",1.0,,"""Carl Piergianni · 43&rsquor;""",0,0,"""Eastleigh""","""5-3-2""",1.0,,"""Paul McCallum · 57&rsquor;""",0,0,"""https://fbref.com/en/matches/4…","""Eastleigh""","""Away""",121.0,true,5.0,"""Penalty Shootout""","""Chris Zebroski""","""4:3""","""10.0""","""National League""","""M""","""ENG""","""5th""",2019
"""https://fbref.com/en/matches/4…","""National League""","""2019-05-05""","""National League (Semi-finals)""","""Salford City""","""5-3-2""",1.0,,"""Carl Piergianni · 43&rsquor;""",0,0,"""Eastleigh""","""5-3-2""",1.0,,"""Paul McCallum · 57&rsquor;""",0,0,"""https://fbref.com/en/matches/4…","""Eastleigh""","""Away""",121.0,true,5.0,"""Penalty Shootout""","""Chris Zebroski""","""4:3""","""10.0""","""National League""","""M""","""ENG""","""5th""",2019


In [19]:
match_results.filter(pl.col("Season_End_Year") == 2025).with_columns(Date=pl.col("Date").cast(pl.Date).dt.month_end()).group_by("Date").agg(pl.col("CorrectURL").mean().alias("CorrectURL"))

Date,CorrectURL
date,f64
2025-06-30,0.0
2024-11-30,1.0
2025-04-30,0.0
2025-03-31,0.0
2024-12-31,1.0
…,…
2025-05-31,0.0
2025-01-31,0.65311
2025-07-31,0.0
2025-09-30,0.0


In [38]:
match_results.filter(pl.col("Season_End_Year")>=2025).filter(~pl.col("CorrectURL"))

Competition_Name,Gender,Country,Season_End_Year,Tier,Round,Wk,Day,Date,Time,Home,HomeGoals,Home_xG,Away,AwayGoals,Away_xG,Attendance,Venue,Referee,Notes,MatchURL,Min_Advanced_Season,CorrectURL
str,str,str,i64,str,str,str,str,str,str,str,f64,str,str,f64,str,str,str,str,str,str,i64,bool
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""28""","""Tue""","""2025-01-21""","""19:45""","""Swansea City""",,,"""Sheffield Utd""",,,,"""Swansea.com Stadium""",,,"""https://fbref.com/en/stathead/…",2019,false
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""28""","""Tue""","""2025-01-21""","""19:45""","""Blackburn""",,,"""Coventry City""",,,,"""Ewood Park""",,,"""https://fbref.com/en/stathead/…",2019,false
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""28""","""Tue""","""2025-01-21""","""19:45""","""Oxford United""",,,"""Luton Town""",,,,"""The Kassam Stadium""",,,"""https://fbref.com/en/stathead/…",2019,false
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""28""","""Tue""","""2025-01-21""","""19:45""","""Derby County""",,,"""Sunderland""",,,,"""Pride Park Stadium""",,,"""https://fbref.com/en/stathead/…",2019,false
"""EFL Championship""","""M""","""ENG""",2025,"""2nd""",,"""28""","""Tue""","""2025-01-21""","""19:45""","""Watford""",,,"""Preston""",,,,"""Vicarage Road Stadium""",,,"""https://fbref.com/en/stathead/…",2019,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Major League Soccer""","""M""","""USA""",2025,"""1st""",,,"""Sat""","""2025-10-18""","""18:00""","""Portland Timbers""",,,"""San Diego FC""",,,,"""Providence Park""",,,"""https://fbref.com/en/stathead/…",2018,false
"""Major League Soccer""","""M""","""USA""",2025,"""1st""",,,"""Sat""","""2025-10-18""","""18:00""","""SJ Earthquakes""",,,"""Austin""",,,,"""PayPal Park""",,,"""https://fbref.com/en/stathead/…",2018,false
"""Major League Soccer""","""M""","""USA""",2025,"""1st""",,,"""Sat""","""2025-10-18""","""19:00""","""Colorado Rapids""",,,"""LAFC""",,,,"""Dick's Sporting Goods Park""",,,"""https://fbref.com/en/stathead/…",2018,false
"""Major League Soccer""","""M""","""USA""",2025,"""1st""",,,"""Sat""","""2025-10-18""","""20:00""","""Sporting KC""",,,"""Houston Dynamo""",,,,"""Children's Mercy Park""",,,"""https://fbref.com/en/stathead/…",2018,false


In [33]:
match_results

Competition_Name,Gender,Country,Season_End_Year,Tier,Round,Wk,Day,Date,Time,Home,HomeGoals,Home_xG,Away,AwayGoals,Away_xG,Attendance,Venue,Referee,Notes,MatchURL,Min_Advanced_Season,InMatchSummary,InTeamSummary,InTeamAdvanced,CorrectURL
str,str,str,i64,str,str,str,str,str,str,str,f64,str,str,f64,str,str,str,str,str,str,i64,bool,bool,bool,bool
"""EFL Championship""","""M""","""ENG""",2018,"""2nd""","""Semi-finals""",,"""Fri""","""2018-05-11""","""19:45""","""Derby County""",1.0,,"""Fulham""",0.0,,"""27163.0""","""iPro Stadium""","""Roger East""","""Leg 1 of 2""","""https://fbref.com/en/matches/f…",2019,true,true,false,true
"""EFL Championship""","""M""","""ENG""",2018,"""2nd""","""Semi-finals""",,"""Sat""","""2018-05-12""","""17:15""","""Middlesbrough""",0.0,,"""Aston Villa""",1.0,,"""29233.0""","""Riverside Stadium""","""Robert Madley""","""Leg 1 of 2""","""https://fbref.com/en/matches/e…",2019,true,true,false,true
"""EFL Championship""","""M""","""ENG""",2018,"""2nd""","""Semi-finals""",,"""Mon""","""2018-05-14""","""19:45""","""Fulham""",2.0,,"""Derby County""",0.0,,"""23529.0""","""Craven Cottage""","""Chris Kavanagh""","""Leg 2 of 2; Fulham won""","""https://fbref.com/en/matches/5…",2019,true,true,false,true
"""EFL Championship""","""M""","""ENG""",2018,"""2nd""","""Semi-finals""",,"""Tue""","""2018-05-15""","""19:45""","""Aston Villa""",0.0,,"""Middlesbrough""",0.0,,"""40505.0""","""Villa Park""","""Mike Dean""","""Leg 2 of 2; Aston Villa won""","""https://fbref.com/en/matches/4…",2019,true,true,false,true
"""EFL Championship""","""M""","""ENG""",2019,"""2nd""","""Regular season""","""45""","""Sat""","""2019-04-27""","""15:00""","""Bolton""",0.0,,"""Brentford""",1.0,,,"""Macron Stadium""","""Tony Harrington""","""Match awarded to Brentford""","""https://fbref.com/en/matches/1…",2019,false,true,false,true
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Major League Soccer""","""M""","""USA""",2023,"""1st""","""Wild Card Round""",,"""Wed""","""2023-10-25""","""20:30""","""Sporting KC""",0.0,"""0.7""","""San Jose""",0.0,"""0.5""","""17437.0""","""Children's Mercy Park""","""Allen Chapman""","""Sporting KC won on penalty kic…","""https://fbref.com/en/matches/6…",2018,true,true,true,true
"""Major League Soccer""","""M""","""USA""",2023,"""1st""","""Conference Semifinals""",,"""Sat""","""2023-11-25""","""17:30""","""Orlando City""",0.0,"""1.2""","""Columbus Crew""",2.0,"""2.4""","""25527.0""","""Exploria Stadium""","""Jon Freemon""","""Required Extra Time""","""https://fbref.com/en/matches/3…",2018,true,true,true,true
"""Major League Soccer""","""M""","""USA""",2023,"""1st""","""Conference Finals""",,"""Sat""","""2023-12-02""","""18:00""","""FC Cincinnati""",2.0,"""1.0""","""Columbus Crew""",3.0,"""3.2""","""25513.0""","""TQL Stadium""","""Allen Chapman""","""Required Extra Time""","""https://fbref.com/en/matches/7…",2018,true,true,true,true
"""Major League Soccer""","""M""","""USA""",2024,"""1st""","""Wild Card Round""",,"""Tue""","""2024-10-22""","""19:30""","""CF Montréal""",2.0,"""1.6""","""Atlanta Utd""",2.0,"""1.3""","""16566.0""","""Stade Saputo""","""Joe Dickerson""","""Atlanta Utd won on penalty kic…","""https://fbref.com/en/matches/3…",2018,true,true,true,true


In [30]:
competitions

competition_type,competition_name,country,gender,governing_body,first_season,last_season,tier,awards,comp_url,seasons,season_end_year,seasons_urls,fixtures_url,is_completed,filter_out
str,str,str,str,str,str,str,str,str,str,str,i64,str,str,bool,str
"""Club International Cups""","""Copa Libertadores de América""",,"""M""","""CONMEBOL""","""2014""","""2025""",,,"""https://fbref.com/en/comps/14/…","""2025""",2025,"""https://fbref.com/en/comps/14/…","""https://fbref.com/en/comps/14/…",false,"""N"""
"""Club International Cups""","""Copa CONMEBOL Sudamericana""",,"""M""","""CONMEBOL""","""2014""","""2025""",,,"""https://fbref.com/en/comps/205…","""2025""",2025,"""https://fbref.com/en/comps/205…","""https://fbref.com/en/comps/205…",false,"""N"""
"""Club International Cups""","""UEFA Champions League""",,"""M""","""UEFA""","""1990-1991""","""2024-2025""",,,"""https://fbref.com/en/comps/8/h…","""2024-2025""",2025,"""https://fbref.com/en/comps/8/C…","""https://fbref.com/en/comps/8/s…",false,"""N"""
"""Club International Cups""","""UEFA Europa League""",,"""M""","""UEFA""","""1990-1991""","""2024-2025""",,,"""https://fbref.com/en/comps/19/…","""2024-2025""",2025,"""https://fbref.com/en/comps/19/…","""https://fbref.com/en/comps/19/…",false,"""N"""
"""Club International Cups""","""UEFA Conference League""",,"""M""","""UEFA""","""2021-2022""","""2024-2025""",,,"""https://fbref.com/en/comps/882…","""2024-2025""",2025,"""https://fbref.com/en/comps/882…","""https://fbref.com/en/comps/882…",false,"""N"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Domestic Cups""","""Coppa Italia""","""ITA""","""M""",,"""2014-2015""","""2024-2025""",,,"""https://fbref.com/en/comps/529…","""2024-2025""",2025,"""https://fbref.com/en/comps/529…","""https://fbref.com/en/comps/529…",false,"""N"""
"""Domestic Cups""","""Supercoppa Italiana""","""ITA""","""M""",,"""2014""","""2025""",,,"""https://fbref.com/en/comps/612…","""2025""",2025,"""https://fbref.com/en/comps/612…","""https://fbref.com/en/comps/612…",true,"""N"""
"""Domestic Youth Leagues""","""Premier League 2""","""ENG""","""M""",,"""2016-2017""","""2024-2025""","""Youth""",,"""https://fbref.com/en/comps/852…","""2024-2025""",2025,"""https://fbref.com/en/comps/852…","""https://fbref.com/en/comps/852…",false,"""N"""
"""Domestic Youth Leagues""","""U17 DFB Youth League""","""GER""","""M""",,"""2016-2017""","""2024-2025""","""Youth""",,"""https://fbref.com/en/comps/851…","""2024-2025""",2025,"""https://fbref.com/en/comps/851…","""https://fbref.com/en/comps/851…",false,"""N"""


In [9]:
match_summaries.filter(pl.col("Season_End_Year")==2025).group_by("Country", "Tier", "Season_End_Year").len()

Country,Tier,Season_End_Year,len
str,str,i64,u32
"""ENG""","""3rd""",2025,4517
"""ENG""","""1st""",2025,2148
"""GER""","""1st""",2025,2899
"""ENG""","""2nd""",2025,6029
"""FRA""","""1st""",2025,2413
"""ITA""","""1st""",2025,3241
"""USA""","""1st""",2025,4790
"""ESP""","""1st""",2025,3348


In [85]:
import shutil

In [86]:
# new_files =list(Path(STAGE_DIR).glob("**/*.csv"))
# for f in new_files:
#     next_path = str(f).replace(STAGE_DIR, BASE_DIR)
#     existing_pattern = Path(next_path).with_name(f.name.replace("_fbref", "_fbref_*"))
#     # get the maximum 4 digit number in the existing files
#     existing_files = len([x for x in Path().glob(str(existing_pattern))]) + 1
#     new_name = Path(next_path).with_name(f.name.replace("_fbref", f"_fbref_{existing_files:>04}"))
#     shutil.move(f, new_name)

In [52]:
def r_to_python(r_obj):
    """Convert R object to Python object"""
    try:
        r('''
        unnest_list_columns <- function(df) {
            df <- tidyr::unnest(df, where(is.list), keep_empty = TRUE)
            return(df)
        }
        ''')
        r_obj = r.unnest_list_columns(r_obj)
        res = pandas2ri.rpy2py(r_obj)


        for c in res.columns:
            if res[c].dtype == "object":
                res[c] = res[c].str.replace("NA_character_", "")
        # if "MatchURL" in res.columns:
        #     if "Game_URL" in res.columns:
        #         res["MatchURL"] = res["Game_URL"]
        #     else:
        #         raise ValueError("URL not found")
        return pl.DataFrame(res)
    except Exception as e:
        # convert to list
        return r_obj

In [3]:
def get_wages(part_df):
    team_urls = call_wf_function(
        "fb_teams_urls", league_url=part_df["seasons_urls"].to_list()[0]
    )
    wages = []
    for url in track(team_urls, description="Getting wages"):
        try:
            w = call_wf_function("fb_squad_wages", team_urls=[str(url)], time_pause=4)
            # time.sleep(4)
            if w is not None:
                wages.append(w)
            else:
                log.error(f"No wages found for {url}")
        except Exception as e:
            log.error(f"Error getting wages for {url}: {e}")

    # wages = call_wf_function("fb_squad_wages", team_urls=team_urls, time_pause=4)
    wages = pl.concat(wages)
    return wages

In [None]:
    competitions = pl.read_csv(Path(BASE_DIR) / "competitions.csv").rename(
        {
            "country": "Country",
            "tier": "Tier",
            "season_end_year": "Season_End_Year",
            "gender": "Gender",
        }
    )
    tier_df = pl.DataFrame(LEAGUE_STATS)
    filtered_competitions = competitions.join(tier_df, on=["Country", "Tier"]).filter(
        pl.col("Season_End_Year") >= MIN_SEASON_END_YEAR, pl.col("Gender") == gender
    )
    league_parts = filtered_competitions.partition_by(
        ["Country", "Gender", "Tier", "Season_End_Year"], as_dict=True
    )
    for (country, gender, tier, season), part_df in league_parts.items():

        filename = (
            Path(STAGE_DIR)
            / "wages"
            / f"{country}_{gender}_{tier}_{SOURCE_SUFFIX}_{season}.csv"
        )
        if not filename.parent.exists():
            filename.parent.mkdir(parents=True, exist_ok=True)
        if filename.exists():
            continue
        print(f"Getting wages for {country} {tier} {season}")
        wages = get_wages(part_df)

        if wages is not None:
            wages.write_csv(filename)


In [54]:
w = [r_to_python(x) if not isinstance(x, pl.DataFrame) else x for x in wages]

In [55]:
w2 = pl.concat(w)

In [57]:
filename

PosixPath('data/ingest/stage/wages/ENG_M_1st_fbref_2018.csv')

In [58]:
w2.write_csv(filename)

In [49]:
r('''
unnest_list_columns <- function(df) {
    df <- tidyr::unnest(df, where(is.list), keep_empty = TRUE)
    return(df)
}
''')

rdf_exploded = r.unnest_list_columns(wages[-3])

In [46]:
print(wages[-3])

# A tibble: 36 × 15
   Team       Comp  Season Player Nation Pos     Age WeeklyWageGBP WeeklyWageEUR
   <chr>      <chr> <chr>  <chr>  <chr>  <chr> <int> <list>        <list>       
 1 Swansea C… Prem… 2017-… Wilfr… CIV    FW       28 <dbl [1]>     <dbl [1]>    
 2 Swansea C… Prem… 2017-… André… GHA    MF,FW    27 <dbl [1]>     <dbl [1]>    
 3 Swansea C… Prem… 2017-… Tammy… ENG    FW       19 <dbl [1]>     <dbl [1]>    
 4 Swansea C… Prem… 2017-… Sam C… ENG    MF       26 <dbl [1]>     <dbl [1]>    
 5 Swansea C… Prem… 2017-… Łukas… POL    GK       32 <dbl [1]>     <dbl [1]>    
 6 Swansea C… Prem… 2017-… Jorda… GHA    FW,MF    25 <dbl [1]>     <dbl [1]>    
 7 Swansea C… Prem… 2017-… Leroy… NED    MF       27 <dbl [1]>     <dbl [1]>    
 8 Swansea C… Prem… 2017-… Feder… ARG    DF       28 <dbl [1]>     <dbl [1]>    
 9 Swansea C… Prem… 2017-… Andy … WAL    MF       28 <dbl [1]>     <dbl [1]>    
10 Swansea C… Prem… 2017-… Leon … ENG    MF       34 <dbl [1]>     <dbl [1]>    
# ℹ 26 m

In [50]:
print(rdf_exploded)

# A tibble: 37 × 15
   Team       Comp  Season Player Nation Pos     Age WeeklyWageGBP WeeklyWageEUR
   <chr>      <chr> <chr>  <chr>  <chr>  <chr> <int>         <dbl>         <dbl>
 1 Swansea C… Prem… 2017-… Wilfr… CIV    FW       28         90000        103602
 2 Swansea C… Prem… 2017-… André… GHA    MF,FW    27         80000         92091
 3 Swansea C… Prem… 2017-… Tammy… ENG    FW       19         50000         57557
 4 Swansea C… Prem… 2017-… Sam C… ENG    MF       26         50000         57557
 5 Swansea C… Prem… 2017-… Łukas… POL    GK       32         50000         57557
 6 Swansea C… Prem… 2017-… Jorda… GHA    FW,MF    25         50000         57557
 7 Swansea C… Prem… 2017-… Leroy… NED    MF       27         45000         51801
 8 Swansea C… Prem… 2017-… Feder… ARG    DF       28         43000         49499
 9 Swansea C… Prem… 2017-… Andy … WAL    MF       28         40000         46045
10 Swansea C… Prem… 2017-… Leon … ENG    MF       34         37500         43168
# ℹ 27 m

In [51]:
r_to_python(rdf_exploded)

Team,Comp,Season,Player,Nation,Pos,Age,WeeklyWageGBP,WeeklyWageEUR,WeeklyWageUSD,AnnualWageGBP,AnnualWageEUR,AnnualWageUSD,Notes,Url
str,str,str,str,str,str,i64,f64,f64,f64,f64,f64,f64,str,str
"""Swansea City""","""Premier League""","""2017-2018""","""Wilfried Bony""","""CIV""","""FW""",28,90000.0,103602.0,113562.0,4.68e6,5.387307e6,5.905244e6,"""Unverified estimation""","""https://fbref.com/en/players/6…"
"""Swansea City""","""Premier League""","""2017-2018""","""André Ayew""","""GHA""","""MF,FW""",27,80000.0,92091.0,100944.0,4.16e6,4.788717e6,5.249105e6,"""Unverified estimation""","""https://fbref.com/en/players/5…"
"""Swansea City""","""Premier League""","""2017-2018""","""Tammy Abraham""","""ENG""","""FW""",19,50000.0,57557.0,63090.0,2.6e6,2.992948e6,3.280691e6,"""Unverified estimation""","""https://fbref.com/en/players/f…"
"""Swansea City""","""Premier League""","""2017-2018""","""Sam Clucas""","""ENG""","""MF""",26,50000.0,57557.0,63090.0,2.6e6,2.992948e6,3.280691e6,"""Unverified estimation""","""https://fbref.com/en/players/9…"
"""Swansea City""","""Premier League""","""2017-2018""","""Łukasz Fabiański""","""POL""","""GK""",32,50000.0,57557.0,63090.0,2.6e6,2.992948e6,3.280691e6,"""Unverified estimation""","""https://fbref.com/en/players/9…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Swansea City""","""Premier League""","""2017-2018""","""Roque Mesa""","""ESP""","""MF""",28,,,,,,,"""""","""https://fbref.com/en/players/7…"
"""Swansea City""","""Premier League""","""2017-2018""","""Oliver McBurnie""","""SCO""","""FW,MF""",21,,,,,,,"""""","""https://fbref.com/en/players/7…"
"""Swansea City""","""Premier League""","""2017-2018""","""Joe Rodon""","""WAL""","""DF""",19,,,,,,,"""""","""https://fbref.com/en/players/8…"
"""Swansea City""","""Premier League""","""2017-2018""","""Adam King""","""SCO""","""MF""",21,,,,,,,"""""","""https://fbref.com/en/players/7…"


In [34]:
pandas2ri.rpy2py(wages[-3])

TypeError: 'NULLType' object is not iterable

In [38]:
from rpy2.robjects import r

# Assuming `rdf` is your rpy2.robjects.vectors.DataFrame
# Replace the problematic column (assume it's named "col_with_lists")
r('''
convert_list_column <- function(df, col_name) {
    df[[col_name]] <- sapply(df[[col_name]], function(x) paste(x, collapse = ","))
    return(df)
}
''')

# Apply the R function to your dataframe
rdf_fixed = r.convert_list_column(wages[-3], "AnnualWageGBP")

In [40]:
d = wages[-3]

In [39]:
r_to_python(rdf_fixed)

Team,Comp,Season,...,AnnualWageUSD,Notes,Url
...,...,...,...,FloatSexp...,...,...


In [31]:
wages[-3].to_csvfile("test.csv")

RRuntimeError: Error in (function (x, file = "", append = FALSE, quote = TRUE, sep = " ",  : 
  unimplemented type 'list' in 'EncodeElement'


In [30]:
help(wages[-3])

Help on DataFrame in module rpy2.robjects.vectors object:

class DataFrame(ListVector)
 |  DataFrame(obj, stringsasfactor=False, checknames=False)
 |
 |  R 'data.frame'.
 |
 |  Method resolution order:
 |      DataFrame
 |      ListVector
 |      Vector
 |      rpy2.robjects.robject.RObjectMixin
 |      abc.ABC
 |      rpy2.rinterface.ListSexpVector
 |      rpy2.rinterface_lib.sexp.SexpVector
 |      rpy2.rinterface_lib.sexp.Sexp
 |      rpy2.rinterface_lib.sexp.SexpVectorAbstract
 |      rpy2.rinterface_lib._rinterface_capi.SupportsSEXP
 |      typing.Generic
 |      builtins.object
 |
 |  Methods defined here:
 |
 |  __getitem__(self, i)
 |
 |  __init__(self, obj, stringsasfactor=False, checknames=False)
 |      Create a new data frame.
 |
 |      :param obj: object inheriting from rpy2.rinterface.SexpVector,
 |          or inheriting from TaggedList or a mapping name -> value
 |      :param stringsasfactors: Boolean indicating whether vectors
 |          of strings should be turned 

In [29]:
from bayesball.utils import r_to_python, pandas2ri

pandas2ri.rpy2py(wages[-3].get_column("wages"))

# r_to_python(wages[-3])

AttributeError: 'DataFrame' object has no attribute 'get_column'

In [83]:



    existing_files = [x for x in Path().glob(str(Path(next_path).with_name(f.name.replace("_fbref", "_fbref_*"))))]

data/ingest/stage/match_shooting/USA_M_1st_match_shooting_fbref.csv
data/ingest/fbref/match_shooting/USA_M_1st_match_shooting_fbref_0001.csv
data/ingest/stage/match_summary/ENG_M_5th_match_summary_fbref.csv
data/ingest/fbref/match_summary/ENG_M_5th_match_summary_fbref_0001.csv
data/ingest/stage/match_summary/USA_M_1st_match_summary_fbref.csv
data/ingest/fbref/match_summary/USA_M_1st_match_summary_fbref_0001.csv
data/ingest/stage/match_summary/ENG_M_2nd_match_summary_fbref.csv
data/ingest/fbref/match_summary/ENG_M_2nd_match_summary_fbref_0001.csv
data/ingest/stage/match_summary/GER_M_1st_match_summary_fbref.csv
data/ingest/fbref/match_summary/GER_M_1st_match_summary_fbref_0001.csv
data/ingest/stage/match_summary/ENG_M_3rd_match_summary_fbref.csv
data/ingest/fbref/match_summary/ENG_M_3rd_match_summary_fbref_0001.csv
data/ingest/stage/match_summary/ENG_M_4th_match_summary_fbref.csv
data/ingest/fbref/match_summary/ENG_M_4th_match_summary_fbref_0001.csv
data/ingest/stage/match_summary/ITA_M

In [70]:
 Path(next_path).with_name(f.name.replace("_fbref", "_fbref_*")).

PosixPath('data/ingest/fbref/advanced_match_stats/player/keeper/USA_M_1st_fbref_*.csv')

In [76]:
list(Path().glob('data/ingest/fbref/advanced_match_stats/player/keeper/USA_M_1st_fbref_*.csv'))

[PosixPath('data/ingest/fbref/advanced_match_stats/player/keeper/USA_M_1st_fbref_0001.csv')]

In [79]:
existing_files

[PosixPath('data/ingest/fbref/advanced_match_stats/player/keeper/USA_M_1st_fbref_0001.csv')]

In [2]:
import duckdb
import polars as pl
from pathlib import Path

EXTRACT_DIR = Path("data/extract")

db_conn = f"duckdb:///{str(EXTRACT_DIR / 'bayesball.db')}"
for f in  EXTRACT_DIR.glob("*.parquet"):
    print(f)
    pl.read_parquet(f).write_database(f.stem, db_conn, if_table_exists="replace")

data/extract/advanced_match_summary.parquet




data/extract/wages.parquet




data/extract/match_summary.parquet




data/extract/advanced_match_stats_player.parquet




AssertionError: 