In [117]:
import pandas as pd
from pathlib import Path

# Base directory, relative to the current working directory, under which the
# cells below look for CSV files.
data_dir = Path("../data/raw/fbref")

In [118]:
# Every CSV file found at any depth below data_dir.
csvs = [*data_dir.glob("**/*.csv")]

In [124]:
# Rewrite, in place, each CSV whose "Date" column was read as float64:
# the floats are interpreted as day counts since 1970-01-01 and converted
# to datetimes. Files without such a column are left untouched.
for f in csvs:
    df = pd.read_csv(f)
    has_float_date = "Date" in df.columns and df["Date"].dtype == "float64"
    if not has_float_date:
        continue
    df["Date"] = pd.to_datetime(df["Date"], unit="D", origin="1970-01-01")
    df.to_csv(f, index=False)

In [110]:
def dedupe(df, season, country):
    """Return a cleaned, de-duplicated copy of one table.

    The input frame is never modified. On the copy:

    * metadata columns that already exist are overwritten
      (``Gender`` -> "M", ``Season_End_Year`` -> ``season``,
      ``Country`` -> ``country``, ``Tier`` -> "1st") and
      ``Competition_Name`` is forward-filled;
    * ``MatchURL`` is (re)created from ``Game_URL`` when that column exists;
    * missing values in int64/float64 columns are replaced with 0;
    * exact duplicate rows are dropped, then only the first row is kept for
      each combination of the identifying columns present in the frame.

    Args:
        df: pandas DataFrame to clean.
        season: value written to ``Season_End_Year``.
        country: value written to ``Country``.

    Returns:
        The cleaned DataFrame, keeping the index labels of the surviving rows.
        If none of the identifying columns are present, only the column
        overwrites, the numeric fill and the exact-duplicate removal are
        applied.

    Raises:
        ValueError: if the distinct key combinations left after
            de-duplication are not identical to the distinct, fully non-null
            key combinations of the input.
    """
    df_2 = df.copy()
    if "Gender" in df_2.columns:
        df_2["Gender"] = "M"
    if "Season_End_Year" in df_2.columns:
        df_2["Season_End_Year"] = season
    if "Country" in df_2.columns:
        df_2["Country"] = country
    if "Tier" in df_2.columns:
        df_2["Tier"] = "1st"
    if "Competition_Name" in df_2.columns:
        df_2["Competition_Name"] = df_2["Competition_Name"].ffill()
    if "Game_URL" in df_2.columns:
        # Only the copy gets the alias column; the caller's frame stays as read.
        df_2["MatchURL"] = df_2["Game_URL"]

    # Columns that identify a row, in the order used for grouping.
    key_candidates = (
        "MatchURL",
        "Team",
        "Player",
        "Squad",
        "Team_or_Opponent",
        "url",
        "Home_Team",
        "Away_Team",
        "Match_Date",
        "Event_Time",
        "Event_Type",
        "Event_Players",
    )
    group_keys = [col for col in key_candidates if col in df_2.columns]

    # Snapshot the input's keys BEFORE the numeric fill below, so that a key
    # column holding NaN is still treated as missing (and dropped) here.
    init_keys = None
    if group_keys:
        init_keys = df_2[group_keys].drop_duplicates().dropna()

    # Fill numeric cols with 0.
    numeric_cols = [
        col for col in df_2.columns if df_2[col].dtype in ("int64", "float64")
    ]
    if numeric_cols:
        df_2[numeric_cols] = df_2[numeric_cols].fillna(0)

    df_2 = df_2.drop_duplicates()
    if not group_keys:
        # Nothing to group on: groupby([]) would raise, and there are no keys
        # to validate.
        return df_2

    df_2 = df_2.groupby(group_keys).head(1)
    new_keys = df_2[group_keys].drop_duplicates()
    if not init_keys.equals(new_keys):
        # Count the key rows present on only one side to make the failure
        # easier to diagnose.
        diff = init_keys.merge(new_keys, how="outer", indicator=True)
        lost = int((diff["_merge"] == "left_only").sum())
        added = int((diff["_merge"] == "right_only").sum())
        raise ValueError(
            "Keys are not the same "
            f"(missing after dedupe: {lost}, unexpected after dedupe: {added})"
        )
    return df_2


# Ad-hoc check: run dedupe on whichever file `f` currently refers to.
# NOTE(review): `f` is not assigned in this cell; it is whatever a loop in
# another cell last bound it to — confirm this is intended.
df = pd.read_csv(f)
df_2 = dedupe(df, season=2017, country="ENG")

In [None]:
# Run dedupe over every CSV under <data_dir>/<country>/1st/<season> and
# overwrite a file only when the cleaned frame differs from what was read.
tier = "1st"
for season in range(2017, 2026):
    for country in ("ENG", "GER", "ITA", "FRA", "ESP"):
        p = data_dir / country / tier / str(season)
        files = list(p.glob("**/*.csv"))
        for f in files:
            df = pd.read_csv(f)
            df_2 = dedupe(df, season, country)
            if not df_2.equals(df):
                print(f"Saving {f}")
                df_2.to_csv(f, index=False)

In [115]:
# For each ENG tier/season folder, create match_stats/urls.csv from the
# distinct MatchURL values of match_stats/results.csv when urls.csv is missing.
for tier in ["2nd", "3rd", "4th", "5th"]:
    for season in range(2017, 2026):
        for country in ["ENG"]:
            p = data_dir / country / tier / str(season)
            results = p / "match_stats" / "results.csv"
            urls = p / "match_stats" / "urls.csv"
            # Act only when there is a source file and no target yet.
            # Without this guard a folder lacking results.csv would either
            # hit an undefined `match_urls` or write the URLs left over from
            # a previously processed folder.
            if urls.exists() or not results.exists():
                continue
            df = pd.read_csv(results)
            match_urls = df["MatchURL"].drop_duplicates()
            pd.DataFrame({"url": match_urls}).to_csv(urls, index=False)

In [33]:
# Spot-check dedupe on a single table. The file is located through data_dir
# instead of a user-specific absolute path so the cell is not tied to one
# machine.
# NOTE(review): assumes data_dir resolves to the project's data/raw/fbref
# directory — confirm the notebook's working directory.
df = pd.read_csv(
    data_dir / "ITA" / "1st" / "2022" / "match_stats" / "player" / "summary.csv"
)
df_2 = dedupe(df, 2022, "ITA")

In [67]:
# Display the path currently bound to `f` (presumably the file a loop in
# another cell was processing last — debugging aid).
f

PosixPath('../data/raw/fbref/ENG/1st/2017/match_stats/report.csv')

ValueError: Keys are not the same