# ⚽ Make Fútbol Graphs

...based on [International football results from 1872 to 2022](https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017).

In [None]:
import json
from pathlib import Path

import pandas as pd

In [None]:
# Directory holding the raw CSV files, relative to this notebook.
FOLDER = Path("../datasets/futbol_raw_datasets/")

dataset_names = ("results", "goalscorers", "shootouts", "countries")

# Read each CSV into a DataFrame, keyed by its dataset name.
datasets = {}
for dataset_name in dataset_names:
    datasets[dataset_name] = pd.read_csv(FOLDER / f"{dataset_name}.csv")

# Make a unique identifier for each match: "<away_team>:<home_team>:<date>".
for dataset_name, dataset in datasets.items():
    try:
        match_id = dataset.away_team + ":" + dataset.home_team + ":" + dataset.date
    except AttributeError:
        # Attribute access failed, i.e. the frame has no such team/date column.
        print(f"Couldn't add a UUID to '{dataset_name}' dataset")
    else:
        dataset["uuid"] = match_id

# Augment `results` dataset

In [None]:
# Module-level shootout table; `get_winners` filters it by `uuid` for drawn matches.
shootouts = datasets["shootouts"]

def get_winners(row: pd.Series) -> str | None:
    if row.home_score > row.away_score:
        return row.home_team
    if row.home_score < row.away_score:
        return row.away_team
    penalty_results = shootouts[shootouts.uuid.str.match(row.uuid)]
    if penalty_results.empty:
        return None
    return [*penalty_results.winner][0]

def get_losers(row: pd.Series) -> str | None:
    if row.winner is None:
        return None
    if row.winner == row.home_team:
        return row.away_team
    else:
        return row.home_team

In [None]:
# The match results table; the columns below are added to it in place.
results = datasets["results"]

# Row by row: who won each match, then (using that new column) who lost it.
results["winner"] = results.apply(get_winners, axis="columns")
results["loser"] = results.apply(get_losers, axis="columns")
# Absolute difference between the home and away scores.
results["goal_differential"] = results["home_score"].sub(results["away_score"]).abs()

# Process the `nodes`

In [None]:
# Every team that won at least one match. `dropna` removes missing winners
# whether pandas stored them as None or as NaN.
winners = set(results.winner.dropna())

# All teams that appear as a winner or a loser, in sorted order.
country_names = tuple(sorted(winners.union(results.loser.dropna())))
country_or_areas = tuple(sorted(datasets["countries"]["Country or Area"]))

# Each team becomes a node, so each one needs a row in the countries table.
missing = set(country_names).difference(country_or_areas)
assert not missing, f"These teams don't have country data: {missing}"

# Map each "Country or Area" value to a dict of that row's other columns.
country_data = datasets["countries"].set_index("Country or Area").to_dict(orient="index")

In [None]:
# One node per team: its name as `id`, plus that team's country columns.
nodes = [
    dict(id=team, **country_data[team])
    for team in country_names
]

# Process the `links`

In [None]:
def get_links(winner: str, wins: pd.DataFrame) -> list[dict[str, (str | float | int)]]:
    return [
        dict(
            source=winner,
            target=loser,
            num_wins=stats["count"],
            goals_avg=stats["mean"],
            goals_total=stats["sum"],
        )
        for loser, stats in wins.groupby("loser")["goal_differential"].agg(["mean", "count", "sum"]).to_dict(orient="index").items()
    ]

# Build every winner's links and flatten them into one list in a single pass
# (repeated list concatenation via `sum(..., [])` is quadratic). Winners are
# visited in sorted order so the output is the same from run to run; set
# iteration order for strings is not.
links = [
    link
    for winner in sorted(winners)
    for link in get_links(winner, results[results.winner == winner])
]

In [None]:
def _nan_to_none(value):
    """Return a copy of ``value`` with every float NaN replaced by ``None``.

    Dicts, lists and tuples are walked recursively (tuples come back as lists,
    which is also how ``json`` serializes them); any other value is returned
    unchanged.
    """
    import math

    if isinstance(value, dict):
        return {key: _nan_to_none(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_nan_to_none(item) for item in value]
    if isinstance(value, float) and math.isnan(value):
        return None
    return value


# Save small dataset to file.
# NaN is not valid JSON, so missing numbers are written as `null`. The values
# are converted before serialization rather than by editing the JSON text, so
# strings that happen to contain "NaN" are left untouched.
Path("../datasets/futbol_small.json").write_text(
    json.dumps(
        _nan_to_none(
            dict(
                nodes=nodes,
                links=links,
            )
        ),
    )
)