In [1]:
import pandas as pd
import altair as alt
import numpy as np
import math
import requests
from bs4 import BeautifulSoup
import regex as re
import time
import geopandas as gpd

In [2]:
%load_ext nb_black

<IPython.core.display.Javascript object>

# Data Cleaning and Manipulation

### World Cup Power Rating

In [55]:
# Load the raw match-results table from disk.
df = pd.read_csv("dataset/results.csv")

# Keep only the rows whose tournament is exactly "FIFA World Cup"; the copy
# detaches the subset from `df` so later edits cannot leak back into it.
wc_data = df.loc[df["tournament"].eq("FIFA World Cup")].copy()

<IPython.core.display.Javascript object>

In [56]:
def get_result(row, side):
    """Return the points one side earns from a single match.

    A win is worth 3 points, a loss 0 points and a draw 1 point.

    Parameters
    ----------
    row : object exposing ``home_score`` and ``away_score`` attributes
        One match record (e.g. a row passed by ``DataFrame.apply(axis=1)``).
    side : str
        ``"home"`` scores the match for the home team; any other value
        scores it for the away team.

    Returns
    -------
    int
        3, 1 or 0. When neither score compares greater than the other
        (equal scores, or a missing/NaN score) the result is 1.
    """
    if side == "home":
        own_goals, opponent_goals = row.home_score, row.away_score
    else:
        own_goals, opponent_goals = row.away_score, row.home_score

    if own_goals > opponent_goals:
        return 3
    if own_goals < opponent_goals:
        return 0
    return 1


def get_goal_diff(row, side):
    """Return the goal difference of one match from one side's point of view.

    Parameters
    ----------
    row : object exposing ``home_score`` and ``away_score`` attributes
        One match record.
    side : str
        ``"home"`` gives home goals minus away goals; any other value gives
        away goals minus home goals.

    Returns
    -------
    number
        Goals scored minus goals conceded for the requested side.
    """
    if side == "home":
        goals_for, goals_against = row.home_score, row.away_score
    else:
        goals_for, goals_against = row.away_score, row.home_score
    return goals_for - goals_against


def return_world_cup_result(data, country_name, n_teams=32):
    """Rank the teams that played World Cup matches hosted in ``country_name``.

    Rows of ``data`` are kept when ``tournament`` is exactly
    ``"FIFA World Cup"`` and ``country`` equals ``country_name``. Each match
    then contributes one record per side (home and away) carrying that side's
    points (3 win / 1 draw / 0 loss, the same rule as ``get_result``) and goal
    difference. Records are summed per team and the teams are ordered by
    matches played, then points, then goal difference, all descending.

    The filter uses the ``country`` column only, so if ``data`` holds more
    than one World Cup with the same ``country`` value, those tournaments are
    pooled into a single ranking.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``tournament``, ``country``, ``home_team``,
        ``away_team``, ``home_score`` and ``away_score``. It is not modified.
    country_name : str
        Value matched against the ``country`` column (presumably the host
        nation of the tournament -- confirm against the dataset).
    n_teams : int, default 32
        Size of the tournament field used to scale the rank into a score.
        The default reproduces the previously hard-coded 32-team format.

    Returns
    -------
    pandas.DataFrame
        One row per team, best team first, with the columns ``rank``
        (0-based), ``tournament``, ``country``, ``team``, ``result`` (total
        points), ``goal_diff``, ``match_count`` and ``wc_result_score``
        (``(n_teams - rank) / n_teams``, i.e. 1.0 for the top team).
        If no row matches the filter, the frame is empty.

    Raises
    ------
    ValueError
        If ``n_teams`` is not positive.
    """
    if n_teams <= 0:
        raise ValueError("n_teams must be a positive integer")

    matches = data[
        (data["tournament"] == "FIFA World Cup") & (data["country"] == country_name)
    ]

    # Vectorised outcome flags. A missing score makes both flags False, so the
    # match falls through to the "draw" default, exactly as get_result does.
    home_wins = (matches["home_score"] > matches["away_score"]).to_numpy()
    away_wins = (matches["home_score"] < matches["away_score"]).to_numpy()

    scored = matches.assign(
        home_result=np.select([home_wins, away_wins], [3, 0], default=1),
        away_result=np.select([away_wins, home_wins], [3, 0], default=1),
        home_goal_diff=matches["home_score"] - matches["away_score"],
        away_goal_diff=matches["away_score"] - matches["home_score"],
    )

    # Reshape to one record per (match, side) under side-neutral column names.
    per_side = [
        scored[
            [
                "tournament",
                "country",
                f"{side}_team",
                f"{side}_result",
                f"{side}_goal_diff",
            ]
        ].rename(
            columns={
                f"{side}_team": "team",
                f"{side}_result": "result",
                f"{side}_goal_diff": "goal_diff",
            }
        )
        for side in ("home", "away")
    ]

    # Each record counts as one match played. The number of matches is the
    # primary ranking key: teams that advance further play more matches.
    combined = pd.concat(per_side).assign(match_count=1)

    # This ordering is not the most accurate, but it is a systematic way to
    # derive a power rating: matches played, then points, then goal difference.
    combined = (
        combined.groupby(["tournament", "country", "team"], as_index=False)
        .sum()
        .sort_values(by=["match_count", "result", "goal_diff"], ascending=False)
        .reset_index(drop=True)
        .reset_index()
        .rename(columns={"index": "rank"})
    )

    # rank is 0-based, so the best team scores n_teams / n_teams == 1.0 and
    # each following place loses 1 / n_teams.
    combined["wc_result_score"] = (n_teams - combined["rank"]) / n_teams

    return combined

<IPython.core.display.Javascript object>

In [57]:
# Per-team scores for the World Cup matches hosted in Russia and in Qatar
# (per the variable name below, intended as the 2018 and 2022 tournaments).
russia = return_world_cup_result(wc_data, "Russia").loc[
    :, ["country", "team", "wc_result_score"]
]
qatar = return_world_cup_result(wc_data, "Qatar").loc[
    :, ["country", "team", "wc_result_score"]
]

# Power rating = mean of the two tournament scores per team. The outer merge
# keeps teams that appear in only one tournament; their missing score is
# filled with 0 before averaging.
wc_results_2018_2022 = (
    russia.merge(qatar, how="outer", on=["team"])
    .loc[:, ["team", "wc_result_score_x", "wc_result_score_y"]]
    .fillna(0)
    .melt(id_vars=["team"], value_name="power_rating")
    .drop(columns=["variable"])
    .groupby(["team"], as_index=False)
    .mean()
    .rename(columns={"team": "country_name"})
)

<IPython.core.display.Javascript object>

#### Convert to CSV: World Cup Power Rating

In [58]:
# Write the power-rating table to world_cup_result.csv (path relative to the
# working directory). The DataFrame index is written as well, so the file gets
# an unnamed leading column -- NOTE(review): pass index=False if that is unwanted.
wc_results_2018_2022.to_csv("world_cup_result.csv")

<IPython.core.display.Javascript object>