In [1]:
import pandas as pd
import altair as alt
import numpy as np
import math
import requests
from bs4 import BeautifulSoup
import regex as re
import time
import geopandas as gpd

In [2]:
%load_ext nb_black

<IPython.core.display.Javascript object>

# Data Cleaning and Manipulation

**Please make sure that you have run the data manipulation for DL_KAGGLE, which outputs the local file `world_cup_result.csv`. Otherwise you will encounter an error.**

### Transfermarkt: top 25 domestic team MVs

In [7]:

# Sent with every request in this notebook so the site knows what kind of
# client is accessing it.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/98.0.4758.102 Safari/537.36"
    )
}

# Unfiltered "most valuable teams" page; a later cell reads the country
# drop-down out of `soup`.
page = "https://www.transfermarkt.com/vereins-statistik/wertvollstemannschaften/marktwertetop/plus/0/galerie"

# The timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() fails here instead of parsing an error page later on.
response = requests.get(page, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")



<IPython.core.display.Javascript object>

In [8]:
def handle_regex(pattern, soup, output_list, limit=25):
    """Append the first regex capture of each matching element to ``output_list``.

    Parameters
    ----------
    pattern : str
        Regular expression; for every element the first item returned by
        ``re.findall`` is kept.
    soup : iterable
        Elements to scan (e.g. the result of ``find_all``); each one is
        converted with ``str()`` before matching.
    output_list : list
        Mutated in place: one entry is appended per matching element.
    limit : int, optional
        Maximum number of entries appended by this call (default 25).

    Returns
    -------
    None
    """
    appended = 0
    for element in soup:
        # Stop as soon as the quota is reached instead of scanning the rest.
        if appended >= limit:
            break
        # Run the regex once per element and reuse the result.
        matches = re.findall(pattern, str(element))
        if matches:
            output_list.append(matches[0])
            appended += 1


def return_dom_league_mv(url):
    """Scrape one Transfermarkt "most valuable teams" page into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    pandas.DataFrame
        Columns ``country_name``, ``team_name`` and ``market_val``; every value
        is the raw string captured from the page, with at most 25 entries per
        column (the limit applied by ``handle_regex``).

    Notes
    -----
    Sleeps two seconds before each request and relies on the notebook-level
    ``headers`` dict and ``handle_regex`` helper.
    NOTE(review): the three columns are assumed to come out the same length;
    ``pd.DataFrame`` would reject columns of different lengths.
    """
    # To not overwhelm transfermarkt with requests
    time.sleep(2)

    # Without a timeout a stalled connection would block the scraping loop forever.
    response = requests.get(url, headers=headers, timeout=30)
    page_soup = BeautifulSoup(response.content, "html.parser")

    # (output column, CSS class of the <td> cells to scan, regex extracting the value)
    extraction_plan = [
        ("country_name", "links", r"title\=\"([^<>]+)\"\/\>\s\<a\shref\="),
        ("team_name", "zentriert", r"title\=\"([^<>]+)\"\/><\/a><\/td>"),
        ("market_val", "rechts", r">([^<>\n]+)<"),
    ]

    columns = {}
    for column, css_class, pattern in extraction_plan:
        values = []
        handle_regex(pattern, page_soup.find_all("td", {"class": css_class}), values)
        columns[column] = values

    return pd.DataFrame(columns)


def format_mv(value):
    """Convert a Transfermarkt market-value string into a float amount in euros.

    Parameters
    ----------
    value : str, int or float
        Text such as ``"€300k"``, ``"€1.50m"``, ``"€1.2bn"`` or ``"-"``.
        Numbers (including NaN) are passed through as floats.

    Returns
    -------
    float
        The amount in euros; ``"-"`` (no value listed) becomes ``0.0``.

    Raises
    ------
    ValueError
        If the text left after removing the currency sign and the magnitude
        suffix is not a number.
    """
    # isinstance also accepts ints and numpy floats, which the exact
    # ``type(value) == float`` comparison rejected.
    if isinstance(value, (int, float)):
        return float(value)

    text = value.replace("€", "").strip()
    if text == "-":
        return 0.0

    # Scale by the suffix instead of pasting zeros onto the text: "1.5k" used
    # to become "1.5000", i.e. 1.5 instead of 1500.
    for suffix, multiplier in (("bn", 1000000000), ("m", 1000000), ("k", 1000)):
        if text.endswith(suffix):
            return float(text[: -len(suffix)]) * multiplier

    return float(text)

<IPython.core.display.Javascript object>

In [9]:
# Transfermarkt uses its own numeric country ids, which cannot be guessed.
# They are read from the country drop-down of the unfiltered page fetched
# earlier (`soup`); one filtered page is then requested per country id.

country_name = []
country_code = []

soup_list_countries = soup.find_all("div", {"class": "inline-select"})

pattern_value = r"value\=\"([0-9]+)\"\>([A-Za-z\,\s]+)\<"

for select_box in soup_list_countries:
    for code, name in re.findall(pattern_value, str(select_box)):
        country_name.append(name)
        country_code.append(code)

# NOTE(review): the first and the last seven drop-down entries are discarded;
# presumably they are not countries - confirm against the live page.
country_code = country_code[1:-7]
country_name = country_name[1:-7]

list_countries = pd.DataFrame(
    {"country_code": country_code, "country_name": country_name}
)

url_first_half = "https://www.transfermarkt.com/vereins-statistik/wertvollstemannschaften/marktwertetop/plus/0/galerie/0?land_id="
url_second_half = "&kontinent_id=0&yt0=Show"

list_countries["url"] = (
    url_first_half + list_countries["country_code"] + url_second_half
)

# Read the URL by column name (an integer key on a labelled Series is only
# treated as a position through a deprecated fallback), and collect the
# per-country frames so they are concatenated once instead of inside the loop.
league_frames = [return_dom_league_mv(url) for url in list_countries["url"]]

# Reversed to keep the previous row order (last scraped country first).
if league_frames:
    dom_league_mv = pd.concat(league_frames[::-1])
else:
    dom_league_mv = pd.DataFrame(columns=["country_name", "team_name", "market_val"])

dom_league_mv = dom_league_mv.rename(columns={"market_val": "market_val_orig"})

# Numeric value in euros next to the raw Transfermarkt string.
dom_league_mv["market_val"] = dom_league_mv["market_val_orig"].map(format_mv)

dom_league_mv.head(5)

Unnamed: 0,country_name,team_name,market_val_orig,market_val
0,Zimbabwe,Simba Bhora Football Club,€300k,300000.0
1,Zimbabwe,CAPS United FC,€175k,175000.0
2,Zimbabwe,Global Sports Academy,-,0.0
3,Zimbabwe,FC Platinum U19,-,0.0
4,Zimbabwe,Cranborne Bullets FC,-,0.0


<IPython.core.display.Javascript object>

#### Convert to CSV: Avg. of top 25 clubs' MV

In [None]:
# Full per-club table, including the raw Transfermarkt value strings.
dom_league_mv.to_csv("domestic_league_mv.csv")

# Average numeric market value per country. The column is selected rather than
# dropping the other columns from `dom_league_mv` in place, so this cell can be
# re-run without a KeyError.
avg_league_values = dom_league_mv.groupby("country_name", as_index=False)[
    "market_val"
].mean()
avg_league_values.to_csv("avg_league_val_by_country.csv")

### Transfermarkt: MV Difference - Playing abroad vs at home

**Please ignore the printout below; it is not a runtime error. Exception handling prevents the scrape from failing, but because Transfermarkt changes over time, a few entries occasionally cannot be scraped (for example, when a player name is not available). Such entries are printed and omitted. This is limited to zero or a few entries, so it does not impact the overall outcome.**

In [25]:
# Landing page of the "most valuable national teams" ranking.
page = "https://www.transfermarkt.com/vereins-statistik/wertvollstenationalmannschaften/marktwertetop?kontinent_id=0"

# The timeout prevents an indefinite hang on a stalled connection;
# raise_for_status() surfaces a blocked or failed request immediately.
response = requests.get(page, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

<IPython.core.display.Javascript object>

In [26]:
countries = pd.read_csv("world_cup_result.csv")
countries = countries.drop(columns=["Unnamed: 0"]).rename(
    columns={"team": "country_name"}
)

country_url = (
    "https://www.transfermarkt.com/vereins-statistik/wertvollstenationalmannschaften/"
    "marktwertetop?ajax=yw1&kontinent_id=0&page="
)

pattern_value = r"\<a href\=\"(\/[a-z\-\_]+\/[a-z\-\_]+\/[a-z\-\_]+\/[0-9]+\/saison_id\/2022)\"\stitle\=\"([A-Za-z\s\,]+)"

# There are total of four pages
pages = ["1", "2", "3", "4"]

# One frame (country name + relative team URL) per ranking page, concatenated once.
nat_team_frames = []

for page in pages:
    time.sleep(3)

    url = country_url + page

    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    page_rows = []
    for cell in soup.find_all("td", {"rowspan": "2"}):
        matches = re.findall(pattern_value, str(cell))
        if not matches:
            # A cell that does not match used to raise IndexError and abort the cell.
            print("Could not read a national team from a cell on page {}".format(page))
            continue
        team_path, team_country = matches[0]
        page_rows.append({"country_name": team_country, "url": team_path})

    nat_team_frames.append(pd.DataFrame(page_rows, columns=["country_name", "url"]))

nat_teams = pd.concat(nat_team_frames)

countries_list = countries.merge(nat_teams, how="left", on=["country_name"])

# Manually filling as the value was missing - these two are never picked up
# NOTE(review): positional indexing assumes rows 7 and 27 are the two affected
# countries and that `url` is the third column - confirm against the CSV.
countries_list.iloc[7, 2] = '/costa-rica/kader/verein/8497/saison_id/2022'
countries_list.iloc[27, 2] = '/katar/startseite/verein/14162/saison_id/2022'

# Regexes applied to every player row of a national team page.
pattern_mv = r"\>([\€\.0-9a-z]+)\<\/a\>\s\<\/td\>\<\/tr\>"
# Back-up regex for market value if the first fails
pattern_mv_bu = r'(\€[\.0-9a-z]+)'

pattern_pl_name = r'\"\>([\wA-Za-z\sé\-íó\'\.]+)\<\/a\>\<\/span\>\<\/div\>'
# Back-up regex for player name if the first fails
pattern_pl_name_bu = r'[\s]{17}([\wA-Za-zé\-íó\'\.\s]+)[\s]{12}\<\/a\>'
pattern_league_team = r'\<a\shref\=\"\/([0-9a-z\-\_\.A-Zíóé]+\/startseite\/verein\/[0-9]+)\"'


def _report_unparsed_row(row_html):
    """Print what the back-up patterns found in a row that could not be parsed."""
    print('Issue other than MV')
    print(re.findall(pattern_mv_bu, row_html))
    print(re.findall(pattern_pl_name_bu, row_html))
    print(re.findall(pattern_league_team, row_html))
    print('----------------------')
    print(row_html)


def _parse_player_row(row_html):
    """Return ``(market_value, player_name, club_url)`` for one table row, or None.

    The primary patterns are tried first. When only the market value is missing,
    the back-up value pattern is used and '€0' is the last resort; when the value
    and the name or club are missing, the back-up value and name patterns are
    tried together. Rows that still cannot be read are printed and skipped.
    """
    values = re.findall(pattern_mv, row_html)
    names = re.findall(pattern_pl_name, row_html)
    clubs = re.findall(pattern_league_team, row_html)

    if values and names and clubs:
        return (values[0], names[0], clubs[0])

    backup_values = re.findall(pattern_mv_bu, row_html)
    if not values and names and clubs:
        # '€0' is used when players don't have market value on transfermarkt
        return (backup_values[0] if backup_values else '€0', names[0], clubs[0])

    backup_names = re.findall(pattern_pl_name_bu, row_html)
    if not values and backup_values and backup_names and clubs:
        return (backup_values[0], backup_names[0], clubs[0])

    _report_unparsed_row(row_html)
    return None


# For each of the countries, extract player name, market value, and domestic club URL.
# The site content changes over time, so rows that cannot be parsed are printed
# (letting the owner of the code update the patterns) and omitted, not raised.
player_frames = []

for _, country_row in countries_list.iterrows():
    country_name = country_row["country_name"]

    # A country without a match in the ranking has a missing URL after the left
    # merge; concatenating it to the base address would raise a TypeError.
    if pd.isna(country_row["url"]):
        print("No Transfermarkt URL for {}; skipping".format(country_name))
        continue

    # Placing a two second timer not to send too many request at once - change as required
    time.sleep(2)

    national_team_url = "https://www.transfermarkt.com" + country_row["url"]
    response = requests.get(national_team_url, headers=headers, timeout=30)
    national_team_soup = BeautifulSoup(response.content, "html.parser")

    # Player rows alternate between the classes "odd" and "even"; odd rows are
    # processed first, as before.
    player_data = []
    for row_class in ("odd", "even"):
        for table_row in national_team_soup.find_all("tr", {"class": row_class}):
            parsed = _parse_player_row(str(table_row))
            if parsed is not None:
                player_data.append(parsed)

    players_country_temp = pd.DataFrame(
        player_data, columns=["players_mv", "players_name", "players_url"]
    )
    players_country_temp["country_name"] = country_name
    player_frames.append(players_country_temp)

if player_frames:
    players_country = pd.concat(player_frames)
else:
    players_country = pd.DataFrame(
        columns=["players_mv", "players_name", "players_url", "country_name"]
    )


Issue other than MV
['€600k']
[]
['degerfors-if/startseite/verein/3641']
----------------------


<IPython.core.display.Javascript object>

In [27]:
domestic_team = []
domestic_team_country = []
domestic_team_country_pattern = r"\,([A-Za-z\s\,]+)\"\sname\=\"keywords\"\/\>"

# For each unique club URL, look up the country named in the club page's
# "keywords" meta tag.
for club_path in np.unique(players_country.players_url):
    time.sleep(2)

    domestic_team_url = "https://www.transfermarkt.com/" + club_path
    response = requests.get(domestic_team_url, headers=headers, timeout=30)
    domestic_team_soup = BeautifulSoup(response.content, "html.parser")

    # Check the match explicitly instead of hiding every error behind a bare except.
    country_matches = re.findall(
        domestic_team_country_pattern, str(domestic_team_soup)
    )
    if country_matches:
        domestic_team.append(club_path)
        domestic_team_country.append(country_matches[0])
    else:
        print("Encountered problem with {}".format(domestic_team_url))

domestic_teams = pd.DataFrame(
    {"domestic_team": domestic_team, "domestic_team_country": domestic_team_country}
)

# South Korea is written as 'Korea, South' in Transfermarkt
domestic_teams = domestic_teams.replace(
    {"domestic_team_country": {"Korea, South": "South Korea"}}
)

players_country_renamed = players_country.copy().rename(
    columns={"players_url": "domestic_team"}
)
players_country_domestic = players_country_renamed.merge(
    domestic_teams, on=["domestic_team"], how="left"
)

# Players whose club country could not be resolved, kept for inspection.
# vereinslos/startseite/verein/515 refers to free agent
players_without_club_country = players_country_domestic[
    players_country_domestic.domestic_team_country.isnull()
]


def is_domestic(country_name, domestic_team_country, domestic_team):
    """Flag whether a player plays club football in the country he represents.

    For simplicity, free agents (club URL ``vereinslos/startseite/verein/515``)
    are counted as playing in their own country.

    Returns
    -------
    str
        ``"Y"`` when the club's country equals ``country_name`` or the player
        is a free agent, otherwise ``"N"``.
    """
    same_country = country_name == domestic_team_country
    free_agent = domestic_team == "vereinslos/startseite/verein/515"
    return "Y" if (same_country or free_agent) else "N"


# "Y"/"N" flag per player: does he play for a club in the country he represents?
players_country_domestic["plays_domestically"] = players_country_domestic.apply(
    lambda row: is_domestic(
        row["country_name"], row["domestic_team_country"], row["domestic_team"]
    ),
    axis=1,
)

# Market value strings -> euros (values that are already floats pass through).
players_country_domestic["players_mv"] = players_country_domestic["players_mv"].map(
    format_mv
)

# Total market value and head count per (country, plays_domestically) pair.
players_country_domestic_agg = players_country_domestic.groupby(
    ["country_name", "plays_domestically"], as_index=False
).agg(players_mvsum=("players_mv", "sum"), players_mvcount=("players_mv", "count"))

plays_dom = (
    players_country_domestic_agg[players_country_domestic_agg.plays_domestically == "Y"]
    .copy()
    .rename(columns={"players_mvcount": "num_play_dom", "players_mvsum": "mv_play_dom"})
)

plays_int = (
    players_country_domestic_agg[players_country_domestic_agg.plays_domestically == "N"]
    .copy()
    .rename(columns={"players_mvcount": "num_play_int", "players_mvsum": "mv_play_int"})
)

# Outer merge so countries with only domestic or only international players
# are kept; their missing side is filled with 0.
combined = plays_dom.merge(plays_int, on=["country_name"], how="outer").drop(
    columns=["plays_domestically_x", "plays_domestically_y"]
)

combined = combined.fillna(0)

# Average MV of domestic players divided by average MV of international players.
dom_int_mv_ratio = (combined.mv_play_dom / combined.num_play_dom) / (
    combined.mv_play_int / combined.num_play_int
)

# A zero denominator yields inf, which dropna() does not treat as missing;
# map it to NaN so undefined ratios are dropped together with the 0/0 cases.
combined["dom_int_mv_ratio"] = dom_int_mv_ratio.replace([np.inf, -np.inf], np.nan)

combined = combined.dropna()

1-fc-union-berlin/startseite/verein/89
1-fsv-mainz-05/startseite/verein/39
aarhus-gf/startseite/verein/678
abha-club/startseite/verein/40039
ac-ajaccio/startseite/verein/1147
ac-florenz/startseite/verein/430
ac-mailand/startseite/verein/5
academico-viseu-fc/startseite/verein/7788
ad-san-carlos/startseite/verein/18659
adelaide-united/startseite/verein/875
aek-athen/startseite/verein/2441
afc-bournemouth/startseite/verein/989
aik-solna/startseite/verein/272
aj-auxerre/startseite/verein/290
ajax-amsterdam/startseite/verein/610
ajman-club/startseite/verein/24724
al-adalah-fc/startseite/verein/55544
al-ahli-dschidda/startseite/verein/18487
al-ahli-sc/startseite/verein/3612
al-arabi-sc/startseite/verein/1230
al-duhail-sc/startseite/verein/26091
al-ettifaq/startseite/verein/7732
al-fateh/startseite/verein/27221
al-gharafa-sc/startseite/verein/6297
al-hilal-riad/startseite/verein/1114
al-ittihad-dschidda/startseite/verein/8023
al-nasr-riad/startseite/verein/18544
al-rayyan-sc/startseite/verein

legia-warschau/startseite/verein/255
leicester-city/startseite/verein/1003
lokomotiv-moskau/startseite/verein/932
los-angeles-fc/startseite/verein/51828
los-angeles-galaxy/startseite/verein/1061
losc-lille/startseite/verein/1082
lyngby-bk/startseite/verein/369
maccabi-haifa/startseite/verein/1064
maccabi-tel-aviv/startseite/verein/119
machida-zelvia/startseite/verein/23568
malmo-ff/startseite/verein/496
manchester-city-u23/startseite/verein/9265
manchester-city/startseite/verein/281
manchester-united/startseite/verein/985
mazatlan-fc/startseite/verein/82696
melbourne-city-fc/startseite/verein/25580
millonarios-fc/startseite/verein/2350
minnesota-united-fc/startseite/verein/56089
monagas-sc/startseite/verein/14596
montpellier-hsc/startseite/verein/969
municipal-perez-zeledon/startseite/verein/8601
nagoya-grampus/startseite/verein/1066
najran-sc/startseite/verein/26586
nashville-sc/startseite/verein/63966
new-england-revolution/startseite/verein/626
newcastle-united/startseite/verein/762

Unnamed: 0,players_mv,players_name,domestic_team,country_name,domestic_team_country,plays_domestically
889,5000000.0,Matt Turner,fc-arsenal/startseite/verein/11,United States,England,N
890,800000.0,Drake Callender,inter-miami-cf/startseite/verein/69261,United States,United States,Y
891,10000000.0,Auston Trusty,fc-arsenal/startseite/verein/11,United States,England,N
892,3500000.0,Walker Zimmerman,nashville-sc/startseite/verein/63966,United States,United States,Y
893,12000000.0,Sergiño Dest,ac-mailand/startseite/verein/5,United States,Italy,N


<IPython.core.display.Javascript object>

#### Convert to CSV: MV Difference - Playing abroad vs at home

In [63]:
# Persist the per-country aggregate and the per-player table; the visualization
# cells further down read both files back with pd.read_csv.
combined.to_csv("foreign_player_national_teams_agg.csv")
players_country_domestic.to_csv("foreign_player_national_teams.csv")

<IPython.core.display.Javascript object>

# Visualizations

In [None]:
def return_ranking_chart(title, subtitle, data_input, metric_name, list_of_scale):
    """Build a horizontal bar ranking of countries with the value printed beside each bar.

    Parameters
    ----------
    title, subtitle : str
        Chart title and subtitle.
    data_input : pandas.DataFrame
        Must contain ``country_name``, ``is_former_champion`` and the column
        named by ``metric_name``.
    metric_name : str
        Column to rank by; it is renamed to ``metric`` on a copy of the data.
    list_of_scale : list
        ``[min, max]`` domain of the x axis, shared by bars and labels.

    Returns
    -------
    A layered Altair chart (bars plus text labels), 180 px wide, with rows
    sorted by the metric in descending order.
    """
    chart_data = data_input.rename(columns={metric_name: "metric"})

    # Both layers use the same x domain and the same row ordering.
    metric_scale = alt.Scale(domain=list_of_scale)
    by_metric_desc = alt.EncodingSortField(field="metric", order="descending")

    bar_x = alt.X(
        "metric:Q",
        axis=alt.Axis(ticks=False, labels=False),
        scale=metric_scale,
    ).title("")
    bar_y = alt.Y(
        "country_name:N",
        axis=alt.Axis(ticks=False, labelPadding=10),
        sort=by_metric_desc,
    ).title("")

    bars = (
        alt.Chart(chart_data, title=alt.Title(title, subtitle=subtitle))
        .mark_bar(size=11, color="#4899F3")
        .encode(
            x=bar_x,
            y=bar_y,
            color=alt.Color("is_former_champion:N", legend=None),
        )
    )

    # Value labels drawn just to the right of the end of each bar.
    labels = (
        alt.Chart(chart_data)
        .mark_text(dx=10, align="left")
        .encode(
            x=alt.X("metric:Q", scale=metric_scale),
            y=alt.Y("country_name:N", sort=by_metric_desc),
            text=alt.Text("metric:Q"),
        )
    )

    layered = bars + labels
    return layered.properties(width=180)

def return_isotype_chart(data, title, subtitle):
    """Draw one person icon per row of ``data``, laid out 16 icons per row.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs a numeric ``rank`` column (used for placement) and an
        ``is_top_10`` column (used for colour).
    title, subtitle : str
        Chart title and subtitle.

    Returns
    -------
    A 400x400 Altair chart with both axes hidden.
    """
    # I do not claim ownership of the SVG path below. It comes from an example
    # visualization on the Altair website and sketches the isotype of a person.
    person = (
        "M1.7 -1.7h-0.8c0.3 -0.2 0.6 -0.5 0.6 -0.9c0 -0.6 "
        "-0.4 -1 -1 -1c-0.6 0 -1 0.4 -1 1c0 0.4 0.2 0.7 0.6 "
        "0.9h-0.8c-0.4 0 -0.7 0.3 -0.7 0.6v1.9c0 0.3 0.3 0.6 "
        "0.6 0.6h0.2c0 0 0 0.1 0 0.1v1.9c0 0.3 0.2 0.6 0.3 "
        "0.6h1.3c0.2 0 0.3 -0.3 0.3 -0.6v-1.8c0 0 0 -0.1 0 "
        "-0.1h0.2c0.3 0 0.6 -0.3 0.6 -0.6v-2c0.2 -0.3 -0.1 "
        "-0.6 -0.4 -0.6z"
    )

    top_10_colour = alt.Color(
        "is_top_10:N",
        legend=alt.Legend(title="Top 10 players with highest MV", orient="top"),
    )

    base = alt.Chart(data, title=alt.Title(title, subtitle=subtitle))

    # Grid position derived from rank: a new row starts after every 16 icons.
    positioned = base.transform_calculate(
        row="10 - ceil(datum.rank/16)"
    ).transform_calculate(col="datum.rank - ceil(datum.rank/16)*16")

    icons = positioned.mark_point(filled=True, size=70).encode(
        x=alt.X("col:Q").axis(None),
        y=alt.Y("row:Q", scale=alt.Scale(domain=[0, 10])).axis(None),
        color=top_10_colour,
        shape=alt.ShapeValue(person),
    )

    return icons.properties(width=400, height=400)

In [None]:
# World Cup power ratings, strongest team first.
fifa_power = (
    pd.read_csv("world_cup_result.csv")
    .drop(columns="Unnamed: 0")
    .rename(columns={"team": "country_name"})
    .sort_values(["power_rating"], ascending=False)
)

# Highlighting countries that have won the World Cup (France and Argentina are
# the two teams flagged here).
highlighted_champions = ("France", "Argentina")
fifa_power["is_former_champion"] = [
    "Y" if name in highlighted_champions else "N"
    for name in fifa_power["country_name"]
]

# 1-based rank in the sorted order, stored as the first column.
fifa_power = fifa_power.reset_index(drop=True)
fifa_power.insert(0, "rank", fifa_power.index + 1)

fifa_power.head(5)

In [None]:
# Average club market value per country, restricted to the countries present
# in `fifa_power` (inner merge on the country name).
avg_league_val_by_country = (
    pd.read_csv("avg_league_val_by_country.csv")
    .drop(columns="Unnamed: 0")
    # Transfermarkt writes South Korea as 'Korea, South'
    .replace({"country_name": {"Korea, South": "South Korea"}})
    .merge(fifa_power, on=["country_name"], how="inner")
)

In [None]:
foreign_player_national_teams_agg = pd.read_csv(
    "foreign_player_national_teams_agg.csv"
).drop(columns="Unnamed: 0")

# Average MV of domestically based players divided by the average MV of
# players based abroad, rounded to three decimals for display.
avg_mv_dom = (
    foreign_player_national_teams_agg["mv_play_dom"]
    / foreign_player_national_teams_agg["num_play_dom"]
)
avg_mv_int = (
    foreign_player_national_teams_agg["mv_play_int"]
    / foreign_player_national_teams_agg["num_play_int"]
)
dom_int_mv_ratio = (avg_mv_dom / avg_mv_int).round(3)

# Division by zero gives inf, which dropna() keeps; an inf ratio would later
# become the upper bound of the chart scale, so treat it as missing.
foreign_player_national_teams_agg["dom_int_mv_ratio"] = dom_int_mv_ratio.replace(
    [np.inf, -np.inf], np.nan
)

foreign_player_national_teams_agg = foreign_player_national_teams_agg.dropna(how="any")

# Keep only countries that also have a World Cup power rating.
foreign_player_national_teams_agg = foreign_player_national_teams_agg.merge(
    fifa_power, on=["country_name"], how="inner"
)

foreign_player_national_teams_agg.head(5)

In [None]:
# Narrow text-only column listing the rank numbers, shown left of the bar charts.
rank_axis = alt.Y("rank:N", axis=alt.Axis(ticks=False, labels=False, title=""))
rank = (
    alt.Chart(fifa_power)
    .mark_text()
    .encode(y=rank_axis, text=alt.Text("rank:N"))
    .properties(width=0, height=800)
)

# Each panel gets a copy of its data and an x domain from 0 to its largest value
# (the power rating uses the fixed domain [0, 1.2]).
fifa_power_viz = return_ranking_chart(
    title="FIFA World Cup Power Ratings",
    subtitle="Russia 2018, Qatar 2022",
    data_input=fifa_power.copy(),
    metric_name="power_rating",
    list_of_scale=[0, 1.2],
)

largest_market_val = max(avg_league_val_by_country.market_val)
mv_domestic_viz = return_ranking_chart(
    title="Avg. Market Value of Top 25 Domestic Teams",
    subtitle="Transfermarkt 2023",
    data_input=avg_league_val_by_country.copy(),
    metric_name="market_val",
    list_of_scale=[0, largest_market_val],
)

largest_ratio = max(foreign_player_national_teams_agg.dom_int_mv_ratio)
mv_dom_int_ratio_viz = return_ranking_chart(
    title="Ratio of MV Dom. Players to MV Int. Players",
    subtitle="Transfermarkt 2023",
    data_input=foreign_player_national_teams_agg.copy(),
    metric_name="dom_int_mv_ratio",
    list_of_scale=[0, largest_ratio],
)

# Category colours applied through configure_range in the visualization cells.
colors = ["#1CC1EA", "#EA1C1C"]

### Visualization I: Comparison among rankings (WC Power Rating, Avg. of Top 25 Domestic Clubs, MV Ratio)

In [None]:
# Rank column and the three ranking panels side by side under one title.
# The third subtitle line now states the ratio in the direction it is computed
# (domestic over international), matching `dom_int_mv_ratio` and the panel title.
(rank | fifa_power_viz | mv_domestic_viz | mv_dom_int_ratio_viz).properties(
    title={
        "text": "Countries ranked by World Cup power rating and by Transfermarkt data",
        "subtitle": [
            "",
            "FIFA World Cup Power Ratings: Manually calculated based on historic performance of the last two iteration of the event",
            "Average Market Value of Top 25 Domestic Teams: Calculated based on scraped data from website Transfermarkt",
            "Ratio of Market Value of Domestic Players to International Players: Calculated based on scraped data from website Transfermarkt",
            "",
            "*Power rating is calculation methodology limited to project - it is average of final rank/32 over two World Cups",
            "*Previous champions are highlighted in red",
            "",
        ],
        "fontSize": 25,
    }
).configure_title(fontSize=14, anchor="start").configure_view(
    strokeWidth=0
).configure_range(
    category=alt.RangeScheme(colors)
).configure_axis(
    grid=False, domain=False
)

In [None]:
# All scraped players, most valuable first.
players = (
    pd.read_csv("foreign_player_national_teams.csv")
    .drop(columns="Unnamed: 0")
    .sort_values(by=["players_mv"], ascending=False)
)

# The 100 most valuable players with a 0-based rank as the first column.
top_100_players = players.head(100).reset_index(drop=True)
top_100_players.insert(0, "rank", top_100_players.index)

# Ranks 0-9 are the ten most valuable players.
top_100_players["is_top_10"] = [
    "Y" if position <= 9 else "N" for position in top_100_players["rank"]
]

top_100_players.head(5)

In [None]:
def _rerank_from_one(players_subset):
    """Return a copy of ``players_subset`` with ``rank`` renumbered 1..n in row order."""
    reranked = players_subset.drop(columns=["rank"]).reset_index(drop=True)
    reranked.insert(0, "rank", reranked.index + 1)
    return reranked


# Top-100 players based in their own country / based abroad, each re-ranked
# from 1 so the isotype grid is filled without gaps.
top_100_players_dom = _rerank_from_one(
    top_100_players[top_100_players.plays_domestically == "Y"]
)
top_100_players_int = _rerank_from_one(
    top_100_players[top_100_players.plays_domestically == "N"]
)

### Visualization II: Isotype chart - do the best players play abroad or at home?

In [None]:

# Left panel: top-100 players based abroad; right panel: those based at home.
abroad_panel = return_isotype_chart(
    top_100_players_int,
    "From Top 100 MV players: Plays Internationally",
    "From Transfermarkt",
)
home_panel = return_isotype_chart(
    top_100_players_dom,
    "From Top 100 MV players: Plays Domestically",
    "From Transfermarkt",
)

isotype_title = {
    "text": "Do highest paid / highest market value players play abroad or at home?",
    "subtitle": [
        "",
        "After ranking all the players registered on the national teams based on MV, we can see the number of players playing domestically or internationally.",
        "Out of the 100 top players by MV, we see that players with higher market value tends to play abroad (77%) rather than at home (23%).",
        "Interestingly though, looking at the top 10 though, it is perfectly split at 50-50. National hero effect?",
        "",
    ],
    "fontSize": 25,
}

(
    (abroad_panel | home_panel)
    .properties(title=isotype_title)
    .configure(concat=alt.CompositionConfig(spacing=100))
    .configure_view(strokeWidth=0)
    .configure_range(category=alt.RangeScheme(colors))
    .configure_axis(grid=False, domain=False)
    .configure_legend(titleFontSize=10, labelFontSize=10, symbolSize=30)
)

In [None]:
# https://github.com/francisadrianviernes/GeoVisualization/tree/master/Shapefiles
# Country polygons read from a local shapefile; the `name` column is renamed to
# `country_name` so it can be merged with the World Cup results on that key.
gdf = gpd.read_file('shapefile/world-administrative-boundaries.shp').rename(columns={'name':'country_name'})
gdf.head()

In [None]:
# World Cup team names mapped to the spelling used in the shapefile.
# England and Wales both map to the United Kingdom entry.
shapefile_country_names = {
    'United States': 'United States of America',
    'England': 'U.K. of Great Britain and Northern Ireland',
    'South Korea': 'Republic of Korea',
    'Iran': 'Iran (Islamic Republic of)',
    'Russia': 'Russian Federation',
    'Wales': 'U.K. of Great Britain and Northern Ireland',
}

wc_results = (
    pd.read_csv('world_cup_result.csv')
    .drop(columns=['Unnamed: 0'])
    .replace(shapefile_country_names)
)

# Geometries of the countries that appear in the World Cup results.
combined = gdf.merge(wc_results, how='inner', on='country_name')

### Visualization III: Choropleth - World Cup as a global event

In [None]:
# Grey base map of every country with the participating countries drawn in
# green on top; both layers use the same size.
map_size = {"width": 950, "height": 550}

world = (
    alt.Chart(gdf)
    .mark_geoshape(stroke="white", fill="#CFD1CF")
    .properties(**map_size)
)

participant = (
    alt.Chart(combined)
    .mark_geoshape(stroke="white", fill="#58DC78")
    .properties(**map_size)
)

world_map_title = {
    "text": "World Cup is a truly global event",
    "subtitle": [
        "",
        "The qualification process for 32 teams that compete in the World Cup ensures that the event is truly global.",
        "The highlighted countries have participated in either of the previous iteration of the event.",
        "AFC (Asia): 4 or 5",
        "CAF (Africa): 5",
        "CONCACAF (North, Central America and Caribbean): 3 or 4",
        "CONMEBOL (South America): 4 or 5",
        "OFC (Oceania): 0 or 1",
        "UEFA (Europe): 13",
        "Hosts: 1",
    ],
    "fontSize": 25,
}

(world + participant).properties(title=world_map_title)