In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
# Load data about countries
countries_df = pd.read_json("../../parsing/parsedData/sorted_national_teams.json")
countries_df.head()

Unnamed: 0,NationalTeamID,NationalTeamName,Link_to_team,Page
0,3437,Аргентина,/argentinien/startseite/verein/3437,1
1,3377,Франция,/frankreich/startseite/verein/3377,1
2,3375,Испания,/spanien/startseite/verein/3375,1
3,3299,Англия,/england/startseite/verein/3299,1
4,3439,Бразилия,/brasilien/startseite/verein/3439,1


In [3]:
# Check axes
countries_df.axes

[RangeIndex(start=0, stop=210, step=1),
 Index(['NationalTeamID', 'NationalTeamName', 'Link_to_team', 'Page'], dtype='object')]

In [4]:
# Drop "Link_to_team" and "Page" columns, we will not use them
countries_df.drop(["Link_to_team", "Page"], axis=1, inplace=True)
countries_df.head()

Unnamed: 0,NationalTeamID,NationalTeamName
0,3437,Аргентина
1,3377,Франция
2,3375,Испания
3,3299,Англия
4,3439,Бразилия


In [5]:
# Check for missing values
countries_df.isna().sum()

NationalTeamID      0
NationalTeamName    0
dtype: int64

In [6]:
# Load data about clubs and their country belongings
clubs_df = pd.read_json("../../parsing/parsedData/sorted_teams.json")
clubs_df.head()

Unnamed: 0,TeamID,Team_name,Country_Name,Link_to_team,Page
0,418,Реал Мадрид,Испания,/real-madrid/startseite/verein/418,1
1,281,Манчестер Сити,Англия,/manchester-city/startseite/verein/281,1
2,27,Бавария,Германия,/fc-bayern-munchen/startseite/verein/27,1
3,31,Ливерпуль,Англия,/fc-liverpool/startseite/verein/31,1
4,583,ПСЖ,Франция,/fc-paris-saint-germain/startseite/verein/583,1


In [7]:
# Check axes
clubs_df.axes

[RangeIndex(start=0, stop=536, step=1),
 Index(['TeamID', 'Team_name', 'Country_Name', 'Link_to_team', 'Page'], dtype='object')]

In [8]:
# Drop "Link_to_team" and "Page" columns, we will not use them
clubs_df.drop(["Link_to_team", "Page"], axis=1, inplace=True)
clubs_df.head()

Unnamed: 0,TeamID,Team_name,Country_Name
0,418,Реал Мадрид,Испания
1,281,Манчестер Сити,Англия
2,27,Бавария,Германия
3,31,Ливерпуль,Англия
4,583,ПСЖ,Франция


In [9]:
# Merge two data frames
countries_df = countries_df.merge(clubs_df, left_on="NationalTeamName", right_on="Country_Name")
countries_df.head(15)

Unnamed: 0,NationalTeamID,NationalTeamName,TeamID,Team_name,Country_Name
0,3377,Франция,583,ПСЖ,Франция
1,3377,Франция,1082,Лилль,Франция
2,3377,Франция,244,Марсель,Франция
3,3377,Франция,273,Ренн,Франция
4,3377,Франция,1041,Лион,Франция
5,3377,Франция,417,Ницца,Франция
6,3377,Франция,3911,Брест,Франция
7,3377,Франция,826,Ланс,Франция
8,3377,Франция,415,Тулуза,Франция
9,3377,Франция,995,Нант,Франция


In [10]:
# Drop club names, we will not need them
countries_df.drop(["Team_name"], axis=1, inplace=True)
countries_df.head(15)

Unnamed: 0,NationalTeamID,NationalTeamName,TeamID,Country_Name
0,3377,Франция,583,Франция
1,3377,Франция,1082,Франция
2,3377,Франция,244,Франция
3,3377,Франция,273,Франция
4,3377,Франция,1041,Франция
5,3377,Франция,417,Франция
6,3377,Франция,3911,Франция
7,3377,Франция,826,Франция
8,3377,Франция,415,Франция
9,3377,Франция,995,Франция


In [11]:
# Load complete data about clubs
complete_clubs_df = pd.read_json("../../parsing/parsedData/complete_clubs.json")
complete_clubs_df.head()

Unnamed: 0,TeamID,Year,TeamCost,AverageAge,Legioners,TeamSize,PlayerIDS,NationalPlayersCount,TeamSizeRatio
0,1002,2014,"4,25 млн €",215,16,39,"[95621, 112797, 169963, 198986, 38793, 139304,...",11,
1,1002,2015,"3,60 млн €",216,12,32,"[112797, 169963, 198986, 38793, 133770, 133613...",9,0.82
2,1002,2016,"3,48 млн €",210,10,38,"[112797, 169963, 198986, 324448, 133770, 13361...",15,1.19
3,1002,2017,"3,55 млн €",213,8,37,"[112797, 169963, 198986, 324448, 363402, 13361...",10,0.97
4,1002,2018,"5,80 млн €",224,9,39,"[198981, 112797, 169963, 198986, 324448, 36359...",13,1.05


In [12]:
# Check axes
complete_clubs_df.axes

[RangeIndex(start=0, stop=5896, step=1),
 Index(['TeamID ', 'Year', 'TeamCost', 'AverageAge', 'Legioners', 'TeamSize',
        'PlayerIDS', 'NationalPlayersCount', 'TeamSizeRatio'],
       dtype='object')]

In [13]:
# Remove redundant symbols from axes
complete_clubs_df.columns = complete_clubs_df.columns.str.strip()
complete_clubs_df.axes

[RangeIndex(start=0, stop=5896, step=1),
 Index(['TeamID', 'Year', 'TeamCost', 'AverageAge', 'Legioners', 'TeamSize',
        'PlayerIDS', 'NationalPlayersCount', 'TeamSizeRatio'],
       dtype='object')]

In [14]:
# Check data types
complete_clubs_df.dtypes

TeamID                    int64
Year                      int64
TeamCost                 object
AverageAge               object
Legioners                 int64
TeamSize                  int64
PlayerIDS                object
NationalPlayersCount      int64
TeamSizeRatio           float64
dtype: object

In [15]:
# Let us find unique values for "TeamCost" column without considering digits
complete_clubs_df["TeamCost"].str.replace(r"([\d]+)", "", regex=True).unique()

array([', млн €', ' тыс €', '-', ', Млрд. €'], dtype=object)

In [16]:
# Drop rows with missing values ("-" sign)
complete_clubs_df.drop(complete_clubs_df[complete_clubs_df["TeamCost"].str.contains("-")].index, inplace=True)

In [17]:
# Verify result
complete_clubs_df["TeamCost"].str.replace(r"([\d]+)", "", regex=True).unique()

array([', млн €', ' тыс €', ', Млрд. €'], dtype=object)

In [18]:
# Remove commas from string in "TeamCost" column, remove "тыс" (thousand), replace "млн" (million) with one zero because numbers that represent millions have exactly 2 float digits, and replace "Млрд." (billion) with 4 zeros, so that every number will represent amount of thousands in cost
complete_clubs_df.TeamCost = complete_clubs_df.TeamCost. \
    str.replace(",", "") \
    .str.replace("тыс", "") \
    .str.replace("млн", "0") \
    .str.replace("Млрд.", "0000") \
    .str.replace("€", "") \
    .str.replace(" ", "")
complete_clubs_df.head(15)

Unnamed: 0,TeamID,Year,TeamCost,AverageAge,Legioners,TeamSize,PlayerIDS,NationalPlayersCount,TeamSizeRatio
0,1002,2014,4250,215,16,39,"[95621, 112797, 169963, 198986, 38793, 139304,...",11,
1,1002,2015,3600,216,12,32,"[112797, 169963, 198986, 38793, 133770, 133613...",9,0.82
2,1002,2016,3480,210,10,38,"[112797, 169963, 198986, 324448, 133770, 13361...",15,1.19
3,1002,2017,3550,213,8,37,"[112797, 169963, 198986, 324448, 363402, 13361...",10,0.97
4,1002,2018,5800,224,9,39,"[198981, 112797, 169963, 198986, 324448, 36359...",13,1.05
5,1002,2019,4680,214,5,38,"[198981, 324448, 169963, 363402, 549282, 73962...",14,0.97
6,1002,2020,5150,210,3,40,"[198981, 324448, 242044, 363402, 739626, 24736...",16,1.05
7,1002,2021,5750,224,3,37,"[198981, 324448, 242044, 363402, 402119, 57422...",17,0.93
8,1002,2022,5060,223,3,38,"[242044, 324448, 363402, 402119, 198988, 24736...",14,1.03
9,1002,2023,5330,212,5,51,"[363402, 402119, 324448, 488270, 927920, 24736...",15,1.34


In [19]:
# Merge two data frames to obtain clubs costs per year
countries_df = countries_df.merge(complete_clubs_df, left_on="TeamID", right_on="TeamID")
countries_df.head(15)

Unnamed: 0,NationalTeamID,NationalTeamName,TeamID,Country_Name,Year,TeamCost,AverageAge,Legioners,TeamSize,PlayerIDS,NationalPlayersCount,TeamSizeRatio
0,3377,Франция,583,Франция,2014,363900,257,30,32,"[25508, 18036, 182906, 164904, 29241, 46741, 1...",16,
1,3377,Франция,583,Франция,2015,436300,254,31,35,"[45672, 25508, 18036, 181767, 46741, 29241, 28...",17,1.09
2,3377,Франция,583,Франция,2016,519000,250,35,39,"[45672, 120629, 25508, 282028, 181767, 46741, ...",20,1.11
3,3377,Франция,583,Франция,2017,900100,255,31,35,"[120629, 45672, 282028, 395251, 181767, 282041...",19,0.9
4,3377,Франция,583,Франция,2018,1010000,254,36,39,"[120629, 45672, 282028, 5023, 395251, 181767, ...",21,1.11
5,3377,Франция,583,Франция,2019,874150,258,35,35,"[120629, 45672, 79422, 207302, 465955, 460626,...",20,0.9
6,3377,Франция,583,Франция,2020,805000,247,39,41,"[79422, 207302, 465955, 93730, 460626, 606576,...",15,1.17
7,3377,Франция,583,Франция,2021,875150,262,39,41,"[315858, 79422, 207302, 93730, 460626, 606576,...",19,1.0
8,3377,Франция,583,Франция,2022,891100,259,34,37,"[315858, 79422, 207302, 654408, 93730, 181767,...",17,0.9
9,3377,Франция,583,Франция,2023,1020000,249,34,36,"[315858, 466783, 79422, 207302, 93730, 810067,...",18,0.97


In [20]:
# Divide data into specific data frames
full_players_cost_df = countries_df[["NationalTeamID", "Year", "TeamID", "TeamCost"]]
full_players_cost_df.head(15)

Unnamed: 0,NationalTeamID,Year,TeamID,TeamCost
0,3377,2014,583,363900
1,3377,2015,583,436300
2,3377,2016,583,519000
3,3377,2017,583,900100
4,3377,2018,583,1010000
5,3377,2019,583,874150
6,3377,2020,583,805000
7,3377,2021,583,875150
8,3377,2022,583,891100
9,3377,2023,583,1020000


In [21]:
# Check data types
full_players_cost_df.dtypes

NationalTeamID     int64
Year               int64
TeamID             int64
TeamCost          object
dtype: object

In [22]:
# Check for missing values
full_players_cost_df.isna().sum()

NationalTeamID    0
Year              0
TeamID            0
TeamCost          0
dtype: int64

In [23]:
# Convert "TotalCost" column to integer
full_players_cost_df.loc[:, "TeamCost"] = full_players_cost_df.loc[:, "TeamCost"].astype(int)
full_players_cost_df.dtypes

NationalTeamID     int64
Year               int64
TeamID             int64
TeamCost          object
dtype: object

In [24]:
# Create another data frame for another metric, which is "average_team_cost"
average_team_cost_df = full_players_cost_df.groupby(["NationalTeamID", "Year"], as_index=False) \
    .agg({"TeamCost": "mean"}) \
    .rename(columns={"TotalCountryCost": "AverageCountryCost"}) \
    .sort_values(["NationalTeamID", "Year"])
    
average_team_cost_df.head(15)

Unnamed: 0,NationalTeamID,Year,TeamCost
0,3262,2014,140033.75
1,3262,2015,153685.625
2,3262,2016,166979.375
3,3262,2017,228828.75
4,3262,2018,279282.5
5,3262,2019,263208.125
6,3262,2020,284627.5
7,3262,2021,260271.25
8,3262,2022,275444.375
9,3262,2023,288541.875


In [25]:
# Write these results to a JSON file
with open("average_team_cost.json", "w") as file:
    json.dump(average_team_cost_df.to_dict(orient="records"), file, indent=4)

In [26]:
# Sum up costs of all players in clubs
full_players_cost_df = full_players_cost_df.groupby(["NationalTeamID", "Year"], as_index=False) \
    .agg({"TeamCost": "sum"}) \
    .rename(columns={"TeamCost": "TotalCountryCost"}) \
    .sort_values(["NationalTeamID", "Year"])
    
full_players_cost_df.head(15)

Unnamed: 0,NationalTeamID,Year,TotalCountryCost
0,3262,2014,2240540
1,3262,2015,2458970
2,3262,2016,2671670
3,3262,2017,3661260
4,3262,2018,4468520
5,3262,2019,4211330
6,3262,2020,4554040
7,3262,2021,4164340
8,3262,2022,4407110
9,3262,2023,4616670


In [27]:
# Write these results to a JSON file

with open("full_players_costs.json", "w") as file:
    json.dump(full_players_cost_df.to_dict(orient="records"), file, indent=4)

In [28]:
# Extract data about national teams players
national_teams_players_df = countries_df[["NationalTeamID", "Year", "TeamID", "NationalPlayersCount"]]
national_teams_players_df.head(15)

Unnamed: 0,NationalTeamID,Year,TeamID,NationalPlayersCount
0,3377,2014,583,16
1,3377,2015,583,17
2,3377,2016,583,20
3,3377,2017,583,19
4,3377,2018,583,21
5,3377,2019,583,20
6,3377,2020,583,15
7,3377,2021,583,19
8,3377,2022,583,17
9,3377,2023,583,18


In [29]:
# Check for missing values
national_teams_players_df.isna().sum()

NationalTeamID          0
Year                    0
TeamID                  0
NationalPlayersCount    0
dtype: int64

In [30]:
# Find number of national teams players within clubs per year for each country
national_teams_players_df = national_teams_players_df.groupby(["NationalTeamID", "Year"], as_index=False) \
    .agg({"NationalPlayersCount": "sum"}) \
    .sort_values(["NationalTeamID", "Year"])
    
national_teams_players_df.head(15)

Unnamed: 0,NationalTeamID,Year,NationalPlayersCount
0,3262,2014,171
1,3262,2015,160
2,3262,2016,153
3,3262,2017,152
4,3262,2018,145
5,3262,2019,165
6,3262,2020,155
7,3262,2021,184
8,3262,2022,178
9,3262,2023,180


In [31]:
# Write these results to a JSON file
with open("national_teams_players_total_amount.json", "w") as file:
    json.dump(national_teams_players_df.to_dict(orient="records"), file, indent=4)

In [32]:
# Extract data about legionnaires
legionnaires_df = countries_df[["NationalTeamID", "Year", "TeamID", "Legioners"]]
legionnaires_df.head(15)

Unnamed: 0,NationalTeamID,Year,TeamID,Legioners
0,3377,2014,583,30
1,3377,2015,583,31
2,3377,2016,583,35
3,3377,2017,583,31
4,3377,2018,583,36
5,3377,2019,583,35
6,3377,2020,583,39
7,3377,2021,583,39
8,3377,2022,583,34
9,3377,2023,583,34


In [33]:
# Find number of legionnaires within clubs per year for each country
legionnaires_df = legionnaires_df.groupby(["NationalTeamID", "Year"], as_index=False) \
    .agg({"Legioners": "sum"}) \
    .sort_values(["NationalTeamID", "Year"]) \
    .rename(columns={"Legioners": "TotalLegionnairesAmount"})
    
legionnaires_df.head(15)

Unnamed: 0,NationalTeamID,Year,TotalLegionnairesAmount
0,3262,2014,358
1,3262,2015,360
2,3262,2016,384
3,3262,2017,367
4,3262,2018,364
5,3262,2019,372
6,3262,2020,369
7,3262,2021,381
8,3262,2022,372
9,3262,2023,362


In [34]:
# Write these results to a JSON file
with open("legionnaires_total_amount.json", "w") as file:
    json.dump(legionnaires_df.to_dict(orient="records"), file, indent=4)

In [35]:
# Extract data about average age of players
average_age_df = countries_df[["NationalTeamID", "Year", "TeamID", "AverageAge"]]
average_age_df.head(15)

Unnamed: 0,NationalTeamID,Year,TeamID,AverageAge
0,3377,2014,583,257
1,3377,2015,583,254
2,3377,2016,583,250
3,3377,2017,583,255
4,3377,2018,583,254
5,3377,2019,583,258
6,3377,2020,583,247
7,3377,2021,583,262
8,3377,2022,583,259
9,3377,2023,583,249


In [36]:
# Replace commas with dots in "AverageAge" column
average_age_df.loc[:, "AverageAge"] = average_age_df.loc[:, "AverageAge"].str.replace(",", ".")
average_age_df.head(15)

Unnamed: 0,NationalTeamID,Year,TeamID,AverageAge
0,3377,2014,583,25.7
1,3377,2015,583,25.4
2,3377,2016,583,25.0
3,3377,2017,583,25.5
4,3377,2018,583,25.4
5,3377,2019,583,25.8
6,3377,2020,583,24.7
7,3377,2021,583,26.2
8,3377,2022,583,25.9
9,3377,2023,583,24.9


In [37]:
# Check data types
average_age_df.dtypes

NationalTeamID     int64
Year               int64
TeamID             int64
AverageAge        object
dtype: object

In [38]:
# Convert "AverageAge" column to float
average_age_df.loc[:, "AverageAge"] = average_age_df.loc[:, "AverageAge"].astype(float)
average_age_df.dtypes

NationalTeamID     int64
Year               int64
TeamID             int64
AverageAge        object
dtype: object

In [39]:
average_age_df.groupby(["NationalTeamID", "Year"], as_index=False) \
    .agg({"AverageAge": "mean"}) \
    .sort_values(["NationalTeamID", "Year"]) \
    .rename(columns={"AverageAge": "AverageAgeAmongClubs"})

Unnamed: 0,NationalTeamID,Year,AverageAgeAmongClubs
0,3262,2014,25.5625
1,3262,2015,25.4375
2,3262,2016,25.20625
3,3262,2017,25.39375
4,3262,2018,25.275
...,...,...,...
556,53982,2020,25.822222
557,53982,2021,25.422222
558,53982,2022,25.411111
559,53982,2023,25.144444


In [40]:
# Write these results to a JSON file
with open("total_average_age.json", "w") as file:
    json.dump(average_age_df.to_dict(orient="records"), file, indent=4)