In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
%matplotlib inline

In [2]:
# Read the parsed clubs data into a DataFrame and preview the first rows
clubs_json_path = "../../parsing/parsedData/complete_clubs.json"
clubs_info_df = pd.read_json(clubs_json_path)
clubs_info_df.iloc[:15]

Unnamed: 0,TeamID,Year,TeamCost,AverageAge,Legioners,TeamSize,PlayerIDS,NationalPlayersCount,TeamSizeRatio
0,1002,2014,"4,25 млн €",215,16,39,"[95621, 112797, 169963, 198986, 38793, 139304,...",11,
1,1002,2015,"3,60 млн €",216,12,32,"[112797, 169963, 198986, 38793, 133770, 133613...",9,0.82
2,1002,2016,"3,48 млн €",210,10,38,"[112797, 169963, 198986, 324448, 133770, 13361...",15,1.19
3,1002,2017,"3,55 млн €",213,8,37,"[112797, 169963, 198986, 324448, 363402, 13361...",10,0.97
4,1002,2018,"5,80 млн €",224,9,39,"[198981, 112797, 169963, 198986, 324448, 36359...",13,1.05
5,1002,2019,"4,68 млн €",214,5,38,"[198981, 324448, 169963, 363402, 549282, 73962...",14,0.97
6,1002,2020,"5,15 млн €",210,3,40,"[198981, 324448, 242044, 363402, 739626, 24736...",16,1.05
7,1002,2021,"5,75 млн €",224,3,37,"[198981, 324448, 242044, 363402, 402119, 57422...",17,0.93
8,1002,2022,"5,06 млн €",223,3,38,"[242044, 324448, 363402, 402119, 198988, 24736...",14,1.03
9,1002,2023,"5,33 млн €",212,5,51,"[363402, 402119, 324448, 488270, 927920, 24736...",15,1.34


In [3]:
# Show the frame's dimensions as a (rows, columns) tuple
clubs_info_df.shape

(5896, 9)

In [4]:
# Inspect both axes: the row index first, then the column labels
[clubs_info_df.index, clubs_info_df.columns]

[RangeIndex(start=0, stop=5896, step=1),
 Index(['TeamID ', 'Year', 'TeamCost', 'AverageAge', 'Legioners', 'TeamSize',
        'PlayerIDS', 'NationalPlayersCount', 'TeamSizeRatio'],
       dtype='object')]

In [5]:
# Strip leading/trailing whitespace from the column labels (e.g. the space after "TeamID")
stripped_labels = clubs_info_df.columns.str.strip()
clubs_info_df.columns = stripped_labels
[clubs_info_df.index, clubs_info_df.columns]

[RangeIndex(start=0, stop=5896, step=1),
 Index(['TeamID', 'Year', 'TeamCost', 'AverageAge', 'Legioners', 'TeamSize',
        'PlayerIDS', 'NationalPlayersCount', 'TeamSizeRatio'],
       dtype='object')]

In [6]:
# Keep only "TeamID", "Year" and "TeamSizeRatio"; the other columns are not needed below.
# The result is re-assigned rather than dropped in place, and errors="ignore" skips
# columns that are already gone, so re-running this cell does not raise a KeyError.
unused_columns = ["TeamCost", "AverageAge", "Legioners", "TeamSize", "PlayerIDS", "NationalPlayersCount"]
clubs_info_df = clubs_info_df.drop(columns=unused_columns, errors="ignore")
clubs_info_df.head(10)

Unnamed: 0,TeamID,Year,TeamSizeRatio
0,1002,2014,
1,1002,2015,0.82
2,1002,2016,1.19
3,1002,2017,0.97
4,1002,2018,1.05
5,1002,2019,0.97
6,1002,2020,1.05
7,1002,2021,0.93
8,1002,2022,1.03
9,1002,2023,1.34


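We obtained NaNs in the rows where the year is 2014. This is fine: judging by the sample above, `TeamSizeRatio` appears to be the team size divided by the previous year's team size (e.g. 32 / 39 ≈ 0.82 for 2015), so it cannot be calculated for the first year in the data. For now we will just drop the 2014 rows.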

In [7]:
# Count the non-missing "TeamSizeRatio" values for each team: the expectation is 10 per team,
# with the only missing value sitting in the 2014 row
(
    clubs_info_df
    .groupby("TeamID", as_index=False)
    .agg({"TeamSizeRatio": "count"})
)

Unnamed: 0,TeamID,TeamSizeRatio
0,3,10
1,5,10
2,11,10
3,12,10
4,13,10
...,...,...
531,64534,5
532,64780,10
533,69752,5
534,80996,4


Okay, as we can see, some teams are missing more than one row; let us find and drop them.

In [8]:
# First, count the missing values per column among the 2014 rows only
season_2014_df = clubs_info_df[clubs_info_df["Year"] == 2014]
season_2014_df.isna().sum()

TeamID             0
Year               0
TeamSizeRatio    536
dtype: int64

In [9]:
# Shape of the frame after filtering down to the 2014 rows
is_2014 = clubs_info_df["Year"] == 2014
clubs_info_df[is_2014].shape

(536, 3)

Great! Then drop rows where "Year" is 2014

In [10]:
# Remove every row whose "Year" is 2014, then report the new shape
index_2014 = clubs_info_df.index[clubs_info_df["Year"] == 2014]
clubs_info_df.drop(index=index_2014, inplace=True)
clubs_info_df.shape

(5360, 3)

The shape is reduced by exactly 536 rows (5896 − 5360), as expected, fine

In [11]:
# List the teams whose number of non-missing ratios differs from 10
ratio_counts = clubs_info_df.groupby(by="TeamID", as_index=False).agg({"TeamSizeRatio": "count"})
ratio_counts[ratio_counts["TeamSizeRatio"] != 10]

Unnamed: 0,TeamID,TeamSizeRatio
178,1007,7
251,2782,9
265,3137,8
270,3461,9
275,3592,6
276,3649,8
309,5228,6
342,6993,8
358,8715,8
362,8969,5


In [12]:
# Shape of the table of flagged teams (its row count is the number of teams to remove)
flagged_counts = (
    clubs_info_df
    .groupby(by="TeamID", as_index=False)
    .agg({"TeamSizeRatio": "count"})
)
flagged_counts[flagged_counts["TeamSizeRatio"] != 10].shape

(50, 2)

In [13]:
# Number of distinct team IDs that will be removed
counts_by_team = clubs_info_df.groupby(by="TeamID", as_index=False).agg({"TeamSizeRatio": "count"})
incomplete_rows = counts_by_team[counts_by_team["TeamSizeRatio"] != 10]
incomplete_rows["TeamID"].nunique()

50

In [14]:
# Collect the IDs of the teams whose count of non-missing ratios is not 10
per_team_counts = clubs_info_df.groupby(by="TeamID", as_index=False).agg({"TeamSizeRatio": "count"})
incomplete_mask = per_team_counts["TeamSizeRatio"] != 10
teams_with_missing_years = per_team_counts.loc[incomplete_mask, "TeamID"].unique()

In [15]:
# Remove all rows belonging to the teams collected in teams_with_missing_years
belongs_to_flagged_team = clubs_info_df["TeamID"].isin(teams_with_missing_years)
rows_to_remove = clubs_info_df.index[belongs_to_flagged_team]
clubs_info_df.drop(index=rows_to_remove, inplace=True)

In [16]:
# Look at the per-team counts of non-missing ratios once more
(
    clubs_info_df
    .groupby(by="TeamID", as_index=False)
    .agg({"TeamSizeRatio": "count"})
)

Unnamed: 0,TeamID,TeamSizeRatio
0,3,10
1,5,10
2,11,10
3,12,10
4,13,10
...,...,...
481,46887,10
482,48325,10
483,48332,10
484,53152,10


In [17]:
# Confirm the drop: no team should remain with a count other than 10 (expect an empty table)
(
    clubs_info_df
    .groupby(by="TeamID", as_index=False)
    .agg({"TeamSizeRatio": "count"})
    .loc[lambda counts: counts["TeamSizeRatio"] != 10]
)

Unnamed: 0,TeamID,TeamSizeRatio


In [18]:
# Preview what the DataFrame looks like now (first 15 rows)
clubs_info_df.iloc[:15]

Unnamed: 0,TeamID,Year,TeamSizeRatio
1,1002,2015,0.82
2,1002,2016,1.19
3,1002,2017,0.97
4,1002,2018,1.05
5,1002,2019,0.97
6,1002,2020,1.05
7,1002,2021,0.93
8,1002,2022,1.03
9,1002,2023,1.34
10,1002,2024,0.59


In [19]:
# Order the rows by team and, within each team, by year; then preview the first 10 rows
sort_keys = ["TeamID", "Year"]
clubs_info_df.sort_values(sort_keys, inplace=True)
clubs_info_df.iloc[:10]

Unnamed: 0,TeamID,Year,TeamSizeRatio
2641,3,2015,0.93
2642,3,2016,1.2
2643,3,2017,1.27
2644,3,2018,0.89
2645,3,2019,1.03
2646,3,2020,1.03
2647,3,2021,0.86
2648,3,2022,1.26
2649,3,2023,0.85
2650,3,2024,0.94


In [20]:
# Count the remaining missing values in each column
clubs_info_df.isnull().sum()

TeamID           0
Year             0
TeamSizeRatio    0
dtype: int64

In [21]:
# Write the cleaned frame to disk as an indented JSON list with one object per row
with open("team_size_ratio.json", mode="w") as out_file:
    records = clubs_info_df.to_dict(orient="records")
    json.dump(records, out_file, indent=4)