In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [2]:
# Load the list of teams (ID, name, country, link, page) from the parsed JSON file
teams_json_path = "../parsing/titles_and_cups/sorted_teams.json"
team_info_df = pd.read_json(teams_json_path)
team_info_df.head(10)

Unnamed: 0,TeamID,Team_name,Country_Name,Link_to_team,Page
0,418,Реал Мадрид,Испания,/real-madrid/startseite/verein/418,1
1,281,Манчестер Сити,Англия,/manchester-city/startseite/verein/281,1
2,27,Бавария,Германия,/fc-bayern-munchen/startseite/verein/27,1
3,31,Ливерпуль,Англия,/fc-liverpool/startseite/verein/31,1
4,583,ПСЖ,Франция,/fc-paris-saint-germain/startseite/verein/583,1
5,46,Интер,Италия,/inter-mailand/startseite/verein/46,1
6,12,Рома,Италия,/as-rom/startseite/verein/12,1
7,16,Боруссия Д.,Германия,/borussia-dortmund/startseite/verein/16,1
8,631,Челси,Англия,/fc-chelsea/startseite/verein/631,1
9,15,Байер,Германия,/bayer-04-leverkusen/startseite/verein/15,1


In [3]:
# Check data types
team_info_df.dtypes

TeamID           int64
Team_name       object
Country_Name    object
Link_to_team    object
Page             int64
dtype: object

In [4]:
# Show dimensionality
team_info_df.shape

(536, 5)

In [5]:
# Drop the link and page-number columns from the list of all teams, we will not need them.
# errors="ignore" keeps the cell re-runnable: on a second run the columns are already gone,
# and the drop becomes a no-op instead of raising KeyError.
team_info_df = team_info_df.drop(columns=["Link_to_team", "Page"], errors="ignore")
team_info_df.head(10)

Unnamed: 0,TeamID,Team_name,Country_Name
0,418,Реал Мадрид,Испания
1,281,Манчестер Сити,Англия
2,27,Бавария,Германия
3,31,Ливерпуль,Англия
4,583,ПСЖ,Франция
5,46,Интер,Италия
6,12,Рома,Италия
7,16,Боруссия Д.,Германия
8,631,Челси,Англия
9,15,Байер,Германия


In [6]:
# Count missing (NaN) values in every column
team_info_df.isna().sum()

TeamID          0
Team_name       0
Country_Name    0
dtype: int64

In [7]:
# Load per-club title data (titles by year and total number of cups) from the parsed JSON file
titles_json_path = "../parsing/titles_and_cups/titles_cups.json"
club_titles_df = pd.read_json(titles_json_path)
club_titles_df.head(10)

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups
0,31,"{'2024': 1, '2023': 2, '2022': 2, '2021': 4, '...",71
1,12,"{'2024': 1, '2023': 1, '2022': 2, '2021': 2, '...",17
2,583,"{'2024': 3, '2023': 4, '2022': 3, '2021': 2, '...",51
3,418,"{'2024': 3, '2023': 4, '2022': 5, '2021': 4, '...",103
4,281,"{'2024': 3, '2023': 6, '2022': 4, '2021': 2, '...",43
5,631,"{'2024': 1, '2022': 1, '2021': 4, '2020': 3, '...",34
6,27,"{'2024': 2, '2023': 2, '2022': 3, '2021': 3, '...",84
7,16,"{'2024': 2, '2023': 2, '2022': 2, '2021': 3, '...",24
8,46,"{'2024': 2, '2023': 3, '2022': 4, '2021': 3, '...",46
9,15,"{'2024': 2, '2023': 5, '2022': 2, '2021': 1, '...",6


In [8]:
# Show dimensionality
club_titles_df.shape

(536, 3)

In [9]:
# Check data types 
club_titles_df.dtypes

TeamID                    int64
NumberOfTitlesByYears    object
NumberOfCups              int64
dtype: object

In [10]:
# Count missing (NaN) values in every column
club_titles_df.isna().sum()

TeamID                   0
NumberOfTitlesByYears    0
NumberOfCups             0
dtype: int64

In [11]:
# Titles are stored as one {year: count} dictionary per club; turn each dictionary into a list of
# (year, count) pairs so that it can be expanded into separate rows later.
# The isinstance guard keeps the cell re-runnable: values that were already converted to lists
# on a previous run are passed through unchanged instead of failing on the missing .items().
club_titles_df["NumberOfTitlesByYears"] = club_titles_df["NumberOfTitlesByYears"].apply(
    lambda titles: list(titles.items()) if isinstance(titles, dict) else titles
)
club_titles_df.head(10)

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups
0,31,"[(2024, 1), (2023, 2), (2022, 2), (2021, 4), (...",71
1,12,"[(2024, 1), (2023, 1), (2022, 2), (2021, 2), (...",17
2,583,"[(2024, 3), (2023, 4), (2022, 3), (2021, 2), (...",51
3,418,"[(2024, 3), (2023, 4), (2022, 5), (2021, 4), (...",103
4,281,"[(2024, 3), (2023, 6), (2022, 4), (2021, 2), (...",43
5,631,"[(2024, 1), (2022, 1), (2021, 4), (2020, 3), (...",34
6,27,"[(2024, 2), (2023, 2), (2022, 3), (2021, 3), (...",84
7,16,"[(2024, 2), (2023, 2), (2022, 2), (2021, 3), (...",24
8,46,"[(2024, 2), (2023, 3), (2022, 4), (2021, 3), (...",46
9,15,"[(2024, 2), (2023, 5), (2022, 2), (2021, 1), (...",6


In [17]:
# For every club, add up the counts of all its (year, count) pairs
trophies = [
    sum(pair[1] for pair in year_count_pairs)
    for year_count_pairs in club_titles_df["NumberOfTitlesByYears"]
]
len(trophies)

536

In [18]:
# Store the per-club sums as a new column (a copy of the list, one value per row)
club_titles_df["Trophies"] = list(trophies)
club_titles_df.head(15)

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups,Trophies
0,31,"[(2024, 1), (2023, 2), (2022, 2), (2021, 4), (...",71,24
1,12,"[(2024, 1), (2023, 1), (2022, 2), (2021, 2), (...",17,14
2,583,"[(2024, 3), (2023, 4), (2022, 3), (2021, 2), (...",51,44
3,418,"[(2024, 3), (2023, 4), (2022, 5), (2021, 4), (...",103,42
4,281,"[(2024, 3), (2023, 6), (2022, 4), (2021, 2), (...",43,34
5,631,"[(2024, 1), (2022, 1), (2021, 4), (2020, 3), (...",34,20
6,27,"[(2024, 2), (2023, 2), (2022, 3), (2021, 3), (...",84,42
7,16,"[(2024, 2), (2023, 2), (2022, 2), (2021, 3), (...",24,30
8,46,"[(2024, 2), (2023, 3), (2022, 4), (2021, 3), (...",46,21
9,15,"[(2024, 2), (2023, 5), (2022, 2), (2021, 1), (...",6,19


In [19]:
# Clubs whose recorded total number of cups is smaller than the sum of their per-year titles
club_titles_df.query("NumberOfCups < Trophies")

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups,Trophies
7,16,"[(2024, 2), (2023, 2), (2022, 2), (2021, 3), (...",24,30
9,15,"[(2024, 2), (2023, 5), (2022, 2), (2021, 1), (...",6,19
14,23826,"[(2024, 1), (2023, 2), (2022, 3), (2021, 3), (...",6,20
19,24,"[(2024, 1), (2023, 1), (2022, 3), (2021, 2), (...",11,12
25,6195,"[(2023, 1), (2022, 2), (2021, 1), (2020, 1), (...",14,15
...,...,...,...,...
527,10883,"[(2020, 1), (2017, 1), (2015, 1), (2014, 1)]",3,4
528,17959,"[(2022, 1), (2021, 1)]",1,2
529,45520,"[(2020, 1), (2019, 1), (2014, 1)]",0,3
532,6993,"[(2016, 2), (2014, 1)]",2,3


In [25]:
# Show the full list of (year, count) pairs for the club with TeamID 16
club_titles_df.loc[club_titles_df["TeamID"] == 16, "NumberOfTitlesByYears"].iloc[0]

[('2024', 2),
 ('2023', 2),
 ('2022', 2),
 ('2021', 3),
 ('2020', 3),
 ('2019', 3),
 ('2018', 3),
 ('2017', 3),
 ('2016', 3),
 ('2015', 3),
 ('2014', 3)]

In [13]:
# List the teams whose recorded total number of cups is zero
club_titles_df.query("NumberOfCups == 0")

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups
89,3911,"[(2024, 1), (2018, 1)]",0
155,12321,"[(2024, 1), (2021, 1), (2018, 1), (2016, 1)]",0
160,16704,"[(2020, 2), (2019, 1), (2018, 1), (2016, 1), (...",0
214,324,[],0
220,40090,[],0
251,27843,[],0
257,63993,"[(2017, 1)]",0
261,36999,[],0
268,61955,"[(2022, 1), (2019, 1), (2017, 1), (2016, 1)]",0
287,46710,"[(2015, 1)]",0


In [None]:
# Now we should fill missing rows:

# club_titles_df.loc[club_titles_df["NumberOfCups"] == 0, "NumberOfCups"] = [len(item) for item in club_titles_df.loc[club_titles_df["NumberOfCups"] == 0, "NumberOfTitlesByYears"]]

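# Commented out: this fill caused many problems later, when calculating the number of trophies for previous years
# (note that len(item) counts the years in which titles were won, not the titles themselves).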

In [15]:
# Re-list the teams whose "NumberOfCups" is still 0.
# NOTE(review): this was written as a check of the fill step above, which is now commented out,
# so the saved output of this cell may no longer match a fresh run.
club_titles_df.query("NumberOfCups == 0")

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups
214,324,[],0
220,40090,[],0
251,27843,[],0
261,36999,[],0
300,31130,[],0
344,11194,[],0
355,25232,[],0
388,26985,[],0
394,22461,[],0
415,32125,[],0


In [16]:
# Spot-check a few individual teams by ID
club_titles_df[club_titles_df["TeamID"].isin([3911, 61955, 35545])]

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups
89,3911,"[(2024, 1), (2018, 1)]",2
268,61955,"[(2022, 1), (2019, 1), (2017, 1), (2016, 1)]",4
343,35545,"[(2022, 1)]",1


In [17]:
# Remove the teams whose "NumberOfCups" is 0; they are treated as having won nothing and are useless for us.
# NOTE(review): the filter looks only at "NumberOfCups". With the fill step above commented out,
# teams that do have per-year entries but NumberOfCups == 0 (e.g. TeamID 3911 in the earlier output)
# are removed as well - confirm that this is intended.
zero_cup_rows = club_titles_df.index[club_titles_df["NumberOfCups"] == 0]
club_titles_df = club_titles_df.drop(index=zero_cup_rows)
club_titles_df.shape

(519, 3)

Great, we removed exactly 17 rows as expected

In [18]:
# Create a standalone DataFrame with one row per (team, year) entry by exploding the list of pairs.
# A team whose list is empty still gets one row, with NaN in "NumberOfTitlesByYears".
titles_per_years_df = club_titles_df.explode("NumberOfTitlesByYears")
titles_per_years_df.head(15)

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups
0,31,"(2024, 1)",71
0,31,"(2023, 2)",71
0,31,"(2022, 2)",71
0,31,"(2021, 4)",71
0,31,"(2020, 1)",71
0,31,"(2019, 5)",71
0,31,"(2018, 2)",71
0,31,"(2017, 2)",71
0,31,"(2015, 3)",71
0,31,"(2014, 2)",71


In [19]:
# Find out new, expanded shape
titles_per_years_df.shape

(2103, 3)

In [118]:
# Inspect all per-year rows of a single team (TeamID 15)
# NOTE(review): the saved output shows the "Year" and "NumberOfTitlesThisYear" columns, which are only
# created in later cells - this cell was presumably executed out of order.
titles_per_years_df[titles_per_years_df["TeamID"] == 15]

Unnamed: 0,TeamID,NumberOfCups,Year,NumberOfTitlesThisYear
104,15,6,2014,1
103,15,6,2015,2
102,15,6,2016,1
101,15,6,2018,1
100,15,6,2019,3
99,15,6,2020,1
98,15,6,2021,1
97,15,6,2022,2
96,15,6,2023,5
95,15,6,2024,2


In [20]:
# Investigate data types
titles_per_years_df.dtypes

TeamID                    int64
NumberOfTitlesByYears    object
NumberOfCups              int64
dtype: object

In [21]:
# Find rows that contain null values in "TeamID" field
titles_per_years_df[titles_per_years_df["TeamID"].isna()]

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups


In [22]:
# Find rows with NaNs in titles column
titles_per_years_df[titles_per_years_df["NumberOfTitlesByYears"].isna()]

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups
116,1101,,10
147,327,,1
167,4529,,1
253,759,,6
262,987,,8
284,29053,,2
297,8780,,10
301,172,,1
351,715,,2
372,8818,,2


In [23]:
# Count obtained rows
titles_per_years_df["NumberOfTitlesByYears"].isna().sum()

np.int64(24)

In [24]:
# Check whether there are rows with missing total number of titles 
titles_per_years_df[titles_per_years_df["NumberOfCups"].isna()]

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups


In [25]:
# Drop rows that contain missing values: they carry no per-year title information for our analysis
titles_per_years_df.dropna(inplace=True)
titles_per_years_df.shape

(2079, 3)

As expected, the number of rows in the DataFrame was reduced by 24.

In [26]:
# Each team now spans several rows that share one index label, so renumber the rows from 0
titles_per_years_df = titles_per_years_df.reset_index(drop=True)
titles_per_years_df.head(10)

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups
0,31,"(2024, 1)",71
1,31,"(2023, 2)",71
2,31,"(2022, 2)",71
3,31,"(2021, 4)",71
4,31,"(2020, 1)",71
5,31,"(2019, 5)",71
6,31,"(2018, 2)",71
7,31,"(2017, 2)",71
8,31,"(2015, 3)",71
9,31,"(2014, 2)",71


In [27]:
# Split every (year, count) pair into two columns: the year and the number of titles won in that year
year_count_pairs = titles_per_years_df["NumberOfTitlesByYears"]
has_pair = year_count_pairs.notna()
titles_per_years_df["Year"] = year_count_pairs[has_pair].map(lambda pair: pair[0])
titles_per_years_df["NumberOfTitlesThisYear"] = year_count_pairs[has_pair].map(lambda pair: pair[1])
titles_per_years_df.head(10)

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups,Year,NumberOfTitlesThisYear
0,31,"(2024, 1)",71,2024,1
1,31,"(2023, 2)",71,2023,2
2,31,"(2022, 2)",71,2022,2
3,31,"(2021, 4)",71,2021,4
4,31,"(2020, 1)",71,2020,1
5,31,"(2019, 5)",71,2019,5
6,31,"(2018, 2)",71,2018,2
7,31,"(2017, 2)",71,2017,2
8,31,"(2015, 3)",71,2015,3
9,31,"(2014, 2)",71,2014,2


In [28]:
# Drop the column of (year, count) pairs, it has been expanded into "Year" and "NumberOfTitlesThisYear".
# errors="ignore" keeps the cell re-runnable: on a second run the column is already gone,
# and the drop becomes a no-op instead of raising KeyError.
titles_per_years_df = titles_per_years_df.drop(columns=["NumberOfTitlesByYears"], errors="ignore")
titles_per_years_df.head(15)

Unnamed: 0,TeamID,NumberOfCups,Year,NumberOfTitlesThisYear
0,31,71,2024,1
1,31,71,2023,2
2,31,71,2022,2
3,31,71,2021,4
4,31,71,2020,1
5,31,71,2019,5
6,31,71,2018,2
7,31,71,2017,2
8,31,71,2015,3
9,31,71,2014,2


In [29]:
# Check obtained data types
titles_per_years_df.dtypes

TeamID                     int64
NumberOfCups               int64
Year                      object
NumberOfTitlesThisYear     int64
dtype: object

In [30]:
# Firstly, order the rows by team ID and, within each team, by year (both ascending)
titles_per_years_df = titles_per_years_df.sort_values(by=["TeamID", "Year"])
titles_per_years_df.head(10)

Unnamed: 0,TeamID,NumberOfCups,Year,NumberOfTitlesThisYear
1201,3,13,2017,2
1200,3,13,2018,2
1199,3,13,2022,1
1198,3,13,2023,1
337,5,53,2016,1
336,5,53,2021,1
335,5,53,2024,1
135,11,48,2014,3
134,11,48,2015,2
133,11,48,2016,2


In [31]:
# Cast the "Year" column to int
titles_per_years_df = titles_per_years_df.astype({"Year": int})

"NumberOfCups" currently shows the total number of titles of each team as of the current (2025) year; we have to modify it to show the trend over the selected time interval, which covers 12 years: 2014 through 2025.
However, we may face the following issue: some teams did not win anything in some years, but we still need rows for those years, because the data were collected for the whole period from 2014 through 2025.

In [32]:
# Build the complete (team, year) grid as a new DataFrame, so that years without any title get a row too
years_range = range(2014, 2026)  # 2014 through 2025 inclusive
team_ids = titles_per_years_df["TeamID"].unique()
team_year_grid = pd.MultiIndex.from_product(
    [team_ids, years_range],
    names=["TeamID", "Year"],
)
full_titles_per_years_df = team_year_grid.to_frame(index=False)
full_titles_per_years_df.head(20)

Unnamed: 0,TeamID,Year
0,3,2014
1,3,2015
2,3,2016
3,3,2017
4,3,2018
5,3,2019
6,3,2020
7,3,2021
8,3,2022
9,3,2023


In [33]:
# Left-join the per-year titles onto the full (team, year) grid; years without titles get NaN
full_titles_per_years_df = pd.merge(
    full_titles_per_years_df,
    titles_per_years_df,
    how="left",
    on=["TeamID", "Year"],
)
full_titles_per_years_df.head(20)

Unnamed: 0,TeamID,Year,NumberOfCups,NumberOfTitlesThisYear
0,3,2014,,
1,3,2015,,
2,3,2016,,
3,3,2017,13.0,2.0
4,3,2018,13.0,2.0
5,3,2019,,
6,3,2020,,
7,3,2021,,
8,3,2022,13.0,1.0
9,3,2023,13.0,1.0


In [34]:
# Investigate new shape
full_titles_per_years_df.shape

(5940, 4)

In [35]:
# Check for NaNs
full_titles_per_years_df.isna().sum()

TeamID                       0
Year                         0
NumberOfCups              3861
NumberOfTitlesThisYear    3861
dtype: int64

Now we have to fill the NaNs in the "NumberOfCups" column with the respective number for each team

In [36]:
# Each TeamID had only one "NumberOfCups" value - its total number of titles in the current (2025) year -
# repeated on every row of that team, so the first non-null entry per team is that value.
# GroupBy.first() returns it as a plain scalar; the previous agg(lambda x: x.mode()) returned a Series,
# which produces array-valued results when a group has several modes or only NaNs.
most_frequent_number_of_titles = (
    full_titles_per_years_df[full_titles_per_years_df["NumberOfCups"] != 0]
    .groupby("TeamID")["NumberOfCups"]
    .first()
)

In [37]:
# Fill the NaNs in "NumberOfCups" from the per-team lookup `most_frequent_number_of_titles`.
# Series.map does the lookup by TeamID in one vectorised step; a team missing from the lookup
# simply stays NaN instead of raising KeyError, and an empty selection is handled without error.
missing_cups = full_titles_per_years_df["NumberOfCups"].isna()
full_titles_per_years_df.loc[missing_cups, "NumberOfCups"] = (
    full_titles_per_years_df.loc[missing_cups, "TeamID"].map(most_frequent_number_of_titles)
)

In [38]:
# Check what we have obtained
full_titles_per_years_df.head(15)

Unnamed: 0,TeamID,Year,NumberOfCups,NumberOfTitlesThisYear
0,3,2014,13.0,
1,3,2015,13.0,
2,3,2016,13.0,
3,3,2017,13.0,2.0
4,3,2018,13.0,2.0
5,3,2019,13.0,
6,3,2020,13.0,
7,3,2021,13.0,
8,3,2022,13.0,1.0
9,3,2023,13.0,1.0


In [39]:
# Verify
full_titles_per_years_df.isna().sum()

TeamID                       0
Year                         0
NumberOfCups                 0
NumberOfTitlesThisYear    3861
dtype: int64

In [40]:
# Replace every NaN that is still left in the frame with 0
full_titles_per_years_df = full_titles_per_years_df.fillna(0)
full_titles_per_years_df.head(15)

Unnamed: 0,TeamID,Year,NumberOfCups,NumberOfTitlesThisYear
0,3,2014,13.0,0.0
1,3,2015,13.0,0.0
2,3,2016,13.0,0.0
3,3,2017,13.0,2.0
4,3,2018,13.0,2.0
5,3,2019,13.0,0.0
6,3,2020,13.0,0.0
7,3,2021,13.0,0.0
8,3,2022,13.0,1.0
9,3,2023,13.0,1.0


In [46]:
# Order the rows by team ID (ascending) and, within each team, by year (descending, newest first),
# then renumber the rows from 0 just for a better look
full_titles_per_years_df = (
    full_titles_per_years_df
    .sort_values(by=["TeamID", "Year"], ascending=[True, False])
    .reset_index(drop=True)
)
full_titles_per_years_df.head(15)

Unnamed: 0,TeamID,Year,NumberOfCups,NumberOfTitlesThisYear
0,3,2025,13.0,0.0
1,3,2024,13.0,0.0
2,3,2023,13.0,1.0
3,3,2022,13.0,1.0
4,3,2021,13.0,0.0
5,3,2020,13.0,0.0
6,3,2019,13.0,0.0
7,3,2018,13.0,2.0
8,3,2017,13.0,2.0
9,3,2016,13.0,0.0


In [49]:
# Running total of "NumberOfTitlesThisYear" within each team, accumulated in the current row order
full_titles_per_years_df.groupby("TeamID")["NumberOfTitlesThisYear"].cumsum()

0       0.0
1       0.0
2       1.0
3       2.0
4       2.0
       ... 
5935    3.0
5936    3.0
5937    3.0
5938    3.0
5939    3.0
Name: NumberOfTitlesThisYear, Length: 5940, dtype: float64

In [59]:
# Exploratory attempt to subtract the per-team running total from "NumberOfCups".
# NOTE(review): pandas aligns both operands on their index labels, so the positional slices do not
# shift anything - every row gets NumberOfCups minus its own running total, and only the first and
# the last label (each missing from one of the operands) become NaN.
full_titles_per_years_df.loc[1:, "NumberOfCups"] - full_titles_per_years_df.groupby("TeamID")["NumberOfTitlesThisYear"].cumsum()[:-1]

0        NaN
1       13.0
2       12.0
3       11.0
4       11.0
        ... 
5935    -2.0
5936    -2.0
5937    -2.0
5938    -2.0
5939     NaN
Length: 5940, dtype: float64

In [98]:
# Reconstruct, for every team and year, the total number of titles held at the end of that year:
# the current total ("NumberOfCups") minus all titles won in later years.
# The rows are sorted explicitly (team ascending, newest year first) so that the result does not
# depend on the order left behind by an earlier cell.
ordered_titles = full_titles_per_years_df.sort_values(
    by=["TeamID", "Year"], ascending=[True, False]
)
# Within a team, the running total minus the row's own value is the sum over the preceding rows,
# i.e. over the later years. "NumberOfTitlesThisYear" is expected to contain no NaN at this point.
titles_won_later = (
    ordered_titles.groupby("TeamID")["NumberOfTitlesThisYear"].cumsum()
    - ordered_titles["NumberOfTitlesThisYear"]
)
temp_df = ordered_titles.assign(
    TotalTitles=ordered_titles["NumberOfCups"] - titles_won_later
)

In [99]:
# Inspect the frame with the reconstructed "TotalTitles" column
temp_df

Unnamed: 0,TeamID,Year,NumberOfCups,NumberOfTitlesThisYear,TotalTitles
0,3,2025,13.0,0.0,13.0
1,3,2024,13.0,0.0,13.0
2,3,2023,13.0,1.0,13.0
3,3,2022,13.0,1.0,12.0
4,3,2021,13.0,0.0,11.0
...,...,...,...,...,...
5935,98841,2018,1.0,0.0,-2.0
5936,98841,2017,1.0,0.0,-2.0
5937,98841,2016,1.0,0.0,-2.0
5938,98841,2015,1.0,0.0,-2.0


In [100]:
# Rows where the reconstructed total is negative, i.e. more titles were subtracted than "NumberOfCups" holds
temp_df.query("TotalTitles < 0")

Unnamed: 0,TeamID,Year,NumberOfCups,NumberOfTitlesThisYear,TotalTitles
75,15,2022,6.0,2.0,-1.0
76,15,2021,6.0,1.0,-3.0
77,15,2020,6.0,1.0,-4.0
78,15,2019,6.0,3.0,-5.0
79,15,2018,6.0,1.0,-8.0
...,...,...,...,...,...
5935,98841,2018,1.0,0.0,-2.0
5936,98841,2017,1.0,0.0,-2.0
5937,98841,2016,1.0,0.0,-2.0
5938,98841,2015,1.0,0.0,-2.0


In [86]:
# Rows where a single year's title count already exceeds the team's recorded total number of cups
full_titles_per_years_df.query("NumberOfCups < NumberOfTitlesThisYear")

Unnamed: 0,TeamID,Year,NumberOfCups,NumberOfTitlesThisYear
2141,1050,2020,1.0,2.0
3031,3137,2018,1.0,2.0
3449,4645,2020,1.0,2.0
4847,19789,2014,1.0,2.0
5624,43532,2017,1.0,2.0
5734,48332,2015,1.0,2.0
5839,60551,2018,1.0,2.0
5930,98841,2023,1.0,2.0
