In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [2]:
# Load data about club titles
club_titles_df = pd.read_json("../../parsing/parsedData/titles_cups.json")
club_titles_df.head()

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups
0,31,"{'2024': 1, '2023': 2, '2022': 2, '2021': 4, '...",71
1,12,"{'2024': 1, '2023': 1, '2022': 2, '2021': 2, '...",17
2,583,"{'2024': 3, '2023': 4, '2022': 3, '2021': 2, '...",51
3,418,"{'2024': 3, '2023': 4, '2022': 5, '2021': 4, '...",103
4,281,"{'2024': 3, '2023': 6, '2022': 4, '2021': 2, '...",43


In [3]:
# Show dimensionality
club_titles_df.shape

(536, 3)

In [4]:
# Check data types 
club_titles_df.dtypes

TeamID                    int64
NumberOfTitlesByYears    object
NumberOfCups              int64
dtype: object

In [5]:
# Check for any inconsistencies
club_titles_df.isna().sum()

TeamID                   0
NumberOfTitlesByYears    0
NumberOfCups             0
dtype: int64

In [6]:
# As we can see, now titles are stored in a dictionary in one line for each club, but we need to modify and expand them
club_titles_df["NumberOfTitlesByYears"] = club_titles_df["NumberOfTitlesByYears"].apply(lambda x: list(x.items()))
club_titles_df.head(10)

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups
0,31,"[(2024, 1), (2023, 2), (2022, 2), (2021, 4), (...",71
1,12,"[(2024, 1), (2023, 1), (2022, 2), (2021, 2), (...",17
2,583,"[(2024, 3), (2023, 4), (2022, 3), (2021, 2), (...",51
3,418,"[(2024, 3), (2023, 4), (2022, 5), (2021, 4), (...",103
4,281,"[(2024, 3), (2023, 6), (2022, 4), (2021, 2), (...",43
5,631,"[(2024, 1), (2022, 1), (2021, 4), (2020, 3), (...",34
6,27,"[(2024, 2), (2023, 2), (2022, 3), (2021, 3), (...",84
7,16,"[(2024, 2), (2023, 2), (2022, 2), (2021, 3), (...",24
8,46,"[(2024, 2), (2023, 3), (2022, 4), (2021, 3), (...",46
9,15,"[(2024, 2), (2023, 5), (2022, 2), (2021, 1), (...",6


In [7]:
# Drop "NumberOfCups" column, as it is not needed anymore
club_titles_df.drop(columns=["NumberOfCups"], inplace=True)
club_titles_df.head()

Unnamed: 0,TeamID,NumberOfTitlesByYears
0,31,"[(2024, 1), (2023, 2), (2022, 2), (2021, 4), (..."
1,12,"[(2024, 1), (2023, 1), (2022, 2), (2021, 2), (..."
2,583,"[(2024, 3), (2023, 4), (2022, 3), (2021, 2), (..."
3,418,"[(2024, 3), (2023, 4), (2022, 5), (2021, 4), (..."
4,281,"[(2024, 3), (2023, 6), (2022, 4), (2021, 2), (..."


In [8]:
# Now explode our Data Frame to have one line for each year
club_titles_df = club_titles_df.explode("NumberOfTitlesByYears")
club_titles_df.head(15)

Unnamed: 0,TeamID,NumberOfTitlesByYears
0,31,"(2024, 1)"
0,31,"(2023, 2)"
0,31,"(2022, 2)"
0,31,"(2021, 4)"
0,31,"(2020, 1)"
0,31,"(2019, 5)"
0,31,"(2018, 2)"
0,31,"(2017, 2)"
0,31,"(2015, 3)"
0,31,"(2014, 2)"


In [9]:
# Reset index
club_titles_df.reset_index(drop=True, inplace=True)
club_titles_df.head(15)

Unnamed: 0,TeamID,NumberOfTitlesByYears
0,31,"(2024, 1)"
1,31,"(2023, 2)"
2,31,"(2022, 2)"
3,31,"(2021, 4)"
4,31,"(2020, 1)"
5,31,"(2019, 5)"
6,31,"(2018, 2)"
7,31,"(2017, 2)"
8,31,"(2015, 3)"
9,31,"(2014, 2)"


In [10]:
# Find out new, expanded shape
club_titles_df.shape

(2120, 2)

In [11]:
# Check for NaNs
club_titles_df.isna().sum()

TeamID                    0
NumberOfTitlesByYears    41
dtype: int64

In [12]:
# Find rows with NaNs
club_titles_df[club_titles_df["NumberOfTitlesByYears"].isna()]

Unnamed: 0,TeamID,NumberOfTitlesByYears
883,1101,
1024,327,
1106,4529,
1291,324,
1315,40090,
1413,27843,
1415,759,
1434,36999,
1435,987,
1508,29053,


In [13]:
# Store teams that have missing values to exclude them from the Data Frame and further analysis
teams_with_missing_values = club_titles_df[club_titles_df["NumberOfTitlesByYears"].isna()]["TeamID"].unique()

In [14]:
# Drop obtained teams
club_titles_df.drop(club_titles_df[club_titles_df["TeamID"].isin(teams_with_missing_values)].index, inplace=True)

In [15]:
# Verify result
club_titles_df.isna().sum()

TeamID                   0
NumberOfTitlesByYears    0
dtype: int64

In [17]:
# Separate tuple into years and number of trophies in that year, respectively
club_titles_df["Year"] = club_titles_df["NumberOfTitlesByYears"].apply(lambda x: x[0])
club_titles_df["NumberOfTitlesThisYear"] = club_titles_df["NumberOfTitlesByYears"].apply(lambda x: x[1])
club_titles_df.head(10)

Unnamed: 0,TeamID,NumberOfTitlesByYears,Year,NumberOfTitlesThisYear
0,31,"(2024, 1)",2024,1
1,31,"(2023, 2)",2023,2
2,31,"(2022, 2)",2022,2
3,31,"(2021, 4)",2021,4
4,31,"(2020, 1)",2020,1
5,31,"(2019, 5)",2019,5
6,31,"(2018, 2)",2018,2
7,31,"(2017, 2)",2017,2
8,31,"(2015, 3)",2015,3
9,31,"(2014, 2)",2014,2


In [17]:
club_titles_df["NumberOfTitlesByYears"][club_titles_df["NumberOfTitlesByYears"].isna()]

Series([], Name: NumberOfTitlesByYears, dtype: object)

In [22]:
# See how many records we have for each team
club_titles_df.groupby("TeamID", as_index=False) \
    .agg({"NumberOfTitlesByYears": "count"})

Unnamed: 0,TeamID,NumberOfTitlesByYears
0,3,4
1,5,3
2,11,10
3,12,11
4,13,11
...,...,...
490,64534,3
491,64780,2
492,69752,1
493,80996,1


In [28]:
# Drop redundant column, we have expanded it
titles_per_years_df.drop(columns=["NumberOfTitlesByYears"], inplace=True)
titles_per_years_df.head(15)

Unnamed: 0,TeamID,NumberOfCups,Year,NumberOfTitlesThisYear
0,31,71,2024,1
1,31,71,2023,2
2,31,71,2022,2
3,31,71,2021,4
4,31,71,2020,1
5,31,71,2019,5
6,31,71,2018,2
7,31,71,2017,2
8,31,71,2015,3
9,31,71,2014,2


In [29]:
# Check obtained data types
titles_per_years_df.dtypes

TeamID                     int64
NumberOfCups               int64
Year                      object
NumberOfTitlesThisYear     int64
dtype: object

In [30]:
# Firsly, sort values by team ID and year
titles_per_years_df.sort_values(by=["TeamID", "Year"], inplace=True)
titles_per_years_df.head(10)

Unnamed: 0,TeamID,NumberOfCups,Year,NumberOfTitlesThisYear
1201,3,13,2017,2
1200,3,13,2018,2
1199,3,13,2022,1
1198,3,13,2023,1
337,5,53,2016,1
336,5,53,2021,1
335,5,53,2024,1
135,11,48,2014,3
134,11,48,2015,2
133,11,48,2016,2


In [31]:
# Convert "Year" column to the int
titles_per_years_df["Year"] = titles_per_years_df["Year"].astype(int)

"NumberOfCups" shows now total number of titles for each team in the current (2025) year, we have to modify it to emphasize trends over selected time interval, which is 11 years: 2014 through 2025.
However, we may face the following issue: some teams did not win anything in some year, but we need this data because we collected them from 2014 through 2025

In [32]:
# Create range of years and new Data Frama with all posssible years
years_range = range(2014, 2026)
team_ids = titles_per_years_df["TeamID"].unique()
full_titles_per_years_df = pd.MultiIndex.from_product([team_ids, years_range], names=["TeamID", "Year"]).to_frame(index=False)
full_titles_per_years_df.head(20)

Unnamed: 0,TeamID,Year
0,3,2014
1,3,2015
2,3,2016
3,3,2017
4,3,2018
5,3,2019
6,3,2020
7,3,2021
8,3,2022
9,3,2023
