In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [2]:
# Load the list of teams (ID, name, country, link, page) from the parsed JSON file
team_info_df = pd.read_json(path_or_buf="../parsing/titles_and_cups/sorted_teams.json")
team_info_df.head(n=10)

Unnamed: 0,TeamID,Team_name,Country_Name,Link_to_team,Page
0,418,Реал Мадрид,Испания,/real-madrid/startseite/verein/418,1
1,281,Манчестер Сити,Англия,/manchester-city/startseite/verein/281,1
2,27,Бавария,Германия,/fc-bayern-munchen/startseite/verein/27,1
3,31,Ливерпуль,Англия,/fc-liverpool/startseite/verein/31,1
4,583,ПСЖ,Франция,/fc-paris-saint-germain/startseite/verein/583,1
5,46,Интер,Италия,/inter-mailand/startseite/verein/46,1
6,12,Рома,Италия,/as-rom/startseite/verein/12,1
7,16,Боруссия Д.,Германия,/borussia-dortmund/startseite/verein/16,1
8,631,Челси,Англия,/fc-chelsea/startseite/verein/631,1
9,15,Байер,Германия,/bayer-04-leverkusen/startseite/verein/15,1


In [3]:
# Check the data type pandas inferred for each column of the team list
team_info_df.dtypes

TeamID           int64
Team_name       object
Country_Name    object
Link_to_team    object
Page             int64
dtype: object

In [4]:
# Show dimensionality of the team list as (rows, columns)
team_info_df.shape

(536, 5)

In [5]:
# Drop the link and page-number columns from the team list; they are not used
# in the analysis. Reassigning (instead of inplace=True) together with
# errors="ignore" keeps the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
team_info_df = team_info_df.drop(columns=["Link_to_team", "Page"], errors="ignore")
team_info_df.head(10)

Unnamed: 0,TeamID,Team_name,Country_Name
0,418,Реал Мадрид,Испания
1,281,Манчестер Сити,Англия
2,27,Бавария,Германия
3,31,Ливерпуль,Англия
4,583,ПСЖ,Франция
5,46,Интер,Италия
6,12,Рома,Италия
7,16,Боруссия Д.,Германия
8,631,Челси,Англия
9,15,Байер,Германия


In [6]:
# Read the per-club titles (a year -> count mapping) and cup totals from the parsed JSON file
club_titles_df = pd.read_json(path_or_buf="../parsing/titles_and_cups/titles_cups.json")
club_titles_df.head(n=10)

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups
0,31,"{'2024': 1, '2023': 2, '2022': 2, '2021': 4, '...",71
1,12,"{'2024': 1, '2023': 1, '2022': 2, '2021': 2, '...",17
2,583,"{'2024': 3, '2023': 4, '2022': 3, '2021': 2, '...",51
3,418,"{'2024': 3, '2023': 4, '2022': 5, '2021': 4, '...",103
4,281,"{'2024': 3, '2023': 6, '2022': 4, '2021': 2, '...",43
5,631,"{'2024': 1, '2022': 1, '2021': 4, '2020': 3, '...",34
6,27,"{'2024': 2, '2023': 2, '2022': 3, '2021': 3, '...",84
7,16,"{'2024': 2, '2023': 2, '2022': 2, '2021': 3, '...",24
8,46,"{'2024': 2, '2023': 3, '2022': 4, '2021': 3, '...",46
9,15,"{'2024': 2, '2023': 5, '2022': 2, '2021': 1, '...",6


In [7]:
# Show dimensionality of the club titles frame as (rows, columns)
club_titles_df.shape

(536, 3)

In [8]:
# Check the data type pandas inferred for each column of the club titles frame
club_titles_df.dtypes

TeamID                    int64
NumberOfTitlesByYears    object
NumberOfCups              int64
dtype: object

In [9]:
# As we can see, the titles of each club are stored as a single {year: count}
# dictionary, so we convert every dictionary into a list of (year, count)
# pairs that can be expanded into separate rows.
# The isinstance guard leaves values that are not dictionaries (already
# converted lists, missing values) untouched, so re-running this cell does not
# raise AttributeError.
club_titles_df["NumberOfTitlesByYears"] = club_titles_df["NumberOfTitlesByYears"].apply(
    lambda titles: list(titles.items()) if isinstance(titles, dict) else titles
)
club_titles_df.head(10)

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups
0,31,"[(2024, 1), (2023, 2), (2022, 2), (2021, 4), (...",71
1,12,"[(2024, 1), (2023, 1), (2022, 2), (2021, 2), (...",17
2,583,"[(2024, 3), (2023, 4), (2022, 3), (2021, 2), (...",51
3,418,"[(2024, 3), (2023, 4), (2022, 5), (2021, 4), (...",103
4,281,"[(2024, 3), (2023, 6), (2022, 4), (2021, 2), (...",43
5,631,"[(2024, 1), (2022, 1), (2021, 4), (2020, 3), (...",34
6,27,"[(2024, 2), (2023, 2), (2022, 3), (2021, 3), (...",84
7,16,"[(2024, 2), (2023, 2), (2022, 2), (2021, 3), (...",24
8,46,"[(2024, 2), (2023, 3), (2022, 4), (2021, 3), (...",46
9,15,"[(2024, 2), (2023, 5), (2022, 2), (2021, 1), (...",6


In [10]:
# Build a standalone DataFrame with one row per (year, titles) pair of each team
# by expanding the list column; every expanded row keeps its source row label
titles_per_years_df = club_titles_df.explode(column="NumberOfTitlesByYears")
titles_per_years_df.head(n=10)

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups
0,31,"(2024, 1)",71
0,31,"(2023, 2)",71
0,31,"(2022, 2)",71
0,31,"(2021, 4)",71
0,31,"(2020, 1)",71
0,31,"(2019, 5)",71
0,31,"(2018, 2)",71
0,31,"(2017, 2)",71
0,31,"(2015, 3)",71
0,31,"(2014, 2)",71


In [11]:
# Show dimensionality of the exploded frame as (rows, columns)
titles_per_years_df.shape

(2120, 3)

In [18]:
# Check the column data types of the exploded frame
titles_per_years_df.dtypes

TeamID                    int64
NumberOfTitlesByYears    object
NumberOfCups              int64
dtype: object

In [12]:
# Rows whose TeamID is missing
titles_per_years_df.loc[titles_per_years_df["TeamID"].isna()]

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups


In [13]:
# Rows that have no (year, titles) pair after the expansion
titles_per_years_df.loc[titles_per_years_df["NumberOfTitlesByYears"].isna()]

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups
116,1101,,10
147,327,,1
167,4529,,1
214,324,,0
220,40090,,0
251,27843,,0
253,759,,6
261,36999,,0
262,987,,8
284,29053,,2


In [14]:
# Count the rows that have no (year, titles) pair
titles_per_years_df["NumberOfTitlesByYears"].isna().sum()

np.int64(41)

In [15]:
# Rows whose NumberOfCups is missing
titles_per_years_df.loc[titles_per_years_df["NumberOfCups"].isna()]

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups


In [22]:
# Preview the year component (first element) of every non-missing pair
titles_per_years_df["NumberOfTitlesByYears"].dropna().map(lambda pair: pair[0])

0      2024
0      2023
0      2022
0      2021
0      2020
       ... 
534    2016
535    2022
535    2018
535    2015
535    2014
Name: NumberOfTitlesByYears, Length: 2079, dtype: object

In [26]:
# Each team now spans several rows that share one index label, so replace the
# index with a fresh 0..n-1 range to give every row its own label
titles_per_years_df = titles_per_years_df.reset_index(drop=True)
titles_per_years_df.head(n=10)

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups
0,31,"(2024, 1)",71
1,31,"(2023, 2)",71
2,31,"(2022, 2)",71
3,31,"(2021, 4)",71
4,31,"(2020, 1)",71
5,31,"(2019, 5)",71
6,31,"(2018, 2)",71
7,31,"(2017, 2)",71
8,31,"(2015, 3)",71
9,31,"(2014, 2)",71


In [27]:
# Split every non-missing pair into its two components; rows without a pair
# receive NaN in both new columns through index alignment on assignment
year_title_pairs = titles_per_years_df["NumberOfTitlesByYears"].dropna()
titles_per_years_df["Year"] = year_title_pairs.map(lambda pair: pair[0])
titles_per_years_df["NumberOfTitles"] = year_title_pairs.map(lambda pair: pair[1])
titles_per_years_df.head(n=10)

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups,Year,NumberOfTitles
0,31,"(2024, 1)",71,2024,1.0
1,31,"(2023, 2)",71,2023,2.0
2,31,"(2022, 2)",71,2022,2.0
3,31,"(2021, 4)",71,2021,4.0
4,31,"(2020, 1)",71,2020,1.0
5,31,"(2019, 5)",71,2019,5.0
6,31,"(2018, 2)",71,2018,2.0
7,31,"(2017, 2)",71,2017,2.0
8,31,"(2015, 3)",71,2015,3.0
9,31,"(2014, 2)",71,2014,2.0


In [28]:
# The pair column has been split into Year and NumberOfTitles, so it is no
# longer needed. Reassigning (instead of inplace=True) together with
# errors="ignore" keeps the cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
titles_per_years_df = titles_per_years_df.drop(columns=["NumberOfTitlesByYears"], errors="ignore")
titles_per_years_df.head(10)

Unnamed: 0,TeamID,NumberOfCups,Year,NumberOfTitles
0,31,71,2024,1.0
1,31,71,2023,2.0
2,31,71,2022,2.0
3,31,71,2021,4.0
4,31,71,2020,1.0
5,31,71,2019,5.0
6,31,71,2018,2.0
7,31,71,2017,2.0
8,31,71,2015,3.0
9,31,71,2014,2.0
