In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
# Read the parsed club image records (one image link per TeamID) into a
# DataFrame and preview the first five rows.
club_images_df = pd.read_json(path_or_buf="../../parsing/parsedData/club_images.json")
club_images_df.head(n=5)

Unnamed: 0,TeamID,ImageLink
0,281,https://tmssl.akamaized.net//images/wappen/hea...
1,27,https://tmssl.akamaized.net//images/wappen/hea...
2,418,https://tmssl.akamaized.net//images/wappen/hea...
3,131,https://tmssl.akamaized.net//images/wappen/hea...
4,583,https://tmssl.akamaized.net//images/wappen/hea...


In [3]:
# Count the missing values in each column of the club images table
club_images_df.isnull().sum()

TeamID       0
ImageLink    0
dtype: int64

In [4]:
# Show the row index and the column labels; this is where stray whitespace
# in a column name (e.g. "TeamID ") becomes visible.
[club_images_df.index, club_images_df.columns]

[RangeIndex(start=0, stop=536, step=1),
 Index(['TeamID ', 'ImageLink'], dtype='object')]

In [5]:
# Trim leading/trailing whitespace from every column label so that
# "TeamID " becomes "TeamID" and can be used as a merge key later on.
club_images_df.columns = pd.Index([label.strip() for label in club_images_df.columns])
[club_images_df.index, club_images_df.columns]

[RangeIndex(start=0, stop=536, step=1),
 Index(['TeamID', 'ImageLink'], dtype='object')]

In [6]:
# Read the parsed club records (ID, name, country, link, page) into a
# DataFrame and preview the first five rows.
complete_clubs_df = pd.read_json(path_or_buf="../../parsing/parsedData/sorted_teams.json")
complete_clubs_df.head(n=5)

Unnamed: 0,TeamID,Team_name,Country_Name,Link_to_team,Page
0,418,Реал Мадрид,Испания,/real-madrid/startseite/verein/418,1
1,281,Манчестер Сити,Англия,/manchester-city/startseite/verein/281,1
2,27,Бавария,Германия,/fc-bayern-munchen/startseite/verein/27,1
3,31,Ливерпуль,Англия,/fc-liverpool/startseite/verein/31,1
4,583,ПСЖ,Франция,/fc-paris-saint-germain/startseite/verein/583,1


In [7]:
# Count the missing values in each column of the clubs table
complete_clubs_df.isnull().sum()

TeamID          0
Team_name       0
Country_Name    0
Link_to_team    0
Page            0
dtype: int64

In [8]:
# Show the row index and the column labels of the clubs table
[complete_clubs_df.index, complete_clubs_df.columns]

[RangeIndex(start=0, stop=536, step=1),
 Index(['TeamID', 'Team_name', 'Country_Name', 'Link_to_team', 'Page'], dtype='object')]

In [9]:
# "Page" and "Link_to_team" are not used in the rest of the notebook, so
# keep only the remaining columns (rebinding the name instead of mutating
# the frame in place).
complete_clubs_df = complete_clubs_df.drop(columns=["Page", "Link_to_team"])
complete_clubs_df.head(n=5)

Unnamed: 0,TeamID,Team_name,Country_Name
0,418,Реал Мадрид,Испания
1,281,Манчестер Сити,Англия
2,27,Бавария,Германия
3,31,Ливерпуль,Англия
4,583,ПСЖ,Франция


In [10]:
# Read the parsed per-club title statistics (titles by year and the total
# number of cups) into a DataFrame and preview the first five rows.
cups_clubs_df = pd.read_json(path_or_buf="../../parsing/parsedData/titles_cups.json")
cups_clubs_df.head(n=5)

Unnamed: 0,TeamID,NumberOfTitlesByYears,NumberOfCups
0,31,"{'2024': 1, '2023': 2, '2022': 2, '2021': 4, '...",71
1,12,"{'2024': 1, '2023': 1, '2022': 2, '2021': 2, '...",17
2,583,"{'2024': 3, '2023': 4, '2022': 3, '2021': 2, '...",51
3,418,"{'2024': 3, '2023': 4, '2022': 5, '2021': 4, '...",103
4,281,"{'2024': 3, '2023': 6, '2022': 4, '2021': 2, '...",43


In [11]:
# Count the missing values in each column of the cups table
cups_clubs_df.isnull().sum()

TeamID                   0
NumberOfTitlesByYears    0
NumberOfCups             0
dtype: int64

In [12]:
# Show the row index and the column labels of the cups table
[cups_clubs_df.index, cups_clubs_df.columns]

[RangeIndex(start=0, stop=536, step=1),
 Index(['TeamID', 'NumberOfTitlesByYears', 'NumberOfCups'], dtype='object')]

In [13]:
# Only the total "NumberOfCups" is used later; discard the per-year
# breakdown (rebinding the name instead of mutating the frame in place).
cups_clubs_df = cups_clubs_df.drop(columns=["NumberOfTitlesByYears"])
cups_clubs_df.head(n=5)

Unnamed: 0,TeamID,NumberOfCups
0,31,71
1,12,17
2,583,51
3,418,103
4,281,43


In [14]:
# Read the parsed national team records (ID, name, link, page) into a
# DataFrame and preview the first five rows.
national_teams_df = pd.read_json(path_or_buf="../../parsing/parsedData/sorted_national_teams.json")
national_teams_df.head(n=5)

Unnamed: 0,NationalTeamID,NationalTeamName,Link_to_team,Page
0,3437,Аргентина,/argentinien/startseite/verein/3437,1
1,3377,Франция,/frankreich/startseite/verein/3377,1
2,3375,Испания,/spanien/startseite/verein/3375,1
3,3299,Англия,/england/startseite/verein/3299,1
4,3439,Бразилия,/brasilien/startseite/verein/3439,1


In [15]:
# Count the missing values in each column of the national teams table
national_teams_df.isnull().sum()

NationalTeamID      0
NationalTeamName    0
Link_to_team        0
Page                0
dtype: int64

In [16]:
# Show the row index and the column labels of the national teams table
[national_teams_df.index, national_teams_df.columns]

[RangeIndex(start=0, stop=210, step=1),
 Index(['NationalTeamID', 'NationalTeamName', 'Link_to_team', 'Page'], dtype='object')]

In [17]:
# "Link_to_team" and "Page" are not used in the rest of the notebook, so
# keep only the ID and the name (rebinding the name instead of mutating the
# frame in place).
national_teams_df = national_teams_df.drop(columns=["Link_to_team", "Page"])
national_teams_df.head(n=5)

Unnamed: 0,NationalTeamID,NationalTeamName
0,3437,Аргентина
1,3377,Франция
2,3375,Испания
3,3299,Англия
4,3439,Бразилия


In [18]:
# Build "club_info_df" by inner-joining the club images with the club
# records on "TeamID", which attaches the club name (and the other remaining
# club columns) to every image row.
club_info_df = pd.merge(
    club_images_df,
    complete_clubs_df,
    how="inner",
    on="TeamID",
)
club_info_df.head(n=5)

Unnamed: 0,TeamID,ImageLink,Team_name,Country_Name
0,281,https://tmssl.akamaized.net//images/wappen/hea...,Манчестер Сити,Англия
1,27,https://tmssl.akamaized.net//images/wappen/hea...,Бавария,Германия
2,418,https://tmssl.akamaized.net//images/wappen/hea...,Реал Мадрид,Испания
3,131,https://tmssl.akamaized.net//images/wappen/hea...,Барселона,Испания
4,583,https://tmssl.akamaized.net//images/wappen/hea...,ПСЖ,Франция


In [19]:
# Attach "NationalTeamID" from "national_teams_df" to "club_info_df" by
# matching the club's "Country_Name" against "NationalTeamName".
# NOTE(review): this is an inner join on free-text names, so a club whose
# country has no exactly matching national team name is dropped silently -
# compare len(club_info_df) before and after to confirm nothing is lost.
club_info_df = pd.merge(
    club_info_df,
    national_teams_df,
    how="inner",
    left_on="Country_Name",
    right_on="NationalTeamName",
)
club_info_df.head(n=5)

Unnamed: 0,TeamID,ImageLink,Team_name,Country_Name,NationalTeamID,NationalTeamName
0,281,https://tmssl.akamaized.net//images/wappen/hea...,Манчестер Сити,Англия,3299,Англия
1,27,https://tmssl.akamaized.net//images/wappen/hea...,Бавария,Германия,3262,Германия
2,418,https://tmssl.akamaized.net//images/wappen/hea...,Реал Мадрид,Испания,3375,Испания
3,131,https://tmssl.akamaized.net//images/wappen/hea...,Барселона,Испания,3375,Испания
4,583,https://tmssl.akamaized.net//images/wappen/hea...,ПСЖ,Франция,3377,Франция


In [20]:
# "NationalTeamName" only served as the join key and duplicates
# "Country_Name"; remove it (rebinding the name instead of mutating the
# frame in place).
club_info_df = club_info_df.drop(columns=["NationalTeamName"])
club_info_df.head(n=5)

Unnamed: 0,TeamID,ImageLink,Team_name,Country_Name,NationalTeamID
0,281,https://tmssl.akamaized.net//images/wappen/hea...,Манчестер Сити,Англия,3299
1,27,https://tmssl.akamaized.net//images/wappen/hea...,Бавария,Германия,3262
2,418,https://tmssl.akamaized.net//images/wappen/hea...,Реал Мадрид,Испания,3375
3,131,https://tmssl.akamaized.net//images/wappen/hea...,Барселона,Испания,3375
4,583,https://tmssl.akamaized.net//images/wappen/hea...,ПСЖ,Франция,3377


In [21]:
# Attach "NumberOfCups" from "cups_clubs_df" to "club_info_df" with an
# inner join on "TeamID".
club_info_df = pd.merge(
    club_info_df,
    cups_clubs_df,
    how="inner",
    on="TeamID",
)
club_info_df.head(n=5)

Unnamed: 0,TeamID,ImageLink,Team_name,Country_Name,NationalTeamID,NumberOfCups
0,281,https://tmssl.akamaized.net//images/wappen/hea...,Манчестер Сити,Англия,3299,43
1,27,https://tmssl.akamaized.net//images/wappen/hea...,Бавария,Германия,3262,84
2,418,https://tmssl.akamaized.net//images/wappen/hea...,Реал Мадрид,Испания,3375,103
3,131,https://tmssl.akamaized.net//images/wappen/hea...,Барселона,Испания,3375,105
4,583,https://tmssl.akamaized.net//images/wappen/hea...,ПСЖ,Франция,3377,51


In [22]:
# The country is now represented by "NationalTeamID", so the textual
# "Country_Name" column is removed (rebinding the name instead of mutating
# the frame in place).
club_info_df = club_info_df.drop(columns=["Country_Name"])
club_info_df.head(n=5)

Unnamed: 0,TeamID,ImageLink,Team_name,NationalTeamID,NumberOfCups
0,281,https://tmssl.akamaized.net//images/wappen/hea...,Манчестер Сити,3299,43
1,27,https://tmssl.akamaized.net//images/wappen/hea...,Бавария,3262,84
2,418,https://tmssl.akamaized.net//images/wappen/hea...,Реал Мадрид,3375,103
3,131,https://tmssl.akamaized.net//images/wappen/hea...,Барселона,3375,105
4,583,https://tmssl.akamaized.net//images/wappen/hea...,ПСЖ,3377,51


In [23]:
# Put the columns into the order used for the exported records:
# identifier, name, cup count, national team reference, image link.
club_info_df = club_info_df.loc[
    :,
    ["TeamID", "Team_name", "NumberOfCups", "NationalTeamID", "ImageLink"],
]
club_info_df.head(n=5)

Unnamed: 0,TeamID,Team_name,NumberOfCups,NationalTeamID,ImageLink
0,281,Манчестер Сити,43,3299,https://tmssl.akamaized.net//images/wappen/hea...
1,27,Бавария,84,3262,https://tmssl.akamaized.net//images/wappen/hea...
2,418,Реал Мадрид,103,3375,https://tmssl.akamaized.net//images/wappen/hea...
3,131,Барселона,105,3375,https://tmssl.akamaized.net//images/wappen/hea...
4,583,ПСЖ,51,3377,https://tmssl.akamaized.net//images/wappen/hea...


In [24]:
# Count the missing values in each column of the merged table.
# Note: the merges above use the default inner join, which drops unmatched
# rows instead of filling them with NaN, so this check cannot reveal clubs
# that were lost during merging.
club_info_df.isnull().sum()

TeamID            0
Team_name         0
NumberOfCups      0
NationalTeamID    0
ImageLink         0
dtype: int64

In [25]:
# Write the resulting DataFrame to "club_info.json" as a JSON array with one
# object per club (orient="records"), indented by 4 spaces.
# The file is opened explicitly as UTF-8 so the result does not depend on the
# platform's default encoding, and ensure_ascii=False keeps the Cyrillic club
# names human-readable instead of escaping them as \uXXXX sequences. The
# parsed JSON content is the same either way.
with open("club_info.json", "w", encoding="utf-8") as file:
    json.dump(
        club_info_df.to_dict(orient="records"),
        file,
        indent=4,
        ensure_ascii=False,
    )