In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
# Read the parsed national-team records into a DataFrame and preview the first rows
national_teams_df = pd.read_json(
    path_or_buf="../../parsing/parsedData/sorted_national_teams.json"
)
national_teams_df.head(n=5)

Unnamed: 0,NationalTeamID,NationalTeamName,Link_to_team,Page
0,3437,Аргентина,/argentinien/startseite/verein/3437,1
1,3377,Франция,/frankreich/startseite/verein/3377,1
2,3375,Испания,/spanien/startseite/verein/3375,1
3,3299,Англия,/england/startseite/verein/3299,1
4,3439,Бразилия,/brasilien/startseite/verein/3439,1


In [3]:
# Inspect the row index and the column labels (the same pair that `.axes` returns)
[national_teams_df.index, national_teams_df.columns]

[RangeIndex(start=0, stop=210, step=1),
 Index(['NationalTeamID', 'NationalTeamName', 'Link_to_team', 'Page'], dtype='object')]

In [4]:
# Keep only the identifier and the name of each national team; the
# "Link_to_team" and "Page" columns are not needed below.
# Selecting the kept columns (instead of an in-place drop) lets this cell be
# re-run without raising a KeyError for the already-removed columns.
national_teams_df = national_teams_df[["NationalTeamID", "NationalTeamName"]]
national_teams_df.head()

Unnamed: 0,NationalTeamID,NationalTeamName
0,3437,Аргентина
1,3377,Франция
2,3375,Испания
3,3299,Англия
4,3439,Бразилия


In [5]:
# Read the parsed club records into a DataFrame and preview the first rows
clubs_df = pd.read_json(
    path_or_buf="../../parsing/parsedData/sorted_teams.json"
)
clubs_df.head(n=5)

Unnamed: 0,TeamID,Team_name,Country_Name,Link_to_team,Page
0,418,Реал Мадрид,Испания,/real-madrid/startseite/verein/418,1
1,281,Манчестер Сити,Англия,/manchester-city/startseite/verein/281,1
2,27,Бавария,Германия,/fc-bayern-munchen/startseite/verein/27,1
3,31,Ливерпуль,Англия,/fc-liverpool/startseite/verein/31,1
4,583,ПСЖ,Франция,/fc-paris-saint-germain/startseite/verein/583,1


In [6]:
# Inspect the row index and the column labels (the same pair that `.axes` returns)
[clubs_df.index, clubs_df.columns]

[RangeIndex(start=0, stop=536, step=1),
 Index(['TeamID', 'Team_name', 'Country_Name', 'Link_to_team', 'Page'], dtype='object')]

In [7]:
# Keep only the club identifier and the country it belongs to; the
# "Team_name", "Link_to_team" and "Page" columns are not needed below.
# Selecting the kept columns (instead of an in-place drop) lets this cell be
# re-run without raising a KeyError for the already-removed columns.
clubs_df = clubs_df[["TeamID", "Country_Name"]]
clubs_df.head()

Unnamed: 0,TeamID,Country_Name
0,418,Испания
1,281,Англия
2,27,Германия
3,31,Англия
4,583,Франция


In [8]:
# Attach every club to its national team by matching the club's country name
# against the national team's name. This is an inner join, so national teams
# without a matching club (and clubs without a matching team) are left out.
country_info_df = pd.merge(
    national_teams_df,
    clubs_df,
    how="inner",
    left_on="NationalTeamName",
    right_on="Country_Name",
)
country_info_df.head()

Unnamed: 0,NationalTeamID,NationalTeamName,TeamID,Country_Name
0,3377,Франция,583,Франция
1,3377,Франция,1082,Франция
2,3377,Франция,244,Франция
3,3377,Франция,273,Франция
4,3377,Франция,1041,Франция


In [9]:
# "Country_Name" duplicates "NationalTeamName" after the merge, so keep only
# the three informative columns.
# Selecting the kept columns (instead of an in-place drop) avoids mutating the
# merged frame in place and does not fail with a KeyError if "Country_Name"
# has already been removed by an earlier run of this cell.
country_info_df = country_info_df[["NationalTeamID", "NationalTeamName", "TeamID"]]
country_info_df.head()

Unnamed: 0,NationalTeamID,NationalTeamName,TeamID
0,3377,Франция,583
1,3377,Франция,1082
2,3377,Франция,244
3,3377,Франция,273
4,3377,Франция,1041


In [10]:
# Collapse to one row per national team (keyed by ID and name) and gather
# the "TeamID" values of its clubs into a plain Python list
country_info_df = (
    country_info_df
    .groupby(by=["NationalTeamID", "NationalTeamName"], as_index=False)["TeamID"]
    .apply(pd.Series.tolist)
)
country_info_df.head()

Unnamed: 0,NationalTeamID,NationalTeamName,TeamID
0,3262,Германия,"[27, 16, 15, 23826, 24, 60, 89, 533, 18, 79, 8..."
1,3299,Англия,"[281, 31, 631, 985, 11, 379, 148, 405, 1003, 1..."
2,3300,Португалия,"[294, 720, 336, 1075, 2420, 2424, 2423, 2995, ..."
3,3375,Испания,"[418, 131, 13, 1050, 681, 368, 150, 621, 16795..."
4,3376,Италия,"[46, 12, 800, 5, 506, 398, 6195, 430, 1025, 65..."


In [11]:
# Count missing values per column; all zeros means there are no NaNs
country_info_df.isnull().sum(axis=0)

NationalTeamID      0
NationalTeamName    0
TeamID              0
dtype: int64

In [12]:
# Second sanity check: list the national teams whose club list is empty
# (an empty result means every team has at least one club)
country_info_df.loc[country_info_df["TeamID"].map(len).eq(0)]

Unnamed: 0,NationalTeamID,NationalTeamName,TeamID


In [13]:
# Relabel the columns so that the list column is called "ClubIDs"
country_info_df = country_info_df.set_axis(
    ["NationalTeamID", "NationalTeamName", "ClubIDs"],
    axis="columns",
)
country_info_df.head()

Unnamed: 0,NationalTeamID,NationalTeamName,ClubIDs
0,3262,Германия,"[27, 16, 15, 23826, 24, 60, 89, 533, 18, 79, 8..."
1,3299,Англия,"[281, 31, 631, 985, 11, 379, 148, 405, 1003, 1..."
2,3300,Португалия,"[294, 720, 336, 1075, 2420, 2424, 2423, 2995, ..."
3,3375,Испания,"[418, 131, 13, 1050, 681, 368, 150, 621, 16795..."
4,3376,Италия,"[46, 12, 800, 5, 506, 398, 6195, 430, 1025, 65..."


In [14]:
# Write the resulting DataFrame to a JSON file as a list of records.
# The file is opened as UTF-8 explicitly (instead of the platform default
# encoding) and `ensure_ascii=False` keeps the Cyrillic team names readable
# in the file instead of escaping them as \uXXXX sequences. The parsed JSON
# content is the same either way.
with open("country_info.json", "w", encoding="utf-8") as file:
    json.dump(
        country_info_df.to_dict(orient="records"),
        file,
        ensure_ascii=False,
        indent=4,
    )