In [1]:
import requests

def download_html(url, filename):
    # Sending an HTTP GET request to the URL
    response = requests.get(url)
    
    # Check if the request was successful (HTTP status code 200)
    if response.status_code == 200:
        # Writing the HTML content to a local file
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(response.text)
        print("HTML content has been downloaded and saved to", filename)
    else:
        print("Failed to retrieve the HTML content. Status code:", response.status_code)


In [1]:
import requests
import re
import json

def extract_json_from_html(url, save_output=False):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        return "Failed to retrieve the HTML content. Status code: {}".format(response.status_code)

    html = response.text
    regex_pattern = r'(?<=require\.config\.params\["args"\].=.)[\s\S]*?;'
    match = re.findall(regex_pattern, html)
    if not match:
        return "No match found"

    # Correcting data format by adding quotes around keys
    data_txt = match[0]
    keys_to_replace = ['matchId', 'matchCentreData', 'matchCentreEventTypeJson', 'formationIdNameMappings']
    for key in keys_to_replace:
        data_txt = data_txt.replace(key, f'"{key}"')
    data_txt = data_txt.replace('};', '}')

    # Try to parse the JSON data
    try:
        data_json = json.loads(data_txt)
    except json.JSONDecodeError:
        return "JSON decoding failed"

    # Print the JSON structure summarization
    print_json_structure(data_json)

    # Optionally, save the JSON data to a text file
    if save_output:
        output_filename = 'output.txt'
        with open(output_filename, 'w', encoding='utf-8') as output_file:
            json.dump(data_json, output_file, indent=4)

    return data_json

def print_json_structure(data, indent=0, max_list_example=3):
    for key, value in data.items():
        if isinstance(value, dict):
            print('  ' * indent + f"{key} (dict with {len(value.keys())} keys)")
            print_json_structure(value, indent + 1)
        elif isinstance(value, list):
            print('  ' * indent + f"{key} (list with {len(value)} items)")
            if value:
                if isinstance(value[0], dict):
                    print('  ' * (indent + 1) + f"Example item from list:")
                    print_json_structure(value[0], indent + 2)
                else: 
                    example_items = value[:max_list_example]
                    print('  ' * (indent + 1) + f"Example items: {example_items}")
        else:
            print('  ' * indent + f"{key} ({type(value).__name__})")

# Usage example
url = "https://www.whoscored.com/Matches/1734924/Live/Spain-LaLiga-2023-2024-Real-Sociedad-Real-Madrid"
data = extract_json_from_html(url, save_output=True)


matchId (int)
matchCentreData (dict with 27 keys)
  playerIdNameDictionary (dict with 43 keys)
    377285 (str)
    137227 (str)
    297546 (str)
    299460 (str)
    400185 (str)
    401173 (str)
    404495 (str)
    407138 (str)
    408252 (str)
    413346 (str)
    315566 (str)
    317354 (str)
    344542 (str)
    367008 (str)
    367857 (str)
    369497 (str)
    372424 (str)
    298839 (str)
    395320 (str)
    238916 (str)
    256864 (str)
    270504 (str)
    463961 (str)
    337782 (str)
    337879 (str)
    20874 (str)
    353423 (str)
    369109 (str)
    31772 (str)
    104010 (str)
    317541 (str)
    106875 (str)
    106883 (str)
    113880 (str)
    144511 (str)
    144890 (str)
    74603 (str)
    415181 (str)
    345031 (str)
    344644 (str)
    379868 (str)
    422957 (str)
    482830 (str)
  periodMinuteLimits (dict with 4 keys)
    1 (int)
    2 (int)
    3 (int)
    4 (int)
  timeStamp (str)
  attendance (int)
  venueName (str)
  weatherCode (str)
  elapsed (str

In [2]:
import pandas as pd
import json

def extract_data_from_dict(data):
    # Assuming 'data' is a Python dictionary that has been properly parsed from JSON
    event_types_json = data["matchCentreEventTypeJson"]
    formation_mappings = data["formationIdNameMappings"]
    events_dict = data["matchCentreData"]["events"]
    teams_dict = {
        data["matchCentreData"]['home']['teamId']: data["matchCentreData"]['home']['name'],
        data["matchCentreData"]['away']['teamId']: data["matchCentreData"]['away']['name']
    }
    players_dict = data["matchCentreData"]["playerIdNameDictionary"]

    # Creating players dataframes
    players_home_df = pd.DataFrame(data["matchCentreData"]['home']['players'])
    players_home_df["teamId"] = data["matchCentreData"]['home']['teamId']

    players_away_df = pd.DataFrame(data["matchCentreData"]['away']['players'])
    players_away_df["teamId"] = data["matchCentreData"]['away']['teamId']

    # Combine the home and away players into a single DataFrame
    players_df = pd.concat([players_home_df, players_away_df], ignore_index=True)

    return events_dict, players_df, teams_dict


In [3]:
def get_passes_df(events_dict):
    df = pd.DataFrame(events_dict)
    df['eventType'] = df.apply(lambda row: row['type']['displayName'], axis=1)
    df['outcomeType'] = df.apply(lambda row: row['outcomeType']['displayName'], axis=1)

    # create receiver column based on the next event
    # this will be correct only for successfull passes
    df["receiver"] = df["playerId"].shift(-1)

    # filter only passes
    passes_ids = df.index[df['eventType'] == 'Pass']
    df_passes = df.loc[
        passes_ids, ["id", "x", "y", "endX", "endY", "teamId", "playerId", "receiver", "eventType", "outcomeType"]]

    return df_passes

In [4]:
events_dict, players_df, teams = extract_data_from_dict(data)
print(teams)
passes = get_passes_df(events_dict)
print(passes)

{68: 'Real Sociedad', 52: 'Real Madrid'}
               id     x      y   endX  endY  teamId  playerId  receiver  \
2    2.678382e+09  50.2   49.9   25.0  47.9      52  317541.0  113880.0   
3    2.678382e+09  25.0   47.9   89.5  20.3      52  113880.0  367008.0   
5    2.678382e+09  84.5    0.0   73.0   6.7      52  106883.0  353423.0   
6    2.678382e+09  70.4    9.2   53.3  16.5      52  353423.0  337879.0   
7    2.678382e+09  52.9   17.3   36.6  56.0      52  337879.0  106875.0   
..            ...   ...    ...    ...   ...     ...       ...       ...   
736  2.678427e+09  97.2    7.1   79.1  85.8      68  317354.0  369497.0   
739  2.678427e+09  85.0  100.0   86.4  82.7      68  369497.0  367857.0   
740  2.678427e+09  86.4   82.7   89.4  96.0      68  367857.0  369497.0   
741  2.678427e+09  89.4   96.0   84.9  85.7      68  369497.0  367857.0   
742  2.678427e+09  84.8   85.7  100.0  79.8      68  367857.0  106883.0   

    eventType   outcomeType  
2        Pass    Successful 

In [13]:
def get_passes_between_df(team_id, passes_df, players_df):
    # filter for only team
    print(team_id)
    passes_df = passes_df[passes_df["teamId"] == team_id]

    # add column with first eleven players only
    passes_df = passes_df.merge(players_df[["playerId", "isFirstEleven"]], on='playerId', how='left')
    # filter on first eleven column
    passes_df = passes_df[passes_df['isFirstEleven'] == True]

    # calculate mean positions for players
    average_locs_and_count_df = (passes_df.groupby('playerId')
                                 .agg({'x': ['mean'], 'y': ['mean', 'count']}))
    average_locs_and_count_df.columns = ['x', 'y', 'count']
    average_locs_and_count_df = average_locs_and_count_df.merge(players_df[['playerId', 'name', 'shirtNo', 'position']],
                                                                on='playerId', how='left')
    average_locs_and_count_df = average_locs_and_count_df.set_index('playerId')
    print(average_locs_and_count_df)

    # calculate the number of passes between each position (using min/ max so we get passes both ways)
    passes_player_ids_df = passes_df.loc[:, ['id', 'playerId', 'receiver', 'teamId']]
    passes_player_ids_df['pos_max'] = (passes_player_ids_df[['playerId', 'receiver']].max(axis='columns'))
    passes_player_ids_df['pos_min'] = (passes_player_ids_df[['playerId', 'receiver']].min(axis='columns'))

    # get passes between each player
    passes_between_df = passes_player_ids_df.groupby(['pos_min', 'pos_max']).id.count().reset_index()
    passes_between_df.rename({'id': 'pass_count'}, axis='columns', inplace=True)

    # add on the location of each player so we have the start and end positions of the lines
    passes_between_df = passes_between_df.merge(average_locs_and_count_df, left_on='pos_min', right_index=True)
    passes_between_df = passes_between_df.merge(average_locs_and_count_df, left_on='pos_max', right_index=True,
                                                suffixes=['', '_end'])
    return passes_between_df, average_locs_and_count_df

In [14]:
def pass_network_visualization(ax, passes_between_df, average_locs_and_count_df, flipped=False):
    MAX_LINE_WIDTH = 10
    MAX_MARKER_SIZE = 3000
    passes_between_df['width'] = (passes_between_df.pass_count / passes_between_df.pass_count.max() *
                                  MAX_LINE_WIDTH)
    average_locs_and_count_df['marker_size'] = (average_locs_and_count_df['count']
                                                / average_locs_and_count_df['count'].max() * MAX_MARKER_SIZE)

    MIN_TRANSPARENCY = 0.3
    color = np.array(to_rgba('#507293'))
    color = np.tile(color, (len(passes_between_df), 1))
    c_transparency = passes_between_df.pass_count / passes_between_df.pass_count.max()
    c_transparency = (c_transparency * (1 - MIN_TRANSPARENCY)) + MIN_TRANSPARENCY
    color[:, 3] = c_transparency

    pitch = Pitch(pitch_type='opta', pitch_color='#0D182E', line_color='#5B6378')
    pitch.draw(ax=ax)

    if flipped:
        passes_between_df['x'] = pitch.dim.right - passes_between_df['x']
        passes_between_df['y'] = pitch.dim.right - passes_between_df['y']
        passes_between_df['x_end'] = pitch.dim.right - passes_between_df['x_end']
        passes_between_df['y_end'] = pitch.dim.right - passes_between_df['y_end']
        average_locs_and_count_df['x'] = pitch.dim.right - average_locs_and_count_df['x']
        average_locs_and_count_df['y'] = pitch.dim.right - average_locs_and_count_df['y']

    pass_lines = pitch.lines(passes_between_df.x, passes_between_df.y,
                             passes_between_df.x_end, passes_between_df.y_end, lw=passes_between_df.width,
                             color=color, zorder=1, ax=ax)
    pass_nodes = pitch.scatter(average_locs_and_count_df.x, average_locs_and_count_df.y,
                               s=average_locs_and_count_df.marker_size, marker='h',
                               color='#FEFEFC', edgecolors='#FEFEFC', linewidth=1, alpha=1, ax=ax)
    for index, row in average_locs_and_count_df.iterrows():
        print(row)
        player_name = row["name"].split()
        player_initials = "".join(word[0] for word in player_name).upper()
        pitch.annotate(player_initials, xy=(row.x, row.y), c='#C4C4C4', va='center',
                       ha='center', size=14, ax=ax)

    return pitch

In [15]:
# create plot
fig, axes = plt.subplots(1, 2, figsize=(15, 8))
plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=None, hspace=None)
axes = axes.flat
plt.tight_layout()
fig.set_facecolor("#0D182E")

# plot variables
main_color = '#FBFAF5'
font_bold = FontManager(("https://github.com/google/fonts/blob/main/apache/roboto/static/"
                         "RobotoCondensed-Medium.ttf?raw=true"))

# home team viz
pass_network_visualization(axes[0], home_passes_between_df, home_average_locs_and_count_df)
axes[0].set_title(teams_dict[home_team_id], color=main_color, fontsize=14, fontproperties=font_bold.prop)

# away team viz
pass_network_visualization(axes[1], away_passes_between_df, away_average_locs_and_count_df, flipped=True)
axes[1].set_title(teams_dict[away_team_id], color=main_color, fontsize=14, fontproperties=font_bold.prop)

plt.suptitle(f"{teams_dict[home_team_id]} - {teams_dict[away_team_id]}", color=main_color, fontsize=42, fontproperties=font_bold.prop)
subtitle = "Passing networks and top combinations by volume of passes"
plt.text(-10, 120, subtitle, horizontalalignment='center', verticalalignment='center', color=main_color, fontsize=14, fontproperties=font_bold.prop)
plt.show()

NameError: name 'plt' is not defined