In [None]:
import os
import json
import pandas as pd

**Pass data until half time - Final match**

In [4]:
# Export the target team's first-half passes of every match to Excel.
#
# Each sub-folder of `data_dir` is expected to hold one event JSON file (a list
# of event dicts). For every such file, one workbook named
# "<json name>_first_half.xlsx" is written to `output_base_dir`.
# NOTE(review): the field names used below ("type", "team", "period", "pass",
# "location", ...) look like the StatsBomb open-data event schema - confirm.

# Set base paths (input folders and exported workbooks share one directory)
data_dir = os.path.join("..", "test_data")
output_base_dir = os.path.join("..", "test_data")
os.makedirs(output_base_dir, exist_ok=True)

# Set team to filter
target_team = "Argentina"

# Fixed column layout of the exported sheet. Passing it to the DataFrame
# constructor keeps the header row even when no event matches the filter, so
# code that later selects these columns does not fail with a KeyError.
pass_columns = [
    "Passer", "Recipient", "Team", "Pass Success",
    "Start X", "Start Y", "End X", "End Y",
]

# Loop through all folders inside the data directory. os.listdir returns
# entries in arbitrary order, so sort to make runs reproducible.
for folder_name in sorted(os.listdir(data_dir)):
    folder_path = os.path.join(data_dir, folder_name)

    # Skip plain files (e.g. previously exported workbooks)
    if not os.path.isdir(folder_path):
        continue

    # Look for the JSON file inside the folder
    json_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".json"))
    if not json_files:
        print(f"No JSON file found in {folder_path}")
        continue

    # Each folder is assumed to hold a single .json; with several present the
    # alphabetically first one is used so the choice is at least deterministic.
    json_file = json_files[0]
    if len(json_files) > 1:
        print(f"Multiple JSON files in {folder_path}; using {json_file}")
    json_path = os.path.join(folder_path, json_file)

    print(f"\n📄 Processing: {json_file} from {folder_name}")

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Extract first-half pass events only
    pass_events = []
    for event in data:
        is_target_pass = (
            event.get("type", {}).get("name") == "Pass"
            and event.get("team", {}).get("name") == target_team
            and event.get("period") == 1
        )
        if not is_target_pass:
            continue

        pass_info = event.get("pass", {})
        # A pass without an explicit outcome is recorded as "Successful".
        outcome = pass_info.get("outcome", {}).get("name", "Successful")
        # `or` also covers a key that is present but null/empty, which would
        # otherwise break the [0]/[1] indexing below.
        location = event.get("location") or [None, None]
        end_location = pass_info.get("end_location") or [None, None]

        pass_events.append({
            "Passer": event.get("player", {}).get("name"),
            "Recipient": pass_info.get("recipient", {}).get("name"),
            "Team": event.get("team", {}).get("name"),
            "Pass Success": outcome == "Successful",
            "Start X": location[0],
            "Start Y": location[1],
            "End X": end_location[0],
            "End Y": end_location[1],
        })

    # Convert to DataFrame (explicit columns: see pass_columns above)
    passes_df = pd.DataFrame(pass_events, columns=pass_columns)

    # Output path (same name as json file, but with a _first_half.xlsx suffix)
    output_filename = os.path.splitext(json_file)[0] + "_first_half.xlsx"
    output_path = os.path.join(output_base_dir, output_filename)

    # Save Excel
    passes_df.to_excel(output_path, index=False)
    print(f"✅ Saved to: {os.path.abspath(output_path)}")



📄 Processing: 7_Arg_vs_Fra_Final.json from 7_Arg_vs_Fra_Final
✅ Saved to: d:\Masters\hcnlp_project\test_data\7_Arg_vs_Fra_Final_first_half.xlsx


**Pass network plots until half time - Final match**

In [5]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import os

# Draw one passing-network PNG per pass-data workbook found in `input_dir`.
#
# Nodes are players placed at the mean start location of their passes and
# sized by the number of passes they made; directed edges are weighted by the
# number of passes between a passer/recipient pair.

# Input and output paths
input_dir = os.path.join("..", "test_data")
output_dir = os.path.join("..", "test_data")
os.makedirs(output_dir, exist_ok=True)

# Columns a workbook must contain to be treated as pass data. The directory
# also receives derived workbooks (e.g. "*_player_metrics.xlsx"), which lack
# these columns and previously made dropna(subset=...) raise a KeyError when
# the notebook was run a second time.
required_columns = ["Passer", "Recipient", "Start X", "Start Y", "End X", "End Y"]

# Iterate over all Excel files (sorted for a reproducible processing order)
for file in sorted(os.listdir(input_dir)):
    if not file.endswith(".xlsx"):
        continue

    filepath = os.path.join(input_dir, file)
    print(f"\nProcessing: {file}")

    # Load the pass data
    df = pd.read_excel(filepath)

    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Skipping {file}: not a pass-data sheet (missing columns: {missing_columns})")
        continue

    # Drop rows with missing essential data
    df = df.dropna(subset=required_columns)
    if df.empty:
        print(f"Skipping {file}: no complete pass rows to plot")
        continue

    # Calculate average pass start locations for positioning
    positions = df.groupby("Passer")[["Start X", "Start Y"]].mean()

    # Count passes per player for sizing
    pass_counts = df["Passer"].value_counts()

    # Unique players (passers and recipients)
    all_players = set(df["Passer"]).union(set(df["Recipient"]))

    # Create directed graph
    G = nx.DiGraph()

    # Add nodes. Players who only received passes have no start location and
    # fall back to a fixed point with size 1.
    # NOTE(review): (50, 40) is presumably meant as a pitch-centre default -
    # confirm against the coordinate system of the data.
    for player in all_players:
        if player in positions.index:
            node_pos = tuple(positions.loc[player])
        else:
            node_pos = (50, 40)
        G.add_node(player, pos=node_pos, size=pass_counts.get(player, 1))

    # Add edges with weights (number of passes per passer/recipient pair)
    edge_data = df.groupby(["Passer", "Recipient"]).size().reset_index(name="weight")
    G.add_weighted_edges_from(edge_data.itertuples(index=False, name=None))

    # Plotting. An explicit Axes is handed to nx.draw: without one, nx.draw
    # adds a full-figure axes that tight_layout cannot manage (the recorded
    # output of this cell showed a warning on the tight_layout line) and the
    # title ends up outside the saved canvas.
    fig, ax = plt.subplots(figsize=(12, 8))
    pos = nx.get_node_attributes(G, 'pos')
    sizes = [G.nodes[n]['size'] * 10 for n in G.nodes]
    weights = [G[u][v]['weight'] for u, v in G.edges]

    nx.draw(G, pos, ax=ax, with_labels=True, arrows=True,
            node_size=sizes,
            width=weights,
            edge_color="gray",
            node_color="skyblue",
            font_size=10,
            connectionstyle='arc3,rad=0.1')

    title = file.replace(".xlsx", "").replace("_", " ").title() + " Passing Network"
    ax.set_title(title, fontsize=14)
    ax.set_axis_off()
    fig.tight_layout()

    # Save the plot, then release the figure so a long loop does not
    # accumulate open figures.
    output_path = os.path.join(output_dir, file.replace(".xlsx", "_network.png"))
    fig.savefig(output_path, dpi=300)
    plt.close(fig)
    print(f"Saved network plot to: {os.path.abspath(output_path)}")



Processing: 7_Arg_vs_Fra_Final_first_half.xlsx


  plt.tight_layout()


Saved network plot to: d:\Masters\hcnlp_project\test_data\7_Arg_vs_Fra_Final_first_half_network.png


**Metrics computation until half time - Final match**

In [6]:
import pandas as pd
import os
import networkx as nx

# Compute player-level and graph-level passing-network metrics for every
# pass-data workbook in `input_dir`.
#
# Per workbook this writes "<name>_player_metrics.xlsx" (one row per player)
# and "<name>_graph_metrics.csv" (a single summary row).

# Define input and output directories
input_dir = os.path.join("..", "test_data")
player_output_dir = os.path.join("..", "test_data")
print(f"Player output directory: {player_output_dir}")
graph_output_dir = os.path.join("..", "test_data")

# Create output directories if they don't exist
os.makedirs(player_output_dir, exist_ok=True)
os.makedirs(graph_output_dir, exist_ok=True)

# Columns a workbook must contain to be treated as pass data. This cell
# writes its own "*_player_metrics.xlsx" into the directory it reads from;
# without this check a second run would load that file and fail with a
# KeyError in dropna(subset=...).
required_columns = ["Passer", "Recipient", "Start X", "Start Y", "End X", "End Y"]

# Iterate over all Excel files in the input directory (sorted for a
# reproducible processing order)
for file in sorted(os.listdir(input_dir)):
    if not file.endswith(".xlsx"):
        continue

    file_path = os.path.join(input_dir, file)
    print(f"\nProcessing: {file}")

    # Load the pass data from Excel file
    df = pd.read_excel(file_path)

    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Skipping {file}: not a pass-data sheet (missing columns: {missing_columns})")
        continue

    # Remove rows with missing essential data
    df = df.dropna(subset=required_columns)

    # Create directed graph and add edges weighted by the pass count of each
    # passer/recipient pair
    G = nx.DiGraph()
    edges = df.groupby(["Passer", "Recipient"]).size().reset_index(name="weight")
    G.add_weighted_edges_from(edges.itertuples(index=False, name=None))

    # Several metrics below are undefined on an empty graph and would raise.
    if G.number_of_nodes() == 0:
        print(f"Skipping {file}: no complete pass rows to analyse")
        continue

    # Undirected view used by both clustering computations
    undirected = G.to_undirected()

    # Compute node-level centrality metrics.
    # With the networkx defaults used here only pagerank takes the "weight"
    # edge attribute into account; the other metrics treat the graph as
    # unweighted.
    metrics = {
        "degree_centrality": nx.degree_centrality(G),
        "betweenness_centrality": nx.betweenness_centrality(G),
        "closeness_centrality": nx.closeness_centrality(G),
        "eigenvector_centrality": nx.eigenvector_centrality(G, max_iter=500),
        "pagerank": nx.pagerank(G),
        "clustering": nx.clustering(undirected)
    }

    # Build a DataFrame with all player-level metrics
    players = list(G.nodes())
    player_metrics = pd.DataFrame({"Player": players})
    for key, values in metrics.items():
        player_metrics[key] = player_metrics["Player"].map(values)

    # Add pass volume stats (weighted out-/in-degree = passes made/received)
    player_metrics["passes_made"] = player_metrics["Player"].map(dict(G.out_degree(weight="weight")))
    player_metrics["passes_received"] = player_metrics["Player"].map(dict(G.in_degree(weight="weight")))

    # Save player metrics to Excel
    player_output_file_path = os.path.join(player_output_dir, file.replace(".xlsx", "_player_metrics.xlsx"))
    player_metrics.to_excel(player_output_file_path, index=False)

    print(f"Saved player metrics to: {os.path.abspath(player_output_file_path)}")

    # Compute graph-level metrics
    graph_metrics = {
        "match": file.replace(".xlsx", ""),
        "num_nodes": G.number_of_nodes(),
        "num_edges": G.number_of_edges(),
        "density": nx.density(G),
        "avg_clustering": nx.average_clustering(undirected)
    }

    # Convert graph metrics to a DataFrame
    graph_metrics_df = pd.DataFrame([graph_metrics])

    # Save graph metrics to CSV
    graph_output_file_path = os.path.join(graph_output_dir, file.replace(".xlsx", "_graph_metrics.csv"))
    graph_metrics_df.to_csv(graph_output_file_path, index=False)

    print(f"Saved graph metrics to: {os.path.abspath(graph_output_file_path)}")

    # Print graph-level metrics for verification
    print("\nGraph-Level Metrics:")
    for key, val in graph_metrics.items():
        print(f"{key}: {val}")


Player output directory: ..\test_data

Processing: 7_Arg_vs_Fra_Final_first_half.xlsx
Saved player metrics to: d:\Masters\hcnlp_project\test_data\7_Arg_vs_Fra_Final_first_half_player_metrics.xlsx
Saved graph metrics to: d:\Masters\hcnlp_project\test_data\7_Arg_vs_Fra_Final_first_half_graph_metrics.csv

Graph-Level Metrics:
match: 7_Arg_vs_Fra_Final_first_half
num_nodes: 11
num_edges: 77
density: 0.7
avg_clustering: 0.8402597402597404
