In [4]:
import json
import pandas as pd
import os

In [2]:
!pip install openpyxl

Defaulting to user installation because normal site-packages is not writeable


**Pass network data for all matches**

In [17]:
import os
import json
import pandas as pd

# Set base paths
data_dir = os.path.join("..", "data")
output_base_dir = os.path.join("..", "outputs", "pass_data")
os.makedirs(output_base_dir, exist_ok=True)

# Set team to filter
target_team = "Argentina"

# Loop through all folders inside 'data'
for folder_name in os.listdir(data_dir):
    folder_path = os.path.join(data_dir, folder_name)

    # Skip if not a directory
    if not os.path.isdir(folder_path):
        continue

    # Look for the JSON file inside the folder
    json_files = [f for f in os.listdir(folder_path) if f.endswith(".json")]
    if not json_files:
        print(f"No JSON file found in {folder_path}")
        continue

    json_file = json_files[0]  # Assuming each folder has only one .json
    json_path = os.path.join(folder_path, json_file)

    print(f"\nProcessing: {json_file} from {folder_name}")

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Extract pass events
    pass_events = []
    for event in data:
        if event.get("type", {}).get("name") == "Pass" and event.get("team", {}).get("name") == target_team:
            passer = event.get("player", {}).get("name")
            recipient = event.get("pass", {}).get("recipient", {}).get("name")
            team = event.get("team", {}).get("name")
            outcome = event.get("pass", {}).get("outcome", {}).get("name", "Successful")
            success = outcome == "Successful"
            location = event.get("location", [None, None])
            end_location = event.get("pass", {}).get("end_location", [None, None])

            pass_events.append({
                "Passer": passer,
                "Recipient": recipient,
                "Team": team,
                "Pass Success": success,
                "Start X": location[0],
                "Start Y": location[1],
                "End X": end_location[0],
                "End Y": end_location[1]
            })

    # Convert to DataFrame
    passes_df = pd.DataFrame(pass_events)

    # Output path (same name as json file, but .xlsx)
    output_filename = os.path.splitext(json_file)[0] + ".xlsx"
    output_path = os.path.join(output_base_dir, output_filename)

    # Save Excel
    passes_df.to_excel(output_path, index=False)
    print(f"Saved to: {os.path.abspath(output_path)}")



Processing: 1_Arg_vs_Saudi_G1.json from 1_Arg_vs_Saudi_G1
Saved to: d:\Masters\hcnlp_project\outputs\pass_data\1_Arg_vs_Saudi_G1.xlsx

Processing: 2_Arg_vs_Mex_G2.json from 2_Arg_vs_Mex_G2
Saved to: d:\Masters\hcnlp_project\outputs\pass_data\2_Arg_vs_Mex_G2.xlsx

Processing: 3_Arg_vs_Pol_G3.json from 3_Arg_vs_Pol_G3
Saved to: d:\Masters\hcnlp_project\outputs\pass_data\3_Arg_vs_Pol_G3.xlsx

Processing: 4_Arg_vs_Aus_R016.json from 4_Arg_vs_Aus_R016
Saved to: d:\Masters\hcnlp_project\outputs\pass_data\4_Arg_vs_Aus_R016.xlsx

Processing: 5_Arg_vs_Ned_QF.json from 5_Arg_vs_Ned_QF
Saved to: d:\Masters\hcnlp_project\outputs\pass_data\5_Arg_vs_Ned_QF.xlsx

Processing: 6_Arg_vs_Cro_SF.json from 6_Arg_vs_Cro_SF
Saved to: d:\Masters\hcnlp_project\outputs\pass_data\6_Arg_vs_Cro_SF.xlsx

Processing: 7_Arg_vs_Fra_Final.json from 7_Arg_vs_Fra_Final
Saved to: d:\Masters\hcnlp_project\outputs\pass_data\7_Arg_vs_Fra_Final.xlsx


**Player interactions from all matches**

In [18]:
import os
import pandas as pd

# Paths
input_dir = os.path.join("..", "outputs", "pass_data")
output_dir = os.path.join("..", "outputs", "player_interactions")
os.makedirs(output_dir, exist_ok=True)

# Loop over all Excel files in the pass_data folder
for file in os.listdir(input_dir):
    if file.endswith(".xlsx"):
        input_path = os.path.join(input_dir, file)
        print(f"\nProcessing: {file}")

        # Load the Excel file
        df = pd.read_excel(input_path)

        # Drop rows with missing passers or recipients
        df = df.dropna(subset=["Passer", "Recipient"])

        # Get unique players
        players = sorted(set(df["Passer"]).union(set(df["Recipient"])))

        # Group pass counts
        pass_counts = df.groupby(["Passer", "Recipient"]).size().reset_index(name="Pass Count")

        # Calculate interaction metrics
        interaction_data = []
        for player in players:
            row = {"Player": player}

            row["Total Passes Made"] = df[df["Passer"] == player].shape[0]
            row["Total Passes Received"] = df[df["Recipient"] == player].shape[0]

            # Teammate pass interactions
            teammates = [p for p in players if p != player]
            for teammate in teammates:
                count = pass_counts[
                    (pass_counts["Passer"] == player) &
                    (pass_counts["Recipient"] == teammate)
                ]["Pass Count"].sum()
                row[teammate] = count

            interaction_data.append(row)

        # Save to Excel
        interaction_df = pd.DataFrame(interaction_data)
        output_filename = file.replace(".xlsx", "_interactions.xlsx")
        output_path = os.path.join(output_dir, output_filename)
        interaction_df.to_excel(output_path, index=False)

        print(f"Saved to: {os.path.abspath(output_path)}")



Processing: 1_Arg_vs_Saudi_G1.xlsx
Saved to: d:\Masters\hcnlp_project\outputs\player_interactions\1_Arg_vs_Saudi_G1_interactions.xlsx

Processing: 2_Arg_vs_Mex_G2.xlsx
Saved to: d:\Masters\hcnlp_project\outputs\player_interactions\2_Arg_vs_Mex_G2_interactions.xlsx

Processing: 3_Arg_vs_Pol_G3.xlsx
Saved to: d:\Masters\hcnlp_project\outputs\player_interactions\3_Arg_vs_Pol_G3_interactions.xlsx

Processing: 4_Arg_vs_Aus_R016.xlsx
Saved to: d:\Masters\hcnlp_project\outputs\player_interactions\4_Arg_vs_Aus_R016_interactions.xlsx

Processing: 5_Arg_vs_Ned_QF.xlsx
Saved to: d:\Masters\hcnlp_project\outputs\player_interactions\5_Arg_vs_Ned_QF_interactions.xlsx

Processing: 6_Arg_vs_Cro_SF.xlsx
Saved to: d:\Masters\hcnlp_project\outputs\player_interactions\6_Arg_vs_Cro_SF_interactions.xlsx

Processing: 7_Arg_vs_Fra_Final.xlsx
Saved to: d:\Masters\hcnlp_project\outputs\player_interactions\7_Arg_vs_Fra_Final_interactions.xlsx

Processing: arg_sau_passes.xlsx
Saved to: d:\Masters\hcnlp_project\o

**Pass network graphs for all matches**

In [20]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import os

# Input and output paths
input_dir = os.path.join("..", "outputs", "pass_data")
output_dir = os.path.join("..", "outputs", "pass_networks")
os.makedirs(output_dir, exist_ok=True)

# Iterate over all Excel files
for file in os.listdir(input_dir):
    if file.endswith(".xlsx"):
        filepath = os.path.join(input_dir, file)
        print(f"\nProcessing: {file}")

        # Load the pass data
        df = pd.read_excel(filepath)

        # Drop rows with missing essential data
        df = df.dropna(subset=["Passer", "Recipient", "Start X", "Start Y", "End X", "End Y"])

        # Calculate average pass start locations for positioning
        positions = df.groupby("Passer")[["Start X", "Start Y"]].mean()

        # Count passes per player for sizing
        pass_counts = df["Passer"].value_counts()

        # Unique players
        all_players = set(df["Passer"]).union(set(df["Recipient"]))

        # Create directed graph
        G = nx.DiGraph()

        # Add nodes
        for player in all_players:
            pos = tuple(positions.loc[player]) if player in positions.index else (50, 40)
            size = pass_counts.get(player, 1)
            G.add_node(player, pos=pos, size=size)

        # Add edges with weights
        edge_data = df.groupby(["Passer", "Recipient"]).size().reset_index(name="weight")
        for _, row in edge_data.iterrows():
            G.add_edge(row["Passer"], row["Recipient"], weight=row["weight"])

        # Plotting
        plt.figure(figsize=(12, 8))
        pos = nx.get_node_attributes(G, 'pos')
        sizes = [G.nodes[n]['size'] * 10 for n in G.nodes]
        weights = [G[u][v]['weight'] for u, v in G.edges]

        nx.draw(G, pos, with_labels=True, arrows=True,
                node_size=sizes,
                width=weights,
                edge_color="gray",
                node_color="skyblue",
                font_size=10,
                connectionstyle='arc3,rad=0.1')

        title = file.replace(".xlsx", "").replace("_", " ").title() + " Passing Network"
        plt.title(title, fontsize=14)
        plt.axis('off')
        plt.tight_layout()

        # Save the plot
        output_path = os.path.join(output_dir, file.replace(".xlsx", "_network.png"))
        plt.savefig(output_path, dpi=300)
        plt.close()
        print(f"Saved network plot to: {os.path.abspath(output_path)}")



Processing: 1_Arg_vs_Saudi_G1.xlsx


  plt.tight_layout()


Saved network plot to: d:\Masters\hcnlp_project\outputs\pass_networks\1_Arg_vs_Saudi_G1_network.png

Processing: 2_Arg_vs_Mex_G2.xlsx


  plt.tight_layout()


Saved network plot to: d:\Masters\hcnlp_project\outputs\pass_networks\2_Arg_vs_Mex_G2_network.png

Processing: 3_Arg_vs_Pol_G3.xlsx


  plt.tight_layout()


Saved network plot to: d:\Masters\hcnlp_project\outputs\pass_networks\3_Arg_vs_Pol_G3_network.png

Processing: 4_Arg_vs_Aus_R016.xlsx


  plt.tight_layout()


Saved network plot to: d:\Masters\hcnlp_project\outputs\pass_networks\4_Arg_vs_Aus_R016_network.png

Processing: 5_Arg_vs_Ned_QF.xlsx


  plt.tight_layout()


Saved network plot to: d:\Masters\hcnlp_project\outputs\pass_networks\5_Arg_vs_Ned_QF_network.png

Processing: 6_Arg_vs_Cro_SF.xlsx


  plt.tight_layout()


Saved network plot to: d:\Masters\hcnlp_project\outputs\pass_networks\6_Arg_vs_Cro_SF_network.png

Processing: 7_Arg_vs_Fra_Final.xlsx


  plt.tight_layout()


Saved network plot to: d:\Masters\hcnlp_project\outputs\pass_networks\7_Arg_vs_Fra_Final_network.png

Processing: arg_sau_passes.xlsx


  plt.tight_layout()


Saved network plot to: d:\Masters\hcnlp_project\outputs\pass_networks\arg_sau_passes_network.png


**Metrics computation for all matches**

In [28]:
import pandas as pd
import os
import networkx as nx

# Define input and output directories
input_dir = os.path.join("..", "outputs", "pass_data")
player_output_dir = os.path.join("..", "outputs", "player_metrics")
print(f"Player output directory: {player_output_dir}")
graph_output_dir = os.path.join("..", "outputs", "graph_metrics")

# Create output directories if they don't exist
os.makedirs(player_output_dir, exist_ok=True)
os.makedirs(graph_output_dir, exist_ok=True)

# Iterate over all Excel files in the pass_data folder
for file in os.listdir(input_dir):
    if file.endswith(".xlsx"):
        file_path = os.path.join(input_dir, file)
        print(f"\nProcessing: {file}")

        # Load the pass data from Excel file
        df = pd.read_excel(file_path)

        # Remove rows with missing essential data
        df = df.dropna(subset=["Passer", "Recipient", "Start X", "Start Y", "End X", "End Y"])

        # Create directed graph and add edges with pass weights
        G = nx.DiGraph()
        edges = df.groupby(["Passer", "Recipient"]).size().reset_index(name="weight")
        for _, row in edges.iterrows():
            G.add_edge(row["Passer"], row["Recipient"], weight=row["weight"])

        # Compute node-level centrality metrics
        metrics = {
            "degree_centrality": nx.degree_centrality(G),
            "betweenness_centrality": nx.betweenness_centrality(G),
            "closeness_centrality": nx.closeness_centrality(G),
            "eigenvector_centrality": nx.eigenvector_centrality(G, max_iter=500),
            "pagerank": nx.pagerank(G),
            "clustering": nx.clustering(G.to_undirected())
        }

        # Build a DataFrame with all player-level metrics
        players = list(G.nodes())
        player_metrics = pd.DataFrame({"Player": players})
        for key, values in metrics.items():
            player_metrics[key] = player_metrics["Player"].map(values)

        # Add pass volume stats
        player_metrics["passes_made"] = player_metrics["Player"].apply(lambda p: G.out_degree(p, weight="weight"))
        player_metrics["passes_received"] = player_metrics["Player"].apply(lambda p: G.in_degree(p, weight="weight"))

        # Save player metrics to Excel
        player_output_file_path = os.path.join(player_output_dir, file.replace(".xlsx", "_player_metrics.xlsx"))
        player_metrics.to_excel(player_output_file_path, index=False)

        print(f"Saved player metrics to: {os.path.abspath(player_output_file_path)}")

        # Compute graph-level metrics
        graph_metrics = {
            "match": file.replace(".xlsx", ""),
            "num_nodes": G.number_of_nodes(),
            "num_edges": G.number_of_edges(),
            "density": nx.density(G),
            "avg_clustering": nx.average_clustering(G.to_undirected())
        }

        # Convert graph metrics to a DataFrame
        graph_metrics_df = pd.DataFrame([graph_metrics])

        # Save graph metrics to CSV
        graph_output_file_path = os.path.join(graph_output_dir, file.replace(".xlsx", "_graph_metrics.csv"))
        graph_metrics_df.to_csv(graph_output_file_path, index=False)

        print(f"Saved graph metrics to: {os.path.abspath(graph_output_file_path)}")

        # Print graph-level metrics for verification
        print("\nGraph-Level Metrics:")
        for key, val in graph_metrics.items():
            print(f"{key}: {val}")


Player output directory: ..\outputs\player_metrics

Processing: 1_Arg_vs_Saudi_G1.xlsx
Saved player metrics to: d:\Masters\hcnlp_project\outputs\player_metrics\1_Arg_vs_Saudi_G1_player_metrics.xlsx
Saved graph metrics to: d:\Masters\hcnlp_project\outputs\graph_metrics\1_Arg_vs_Saudi_G1_graph_metrics.csv

Graph-Level Metrics:
match: 1_Arg_vs_Saudi_G1
num_nodes: 15
num_edges: 131
density: 0.6238095238095238
avg_clustering: 0.7908276908276908

Processing: 2_Arg_vs_Mex_G2.xlsx
Saved player metrics to: d:\Masters\hcnlp_project\outputs\player_metrics\2_Arg_vs_Mex_G2_player_metrics.xlsx
Saved graph metrics to: d:\Masters\hcnlp_project\outputs\graph_metrics\2_Arg_vs_Mex_G2_graph_metrics.csv

Graph-Level Metrics:
match: 2_Arg_vs_Mex_G2
num_nodes: 16
num_edges: 135
density: 0.5625
avg_clustering: 0.845770202020202

Processing: 3_Arg_vs_Pol_G3.xlsx
Saved player metrics to: d:\Masters\hcnlp_project\outputs\player_metrics\3_Arg_vs_Pol_G3_player_metrics.xlsx
Saved graph metrics to: d:\Masters\hcnlp_

**Q&A template based on previously computed metrics for players and matches**

In [None]:
# Define input directories
player_metrics_dir = os.path.join("..", "outputs", "player_metrics")
graph_metrics_dir = os.path.join("..", "outputs", "graph_metrics")
output_dir = os.path.join("..", "outputs", "pass_qa")

os.makedirs(output_dir, exist_ok=True)

# Helper: prettify match name
def format_match_name(raw_name):
    formatted = raw_name.replace("_", " ")
    formatted = formatted.replace("G1", "Group Stage Match 1")
    formatted = formatted.replace("G2", "Group Stage Match 2")
    formatted = formatted.replace("G3", "Group Stage Match 3")
    formatted = formatted.replace("R016", "Round of 16")
    return formatted

# Iterate over player metric files
for player_file in os.listdir(player_metrics_dir):
    if not player_file.endswith("_player_metrics.xlsx"):
        continue

    player_file_path = os.path.join(player_metrics_dir, player_file)
    match_id = player_file.replace("_player_metrics.xlsx", "")
    
    graph_file = match_id + "_graph_metrics.csv"
    graph_file_path = os.path.join(graph_metrics_dir, graph_file)
    print(graph_file_path, 'graph_file_path')
    
    # Check existence before doing anything else
    if not os.path.exists(graph_file_path):
        print(f"Warning: Graph file for {match_id} not found. Skipping.")
        continue

    # Now we can safely format the match name for display
    match_name = format_match_name(match_id)

    # Load data
    player_df = pd.read_excel(player_file_path).round(3)
    graph_df = pd.read_csv(graph_file_path).round(3)

    # Generate Q&A
    qa_pairs = []

    top_betweenness = player_df.sort_values("betweenness_centrality", ascending=False).iloc[0]
    qa_pairs.append({
        "question": f"Who had the highest betweenness centrality in the match against {match_name}?",
        "answer": f"{top_betweenness['Player']} had the highest betweenness centrality with a value of {top_betweenness['betweenness_centrality']}."
    })

    top_eigen = player_df.sort_values("eigenvector_centrality", ascending=False).iloc[0]
    qa_pairs.append({
        "question": f"Which player was most influential based on eigenvector centrality in the match against {match_name}?",
        "answer": f"{top_eigen['Player']} was the most influential player with an eigenvector centrality of {top_eigen['eigenvector_centrality']}."
    })

    top_passer = player_df.sort_values("passes_made", ascending=False).iloc[0]
    qa_pairs.append({
        "question": f"Who made the most passes for Argentina in the match against {match_name}?",
        "answer": f"{top_passer['Player']} made the most passes, totaling {int(top_passer['passes_made'])}."
    })

    top_receiver = player_df.sort_values("passes_received", ascending=False).iloc[0]
    qa_pairs.append({
        "question": f"Who received the most passes for Argentina in the match against {match_name}?",
        "answer": f"{top_receiver['Player']} received the most passes, totaling {int(top_receiver['passes_received'])}."
    })

    top_clust = player_df.sort_values("clustering", ascending=False).iloc[0]
    qa_pairs.append({
        "question": f"Which player was most involved in triangle passing patterns in the match against {match_name}?",
        "answer": f"{top_clust['Player']} had the highest clustering coefficient of {top_clust['clustering']}."
    })

    density = graph_df["density"].iloc[0]
    qa_pairs.append({
        "question": f"What was the density of Argentina's pass network in the match against {match_name}?",
        "answer": f"The network density was {density}, reflecting the overall connection of the team."
    })

    avg_clust = graph_df["avg_clustering"].iloc[0]
    qa_pairs.append({
        "question": f"What was the average clustering coefficient of Argentina's pass network in the match against {match_name}?",
        "answer": f"The average clustering coefficient was {avg_clust}, indicating the level of tight triangle formations."
    })

    # Save
    qa_df = pd.DataFrame(qa_pairs)
    output_file_path = os.path.join(output_dir, f"{match_name}_QA.csv")
    qa_df.to_csv(output_file_path, index=False)
    print(f"Q&A file saved for {match_name} as '{output_file_path}'")


..\outputs\graph_metrics\1_Arg_vs_Saudi_G1_graph_metrics.csv graph_file_path
✅ Q&A file saved for 1 Arg vs Saudi Group Stage Match 1 as '..\outputs\pass_qa\1 Arg vs Saudi Group Stage Match 1_QA.csv'
..\outputs\graph_metrics\2_Arg_vs_Mex_G2_graph_metrics.csv graph_file_path
✅ Q&A file saved for 2 Arg vs Mex Group Stage Match 2 as '..\outputs\pass_qa\2 Arg vs Mex Group Stage Match 2_QA.csv'
..\outputs\graph_metrics\3_Arg_vs_Pol_G3_graph_metrics.csv graph_file_path
✅ Q&A file saved for 3 Arg vs Pol Group Stage Match 3 as '..\outputs\pass_qa\3 Arg vs Pol Group Stage Match 3_QA.csv'
..\outputs\graph_metrics\4_Arg_vs_Aus_R016_graph_metrics.csv graph_file_path
✅ Q&A file saved for 4 Arg vs Aus Round of 16 as '..\outputs\pass_qa\4 Arg vs Aus Round of 16_QA.csv'
..\outputs\graph_metrics\5_Arg_vs_Ned_QF_graph_metrics.csv graph_file_path
✅ Q&A file saved for 5 Arg vs Ned QF as '..\outputs\pass_qa\5 Arg vs Ned QF_QA.csv'
..\outputs\graph_metrics\6_Arg_vs_Cro_SF_graph_metrics.csv graph_file_path
✅ 