This script processes XML files to extract player and team mappings, then uses these mappings to replace IDs with names in summary CSV files. Initially, the script loads player and team mappings from the XML files in the specified folder. It extracts player IDs and names from player elements and team IDs and names from the root attributes, storing these mappings in dictionaries. After loading the mappings, the script processes each CSV file in a designated folder, where each file contains event summaries with player and team IDs. For each summary, it identifies and replaces all player and team IDs with their corresponding names from the mappings. The updated summaries are then saved back to the CSV files.

In [1]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
import re

def load_mappings(xml_folder):
    """Build player and team ID-to-name lookups from the XML files in a folder.

    Every entry in ``xml_folder`` whose name ends with ".xml" is parsed.
    Player entries are read from the ``player_id`` / ``player_name``
    attributes of every ``Player`` element in the document; team entries are
    read from the ``home_team_id`` / ``home_team_name`` and ``away_team_id`` /
    ``away_team_name`` attributes of the root element.

    Mappings accumulate across files: an ID seen in several files keeps the
    name from the file processed last (``os.listdir`` order).

    Args:
        xml_folder: Path of the directory containing the XML files.

    Returns:
        A ``(player_mappings, team_mappings)`` tuple of dicts, each mapping an
        ID string to a name string.
    """
    player_mappings = {}
    team_mappings = {}
    for filename in os.listdir(xml_folder):
        if not filename.endswith(".xml"):
            continue
        path = os.path.join(xml_folder, filename)
        root = ET.parse(path).getroot()

        # Load player mappings; skip elements missing either attribute.
        for player in root.findall(".//Player"):
            player_id = player.get("player_id")
            player_name = player.get("player_name")
            if player_id and player_name:
                player_mappings[player_id] = player_name

        # Load team mappings from the root attributes. Apply the same guard
        # as for players so that a file without these attributes does not
        # add a None key or map an ID to None (which would break the later
        # string substitution).
        for side in ("home", "away"):
            team_id = root.get(f"{side}_team_id")
            team_name = root.get(f"{side}_team_name")
            if team_id and team_name:
                team_mappings[team_id] = team_name

        # Counts are cumulative totals so far, not per-file figures.
        print(f"Loaded {len(player_mappings)} player and {len(team_mappings)} team entries from {filename}.")
    return player_mappings, team_mappings

def replace_ids_with_names(csv_folder, player_mappings, team_mappings):
    """Replace player and team IDs with names in the 'Summary' column of CSVs.

    Every entry in ``csv_folder`` whose name ends with ".csv" is read with
    pandas, rewritten, and saved back to the same path (the file is
    overwritten, without the index column).

    Each maximal run of digits in a summary is treated as one candidate ID.
    It is replaced by its player name if present in ``player_mappings``,
    otherwise by its team name if present in ``team_mappings``, otherwise it
    is left unchanged. Player mappings therefore take precedence over team
    mappings for an ID found in both.

    Args:
        csv_folder: Path of the directory containing the CSV files.
        player_mappings: Dict mapping player ID strings to player names.
        team_mappings: Dict mapping team ID strings to team names.

    Raises:
        KeyError: If a CSV file has no 'Summary' column.
    """
    id_pattern = re.compile(r'\d+')

    def _name_for(match):
        # Resolve one whole digit run; unknown IDs are returned untouched.
        token = match.group(0)
        if token in player_mappings:
            return player_mappings[token]
        if token in team_mappings:
            return team_mappings[token]
        return token

    for filename in os.listdir(csv_folder):
        if not filename.endswith(".csv"):
            continue
        path = os.path.join(csv_folder, filename)
        df = pd.read_csv(path)

        events_updated = 0
        new_summaries = []
        for summary in df['Summary']:
            # Empty cells are read back as NaN (a float); leave any
            # non-text value as it is instead of failing on it.
            if isinstance(summary, str):
                # A single substitution pass over whole digit runs: a short
                # ID such as "12" can no longer rewrite part of a longer ID
                # such as "123", and text inserted by an earlier replacement
                # is never scanned again.
                summary = id_pattern.sub(_name_for, summary)
                events_updated += 1
            new_summaries.append(summary)
        df['Summary'] = new_summaries

        # Save the modified CSV file
        df.to_csv(path, index=False)
        print(f"Updated {events_updated} events in {filename}")

# Input locations, relative to the current working directory.
xml_folder = 'pass-files'  # folder scanned for *.xml files; change as needed
csv_folder = 'csv-summaries'  # folder whose *.csv files are rewritten in place

# Step 1: collect the ID -> name lookup tables from the XML files.
player_mappings, team_mappings = load_mappings(xml_folder=xml_folder)

# Step 2: substitute names for IDs in every summary CSV.
replace_ids_with_names(
    csv_folder=csv_folder,
    player_mappings=player_mappings,
    team_mappings=team_mappings,
)


Loaded 16 player and 2 team entries from pass_matrix_100_2022_g2359864_t418.xml.
Loaded 32 player and 4 team entries from pass_matrix_100_2022_g2290059_t569.xml.
Loaded 47 player and 5 team entries from pass_matrix_100_2022_g2359898_t2289.xml.
Loaded 62 player and 7 team entries from pass_matrix_100_2022_g2290014_t1000.xml.
Loaded 78 player and 9 team entries from pass_matrix_100_2022_g2290013_t2592.xml.
Loaded 93 player and 9 team entries from pass_matrix_100_2022_g2290096_t272.xml.
Loaded 98 player and 10 team entries from pass_matrix_100_2022_g2290082_t569.xml.
Loaded 114 player and 11 team entries from pass_matrix_100_2022_g2290015_t239.xml.
Loaded 129 player and 12 team entries from pass_matrix_100_2022_g2359892_t401.xml.
Loaded 145 player and 12 team entries from pass_matrix_100_2022_g2359881_t545.xml.
Loaded 151 player and 12 team entries from pass_matrix_100_2022_g2290069_t401.xml.
Loaded 157 player and 12 team entries from pass_matrix_100_2022_g2290011_t569.xml.
Loaded 159 pla