In [7]:
# Imports
import os
import json
import pandas as pd
from tqdm import tqdm

# Input locations (relative paths, resolved against the working directory).
# matches_path: JSON file containing the list of match records.
# events_path: directory holding one '<match_id>.json' event file per match.
matches_path = '../data/matches/43/3.json'
events_path = '../data/events/'

# 🛠 Improved function
def build_match_snapshots(matches_path, events_path, max_minute=90):
    """Build one cumulative match-state row per minute for every usable match.

    Args:
        matches_path: Path to a JSON file holding a list of match records.
            Each record is read for ``match_id``,
            ``home_team.home_team_name`` and ``away_team.away_team_name``.
        events_path: Directory containing one ``<match_id>.json`` event file
            per match.
        max_minute: Last minute (inclusive) to snapshot. Defaults to 90, which
            reproduces the original fixed 1..90 range.

    Returns:
        A ``pandas.DataFrame`` with one row per (match, minute) for minutes
        ``1..max_minute``. Each row carries the goals, red cards and summed
        xG of both teams over all events whose ``minute`` is <= the snapshot
        minute, plus the match's final result. Matches whose event file is
        missing or empty are skipped with a printed warning.
    """
    with open(matches_path, encoding="utf-8") as f:
        matches = json.load(f)

    snapshots = []
    processed = 0  # number of matches that contributed rows

    # Iterate the records directly: looking each id up again in `matches`
    # was a redundant linear search that could never fail.
    for match_info in tqdm(matches, desc="Processing Matches"):
        match_id = match_info['match_id']
        event_file = os.path.join(events_path, f'{match_id}.json')

        if not os.path.exists(event_file):
            print(f"Warning: Event file for match_id {match_id} not found.")
            continue

        with open(event_file, encoding="utf-8") as f:
            events = json.load(f)

        df = pd.json_normalize(events)
        if df.empty:
            print(f"Warning: Event file for match_id {match_id} is empty.")
            continue

        home_team = match_info['home_team']['home_team_name']
        away_team = match_info['away_team']['away_team_name']
        final_result = get_final_result(matches, match_id)

        snapshots.extend(
            _snapshot_rows(df, match_id, home_team, away_team,
                           final_result, max_minute)
        )
        processed += 1

    print(f"\n Successfully processed {processed} matches out of {len(matches)} available.")
    return pd.DataFrame(snapshots)


def _column_or_nan(df, name):
    """Return ``df[name]``, or an all-NaN column when no event set that field."""
    # json_normalize only creates a column if at least one event carries the
    # key, so e.g. a match without a carded foul has no card column at all.
    if name in df.columns:
        return df[name]
    return pd.Series(float('nan'), index=df.index, dtype='float64')


def _snapshot_rows(df, match_id, home_team, away_team, final_result, max_minute):
    """Return the per-minute cumulative state rows for one match's events."""
    # Everything that does not depend on the snapshot minute is computed once.
    event_minute = df['minute']
    by_home = df['team.name'] == home_team
    by_away = df['team.name'] == away_team
    is_shot = df['type.name'] == 'Shot'
    is_goal = is_shot & (_column_or_nan(df, 'shot.outcome.name') == 'Goal')
    # NOTE(review): only 'Red Card' on 'Foul Committed' events is counted;
    # cards recorded any other way are ignored - confirm this is intended.
    is_red = (df['type.name'] == 'Foul Committed') & (
        _column_or_nan(df, 'foul_committed.card.name') == 'Red Card'
    )
    xg = _column_or_nan(df, 'shot.statsbomb_xg')

    rows = []
    for minute in range(1, max_minute + 1):
        played = event_minute <= minute  # events up to and including this minute
        rows.append({
            'match_id': match_id,
            'minute': minute,
            'home_team': home_team,
            'away_team': away_team,
            'home_score': int((is_goal & by_home & played).sum()),
            'away_score': int((is_goal & by_away & played).sum()),
            'home_red': int((is_red & by_home & played).sum()),
            'away_red': int((is_red & by_away & played).sum()),
            # Series.sum() skips NaN, so shots without an xG value add 0.
            'home_xg': xg[is_shot & by_home & played].sum(),
            'away_xg': xg[is_shot & by_away & played].sum(),
            'final_result': final_result,
        })
    return rows

# Helper function
def get_final_result(matches, match_id):
    """Classify the outcome of one match from its recorded scores.

    Args:
        matches: Iterable of match records, each with ``match_id``,
            ``home_score`` and ``away_score`` keys.
        match_id: Identifier of the match to classify.

    Returns:
        ``'Home Win'``, ``'Away Win'`` or ``'Draw'``.

    Raises:
        StopIteration: If no record in ``matches`` has the given ``match_id``.
    """
    # First record with a matching id; next() without a default raises
    # StopIteration when there is none.
    record = next(filter(lambda entry: entry['match_id'] == match_id, matches))
    home_goals = record['home_score']
    away_goals = record['away_score']

    if home_goals > away_goals:
        return 'Home Win'
    if home_goals < away_goals:
        return 'Away Win'
    return 'Draw'

# Build the per-minute snapshot table from the configured match list and
# event directory.
snapshot_df = build_match_snapshots(matches_path, events_path)

# Persist the table as CSV without the DataFrame index column.
snapshot_df.to_csv('../data/world_cup_snapshots.csv', index=False)

# Sanity checks: (rows, columns), then the number of distinct matches that
# actually produced rows, then a preview of the first rows.
# NOTE(review): the recorded run below reports only 4 of 64 matches
# processed - confirm the event files are present before relying on the CSV.
print(snapshot_df.shape)
print(snapshot_df['match_id'].nunique())
snapshot_df.head()


Processing Matches:   0%|          | 0/64 [00:00<?, ?it/s]



Processing Matches:   9%|▉         | 6/64 [00:00<00:04, 13.84it/s]



Processing Matches:  62%|██████▎   | 40/64 [00:00<00:00, 59.58it/s]



Processing Matches: 100%|██████████| 64/64 [00:01<00:00, 39.68it/s]


✅ Successfully processed 4 matches out of 64 available.
(360, 11)
4





Unnamed: 0,match_id,minute,home_team,away_team,home_score,away_score,home_red,away_red,home_xg,away_xg,final_result
0,8656,1,Croatia,England,0,0,0,0,0.0,0.0,Home Win
1,8656,2,Croatia,England,0,0,0,0,0.0,0.0,Home Win
2,8656,3,Croatia,England,0,0,0,0,0.0,0.0,Home Win
3,8656,4,Croatia,England,0,1,0,0,0.0,0.117755,Home Win
4,8656,5,Croatia,England,0,1,0,0,0.0,0.117755,Home Win
