In [77]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import json
import os
import pandas as pd
from tqdm import tqdm
# Root data directory; each match lives in a sub-directory named after its match id.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
root_path = "/home/work/data/MHL/bepro"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [132]:
# Match ids of interest; only the first one is inspected in the cells below.
match_id_lst = [153381, 153387]
match_id = match_id_lst[0]

# Directory of the selected match: "<root_path>/<match_id>".
match_path = f"{root_path}/{match_id}"
# Kept as the last expression so the directory listing is rendered as the cell output.
os.listdir(match_path)


['153381_2_frame_data.jsonl',
 '153381_1_frame_data.jsonl',
 '153381_event',
 '153381_metadata.json']

In [133]:
def load_single_json(file_path):
    """
    Reads one JSON document from disk and returns it as a Python object.

    Failures are reported on stdout rather than raised, so callers have to
    check the result for None before using it.

    Args:
        file_path (str): Location of the JSON file.

    Returns:
        dict or list or None: The decoded document. None when the file does
                              not exist, does not contain valid JSON, or any
                              other error occurs while reading it.
    """
    try:
        # The context manager closes the handle even when decoding fails.
        with open(file_path, mode='r', encoding='utf-8') as handle:
            parsed = json.load(handle)
            print(f"Successfully loaded file: '{file_path}'")
            return parsed
    except FileNotFoundError:
        failure = f"Error: File not found at '{file_path}'"
    except json.JSONDecodeError:
        failure = f"Error: Could not decode JSON from '{file_path}'. Check format."
    except Exception as e:
        failure = f"An unexpected error occurred while reading '{file_path}': {e}"

    # Only reached when one of the handlers above fired.
    print(failure)
    return None

In [134]:
# Check Metadata

# The metadata file sits directly in the match directory as "<match_id>_metadata.json".
meta_data_path = f"{match_path}/{match_id}_metadata.json"
# load_single_json returns None on failure; in that case .items() below raises AttributeError.
meta_data = load_single_json(meta_data_path)
# Print every top-level metadata entry, one "key : value" line each.
for key, value in meta_data.items():
    print(f"{key} : {value}")

Successfully loaded file: '/home/work/data/MHL/bepro/153381/153381_metadata.json'
match_id : 153381
match_title : 수원FC vs 강원FC
match_datetime : 2024-11-09T16:30:00+0900
match_full_time : 5400000
match_extra_time : None
periods : [{'period_name': '1st Half', 'period_order': 0, 'period_match_duration': 2700000}, {'period_name': '2nd Half', 'period_order': 1, 'period_match_duration': 2700000}]
home_team : {'team_id': 4220, 'team_name': '수원FC', 'team_name_en': 'Suwon FC', 'players': [{'player_id': 500530, 'team_id': 4220, 'shirt_number': '10', 'full_name': '지동원', 'full_name_en': 'Dongwon Ji', 'initial_position_name': 'CF', 'is_starting': True}, {'player_id': 77715, 'team_id': 4220, 'shirt_number': '24', 'full_name': '김주엽', 'full_name_en': 'Ju Yeop Kim', 'initial_position_name': None, 'is_starting': False}, {'player_id': 500547, 'team_id': 4220, 'shirt_number': '15', 'full_name': '김태한', 'full_name_en': 'Taehan Kim', 'initial_position_name': 'CB', 'is_starting': True}, {'player_id': 500552, 

In [135]:
def create_team_dataframe(home_team_info, away_team_info):
    """
    Builds one roster DataFrame per side from two team-info dicts.

    Args:
        home_team_info (dict): Home team entry; must contain a 'players' list
            and may contain 'team_id'.
        away_team_info (dict): Away team entry with the same layout.

    Returns:
        dict: {'Home': DataFrame, 'Away': DataFrame}. Each frame has one row
              per player with the columns player, position, team, jID
              (shirt number), pID (player id), tID (team id) and xID
              (0-based position of the player in the 'players' list).
    """
    def _roster_frame(side_label, team_info):
        # One record per player; key order fixes the column order of the frame.
        team_id = team_info.get('team_id')
        records = [
            {
                'player': member['full_name_en'],
                'position': member['initial_position_name'],
                'team': side_label,
                'jID': member['shirt_number'],
                'pID': member['player_id'],
                'tID': team_id,
                'xID': list_position,
            }
            for list_position, member in enumerate(team_info['players'])
        ]
        return pd.DataFrame(records)

    home_df = _roster_frame('Home', home_team_info)
    away_df = _roster_frame('Away', away_team_info)
    return {'Home': home_df, 'Away': away_df}

# Build the Home/Away roster tables from the 'home_team' and 'away_team' metadata entries.
teams_dict = create_team_dataframe(meta_data['home_team'], meta_data['away_team'])
        

In [136]:
# Display the home roster table.
teams_dict['Home']

Unnamed: 0,player,position,team,jID,pID,tID,xID
0,Dongwon Ji,CF,Home,10,500530,4220,0
1,Ju Yeop Kim,,Home,24,77715,4220,1
2,Taehan Kim,CB,Home,15,500547,4220,2
3,Joon Soo Ahn,GK,Home,13,500552,4220,3
4,Hyunyong Lee,CB,Home,30,354820,4220,4
5,Young Woo Jang,,Home,26,500540,4220,5
6,Gyowon Han,RW,Home,71,532196,4220,6
7,Yoonho Jo,RWB,Home,4,500543,4220,7
8,Jungwoo Ha,,Home,39,503230,4220,8
9,Minki Jeong,,Home,99,532195,4220,9


In [137]:
# Display the away roster table.
teams_dict['Away']

Unnamed: 0,player,position,team,jID,pID,tID,xID
0,Vitor Gabriel,CF,Away,10,412017,4643,0
1,Youngbin Kim,,Away,2,145673,4643,1
2,Franko Kovačević,CF,Away,9,529741,4643,2
3,Junseo Jin,RB,Away,15,528959,4643,3
4,Marko Tuci,CB,Away,74,408792,4643,4
5,Insoo Yu,LW,Away,17,343587,4643,5
6,Gihyuk Lee,CB,Away,13,500115,4643,6
7,Sangheon Lee,CAM,Away,22,500117,4643,7
8,Gyeongmin Kim,,Away,19,528612,4643,8
9,Cheonghyo Park,,Away,21,500116,4643,9


In [140]:
# Normalise the event folder name to "<match_id>_event" so later cells can rely on it.
new_path_name = f"{match_id}_event"
event_data_path = f"{match_path}/{new_path_name}"

# Look for a folder whose name contains "Event" (capitalised). The already-renamed
# lowercase "<match_id>_event" folder does not match, so re-running this cell is a no-op.
# NOTE(review): the original delivery name is assumed to contain "Event" -- confirm.
original_event_dir = next((f for f in os.listdir(match_path) if "Event" in f), None)
if original_event_dir and original_event_dir != new_path_name:
    os.rename(f"{match_path}/{original_event_dir}", event_data_path)

# Always report the normalised full path. Previously, after a rename, the stale
# pre-rename directory name (without its parent path) was kept and printed.
print(event_data_path)

/home/work/data/MHL/bepro/153381/153381_event


In [141]:
# Event files live in "<match_path>/<match_id>_event".
event_data_path = f"{match_path}/{match_id}_event"

# Pick the first file whose name mentions each half; None when nothing matches.
first_half_event_path = next(
    (name for name in os.listdir(event_data_path) if "1st" in name),
    None,
)
second_half_event_path = next(
    (name for name in os.listdir(event_data_path) if "2nd" in name),
    None,
)

# load_single_json prints the outcome and returns None on failure.
first_half_event_data = load_single_json(f"{event_data_path}/{first_half_event_path}")
second_half_event_data = load_single_json(f"{event_data_path}/{second_half_event_path}")

Successfully loaded file: '/home/work/data/MHL/bepro/153381/153381_event/2024-11-09_SuwonFC vs GangwonFC_1st Half.json'
Successfully loaded file: '/home/work/data/MHL/bepro/153381/153381_event/2024-11-09_SuwonFC vs GangwonFC_2nd Half.json'


In [142]:
# Check Event Data

# Show the top-level keys of the first-half event document.
print(first_half_event_data.keys())

# The event records sit under the 'data' key; build one DataFrame per half from them.
first_half_event_df = pd.DataFrame(first_half_event_data['data'])
second_half_event_df = pd.DataFrame(second_half_event_data['data'])

# Preview the first rows of the first-half events as the cell output.
first_half_event_df.head()

dict_keys(['data'])


Unnamed: 0,period_type,period_name,period_order,period_duration,period_start_time,event_time,team_name,player_shirt_number,player_name,events,x,y,to_x,to_y,attack_direction
0,Half,1st Half,0,2700000,0,600,Suwon FC,10,Dongwon Ji,"[{'event_name': 'Passes', 'property': {'Outcom...",0.4998,0.4996,0.4984,0.5279,RIGHT
1,Half,1st Half,0,2700000,0,1133,Suwon FC,70,Anderson Oliveira,"[{'event_name': 'Passes Received', 'property':...",0.4984,0.5279,,,RIGHT
2,Half,1st Half,0,2700000,0,2033,Gangwon FC,10,Vitor Gabriel,"[{'name': 'VHIR', 'property': {'duration': 500...",0.4981,0.5887,0.5005,0.5421,LEFT
3,Half,1st Half,0,2700000,0,2700,Suwon FC,70,Anderson Oliveira,"[{'event_name': 'Passes', 'property': {'Outcom...",0.5041,0.5278,0.3257,0.5752,RIGHT
4,Half,1st Half,0,2700000,0,4067,Suwon FC,14,Bitgaram Yoon,"[{'event_name': 'Passes Received', 'property':...",0.3257,0.5752,,,RIGHT


In [143]:
def load_jsonl(file_path):
    """
    Reads a JSON Lines (.jsonl) file into a list of Python objects.

    Every non-blank line is parsed on its own. Blank lines are skipped
    silently; lines that cannot be parsed are skipped after printing a
    warning that names the 1-based line number.

    Args:
        file_path (str): Location of the .jsonl file.

    Returns:
        list: Parsed objects in file order. Empty when the file is missing or
              holds no parsable line. If reading fails part-way through, the
              objects parsed up to that point are returned.
    """
    records = []
    try:
        # The context manager closes the handle on every exit path.
        with open(file_path, mode='r', encoding='utf-8') as stream:
            line_number = 0  # 1-based once incremented; used only for warnings
            for raw_line in stream:
                line_number += 1
                text = raw_line.strip()
                if text:
                    try:
                        # json.loads parses a string (json.load would expect a file object).
                        records.append(json.loads(text))
                    except json.JSONDecodeError:
                        print(f"Warning: Skipping line {line_number} in '{file_path}' due to JSON decoding error.")
                    except Exception as e:
                        print(f"Warning: An unexpected error occurred processing line {line_number} in '{file_path}': {e}. Skipping line.")
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
    except Exception as e:
        # Any other failure while opening or iterating the file.
        print(f"An error occurred while reading the file '{file_path}': {e}")

    return records

In [144]:
# Load the raw per-frame tracking records, one list per half
# (files "<match_id>_1_frame_data.jsonl" and "<match_id>_2_frame_data.jsonl").
first_half_tracking_data = load_jsonl(f"{match_path}/{match_id}_1_frame_data.jsonl")
second_half_tracking_data = load_jsonl(f"{match_path}/{match_id}_2_frame_data.jsonl")

In [145]:
# First half: number of frames, the keys of the first frame, and its scalar fields.
print(f"# of frames : {len(first_half_tracking_data)}")
print(f"Keys : {first_half_tracking_data[0].keys()}")
for key in ['period_order', 'match_time', 'frame_index', 'ball_state']:
    print(f"{key} : {first_half_tracking_data[0][key]}")

# of frames : 88226
Keys : dict_keys(['period_order', 'match_time', 'frame_index', 'ball_state', 'players', 'balls'])
period_order : 0
match_time : 0
frame_index : 0
ball_state : None


In [146]:
# Second half: same summary as for the first half (frame count, keys, scalar fields of the first frame).
print(f"# of frames : {len(second_half_tracking_data)}")
print(f"Keys : {second_half_tracking_data[0].keys()}")
for key in ['period_order', 'match_time', 'frame_index', 'ball_state']:
    print(f"{key} : {second_half_tracking_data[0][key]}")

# of frames : 88168
Keys : dict_keys(['period_order', 'match_time', 'frame_index', 'ball_state', 'players', 'balls'])
period_order : 1
match_time : 2700000
frame_index : 81000
ball_state : neutral


In [147]:
# For the first frame of the first half: how many entries each object list holds,
# plus the first entry as a sample when the list is not empty.
for key in ['players', 'balls']:
    print(f"{key} : {len(first_half_tracking_data[0][key])}")
    if len(first_half_tracking_data[0][key]) != 0:
        print(f"{key} : {first_half_tracking_data[0][key][0]}")

players : 21
players : {'object': 'PLAYER', 'player_id': 85175, 'x': 97.8814, 'y': 32.1887, 'speed': 1.4158}
balls : 0


In [148]:
def create_tracking_dataframe(match_path, teams_dict):
    """
    Builds one long-format tracking DataFrame covering both halves of a match.

    Reads "<match_id>_1_frame_data.jsonl" and "<match_id>_2_frame_data.jsonl"
    from match_path via load_jsonl and emits one row per tracked object
    (player or ball) per frame.

    Args:
        match_path (str): Directory of a single match. Its last path component
            is used as the match id (kept as a string); a trailing separator
            is tolerated.
        teams_dict (dict): Mapping with 'Home' and 'Away' DataFrames that each
            have a non-empty 'tID' column, as returned by
            create_team_dataframe.

    Returns:
        pandas.DataFrame: One row per object per frame. Columns are the
            frame-level fields game_id, period_id, match_time, frame_id,
            ball_state ('alive' or 'dead') and ball_owning_team_id, followed
            by the object's own fields (without 'object' and 'player_id') and
            an 'id' column holding the player id or the literal 'ball'.
    """
    # normpath drops a trailing separator so ".../153381/" still yields "153381".
    match_id = os.path.basename(os.path.normpath(match_path))
    first_half_tracking_data = load_jsonl(f"{match_path}/{match_id}_1_frame_data.jsonl")
    second_half_tracking_data = load_jsonl(f"{match_path}/{match_id}_2_frame_data.jsonl")

    # Team ids do not change during a match: look them up once, not per frame.
    home_team_id = teams_dict['Home']['tID'].iloc[0]
    away_team_id = teams_dict['Away']['tID'].iloc[0]

    all_object_rows = []

    for half_tracking_data in (first_half_tracking_data, second_half_tracking_data):
        for frame_data in tqdm(half_tracking_data):
            # 1. Collapse the raw ball state into alive/dead and resolve the owning team.
            ball_state = frame_data.get('ball_state')
            if ball_state is None or ball_state == 'out':
                new_ball_state = 'dead'
                ball_owning_team_id = None
            else:
                new_ball_state = 'alive'
                if ball_state == 'home':
                    ball_owning_team_id = home_team_id
                elif ball_state == 'away':
                    ball_owning_team_id = away_team_id
                else:
                    # Any other in-play state (e.g. 'neutral') is passed through unchanged.
                    ball_owning_team_id = ball_state

            # 2. Frame-level fields shared by every object row of this frame.
            frame_info = {
                'game_id': match_id,
                # period_order is 0-based in the frame data; period_id is 1-based.
                'period_id': frame_data.get('period_order') + 1,
                'match_time': frame_data.get('match_time'),
                'frame_id': frame_data.get('frame_index'),
                'ball_state': new_ball_state,
                'ball_owning_team_id': ball_owning_team_id,
            }

            # 3. One output row per player / ball entry of the frame.
            for object_key in ('players', 'balls'):
                # `or []` covers both a missing key and an empty/null list.
                for object_data in frame_data.get(object_key) or []:
                    row_data = {**frame_info, **object_data}
                    if object_key == 'balls':
                        row_data['id'] = 'ball'
                    else:
                        row_data['id'] = row_data['player_id']
                    # Defaulted pops: an entry lacking these keys (e.g. a ball
                    # without 'player_id') must not abort the whole conversion.
                    row_data.pop('object', None)
                    row_data.pop('player_id', None)
                    all_object_rows.append(row_data)

    tracking_df = pd.DataFrame(all_object_rows)
    return tracking_df

In [149]:
# Build the long-format tracking table for the selected match and preview its first rows.
tracking_df = create_tracking_dataframe(match_path, teams_dict)
tracking_df.head()

100%|██████████| 88226/88226 [00:00<00:00, 93460.71it/s] 
100%|██████████| 88168/88168 [00:00<00:00, 89823.82it/s]


Unnamed: 0,game_id,period_id,match_time,frame_id,ball_state,ball_owning_team_id,x,y,speed,id
0,153381,1,0,0,dead,,97.8814,32.1887,1.4158,85175
1,153381,1,0,0,dead,,65.8979,13.9988,0.0627,187794
2,153381,1,0,0,dead,,31.0232,21.5441,0.8194,287282
3,153381,1,0,0,dead,,51.1605,14.5683,0.3967,343587
4,153381,1,0,0,dead,,50.2939,65.4722,0.231,354809
