In [2]:
import os
import json
import pandas as pd
import yaml

In [7]:
def load_config(config_path):
    """
    Loads a YAML configuration file.

    Parameters:
        - config_path (str): Path to the YAML configuration file

    Returns:
        - The object produced by ``yaml.safe_load`` for the file's contents
          (``None`` if the file is empty).
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so the same config file parses identically on every machine.
    with open(config_path, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file)
    return config

# Read the project configuration once and pull out the two directories used
# by the conversion and merge steps.
config = load_config('config.yaml')
data_paths = config['data_paths']
json_dir = data_paths['json_directory']
output_dir = data_paths['output_directory_dataframes']

In [5]:
# Column order of the per-match CSV. Every row is emitted in exactly this
# order, so a column's position here is its position in the output file.
_EVENT_COLUMNS = (
    "event_id",
    "match_id",
    "match_period",
    "minute",
    "second",
    "match_timestamp",
    "video_timestamp",
    "primary_type",
    "secondary_types",
    "location_x",
    "location_y",
    "team_id",
    "team_name",
    "team_formation",
    "opponent_team_id",
    "opponent_team_name",
    "opponent_team_formation",
    "player_id",
    "player_name",
    "player_position",
    "pass_accurate",
    "pass_angle",
    "pass_length",
    "recipient_id",
    "recipient_name",
    "recipient_position",
    "end_location_x",
    "end_location_y",
    "shot_body_part",
    "is_goal",
    "on_target",
    "goal_zone",
    "xg",
    "post_shot_xg",
    "goalkeeper_action_id",
    "goalkeeper",
    "opponent_id_ground_duel",
    "opponent_name_ground_duel",
    "opponent_position_ground_duel",
    "duel_type_ground_duel",
    "kept_possession_ground_duel",
    "progressed_with_ball_ground_duel",
    "stopped_progress_ground_duel",
    "recovered_possession_ground_duel",
    "take_on_ground_duel",
    "side_ground_duel",
    "related_duel_id_ground_duel",
    "opponent_id_aerial_duel",
    "opponent_name_aerial_duel",
    "opponent_position_aerial_duel",
    "opponent_height_aerial_duel",
    "first_touch_aerial_duel",
    "height_aerial_duel",
    "related_duel_id_aerial_duel",
    "yellow_card",
    "red_card",
    "infraction_type",
    "opponent_id_infraction",
    "opponent_name_infraction",
    "opponent_position_infraction",
    "carry_progression",
    "end_location_x_carry",
    "end_location_y_carry",
    "possession_id",
    "possession_duration",
    "possession_types",
    "events_number",
    "event_index",
    "start_location_x",
    "start_location_y",
    "end_location_x_possession",
    "end_location_y_possession",
    "team_id_possession",
    "team_name_possession",
    "team_formation_possession",
    "with_shot",
    "with_shot_on_goal",
    "with_goal",
    "flank",
    "xg_possession",
)


def _event_to_record(event):
    """
    Flattens one event dict into a ``{column_name: value}`` mapping.

    Every column in ``_EVENT_COLUMNS`` starts out as ``None`` for each event,
    so an optional section that is absent (or ``None``) leaves its columns
    empty instead of inheriting values from a previously processed event.

    Parameters:
        - event (dict): A single entry of the ``events`` list

    Returns:
        - dict: One value per name in ``_EVENT_COLUMNS``

    Raises:
        - KeyError: If a key that is read unconditionally is missing
    """
    record = dict.fromkeys(_EVENT_COLUMNS)

    # --- Fields read from every event ---------------------------------------
    event_type = event['type']
    # 'secondary' may be absent or None; only join when a list is present.
    secondary = event_type.get('secondary')
    team = event['team']
    opponent_team = event['opponentTeam']
    player = event['player']
    record.update({
        'event_id': event['id'],
        'match_id': event['matchId'],
        'match_period': event['matchPeriod'],
        'minute': event['minute'],
        'second': event['second'],
        'match_timestamp': event['matchTimestamp'],
        'video_timestamp': event['videoTimestamp'],
        'primary_type': event_type['primary'],
        'secondary_types': ', '.join(secondary) if secondary is not None else None,
        'team_id': team['id'],
        'team_name': team['name'],
        'team_formation': team['formation'],
        'opponent_team_id': opponent_team['id'],
        'opponent_team_name': opponent_team['name'],
        'opponent_team_formation': opponent_team['formation'],
        'player_id': player['id'],
        'player_name': player['name'],
        'player_position': player['position'],
    })

    # --- Optional sections: skipped when the key is missing or None ---------
    location = event.get('location')
    if location is not None:
        record['location_x'] = location['x']
        record['location_y'] = location['y']

    pass_info = event.get('pass')
    if pass_info is not None:
        recipient = pass_info['recipient']
        pass_end = pass_info['endLocation']
        record.update({
            'pass_accurate': pass_info['accurate'],
            'pass_angle': pass_info['angle'],
            'pass_length': pass_info['length'],
            'recipient_id': recipient['id'],
            'recipient_name': recipient['name'],
            'recipient_position': recipient['position'],
            'end_location_x': pass_end['x'],
            'end_location_y': pass_end['y'],
        })

    shot = event.get('shot')
    if shot is not None:
        record.update({
            'shot_body_part': shot['bodyPart'],
            'is_goal': shot['isGoal'],
            'on_target': shot['onTarget'],
            'goal_zone': shot['goalZone'],
            'xg': shot['xg'],
            'post_shot_xg': shot['postShotXg'],
            'goalkeeper_action_id': shot['goalkeeperActionId'],
            # Stored as-is; whatever object the feed provides ends up
            # stringified in the CSV.
            'goalkeeper': shot['goalkeeper'],
        })

    ground_duel = event.get('groundDuel')
    if ground_duel is not None:
        record.update({
            'duel_type_ground_duel': ground_duel['duelType'],
            'kept_possession_ground_duel': ground_duel['keptPossession'],
            'progressed_with_ball_ground_duel': ground_duel['progressedWithBall'],
            'stopped_progress_ground_duel': ground_duel['stoppedProgress'],
            'recovered_possession_ground_duel': ground_duel['recoveredPossession'],
            'take_on_ground_duel': ground_duel['takeOn'],
            'side_ground_duel': ground_duel['side'],
            # NOTE(review): this column used to be always empty. Assumes the
            # ground duel carries 'relatedDuelId' like the aerial duel does;
            # .get() keeps the column None when the key is not there.
            'related_duel_id_ground_duel': ground_duel.get('relatedDuelId'),
        })
        opponent = ground_duel.get('opponent')
        if opponent is not None:
            record['opponent_id_ground_duel'] = opponent['id']
            record['opponent_name_ground_duel'] = opponent['name']
            record['opponent_position_ground_duel'] = opponent['position']

    aerial_duel = event.get('aerialDuel')
    if aerial_duel is not None:
        record.update({
            'first_touch_aerial_duel': aerial_duel['firstTouch'],
            'height_aerial_duel': aerial_duel['height'],
            'related_duel_id_aerial_duel': aerial_duel['relatedDuelId'],
        })
        # Guarded the same way as the ground-duel and infraction opponents.
        opponent = aerial_duel.get('opponent')
        if opponent is not None:
            record['opponent_id_aerial_duel'] = opponent['id']
            record['opponent_name_aerial_duel'] = opponent['name']
            record['opponent_position_aerial_duel'] = opponent['position']
            record['opponent_height_aerial_duel'] = opponent['height']

    infraction = event.get('infraction')
    if infraction is not None:
        record.update({
            'yellow_card': infraction['yellowCard'],
            'red_card': infraction['redCard'],
            'infraction_type': infraction['type'],
        })
        opponent = infraction.get('opponent')
        if opponent is not None:
            record['opponent_id_infraction'] = opponent['id']
            record['opponent_name_infraction'] = opponent['name']
            record['opponent_position_infraction'] = opponent['position']

    carry = event.get('carry')
    if carry is not None:
        carry_end = carry['endLocation']
        record.update({
            'carry_progression': carry['progression'],
            'end_location_x_carry': carry_end['x'],
            'end_location_y_carry': carry_end['y'],
        })

    possession = event.get('possession')
    if possession is not None:
        possession_start = possession['startLocation']
        possession_end = possession['endLocation']
        possession_team = possession['team']
        record.update({
            'possession_id': possession['id'],
            'possession_duration': possession['duration'],
            # Kept as the raw value (not joined), so it is written to the CSV
            # in its Python string form.
            'possession_types': possession['types'],
            'events_number': possession['eventsNumber'],
            'event_index': possession['eventIndex'],
            'start_location_x': possession_start['x'],
            'start_location_y': possession_start['y'],
            'end_location_x_possession': possession_end['x'],
            'end_location_y_possession': possession_end['y'],
            'team_id_possession': possession_team['id'],
            'team_name_possession': possession_team['name'],
            'team_formation_possession': possession_team['formation'],
        })
        attack = possession.get('attack')
        if attack is not None:
            record.update({
                'with_shot': attack['withShot'],
                'with_shot_on_goal': attack['withShotOnGoal'],
                'with_goal': attack['withGoal'],
                'flank': attack['flank'],
                'xg_possession': attack['xg'],
            })

    return record


def process_json_file(file_path, output_dir):
    """
    Processes a JSON file containing event data, flattens every event into one
    row, and writes the resulting table as a CSV file.

    The output file is named after the input file: its extension is dropped and
    ``_data.csv`` is appended (``match.json`` -> ``match_data.csv``).

    Parameters:
        - file_path (str): Path to the JSON file containing event data; the
          top-level object must have an ``events`` list
        - output_dir (str): Path to the existing directory the CSV is written to

    Returns:
        - None. The result is the CSV file written to ``output_dir``.

    Raises:
        - KeyError: If ``events`` or an unconditionally read event key is missing
    """
    with open(file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    # One list per event, ordered like _EVENT_COLUMNS.
    rows = []
    for event in data['events']:
        record = _event_to_record(event)
        rows.append([record[column] for column in _EVENT_COLUMNS])

    df = pd.DataFrame(rows, columns=list(_EVENT_COLUMNS))

    # Strip only the extension; a blanket str.replace would also rewrite any
    # '.json' occurring in the middle of the file name.
    stem = os.path.splitext(os.path.basename(file_path))[0]
    output_file_path = os.path.join(output_dir, stem + '_data.csv')
    df.to_csv(output_file_path, index=False)

In [6]:
# Convert every JSON export found in json_dir into a per-match CSV in output_dir;
# anything that is not a .json file is ignored.
for filename in os.listdir(json_dir):
    if not filename.endswith('.json'):
        continue
    json_file_path = os.path.join(json_dir, filename)
    process_json_file(json_file_path, output_dir)

Creating one dataframe:

In [8]:
def merge_dataframes_from_directory(directory):
    """
    Merges the CSV files of the dataframes from the directory into one large DataFrame.

    Files are read in sorted file-name order so that the row order of the
    merged result is reproducible; ``os.listdir`` on its own returns entries
    in arbitrary order.

    Parameters:
    - directory (str): Path to the directory containing CSV files.

    Returns:
    - merged_df (pd.DataFrame): Merged DataFrame containing data from all CSV files,
      with a fresh 0..n-1 index.

    Raises:
    - ValueError: If the directory contains no ``.csv`` files.
    """
    # Every entry ending in .csv is picked up, in a deterministic order.
    csv_files = sorted(name for name in os.listdir(directory) if name.endswith('.csv'))

    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" (same exception type as before).
    if not csv_files:
        raise ValueError("No CSV files found in directory: {!r}".format(directory))

    frames = [pd.read_csv(os.path.join(directory, name)) for name in csv_files]

    merged_df = pd.concat(frames, ignore_index=True)
    return merged_df
    

In [9]:
# Stack all per-match CSVs from output_dir and persist the combined table.
# NOTE(review): unlike the per-match files (written with index=False), this
# write keeps pandas' default index column — confirm downstream readers expect
# it before changing.
# NOTE(review): the merge reads every .csv in output_dir; if output_path points
# inside that directory, a re-run will ingest the previous merged file too — verify
# against config.yaml.
output_path = config['data_paths']['output_file_full_dataframe']
merged_dataframe = merge_dataframes_from_directory(output_dir)
merged_dataframe.to_csv(output_path)