In [1]:
## loading dependent packages
import pandas as pd
from pandas import DataFrame, Series
import glob
import os
import re

In [2]:
## defining the data directory: a relative path (resolved against the notebook's working directory) to the
## folder that the glob patterns below search for the weekly input/output csv files
data_dir: str = '../data/train/'

In [3]:
## empty accumulator: the loading loop appends one merged (input + output) dataframe per week to it
merged_df_list: list[pd.DataFrame] = list()

In [4]:
## collecting every weekly input file (glob pattern input_2023_w*.csv) found in the data directory,
## then ordering the paths lexicographically so that the weeks come out in ascending order
input_files: list[str] = glob.glob(os.path.join(data_dir, 'input_2023_w*.csv'))
input_files.sort()

In [5]:
## locating all files in the data directory that match the glob pattern output_2023_w*.csv (one file per week),
## sorted lexicographically by path so that their order lines up with the sorted input files
output_files: list[str] = sorted(glob.glob(os.path.join(data_dir, 'output_2023_w*.csv')))

In [6]:
## columns used as the merge keys: an input row and an output row are joined when all three values agree
join_keys: list[str] = [
    'game_id',
    'play_id',
    'nfl_id',
]

In [7]:
## the loading loop expects exactly one output file per input file, so unequal counts are flagged up front
same_file_count: bool = len(input_files) == len(output_files)
if not same_file_count:
    print('Warning: The number of input files does not match the number of output files')

In [None]:
## pairing every input file with the output file of the same week, merging the two on the join keys,
## and collecting one merged dataframe per week in merged_df_list

## starting from an empty list so that re-running this cell does not append every week a second time
merged_df_list.clear()

## indexing the output files by their week identifier (e.g. 'w01'); pairing by week instead of by position in
## the sorted lists prevents a single missing file from shifting every later week onto the wrong targets
week_pattern = re.compile(r'(w\d{2})')
output_file_by_week: dict[str, str] = {}
for output_file_path in output_files:
    output_match = week_pattern.search(os.path.basename(output_file_path))
    if output_match:
        output_file_by_week[output_match.group(0)] = output_file_path

## remembering which weeks were paired so that leftover output files can be reported afterwards
matched_weeks: set[str] = set()

for input_file_path in input_files:

    ## extracting the week identifier for clear matching
    match = week_pattern.search(os.path.basename(input_file_path))
    week = match.group(0) if match else 'Unknown_Week'

    ## skipping an input file without a same-week output file rather than merging it with another week's targets
    output_file_path = output_file_by_week.get(week)
    if output_file_path is None:
        print(f'Warning: No output file found for {week} ({os.path.basename(input_file_path)}). Skipping.')
        continue
    matched_weeks.add(week)

    try:
        ## loading the dataframes
        print(f'Processing {week}: Loading {os.path.basename(input_file_path)} and {os.path.basename(output_file_path)}')
        input_df: pd.DataFrame = pd.read_csv(input_file_path, engine = 'python')
        output_df: pd.DataFrame = pd.read_csv(output_file_path, engine = 'python')

        ## renaming the overlapping target columns in the output file dataframe so they do not collide with the
        ## input columns of the same name
        output_df = output_df.rename(columns = {'x': 'target_x',
                                                'y': 'target_y',
                                                'frame_id': 'target_frame_id'})

        ## merging the input and output dataframes on the specified keys through an inner merge to keep rows that
        ## exist in both the input and output files
        ## NOTE(review): frame_id is not a join key, so every input row appears to be paired with every output
        ## row of the same game/play/player -- confirm that this many-to-many expansion is intended
        merged_df: pd.DataFrame = pd.merge(
            input_df,
            output_df,
            on = join_keys,
            how = 'inner',
            suffixes = ('_input', '_output')
        )

        ## adding the merged dataframe to the dataframe list
        merged_df_list.append(merged_df)

        ## verifying that all columns are present in the merged dataframe; a non-key column that still exists
        ## in both files is renamed with the _input/_output suffix by the merge and is therefore reported here
        required_cols: list[str] = list(input_df.columns) + list(output_df.columns)
        missing_cols: list[str] = [col for col in required_cols if col not in merged_df.columns]

        if missing_cols:
            print(f'Warning: merged dataframe for {week} is missing expected columns: {missing_cols}')

        ## reporting the result for every merged week (previously this only ran when columns were missing)
        print(f'Successfully merged {week}, resulting in {len(merged_df)} rows')

    except FileNotFoundError:
        print(f'Error: File not found for {week}. Skipping.')
    except pd.errors.EmptyDataError:
        print(f'Error: One of the files for {week} is empty. Skipping.')
    except Exception as e:
        ## deliberate best-effort: one failing week is reported and must not abort the remaining weeks
        print(f'An unexpected error occurred during processing {week}: {e}')

## reporting output files whose week never appeared among the input files
unmatched_output_weeks: list[str] = sorted(set(output_file_by_week) - matched_weeks)
if unmatched_output_weeks:
    print(f'Warning: No input file found for output week(s): {unmatched_output_weeks}')

print('\n--- Final Summary ---')
print(f'Processing complete. Total {len(merged_df_list)} merged dataframes stored in "merged_df_list."')


Processing w01: Loading input_2023_w01.csv and output_2023_w01.csv
      game_id  play_id  player_to_predict  nfl_id  frame_id play_direction  \
0  2023090700      101               True   46137         1          right   
1  2023090700      101               True   46137         1          right   
2  2023090700      101               True   46137         1          right   
3  2023090700      101               True   46137         1          right   
4  2023090700      101               True   46137         1          right   

   absolute_yardline_number  player_name player_height  player_weight  ...  \
0                        42  Justin Reid           6-1            204  ...   
1                        42  Justin Reid           6-1            204  ...   
2                        42  Justin Reid           6-1            204  ...   
3                        42  Justin Reid           6-1            204  ...   
4                        42  Justin Reid           6-1            204  ...