In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install optuna==2.10.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna==2.10.1
  Downloading optuna-2.10.1-py3-none-any.whl (308 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m308.2/308.2 KB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting alembic
  Downloading alembic-1.9.2-py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.6/210.6 KB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cliff
  Downloading cliff-4.1.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.0/81.0 KB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 KB[0m [31m8.8 MB/

## 1. Tabular Data & Modules Load



In [None]:
import os, sys
from pathlib import Path

# Set up path
try:
    base_path = Path(__file__).resolve().parent.parent
except:
    %cd '/content/drive/MyDrive/Colab Notebooks/kaggle/Player-Contact-Detection/tutorial'
    base_path = Path('__file__').resolve().parent.parent

module_path = base_path / 'module'
data_path = base_path / 'data'

# Append system path 
sys.path.append(str(module_path))

/content/drive/MyDrive/Colab Notebooks/kaggle/Player-Contact-Detection/tutorial


In [132]:
# Import data analysis libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef
import tqdm
from ensemble import BinaryCalssifier
# from jtlearn import ImbSampler

In [None]:
# Load datasets
tr_baseline_helmets = pd.read_csv(data_path / 'train_baseline_helmets.csv')
tr_labels = pd.read_csv(data_path / 'train_labels.csv')
tr_player_tracking = pd.read_csv(data_path / 'train_player_tracking.csv')
tr_video_metadata = pd.read_csv(data_path / 'train_video_metadata.csv')

te_baseline_helmets = pd.read_csv(data_path / 'test_baseline_helmets.csv')
te_player_tracking = pd.read_csv(data_path / 'test_player_tracking.csv')
te_video_metadata = pd.read_csv(data_path / 'test_video_metadata.csv')

submission = pd.read_csv(data_path / 'sample_submission.csv')

In [None]:
def inplace(func):
    def wrapper(data, inplace=False):                           # 호출할 함수를 감싸는 함수
        if inplace:
            func(data, inplace)
        else:
            df = data.copy()
            func(df, inplace)
            return df
    return wrapper                           # wrapper 함수 반환

## contact id 펼치는 함수
@inplace
def expand_contact_id(df, inplace: bool=False):
    """
    Splits out contact_id into seperate columns.
    """
    df['game_play'] = df['contact_id'].str[:12]
    df[['step', 'nfl_player_id_1', 'nfl_player_id_2']] = df.contact_id.str.split('_', expand=True).iloc[:, 2:]
    df['step'] = df['step'].astype('int')
    df['nfl_player_id_1'] = df['nfl_player_id_1'].astype('int')
    df['group_id'] = df.apply(lambda x: '_'.join([x.game_play, x.nfl_player_id_1, x.nfl_player_id_2]),
                              axis=1)

    temp = df['contact']
    df.drop(columns=['contact'], inplace=True)
    df['contact'] = temp
    return df

## contact_id 만드는 함수
@inplace
def create_contact_id(df, inplace: bool=False):
    # Create contact ids
    cols = list(df)
    if df.nfl_player_id_2.dtype == 'str':
        df.loc[:, 'contact_id'] = (
            df.loc[:, 'game_play']
            + '_'
            + df.loc[:, 'step'].astype('str')
            + '_'
            + df.loc[:, 'nfl_player_id_1'].astype('str')
            + '_'
            + df.loc[:, 'nfl_player_id_2']
        )
    else:
        df.loc[:, 'contact_id'] = (
            df.loc[:, 'game_play']
            + '_'
            + df.loc[:, 'step'].astype('str')
            + '_'
            + df.loc[:, 'nfl_player_id_1'].astype('str')
            + '_'
            + df.loc[:, 'nfl_player_id_2'].astype('str')
        )

    temp = df[cols]
    df.drop(columns=cols, inplace=True)
    df[cols] = temp
    del temp
    return df

## contact id 만드는 함수
@inplace
def create_group_id(df, inplace: bool=False):
    # Create contact ids
    cols = list(df)
    if df.nfl_player_id_2.dtype == 'str':
        df.loc[:, 'group_id'] = (
            df.loc[:, 'game_play']
            + '_'
            + df.loc[:, 'nfl_player_id_1'].astype('str')
            + '_'
            + df.loc[:, 'nfl_player_id_2']
        )
    else:
        df.loc[:, 'group_id'] = (
            df.loc[:, 'game_play']
            + '_'
            + df.loc[:, 'nfl_player_id_1'].astype('str')
            + '_'
            + df.loc[:, 'nfl_player_id_2'].astype('str')
        )

    temp = df[cols]
    df.drop(columns=cols, inplace=True)
    df[cols] = temp
    del temp
    return df

In [None]:
te_labels = submission.copy()
expand_contact_id(te_labels, inplace=True)
te_labels = te_labels[['contact_id', 'group_id', 'game_play', 'step',
                       'nfl_player_id_1', 'nfl_player_id_2', 'contact']]
display(te_labels)

create_group_id(tr_labels, inplace=True)
tr_labels.drop(columns=['datetime'], inplace=True)
tr_labels = tr_labels[['contact_id', 'group_id', 'game_play', 'step',
                       'nfl_player_id_1', 'nfl_player_id_2', 'contact']]
display(te_labels)

Unnamed: 0,contact_id,group_id,game_play,step,nfl_player_id_1,nfl_player_id_2,contact
0,58168_003392_0_38590_43854,58168_003392_38590_43854,58168_003392,0,38590,43854,0
1,58168_003392_0_38590_41257,58168_003392_38590_41257,58168_003392,0,38590,41257,0
2,58168_003392_0_38590_41944,58168_003392_38590_41944,58168_003392,0,38590,41944,0
3,58168_003392_0_38590_42386,58168_003392_38590_42386,58168_003392,0,38590,42386,0
4,58168_003392_0_38590_47944,58168_003392_38590_47944,58168_003392,0,38590,47944,0
...,...,...,...,...,...,...,...
49583,58172_003247_125_40656_G,58172_003247_40656_G,58172_003247,125,40656,G,0
49584,58172_003247_125_52521_G,58172_003247_52521_G,58172_003247,125,52521,G,0
49585,58172_003247_125_52939_G,58172_003247_52939_G,58172_003247,125,52939,G,0
49586,58172_003247_125_39008_G,58172_003247_39008_G,58172_003247,125,39008,G,0


Unnamed: 0,contact_id,group_id,game_play,step,nfl_player_id_1,nfl_player_id_2,contact
0,58168_003392_0_38590_43854,58168_003392_38590_43854,58168_003392,0,38590,43854,0
1,58168_003392_0_38590_41257,58168_003392_38590_41257,58168_003392,0,38590,41257,0
2,58168_003392_0_38590_41944,58168_003392_38590_41944,58168_003392,0,38590,41944,0
3,58168_003392_0_38590_42386,58168_003392_38590_42386,58168_003392,0,38590,42386,0
4,58168_003392_0_38590_47944,58168_003392_38590_47944,58168_003392,0,38590,47944,0
...,...,...,...,...,...,...,...
49583,58172_003247_125_40656_G,58172_003247_40656_G,58172_003247,125,40656,G,0
49584,58172_003247_125_52521_G,58172_003247_52521_G,58172_003247,125,52521,G,0
49585,58172_003247_125_52939_G,58172_003247_52939_G,58172_003247,125,52939,G,0
49586,58172_003247_125_39008_G,58172_003247_39008_G,58172_003247,125,39008,G,0


In [None]:
# 야드 to 미터 변환
yard2meter = 0.9144
convert_columns = ['x_position', 'y_position', 'speed', 'distance', 'acceleration', 'sa']
tr_player_tracking[convert_columns] = tr_player_tracking[convert_columns] * yard2meter
te_player_tracking[convert_columns] = te_player_tracking[convert_columns] * yard2meter
display(tr_player_tracking)
display(te_player_tracking)

Unnamed: 0,game_play,game_key,play_id,nfl_player_id,datetime,step,team,position,jersey_number,x_position,y_position,speed,distance,direction,orientation,acceleration,sa
0,58580_001136,58580,1136,44830,2021-10-10T21:08:20.900Z,-108,away,CB,22,56.317896,38.953440,1.014984,0.100584,320.33,263.93,0.649224,-0.585216
1,58580_001136,58580,1136,47800,2021-10-10T21:08:20.900Z,-108,away,DE,97,54.388512,24.515064,0.210312,0.009144,346.84,247.16,1.179576,0.822960
2,58580_001136,58580,1136,52444,2021-10-10T21:08:20.900Z,-108,away,FS,29,66.010536,28.767024,0.557784,0.054864,11.77,247.69,0.576072,-0.301752
3,58580_001136,58580,1136,46206,2021-10-10T21:08:20.900Z,-108,home,TE,86,52.459128,20.226528,0.338328,0.036576,127.85,63.63,0.630936,0.566928
4,58580_001136,58580,1136,52663,2021-10-10T21:08:20.900Z,-108,away,ILB,48,57.835800,25.146000,0.466344,0.045720,183.62,253.71,0.283464,0.283464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1353048,58575_003081,58575,3081,48476,2021-10-10T19:22:06.800Z,140,away,CB,40,66.092832,47.365920,1.453896,0.155448,345.36,342.68,0.448056,-0.374904
1353049,58575_003081,58575,3081,44887,2021-10-10T19:22:06.800Z,140,away,ILB,45,68.360544,49.834800,0.896112,0.091440,342.39,354.46,0.576072,-0.219456
1353050,58575_003081,58575,3081,44174,2021-10-10T19:22:06.800Z,140,away,ILB,49,68.159376,50.977800,1.024128,0.100584,352.79,349.20,0.576072,0.420624
1353051,58575_003081,58575,3081,45217,2021-10-10T19:22:06.800Z,140,away,TE,82,73.554336,44.595288,1.764792,0.182880,340.78,346.51,0.484632,-0.466344


Unnamed: 0,game_play,game_key,play_id,nfl_player_id,datetime,step,team,position,jersey_number,x_position,y_position,speed,distance,direction,orientation,acceleration,sa
0,58172_003247,58172,3247,41937,2020-09-13T19:30:20.200Z,-272,home,MLB,57,58.777632,10.323576,4.407408,0.420624,20.74,12.43,2.862072,2.743200
1,58172_003247,58172,3247,45345,2020-09-13T19:30:20.200Z,-272,away,ILB,50,69.521832,33.595056,2.880360,0.292608,179.52,184.91,1.709928,-1.655064
2,58172_003247,58172,3247,46205,2020-09-13T19:30:20.200Z,-272,home,DE,98,60.021216,14.392656,1.645920,0.182880,6.73,339.85,0.630936,-0.621792
3,58172_003247,58172,3247,43406,2020-09-13T19:30:20.200Z,-272,home,WR,14,33.238440,13.661136,1.655064,0.164592,330.50,344.03,0.566928,0.356616
4,58172_003247,58172,3247,48233,2020-09-13T19:30:20.200Z,-272,away,RB,45,69.585840,29.397960,3.072384,0.310896,166.59,180.56,0.804672,-0.795528
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14867,58168_003392,58168,3392,43395,2020-09-11T03:01:57.300Z,92,away,FS,23,46.661832,12.463272,1.554480,0.155448,21.01,18.32,0.320040,-0.182880
14868,58168_003392,58168,3392,39947,2020-09-11T03:01:57.300Z,92,home,T,72,39.072312,16.934688,1.088136,0.118872,331.22,332.97,0.530352,-0.512064
14869,58168_003392,58168,3392,44822,2020-09-11T03:01:57.300Z,92,home,QB,15,32.013144,17.080992,1.353312,0.137160,165.22,162.66,0.347472,-0.064008
14870,58168_003392,58168,3392,38590,2020-09-11T03:01:57.300Z,92,home,G,70,39.977568,16.678656,1.408176,0.146304,341.08,336.00,0.374904,-0.374904


In [None]:
def convert_angular_direction(theta):
    angle = 90 - theta
    if angle < 0:
        angle = 360 + angle
    return angle

# 각도 변환
angle_columns = ['direction', 'orientation']
tr_player_tracking[angle_columns] = tr_player_tracking[angle_columns].applymap(convert_angular_direction)
te_player_tracking[angle_columns] = te_player_tracking[angle_columns].applymap(convert_angular_direction)

## 2. Data preprocessing

In [None]:
## 지면 충돌
tr_ground_contact = tr_labels[tr_labels.nfl_player_id_2 == 'G']
te_ground_contact = te_labels[te_labels.nfl_player_id_2 == 'G']

## 선수간 충돌
tr_players_contact = tr_labels[tr_labels.nfl_player_id_2 != 'G']
te_players_contact = te_labels[te_labels.nfl_player_id_2 != 'G']

tr_players_contact.nfl_player_id_2 = tr_players_contact.nfl_player_id_2.astype(int)
te_players_contact.nfl_player_id_2 = te_players_contact.nfl_player_id_2.astype(int)

tr_ground_contact

Unnamed: 0,contact_id,group_id,game_play,step,nfl_player_id_1,nfl_player_id_2,contact
231,58168_003392_0_38590_G,58168_003392_38590_G,58168_003392,0,38590,G,0
232,58168_003392_0_43854_G,58168_003392_43854_G,58168_003392,0,43854,G,0
233,58168_003392_0_41257_G,58168_003392_41257_G,58168_003392,0,41257,G,0
234,58168_003392_0_41944_G,58168_003392_41944_G,58168_003392,0,41944,G,0
235,58168_003392_0_42386_G,58168_003392_42386_G,58168_003392,0,42386,G,0
...,...,...,...,...,...,...,...
4721613,58582_003121_91_48220_G,58582_003121_48220_G,58582_003121,91,48220,G,0
4721614,58582_003121_91_47906_G,58582_003121_47906_G,58582_003121,91,47906,G,0
4721615,58582_003121_91_38557_G,58582_003121_38557_G,58582_003121,91,38557,G,0
4721616,58582_003121_91_47872_G,58582_003121_47872_G,58582_003121,91,47872,G,0


In [None]:
te_players_contact

Unnamed: 0,contact_id,group_id,game_play,step,nfl_player_id_1,nfl_player_id_2,contact
0,58168_003392_0_38590_43854,58168_003392_38590_43854,58168_003392,0,38590,43854,0
1,58168_003392_0_38590_41257,58168_003392_38590_41257,58168_003392,0,38590,41257,0
2,58168_003392_0_38590_41944,58168_003392_38590_41944,58168_003392,0,38590,41944,0
3,58168_003392_0_38590_42386,58168_003392_38590_42386,58168_003392,0,38590,42386,0
4,58168_003392_0_38590_47944,58168_003392_38590_47944,58168_003392,0,38590,47944,0
...,...,...,...,...,...,...,...
49561,58172_003247_125_47912_48241,58172_003247_47912_48241,58172_003247,125,47912,48241,0
49562,58172_003247_125_47912_47920,58172_003247_47912_47920,58172_003247,125,47912,47920,0
49563,58172_003247_125_47912_48335,58172_003247_47912_48335,58172_003247,125,47912,48335,0
49564,58172_003247_125_47912_52521,58172_003247_47912_52521,58172_003247,125,47912,52521,0


### Physical Analysis

In [None]:
def acc_perpendicular(direct: float, acc: float, sa: float) -> float:
    res = np.sqrt(acc ** 2 - sa ** 2)
    if direct < 0:
        res = -res
    return res

def vectorization(data: pd.DataFrame) -> pd.DataFrame:
    deg2rad = np.pi / 180
    df = data.copy()

    df['x_vel'] = df.apply(lambda x: x.speed * np.cos(x.direction * deg2rad),
                           axis=1)
    df['y_vel'] = df.apply(lambda x: x.speed * np.sin(x.direction * deg2rad),
                           axis=1)

    df['sign_perpendicular'] = df.direction.diff().replace({0: np.nan})
    df['sign_perpendicular'] = df['sign_perpendicular'].fillna(method='ffill').fillna(method='bfill')
    df['sa_perpendicular'] = df.apply(lambda x: acc_perpendicular(x.sign_perpendicular, x.acceleration, x.sa),
                                axis=1)

    df['x_acc'] = df.apply(lambda x: x.sa * np.cos(x.direction * deg2rad), axis=1) + \
                  df.apply(lambda x: x.sa_perpendicular * np.cos((90 + x.direction) * deg2rad), axis=1)
    df['y_acc'] = df.apply(lambda x: x.sa * np.sin(x.direction * deg2rad), axis=1) + \
                  df.apply(lambda x: x.sa_perpendicular * np.sin((90 + x.direction) * deg2rad), axis=1)
    
    df = df.drop(columns=['sign_perpendicular', 'sa_perpendicular'])
    return df

In [None]:
def make_full_tracking(tracking):
    tracking_ls = []
    for game_play in tracking.game_play.unique():
        game_df = tracking[tracking.game_play == game_play]
        temp_ls = []

        for player_id in game_df.nfl_player_id.unique():
            game_player_df = game_df[game_df.nfl_player_id == player_id]
            temp_ls.append(vectorization(game_player_df))

        temp_df = pd.concat(temp_ls, axis=0)
        temp_df = temp_df.sort_values(by='step', ascending=True)
        tracking_ls.append(temp_df)

    result = pd.concat(tracking_ls, axis=0)
    return result.sort_index()

train_player_tracking = make_full_tracking(tr_player_tracking)
test_player_tracking = make_full_tracking(te_player_tracking)

In [None]:
train_player_tracking

NameError: ignored

In [None]:
train_player_tracking.drop(columns=['Unnamed: 0'])

Unnamed: 0,game_play,game_key,play_id,nfl_player_id,datetime,step,team,position,jersey_number,x_position,...,speed,distance,direction,orientation,acceleration,sa,x_vel,y_vel,x_acc,y_acc
0,58580_001136,58580,1136,44830,2021-10-10T21:08:20.900Z,-108,away,CB,22,56.317896,...,1.014984,0.100584,129.67,186.07,0.649224,-0.585216,-0.647930,0.781268,0.157213,-0.629901
1,58580_001136,58580,1136,47800,2021-10-10T21:08:20.900Z,-108,away,DE,97,54.388512,...,0.210312,0.009144,103.16,202.84,1.179576,0.822960,-0.047882,0.204789,0.635509,0.993745
2,58580_001136,58580,1136,52444,2021-10-10T21:08:20.900Z,-108,away,FS,29,66.010536,...,0.557784,0.054864,78.23,202.31,0.576072,-0.301752,0.113779,0.546056,-0.541953,-0.195309
3,58580_001136,58580,1136,46206,2021-10-10T21:08:20.900Z,-108,home,TE,86,52.459128,...,0.338328,0.036576,322.15,26.37,0.630936,0.566928,0.267151,-0.207597,0.277754,-0.566510
4,58580_001136,58580,1136,52663,2021-10-10T21:08:20.900Z,-108,away,ILB,48,57.835800,...,0.466344,0.045720,266.38,196.29,0.283464,0.283464,-0.029444,-0.465414,-0.017898,-0.282898
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1353048,58575_003081,58575,3081,48476,2021-10-10T19:22:06.800Z,140,away,CB,40,66.092832,...,1.453896,0.155448,104.64,107.32,0.448056,-0.374904,-0.367465,1.406692,-0.142638,-0.424745
1353049,58575_003081,58575,3081,44887,2021-10-10T19:22:06.800Z,140,away,ILB,45,68.360544,...,0.896112,0.091440,107.61,95.54,0.576072,-0.219456,-0.271106,0.854118,0.574066,-0.048031
1353050,58575_003081,58575,3081,44174,2021-10-10T19:22:06.800Z,140,away,ILB,49,68.159376,...,1.024128,0.100584,97.21,100.80,0.576072,0.420624,-0.128535,1.016030,-0.443296,0.367897
1353051,58575_003081,58575,3081,45217,2021-10-10T19:22:06.800Z,140,away,TE,82,73.554336,...,1.764792,0.182880,109.22,103.49,0.484632,-0.466344,-0.580963,1.666425,0.028993,-0.483764


In [None]:
train_player_tracking.to_csv(data_path / 'train_player_tracking_ver2.csv', index=False)
test_player_tracking.to_csv(data_path / 'test_player_tracking_ver2.csv', index=False)

In [133]:
def square(x):
    return x ** 2

def is_opposite(team1, team2):
    return 0 if team1 == team2 else 1

def relative_quantative(data1: pd.DataFrame, data2: pd.DataFrame) -> pd.DataFrame:
    # 동일 game_play, step에 대해 player1, player2에 대한 row of tracking data를 input으로 받는다.
    # position, velocitiy, acceleration
    # game_play, nfl_player_id, datetime, step
    # if isinstance(data1, pd.Series) and isinstance(data2, pd.Series):
    #     raise TypeError('Input data type must be Series.')
    # elif (data1.game_play != data2.game_play) or (data1.step != data2.step):
    #     raise ValueError('Time range between two dataframes does not match.')
    df1 = data1.reset_index(drop=True)
    df2 = data2.reset_index(drop=True)

    rel_df = df1.loc[:, ['game_play', 'step']]
    rel_df.loc[:, 'nfl_player_id_1'] = df1.nfl_player_id
    rel_df.loc[:, 'nfl_player_id_2'] = df2.nfl_player_id
    create_contact_id(rel_df, inplace=True)
    
    # 선수1과 선수2 사이의 변위, 상대속도, 상대가속도
    vector_quantative = ['x_position', 'y_position', 'x_vel', 'y_vel',
                         'x_acc', 'y_acc', 'direction', 'orientation']
    # 선수1과 선수2 사이의 거리, 상대속력, 상대가속력
    scalar_quantative = ['distance', 'speed', 'acceleration']

    # 선수1과 선수2 사이의 변위, 상대속도, 상대가속도 계산
    for idx, q in enumerate(vector_quantative):
        rel_df.loc[:, q] = df1.loc[:, q] - df2.loc[:, q]
        if (idx % 2 == 1) and (idx < 7):
            prev_q = vector_quantative[idx//2]
            scalar = scalar_quantative[idx//2]
            rel_df.loc[:, scalar] = rel_df.loc[:, q].apply(square) \
            + rel_df.loc[:, prev_q].apply(square)
            rel_df.loc[:, scalar] = rel_df.loc[:, scalar].apply(np.sqrt)

    team_df = pd.concat([df1.team, df2.team], axis=1)
    team_df.columns = ['team1', 'team2']

    rel_df.loc[:, 'opposite'] = team_df.apply(lambda x: is_opposite(x.team1, x.team2), axis=1)
    
    del team_df
    return rel_df

def ground_quantative(data: pd.DataFrame) -> pd.DataFrame:
    # 동일 game_play, step에 대해 player1, player2에 대한 row of tracking data를 input으로 받는다.
    # position, velocitiy, acceleration
    # game_play, nfl_player_id, datetime, step
    # if isinstance(data1, pd.Series) and isinstance(data2, pd.Series):
    #     raise TypeError('Input data type must be Series.')
    # elif (data1.game_play != data2.game_play) or (data1.step != data2.step):
    #     raise ValueError('Time range between two dataframes does not match.')
    df = data.reset_index(drop=True)
    res_df = df[['game_play', 'step']]
    res_df.loc[:, 'nfl_player_id_1'] = df.nfl_player_id
    res_df['nfl_player_id_2'] = 'G'
    create_contact_id(res_df, inplace=True)

    cols = ['x_position', 'y_position', 'distance',
            'x_vel', 'y_vel', 'speed',
            'x_acc', 'y_acc', 'acceleration',
            'direction', 'orientation']
    res_df.loc[:, cols] = df.loc[:, cols]
    res_df['opposite'] = 1

    return res_df

In [154]:
def filtering(quant_df, windows, id_cols, physics_cols):
    id_df = quant_df.loc[windows:, id_cols]
    if windows <= 1:
        rolling_df = quant_df.loc[:, physics_cols].diff()
    else:
        rolling_df = quant_df.loc[:, physics_cols].rolling(windows+1).mean()
    rolling_df = rolling_df.dropna()
    quant_df = pd.concat([id_df, rolling_df], axis=1)
    quant_df.reset_index(drop=True, inplace=True)

def make_graph_tracking_files(tracking, contact,
                              rolling: bool=False, 
                              windows: int=0, 
                              ground=False):
    final_ls = []

    id_cols = ['contact_id', 'game_play', 'step', 'nfl_player_id_1', 'nfl_player_id_2']

    if 'opposite' in tracking.columns.tolist():
        id_cols.append('opposite')

    physics_cols = ['x_position', 'y_position', 'distance',
                    'x_vel', 'y_vel', 'speed',
                    'x_acc', 'y_acc', 'acceleration',
                    'direction', 'orientation']
    
    for game_play in contact.game_play.unique():
        game_df = tracking.loc[tracking.game_play == game_play, :]
        
        # 필요한 frame만 뽑아내기 위함
        step_start = -windows if windows > 0 else 0
        game_df = game_df.loc[game_df.step >= step_start, :]
        
        # 비교군(추후 merge하기 위함)
        compare_df = contact.loc[contact.game_play == game_play, :]

        players1 = compare_df.nfl_player_id_1.unique()
        temp_ls = []

        for player1 in players1:
            player1_df = game_df.loc[game_df.nfl_player_id == player1, :]
            players2 = compare_df.loc[compare_df.nfl_player_id_1 == player1,
                                      'nfl_player_id_2'].unique()

            if ground:
                quant_df = ground_quantative(player1_df)

                if rolling:
                    filtering(quant_df, windows, id_cols, physics_cols)

                temp_ls.append(quant_df)
            else:
                #-------- only for contact between players --------#
                for player2 in players2:
                    player2_df = game_df.loc[game_df.nfl_player_id == player2, :]
                    quant_df = relative_quantative(player1_df, player2_df)
                    
                    if rolling:
                        filtering(quant_df, windows, id_cols, physics_cols)
                        
                    temp_ls.append(quant_df)

        temp_df = pd.concat(temp_ls, axis=0, ignore_index=True)
        result = pd.merge(compare_df, temp_df.loc[:, ['contact_id'] + physics_cols],
                          on='contact_id', how='inner')
        del temp_df
        final_ls.append(result)

    return pd.concat(final_ls, axis=0, ignore_index=True)

In [156]:
def make_train_dataset(windows):
    tr_filename1 = 'train_player_contact_tracking'
    tr_filename2 = 'train_ground_contact_tracking'
    te_filename1 = 'test_player_contact_tracking'
    te_filename2 = 'test_ground_contact_tracking'
    
    if windows == 0:
        tr_filename1 += '.csv'
        tr_filename2 += '.csv'
        te_filename1 += '.csv'
        te_filename2 += '.csv'
    else:
        tr_filename1 += f'_{windows}.csv'
        tr_filename2 += f'_{windows}.csv'
        te_filename1 += f'_{windows}.csv'
        te_filename2 += f'_{windows}.csv'
    
    rolling = True if windows > 0 else False

    tr_players_relative_tracking = make_graph_tracking_files(train_player_tracking,
                                                             tr_players_contact,
                                                             rolling=rolling,
                                                             windows=windows)
    
    tr_ground_relative_tracking = make_graph_tracking_files(train_player_tracking,
                                                            tr_ground_contact,
                                                            rolling=rolling,
                                                            windows=windows,
                                                            ground=True)
    
    te_players_relative_tracking = make_graph_tracking_files(test_player_tracking,
                                                             te_players_contact,
                                                             rolling=rolling,
                                                             windows=windows)
    
    te_ground_relative_tracking = make_graph_tracking_files(test_player_tracking,
                                                             te_ground_contact,
                                                             rolling=rolling,
                                                             windows=windows,
                                                             ground=True)

    # save files
    tr_players_relative_tracking.to_csv(data_path / tr_filename1, index=False)
    te_players_relative_tracking.to_csv(data_path / te_filename1, index=False)
    tr_ground_relative_tracking.to_csv(data_path / tr_filename2, index=False)
    te_ground_relative_tracking.to_csv(data_path / te_filename2, index=False)

In [161]:
from tqdm.notebook import tqdm

for window in tqdm(range(11)):
    make_train_dataset(window)

  0%|          | 0/11 [00:00<?, ?it/s]

## Load Preprocessing Data

In [None]:
train_player_tracking = pd.read_csv(data_path / 'train_player_tracking_ver2.csv')
test_player_tracking = pd.read_csv(data_path / 'test_player_tracking_ver2.csv')

tr_players_relative_tracking = pd.read_csv(data_path / 'train_player_contact_tracking.csv')
te_players_relative_tracking = pd.read_csv(data_path / 'test_player_contact_tracking.csv')