import os
import torch
import tensorflow as tf

class Config:
    AUTHOR = "colum2131"

    NAME = "NFLC-" + "lgbm+xgb"

    COMPETITION = "nfl-player-contact-detection"

    seed = 42
    num_fold = 10
    rf_params = {'max_depth': 9, 'max_leaf_nodes': 425, 'n_estimators': 332
        
    }
   

In [2]:
import os
import gc
import subprocess

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from IPython.display import Video, display

from scipy.optimize import minimize

from glob import glob
from tqdm import tqdm

from sklearn.model_selection import GroupKFold
from sklearn.metrics import (
    roc_auc_score,
    matthews_corrcoef,
)

import xgboost as xgb
import lightgbm as lgbm
from sklearn.ensemble import RandomForestClassifier as rf
import torch

if torch.cuda.is_available():
    import cupy 
    import cudf
    from cuml import ForestInference

In [3]:
def setup(cfg):
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # set dirs
    cfg.INPUT = f'../input/{cfg.COMPETITION}'
    cfg.EXP = cfg.NAME
    cfg.OUTPUT_EXP = cfg.NAME
    cfg.SUBMISSION = './'
    cfg.DATASET = '../input/'

    cfg.EXP_MODEL = os.path.join(cfg.EXP, 'model')
    cfg.EXP_FIG = os.path.join(cfg.EXP, 'fig')
    cfg.EXP_PREDS = os.path.join(cfg.EXP, 'preds')

    # make dirs
    for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
        
    return cfg

In [35]:
# ==============================
# function
# ==============================
# ref: https://www.kaggle.com/code/robikscube/nfl-player-contact-detection-getting-started
def add_contact_id(df):
    # Create contact ids
    df["contact_id"] = (
        df["game_play"]
        + "_"
        + df["step"].astype("str")
        + "_"
        + df["nfl_player_id_1"].astype("str")
        + "_"
        + df["nfl_player_id_2"].astype("str")
    )
    return df

def expand_contact_id(df):
    """
    Splits out contact_id into seperate columns.
    """
    df["game_play"] = df["contact_id"].str[:12]
    df["step"] = df["contact_id"].str.split("_").str[-3].astype("int")
    df["nfl_player_id_1"] = df["contact_id"].str.split("_").str[-2]
    df["nfl_player_id_2"] = df["contact_id"].str.split("_").str[-1]
    return df

# cross validation
def get_groupkfold(train, target_col, group_col, n_splits):
    kf = GroupKFold(n_splits=n_splits)
    generator = kf.split(train, train[target_col], train[group_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

# randomForest_code

def fit_rf(cfg, X, y, params, add_suffix=''):

    oof_pred = np.zeros(len(y), dtype=np.float32)
    for fold in sorted(cfg.folds.unique()):
        if fold == -1: continue
        idx_train = (cfg.folds!=fold)
        idx_valid = (cfg.folds==fold)
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]
        display(pd.Series(y_valid).value_counts())

        #lgbm_train = lgbm.Dataset(x_train, label=y_train)
        #lgbm_valid = lgbm.Dataset(x_valid, label=y_valid)
        #evals = [lgbm_train, lgbm_valid]

        model = rf(**params)
        clf = model.fit(x_train,y_train)
        model_path = '/kaggle/working/rf{}{}.pkl'.format(fold,add_suffix)
        joblib.dump(clf, model_path)

        pred_i = model.predict(x_valid)
        oof_pred[x_valid.index] = pred_i
        score = round(matthews_corrcoef(y, oof_pred.round()), 5)
        print(f'Performance of the prediction: {score}\n')
        del model; gc.collect()

    np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred{add_suffix}'), oof_pred)
    score = round(roc_auc_score(y, oof_pred), 5)
    print(f'All Performance of the prediction: {score}')
    return oof_pred

def pred_rf(X, data_dir, add_suffix=''):
    dirs = '/kaggle/working/*.pkl'
    models = glob(dirs)
    models = [joblib.load(modles) for model_path in models]
    preds = np.array([model.predict(X) for model in models])
    preds = np.mean(preds, axis=0)
    return preds



In [5]:
# ==============================
# read data
# ==============================
cfg = setup(Config)

if not torch.cuda.is_available():
    tr_tracking = pd.read_csv(os.path.join(cfg.INPUT, 'train_player_tracking.csv'), parse_dates=["datetime"])
    te_tracking = pd.read_csv(os.path.join(cfg.INPUT, 'test_player_tracking.csv'), parse_dates=["datetime"])
    # tr_helmets = pd.read_csv(os.path.join(cfg.INPUT, 'train_baseline_helmets.csv'))
    # te_helmets = pd.read_csv(os.path.join(cfg.INPUT, 'test_baseline_helmets.csv'))
    # tr_video_metadata = pd.read_csv(os.path.join(cfg.INPUT, 'train_video_metadata.csv'))
    # te_video_metadata = pd.read_csv(os.path.join(cfg.INPUT, 'test_video_metadata.csv'))
    sub = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))

    train = pd.read_csv(os.path.join(cfg.INPUT, 'train_labels.csv'), parse_dates=["datetime"])
    test = expand_contact_id(sub)
    
else:
    tr_tracking = cudf.read_csv(os.path.join(cfg.INPUT, 'train_player_tracking.csv'), parse_dates=["datetime"])
    te_tracking = cudf.read_csv(os.path.join(cfg.INPUT, 'test_player_tracking.csv'), parse_dates=["datetime"])
    # tr_helmets = cudf.read_csv(os.path.join(cfg.INPUT, 'train_baseline_helmets.csv'))
    # te_helmets = cudf.read_csv(os.path.join(cfg.INPUT, 'test_baseline_helmets.csv'))
    # tr_video_metadata = cudf.read_csv(os.path.join(cfg.INPUT, 'train_video_metadata.csv'))
    # te_video_metadata = cudf.read_csv(os.path.join(cfg.INPUT, 'test_video_metadata.csv'))
    sub = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))

    train = cudf.read_csv(os.path.join(cfg.INPUT, 'train_labels.csv'), parse_dates=["datetime"])
    test = cudf.DataFrame(expand_contact_id(sub))

The following code is used to create the features.  
Basically, the numerical features contained in player_tracking.csv are merged into player_id_1 and player_id_2 respectively.

In [6]:
# ==============================
# feature engineering
# ==============================
def create_features(df, tr_tracking, merge_col="step", use_cols=["x_position", "y_position"]):
    output_cols = []
    df_combo = (
        df.astype({"nfl_player_id_1": "str"})
        .merge(
            tr_tracking.astype({"nfl_player_id": "str"})[
                ["game_play", merge_col, "nfl_player_id",] + use_cols
            ],
            left_on=["game_play", merge_col, "nfl_player_id_1"],
            right_on=["game_play", merge_col, "nfl_player_id"],
            how="left",
        )
        .rename(columns={c: c+"_1" for c in use_cols})
        .drop("nfl_player_id", axis=1)
        .merge(
            tr_tracking.astype({"nfl_player_id": "str"})[
                ["game_play", merge_col, "nfl_player_id"] + use_cols
            ],
            left_on=["game_play", merge_col, "nfl_player_id_2"],
            right_on=["game_play", merge_col, "nfl_player_id"],
            how="left",
        )
        .drop("nfl_player_id", axis=1)
        .rename(columns={c: c+"_2" for c in use_cols})
        .sort_values(["game_play", merge_col, "nfl_player_id_1", "nfl_player_id_2"])
        .reset_index(drop=True)
    )
    output_cols += [c+"_1" for c in use_cols]
    output_cols += [c+"_2" for c in use_cols]
    
    if ("x_position" in use_cols) & ("y_position" in use_cols):
        index = df_combo['x_position_2'].notnull()
        if torch.cuda.is_available():
            index = index.to_array()
        distance_arr = np.full(len(index), np.nan)
        tmp_distance_arr = np.sqrt(
            np.square(df_combo.loc[index, "x_position_1"] - df_combo.loc[index, "x_position_2"])
            + np.square(df_combo.loc[index, "y_position_1"]- df_combo.loc[index, "y_position_2"])
        )
        if torch.cuda.is_available():
            tmp_distance_arr = tmp_distance_arr.to_array()
        distance_arr[index] = tmp_distance_arr
        df_combo['distance'] = distance_arr
        output_cols += ["distance"]
        
    df_combo['G_flug'] = (df_combo['nfl_player_id_2']=="G")
    output_cols += ["G_flug"]
    return df_combo, output_cols


use_cols = [
    'x_position', 'y_position', 'speed', 'distance',
    'direction', 'orientation', 'acceleration', 'sa'
]
train, feature_cols = create_features(train, tr_tracking, use_cols=use_cols)
test, feature_cols = create_features(test, te_tracking, use_cols=use_cols)
if torch.cuda.is_available():
    train = train.to_pandas()
    test = test.to_pandas()

display(train)

Unnamed: 0,contact_id,game_play,datetime,step,nfl_player_id_1,nfl_player_id_2,contact,x_position_1,y_position_1,speed_1,...,x_position_2,y_position_2,speed_2,distance_2,direction_2,orientation_2,acceleration_2,sa_2,distance,G_flug
0,58168_003392_0_37084_37211,58168_003392,2020-09-11 03:01:48.100,0,37084,37211,0,41.90,20.08,0.54,...,39.59,17.07,0.53,0.05,134.84,84.73,1.43,1.42,3.794232,False
1,58168_003392_0_37084_38556,58168_003392,2020-09-11 03:01:48.100,0,37084,38556,0,41.90,20.08,0.54,...,41.93,30.61,0.67,0.05,232.50,227.00,1.82,1.61,10.530043,False
2,58168_003392_0_37084_38567,58168_003392,2020-09-11 03:01:48.100,0,37084,38567,0,41.90,20.08,0.54,...,40.37,19.88,0.66,0.07,136.70,88.92,0.90,0.89,1.543017,False
3,58168_003392_0_37084_38590,58168_003392,2020-09-11 03:01:48.100,0,37084,38590,0,41.90,20.08,0.54,...,40.33,25.28,0.52,0.06,141.08,100.37,0.59,0.58,5.431841,False
4,58168_003392_0_37084_39947,58168_003392,2020-09-11 03:01:48.100,0,37084,39947,0,41.90,20.08,0.54,...,40.11,26.73,0.99,0.09,163.38,90.69,1.68,1.64,6.886697,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4721613,58582_003121_91_52500_52619,58582_003121,2021-10-12 02:42:29.100,91,52500,52619,0,58.74,40.11,1.34,...,58.90,22.09,1.32,0.14,74.21,47.63,0.96,-0.44,18.020710,False
4721614,58582_003121_91_52500_G,58582_003121,2021-10-12 02:42:29.100,91,52500,G,0,58.74,40.11,1.34,...,,,,,,,,,,True
4721615,58582_003121_91_52609_52619,58582_003121,2021-10-12 02:42:29.100,91,52609,52619,0,60.32,25.93,1.38,...,58.90,22.09,1.32,0.14,74.21,47.63,0.96,-0.44,4.094142,False
4721616,58582_003121_91_52609_G,58582_003121,2021-10-12 02:42:29.100,91,52609,G,0,60.32,25.93,1.38,...,,,,,,,,,,True


# Exclude distance > 2
if the distance between two players is greater than 2 then the probability of contact is so low, we will consider it = 0, training data will be reduced from 4.7 M rows to 660 K

In [8]:
DISTANCE_THRESH = 2

train_y = train['contact'].values
oof_pred = np.zeros(len(train))
cond_dis_train = (train['distance']<=DISTANCE_THRESH) | (train['distance'].isna())
cond_dis_test = (test['distance']<=DISTANCE_THRESH) | (test['distance'].isna())

train = train[cond_dis_train]
train.reset_index(inplace = True, drop = True)

print('number of train data : ',len(train))

_ = gc.collect()

number of train data :  660560


# Helmet track Features

In [9]:
CLUSTERS = [10, 50, 100, 300, 500]

def add_step_pct(df, cluster):
    df['step_pct'] = cluster * (df['step']-min(df['step']))/(max(df['step'])-min(df['step']))
    df['step_pct'] = df['step_pct'].apply(np.ceil).astype(np.int32)
    return df

for cluster in CLUSTERS:
    train = train.groupby('game_play').apply(lambda x:add_step_pct(x,cluster))
    test = test.groupby('game_play').apply(lambda x:add_step_pct(x,cluster))

    for helmet_view in ['Sideline', 'Endzone']:
        helmet_train = pd.read_csv('/kaggle/input/nfl-player-contact-detection/train_baseline_helmets.csv')
        helmet_train.loc[helmet_train['view']=='Endzone2','view'] = 'Endzone'
        helmet_test = pd.read_csv('/kaggle/input/nfl-player-contact-detection/test_baseline_helmets.csv')
        helmet_test.loc[helmet_test['view']=='Endzone2','view'] = 'Endzone'

        helmet_train.rename(columns = {'frame': 'step'}, inplace = True)
        helmet_train = helmet_train.groupby('game_play').apply(lambda x:add_step_pct(x,cluster))
        helmet_test.rename(columns = {'frame': 'step'}, inplace = True)
        helmet_test = helmet_test.groupby('game_play').apply(lambda x:add_step_pct(x,cluster))
        helmet_train = helmet_train[helmet_train['view']==helmet_view]
        helmet_test = helmet_test[helmet_test['view']==helmet_view]

        helmet_train['helmet_id'] = helmet_train['game_play'] + '_' + helmet_train['nfl_player_id'].astype(str) + '_' + helmet_train['step_pct'].astype(str)
        helmet_test['helmet_id'] = helmet_test['game_play'] + '_' + helmet_test['nfl_player_id'].astype(str) + '_' + helmet_test['step_pct'].astype(str)

        helmet_train = helmet_train[['helmet_id', 'left', 'width', 'top', 'height']].groupby('helmet_id').mean().reset_index()
        helmet_test = helmet_test[['helmet_id', 'left', 'width', 'top', 'height']].groupby('helmet_id').mean().reset_index()
        for player_ind in [1, 2]:
            train['helmet_id'] = train['game_play'] + '_' + train['nfl_player_id_'+str(player_ind)].astype(str) + \
                                    '_' + train['step_pct'].astype(str)
            test['helmet_id'] = test['game_play'] + '_' + test['nfl_player_id_'+str(player_ind)].astype(str) + \
                                    '_' + test['step_pct'].astype(str)

            train = train.merge(helmet_train, how = 'left')
            test = test.merge(helmet_test, how = 'left')

            train.rename(columns = {i:i+'_'+helmet_view+'_'+str(cluster)+'_'+str(player_ind) for i in ['left', 'width', 'top', 'height']}, inplace = True)
            test.rename(columns = {i:i+'_'+helmet_view+'_'+str(cluster)+'_'+str(player_ind) for i in ['left', 'width', 'top', 'height']}, inplace = True)

            del train['helmet_id'], test['helmet_id']
            gc.collect()

            feature_cols += [i+'_'+helmet_view+'_'+str(cluster)+'_'+str(player_ind) for i in ['left', 'width', 'top', 'height']]
        del helmet_train, helmet_test
        gc.collect()

# Fill missing values for the ground

In [10]:
# 이 코드는 충돌 판정 시, 양쪽 선수의 위치 정보를 비슷하게 만들어주는 역할을 합니다.
for cluster in CLUSTERS:
    for helmet_view in ['Sideline', 'Endzone']:
        train.loc[train['G_flug']==True,'left_'+helmet_view+'_'+str(cluster)+'_2'] = train.loc[train['G_flug']==True,'left_'+helmet_view+'_'+str(cluster)+'_1']
        train.loc[train['G_flug']==True,'top_'+helmet_view+'_'+str(cluster)+'_2'] = train.loc[train['G_flug']==True,'top_'+helmet_view+'_'+str(cluster)+'_1']
        train.loc[train['G_flug']==True,'width_'+helmet_view+'_'+str(cluster)+'_2'] = 0
        train.loc[train['G_flug']==True,'height_'+helmet_view+'_'+str(cluster)+'_2'] = 0
        
        test.loc[test['G_flug']==True,'left_'+helmet_view+'_'+str(cluster)+'_2'] = test.loc[test['G_flug']==True,'left_'+helmet_view+'_'+str(cluster)+'_1']
        test.loc[test['G_flug']==True,'top_'+helmet_view+'_'+str(cluster)+'_2'] = test.loc[test['G_flug']==True,'top_'+helmet_view+'_'+str(cluster)+'_1']
        test.loc[test['G_flug']==True,'width_'+helmet_view+'_'+str(cluster)+'_2'] = 0
        test.loc[test['G_flug']==True,'height_'+helmet_view+'_'+str(cluster)+'_2'] = 0

# Diffrence & Product features

In [11]:
cols = [i[:-2] for i in train.columns if i[-2:]=='_1' and i!='nfl_player_id_1']
train[[i+'_diff' for i in cols]] = np.abs(train[[i+'_1' for i in cols]].values - train[[i+'_2' for i in cols]].values)
test[[i+'_diff' for i in cols]] = np.abs(test[[i+'_1' for i in cols]].values - test[[i+'_2' for i in cols]].values)
feature_cols += [i+'_diff' for i in cols]

cols = ['x_position', 'y_position', 'speed', 'distance', 'direction', 'orientation', 'acceleration', 'sa']
train[[i+'_prod' for i in cols]] = train[[i+'_1' for i in cols]].values * train[[i+'_2' for i in cols]].values
test[[i+'_prod' for i in cols]] = test[[i+'_1' for i in cols]].values * test[[i+'_2' for i in cols]].values
feature_cols += [i+'_prod' for i in cols]

print('number of features : ',len(feature_cols))
print('number of train data : ',len(train))

number of features :  154
number of train data :  660560


In [15]:
train = train.fillna(0)

In [16]:
train

Unnamed: 0,contact_id,game_play,datetime,step,nfl_player_id_1,nfl_player_id_2,contact,x_position_1,y_position_1,speed_1,...,top_Endzone_500_diff,height_Endzone_500_diff,x_position_prod,y_position_prod,speed_prod,distance_prod,direction_prod,orientation_prod,acceleration_prod,sa_prod
0,58168_003392_0_37084_38567,58168_003392,2020-09-11 03:01:48.100,0,37084,38567,0,41.90,20.08,0.54,...,29.0,17.0,1691.5030,399.1904,0.3564,0.0042,34542.7230,23324.6052,0.8280,0.8010
1,58168_003392_0_37084_G,58168_003392,2020-09-11 03:01:48.100,0,37084,G,0,41.90,20.08,0.54,...,0.0,17.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
2,58168_003392_0_37211_46445,58168_003392,2020-09-11 03:01:48.100,0,37211,46445,0,39.59,17.07,0.53,...,29.0,1.0,1597.0606,308.6256,0.5830,0.0050,20081.7212,7828.2047,2.9029,2.8826
3,58168_003392_0_37211_G,58168_003392,2020-09-11 03:01:48.100,0,37211,G,0,39.59,17.07,0.53,...,0.0,33.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
4,58168_003392_0_38556_G,58168_003392,2020-09-11 03:01:48.100,0,38556,G,0,41.93,30.61,0.67,...,0.0,23.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
660555,58582_003121_91_48220_G,58582_003121,2021-10-12 02:42:29.100,91,48220,G,0,33.18,25.26,2.55,...,0.0,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
660556,58582_003121_91_52493_G,58582_003121,2021-10-12 02:42:29.100,91,52493,G,0,65.04,38.68,1.31,...,0.0,39.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
660557,58582_003121_91_52500_G,58582_003121,2021-10-12 02:42:29.100,91,52500,G,0,58.74,40.11,1.34,...,0.0,44.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
660558,58582_003121_91_52609_G,58582_003121,2021-10-12 02:42:29.100,91,52609,G,0,60.32,25.93,1.38,...,0.0,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000


# Train & infer  RandomForest


In [None]:
!pip install joblib

In [13]:
import joblib


In [36]:
# ==============================
# training & inference
# ==============================

cfg.folds = get_groupkfold(train, 'contact', 'game_play', cfg.num_fold)
cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'), index=False)

oof_pred[np.where(cond_dis_train)] = fit_rf(cfg, train[feature_cols], train['contact'], cfg.rf_params,
                                               add_suffix="_rf_1st")
np.save('oof_pred.npy',oof_pred)
sub_pred = pred_rf(test.loc[cond_dis_test, feature_cols], cfg.EXP_MODEL, add_suffix="_rf_1st")

0    60071
1     6277
Name: contact, dtype: int64

Performance of the prediction: 0.16826



0    59431
1     6820
Name: contact, dtype: int64

Performance of the prediction: 0.24196



0    59131
1     6882
Name: contact, dtype: int64

Performance of the prediction: 0.29498



0    59991
1     6065
Name: contact, dtype: int64

Performance of the prediction: 0.33847



0    59857
1     6178
Name: contact, dtype: int64

Performance of the prediction: 0.37153



0    59495
1     6573
Name: contact, dtype: int64

Performance of the prediction: 0.40654



0    58761
1     6986
Name: contact, dtype: int64

Performance of the prediction: 0.44191



0    59359
1     6608
Name: contact, dtype: int64

Performance of the prediction: 0.47445



0    60895
1     5127
Name: contact, dtype: int64

Performance of the prediction: 0.50348



0    59257
1     6796
Name: contact, dtype: int64

Performance of the prediction: 0.53171

All Performance of the prediction: 0.69529


NameError: name 'modles' is not defined

In [24]:
model

NameError: name 'model' is not defined

In [32]:
!cd /kaggle/working/NFLC-lgbm+xgb/


In [33]:
!ls -a

.   .virtual_documents	__notebook_source__.ipynb
..  NFLC-lgbm+xgb	oof_pred.npy


# Submission

In [None]:
#==============================
#optimize
#==============================
def func(x_list):
    score = matthews_corrcoef(train_y, oof_pred>x_list[0])
    return -score

x0 = [0.5]
result = minimize(func, x0,  method="nelder-mead")
cfg.threshold = result.x[0]
print("score:", round(matthews_corrcoef(train_y, oof_pred>cfg.threshold), 5))
print("threshold", round(cfg.threshold, 5))
del train
gc.collect()

test = add_contact_id(test)
test['contact'] = 0
test.loc[cond_dis_test, 'contact'] = (sub_pred > 0.27).astype(int)
test[['contact_id', 'contact']].to_csv('submission.csv', index=False)
display(test[['contact_id', 'contact']].head())