# Module Import

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [35]:
!pip install optuna==2.10.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna==2.10.1
  Downloading optuna-2.10.1-py3-none-any.whl (308 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m308.2/308.2 KB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting alembic
  Downloading alembic-1.9.2-py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.6/210.6 KB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
Collecting cliff
  Downloading cliff-4.1.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.0/81.0 KB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting cmaes>=0.8.2
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 KB[0m [31m9.5 MB

In [None]:
import os, sys
from pathlib import Path

# Set up path
try:
    base_path = Path(__file__).resolve().parent.parent
except:
    %cd '/content/drive/MyDrive/Colab Notebooks/kaggle/Player-Contact-Detection/tutorial'
    base_path = Path('__file__').resolve().parent.parent

module_path = base_path / 'module'
data_path = base_path / 'data'
submission_path = base_path / 'submission'

# Append system path
sys.path.append(str(module_path))

/content/drive/MyDrive/Colab Notebooks/kaggle/Player-Contact-Detection/tutorial


In [36]:
# Import data analysis libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm

from ensemble import BinaryCalssifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier

# Load Fully-Connected train-test dataset

In [None]:
def inplace(func):
    def wrapper(data, inplace: bool=False):                           # 호출할 함수를 감싸는 함수
        if inplace:
            func(data, inplace)
        else:
            df = data.copy()
            func(df, inplace)
            return df
    return wrapper                           # wrapper 함수 반환

## contact id 만드는 함수
@inplace
def create_group_id(df, inplace: bool=False):
    # Create contact ids
    cols = list(df)
    if df.nfl_player_id_2.dtype == 'str':
        df.loc[:, 'group_id'] = (
            df.loc[:, "game_play"]
            + "_"
            + df.loc[:, "nfl_player_id_1"].astype("str")
            + "_"
            + df.loc[:, "nfl_player_id_2"]
        )
    else:
        df.loc[:, 'group_id'] = (
            df.loc[:, "game_play"]
            + "_"
            + df.loc[:, "nfl_player_id_1"].astype("str")
            + "_"
            + df.loc[:, "nfl_player_id_2"].astype("str")
        )

    temp = df[cols]
    df.drop(columns=cols, inplace=True)
    df[cols] = temp
    del temp
    return df

In [None]:
def load_data(data_path, windows: int=0):
    train_filename = 'train_player_contact_tracking'
    test_filename = 'test_player_contact_tracking'

    if windows == 0:
        suffix = '.csv'
    elif windows == 1:
        suffix = '_diff.csv'
    elif windows >= 2:
        suffix = f'_{windows}.csv'

    train_filename += suffix
    test_filename += suffix

    try:
        train = pd.read_csv(data_path / train_filename)
        test = pd.read_csv(data_path / test_filename)
    except Exception as e:
        print("Return basic dataset.")
        load_data(data_path, 0)
    return train, test

# Imbalance Processing

In [None]:
def preprocessing(data):
    temp = data.loc[data.contact == 1]
    index = temp.index.values.tolist()

    prev_i = index[0]
    sep = []

    for i in index:
        if (i - prev_i) > 1:
            sep.append(prev_i)

        prev_i = i

    drop_cols = ['contact_id', 'game_play', 'datetime', 'step', 'nfl_player_id_1', 'nfl_player_id_2']
    temp_ls = []

    for idx in sep:
        sr = data.loc[idx,:]

        game_play = sr.loc['game_play']
        nfl_player_id_1 = sr.loc['nfl_player_id_1']
        nfl_player_id_2 = sr.loc['nfl_player_id_2']

        df = data.loc[data.game_play == game_play, :]
        df = df.loc[df.nfl_player_id_1 == nfl_player_id_1, :]
        df = df.loc[df.nfl_player_id_2 == nfl_player_id_2, :]
        temp_ls.append(df)

    return pd.concat(temp_ls, axis=0, ignore_index=True).drop(columns=drop_cols)

In [None]:
def model_result(train: pd.DataFrame, test: pd.DataFrame,
                 data_path, windows: int, model: callable,
                 target_column: str='contact', save: bool=False,
                 **param) -> pd.DataFrame:
    drop_cols = ['contact_id', 'game_play', 'step', 'nfl_player_id_1', 'nfl_player_id_2']

    X = train.drop(columns=[target_column])
    y = train.contact
    X_test = test.drop(columns=drop_cols + [target_column])

    X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                      test_size=0.3,
                                                      random_state=42)

    cl = model(**param).fit(X_train, y_train)

    y_pred = cl.predict(X_val)

    score = mcc(y_val, y_pred)

    print(f'{str(model.split('.')[-1])} phi score is {score: .3f}')

    # y_test = cl.predict(X_test)

    # test.loc[:, target_column] = y_test
    # submission = test.loc[:, ['contact_id', target_column]]

    # try:
    #     num = len(os.listdir(submission_path)) + 1
    # except FileNotFoundError:
    #     num = 1

    # filename = f'submission_{num}.csv'

    # if save:
    #     submission.to_csv(base_path / filename, index=False)
    
    # return submission

In [None]:
windows = 0
train, test = load_data(data_path, windows)

train = train.sort_values(by=['game_play', 'nfl_player_id_1', 'nfl_player_id_2', 'step'])\
.reset_index(drop=True)
test = test.sort_values(by=['game_play', 'nfl_player_id_1', 'nfl_player_id_2', 'step'])\
.reset_index(drop=True)

train_balanced = preprocessing(train)


submission = model_result(train_balanced, test, XGBClassifier, save=True)
submission

<class 'xgboost.sklearn.XGBClassifier'> phi score is  0.671


ValueError: ignored

In [None]:
test

Unnamed: 0,contact_id,game_play,step,nfl_player_id_1,nfl_player_id_2,contact,x_position,y_position,distance,x_vel,y_vel,speed,x_acc,y_acc,acceleration
0,58168_003392_0_37084_37211,58168_003392,0,37084,37211,0,,,,,,,,,
1,58168_003392_1_37084_37211,58168_003392,1,37084,37211,0,,,,,,,,,
2,58168_003392_2_37084_37211,58168_003392,2,37084,37211,0,,,,,,,,,
3,58168_003392_3_37084_37211,58168_003392,3,37084,37211,0,,,,,,,,,
4,58168_003392_4_37084_37211,58168_003392,4,37084,37211,0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45271,58172_003247_121_52852_52939,58172_003247,121,52852,52939,0,,,,,,,,,
45272,58172_003247_122_52852_52939,58172_003247,122,52852,52939,0,,,,,,,,,
45273,58172_003247_123_52852_52939,58172_003247,123,52852,52939,0,,,,,,,,,
45274,58172_003247_124_52852_52939,58172_003247,124,52852,52939,0,,,,,,,,,


In [None]:
for model in [XGBClassifier, LGBMClassifier]:
    model_result(train_balanced, test, model)

<class 'xgboost.sklearn.XGBClassifier'> phi score is  0.671
<class 'lightgbm.sklearn.LGBMClassifier'> phi score is  0.678


In [None]:
for windows in [1, 3, 5, 10, 15, 20, 30, 50]:
    train, test = load_data(data_path, windows)

    train = train.sort_values(by=['game_play', 'nfl_player_id_1', 'nfl_player_id_2', 'step'])\
    .reset_index(drop=True)
    # test = test.sort_values(by=['game_play', 'nfl_player_id_1', 'nfl_player_id_2', 'step'])\
    # .reset_index(drop=True)

    train_balanced = preprocessing(train)

    print(f'#----------------- Windows: {windows: %-2s}-------------------#')
    for model in [XGBClassifier, LGBMClassifier]:
        model_result(train_balanced, test, model)
    print('#-------------------------------------------------#')
    print()

#----------------- Windows: 1 --------------------#
<class 'xgboost.sklearn.XGBClassifier'> phi score is  0.215
<class 'lightgbm.sklearn.LGBMClassifier'> phi score is  0.344
#-------------------------------------------------#

#----------------- Windows: 3 --------------------#
<class 'xgboost.sklearn.XGBClassifier'> phi score is  0.633
<class 'lightgbm.sklearn.LGBMClassifier'> phi score is  0.676
#-------------------------------------------------#

#----------------- Windows: 5 --------------------#
<class 'xgboost.sklearn.XGBClassifier'> phi score is  0.606
<class 'lightgbm.sklearn.LGBMClassifier'> phi score is  0.664
#-------------------------------------------------#

#----------------- Windows: 10 --------------------#
<class 'xgboost.sklearn.XGBClassifier'> phi score is  0.527
<class 'lightgbm.sklearn.LGBMClassifier'> phi score is  0.650
#-------------------------------------------------#

#----------------- Windows: 15 --------------------#
<class 'xgboost.sklearn.XGBClassifier'

In [None]:
submission