# Module Import

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
!nvidia-smi

In [1]:
import os, sys
from pathlib import Path

# Set up path
if os.getcwd() == '/content':
    %cd '/content/drive/MyDrive/Colab Notebooks/kaggle/Player-Contact-Detection/tutorial'
    base_path = Path('__file__').resolve().parent.parent
else:
    base_path = Path().resolve()

print(base_path)
module_path = base_path / 'module'
data_path = base_path / 'data'
submission_path = base_path / 'submission'

# Append system path
sys.path.append(str(module_path))

/Users/john/Library/CloudStorage/GoogleDrive-piy8117982@gmail.com/내 드라이브/Colab Notebooks/kaggle/Player-Contact-Detection


In [2]:
# Import data analysis libraries
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from ensemble import BinaryCalssifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import matthews_corrcoef as mcc
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from tqdm import tqdm
from xgboost.sklearn import XGBClassifier

# Load Fully-Connected train-test dataset

In [3]:
submission = pd.read_csv(data_path / 'sample_submission.csv')

def make_filename(windows: int=0, test: bool=False, ground: bool=False):
    train_test = 'test' if test else 'train'
    type_ = 'ground' if ground else 'player'

    filename = f'{train_test}_{type_}_contact_tracking'

    if windows == 0:
        suffix = '.csv'
    elif windows >= 1:
        suffix = f'_{windows}.csv'

    return filename + suffix

def load_data(data_path, windows: int=0, ground: bool=False):
    train_filename = make_filename(windows, test=False, ground=ground)
    test_filename = make_filename(windows, test=True, ground=ground)

    try:
        train = pd.read_csv(data_path / train_filename)
        test = pd.read_csv(data_path / test_filename)

    except Exception as e:
        print("Return basic dataset.")
        load_data(data_path, 0)
        
    train.sort_values(by=['group_id', 'step'], inplace=True)
    train.reset_index(drop=True, inplace=True)
    test.sort_values(by=['group_id', 'step'], inplace=True)
    test.reset_index(drop=True, inplace=True)

    return train, test

# Imbalance Processing

In [4]:
def preprocessing(data):
    temp = data.loc[data.contact == 1]
    index = temp.index.values.tolist()

    prev_i = index[0]
    group_ids = []

    for i in index:
        if (i - prev_i) > 1:
            sr = data.loc[prev_i,:]
            group_id = sr.loc['group_id']
            group_ids.append(group_id)
        prev_i = i
    
    drop_cols = ['contact_id', 'group_id', 'game_play',
                 'step', 'nfl_player_id_1', 'nfl_player_id_2']

    result = data.loc[data.group_id.isin(group_ids), :]
    result.drop(columns=drop_cols, inplace=True)
    return result

## RandomFroest & XGBoost & LightGBM

In [5]:
def make_weights(n_learners: int, N: int):
    # x+y+z = N인 음이 아닌 정수 (x, y, z) 순서쌍 만들기
    weights = []

    if n_learners == 3:
        for i in range(N+1):
            for j in range(N+1-i):
                k = N-i-j
                temp = [i/N, j/N, k/N]
                weights.append(temp)

    elif n_learners == 2:
        for i in range(N+1):
            j = N-i
            temp = [i/N, j/N]
            weights.append(temp)

    return weights

In [10]:
def model_result(train: pd.DataFrame, test: pd.DataFrame,
                 windows: int, save: bool=False) -> pd.DataFrame:
    drop_cols = ['contact_id', 'group_id', 'game_play',
                 'step', 'nfl_player_id_1', 'nfl_player_id_2']
    target_column = 'contact'

    X = train.drop(columns=[target_column])
    y = train.contact
    X_test = test.drop(columns=drop_cols + [target_column])

    # train, validation dataset split
    X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                      test_size=0.3,
                                                      random_state=42)

    # model initializing
    xgb = XGBClassifier()
    lgbm = LGBMClassifier()
    rf = RandomForestClassifier()
    
    clf = VotingClassifier(estimators=[('rf', rf), ('xgb', xgb), ('lgbm', lgbm)])
    
    # model training
    clf.fit(X_train, y_train)

    weights = make_weights(3, 5)
    grid_params = {'weights': weights}
    grid_Search = GridSearchCV(param_grid=grid_params,
                               estimator=clf,
                               scoring=mcc,
                               verbose=0,
                               refit=True)
    
    grid_Search.fit(X_train, y_train)
    
    # prediction
    y_train_pred = clf.predict(X_train)
    y_pred = clf.predict(X_val)

    # scoring
#     score_train = clf.score(y_train, y_train_pred)
#     score_val = clf.score(y_val, y_pred)
#     print("Train %s score is %.4f" % (clf.metric_, score_train))
#     print("Validation %s score is %.4f" % (clf.metric_, score_val))
    score_train = mcc(y_train, y_train_pred)
    score_val = mcc(y_val, y_pred)
    print("Train mcc score is %.4f" % (score_train))
    print("Validation mcc score is %.4f" % (score_val))

    y_test = clf.predict(X_test)

    test.loc[:, target_column] = y_test
    return test.loc[:, ['contact_id', target_column]], score_val

def save_submission(submission, windows,
                    test_player, test_ground):
    submission = pd.merge(submission.loc[:, 'contact_id'],
                          pd.concat([test_player, test_ground]),
                          on='contact_id', how='left')
    num = 1
    try:
        for filename in os.listdir(submission_path):
            if filename.contains(f'submission_win{windows}'):
                num += 1
    except FileNotFoundError:
        num = 1

    filename = f'submission_win{windows}_ver{num}.csv'
    submission.to_csv(base_path / 'submission' / filename, index=False)

In [None]:
score_ls = []

for windows in tqdm(range(11)):
    train_player, test_player = load_data(data_path, windows, ground=False)
    train_ground, test_ground = load_data(data_path, windows, ground=True)

    train_player_balanced = preprocessing(train_player)
    train_ground_balanced = preprocessing(train_ground)

    print(f'#----------------- Windows: {str(windows): <2s}-------------------#')
    print('#------ Model: contact between players -----#')
    player, score1 = model_result(train_player_balanced, test_player, windows)
    print()
    print('#------ Model: contact player-ground -------#')
    ground, score2 = model_result(train_ground_balanced, test_ground, windows)
    print('#-------------------------------------------------#')
    print()
    score_ls.append((score1, score2))
#     save_submission(submission, windows, player, ground)

  0%|                                                  | 0/11 [00:00<?, ?it/s]

#----------------- Windows: 0 -------------------#
#------ Model: contact between players -----#
Train mcc score is 0.7927
Validation mcc score is 0.7262

#------ Model: contact player-ground -------#


  9%|███▏                               | 1/11 [1:24:23<14:03:55, 5063.54s/it]

Train mcc score is 0.8898
Validation mcc score is 0.7527
#-------------------------------------------------#

#----------------- Windows: 1 -------------------#
#------ Model: contact between players -----#
Train mcc score is 0.7929
Validation mcc score is 0.7263

#------ Model: contact player-ground -------#


 18%|██████▎                            | 2/11 [2:59:02<13:33:48, 5425.42s/it]

Train mcc score is 0.8898
Validation mcc score is 0.7537
#-------------------------------------------------#

#----------------- Windows: 2 -------------------#
#------ Model: contact between players -----#
Train mcc score is 0.7925
Validation mcc score is 0.7275

#------ Model: contact player-ground -------#


 27%|█████████▌                         | 3/11 [4:38:37<12:36:50, 5676.34s/it]

Train mcc score is 0.8898
Validation mcc score is 0.7528
#-------------------------------------------------#

#----------------- Windows: 3 -------------------#
#------ Model: contact between players -----#
Train mcc score is 0.7913
Validation mcc score is 0.7267

#------ Model: contact player-ground -------#


 36%|████████████▋                      | 4/11 [6:15:33<11:08:42, 5731.72s/it]

Train mcc score is 0.8898
Validation mcc score is 0.7526
#-------------------------------------------------#

#----------------- Windows: 4 -------------------#
#------ Model: contact between players -----#


In [None]:
scores1 = [score[0] for score in score_ls]
scores2 = [score[1] for score in score_ls]

print()

In [None]:
window = 0
train_player, test_player = load_data(data_path, windows, ground=False)
train_ground, test_ground = load_data(data_path, windows, ground=True)

col_replace(train_player)
col_replace(test_player)
col_replace(train_ground)
col_replace(test_ground)

train_player_balanced = preprocessing(train_player)
train_ground_balanced = preprocessing(train_ground)

print(f'#----------------- Windows: {str(windows): <2s}-------------------#')
print('#------ Model: contact between players -----#')
player, score1 = model_result(train_player_balanced, test_player, windows)
print()
print('#------ Model: contact player-ground -------#')
ground, score2 = model_result(train_ground_balanced, test_ground, windows)
print('#-------------------------------------------------#')
print()
score_ls
save_submission(submission, windows, player, ground)

In [10]:
from torch import cuda
print(cuda.is_available())

False
