In [1]:
# Load IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to the project sources are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [58]:
import os
import random
import torch
import numpy as np
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
import argparse
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import sys
import pandas as pd
# For now an __init__.py was added so the project modules can be imported this way.
# The repository root can be overridden through the NFL_BASE_PATH environment
# variable; when it is unset the original devcontainer location is used.
BASE_PATH = os.environ.get("NFL_BASE_PATH", '/workspaces/kaggle-NFL-contact-detection')
SRC_PATH = os.path.join(BASE_PATH, "src")
# Re-running this cell must not keep appending the same entry to sys.path.
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

from config import CFG
# TODO: use separate data directories for train and pred.
CFG["dataset_params"]["data_dir"] = os.path.join(BASE_PATH, "data")
from factory.dataset_factory import DataSetFactory
from factory.lightning_module_factory import LightningModuleFactory
from train import seed_everything

In [26]:
# Seed first so everything that follows is reproducible.
seed_everything(CFG['seed'])

# Pick the accelerator name: CUDA if present, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device_str = "cuda"
elif torch.backends.mps.is_available():
    device_str = "mps"
else:
    device_str = "cpu"
device_str

'cuda'

In [27]:
# Checkpoint file located directly under BASE_PATH.
saved_model_path = os.path.join(BASE_PATH, "epoch=9-step=33340.ckpt")

In [28]:

# The factories are keyed by "<model_name>-<model_version>" taken from the config.
model_name = f"{CFG['model_name']}-{CFG['model_version']}"
dataset_params = CFG['dataset_params']
model_params = CFG['model_params']

# Build the data module and the lightning module; the checkpoint path is handed
# to the factory as load_path.
data_module = DataSetFactory.get_dataset(name=model_name, params=dataset_params)
lightning_module = LightningModuleFactory.get_lightning_module(
    name=model_name,
    load_path=saved_model_path,
    params=model_params,
)

# Always request exactly one device. The previous `devices=None` on the CPU branch
# relied on older Lightning releases treating None as "use the default"; newer
# releases appear to reject None for the CPU accelerator, whereas an explicit 1
# is accepted for cpu, cuda and mps alike.
trainer = pl.Trainer(
    max_epochs=CFG["epochs"],
    accelerator=device_str,
    devices=1,
    logger=False,
    callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=3)],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [33]:
# Run the trainer's predict loop for the lightning module on the data module;
# the returned per-batch outputs are flattened in a later cell.
predictions = trainer.predict(model=lightning_module, datamodule=data_module)


test videos: 47330


100%|██████████| 4/4 [00:00<00:00, 10672.53it/s]


train videos: 3783616


100%|██████████| 481/481 [00:00<00:00, 188494.84it/s]

Generating dataset: test





contact
0    6631
Name: contact, dtype: int64


100%|██████████| 4/4 [00:00<00:00, 510.50it/s]
100%|██████████| 2/2 [00:00<00:00,  2.58it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 277/277 [01:34<00:00,  2.94it/s]


In [34]:
# Work on a copy of the prediction dataset's frame: a later cell assigns the
# 'contact' column on df_filtered, and without the copy that assignment would
# overwrite the dataset's own frame and make re-running the notebook cells
# depend on execution order.
df_filtered = data_module.dataset_pred.df.copy()

In [53]:
# Flatten the per-batch outputs into one Python list, then convert to a NumPy array.
y_pred = [score for batch in predictions for score in batch.tolist()]
y_pred_np = np.array(y_pred)

In [54]:
# Display both lengths side by side: the number of flattened predictions next to
# the number of rows in df_filtered.
len(y_pred_np), len(df_filtered)

(6631, 6631)

In [57]:
# Binarise the scores at the configured threshold and store them as 'contact'.
is_contact = np.greater_equal(y_pred_np, CFG["threshold"])
df_filtered['contact'] = is_contact.astype('int')

# Show how many rows fall into each predicted class.
df_filtered.groupby("contact")["contact"].count()

contact
0    5767
1     864
Name: contact, dtype: int64

In [59]:

# Start from the sample submission so every expected contact_id is present.
sample_path = os.path.join(CFG["dataset_params"]["data_dir"], 'sample_submission.csv')
sub = pd.read_csv(sample_path)

# Swap the placeholder 'contact' column for the predicted one via a left join
# on contact_id; ids without a prediction are filled with 0.
predicted = df_filtered[['contact_id', 'contact']]
sub = pd.merge(sub.drop(columns="contact"), predicted, how='left', on='contact_id')
sub['contact'] = sub['contact'].fillna(0).astype('int')

# Write only the two submission columns, without the index.
sub[["contact_id", "contact"]].to_csv("submission.csv", index=False)

sub.head()

Unnamed: 0,contact_id,contact
0,58168_003392_0_38590_43854,0
1,58168_003392_0_38590_41257,0
2,58168_003392_0_38590_41944,0
3,58168_003392_0_38590_42386,0
4,58168_003392_0_38590_47944,0
