In [118]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
import json
import os
import pickle
import pandas as pd
import argparse # To accept checkpoint path as argument
import sys
root_path = os.path.abspath("..")
DATA_PATH = "/data/MHL/pressing-intensity-v2"
sys.path.append(root_path)

# Import project modules
# import config  # Import static configurations
from model import PytorchSoccerMapModel, exPressModel # Import Lightning model
from datasets import PressingSequenceDataset, exPressInputDataset, SoccerMapInputDataset 
from config import TEAMNAME2ID, TEAMID2NAME
pl.seed_everything(42, workers=True) # For reproducibility

Seed set to 42


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


42

In [119]:
test_dataset = exPressInputDataset(os.path.join(DATA_PATH, "test_dataset.pkl"))
pos_cnt = 0
neg_cnt = 0
match_ids = []
for sample in test_dataset:
    if sample['label'] == 1:
        pos_cnt += 1
    else:
        neg_cnt += 1

    if sample['match_info'].split("-")[0] not in match_ids:
        match_ids.append(sample['match_info'].split("-")[0])
print(f"Positive samples : {pos_cnt} / Negative samples : {neg_cnt}")
print(f"Test Matches : {match_ids}")

Loading dataset from /data/MHL/pressing-intensity-v2/test_dataset.pkl...


Positive samples : 99 / Negative samples : 540
Test Matches : ['153379', '153385', '153387']


In [122]:
optimizer_params = {
        "optimizer_params": {
            "lr": 1e-4,
            "weight_decay": 1e-5
        }
    }
model_config = {
    "in_channels": 19,
        "num_gnn_layers": 2,
        "gnn_hidden_dim": 64,
        "num_lstm_layers": 2,
        "lstm_hidden_dim": 64,
        "lstm_dropout": 0.4,
        "lstm_bidirectional": True,
        "use_pressing_features": False,
        "gnn_head": 4
}

# f1 0.660
ckpt_path = "/data/MHL/pressing-intensity-feat/checkpoints/exPress-epoch=192-val_loss=0.51.ckpt"

#f1 0.680
ckpt_path = "/data/MHL/pressing-intensity-0.9/checkpoints_vat_one_frame_0.7_w/ovel/exPress-epoch=196-val_loss=1.07.ckpt"

# #lstm gat
ckpt_path = "/data/MHL/pressing-intensity-v2/checkpoints_grugat_10_frame_0.9_w/ovel/exPress-epoch=18-val_loss=0.81.ckpt"
# w/o vel
# ckpt_path = "/data/MHL/pressing-intensity-v2/checkpoints_grugat_10_frame_0.9_w/ovel/exPress-epoch=08-val_loss=0.82.ckpt"
ckpt_path = "/data/MHL/pressing-intensity-v2/checkpoints_grugat_10_frame_0.9_w/ovel/exPress-epoch=51-val_loss=0.92.ckpt"
model = exPressModel.load_from_checkpoint(checkpoint_path=ckpt_path,
                                          model_config=model_config, optimizer_params=optimizer_params)
model = model.to("cuda")


In [123]:
from utils_data import custom_temporal_collate

test_loader = DataLoader(
        test_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
        persistent_workers=True,
        collate_fn=custom_temporal_collate
    )

In [62]:
total_dicts ={}
for match_id in match_ids:
    match_path = f"/data/MHL/bepro/processed/{match_id}"

    with open(f"{match_path}/{match_id}_processed_dict.pkl", "rb") as f:
        match_dict = pickle.load(f)
    total_dicts[match_id] = match_dict
    event_df = pd.read_csv(f"{match_path}/valid_events_filtered2.csv")
    total_dicts[match_id]['event_df'] = event_df
    teams_dict = match_dict['teams']
    home_team = teams_dict['Home'].copy()
    away_team = teams_dict['Away'].copy()
    teams_df = pd.concat([home_team, away_team])
    teams_df.reset_index(drop=True, inplace=True)
    teams_df['player_code'] = teams_df.apply(lambda row : row['team'][0] + str(row['xID']).zfill(2), axis=1)
    total_dicts[match_id]['teams_df'] = teams_df

total_dicts[match_id].keys()

dict_keys(['tracking_df', 'event_df', 'teams', 'meta_data', 'teams_df'])

In [124]:
import torch
from torch.utils.data import DataLoader
import numpy as np
# 기존 임포트 구문에 추가
from sklearn.metrics import roc_auc_score, brier_score_loss, log_loss
from tqdm import tqdm

# 데이터로더를 위해 이전에 정의한 collate 함수가 필요할 수 있습니다.
from utils_data import custom_temporal_collate

# 모델을 평가 모드로 설정 (Dropout 등 비활성화)
model.eval()

# 예측 확률과 실제 레이블을 저장할 리스트
all_preds = []
all_targets = []
device = "cuda"

# 그래디언트 계산을 비활성화하여 메모리 사용량 줄이고 속도 향상
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Calculating predictions"):
        if batch is None: continue
        # CUDA 사용 시 배치 데이터를 device로 이동
        for key, value in batch.items():
            if isinstance(value, torch.Tensor):
                batch[key] = value.to(device)
        
            
        # PyTorch Lightning의 predict_step을 사용하면 편리합니다.
        # 이 메소드는 이미 내부적으로 데이터를 device로 옮기고 sigmoid를 적용합니다.
        preds, targets = model.predict_step(batch, batch_idx=0)
        
        # 결과를 CPU로 옮기고 numpy 배열로 변환하여 리스트에 추가
        all_preds.append(preds.cpu().numpy())
        all_targets.append(targets.cpu().numpy())

# 리스트들을 하나의 numpy 배열로 통합
all_preds = np.concatenate(all_preds)
all_targets = np.concatenate(all_targets)

# scikit-learn을 사용하여 AUROC 점수 계산
auroc_score = roc_auc_score(all_targets, all_preds)

print(f"\nModel: {ckpt_path}")
print(f"Test Set AUROC: {auroc_score:.4f}")

# 2. Brier Score 계산 (낮을수록 좋음)
brier_score = brier_score_loss(all_targets, all_preds)
print(f"Test Set Brier Score: {brier_score:.4f} (Lower is better)")

# 3. Log Loss 계산 (낮을수록 좋음)
logloss_score = log_loss(all_targets, all_preds)
print(f"Test Set Log Loss:    {logloss_score:.4f} (Lower is better)")


Calculating predictions:   0%|          | 1/639 [00:00<02:31,  4.20it/s]

Calculating predictions: 100%|██████████| 639/639 [00:20<00:00, 30.53it/s]


Model: /data/MHL/pressing-intensity-v2/checkpoints_grugat_10_frame_0.9_w/ovel/exPress-epoch=51-val_loss=0.92.ckpt
Test Set AUROC: 0.6372
Test Set Brier Score: 0.1814 (Lower is better)
Test Set Log Loss:    0.5447 (Lower is better)





In [86]:
pd.concat([pd.DataFrame(all_targets), pd.DataFrame(all_preds)], axis=1)

Unnamed: 0,0,0.1
0,0.0,0.422813
1,0.0,0.706924
2,0.0,0.498105
3,0.0,0.201262
4,0.0,0.507761
...,...,...
634,0.0,0.204471
635,0.0,0.185977
636,0.0,0.099575
637,0.0,0.187560


In [65]:
match_id = match_ids[1]
match_dict = total_dicts[match_id]

tracking_df = match_dict['tracking_df']
event_df = match_dict['event_df']
teams_df = match_dict['teams_df']
teams_df.head()

Unnamed: 0,player,position,team,jID,pID,tID,xID,player_code
0,Jefferson Galego,LW,Home,73,529743,4641,0,H00
1,Jaeseok Hong,CB,Home,3,532198,4641,1,H01
2,Youngjun Choi,CDM,Home,30,250959,4641,2,H02
3,Geonwoong Kim,CM,Home,8,413678,4641,3,H03
4,Juhun Song,CB,Home,4,187313,4641,4,H04


In [116]:
def analyze_pressing_contributions(match_id: str, model, test_loader, total_dicts: dict, TEAMID2NAME: dict) -> pd.DataFrame:
    """
    특정 경기에 대한 선수별 압박 기여도와 수비 스탯을 분석합니다.

    Args:
        match_id (str): 분석할 경기의 ID.
        model (pl.LightningModule): 학습된 Pytorch Lightning 모델.
        test_loader (DataLoader): 테스트 데이터로더.
        total_dicts (dict): 모든 경기의 'event_df'와 'teams_df'를 포함하는 딕셔너리.
        TEAMID2NAME (dict): 팀 ID를 팀 이름으로 변환하는 딕셔너리.

    Returns:
        pd.DataFrame: 선수별 압박 기여도 점수와 수비 스탯이 병합된 최종 데이터프레임.
    """
    print(f"Analyzing contributions for match_id: {match_id}...")
    
    device = next(model.parameters()).device
    event_df = total_dicts[match_id]['event_df']
    teams_df = total_dicts[match_id]['teams_df']
    
    # 1. 선수별 압박 기여도(pressing_score) 계산
    pressing_score_dict = {pid: 0 for pid in teams_df['pID'].unique()}

    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            if batch is None or batch['match_info'][0].split("-")[0] != match_id:
                continue

            # CUDA 사용 시 배치 데이터를 device로 이동
            for key, value in batch.items():
                if isinstance(value, torch.Tensor):
                    batch[key] = value.to(device)
            
            # 모델 예측값(압박 성공 확률) 계산
            _, preds, _ = model.step(batch)
            c_total = preds.item()
            
            # 기여도 분배 로직
            # agent_order = [id[0] for id in batch['agent_order']]
            agent_order = batch['agent_order'][0]
            presser_id = batch['presser_id'][0]
            
            pressure_matrix = batch['pressing_intensity'][0, -1, :, :]
            row_len, col_len = pressure_matrix.shape
            thresholded_pressures = pressure_matrix * (pressure_matrix > 0.6)
            
            if agent_order.index(presser_id) < row_len:
                player_ids = agent_order[:row_len]
                w_i_vector = torch.sum(thresholded_pressures, dim=1)
            else:
                player_ids = agent_order[row_len:row_len+col_len]
                w_i_vector = torch.sum(thresholded_pressures, dim=0)

            total_weight = torch.sum(w_i_vector)
            # c_total = 1.0
            if total_weight > 0:
                normalized_weights = w_i_vector / total_weight
                c_i_vector = c_total * normalized_weights
            else:
                c_i_vector = torch.zeros_like(w_i_vector)

            for i, player_id in enumerate(player_ids):
                if str(player_id).startswith("Missing"):
                    continue
                pressing_score_dict[player_id] += c_i_vector[i].item()
    
    # 계산된 점수를 DataFrame으로 변환
    pressing_score_sorted = sorted(pressing_score_dict.items(), key=lambda item: item[1], reverse=True)
    pressing_score_df = pd.DataFrame(pressing_score_sorted, columns=['id', 'pressing_score'])

    # 2. 선수별 수비 스탯(Event Count) 계산
    event_cnt_dict1 = {pid: 0 for pid in teams_df['pID'].unique()}
    # 'intercetpion' 오타 수정 -> 'interception'
    # defensive_events = event_df[event_df['type_name'].isin(['recovery', 'interception'])]
    defensive_events1 = event_df[event_df['type_name'].isin(['interception'])]
    event_counts1 = defensive_events1.groupby('player_id').size()
    
    for player_id, count in event_counts1.items():
        if str(player_id) in event_cnt_dict1:
            player_id = str(player_id)
            event_cnt_dict1[player_id] = count

    event_cnt_sorted1 = sorted(event_cnt_dict1.items(), key=lambda item: item[1], reverse=True)
    event_cnt_df1 = pd.DataFrame(event_cnt_sorted1, columns=['id', 'Recovery_Count'])
    event_cnt_df1['id'] = event_cnt_df1['id'].astype(str)

    # 2. 선수별 수비 스탯(Event Count) 계산
    event_cnt_dict2 = {pid: 0 for pid in teams_df['pID'].unique()}
    # 'intercetpion' 오타 수정 -> 'interception'
    # defensive_events = event_df[event_df['type_name'].isin(['recovery', 'interception'])]
    defensive_events2 = event_df[event_df['type_name'].isin(['interception'])]
    event_counts2 = defensive_events2.groupby('player_id').size()
    
    for player_id, count in event_counts2.items():
        if str(player_id) in event_cnt_dict2:
            player_id = str(player_id)
            event_cnt_dict1[player_id] = count

    event_cnt_sorted2 = sorted(event_cnt_dict2.items(), key=lambda item: item[1], reverse=True)
    event_cnt_df2 = pd.DataFrame(event_cnt_sorted2, columns=['id', 'Interception_Count'])
    event_cnt_df2['id'] = event_cnt_df2['id'].astype(str)

    # 3. 두 데이터프레임과 선수 정보 병합
    final_df = pd.merge(pressing_score_df, event_cnt_df1, on='id', how='left')
    final_df = pd.merge(final_df, event_cnt_df2, on='id', how='left')
    final_df = final_df.merge(teams_df[['pID', 'player']], left_on="id", right_on="pID", how='left')
    
    # 불필요한 pID 열 제거 및 결측치(NaN) 0으로 채우기
    final_df = final_df.drop(columns=['pID'])
    # final_df['Recovery_Interception_Count'] = final_df['Recovery_Interception_Count'].fillna(0).astype(int)
    final_df['Recovery_Count'] = final_df['Recovery_Count'].fillna(0).astype(int)
    final_df['Interception_Count'] = final_df['Interception_Count'].fillna(0).astype(int)
    final_df = final_df[['player', 'pressing_score', 'Recovery_Count', 'Interception_Count']]
    print(f"Analysis for match_id: {match_id} completed.")
    
    return final_df

In [115]:
match1_df = analyze_pressing_contributions("153385", model, test_loader, total_dicts, TEAMID2NAME)
match1_df

Analyzing contributions for match_id: 153385...


Analysis for match_id: 153385 completed.


Unnamed: 0,player,pressing_score,Recovery_Count,Interception_Count
0,Jukong Kim,6.154231,8,0
1,Juyong Lee,5.888013,9,0
2,Chongmu Han,5.684458,3,0
3,Hoyeon Jeong,5.054545,8,0
4,Jefferson Galego,4.284818,5,0
5,Mingi Lee,4.220852,5,0
6,Youngjun Choi,4.15444,8,0
7,Taehui Nam,3.880179,4,0
8,Kaina Yoshio,3.488138,5,0
9,Yool Heo,3.238106,4,0


In [117]:
match1_df = analyze_pressing_contributions("153385", model, test_loader, total_dicts, TEAMID2NAME)
match1_df

Analyzing contributions for match_id: 153385...


Analysis for match_id: 153385 completed.


Unnamed: 0,player,pressing_score,Recovery_Count,Interception_Count
0,Jukong Kim,6.154231,0,0
1,Juyong Lee,5.888013,4,0
2,Chongmu Han,5.684458,1,0
3,Hoyeon Jeong,5.054545,4,0
4,Jefferson Galego,4.284818,1,0
5,Mingi Lee,4.220852,3,0
6,Youngjun Choi,4.154439,3,0
7,Taehui Nam,3.880179,2,0
8,Kaina Yoshio,3.488137,2,0
9,Yool Heo,3.238106,0,0
