1. 패키지

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import Dataset, DataLoader

2. 하이퍼파라미터 세팅

In [3]:
# Experiment configuration: the tunable constants used by the rest of the notebook.
TRAIN_PATH = "../data/phase_train.csv"  # training events CSV
BATCH_SIZE = 64      # sequences per mini-batch
EPOCHS = 5           # number of passes over the training split
LR = 1e-3            # optimizer learning rate
HIDDEN_DIM = 64      # LSTM hidden-state size
SEED = 42            # seed shared by NumPy and PyTorch

# Seed the global RNGs so weight initialisation and DataLoader shuffling start
# from the same state after a kernel restart (GPU kernels may still introduce
# small non-determinism).
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)

Using device: cuda


3. 데이터 로드 및 전처리
[ 학습 데이터 구분 ]
- episode 별로 구분
- phase 별로 구분
- episode & phase 모두 구분

In [18]:
# Read the training events from the configured TRAIN_PATH (rather than a second
# hard-coded copy of the same path), then order rows by episode and by time
# within each episode so that grouped rows come out in chronological order.
df = pd.read_csv(TRAIN_PATH)
df = df.sort_values(["game_episode", "time_seconds"]).reset_index(drop=True)

def _build_sequences(df, group_keys, x_scale=105.0, y_scale=68.0):
    """Convert every group of rows into one (input sequence, target) pair.

    For a group with rows 0..n-1 the input sequence interleaves the scaled
    coordinates as ``start_0, end_0, start_1, end_1, ..., start_{n-1}`` and the
    target is the scaled ``end_{n-1}``; the last end point is therefore never
    part of the input. Groups with fewer than two rows are skipped.

    Args:
        df: DataFrame with ``start_x``, ``start_y``, ``end_x``, ``end_y`` and
            the grouping column(s). Rows are used in the order they appear.
        group_keys: Column name or list of column names passed to ``groupby``.
        x_scale: Divisor applied to x coordinates (presumably the pitch
            length -- TODO confirm units).
        y_scale: Divisor applied to y coordinates (presumably the pitch
            width -- TODO confirm units).

    Returns:
        ``(sequences, targets)``: a list of float32 arrays of shape
        ``[2n - 1, 2]`` and a parallel list of float32 arrays of shape ``[2]``.
    """
    sequences = []
    targets = []

    for _, g in tqdm(df.groupby(group_keys)):
        n_rows = len(g)
        if n_rows < 2:
            continue

        # Scaled start / end points, one row per event: shape [n, 2].
        starts = np.column_stack(
            (g["start_x"].values / x_scale, g["start_y"].values / y_scale)
        )
        ends = np.column_stack(
            (g["end_x"].values / x_scale, g["end_y"].values / y_scale)
        )

        # Even slots hold the starts, odd slots hold every end except the last
        # one, which is held back as the prediction target.
        seq = np.empty((2 * n_rows - 1, 2), dtype="float32")
        seq[0::2] = starts
        seq[1::2] = ends[:-1]

        sequences.append(seq)
        targets.append(ends[-1].astype("float32"))

    return sequences, targets


def episode_wise_data(df):
    """Build one sequence/target pair per ``game_episode`` group.

    Returns:
        ``(episodes, targets)`` as produced by ``_build_sequences``.
    """
    episodes, targets = _build_sequences(df, "game_episode")
    print("에피소드 수 : ", len(episodes))
    return episodes, targets


def phase_wise_data(df):
    """Build one sequence/target pair per ``phase`` group.

    Returns:
        ``(episodes, targets)`` as produced by ``_build_sequences``.
    """
    episodes, targets = _build_sequences(df, "phase")
    print("phase 수 : ", len(episodes))
    return episodes, targets


def epi_pha_data(df):
    """Build one sequence/target pair per ``(game_episode, phase)`` group.

    Returns:
        ``(episodes, targets)`` as produced by ``_build_sequences``.
    """
    episodes, targets = _build_sequences(df, ["game_episode", "phase"])
    print("에피소드 + phase 수 : ", len(episodes))
    return episodes, targets

# Select how the training sequences are grouped. Exactly one call is active
# (grouping by `phase`); the commented-out lines are the alternative groupings
# by episode or by (episode, phase).
# epi_wise_seq, epi_wise_targets = episode_wise_data(df)
# pha_wise_seq, pha_wise_targets = phase_wise_data(df)
episodes, targets = phase_wise_data(df)
# episodes, targets = epi_pha_data(df)
# both_seq, both_targets = epi_pha_data(df)

100%|██████████| 59346/59346 [00:05<00:00, 11457.81it/s]

phase 수 :  47405





4. Custom Dataset / DataLoader 정의 및 Validation 분할

In [19]:
class EpisodeDataset(Dataset):
    """Map-style dataset pairing each coordinate sequence with its target.

    ``episodes`` and ``targets`` are parallel, index-aligned collections; the
    items are converted to tensors lazily, one at a time, on access.
    """

    def __init__(self, episodes, targets):
        # Only references are stored; nothing is copied or converted here.
        self.targets = targets
        self.episodes = episodes

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.episodes)

    def __getitem__(self, idx):
        """Return ``(sequence, sequence_length, target)`` for item ``idx``.

        ``sequence_length`` is the size of the sequence's first dimension,
        which the collate function needs to pack variable-length batches.
        """
        sequence = torch.tensor(self.episodes[idx])
        target = torch.tensor(self.targets[idx])
        return sequence, sequence.size(0), target

def collate_fn(batch):
    """Merge ``(sequence, length, target)`` samples into batch tensors.

    Sequences are zero-padded on the right to the longest one in the batch.

    Returns:
        ``(padded, lengths, targets)``: the padded sequences (batch first),
        the original lengths as a ``long`` tensor, and the stacked targets.
    """
    seq_list, len_list, tgt_list = map(list, zip(*batch))
    stacked_targets = torch.stack(tgt_list, dim=0)
    padded_batch = pad_sequence(seq_list, batch_first=True)
    length_tensor = torch.tensor(len_list, dtype=torch.long)
    return padded_batch, length_tensor, stacked_targets

# Hold out 20% of the sequences for validation, with a fixed seed so the split
# is repeatable.
# NOTE(review): the split is over the items of `episodes`, whatever grouping
# produced them; when those items are phases, sequences from the same game
# episode may land on both sides of the split -- confirm this is intended.
all_indices = np.arange(len(episodes))
idx_train, idx_valid = train_test_split(
    all_indices,
    test_size=0.2,
    random_state=42,
)


def _take(items, indices):
    """Return the elements of ``items`` at the given ``indices``, in order."""
    return [items[i] for i in indices]


episodes_train, targets_train = _take(episodes, idx_train), _take(targets, idx_train)
episodes_valid, targets_valid = _take(episodes, idx_valid), _take(targets, idx_valid)

# Training batches are reshuffled every epoch; validation keeps a fixed order.
train_dataset = EpisodeDataset(episodes_train, targets_train)
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn
)

valid_dataset = EpisodeDataset(episodes_valid, targets_valid)
valid_loader = DataLoader(
    valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn
)

print("train episodes:", len(episodes_train), "valid episodes:", len(episodes_valid))

train episodes: 37924 valid episodes: 9481


5. LSTM 베이스라인 모델 정의

In [20]:
class LSTMBaseline(nn.Module):
    """Single-layer LSTM encoder followed by a linear head with two outputs.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, input_dim=2, hidden_dim=64):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=1, batch_first=True)
        # Maps the final hidden state to the two predicted values.
        self.fc = nn.Linear(in_features=hidden_dim, out_features=2)

    def forward(self, x, lengths):
        """Encode a padded batch and predict two values per sequence.

        Args:
            x: Padded input of shape ``[B, T, input_dim]`` (batch first).
            lengths: Tensor of shape ``[B]`` with the true sequence lengths.

        Returns:
            Tensor of shape ``[B, 2]``.
        """
        # pack_padded_sequence is given the lengths on the CPU; packing makes
        # the LSTM stop at each sequence's real end instead of reading padding.
        cpu_lengths = lengths.cpu()
        packed_input = pack_padded_sequence(
            x, cpu_lengths, batch_first=True, enforce_sorted=False
        )
        _, (final_hidden, _final_cell) = self.lstm(packed_input)
        # final_hidden is [num_layers, B, H]; take the last layer's state.
        return self.fc(final_hidden[-1])

# Build the network and move it to the selected device.
model = LSTMBaseline(input_dim=2, hidden_dim=HIDDEN_DIM)
model = model.to(DEVICE)

# Mean-squared error on the normalised coordinates, optimised with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)

6. 모델 학습 및 검증

In [21]:
# Train for EPOCHS epochs; after each epoch measure the mean Euclidean distance
# on the validation split (in the original coordinate scale) and keep a
# snapshot of the weights that achieved the lowest distance so far.
best_dist = float("inf")
best_model_state = None

for epoch in range(1, EPOCHS + 1):
    # --- Train ---
    model.train()
    total_loss = 0.0

    for X, lengths, y in tqdm(train_loader):
        X, lengths, y = X.to(DEVICE), lengths.to(DEVICE), y.to(DEVICE)

        optimizer.zero_grad()
        pred = model(X, lengths)
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by the batch size so the epoch average is
        # per sample even when the last batch is smaller.
        total_loss += loss.item() * X.size(0)

    train_loss = total_loss / len(train_loader.dataset)

    # --- Valid: mean Euclidean distance ---
    model.eval()
    dists = []

    with torch.no_grad():
        for X, lengths, y in tqdm(valid_loader):
            X, lengths, y = X.to(DEVICE), lengths.to(DEVICE), y.to(DEVICE)
            pred = model(X, lengths)

            pred_np = pred.cpu().numpy()
            true_np = y.cpu().numpy()

            # Undo the normalisation applied when the sequences were built.
            pred_x = pred_np[:, 0] * 105.0
            pred_y = pred_np[:, 1] * 68.0
            true_x = true_np[:, 0] * 105.0
            true_y = true_np[:, 1] * 68.0

            dist = np.sqrt((pred_x - true_x) ** 2 + (pred_y - true_y) ** 2)
            dists.append(dist)

    mean_dist = np.concatenate(dists).mean()  # mean Euclidean distance

    print(
        f"[Epoch {epoch}] "
        f"train_loss={train_loss:.4f} | "
        f"valid_mean_dist={mean_dist:.4f}"
    )

    # ----- Update the best checkpoint -----
    if mean_dist < best_dist:
        best_dist = mean_dist
        # state_dict() returns references to the live parameter tensors, and a
        # dict-level .copy() keeps those references, so later optimizer steps
        # would overwrite the "best" weights. Clone every tensor to freeze a
        # real snapshot.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        print(f" --> Best model updated! (dist={best_dist:.4f})")

100%|██████████| 593/593 [00:03<00:00, 197.29it/s]
100%|██████████| 149/149 [00:00<00:00, 533.63it/s]


[Epoch 1] train_loss=0.0343 | valid_mean_dist=15.8117
 --> Best model updated! (dist=15.8117)


100%|██████████| 593/593 [00:02<00:00, 251.59it/s]
100%|██████████| 149/149 [00:00<00:00, 540.03it/s]


[Epoch 2] train_loss=0.0233 | valid_mean_dist=15.2145
 --> Best model updated! (dist=15.2145)


100%|██████████| 593/593 [00:02<00:00, 255.09it/s]
100%|██████████| 149/149 [00:00<00:00, 560.32it/s]


[Epoch 3] train_loss=0.0231 | valid_mean_dist=14.9441
 --> Best model updated! (dist=14.9441)


100%|██████████| 593/593 [00:02<00:00, 249.91it/s]
100%|██████████| 149/149 [00:00<00:00, 561.68it/s]


[Epoch 4] train_loss=0.0229 | valid_mean_dist=14.8987
 --> Best model updated! (dist=14.8987)


100%|██████████| 593/593 [00:02<00:00, 253.76it/s]
100%|██████████| 149/149 [00:00<00:00, 560.13it/s]

[Epoch 5] train_loss=0.0228 | valid_mean_dist=15.0660





7. 평가 데이터셋 추론

In [22]:
# Restore the best checkpoint kept by the training loop and switch to eval mode.
model.load_state_dict(best_model_state)
model.eval()

test_meta = pd.read_csv("../data/test.csv")
submission = pd.read_csv("../data/sample_submission.csv")

# Attach the test metadata (which provides each episode's `path`) to the
# submission rows, keeping every submission row.
submission = submission.merge(test_meta, on="game_episode", how="left")

preds_x, preds_y = [], []

# Inference only: gradients are never needed, so disable them for the whole loop.
with torch.no_grad():
    for _, row in tqdm(submission.iterrows(), total=len(submission)):
        # `path` is stored with a one-character prefix that is replaced by
        # the local data directory.
        g = pd.read_csv('../data' + row["path"][1:]).reset_index(drop=True)

        # Use only the rows that belong to the phase of the episode's last row.
        # NOTE(review): a single-row phase yields a length-1 input, whereas the
        # training builders skip groups with fewer than two rows -- consider a
        # fallback (e.g. the whole episode) for that case.
        last_phase = g["phase"].iloc[-1]
        input_df = g[g["phase"] == last_phase]
        n_rows = len(input_df)

        # Normalised start / end points, one row per event: shape [n, 2].
        starts = np.column_stack(
            (input_df["start_x"].values / 105.0, input_df["start_y"].values / 68.0)
        )
        ends = np.column_stack(
            (input_df["end_x"].values / 105.0, input_df["end_y"].values / 68.0)
        )

        # Interleave start_0, end_0, ..., start_{n-1}. The last row's end point
        # is the value to predict (reportedly NaN in the test files), so it is
        # left out of the input.
        seq = np.empty((2 * n_rows - 1, 2), dtype="float32")  # [T, 2]
        seq[0::2] = starts
        seq[1::2] = ends[:-1]

        x = torch.tensor(seq).unsqueeze(0).to(DEVICE)      # [1, T, 2]
        length = torch.tensor([seq.shape[0]]).to(DEVICE)   # [1]

        pred = model(x, length).cpu().numpy()[0]           # [2], normalised coordinates

        # Convert back to the original coordinate scale.
        preds_x.append(pred[0] * 105.0)
        preds_y.append(pred[1] * 68.0)
print("Inference Done.")

100%|██████████| 2414/2414 [00:04<00:00, 581.54it/s]

Inference Done.





8. 제출 Submission 생성

In [23]:
# Write the predictions next to their episode ids. The output path lives in a
# single variable so the log line always names the file that was written
# (the previous message reported a different file name).
SUBMISSION_PATH = "../data/phase_submit2.csv"

submission["end_x"] = preds_x
submission["end_y"] = preds_y
submission[["game_episode", "end_x", "end_y"]].to_csv(SUBMISSION_PATH, index=False)
print("Saved:", SUBMISSION_PATH)

Saved: baseline_submit.csv
