In [70]:
import numpy as np
import os
import pickle
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score
import sys
sys.path.append(os.path.abspath(".."))
import torch
import pandas as pd
from datasets import PressingSequenceDataset, SoccerMapInputDataset
import config as C
import features as F
from bisect import bisect_right
from collections import defaultdict
import torch.nn.functional as F

In [222]:

data_path = "/home/express-v2/data/baseline_feature_PC_forward/fold_2"

with open(f"{data_path}/fold_info.pkl", "rb") as f:
    info = pickle.load(f)

with open(f"{data_path}/train_dataset_0.9.pkl", "rb") as f:
    train_dataset = pickle.load(f)

with open(f"{data_path}/valid_dataset_0.9.pkl", "rb") as f:
    valid_dataset = pickle.load(f)

with open(f"{data_path}/test_dataset_0.9.pkl", "rb") as f:
    test_dataset = pickle.load(f)

len(train_dataset), len(valid_dataset), len(test_dataset)

(5199, 1325, 1291)

In [18]:
from sklearn.model_selection import KFold
full_data = train_dataset + valid_dataset + test_dataset  # Ï†ÑÏ≤¥ ÏÉòÌîå pool
press_threshold = 0.9
# Step 2: Í≤ΩÍ∏∞ ID Í∏∞Ï§ÄÏúºÎ°ú ÏÉòÌîå Ï†ïÎ¶¨
from collections import defaultdict

match_to_samples = defaultdict(list)
for sample in full_data:
    match_id = sample['match_info'].split('-')[0]
    match_to_samples[match_id].append(sample)
match_ids = sorted(match_to_samples.keys())
match_ids_array = np.array(match_ids)


In [20]:
# Step 3: Re-run KFold to re-generate folds
n_folds = 6
outer_kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

for fold_idx, (train_val_idx, test_idx) in enumerate(outer_kf.split(match_ids_array)):
    if fold_idx == 1:
        continue  # Skip fold 2 (already done)

    print(f"\n=== Fold {fold_idx + 1}/{n_folds} ===")

    train_val_ids = match_ids_array[train_val_idx]
    test_ids = match_ids_array[test_idx]

    # Split train/val (80/20)
    inner_kf = KFold(n_splits=5, shuffle=True, random_state=42)
    train_idx, val_idx = next(inner_kf.split(train_val_ids))
    train_ids = train_val_ids[train_idx]
    val_ids = train_val_ids[val_idx]

    def collect_samples(match_ids):
        return [sample for mid in match_ids for sample in match_to_samples[mid]]

    train_samples = collect_samples(train_ids)
    val_samples = collect_samples(val_ids)
    test_samples = collect_samples(test_ids)

    fold_save_path = f"/home/express-v2/data/baseline_feature_PC_forward/fold_{fold_idx + 1}"
    os.makedirs(fold_save_path, exist_ok=True)

    for name, samples in zip(['train', 'valid', 'test'], [train_samples, val_samples, test_samples]):
        with open(f"{fold_save_path}/{name}_dataset_{press_threshold}.pkl", "wb") as f:
            pickle.dump(samples, f)
        print(f"[Fold {fold_idx + 1}] Saved {name}: {len(samples)} samples")

    # Save fold info for record-keeping
    fold_info = {
        'train_match_ids': train_ids.tolist(),
        'val_match_ids': val_ids.tolist(),
        'test_match_ids': test_ids.tolist(),
        'fold_idx': fold_idx + 1,
        'total_folds': n_folds
    }
    with open(f"{fold_save_path}/fold_info.pkl", "wb") as f:
        pickle.dump(fold_info, f)


=== Fold 1/6 ===
[Fold 1] Saved train: 5298 samples
[Fold 1] Saved valid: 1264 samples
[Fold 1] Saved test: 1253 samples

=== Fold 3/6 ===
[Fold 3] Saved train: 5167 samples
[Fold 3] Saved valid: 1293 samples
[Fold 3] Saved test: 1355 samples

=== Fold 4/6 ===
[Fold 4] Saved train: 5150 samples
[Fold 4] Saved valid: 1328 samples
[Fold 4] Saved test: 1337 samples

=== Fold 5/6 ===
[Fold 5] Saved train: 5271 samples
[Fold 5] Saved valid: 1272 samples
[Fold 5] Saved test: 1272 samples

=== Fold 6/6 ===
[Fold 6] Saved train: 5187 samples
[Fold 6] Saved valid: 1321 samples
[Fold 6] Saved test: 1307 samples


In [296]:

data_path = "/home/express-v2/data/baseline_feature_PC_forward/fold_6"

with open(f"{data_path}/fold_info.pkl", "rb") as f:
    info = pickle.load(f)

with open(f"{data_path}/train_dataset_0.9.pkl", "rb") as f:
    train_dataset = pickle.load(f)

with open(f"{data_path}/valid_dataset_0.9.pkl", "rb") as f:
    valid_dataset = pickle.load(f)

with open(f"{data_path}/test_dataset_0.9.pkl", "rb") as f:
    test_dataset = pickle.load(f)

len(train_dataset), len(valid_dataset), len(test_dataset)

(5187, 1321, 1307)

In [24]:
sample = train_dataset[0]

print(f"Features : {sample['features'].shape}")
print(f"Pressing Intensity : {sample['pressing_intensity'].shape}")
print(f"Labels : {sample['label']}")
print(f"Presser ID : {sample['presser_id']}")
print(f"Players Order : {sample['agent_order']}")
print(f"match info : {sample['match_info']}")

Features : torch.Size([10, 23, 41])
Pressing Intensity : torch.Size([10, 11, 11])
Labels : 0
Presser ID : 77414
Players Order : ['188178', '250079', '250101', '250102', '500133', '500140', '500141', '500142', '62365', '62386', '77414', '187259', '343587', '408792', '500113', '500115', '500116', '500117', '500118', '500121', '500502', '83615', 'ball']
match info : 126293-1.0-170


In [46]:
def print_feature_shape_diff(dataset, dataset_name="dataset"):
    print(f"[{dataset_name}] Feature shape before update:")
    for i in range(6):  # ÏïûÏùò 3Í∞ú ÏÉòÌîåÎßå ÏòàÏãúÎ°ú ÌôïÏù∏
        print(f"  Sample {i}: {dataset[i]['features'].shape}")

# 1. ÏóÖÎç∞Ïù¥Ìä∏ Ï†Ñ
print_feature_shape_diff(train_dataset, "train_dataset (after)")
print_feature_shape_diff(valid_dataset, "valid dataset (after)")
print_feature_shape_diff(test_dataset, "test_dataset (after)")

[train_dataset (after)] Feature shape before update:
  Sample 0: torch.Size([10, 23, 41])
  Sample 1: torch.Size([10, 23, 41])
  Sample 2: torch.Size([5, 23, 41])
  Sample 3: torch.Size([10, 23, 41])
  Sample 4: torch.Size([10, 23, 41])
  Sample 5: torch.Size([10, 23, 41])
[valid dataset (after)] Feature shape before update:
  Sample 0: torch.Size([10, 23, 41])
  Sample 1: torch.Size([10, 23, 41])
  Sample 2: torch.Size([10, 23, 41])
  Sample 3: torch.Size([10, 23, 41])
  Sample 4: torch.Size([10, 23, 41])
  Sample 5: torch.Size([10, 23, 41])
[test_dataset (after)] Feature shape before update:
  Sample 0: torch.Size([10, 23, 41])
  Sample 1: torch.Size([10, 23, 41])
  Sample 2: torch.Size([10, 23, 41])
  Sample 3: torch.Size([10, 23, 41])
  Sample 4: torch.Size([10, 23, 41])
  Sample 5: torch.Size([10, 23, 41])


In [297]:
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler

# --- Ï†ïÍ∑úÌôî Ìï®Ïàò ---
def normalize_features(X_train, X_valid, X_test):
    scaler = StandardScaler()
    X_train_norm = scaler.fit_transform(X_train)
    X_valid_norm = scaler.transform(X_valid)
    X_test_norm = scaler.transform(X_test)
    return X_train_norm, X_valid_norm, X_test_norm

def process_dataset(dataset, agent_wise_features=None, global_wise_feature=None, 
                    use_pressing=False, use_action =False, num_agents=11):
    """
    Îç∞Ïù¥ÌÑ∞ÏÖãÏùÑ ÏàúÌöåÌïòÎ©∞ Î™®Îç∏Ïóê ÏûÖÎ†•Ìï† ÌîºÏ≤ò Î≤°ÌÑ∞ÏôÄ ÎùºÎ≤®ÏùÑ Ï∂îÏ∂úÌï©ÎãàÎã§.

    Args:
        dataset (Dataset): Ï≤òÎ¶¨Ìï† PyTorch Îç∞Ïù¥ÌÑ∞ÏÖã (Ïòà: train_dataset).
        minho_feature_indices (list): ÏÇ¨Ïö©Ìï† ÏÑ†ÏàòÎ≥Ñ(agent) ÌîºÏ≤òÏùò Ïù∏Îç±Ïä§ Î¶¨Ïä§Ìä∏.
        global_feature_indices (list): ÏÇ¨Ïö©Ìï† Ï†ÑÏó≠(global) ÌîºÏ≤òÏùò Ïù∏Îç±Ïä§ Î¶¨Ïä§Ìä∏.
        use_pressing (bool): ÏïïÎ∞ï Í∞ïÎèÑ ÌîºÏ≤ò ÏÇ¨Ïö© Ïó¨Î∂Ä.
        num_agents (int): ÌåÄÎãπ ÏÑ†Ïàò Ïàò.

    Returns:
        tuple: (ÌîºÏ≤ò NumPy Î∞∞Ïó¥, ÎùºÎ≤® NumPy Î∞∞Ïó¥)
    """
    features_list = []
    labels_list = []

    for sample in dataset:
        features = sample['features']
        if features.shape[0] < 10:  # ÏãúÌÄÄÏä§ Í∏∏Ïù¥Í∞Ä 1 ÎØ∏ÎßåÏù¥Î©¥ Í±¥ÎÑàÎõ∞Í∏∞
            continue

        parts = []
        # 1. Agent-wise features
        if agent_wise_features is not None:
            agent_feat = features[:, :, agent_wise_features]
            parts.append(agent_feat.flatten().numpy())

        # 2. Global-wise features
        if global_wise_feature is not None:
            global_feat = features[:, 0, global_wise_feature]
            parts.append(global_feat.flatten().numpy())


        # 3. (ÏÑ†ÌÉù) ÏïïÎ∞ï Í∞ïÎèÑ ÌîºÏ≤ò Ï∂îÍ∞Ä
        if use_pressing:
            press_intensity = sample['pressing_intensity'][-1:]
            
            # Ìå®Îî© Ï≤òÎ¶¨
            h, w = press_intensity.shape[1], press_intensity.shape[2]
            pad_h = max(0, num_agents - h)
            pad_w = max(0, num_agents - w)
            if pad_h > 0 or pad_w > 0:
                press_intensity = F.pad(press_intensity, (0, pad_w, 0, pad_h), "constant", 0)
            parts.append(press_intensity.flatten().numpy())

        if use_action:
            action_ids = features[:, 0, 18]  # shape: (T, agents)
            action_ids = action_ids.long()
            num_actions = 20
            one_hot = torch.nn.functional.one_hot(action_ids, num_classes=num_actions)
            parts.append(one_hot.flatten().numpy())


        feature_vector = np.concatenate(parts)
        features_list.append(feature_vector)
        labels_list.append(sample['label'].item())

    return np.array(features_list), np.array(labels_list)


In [298]:
# --- Î©îÏù∏ Ïã§Ìñâ Î°úÏßÅ ---

# 1. ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞ Î∞è ÏÑ§Ï†ï
use_pressing_intensity = False
use_action_type = True
num_agents = 11
normalize = True  # ‚Üê Ï†ïÍ∑úÌôî ON/OFF ÏÑ§Ï†ï
selected_player_features_idx = [i for i in range(18)]  # ÏÑ†ÏàòÎ≥Ñ ÌîºÏ≤ò (0~18), 18 is actiontype
selected_global_features_idx = [i for i in range(19, 41) if i!=32] # Ï†ÑÏó≠ ÌîºÏ≤ò (19~42)

# 2. Ìï®ÏàòÎ•º ÏÇ¨Ïö©ÌïòÏó¨ Í∞Å Îç∞Ïù¥ÌÑ∞ÏÖã Ï≤òÎ¶¨
print("Processing training data...")
X_train, y_train = process_dataset(
    train_dataset, 
    agent_wise_features=selected_player_features_idx, 
    # global_wise_feature=selected_global_features_idx,
    # use_pressing=use_pressing_intensity,
    use_action = use_action_type,
    num_agents=num_agents
)

print("Processing validation data...")
X_valid, y_valid = process_dataset(
    valid_dataset, 
    agent_wise_features=selected_player_features_idx, 
    # global_wise_feature=selected_global_features_idx,
    # use_pressing=use_pressing_intensity,
    use_action = use_action_type,
    num_agents=num_agents
)

print("Processing test data...")
X_test, y_test = process_dataset(
    test_dataset, 
    agent_wise_features=selected_player_features_idx, 
    # global_wise_feature=selected_global_features_idx,
    # use_pressing=use_pressing_intensity,
    use_action = use_action_type,
    num_agents=num_agents
)

# 3. ÏÑ†ÌÉùÏ†Å Ï†ïÍ∑úÌôî Ï†ÅÏö©
if normalize:
    print("\nApplying StandardScaler normalization...")
    X_train, X_valid, X_test = normalize_features(X_train, X_valid, X_test)


# 3. ÏµúÏ¢Ö Í≤∞Í≥º ÌôïÏù∏
print("\n--- Data Shapes ---")
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_valid shape: {X_valid.shape}, y_valid shape: {y_valid.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

Processing training data...
Processing validation data...
Processing test data...

Applying StandardScaler normalization...

--- Data Shapes ---
X_train shape: (5161, 4340), y_train shape: (5161,)
X_valid shape: (1311, 4340), y_valid shape: (1311,)
X_test shape: (1303, 4340), y_test shape: (1303,)


In [102]:
def print_dataset_distribution(y_train, y_val, y_test):
    def _print_split(name, labels):
        total = len(labels)
        unique, counts = np.unique(labels, return_counts=True)
        print(f"{name} Set:")
        print(f"  Total samples: {total}")
        for label, count in zip(unique, counts):
            percent = (count / total) * 100
            print(f"    Label {label}: {count:>5} samples ({percent:5.2f}%)")
        print("-" * 40)

    print("\nüìä Dataset Distribution Summary")
    print("=" * 40)
    _print_split("Train", y_train)
    _print_split("Validation", y_val)
    _print_split("Test", y_test)

print_dataset_distribution(y_train, y_valid, y_test)
     


üìä Dataset Distribution Summary
Train Set:
  Total samples: 5167
    Label 0:  3718 samples (71.96%)
    Label 1:  1449 samples (28.04%)
----------------------------------------
Validation Set:
  Total samples: 1318
    Label 0:   949 samples (72.00%)
    Label 1:   369 samples (28.00%)
----------------------------------------
Test Set:
  Total samples: 1290
    Label 0:   945 samples (73.26%)
    Label 1:   345 samples (26.74%)
----------------------------------------


### 1. XGBoost

In [133]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Î™®Îç∏ Ï†ïÏùò
model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    seed=42,
    use_label_encoder=False  # Í≤ΩÍ≥† Î∞©ÏßÄ
)

# ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞ Í∑∏Î¶¨Îìú Ï†ïÏùò
param_grid = {
    'max_depth': [5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [200, 400],
    'subsample': [0.6, 0.8],
    'colsample_bytree': [0.8],
    'min_child_weight': [1, 3],
    'gamma': [0, 1],
    'scale_pos_weight': [np.sum(y_train==0)/np.sum(y_train==1)]
}

# GridSearchCV Í∞ùÏ≤¥ ÏÉùÏÑ±
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',  # AUC Í∏∞Ï§Ä
    cv=3,               # 3-fold ÍµêÏ∞®Í≤ÄÏ¶ù
    verbose=1,
    n_jobs=-1           # Í∞ÄÎä•Ìïú Î™®Îì† ÏΩîÏñ¥ ÏÇ¨Ïö©
)

# ÌïôÏäµ ÏãúÏûë
grid_search.fit(X_train, y_train)


Fitting 3 folds for each of 96 candidates, totalling 288 fits


0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,param_grid,"{'colsample_bytree': [0.8], 'gamma': [0, 1], 'learning_rate': [0.01, 0.05, ...], 'max_depth': [5, 7], ...}"
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [134]:
print("Best parameters:", grid_search.best_params_)
print("Best AUC score (train CV):", grid_search.best_score_)


Best parameters: {'colsample_bytree': 0.8, 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 200, 'scale_pos_weight': 2.633910034602076, 'subsample': 0.8}
Best AUC score (train CV): 0.7128946390607922


In [301]:
# Create XGBoost DMatrix objects for train and test sets
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.01,
    'eval_metric': 'auc',
    'max_depth': 5,                # ÏïΩÍ∞Ñ Ï§ÑÏó¨ÏÑú ÏãúÏûë
    'eta': 0.05,                   # ÌïôÏäµÎ•†ÏùÑ Ï§ÑÏó¨ÏÑú Îçî ÎßéÏùÄ ÎùºÏö¥ÎìúÎ•º ÌïôÏäµÌïòÎèÑÎ°ù Ïú†ÎèÑ
    'subsample': 0.8,              # ÏÉòÌîåÎßÅ
    'colsample_bytree': 0.8,       # ÌîºÏ≤ò ÏÉòÌîåÎßÅ
    'min_child_weight': 1,         # Í∏∞Î≥∏Í∞í Ïú†ÏßÄ ÎòêÎäî 2~3 ÏãúÎèÑ
    'gamma': 1,                  # Í∞êÎßà ÏÑ§Ï†ï
    'scale_pos_weight': np.sum(y_train==0)/np.sum(y_train==1), # Î∂àÍ∑†Ìòï Ï≤òÎ¶¨ (Í∞ÄÏû• Ï§ëÏöî)
    'n_estimators': 200,
    'seed': 42,
    # 'tree_method': 'hist',       # Îç∞Ïù¥ÌÑ∞Í∞Ä ÌÅ¨Î©¥ ÏÜçÎèÑ Ìñ•ÏÉÅÏùÑ ÏúÑÌï¥ ÏÇ¨Ïö© Í≥†Î†§ (CPU)
    # 'grow_policy': 'lossguide'   # tree_method='hist'ÏôÄ Ìï®Íªò ÏÇ¨Ïö© (Î∂àÍ∑†Ìòï Îç∞Ïù¥ÌÑ∞ÏÖãÏóê Ïú†Î¶¨Ìï† Ïàò ÏûàÏùå)
}

num_rounds = 500 # etaÎ•º Ï§ÑÏòÄÏúºÎØÄÎ°ú ÎùºÏö¥Îìú ÏàòÎ•º ÎäòÎ¶ΩÎãàÎã§.
early_stopping_rounds = 30 # Îçî Í∏∏Í≤å Î¥êÏÑú ÏµúÏ†ÅÏùò ÎùºÏö¥Îìú Ï∞æÎèÑÎ°ù ÏÑ§Ï†ï

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
     

In [302]:
bst = xgb.train(params, dtrain, num_rounds, watchlist, early_stopping_rounds=20)



[0]	train-auc:0.75512	eval-auc:0.63792
[1]	train-auc:0.79435	eval-auc:0.65539
[2]	train-auc:0.81228	eval-auc:0.66216
[3]	train-auc:0.82943	eval-auc:0.67448
[4]	train-auc:0.83832	eval-auc:0.68133
[5]	train-auc:0.84749	eval-auc:0.68635
[6]	train-auc:0.85129	eval-auc:0.69415
[7]	train-auc:0.85754	eval-auc:0.69702
[8]	train-auc:0.86139	eval-auc:0.69404
[9]	train-auc:0.86434	eval-auc:0.69256
[10]	train-auc:0.86656	eval-auc:0.69452
[11]	train-auc:0.86660	eval-auc:0.69371
[12]	train-auc:0.86769	eval-auc:0.69313
[13]	train-auc:0.86845	eval-auc:0.68977
[14]	train-auc:0.86777	eval-auc:0.69388
[15]	train-auc:0.87019	eval-auc:0.69753
[16]	train-auc:0.87176	eval-auc:0.69641
[17]	train-auc:0.87239	eval-auc:0.69725
[18]	train-auc:0.87269	eval-auc:0.69592
[19]	train-auc:0.87357	eval-auc:0.69592
[20]	train-auc:0.87388	eval-auc:0.69733
[21]	train-auc:0.87428	eval-auc:0.69767
[22]	train-auc:0.87493	eval-auc:0.69583
[23]	train-auc:0.87524	eval-auc:0.69622
[24]	train-auc:0.87711	eval-auc:0.69732
[25]	train

In [303]:
from sklearn.metrics import (
    accuracy_score, roc_auc_score, f1_score,
    brier_score_loss, log_loss
)
# W/O Pressing Intensity
# Get predictions on the test set
y_pred = bst.predict(dtest)
y_pred_label = (y_pred > 0.5).astype(int)

# Metric Í≥ÑÏÇ∞
accuracy = accuracy_score(y_test, y_pred_label)
auc = roc_auc_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred_label)
brier = brier_score_loss(y_test, y_pred)
logloss = log_loss(y_test, y_pred)

# Ï∂úÎ†•
print(f"Accuracy     : {accuracy:.4f}")
print(f"AUC          : {auc:.4f}")
print(f"F1-score     : {f1:.4f} ‚Üë")            # ‚Üë Ï¢ãÏùÑÏàò0.Î°ù Ï¢ãÏùå
print(f"Brier Score  : {brier:.4f} ‚Üì")         # ‚Üì ÎÇÆÏùÑÏàòÎ°ù Ï¢ãÏùå
print(f"Log Loss     : {logloss:.4f} ‚Üì")       # ‚Üì ÎÇÆÏùÑÏàòÎ°ù Ï¢ãÏùå

Accuracy     : 0.7237
AUC          : 0.7258
F1-score     : 0.5396 ‚Üë
Brier Score  : 0.2137 ‚Üì
Log Loss     : 0.6190 ‚Üì


### 2. Logistic Regrssion

In [300]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

lr_model = LogisticRegression(
    class_weight='balanced',
    penalty='l1', 
    C=0.01,
    solver='liblinear',
    max_iter=1000,    
    random_state=42,    
    n_jobs=-1
)
X_combined = np.concatenate([X_train, X_valid])
y_combined = np.concatenate([y_train, y_valid])

print("Î°úÏßÄÏä§Ìã± ÌöåÍ∑Ä Î™®Îç∏ ÌõàÎ†® ÏãúÏûë...")
lr_model.fit(X_combined, y_combined)
print("ÌõàÎ†® ÏôÑÎ£å!")


print("\n--- ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ÏÖã ÏÑ±Îä• ÌèâÍ∞Ä ---")
y_pred = lr_model.predict_proba(X_test)[:, 1]
y_pred_label = lr_model.predict(X_test)


# Metric Í≥ÑÏÇ∞
accuracy = accuracy_score(y_test, y_pred_label)
auc = roc_auc_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred_label)
brier = brier_score_loss(y_test, y_pred)
logloss = log_loss(y_test, y_pred)

# Ï∂úÎ†•
print(f"Accuracy     : {accuracy:.4f}")
print(f"AUC          : {auc:.4f}")
print(f"F1-score     : {f1:.4f} ‚Üë")            # ‚Üë Ï¢ãÏùÑÏàòÎ°ù Ï¢ãÏùå
print(f"Brier Score  : {brier:.4f} ‚Üì")         # ‚Üì ÎÇÆÏùÑÏàòÎ°ù Ï¢ãÏùå
print(f"Log Loss     : {logloss:.4f} ‚Üì")       # ‚Üì ÎÇÆÏùÑÏàòÎ°ù Ï¢ãÏùå


Î°úÏßÄÏä§Ìã± ÌöåÍ∑Ä Î™®Îç∏ ÌõàÎ†® ÏãúÏûë...
ÌõàÎ†® ÏôÑÎ£å!

--- ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ÏÖã ÏÑ±Îä• ÌèâÍ∞Ä ---
Accuracy     : 0.6999
AUC          : 0.7367
F1-score     : 0.5582 ‚Üë
Brier Score  : 0.2138 ‚Üì
Log Loss     : 0.6203 ‚Üì


### 3. Randomforest Î™®Îç∏

In [280]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

rf = RandomForestClassifier(n_jobs=-1)

grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid.fit(X_combined, y_combined)

print("Best Params:", grid.best_params_)
print("Best AUC Score:", grid.best_score_)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


Best Params: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best AUC Score: 0.7134944304564349


In [299]:

from sklearn.ensemble import RandomForestClassifier
X_combined = np.concatenate([X_train, X_valid])
y_combined = np.concatenate([y_train, y_valid])

rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=9,
    min_samples_split=3,
    min_samples_leaf=1,
    max_features=0.8,
    bootstrap=True,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
print("ÎûúÎç§ Ìè¨Î†àÏä§Ìä∏ Î™®Îç∏ ÌõàÎ†® ÏãúÏûë...")
rf_model.fit(X_combined, y_combined)
print("ÌõàÎ†® ÏôÑÎ£å!")
print("\n--- ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ÏÖã ÏÑ±Îä• ÌèâÍ∞Ä ---")

y_pred = rf_model.predict_proba(X_test)[:, 1]
y_pred_label = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_label)
auc = roc_auc_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred_label)
brier = brier_score_loss(y_test, y_pred)
logloss = log_loss(y_test, y_pred)

print(f"Accuracy     : {accuracy:.4f}")
print(f"AUC          : {auc:.4f}")
print(f"F1-score     : {f1:.4f} ‚Üë")
print(f"Brier Score  : {brier:.4f} ‚Üì")
print(f"Log Loss     : {logloss:.4f} ‚Üì")

ÎûúÎç§ Ìè¨Î†àÏä§Ìä∏ Î™®Îç∏ ÌõàÎ†® ÏãúÏûë...
ÌõàÎ†® ÏôÑÎ£å!

--- ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ÏÖã ÏÑ±Îä• ÌèâÍ∞Ä ---
Accuracy     : 0.7391
AUC          : 0.7302
F1-score     : 0.4620 ‚Üë
Brier Score  : 0.1820 ‚Üì
Log Loss     : 0.5476 ‚Üì


### 4. Naive Bayes Î™®Îç∏

In [None]:
from sklearn.naive_bayes  import GaussianNB
nb_model = GaussianNB()

X_combined = np.concatenate([X_train, X_valid])
y_combined = np.concatenate([y_train, y_valid])

print("Naive Bayes Î™®Îç∏ ÌõàÎ†® ÏãúÏûë...")
nb_model.fit(X_combined, y_combined)

print("ÌõàÎ†® ÏôÑÎ£å!")
print("\n--- ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ÏÖã ÏÑ±Îä• ÌèâÍ∞Ä ---")
y_pred = nb_model.predict_proba(X_test)[:, 1]
y_pred_label = nb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_label)
auc = roc_auc_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred_label)
brier = brier_score_loss(y_test, y_pred)
logloss = log_loss(y_test, y_pred)

print(f"Accuracy     : {accuracy:.4f}")
print(f"AUC          : {auc:.4f}")
print(f"F1-score     : {f1:.4f} ‚Üë")
print(f"Brier Score  : {brier:.4f} ‚Üì")
print(f"Log Loss     : {logloss:.4f} ‚Üì")



Naive Bayes Î™®Îç∏ ÌõàÎ†® ÏãúÏûë...
ÌõàÎ†® ÏôÑÎ£å!

--- ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ÏÖã ÏÑ±Îä• ÌèâÍ∞Ä ---
Accuracy     : 0.6946
AUC          : 0.6776
F1-score     : 0.4632 ‚Üë
Brier Score  : 0.2908 ‚Üì
Log Loss     : 3.9860 ‚Üì


### 5. KNN

In [None]:
param_grid = {
    'n_neighbors': [3, 5, 7, 11],
    'weights': ['uniform', 'distance'],
    'leaf_size': [10,30,50],
    'metric': ['euclidean', 'manhattan', 'cosine']
}

knn = KNeighborsClassifier(n_jobs=-1)

grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid.fit(X_combined, y_combined)

print("Best Params:", grid.best_params_)
print("Best AUC Score:", grid.best_score_)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best Params: {'leaf_size': 10, 'metric': 'manhattan', 'n_neighbors': 11, 'weights': 'distance'}
Best AUC Score: 0.601947266511837


In [None]:
from sklearn.neighbors import KNeighborsClassifier
# K=5Îäî Í∏∞Î≥∏Í∞í, ÎÇòÏ§ëÏóê ÌäúÎãù Í∞ÄÎä•
knn_model = KNeighborsClassifier(
    n_neighbors=11,
    metric='manhattan',
    leaf_size=30,
    n_jobs=-1,
    weights='distance'
)

X_combined = np.concatenate([X_train, X_valid])
y_combined = np.concatenate([y_train, y_valid])
print("KNN Î™®Îç∏ ÌõàÎ†® ÏãúÏûë...")
knn_model.fit(X_combined, y_combined)
print("ÌõàÎ†® ÏôÑÎ£å!")
print("\n--- ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ÏÖã ÏÑ±Îä• ÌèâÍ∞Ä ---")

y_pred = knn_model.predict_proba(X_test)[:, 1]
y_pred_label = knn_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_label)
auc = roc_auc_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred_label)
brier = brier_score_loss(y_test, y_pred)
logloss = log_loss(y_test, y_pred)

print(f"Accuracy     : {accuracy:.4f}")
print(f"AUC          : {auc:.4f}")
print(f"F1-score     : {f1:.4f} ‚Üë")
print(f"Brier Score  : {brier:.4f} ‚Üì")
print(f"Log Loss     : {logloss:.4f} ‚Üì")


KNN Î™®Îç∏ ÌõàÎ†® ÏãúÏûë...
ÌõàÎ†® ÏôÑÎ£å!

--- ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ÏÖã ÏÑ±Îä• ÌèâÍ∞Ä ---
Accuracy     : 0.7426
AUC          : 0.6574
F1-score     : 0.2966 ‚Üë
Brier Score  : 0.1860 ‚Üì
Log Loss     : 0.8366 ‚Üì


### 6. MLP Classifier

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
X_combined = np.concatenate([X_train, X_valid])
y_combined = np.concatenate([y_train, y_valid])
param_grid = {
    'hidden_layer_sizes': [(64,), (128,), (64, 32), (128, 64)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [1e-4, 1e-3, 1e-2],
    'learning_rate_init': [0.001, 0.01]
}
mlp = MLPClassifier()

grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_combined, y_combined)
print("Best Params:", grid_search.best_params_)
print("Best AUC Score:", grid_search.best_score_)


Fitting 3 folds for each of 96 candidates, totalling 288 fits
Best Params: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (128, 64), 'learning_rate_init': 0.01, 'solver': 'sgd'}
Best AUC Score: 0.6424974886117175


In [None]:
mlp = MLPClassifier(
    hidden_layer_sizes= (128, 64),
    activation= 'relu',
    alpha=0.0001,
    learning_rate_init= 0.01,
    solver='sgd',
    random_state=42,
    early_stopping=True,
)
X_combined = np.concatenate([X_train, X_valid])
y_combined = np.concatenate([y_train, y_valid])
print("MLPClassifier Î™®Îç∏ ÌõàÎ†® ÏãúÏûë...")
mlp.fit(X_combined, y_combined)
print("ÌõàÎ†® ÏôÑÎ£å!")
print("\n--- ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ÏÖã ÏÑ±Îä• ÌèâÍ∞Ä ---")

y_pred = mlp.predict_proba(X_test)[:, 1]
y_pred_label = mlp.predict(X_test)
from sklearn.metrics import (
    accuracy_score, roc_auc_score, f1_score,
    brier_score_loss, log_loss
)
accuracy = accuracy_score(y_test, y_pred_label)
auc = roc_auc_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred_label)
brier = brier_score_loss(y_test, y_pred)
logloss = log_loss(y_test, y_pred)

print(f"Accuracy     : {accuracy:.4f}")
print(f"AUC          : {auc:.4f}")
print(f"F1-score     : {f1:.4f} ‚Üë")
print(f"Brier Score  : {brier:.4f} ‚Üì")
print(f"Log Loss     : {logloss:.4f} ‚Üì")


MLPClassifier Î™®Îç∏ ÌõàÎ†® ÏãúÏûë...
ÌõàÎ†® ÏôÑÎ£å!

--- ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ÏÖã ÏÑ±Îä• ÌèâÍ∞Ä ---
Accuracy     : 0.7110
AUC          : 0.6687
F1-score     : 0.3234 ‚Üë
Brier Score  : 0.1978 ‚Üì
Log Loss     : 0.5981 ‚Üì
