In [1]:
import os
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

import lightgbm as lgb
from xgboost import XGBClassifier

In [2]:
# origin code
N_FFT = 10
def extract_features(fp):
    df = pd.read_csv(fp, sep=' ', header=None,
                     names=['Ax','Ay','Az','Gx','Gy','Gz'],
                     dtype=float, on_bad_lines='skip').dropna()
    
    feats = {}
    for col in df.columns:
        v = df[col].values
        feats[f'{col}_mean']     = v.mean()
        feats[f'{col}_std']      = v.std()
        feats[f'{col}_max']      = v.max()
        feats[f'{col}_min']      = v.min()
        feats[f'{col}_range']    = v.max() - v.min()
        feats[f'{col}_skew']     = skew(v)
        feats[f'{col}_kurtosis'] = kurtosis(v)
        feats[f'{col}_zcross']   = np.sum(v[:-1]*v[1:] < 0)
        feats[f'{col}_rms']      = np.sqrt(np.mean(v**2)) # 新增特徵

    mag = np.sqrt(df['Ax']**2 + df['Ay']**2 + df['Az']**2)
    feats['acc_mag_mean'] = mag.mean()
    feats['acc_mag_std']  = mag.std()

    for col in df.columns:
        signal = df[col].values
        ps = np.abs(np.fft.rfft(df[col].values))**2
        ps_norm = ps / np.sum(ps + 1e-8)  # 避免除以零
        for i in range(1, N_FFT+1):
            feats[f'{col}_fft{i}'] = ps[i]
            feats[f'{col}_psd{i}'] = ps[i] / len(signal)  # PSD：功率密度，新增特徵
        feats[f'{col}_entropy'] = -np.sum(ps_norm * np.log(ps_norm + 1e-8))  # Spectral Entropy，新增特徵
    return feats

1. 每筆txt檔有27次揮拍。
2. 利用 np.linspace(0, len(df), 28) 切割出 27 段 index
3. 每段套用 extract_features()，回傳一個 list of dict
4. 對所有 dict 做特徵平均，回傳單一 dict（代表整份資料）

In [None]:
# change code (27feature average)
N_FFT = 10
def extract_features_segment_avg(fp, n_segment=27):
    df = pd.read_csv(fp, sep=' ', header=None,
                     names=['Ax','Ay','Az','Gx','Gy','Gz'],
                     dtype=float, on_bad_lines='skip').dropna()

    segment_indices = np.linspace(0, len(df), n_segment + 1, dtype=int)
    features_list = []

    for i in range(n_segment):
        start, end = segment_indices[i], segment_indices[i+1]
        if end - start < 10:  # 太短不計
            continue
        df_seg = df.iloc[start:end]
        feats = {}

        for col in df_seg.columns:
            v = df_seg[col].values
            feats[f'{col}_mean']     = v.mean()
            feats[f'{col}_std']      = v.std()
            feats[f'{col}_max']      = v.max()
            feats[f'{col}_min']      = v.min()
            feats[f'{col}_range']    = v.max() - v.min()
            feats[f'{col}_skew']     = skew(v)
            feats[f'{col}_kurtosis'] = kurtosis(v)
            feats[f'{col}_zcross']   = np.sum(v[:-1]*v[1:] < 0)
            feats[f'{col}_rms']      = np.sqrt(np.mean(v**2))
        
        mag = np.sqrt(df_seg['Ax']**2 + df_seg['Ay']**2 + df_seg['Az']**2)
        feats['acc_mag_mean'] = mag.mean()
        feats['acc_mag_std']  = mag.std()

        for col in df_seg.columns:
            signal = df_seg[col].values
            ps = np.abs(np.fft.rfft(signal))**2
            ps_norm = ps / np.sum(ps + 1e-8)
            for j in range(1, N_FFT + 1):
                if j < len(ps):
                    feats[f'{col}_fft{j}'] = ps[j]
                    feats[f'{col}_psd{j}'] = ps[j] / len(signal)
                else:
                    feats[f'{col}_fft{j}'] = 0.0
                    feats[f'{col}_psd{j}'] = 0.0
            feats[f'{col}_entropy'] = -np.sum(ps_norm * np.log(ps_norm + 1e-8))

        features_list.append(feats)

    # 對 27 段特徵取平均
    df_feats = pd.DataFrame(features_list)
    avg_feats = df_feats.mean(axis=0).to_dict()
    return avg_feats

In [None]:
# change code (27feature)
N_FFT = 5
def extract_features_segments(fp, n_segment=27):
    df = pd.read_csv(fp, sep=' ', header=None,
                     names=['Ax','Ay','Az','Gx','Gy','Gz'],
                     dtype=float, on_bad_lines='skip').dropna()

    segment_indices = np.linspace(0, len(df), n_segment + 1, dtype=int)
    features_list = []

    for i in range(n_segment):
        start, end = segment_indices[i], segment_indices[i+1]
        if end - start < 10:
            continue  # 若片段太短則略過
        df_seg = df.iloc[start:end]
        feats = {}

        for col in df_seg.columns:
            v = df_seg[col].values
            feats[f'{col}_mean']     = v.mean()
            feats[f'{col}_std']      = v.std()
            feats[f'{col}_max']      = v.max()
            feats[f'{col}_min']      = v.min()
            feats[f'{col}_range']    = v.max() - v.min()
            feats[f'{col}_skew']     = skew(v)
            feats[f'{col}_kurtosis'] = kurtosis(v)
            feats[f'{col}_zcross']   = np.sum(v[:-1]*v[1:] < 0)
            feats[f'{col}_rms']      = np.sqrt(np.mean(v**2))

        mag = np.sqrt(df_seg['Ax']**2 + df_seg['Ay']**2 + df_seg['Az']**2)
        feats['acc_mag_mean'] = mag.mean()
        feats['acc_mag_std']  = mag.std()

        for col in df_seg.columns:
            signal = df_seg[col].values
            ps = np.abs(np.fft.rfft(signal))**2
            ps_norm = ps / np.sum(ps + 1e-8)
            for j in range(1, N_FFT + 1):
                if j < len(ps):
                    feats[f'{col}_fft{j}'] = ps[j]
                    feats[f'{col}_psd{j}'] = ps[j] / len(signal)
                else:
                    feats[f'{col}_fft{j}'] = 0.0
                    feats[f'{col}_psd{j}'] = 0.0
            feats[f'{col}_entropy'] = -np.sum(ps_norm * np.log(ps_norm + 1e-8))

        features_list.append(feats)

    return features_list  # ✅ 直接回傳 27 筆（或部分）segment 特徵


In [3]:
# 先在全域讀取 test_info，避免 NameError
test_info = pd.read_csv(os.path.join('./data/39_Test_Dataset/test_info.csv'))

In [4]:
# origin code
info = pd.read_csv(os.path.join('./data/39_Training_Dataset/train_info.csv'))
X_list, y_gender, y_hand, y_years, y_level = [], [], [], [], []
for _, r in info.iterrows():
    fp = os.path.join('./data/39_Training_Dataset/train_data', f"{r['unique_id']}.txt")

    X_list.append(extract_features(fp)) # origin code
    # X_list.append(extract_features_segment_avg(fp)) # change code (27feature avg)
    y_gender.append(1 if r['gender']==1 else 0)
    y_hand.append(1 if r['hold racket handed']==1 else 0)
    y_years.append(r['play years'])
    y_level.append(r['level'] - 2)
X_train = pd.DataFrame(X_list)

In [None]:
# change code (27feature train)
# 定義 flatten 函數（可放在 utils.py 或 notebook 前面）
def flatten_train_segments(info_csv_path, data_folder, extract_fn):
    info = pd.read_csv(info_csv_path)
    X_list, y_gender, y_hand, y_years, y_level = [], [], [], [], []

    for _, r in info.iterrows():
        fp = os.path.join(data_folder, f"{r['unique_id']}.txt")
        segments = extract_fn(fp)
        for seg in segments:
            X_list.append(seg)
            y_gender.append(r['gender'])  # ✅ 男生=1, 女生=0
            y_hand.append(r['hold racket handed'])  # ✅ 右手=1, 左手=0
            y_years.append(r['play years'])
            y_level.append(r['level'] - 2)

    X_train = pd.DataFrame(X_list)
    return X_train, y_gender, y_hand, y_years, y_level

X_train, y_gender, y_hand, y_years, y_level = flatten_train_segments(
    info_csv_path='./data/39_Training_Dataset/train_info.csv',
    data_folder='./data/39_Training_Dataset/train_data',
    extract_fn=extract_features_segments
)

In [None]:
# change code (change parameter & learning rate decay)
def cv_predict(X, y, params, objective, num_class=None):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # （此處改用全域預先讀好的 test_info）
    X_test = pd.DataFrame(
        # [extract_features(os.path.join('./data/39_Test_Dataset/test_data',f"{r['unique_id']}.txt")) for _, r in test_info.iterrows()], # origin code
        [extract_features_segment_avg(os.path.join('./data/39_Test_Dataset/test_data',f"{r['unique_id']}.txt")) for _, r in test_info.iterrows()], # change code(27feature avg)
        index=test_info['unique_id']
    )
    test_pred = (np.zeros((len(X_test), num_class))
                 if num_class else np.zeros(len(X_test)))

    aucs = []
    for fold, (ti, vi) in enumerate(skf.split(X, y)):
        X_tr, X_va = X.iloc[ti], X.iloc[vi]
        y_tr, y_va = np.array(y)[ti], np.array(y)[vi]

        clf = lgb.LGBMClassifier(**params)
        clf.set_params(objective=objective)
        if num_class:
            clf.set_params(num_class=num_class)
        # 根據任務選擇 eval_metric
        eval_metric = 'auc' if num_class is None else 'multi_logloss'

        learning_rates = lambda iter: 0.07 * (0.99 ** iter)

        clf.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric=eval_metric,
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(period=10),
                lgb.reset_parameter(learning_rate=learning_rates)
            ]
        )

        va_pred = clf.predict_proba(X_va)
        fold_auc = (roc_auc_score(y_va, va_pred[:,1])
                    if num_class is None
                    else roc_auc_score(y_va, va_pred,
                                       multi_class='ovr', average='micro'))
        print(f"Fold{fold} AUC = {fold_auc:.6f}")
        aucs.append(fold_auc)

        if num_class:
            test_pred += clf.predict_proba(X_test) / skf.n_splits
        else:
            test_pred += clf.predict_proba(X_test)[:,1] / skf.n_splits

    print(f"Mean CV AUC = {np.mean(aucs):.6f}\n")
    return test_pred

params = {
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 1.0,                # L1 正則化（加強稀疏性）
    'lambda_l2': 1.0,                # L2 正則化（防止權重爆炸）
    'verbose': -1
}


# 執行四個任務的 CV
pred_gender = cv_predict(X_train, y_gender, params,   'binary',    None)

params = {
    'n_estimators': 2000,
    'learning_rate': 0.07,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 1.0,                # L1 正則化（加強稀疏性）
    'lambda_l2': 1.0,                # L2 正則化（防止權重爆炸）
    'verbose': -1
}  

pred_years  = cv_predict(X_train, y_years,  params,   'multiclass',3)
pred_level  = cv_predict(X_train, y_level,  params,   'multiclass',4)

params = {
    'n_estimators': 1000,            # 最多迭代 1000 棵樹
    'learning_rate': 0.07,           # 初始學習率（會使用動態調整）
    'num_leaves': 31,                # 每棵樹的最大葉節點數
    'max_depth': 7,                  # 每棵樹的最大深度（控制模型複雜度）
    'feature_fraction': 0.8,         # 每棵樹隨機選用的特徵比例
    'bagging_fraction': 0.8,         # 每棵樹隨機使用的資料比例
    'bagging_freq': 5,               # 每5次 boosting 做一次 row sampling
    'lambda_l1': 1.0,                # L1 正則化（加強稀疏性）
    'lambda_l2': 1.0,                # L2 正則化（防止權重爆炸）
    'verbose': -1                    # 關閉訓練輸出
}

print(f"learning_rate: {params['learning_rate']}")
print(f"num_leaves: {params['num_leaves']}")
print(f"max_depth: {params['max_depth']}")


pred_hand   = cv_predict(X_train, y_hand,   params,   'binary',    None)

# params = {
#     'n_estimators': 1000,            # 最多迭代 1000 棵樹
#     'learning_rate': 0.07,           # 初始學習率（會使用動態調整）
#     'num_leaves': 63,                # 每棵樹的最大葉節點數
#     'max_depth': 0,                  # 每棵樹的最大深度（控制模型複雜度）
#     'feature_fraction': 0.8,         # 每棵樹隨機選用的特徵比例
#     'bagging_fraction': 0.8,         # 每棵樹隨機使用的資料比例
#     'bagging_freq': 5,               # 每5次 boosting 做一次 row sampling
#     'lambda_l1': 1.0,                # L1 正則化（加強稀疏性）
#     'lambda_l2': 1.0,                # L2 正則化（防止權重爆炸）
#     'verbose': -1                    # 關閉訓練輸出
# }

# print(f"learning_rate: {params['learning_rate']}")
# print(f"num_leaves: {params['num_leaves']}")
# print(f"max_depth: {params['max_depth']}")

# pred_years  = cv_predict(X_train, y_years,  params,   'multiclass',3)

# params = {
#     'n_estimators': 1000,            # 最多迭代 1000 棵樹
#     'learning_rate': 0.07,           # 初始學習率（會使用動態調整）
#     'num_leaves': 63,                # 每棵樹的最大葉節點數
#     'max_depth': 0,                  # 每棵樹的最大深度（控制模型複雜度）
#     'feature_fraction': 0.8,         # 每棵樹隨機選用的特徵比例
#     'bagging_fraction': 0.8,         # 每棵樹隨機使用的資料比例
#     'bagging_freq': 5,               # 每5次 boosting 做一次 row sampling
#     'lambda_l1': 1.0,                # L1 正則化（加強稀疏性）
#     'lambda_l2': 1.0,                # L2 正則化（防止權重爆炸）
#     'verbose': -1                    # 關閉訓練輸出
# }

# print(f"learning_rate: {params['learning_rate']}")
# print(f"num_leaves: {params['num_leaves']}")
# print(f"max_depth: {params['max_depth']}")

# pred_level  = cv_predict(X_train, y_level,  params,   'multiclass',4)

In [None]:
# origin code
def cv_predict(X, y, params, objective, num_class=None):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # （此處改用全域預先讀好的 test_info）
    X_test = pd.DataFrame(
        [extract_features(os.path.join('./data/39_Test_Dataset/test_data',f"{r['unique_id']}.txt"))for _, r in test_info.iterrows()], # origin code
        # [extract_features_segment_avg(os.path.join('./data/39_Test_Dataset/test_data',f"{r['unique_id']}.txt")) for _, r in test_info.iterrows()], # change code(27feature avg)
        index=test_info['unique_id']
    )
    test_pred = (np.zeros((len(X_test), num_class))
                 if num_class else np.zeros(len(X_test)))

    aucs = []
    for fold, (ti, vi) in enumerate(skf.split(X, y)):
        X_tr, X_va = X.iloc[ti], X.iloc[vi]
        y_tr, y_va = np.array(y)[ti], np.array(y)[vi]

        clf = lgb.LGBMClassifier(**params)

        clf.set_params(objective=objective)
        if num_class:
            clf.set_params(num_class=num_class)
        # 根據任務選擇 eval_metric
        eval_metric = 'auc' if num_class is None else 'multi_logloss'
        
        clf.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric=eval_metric,
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(period=0)
            ]
        )


        va_pred = clf.predict_proba(X_va)
        fold_auc = (roc_auc_score(y_va, va_pred[:,1])
                    if num_class is None
                    else roc_auc_score(y_va, va_pred,
                                       multi_class='ovr', average='micro'))
        print(f"Fold{fold} AUC = {fold_auc:.6f}")
        aucs.append(fold_auc)

        if num_class:
            test_pred += clf.predict_proba(X_test) / skf.n_splits
        else:
            test_pred += clf.predict_proba(X_test)[:,1] / skf.n_splits


    print(f"Mean CV AUC = {np.mean(aucs):.6f}\n")
    return test_pred

In [None]:
# change code (27feature test)
def cv_predict(X, y, params, objective, num_class=None, test_info=None, test_folder='./data/39_Test_Dataset/test_data'):
    assert test_info is not None, "請提供 test_info"

    # 型別安全轉換
    X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

    # 預先紀錄測試預測
    X_test_list = []
    test_id_list = []
    skipped_ids = []

    # for _, r in test_info.iterrows():
    for _, r in tqdm(test_info.iterrows(), total=len(test_info)):
        uid = r['unique_id']
        path = os.path.join(test_folder, f"{uid}.txt")
        try:
            segs = extract_features_segments(path)
        except:
            segs = []
        if not segs:
            skipped_ids.append(uid)
            continue
        X_test_list.extend(segs)
        test_id_list.extend([uid] * len(segs))

    X_test = pd.DataFrame(X_test_list).apply(pd.to_numeric, errors='coerce').fillna(0)

    # 對齊欄位
    common_cols = [col for col in X.columns if col in X_test.columns]
    if len(common_cols) == 0:
        raise ValueError("❌ 訓練與測試無共同特徵")
    X = X[common_cols]
    X_test = X_test[common_cols]

    # 建立模型
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs, test_pred_all = [], []

    for fold, (ti, vi) in enumerate(skf.split(X, y)):
        X_tr, X_va = X.iloc[ti], X.iloc[vi]
        y_tr, y_va = np.array(y)[ti], np.array(y)[vi]

        clf = lgb.LGBMClassifier(**params)
        clf.set_params(objective=objective)
        if num_class: clf.set_params(num_class=num_class)

        clf.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric='auc' if num_class is None else 'multi_logloss',
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(period=0)
            ]
        )

        va_pred = clf.predict_proba(X_va)
        if num_class is None:
            auc = roc_auc_score(y_va, va_pred[:, 1])
        else:
            auc = roc_auc_score(y_va, va_pred, multi_class='ovr', average='micro')
        aucs.append(auc)
        print(f"Fold{fold} AUC = {auc:.6f}")

        test_pred_all.append(clf.predict_proba(X_test))

    # 平均所有 fold 的測試預測
    test_pred_avg = sum(test_pred_all) / len(test_pred_all)
    pred_df = pd.DataFrame(test_pred_avg)
    pred_df['unique_id'] = test_id_list

    # 根據 unique_id 群組統整每份檔案的預測
    final_pred_dict = {}

    # if num_class:
    #     for uid, group in pred_df.groupby('unique_id'):
    #         arr = group.drop(columns='unique_id').values
    #         top_class = np.argmax(np.bincount(np.argmax(arr, axis=1)))
    #         best_idx = np.argmax(arr[:, top_class])
    #         final_pred_dict[uid] = arr[best_idx]
    # else:
    #     for uid, group in pred_df.groupby('unique_id'):
    #         probs = group[0].values
    #         votes = (probs > 0.5).astype(int)
    #         maj_class = np.bincount(votes).argmax()
    #         best_idx = np.argmax(probs) if maj_class == 1 else np.argmin(probs)
    #         final_pred_dict[uid] = probs[best_idx]

    if num_class:
        for uid, group in pred_df.groupby('unique_id'):
            arr = group.drop(columns='unique_id').values
            final_pred_dict[uid] = arr.mean(axis=0)
    else:
        for uid, group in pred_df.groupby('unique_id'):
            probs = group[0].values
            final_pred_dict[uid] = probs.mean()


    # 遍歷 test_info 建立對應預測值，缺失檔案補預設值
    # pred_id = test_info['unique_id'].tolist()
    # if num_class:
    #     default_pred = [1.0 / num_class] * num_class
    #     final_pred = np.array([
    #         final_pred_dict.get(uid, default_pred)
    #         for uid in pred_id
    #     ])
    # else:
    #     final_pred = np.array([
    #         final_pred_dict.get(uid, 0.0)
    #         for uid in pred_id
    #     ])

    pred_id = test_info['unique_id'].tolist()

    if num_class:
        # 建立補值向量 = 所有預測值的平均（shape: [num_class]）
        all_preds = np.array(list(final_pred_dict.values()))
        avg_pred = all_preds.mean(axis=0)  # 各類平均機率向量

        final_pred = np.array([
            final_pred_dict.get(uid, avg_pred)
            for uid in pred_id
        ])
    else:
        # 建立補值 scalar = 所有預測的平均機率
        all_probs = np.array(list(final_pred_dict.values()))
        avg_prob = all_probs.mean()

        final_pred = np.array([
            final_pred_dict.get(uid, avg_prob)
            for uid in pred_id
        ])


    print(f"✅ 完成預測，共 {len(final_pred)} 筆，平均 AUC: {np.mean(aucs):.6f}")
    return final_pred, pred_id


In [None]:
params = { 
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 1.0,                # L1 正則化（加強稀疏性）
    'lambda_l2': 1.0,                # L2 正則化（防止權重爆炸）
    'verbose': -1
}

# origin code
# 執行四個任務的 CV
pred_gender = cv_predict(X_train, y_gender, params,   'binary',    None)
pred_hand   = cv_predict(X_train, y_hand,   params,   'binary',    None)
pred_years  = cv_predict(X_train, y_years,  params,   'multiclass',3)
pred_level  = cv_predict(X_train, y_level,  params,   'multiclass',4)

# # change code (27feature test)
# # 注意：test_info 是你之前已經用 pd.read_csv() 讀入的測試 meta 資訊
# pred_gender, pred_id       = cv_predict(X_train, y_gender, params, 'binary', None, test_info=test_info)
# pred_hand, _               = cv_predict(X_train, y_hand,   params, 'binary', None, test_info=test_info)
# pred_years, _              = cv_predict(X_train, y_years,  params, 'multiclass', 3, test_info=test_info)
# pred_level, _              = cv_predict(X_train, y_level,  params, 'multiclass', 4, test_info=test_info)

TypeError: XGBClassifier.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [13]:
# XGboost
def cv_predict(X, y, params, objective, num_class=None):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # 準備測試資料
    X_test = pd.DataFrame(
        [extract_features(os.path.join('./data/39_Test_Dataset/test_data', f"{r['unique_id']}.txt"))
         for _, r in test_info.iterrows()],
        index=test_info['unique_id']
    )
    test_pred = (np.zeros((len(X_test), num_class)) if num_class else np.zeros(len(X_test)))

    aucs = []

    for fold, (ti, vi) in enumerate(skf.split(X, y)):
        X_tr, X_va = X.iloc[ti], X.iloc[vi]
        y_tr, y_va = np.array(y)[ti], np.array(y)[vi]

        # 建立分類器
        clf = XGBClassifier(**params)

        # 多分類設定
        if objective == 'multiclass':
            clf.set_params(objective='multi:softprob', num_class=num_class)
            eval_metric = 'mlogloss'
        else:
            clf.set_params(objective='binary:logistic')
            eval_metric = 'auc'

        # 訓練模型（相容 XGBoost 3.0.2）
        clf.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)]
        )

        # 預測與 AUC 計算
        va_pred = clf.predict_proba(X_va)
        fold_auc = (roc_auc_score(y_va, va_pred[:, 1])
                    if num_class is None
                    else roc_auc_score(y_va, va_pred, multi_class='ovr', average='micro'))
        print(f"Fold{fold} AUC = {fold_auc:.6f}")
        aucs.append(fold_auc)

        # 測試預測
        test_prob = clf.predict_proba(X_test)
        if num_class:
            test_pred += test_prob / skf.n_splits
        else:
            test_pred += test_prob[:, 1] / skf.n_splits

    print(f"Mean CV AUC = {np.mean(aucs):.6f}\n")
    return test_pred

params = { 
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'max_depth': 7,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1.0,
    'reg_lambda': 1.0,
    'use_label_encoder': False,
    'verbosity': 0
}

pred_gender = cv_predict(X_train, y_gender, params,   'binary',    None)
pred_hand   = cv_predict(X_train, y_hand,   params,   'binary',    None)
pred_years  = cv_predict(X_train, y_years,  params,   'multiclass',3)
pred_level  = cv_predict(X_train, y_level,  params,   'multiclass',4)


[0]	validation_0-logloss:0.42541
[1]	validation_0-logloss:0.40261
[2]	validation_0-logloss:0.38068
[3]	validation_0-logloss:0.36134
[4]	validation_0-logloss:0.34484
[5]	validation_0-logloss:0.33224
[6]	validation_0-logloss:0.32092
[7]	validation_0-logloss:0.30782
[8]	validation_0-logloss:0.29240
[9]	validation_0-logloss:0.27795
[10]	validation_0-logloss:0.26623
[11]	validation_0-logloss:0.25499
[12]	validation_0-logloss:0.24554
[13]	validation_0-logloss:0.23587
[14]	validation_0-logloss:0.22644
[15]	validation_0-logloss:0.21811
[16]	validation_0-logloss:0.21073
[17]	validation_0-logloss:0.20337
[18]	validation_0-logloss:0.19642
[19]	validation_0-logloss:0.19007
[20]	validation_0-logloss:0.18403
[21]	validation_0-logloss:0.17765
[22]	validation_0-logloss:0.17201
[23]	validation_0-logloss:0.16602
[24]	validation_0-logloss:0.16117
[25]	validation_0-logloss:0.15691
[26]	validation_0-logloss:0.15148
[27]	validation_0-logloss:0.14706
[28]	validation_0-logloss:0.14312
[29]	validation_0-loglos

In [14]:
sub = pd.DataFrame({
    'unique_id':           test_info['unique_id'],
    'gender':              pred_gender,
    'hold racket handed':  pred_hand,
    # 'gender':              1 - pred_gender,
    # 'hold racket handed':  1 - pred_hand,
    'play years_0':        pred_years[:,0],
    'play years_1':        pred_years[:,1],
    'play years_2':        pred_years[:,2],
    'level_2':             pred_level[:,0],
    'level_3':             pred_level[:,1],
    'level_4':             pred_level[:,2],
    'level_5':             pred_level[:,3],
})
sub.to_csv("./output/XGboost_submission1.csv", index=False, float_format="%.6f")
print("✅ 已完成 submission.csv 輸出")

✅ 已完成 submission.csv 輸出
