In [2]:
import pandas as pd
import numpy as np
import optuna
from sklearn.metrics import roc_auc_score

DATA_PATH = "../../data/raw/"
OOF_PATH = "./oof_data/"

# 1. 데이터 로드
df = pd.read_csv(DATA_PATH + "train.csv")

# 2. 인덱스 동기화 작업 (중요!)
# m1, m2, m4 OOF가 생성될 때 이미 'familysize > 50'이 제거된 상태라면 (길이 45529)
# 우리 데이터프레임도 똑같이 맞춰야 합니다.
df = df[df['familysize'] <= 50].reset_index(drop=True) 

# 이제 df의 길이는 45529가 되었고, 인덱스는 0~45528이 됩니다.
# 여기서 m5의 추가 조건인 'straight-lining' 필터링을 수행합니다.
a_cols = [col for col in df.columns if col.endswith('A') and col.startswith('Q')]
mask = df[a_cols].std(axis=1) > 0
valid_indices = df[mask].index # 이제 이 인덱스는 45529 미만의 값만 가집니다.

# 정답(y) 추출
train_y = (2 - df['voted'].to_numpy()).astype(np.float32)
train_y_filtered = train_y[valid_indices]

# 3. 모델 OOF 로드 및 필터링
m1_oof = np.load(OOF_PATH + "exp11_sota_AUC_0.77212.npy")[valid_indices]
m2_oof = np.load(OOF_PATH + "exp12_auc_AUC_0.77233.npy")[valid_indices]
m4_oof = np.load(OOF_PATH + "exp15_snn_AUC_0.77330.npy")[valid_indices]
m5_oof = np.load(OOF_PATH + "exp20_m5_FE_AUC_0.77338.npy") # m5는 이미 45490 정도의 길이일 것임

# 4. Optuna 목적 함수 정의
def objective(trial):
    w1 = trial.suggest_float('w1', 0, 1)
    w2 = trial.suggest_float('w2', 0, 1)
    w4 = trial.suggest_float('w4', 0, 1)
    w5 = trial.suggest_float('w5', 0, 1)
    
    total = w1 + w2 + w4 + w5
    w1 /= total; w2 /= total; w4 /= total; w5 /= total
    
    ensemble_pred = (w1 * m1_oof) + (w2 * m2_oof) + (w4 * m4_oof) + (w5 * m5_oof)
    return roc_auc_score(train_y_filtered, ensemble_pred)

# 5. 최적화 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

print("="*50)
print(f"최적 가중치")
for key, value in study.best_params.items():
    print(f"{key}: {value:.4f}")
print(f"최적화된 로컬 AUC: {study.best_value:.5f}")
print("="*50)

[I 2026-02-01 22:59:00,230] A new study created in memory with name: no-name-2b99cdbd-c202-4da8-b6d8-fb42bc129448
[I 2026-02-01 22:59:00,244] Trial 0 finished with value: 0.7733034364786288 and parameters: {'w1': 0.7772411469509901, 'w2': 0.9724779117222011, 'w4': 0.08899270622273692, 'w5': 0.42788317846809454}. Best is trial 0 with value: 0.7733034364786288.
[I 2026-02-01 22:59:00,256] Trial 1 finished with value: 0.7732437892547516 and parameters: {'w1': 0.526225165530029, 'w2': 0.8693571158985949, 'w4': 0.16960591742069464, 'w5': 0.21564999578444655}. Best is trial 0 with value: 0.7733034364786288.
[I 2026-02-01 22:59:00,268] Trial 2 finished with value: 0.7735664922800477 and parameters: {'w1': 0.8137908507093575, 'w2': 0.7332137514576836, 'w4': 0.4657057478873199, 'w5': 0.4002929431805361}. Best is trial 2 with value: 0.7735664922800477.
[I 2026-02-01 22:59:00,280] Trial 3 finished with value: 0.7733964992520141 and parameters: {'w1': 0.5689638210119665, 'w2': 0.37419804779157007,

최적 가중치
w1: 0.0210
w2: 0.0913
w4: 0.9788
w5: 0.5170
최적화된 로컬 AUC: 0.77437
