In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

# Purpose: sweep blend weights between the Exp 26 and Exp 40 out-of-fold (OOF)
# predictions and print the local AUC of each rank-averaged blend.

# 1. Build the reference index (same cleaning rule that m7 / Exp 40 used).
train_df = pd.read_csv('../../data/raw/train.csv')
a_cols = [name for name in train_df.columns if name.startswith('Q') and name.endswith('A')]

# Keep rows with familysize <= 50 whose Q*A answers are not all identical
# (row-wise standard deviation strictly positive).
family_ok = train_df['familysize'].le(50)
answers_vary = train_df[a_cols].std(axis=1).gt(0)
m7_mask = family_ok & answers_vary
m7_indices = train_df.loc[m7_mask].index  # original row labels (45,490 rows per the author's notes)

# Ground-truth labels for the kept rows.
# `2 - voted` turns 1 -> 1.0 and 2 -> 0.0 (assumes `voted` is coded 1/2 — TODO confirm).
voted_kept = train_df.loc[m7_indices, 'voted'].to_numpy()
train_y = np.subtract(2, voted_kept).astype(np.float32)

# 2. Load the OOF arrays and force alignment by attaching the reference index.
# Exp 26: per the author's notes it already has 45,490 values; the index is
# attached only to make the alignment explicit.
oof26_raw = np.load('./oof_data/exp26_m1_m7_RankBlend_AUC_0.77364.npy').ravel()
oof26_series = pd.Series(oof26_raw, index=m7_indices)

# Exp 40: per the author's notes the file holds 46,628 values and was produced
# after a reset_index, so only the leading len(m7_indices) values are kept and
# labelled with m7_indices.
# NOTE(review): this assumes the leading values are the cleaned training rows in
# the same order as m7_indices — confirm against the Exp 40 notebook.
oof40_raw = np.load('./oof_data/exp40_pure_pl_5pct_AUC_0.77376.npy').ravel()
n_kept = len(m7_indices)
oof40_series = pd.Series(oof40_raw[:n_kept], index=m7_indices)

# 3. Convert both prediction vectors to percentile ranks.
rank26, rank40 = (series.rank(pct=True) for series in (oof26_series, oof40_series))

# 4. Weight simulation: `w` is the Exp 26 share, `1 - w` the Exp 40 share.
header = f"{'Weight (26:40)':<15} | {'Local Ensemble AUC':<15}"
print(header)
print('-' * 40)
for w in (0.9, 0.85, 0.8, 0.75, 0.7, 0.6, 0.5):
    blended = w * rank26 + (1 - w) * rank40
    auc = roc_auc_score(train_y, blended)
    print(f"{w:.2f} : {1-w:.2f}         | {auc:.5f}")

Weight (26:40)  | Local Ensemble AUC
----------------------------------------
0.90 : 0.10         | 0.77380
0.85 : 0.15         | 0.77387
0.80 : 0.20         | 0.77392
0.75 : 0.25         | 0.77397
0.70 : 0.30         | 0.77401
0.60 : 0.40         | 0.77408
0.50 : 0.50         | 0.77411


In [4]:
import pandas as pd
import numpy as np

# Purpose: blend the Exp 26 and Exp 40 submissions in rank space (8:2) and
# write the result as the Exp 42 submission file.

# 1. Load files
SUB_PATH = './submissions/'
DATA_PATH = '../../data/raw/'

# Blend weights (Exp 26 : Exp 40). Only the ordering of the blended score is
# used further down, so the weights do not have to sum to exactly 1.
W_EXP26 = 0.8
W_EXP40 = 0.2

# Exp 26: current main SOTA (0.78151) / sub SOTA (0.78296)
exp26 = pd.read_csv(f'{SUB_PATH}26_m1_m7_Rank_Ensemble_0.77364.csv')
# Exp 40: new local SOTA (0.77376)
exp40 = pd.read_csv(f'{SUB_PATH}40_Pure_PL_5pct_AUC_0.77376.csv')
submission = pd.read_csv(f'{DATA_PATH}sample_submission.csv')

# The blend is row-wise, so all three files must describe the same rows.
# NOTE(review): only the row count is checked here; identical row ORDER across
# the files is assumed — confirm if the files carry an id column.
if not (len(exp26) == len(exp40) == len(submission)):
    raise ValueError(
        'Row count mismatch: '
        f'exp26={len(exp26)}, exp40={len(exp40)}, submission={len(submission)}'
    )

# 2. Rank scaling
rank26 = exp26['voted'].rank(pct=True)
rank40 = exp40['voted'].rank(pct=True)

# 3. 8:2 weighted blend (noise suppression and stability)
final_rank = (rank26 * W_EXP26) + (rank40 * W_EXP40)
if final_rank.isna().any():
    raise ValueError('Blended rank contains NaN; check the "voted" columns of the inputs.')

# 4. Map the blended ordering back onto Exp 26's prediction distribution and save.
# The previous version indexed with floor(final_rank * (n - 1)), which pushed
# several distinct blended ranks into the same slot and so introduced ties the
# ensemble did not have. Using the ordinal position of each blended rank
# (method='min' keeps genuine ties tied) preserves the ordering exactly.
sorted_exp26 = np.sort(exp26['voted'].to_numpy())
positions = final_rank.rank(method='min').astype(int).to_numpy() - 1
submission['voted'] = sorted_exp26[positions]

output_name = f'{SUB_PATH}42_Master_Ensemble_82.csv'
submission.to_csv(output_name, index=False)
print(f'Exp 42: Master Ensemble (8:2) Created -> {output_name}')

Exp 42: Master Ensemble (8:2) Created -> ./submissions/42_Master_Ensemble_82.csv
