In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

# 1. Paths (relative to the notebook's working directory)
DATA_PATH = "../../data/raw/"
OOF_PATH = "./oof_data/"
SUB_PATH = "./submissions/"

# 2. Labels and row alignment
train_df = pd.read_csv(f"{DATA_PATH}train.csv")
# m7 row filter: familysize <= 50 AND the Q*A answer columns are not all identical
# (row-wise std across those columns must be > 0).
m7_mask = (train_df.familysize <= 50)
a_cols = [col for col in train_df.columns if col.endswith('A') and col.startswith('Q')]
m7_mask &= (train_df[a_cols].std(axis=1) > 0)
m7_indices = train_df[m7_mask].index
# train_y: 1 = Voted, 0 = Not Voted (same mapping as m1), obtained as 2 - voted.
train_y = (2 - train_df.loc[m7_indices, 'voted'].to_numpy()).astype(np.float32)

# 3. Load OOF predictions. Both were diagnosed as "larger = Voted", so they are
#    used without flipping.
m1_oof_raw = np.load(f"{OOF_PATH}exp11_sota_AUC_0.77212.npy")
# m1's OOF array is positionally indexed by the familysize <= 50 rows.
m1_indices = train_df[train_df.familysize <= 50].index
if len(m1_oof_raw) != len(m1_indices):
    raise ValueError(
        f"m1 OOF has {len(m1_oof_raw)} rows but the familysize<=50 filter "
        f"selects {len(m1_indices)} rows"
    )
m1_oof_df = pd.DataFrame(m1_oof_raw, index=m1_indices, columns=['val'])
# Restrict m1 to the (smaller) m7 row set so both models cover identical rows.
m1_oof = m1_oof_df.reindex(m7_indices)['val'].values

# ravel() so an (n, 1)-shaped array is accepted as well as a flat one.
m7_oof = np.ravel(np.load(f"{OOF_PATH}exp25_m7_Refined_AUC_0.77375.npy"))
if len(m7_oof) != len(m7_indices):
    raise ValueError(
        f"m7 OOF has {len(m7_oof)} rows but the m7 filter selects "
        f"{len(m7_indices)} rows"
    )

# 4. Rank averaging (forward direction: larger value = voter)
m1_rank = pd.Series(m1_oof).rank(pct=True).values
m7_rank = pd.Series(m7_oof).rank(pct=True).values

# Correlation check (expected to be positive when both models share a direction).
# NOTE: this is computed on the percentile ranks, not on the raw scores, even
# though the printed label below says "(Raw)".
correlation = np.corrcoef(m1_rank, m7_rank)[0, 1]

# 5. Ensemble and local validation
ensemble_rank = (m1_rank + m7_rank) / 2
# A high rank (close to 1) means Voted, so the AUC is computed on it directly.
final_auc = roc_auc_score(train_y, ensemble_rank)

print("-" * 50)
print(f"Correlation between m1 and m7 (Raw): {correlation:.4f}")
print(f"Final Local AUC: {final_auc:.5f}")
print("-" * 50)

# 6. Test-set ensemble (accounting for the orientation of m1's submission file)
m1_test = pd.read_csv(f"{SUB_PATH}0130-1923.csv")['voted'] # Small = Voted (1.1~1.9)
m7_test = pd.read_csv(f"{SUB_PATH}m7_Refined_test_preds.csv")['voted'] # Large = Voted (0.9~0.1)
submission = pd.read_csv(f"{DATA_PATH}sample_submission.csv")

# The three files are combined purely by row position, so they must agree in length.
if not (len(m1_test) == len(m7_test) == len(submission)):
    raise ValueError(
        "Test prediction row counts differ: "
        f"m1={len(m1_test)}, m7={len(m7_test)}, sample_submission={len(submission)}"
    )

# Flip m1_test back to the forward direction (Large = Voted) before combining with m7.
m1_test_fixed = 2.0 - m1_test
m1_t_rank = m1_test_fixed.rank(pct=True).values
m7_t_rank = m7_test.rank(pct=True).values

# There are no test labels, so the orientation assumptions above cannot be
# verified through AUC. Apply the same sanity check used for the OOF
# predictions: two models of similar quality must rank the test rows in a
# positively correlated way. A non-positive (or NaN) correlation means one file
# is oriented differently than assumed, and averaging would cancel the signal.
test_correlation = np.corrcoef(m1_t_rank, m7_t_rank)[0, 1]
if not test_correlation > 0:
    raise ValueError(
        f"Test rank correlation between m1 and m7 is {test_correlation:.4f}; "
        "the prediction directions look inconsistent, refusing to write a submission"
    )

final_test_rank = (m1_t_rank + m7_t_rank) / 2

# Final submission follows m1's convention (Small = Voted): store 2.0 - rank.
# 2.0 - 0.99 (high rank) = 1.01 (voter)
# 2.0 - 0.01 (low rank)  = 1.99 (non-voter)
submission['voted'] = 2.0 - final_test_rank

output_name = f"{SUB_PATH}26_m1_m7_Rank_Ensemble_{final_auc:.5f}.csv"
submission.to_csv(output_name, index=False)
# The blended OOF ranks are saved in the forward direction (larger = Voted).
np.save(f"{OOF_PATH}exp26_m1_m7_RankBlend_AUC_{final_auc:.5f}.npy", ensemble_rank)

print(f"OOF Saved: exp26_m1_m7_RankBlend_AUC_{final_auc:.5f}.npy")
print(f"Submission Created: {output_name}")

--------------------------------------------------
Correlation between m1 and m7 (Raw): 0.9852
Final Local AUC: 0.77364
--------------------------------------------------
OOF Saved: exp26_m1_m7_RankBlend_AUC_0.77364.npy
Submission Created: ./submissions/26_m1_m7_Rank_Ensemble_0.77364.csv


In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

# 1. Load the labels, restricted to the rows kept by the m7 filter
DATA_PATH = "../../data/raw/"
train_df = pd.read_csv(f"{DATA_PATH}train.csv")
a_cols = [name for name in train_df.columns if name.startswith('Q') and name.endswith('A')]
# Keep rows with familysize <= 50 whose Q*A answers are not all identical.
m7_mask = (train_df.familysize <= 50) & (train_df[a_cols].std(axis=1) > 0)
m7_indices = train_df[m7_mask].index
# train_y: 1 = Voted, 0 = Not Voted
voted_raw = train_df.loc[m7_indices, 'voted'].to_numpy()
train_y = (2 - voted_raw).astype(np.float32)

# 2. Load the OOF predictions and align them to the m7 rows
m1_oof_raw = np.load("./oof_data/exp11_sota_AUC_0.77212.npy")
# m1's OOF is indexed by the familysize <= 50 rows; narrow it to the m7 rows.
m1_indices = train_df[train_df.familysize <= 50].index
m1_oof_frame = pd.DataFrame(m1_oof_raw, index=m1_indices)
m1_oof = m1_oof_frame.reindex(m7_indices).values.flatten()

m7_oof = np.load("./oof_data/exp25_m7_Refined_AUC_0.77375.npy").flatten()

# 3. Direction check: an AUC above 0.5 means "larger value = Voted"
auc_m1 = roc_auc_score(train_y, m1_oof)
auc_m7 = roc_auc_score(train_y, m7_oof)

print("-" * 50)
print(f"m1 원본 OOF AUC (정방향): {auc_m1:.5f}")
print(f"m7 원본 OOF AUC (정방향): {auc_m7:.5f}")
print("-" * 50)

# Report the verdict per model; exactly 0.5 falls into the "smaller = Voted" branch.
print("m1: 큰 값이 투표(Voted)" if auc_m1 > 0.5 else "m1: 작은 값이 투표(Voted)")
print("m7: 큰 값이 투표(Voted)" if auc_m7 > 0.5 else "m7: 작은 값이 투표(Voted)")
print("-" * 50)

--------------------------------------------------
m1 원본 OOF AUC (정방향): 0.77196
m7 원본 OOF AUC (정방향): 0.77375
--------------------------------------------------
m1: 큰 값이 투표(Voted)
m7: 큰 값이 투표(Voted)
--------------------------------------------------
