In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load the full labelled dataset.
# NOTE(review): the path is relative to the notebook's working directory.
train = pd.read_parquet('상관계수 적용 all_data.parquet')

# Step 1: hold out 20% of the rows for validation. The split is stratified on
# 'Segment' and seeded so it is reproducible across runs.
train_data, valid_data = train_test_split(
    train, test_size=0.2, random_state=42, stratify=train['Segment']
)


def _split_features_target(frame):
    """Split a labelled frame into model inputs and the 'Segment' target.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'ID', 'Segment' and '기준년월'.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``frame`` without the three columns above, and its 'Segment' column.
    """
    features = frame.drop(columns=['ID', 'Segment', '기준년월'])
    target = frame['Segment']
    return features, target


def _fit_segment_model(features, target):
    """Fit a CatBoostClassifier on one segment group.

    Every object-dtype column of ``features`` is passed to CatBoost as a
    categorical feature.

    Returns
    -------
    (CatBoostClassifier, list)
        The fitted model and the list of categorical column names it was
        given.
    """
    cat_features = features.select_dtypes(include='object').columns.tolist()
    # task_type="GPU" is hard-coded here; NOTE(review): presumably this cell
    # needs a CUDA-capable device to run — confirm before running elsewhere.
    model = CatBoostClassifier(verbose=0, random_state=42, task_type="GPU")
    model.fit(features, target, cat_features=cat_features)
    return model, cat_features


# Step 2: partition the training rows into the A/B group and the C/D/E group.
ab_train = train_data[train_data['Segment'].isin(['A', 'B'])].copy()
cde_train = train_data[train_data['Segment'].isin(['C', 'D', 'E'])].copy()

# Step 3: train one classifier per group through the shared helpers, so both
# groups are guaranteed to use the same preprocessing and hyper-parameters.

# A/B group
X_ab, y_ab = _split_features_target(ab_train)
model_ab, cat_features_ab = _fit_segment_model(X_ab, y_ab)

# C/D/E group
X_cde, y_cde = _split_features_target(cde_train)
model_cde, cat_features_cde = _fit_segment_model(X_cde, y_cde)

In [None]:
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import pandas as pd

# 1. Split the validation hold-out into model inputs and ground-truth labels,
#    dropping the same non-feature columns that are dropped before training.
X_valid = valid_data.drop(columns=['ID', 'Segment', '기준년월'])
y_true = valid_data['Segment'].values

# 2. Names of the object-dtype (categorical) columns in the validation inputs.
#    select_dtypes(include='object') only returns columns that already have
#    object dtype, so re-casting them with astype('object') was a no-op and
#    has been dropped.
cat_features_valid = X_valid.select_dtypes(include='object').columns.tolist()

# 3. Class probabilities from the A/B model (one column per class it was fit on).
proba_ab = pd.DataFrame(model_ab.predict_proba(X_valid), columns=model_ab.classes_)

# 4. Class probabilities from the C/D/E model (one column per class it was fit on).
proba_cde = pd.DataFrame(model_cde.predict_proba(X_valid), columns=model_cde.classes_)

# 5. Assemble one probability matrix covering every class A-E.
#    It is initialised with 0.0 (float) rather than 0 (int): writing float
#    probabilities into integer columns is an incompatible-dtype assignment,
#    which recent pandas versions warn about and plan to reject.
all_classes = ['A', 'B', 'C', 'D', 'E']
proba_combined = pd.DataFrame(0.0, index=X_valid.index, columns=all_classes)

# proba_ab / proba_cde carry a fresh 0..n-1 index while proba_combined is
# indexed like X_valid, so the columns are copied positionally (as raw arrays)
# instead of letting pandas align on index labels.
for proba in (proba_ab, proba_cde):
    for col in proba.columns:
        proba_combined[col] = proba[col].to_numpy()

# NOTE(review): each model's probabilities sum to 1 within its own group
# (A/B or C/D/E), so they are not on a common scale. The best A/B score is
# always >= 0.5 while the best C/D/E score can be as low as 1/3, which biases
# the argmax below towards A/B. Consider weighting each group's probabilities
# by a separate "AB vs CDE" gate model before combining.

# 6. Final prediction: the class with the highest combined score, as a plain
#    string array.
y_pred = proba_combined.idxmax(axis=1).values.astype(str)

In [None]:
# 7. Report overall validation performance: accuracy first, then the
#    per-class precision / recall / F1 table from scikit-learn.
overall_accuracy = accuracy_score(y_true, y_pred)
report_text = classification_report(y_true, y_pred)

print("✅ 전체 정확도:", overall_accuracy)
print("\n📊 Classification Report:")
print(report_text)

In [None]:
from sklearn.metrics import f1_score

# Micro-averaged F1 over all validation rows. For single-label multiclass
# predictions this coincides with overall accuracy.
micro_f1 = f1_score(
    y_true=y_true,
    y_pred=y_pred,
    average='micro',
)
print(f"\n🔎 Micro F1 Score: {micro_f1:.4f}")

In [None]:
# 8. Per-class accuracy: among the rows whose true label is `label`, the
#    fraction that were also predicted as `label`.
print("\n🎯 클래스별 정확도:")
for label in sorted(np.unique(y_true)):
    # Boolean array over just the rows of this true class; True = predicted correctly.
    hits = y_pred[y_true == label] == label
    acc = hits.sum() / hits.size
    print(f" - {label}: {acc:.4f} ({acc * 100:.2f}%)")