In [None]:
import pandas as pd
import random
import os
import numpy as np

# Mount Google Drive so the CSV files under MyDrive can be read from Colab.
from google.colab import drive
drive.mount('/content/drive')

# Locations of the train / test CSV files on the mounted drive.
train_path = '/content/drive/MyDrive/블루문/신용카드 세그먼트/card_train.csv'
test_path = '/content/drive/MyDrive/블루문/신용카드 세그먼트/card_test.csv'

train_df = pd.read_csv(train_path)
train_df.shape

# Original note: "time-series data". Print every column name, one per line.
for column_name in train_df.columns:
  print(column_name)

# Target is the 'Segment' column; features are everything except the four listed columns.
Y = train_df['Segment']
X = train_df.drop(['ID', 'Unnamed: 0.1', 'Segment.1', 'Segment'], axis=1)

금융 정보 & 연체/잔액 관련
신용 위험 & 금융 습관 분석

In [None]:
import pandas as pd
# 1) File locations
train_path = '/content/drive/MyDrive/블루문/신용카드 세그먼트/card_train.csv'
test_path  = '/content/drive/MyDrive/블루문/신용카드 세그먼트/card_test.csv'

# 2) Load both splits
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

# 3) Substrings that mark a column as "financial info / delinquency / balance" related
keywords = ['한도', '이자율', 'RV약정청구율', 'RV최소결제비율', '잔액', '평잔', '변동률', '연체', 'RP']

# 4) Train columns whose name contains at least one keyword
financial_train = [name for name in train.columns if any(kw in name for kw in keywords)]

# 5) Keep only those that also exist in test (guards against a train/test column mismatch)
financial_common = [name for name in financial_train if name in test.columns]

print(f"매칭된 공통 컬럼 수: {len(financial_common)}개")
print(financial_common)

In [None]:
import pandas as pd

# 1. Load data
train_path = '/content/drive/MyDrive/블루문/신용카드 세그먼트/card_train.csv'
test_path = '/content/drive/MyDrive/블루문/신용카드 세그먼트/card_test.csv'
train = pd.read_csv(train_path)
# Header-only read (nrows=0): only the test column names are needed here, not the rows.
test_columns = set(pd.read_csv(test_path, nrows=0).columns)

# 2. Select columns by keyword.
#    The list is restricted to columns that also exist in test, so that rebuilding
#    `financial_common` here does not drop the train/test intersection that the
#    feature matrix built from this list is supposed to respect.
keywords = ['한도','이자율','RV약정청구율','RV최소결제비율','잔액','평잔','변동률','연체','RP']
financial_common = [
    col for col in train.columns
    if any(k in col for k in keywords) and col in test_columns
]

# 3. Missing-value count and percentage per selected column
missing_count = train[financial_common].isna().sum()
missing_pct = (missing_count / len(train)) * 100

# 4. Collect into one frame, most-missing columns first
missing_df = pd.DataFrame({
    'missing_count': missing_count,
    'missing_pct(%)': missing_pct.round(2)
}).sort_values(by='missing_pct(%)', ascending=False)

# 5. Report
print(f"총 샘플 수: {len(train)}")
print(missing_df.head(20))


In [None]:
# Derive a binary flag from 연체일자_B0M: 1 when a value is present, 0 when it is missing.
train['is_overdue_B0M'] = train['연체일자_B0M'].notna().astype(int)
print(train['is_overdue_B0M'].value_counts())

# Feature frame = keyword-filtered columns plus the derived overdue flag (copied, not a view).
x_train = train[[*financial_common, 'is_overdue_B0M']].copy()

print(x_train.shape)
x_train.head()

In [None]:
# Count columns per dtype to see how many object (string) columns x_train contains.
x_train.dtypes.value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

# 1. Collect the object-dtype columns of the feature frame
object_cols = x_train.select_dtypes(include='object').columns
print(f"인코딩 대상 컬럼: {list(object_cols)}")

# 2. Label-encode each of them in place.
#    The same encoder instance is refit per column, so only the last column's
#    mapping remains on `le` afterwards.
le = LabelEncoder()
for object_col in object_cols:
    # Cast to str first so NaN becomes the literal string 'nan' instead of breaking the encoder.
    as_text = x_train[object_col].astype(str)
    x_train[object_col] = le.fit_transform(as_text)

print(x_train[object_cols].head())

# Multi-class target taken straight from the training frame
y_train = train['Segment']
print(y_train.value_counts())


train set 훈련/검증 분리

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 1. Binarise the target: segment 'E' -> 1, every other segment -> 0
y_binary = (y_train == 'E').astype(int)

# 2. Stratified split, 20% held out for validation
X_tr, X_val, y_tr, y_val = train_test_split(
    x_train,
    y_binary,
    stratify=y_binary,
    test_size=0.2,
    random_state=42,
)

# 3. Random forest with balanced class weights to counter class imbalance
clf = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1)
clf.fit(X_tr, y_tr)

# 4. Predict on the validation split
y_pred = clf.predict(X_val)

# 5. Report precision/recall/F1 and the confusion matrix
print(" [Classification Report]")
print(classification_report(y_val, y_pred, target_names=['Non-E', 'E']))

print("\n [Confusion Matrix]")
print(confusion_matrix(y_val, y_pred))


2단계 멀티클래스 분류 모델 학습 (A/B/C/D)
결과: A·B 클래스는 예측에 실패했고, C·D는 클래스 불균형(D가 다수)으로 인해 예측이 D 쪽으로 편향됨

In [None]:
# Stage-1 label (E vs. not E); recomputed here, not used further in this cell
y_binary = (y_train == 'E').astype(int)

# Keep only the rows whose segment is not 'E' (the Non-E side of stage 1)
non_e_mask = y_train != 'E'
x_train_non_e, y_train_non_e = x_train[non_e_mask], y_train[non_e_mask]  # features, original labels

# Stage 2: multi-class classification on the Non-E rows
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Stratified 80/20 split
X2_tr, X2_val, y2_tr, y2_val = train_test_split(
    x_train_non_e,
    y_train_non_e,
    stratify=y_train_non_e,
    test_size=0.2,
    random_state=42,
)

# Random forest with balanced class weights
clf_multi = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1)
clf_multi.fit(X2_tr, y2_tr)

# Evaluate on the validation split
y2_pred = clf_multi.predict(X2_val)

print("[2단계 Classification Report]")
print(classification_report(y2_val, y2_pred))
print("[2단계 Confusion Matrix]")
print(confusion_matrix(y2_val, y2_pred))

a/b 를 ab로 병합

학습/검증 세트 분리 (d,c,ab)
모델 학습 및 평가 (randomforest)

In [None]:
# Merge segments A and B into a single 'AB' class
y_train_non_e_3class = y_train_non_e.replace(dict.fromkeys(('A', 'B'), 'AB'))
print(y_train_non_e_3class.value_counts())

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Stratified 80/20 split on the 3-class target
X3_tr, X3_val, y3_tr, y3_val = train_test_split(
    x_train_non_e,
    y_train_non_e_3class,
    stratify=y_train_non_e_3class,
    test_size=0.2,
    random_state=42,
)

# Random forest with balanced class weights
clf_3class = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1)

# Fit
clf_3class.fit(X3_tr, y3_tr)

# Predict on the validation split
y3_pred = clf_3class.predict(X3_val)

# Evaluate
print("[3분류 Classification Report]")
print(classification_report(y3_val, y3_pred))

print("\n[3분류 Confusion Matrix]")
print(confusion_matrix(y3_val, y3_pred))



LightGBM으로 모델교체

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# 1. Inputs: the Non-E rows (x_train_non_e) and the target with A/B already merged
#    (y_train_non_e_3class)

# 2. Stratified 80/20 train/validation split
X3_tr, X3_val, y3_tr, y3_val = train_test_split(
    x_train_non_e,
    y_train_non_e_3class,
    stratify=y_train_non_e_3class,
    test_size=0.2,
    random_state=42,
)

# 3. LightGBM classifier; class_weight='balanced' addresses the class imbalance
lgbm = LGBMClassifier(class_weight='balanced', n_estimators=200, max_depth=7, random_state=42, n_jobs=-1)

# 4. Fit
lgbm.fit(X3_tr, y3_tr)

# 5. Predict on the validation split
y3_pred_lgbm = lgbm.predict(X3_val)

# 6. Evaluate
print("[3-Class Classification Report - LightGBM]")
print(classification_report(y3_val, y3_pred_lgbm))

print("\n[Confusion Matrix - LightGBM]")
print(confusion_matrix(y3_val, y3_pred_lgbm))

XGBoost로 모델교체

In [None]:
from sklearn.preprocessing import LabelEncoder

# 1. Encode the string target as integers (original note: AB/C/D -> 0/1/2)
le_y3 = LabelEncoder().fit(y3_tr)
y3_tr_encoded = le_y3.transform(y3_tr)
y3_val_encoded = le_y3.transform(y3_val)

# 2. Train the XGBoost model
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(
    objective='multi:softmax', num_class=3, eval_metric='mlogloss',
    use_label_encoder=False, random_state=42, n_jobs=-1,
)

xgb_clf.fit(X3_tr, y3_tr_encoded)

# 3. Predict integer class ids on the validation split
y3_pred_xgb = xgb_clf.predict(X3_val)

# 4. Map the integer ids back to the original string labels
y3_pred_decoded = le_y3.inverse_transform(y3_pred_xgb)

# 5. Evaluate against the string labels
from sklearn.metrics import classification_report, confusion_matrix

print("[3-Class Classification Report - XGBoost]")
print(classification_report(y3_val, y3_pred_decoded))

print("\n[Confusion Matrix - XGBoost]")
print(confusion_matrix(y3_val, y3_pred_decoded))

In [None]:
# Install the fonts-nanum package (quiet mode, auto-confirm); the next cell loads NanumGothic from it.
!apt-get -qq install -y fonts-nanum

In [None]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib as mpl

# Path of the NanumGothic font file installed by the fonts-nanum package
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'

# Register the file with matplotlib's font manager. A font installed after matplotlib
# built its font cache is otherwise unknown by name, and setting `font.family` to it
# falls back to the default font (Korean glyphs render as boxes) until the runtime restarts.
fm.fontManager.addfont(font_path)
font_name = fm.FontProperties(fname=font_path).get_name()

# Global settings: use the Korean font and keep the minus sign renderable
plt.rcParams['font.family'] = font_name
plt.rcParams['axes.unicode_minus'] = False

# Smoke test: a small plot with Korean title and axis labels
plt.figure(figsize=(6, 4))
plt.title('한글 폰트 테스트')
plt.plot([1, 2, 3], [1, 4, 9])
plt.xlabel('X축')
plt.ylabel('Y축')
plt.grid(True)
plt.show()

RF모델의 FI분석

In [None]:
# Install the fonts-nanum package (quiet mode, auto-confirm).
# NOTE(review): the same command already appears in an earlier cell; this looks redundant — confirm.
!apt-get -qq install -y fonts-nanum
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns

# Plot settings: Korean-capable font, renderable minus sign
plt.rcParams.update({'font.family': 'NanumGothic', 'axes.unicode_minus': False})

# Pair each training column with its importance from the 3-class random forest
feature_names = X3_tr.columns
importances = clf_3class.feature_importances_

# Keep the 30 most important features, highest first
fi_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
    .head(30)
)

# Horizontal bar chart of the top 30
plt.figure(figsize=(10, 6))
sns.barplot(data=fi_df, x='Importance', y='Feature')
plt.title('Top 30 Feature Importances (RandomForest)')
plt.tight_layout()
plt.show()

# Numbered list of the same features
print("주요 변수 상위 30개:")
for rank, name in enumerate(fi_df['Feature'].values, start=1):
    print(f"{rank}. {name}")

LightGBM의 FI분석

In [None]:
# 1. Pair each training column with its LightGBM importance
lgb_feature_names = X3_tr.columns
lgb_importances = lgbm.feature_importances_

# 2. Keep the 30 most important features, highest first
lgb_fi_df = (
    pd.DataFrame({'Feature': lgb_feature_names, 'Importance': lgb_importances})
    .sort_values(by='Importance', ascending=False)
    .head(30)
)

# 3. Numbered list of the top features
print("LightGBM 중요 변수 Top 30 리스트:")
for rank, name in enumerate(lgb_fi_df['Feature'].values, start=1):
    print(f"{rank}. {name}")

# 4. Horizontal bar chart
plt.figure(figsize=(10, 6))
sns.barplot(data=lgb_fi_df, x='Importance', y='Feature')
plt.title('Top 30 Feature Importances (LightGBM)')
plt.tight_layout()
plt.show()

XGBoost의 FI분석

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# 1. Pair each training column with its XGBoost importance
xgb_feature_names = X3_tr.columns
xgb_importances = xgb_clf.feature_importances_

# 2. Keep the 30 most important features, highest first
xgb_fi_df = (
    pd.DataFrame({'Feature': xgb_feature_names, 'Importance': xgb_importances})
    .sort_values(by='Importance', ascending=False)
    .head(30)
)

# 3. Numbered list of the top features
print("XGBoost 중요 변수 Top 30 리스트:")
for rank, name in enumerate(xgb_fi_df['Feature'].values, start=1):
    print(f"{rank}. {name}")

# 4. Horizontal bar chart
plt.figure(figsize=(10, 6))
sns.barplot(data=xgb_fi_df, x='Importance', y='Feature')
plt.title('Top 30 Feature Importances (XGBoost)')
plt.tight_layout()
plt.show()

3가지 모델의 FI 분석

In [None]:
# 1. Top-30 feature lists of the three models
rf_top_features = fi_df['Feature'].head(30).tolist()
xgb_top_features = xgb_fi_df['Feature'].head(30).tolist()
lgb_top_features = lgb_fi_df['Feature'].head(30).tolist()

rf_set, xgb_set, lgb_set = set(rf_top_features), set(xgb_top_features), set(lgb_top_features)

# 2. Shared features. Results are sorted because the iteration order of a set of
#    strings changes between kernel sessions, which made the printed lists
#    non-reproducible. Parentheses make the "in both, but not in the third" intent explicit.
common_all = sorted(rf_set & xgb_set & lgb_set)
common_rf_xgb = sorted((rf_set & xgb_set) - lgb_set)
common_rf_lgb = sorted((rf_set & lgb_set) - xgb_set)
common_xgb_lgb = sorted((xgb_set & lgb_set) - rf_set)

# 3. Features that appear in exactly one model's top 30
rf_only = sorted(rf_set - xgb_set - lgb_set)
xgb_only = sorted(xgb_set - rf_set - lgb_set)
lgb_only = sorted(lgb_set - rf_set - xgb_set)

# 4. Print every group as a numbered list under its heading
report_sections = [
    ("🔗 [공통 중요 변수 - RF, XGB, LGB]", common_all),
    ("\n🔗 [공통 중요 변수 - RF & XGB]", common_rf_xgb),
    ("\n🔗 [공통 중요 변수 - RF & LGB]", common_rf_lgb),
    ("\n🔗 [공통 중요 변수 - XGB & LGB]", common_xgb_lgb),
    ("\n❗ [RF 단독 중요 변수]", rf_only),
    ("\n❗ [XGB 단독 중요 변수]", xgb_only),
    ("\n❗ [LGB 단독 중요 변수]", lgb_only),
]
for heading, names in report_sections:
    print(heading)
    for i, feat in enumerate(names, 1):
        print(f"{i}. {feat}")

top30 변수 리스트


In [None]:
# Top-30 feature names per model, in importance order
rf_top30 = fi_df['Feature'].head(30).tolist()
xgb_top30 = xgb_fi_df['Feature'].head(30).tolist()
lgb_top30 = lgb_fi_df['Feature'].head(30).tolist()

# Pick the 30 names that occur most often across the three lists
from collections import Counter

# Concatenate the lists and count occurrences of each name
all_top_features = [*rf_top30, *xgb_top30, *lgb_top30]
feature_counts = Counter(all_top_features)
top_30_features = [name for name, _count in feature_counts.most_common(30)]

3가지 모델 FI분석 - 공통 Top 30 변수로 RF 재학습 및 평가

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd

# Keep only the selected features that really exist in the training frame
top_30_features_valid = [name for name in top_30_features if name in X3_tr.columns]

# Restrict both splits to those columns
X3_tr_top30, X3_val_top30 = X3_tr[top_30_features_valid], X3_val[top_30_features_valid]

# Random forest on the reduced feature set.
# Note: this rebinds `rf_top30`, which an earlier cell used for the list of RF top features.
rf_top30 = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1)
rf_top30.fit(X3_tr_top30, y3_tr)

# Predict on the validation split
y_pred_top30 = rf_top30.predict(X3_val_top30)

# Classification report as a DataFrame (one row per class / aggregate)
report_top30 = classification_report(y3_val, y_pred_top30, output_dict=True)
report_df = pd.DataFrame(report_top30).transpose()

# Show all metrics, rounded to three decimals
print("[Top 30 Feature 기반 RandomForest 성능 평가]")
display(report_df.round(3))

# Two headline numbers for the bar chart; in the transposed report the accuracy
# value is read from the 'precision' column of the 'accuracy' row
scores = {
    'accuracy': report_df.loc['accuracy', 'precision'],
    'f1-score': report_df.loc['weighted avg', 'f1-score'],
}
score_df = pd.DataFrame(list(scores.items()), columns=['Metric', 'Score'])

# Bar chart of accuracy and weighted F1
plt.figure(figsize=(6, 4))
sns.barplot(data=score_df, x='Metric', y='Score', palette='Blues_d')
plt.ylim(0, 1)
plt.title('Top 30 Feature 기반 RandomForest 성능')
plt.tight_layout()
plt.show()

3가지 모델 FI분석 - 공통 Top 30 변수로 XGBoost 재학습 및 평가

In [None]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Encode the string target as integers for XGBoost
le = LabelEncoder().fit(y3_tr)
y3_tr_enc = le.transform(y3_tr)
y3_val_enc = le.transform(y3_val)

# Keep only the selected features that really exist, and slice both splits
top_30_features_valid = [name for name in top_30_features if name in X3_tr.columns]
X3_tr_top30, X3_val_top30 = X3_tr[top_30_features_valid], X3_val[top_30_features_valid]

# XGBoost on the reduced feature set.
# Note: this rebinds `xgb_top30`, which an earlier cell used for the list of XGB top features.
xgb_top30 = XGBClassifier(
    objective='multi:softmax', num_class=3, use_label_encoder=False,
    eval_metric='mlogloss', random_state=42, n_jobs=-1,
)
xgb_top30.fit(X3_tr_top30, y3_tr_enc)

# Predict integer ids, then map back to the string labels
y_pred_top30_enc = xgb_top30.predict(X3_val_top30)
y_pred_top30 = le.inverse_transform(y_pred_top30_enc)

# Classification report as a DataFrame
report_top30 = classification_report(y3_val, y_pred_top30, output_dict=True)
report_df = pd.DataFrame(report_top30).transpose()

# Headline numbers for the bar chart (accuracy is read from the 'precision' column
# of the 'accuracy' row)
scores = {
    'accuracy': report_df.loc['accuracy', 'precision'],
    'f1-score': report_df.loc['weighted avg', 'f1-score'],
}
score_df = pd.DataFrame(list(scores.items()), columns=['Metric', 'Score'])

# Bar chart of accuracy and weighted F1
plt.figure(figsize=(6, 4))
sns.barplot(data=score_df, x='Metric', y='Score')
plt.ylim(0, 1)
plt.title('Top 30 Feature 기반 XGBoost 성능')
plt.tight_layout()
plt.show()

# Detailed per-class report
print("XGBoost 평가 상세 결과 (Top 30 Feature)")
print(report_df[['precision', 'recall', 'f1-score']].round(3))

SMOTE -> AB 데이터 증강

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Replace missing values with 0 on a COPY of the training features.
# `X3_tr` itself is deliberately not rebound: other cells fit models on `X3_tr`
# and predict on the unfilled `X3_val`, so overwriting it here would make train and
# validation preprocessing diverge and make this cell non-idempotent.
X3_tr_clean = X3_tr.fillna(0)

# Class distribution before resampling
print("Before SMOTE:", Counter(y3_tr))

# Oversample the minority classes with SMOTE (fixed seed for reproducibility)
smote = SMOTE(random_state=42)
X3_tr_smote, y3_tr_smote = smote.fit_resample(X3_tr_clean, y3_tr)

# Class distribution after resampling
print("After SMOTE:", Counter(y3_tr_smote))

SMOTE 후 RF

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

# 1. Replace missing values with 0 in both splits
X3_tr_clean = X3_tr.fillna(0)
X3_val_clean = X3_val.fillna(0)

# 2. Class distribution before resampling
print("Before SMOTE:", Counter(y3_tr))

# 3. Oversample the training split with SMOTE
smote = SMOTE(random_state=42)
X3_tr_smote, y3_tr_smote = smote.fit_resample(X3_tr_clean, y3_tr)

print("After SMOTE:", Counter(y3_tr_smote))

# 4. Fit the random forest on the resampled data
rf_smote = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1)
rf_smote.fit(X3_tr_smote, y3_tr_smote)

# 5. Align validation columns with the training column order
X3_val_aligned = X3_val_clean[X3_tr_smote.columns]

from sklearn.metrics import classification_report, confusion_matrix

# 6. Predict and evaluate
y3_pred_smote = rf_smote.predict(X3_val_aligned)

# Text classification report
print("\n[3-Class Classification Report - RandomForest]")
print(classification_report(y3_val, y3_pred_smote, digits=2))

# Confusion matrix, rows/columns ordered like the model's classes
cm = confusion_matrix(y3_val, y3_pred_smote, labels=rf_smote.classes_)
print("\n[Confusion Matrix - RandomForest]")
print("Labels:", rf_smote.classes_.tolist())
print(cm)

# 7. Accuracy / F1 chart.
#    The report frame is built from THIS model's predictions; previously the chart and
#    the detailed report reused a `report_df` left over from an earlier cell and therefore
#    showed another model's metrics.
report_df = pd.DataFrame(
    classification_report(y3_val, y3_pred_smote, output_dict=True)
).transpose()
score_dict = {
    # in the transposed report the accuracy value sits in the 'precision' column
    'accuracy': report_df.loc['accuracy']['precision'],
    'f1-score': report_df.loc['weighted avg']['f1-score']
}
score_df = pd.DataFrame(list(score_dict.items()), columns=['Metric', 'Score'])

plt.figure(figsize=(6, 4))
sns.barplot(data=score_df, x='Metric', y='Score')
plt.ylim(0, 1)
plt.title('RandomForest 성능 (SMOTE 적용 후)')
plt.tight_layout()
plt.show()

# 8. Detailed report
print("SMOTE 적용 후 RandomForest 상세 성능:")
print(report_df.round(3))

SMOTE 후 XGBoost

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

# 1. Replace missing values with 0 in both splits
X3_tr_clean, X3_val_clean = X3_tr.fillna(0), X3_val.fillna(0)

# 2. Class distribution before resampling
print("Before SMOTE:", Counter(y3_tr))

# 3. Oversample the training split with SMOTE
smote = SMOTE(random_state=42)
X3_tr_smote, y3_tr_smote = smote.fit_resample(X3_tr_clean, y3_tr)

print("After SMOTE:", Counter(y3_tr_smote))

# 4. Encode string labels as integers (encoder fitted on the resampled labels)
le = LabelEncoder().fit(y3_tr_smote)
y3_tr_smote_encoded = le.transform(y3_tr_smote)
y3_val_encoded = le.transform(y3_val)

# 5. Define and fit the XGBoost model
xgb_smote = XGBClassifier(
    objective='multi:softmax', num_class=3, eval_metric='mlogloss',
    random_state=42, n_jobs=-1,
)
xgb_smote.fit(X3_tr_smote, y3_tr_smote_encoded)

# 6. Align validation columns with the training column order
X3_val_fixed = X3_val_clean[X3_tr_smote.columns]

# 7. Predict integer ids and map them back to string labels
y3_pred_encoded = xgb_smote.predict(X3_val_fixed)
y3_pred = le.inverse_transform(y3_pred_encoded)

# 8. Classification report as a DataFrame
report_xgb = classification_report(y3_val, y3_pred, output_dict=True)
report_df_xgb = pd.DataFrame(report_xgb).transpose()

# 9. Accuracy / weighted-F1 bar chart (accuracy is read from the 'precision' column
#    of the 'accuracy' row)
score_df = pd.DataFrame({
    'Metric': ['accuracy', 'f1-score'],
    'Score': [
        report_df_xgb.loc['accuracy', 'precision'],
        report_df_xgb.loc['weighted avg', 'f1-score'],
    ],
})

plt.figure(figsize=(6, 4))
sns.barplot(data=score_df, x='Metric', y='Score')
plt.ylim(0, 1)
plt.title('SMOTE 기반 XGBoost 성능')
plt.tight_layout()
plt.show()

# 10. Text classification report
print("\n[3-Class Classification Report - XGBoost]")
print(classification_report(y3_val, y3_pred, digits=2))

# 11. Confusion matrix, ordered like the encoder's classes
cm = confusion_matrix(y3_val, y3_pred, labels=le.classes_)
print("\n[Confusion Matrix - XGBoost]")
print("Labels:", le.classes_.tolist())
print(cm)

In [None]:
pip install catboost

catboost로 진행

In [None]:
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from collections import Counter

# 1. Replace missing values with 0 in both splits
X3_tr_clean, X3_val_clean = X3_tr.fillna(0), X3_val.fillna(0)

# 2. Class distribution before resampling
print("Before SMOTE:", Counter(y3_tr))

# 3. Oversample the training split with SMOTE
smote = SMOTE(random_state=42)
X3_tr_smote, y3_tr_smote = smote.fit_resample(X3_tr_clean, y3_tr)

print("After SMOTE:", Counter(y3_tr_smote))

# 4. Encode string labels as integers (encoder fitted on the resampled labels)
le = LabelEncoder().fit(y3_tr_smote)
y3_tr_smote_encoded = le.transform(y3_tr_smote)
y3_val_encoded = le.transform(y3_val)

# 5. Define and fit the CatBoost model
cat_model = CatBoostClassifier(
    iterations=500, learning_rate=0.05, depth=6,
    loss_function='MultiClass', verbose=0, random_seed=42,
)
cat_model.fit(X3_tr_smote, y3_tr_smote_encoded)

# 6. Predict; the prediction array is flattened before decoding to string labels
y3_pred_encoded = cat_model.predict(X3_val_clean)
y3_pred = le.inverse_transform(y3_pred_encoded.flatten())

# 7. Classification report as a DataFrame
report_cat = classification_report(y3_val, y3_pred, output_dict=True)
report_df_cat = pd.DataFrame(report_cat).transpose()

# 8. Accuracy / weighted-F1 bar chart (accuracy is read from the 'precision' column
#    of the 'accuracy' row)
score_df = pd.DataFrame({
    'Metric': ['accuracy', 'f1-score'],
    'Score': [
        report_df_cat.loc['accuracy', 'precision'],
        report_df_cat.loc['weighted avg', 'f1-score'],
    ],
})

plt.figure(figsize=(6, 4))
sns.barplot(data=score_df, x='Metric', y='Score')
plt.ylim(0, 1)
plt.title('SMOTE 기반 CatBoost 성능')
plt.tight_layout()
plt.show()

# 9. Text classification report
print("\n[3-Class Classification Report - CatBoost]")
print(classification_report(y3_val, y3_pred, digits=2))

# 10. Confusion matrix, ordered like the encoder's classes
cm = confusion_matrix(y3_val, y3_pred, labels=le.classes_)
print("\n[Confusion Matrix - CatBoost]")
print("Labels:", le.classes_.tolist())
print(cm)

CatBoost 클래스 가중치 (smote X)

In [None]:
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
from collections import Counter

# 1. Replace missing values with 0 in both splits
X3_tr_clean, X3_val_clean = X3_tr.fillna(0), X3_val.fillna(0)

# 2. Encode string labels as integers (encoder fitted on the original training labels)
le = LabelEncoder().fit(y3_tr)
y3_tr_encoded = le.transform(y3_tr)
y3_val_encoded = le.transform(y3_val)

# 3. CatBoost with automatic balanced class weights (no SMOTE in this variant)
cat_model = CatBoostClassifier(
    iterations=500, learning_rate=0.05, depth=6,
    loss_function='MultiClass', auto_class_weights='Balanced',
    random_seed=42, verbose=0,
)

# 4. Fit
cat_model.fit(X3_tr_clean, y3_tr_encoded)

# 5. Predict; the prediction array is flattened before decoding to string labels
y3_pred_encoded = cat_model.predict(X3_val_clean)
y3_pred = le.inverse_transform(y3_pred_encoded.flatten())

# 6. Classification report as a DataFrame
report_cat = classification_report(y3_val, y3_pred, output_dict=True)
report_df_cat = pd.DataFrame(report_cat).transpose()

# 7. Per-class metrics
print("\n[3-Class Classification Report - CatBoost (Class Weight)]")
print(report_df_cat[['precision', 'recall', 'f1-score']].round(3))

# 8. Confusion matrix, ordered like the encoder's classes
cm = confusion_matrix(y3_val, y3_pred, labels=le.classes_)
print("\n[Confusion Matrix - CatBoost (Class Weight)]")
print(f"Labels: {list(le.classes_)}")
print(cm)

클래스 가중치 기반 XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
from collections import Counter

# 1. Encode string labels as integers (encoder fitted on the original training labels)
le = LabelEncoder().fit(y3_tr)
y3_tr_encoded = le.transform(y3_tr)
y3_val_encoded = le.transform(y3_val)

# 2. Per-sample weights derived from balanced class weights
sample_weight = compute_sample_weight(class_weight='balanced', y=y3_tr_encoded)

# 3. XGBoost model
xgb_weighted = XGBClassifier(
    objective='multi:softmax', num_class=3, eval_metric='mlogloss',
    use_label_encoder=False, random_state=42, n_jobs=-1,
)

# 4. Fit with the sample weights
xgb_weighted.fit(X3_tr, y3_tr_encoded, sample_weight=sample_weight)

# 5. Predict integer ids and map them back to string labels
y3_pred_encoded = xgb_weighted.predict(X3_val)
y3_pred = le.inverse_transform(y3_pred_encoded)

# 6. Classification report as a DataFrame
report = classification_report(y3_val, y3_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()

# 7. Per-class metrics
print("\n[3-Class Classification Report - XGBoost (Class Weight)]")
print(report_df[['precision', 'recall', 'f1-score']].round(3))

# 8. Confusion matrix, ordered like the encoder's classes
cm = confusion_matrix(y3_val, y3_pred, labels=le.classes_)
print("\n[Confusion Matrix - XGBoost (Class Weight)]")
print(f"Labels: {list(le.classes_)}")
print(cm)

SMOTE : XGBoost + RandomizedSearchCV

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from scipy.stats import randint, uniform
from tqdm.auto import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Encode string labels as integers (encoder fitted on the SMOTE-resampled labels)
le = LabelEncoder().fit(y3_tr_smote)
y3_tr_smote_enc = le.transform(y3_tr_smote)
y3_val_enc = le.transform(y3_val)

# 2. Sampling distributions for the random search.
#    scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dist = dict(
    n_estimators=randint(100, 500),
    learning_rate=uniform(0.01, 0.2),
    max_depth=randint(3, 10),
    subsample=uniform(0.7, 0.3),
    colsample_bytree=uniform(0.7, 0.3),
    gamma=uniform(0, 5),
    reg_alpha=uniform(0, 5),
    reg_lambda=uniform(0, 5),
)

# 3. Base XGBoost estimator handed to the search
xgb = XGBClassifier(
    objective='multi:softmax', num_class=3, eval_metric='mlogloss',
    random_state=42, n_jobs=-1, verbosity=0,
)

# 4. RandomizedSearchCV subclass that reports progress through a tqdm bar
class TqdmRandomizedSearchCV(RandomizedSearchCV):
    """RandomizedSearchCV that advances a tqdm bar after candidates are evaluated."""

    def fit(self, X, y=None, **fit_params):
        """Run the parent ``fit`` inside a tqdm bar whose total is ``n_iter``."""
        with tqdm(total=self.n_iter, desc="⚡ Random Search 진행중") as progress:
            self._pbar = progress
            return super().fit(X, y, **fit_params)

    def _run_search(self, evaluate_candidates):
        """Wrap ``evaluate_candidates`` so every call advances the bar by the batch size."""
        # NOTE(review): if the parent submits all sampled candidates in a single call,
        # the bar will jump from 0 to n_iter at the end — confirm against sklearn.
        def evaluate_and_tick(candidate_params):
            outcome = evaluate_candidates(candidate_params)
            self._pbar.update(len(candidate_params))
            return outcome
        return super()._run_search(evaluate_and_tick)

# 5. Configure the randomized search and run it on the SMOTE-resampled training data
# NOTE(review): resampling happened before cross-validation, so synthetic samples may sit
# in both the fit and the scoring folds; CV scores could be optimistic — confirm.
random_search = TqdmRandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=20,  # number of sampled candidates
    scoring='f1_weighted',
    cv=3,
    verbose=0,
    random_state=42
)

random_search.fit(X3_tr_smote, y3_tr_smote_enc)

# 6. Evaluate the best estimator.
#    The validation features get the same preprocessing as the training features
#    (NaN -> 0, identical column order); predicting on raw `X3_val` would feed the model
#    missing values it never saw during training.
best_xgb = random_search.best_estimator_
X3_val_filled = X3_val.fillna(0)[X3_tr_smote.columns]
y3_pred_enc = best_xgb.predict(X3_val_filled)
y3_pred = le.inverse_transform(y3_pred_enc)

# 7. Report and chart
report_xgb = classification_report(y3_val, y3_pred, output_dict=True)
report_df_xgb = pd.DataFrame(report_xgb).transpose()

score_df = pd.DataFrame({
    'Metric': ['accuracy', 'f1-score'],
    # in the transposed report the accuracy value sits in the 'precision' column
    'Score': [report_df_xgb.loc['accuracy']['precision'], report_df_xgb.loc['weighted avg']['f1-score']]
})

plt.figure(figsize=(6, 4))
sns.barplot(data=score_df, x='Metric', y='Score')
plt.ylim(0, 1)
plt.title('SMOTE 기반 XGBoost 성능 (Random Search)')
plt.tight_layout()
plt.show()

print("XGBoost 최적 파라미터:", random_search.best_params_)
print("XGBoost 성능 상세:")
print(report_df_xgb[['precision', 'recall', 'f1-score']])

SMOTE : CatBoost + RandomizedSearchCV

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from scipy.stats import randint, uniform
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

# 1. Encode string labels as integers (encoder fitted on the SMOTE-resampled labels)
le = LabelEncoder().fit(y3_tr_smote)
y3_tr_smote_enc = le.transform(y3_tr_smote)
y3_val_enc = le.transform(y3_val)

# 2. Sampling distributions for the random search.
#    scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dist = dict(
    depth=randint(4, 10),
    learning_rate=uniform(0.01, 0.2),
    iterations=randint(100, 500),
    l2_leaf_reg=uniform(1, 10),
    random_strength=uniform(0, 5),
)

# 3. Base CatBoost estimator handed to the search
cat_model = CatBoostClassifier(loss_function='MultiClass', random_seed=42, verbose=0)

# 4. RandomizedSearchCV subclass that reports progress through a tqdm bar
class TqdmRandomizedSearchCV(RandomizedSearchCV):
    """RandomizedSearchCV that advances a tqdm bar after candidates are evaluated."""

    def fit(self, X, y=None, **fit_params):
        """Run the parent ``fit`` inside a tqdm bar whose total is ``n_iter``."""
        with tqdm(total=self.n_iter, desc="⚡ Random Search 진행중 (CatBoost)") as progress:
            self._pbar = progress
            return super().fit(X, y, **fit_params)

    def _run_search(self, evaluate_candidates):
        """Wrap ``evaluate_candidates`` so every call advances the bar by the batch size."""
        # NOTE(review): if the parent submits all sampled candidates in a single call,
        # the bar will jump from 0 to n_iter at the end — confirm against sklearn.
        def evaluate_and_tick(candidate_params):
            outcome = evaluate_candidates(candidate_params)
            self._pbar.update(len(candidate_params))
            return outcome
        return super()._run_search(evaluate_and_tick)

# 5. Configure the randomized search and run it on the SMOTE-resampled training data
# NOTE(review): resampling happened before cross-validation, so synthetic samples may sit
# in both the fit and the scoring folds; CV scores could be optimistic — confirm.
random_search_cat = TqdmRandomizedSearchCV(
    estimator=cat_model,
    param_distributions=param_dist,
    n_iter=20,
    scoring='f1_weighted',
    cv=3,
    verbose=0,
    random_state=42
)

random_search_cat.fit(X3_tr_smote, y3_tr_smote_enc)

# 6. Predict with the best estimator.
#    The validation features get the same preprocessing as the training features
#    (NaN -> 0, identical column order) instead of the raw `X3_val`.
best_cat = random_search_cat.best_estimator_
X3_val_filled = X3_val.fillna(0)[X3_tr_smote.columns]
y3_pred_enc = best_cat.predict(X3_val_filled)
# The prediction array is flattened before decoding to string labels
y3_pred = le.inverse_transform(y3_pred_enc.flatten())

# 7. Evaluate and print
report_cat = classification_report(y3_val, y3_pred, output_dict=True)
report_df_cat = pd.DataFrame(report_cat).transpose()

print("CatBoost 최적 파라미터:", random_search_cat.best_params_)
print("\nCatBoost 성능 상세:")
print(report_df_cat[['precision', 'recall', 'f1-score']])

# 8. Simple chart
score_df = pd.DataFrame({
    'Metric': ['accuracy', 'f1-score'],
    # in the transposed report the accuracy value sits in the 'precision' column
    'Score': [report_df_cat.loc['accuracy']['precision'], report_df_cat.loc['weighted avg']['f1-score']]
})

plt.figure(figsize=(6, 4))
sns.barplot(data=score_df, x='Metric', y='Score')
plt.ylim(0, 1)
plt.title('SMOTE 기반 CatBoost 성능 (Random Search)')
plt.tight_layout()
# Fixed typo: the original called the undefined name `splt`, which raised NameError.
plt.show()

Soft Voting 앙상블

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# 1. Encode string labels as integers (encoder fitted on the SMOTE-resampled labels)
le = LabelEncoder()
y3_tr_smote_enc = le.fit_transform(y3_tr_smote)
y3_val_enc = le.transform(y3_val)

# 2. Soft-voting ensemble (averages predicted class probabilities)
voting_model = VotingClassifier(
    estimators=[
        ('xgb', best_xgb),   # XGBoost selected by the randomized search
        ('cat', best_cat)    # CatBoost selected by the randomized search
    ],
    voting='soft'
)

# 3. Fit on the SMOTE-resampled training data
voting_model.fit(X3_tr_smote, y3_tr_smote_enc)

# 4. Predict and decode.
#    The validation features get the same preprocessing as the training features
#    (NaN -> 0, identical column order) instead of the raw `X3_val`.
X3_val_filled = X3_val.fillna(0)[X3_tr_smote.columns]
y_pred_ens_enc = voting_model.predict(X3_val_filled)
y_pred_ens = le.inverse_transform(y_pred_ens_enc)

# 5. Classification report
report_ens = classification_report(y3_val, y_pred_ens, output_dict=True)
report_df_ens = pd.DataFrame(report_ens).transpose()

print("\n📊 [3-Class Classification Report - VotingClassifier (Soft)]")
print(report_df_ens[['precision', 'recall', 'f1-score']].round(3))

# 6. Confusion matrix, ordered like the encoder's classes
cm = confusion_matrix(y3_val, y_pred_ens, labels=le.classes_)
print("\n[Confusion Matrix - VotingClassifier (Soft)]")
print("Labels:", list(le.classes_))
print(cm)

# 7. Accuracy / weighted-F1 bar chart
score_df = pd.DataFrame({
    'Metric': ['accuracy', 'f1-score'],
    'Score': [
        report_df_ens.loc['accuracy']['precision'],  # accuracy sits in the 'precision' column
        report_df_ens.loc['weighted avg']['f1-score']
    ]
})

plt.figure(figsize=(6, 4))
sns.barplot(data=score_df, x='Metric', y='Score')
plt.ylim(0, 1)
plt.title('VotingClassifier 성능 (SMOTE + XGBoost + CatBoost)')
plt.tight_layout()
plt.show()

SMOTE : LGBM + RandomizedSearchCV

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint, uniform
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

# 1. Encode string labels as integers (encoder fitted on the SMOTE-resampled labels)
le = LabelEncoder().fit(y3_tr_smote)
y3_tr_smote_enc = le.transform(y3_tr_smote)
y3_val_enc = le.transform(y3_val)

# 2. Sampling distributions for the random search.
#    scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dist = dict(
    n_estimators=randint(100, 500),
    learning_rate=uniform(0.01, 0.2),
    max_depth=randint(3, 10),
    num_leaves=randint(15, 50),
    subsample=uniform(0.7, 0.3),
    colsample_bytree=uniform(0.7, 0.3),
    reg_alpha=uniform(0, 5),
    reg_lambda=uniform(0, 5),
)

# 3. Base LightGBM estimator handed to the search.
#    Note: this rebinds `lgbm`, which an earlier cell used for the fitted 3-class model.
lgbm = LGBMClassifier(objective='multiclass', random_state=42, n_jobs=-1)

# 4. RandomizedSearchCV subclass that reports progress through a tqdm bar
class TqdmRandomizedSearchCV(RandomizedSearchCV):
    """RandomizedSearchCV that advances a tqdm bar after candidates are evaluated."""

    def fit(self, X, y=None, **fit_params):
        """Run the parent ``fit`` inside a tqdm bar whose total is ``n_iter``."""
        with tqdm(total=self.n_iter, desc="🔍 LGBM Random Search") as progress:
            self._pbar = progress
            return super().fit(X, y, **fit_params)

    def _run_search(self, evaluate_candidates):
        """Wrap ``evaluate_candidates`` so every call advances the bar by the batch size."""
        # NOTE(review): if the parent submits all sampled candidates in a single call,
        # the bar will jump from 0 to n_iter at the end — confirm against sklearn.
        def evaluate_and_tick(candidate_params):
            outcome = evaluate_candidates(candidate_params)
            self._pbar.update(len(candidate_params))
            return outcome
        return super()._run_search(evaluate_and_tick)

# 5. Configure the randomized search and run it on the SMOTE-resampled training data
# NOTE(review): resampling happened before cross-validation, so synthetic samples may sit
# in both the fit and the scoring folds; CV scores could be optimistic — confirm.
random_search_lgbm = TqdmRandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_dist,
    n_iter=20,
    scoring='f1_weighted',
    cv=3,
    random_state=42
)

random_search_lgbm.fit(X3_tr_smote, y3_tr_smote_enc)

# 6. Evaluate the best estimator.
#    The validation features get the same preprocessing as the training features
#    (NaN -> 0, identical column order) instead of the raw `X3_val`.
best_lgbm = random_search_lgbm.best_estimator_
X3_val_filled = X3_val.fillna(0)[X3_tr_smote.columns]
y3_pred_enc = best_lgbm.predict(X3_val_filled)
y3_pred = le.inverse_transform(y3_pred_enc)

# 7. Report and chart
report_lgbm = classification_report(y3_val, y3_pred, output_dict=True)
report_df_lgbm = pd.DataFrame(report_lgbm).transpose()

score_df = pd.DataFrame({
    'Metric': ['accuracy', 'f1-score'],
    # in the transposed report the accuracy value sits in the 'precision' column
    'Score': [report_df_lgbm.loc['accuracy']['precision'], report_df_lgbm.loc['weighted avg']['f1-score']]
})

plt.figure(figsize=(6, 4))
sns.barplot(data=score_df, x='Metric', y='Score')
plt.ylim(0, 1)
plt.title('SMOTE 기반 LGBM 성능 (Random Search)')
plt.tight_layout()
plt.show()

# Print the selected parameters and per-class metrics
print("LGBM 최적 파라미터:", random_search_lgbm.best_params_)
print("LGBM 성능 상세:")
print(report_df_lgbm[['precision', 'recall', 'f1-score']])

Soft Voting 앙상블 (SMOTE 기반 XGB + CAT + LGBM)

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Label encoding: reuses the encoder `le` (and `y3_tr_smote_enc`) left by a previous cell
y3_val_enc = le.transform(y3_val)

# 2. Soft-voting ensemble over the three tuned models
voting_clf = VotingClassifier(
    estimators=[
        ('xgb', best_xgb),
        ('cat', best_cat),
        ('lgbm', best_lgbm)
    ],
    voting='soft',
    n_jobs=-1
)

# 3. Fit on the SMOTE-resampled training data
voting_clf.fit(X3_tr_smote, y3_tr_smote_enc)

# 4. Predict and decode.
#    The validation features get the same preprocessing as the training features
#    (NaN -> 0, identical column order) instead of the raw `X3_val`.
X3_val_filled = X3_val.fillna(0)[X3_tr_smote.columns]
y_pred_enc = voting_clf.predict(X3_val_filled)
y_pred = le.inverse_transform(y_pred_enc)

# 5. Classification report as a DataFrame
report = classification_report(y3_val, y_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()

# 6. Confusion matrix, ordered like the encoder's classes
cm = confusion_matrix(y3_val, y_pred, labels=le.classes_)

# 7. Accuracy / weighted-F1 bar chart
score_df = pd.DataFrame({
    'Metric': ['accuracy', 'f1-score'],
    # in the transposed report the accuracy value sits in the 'precision' column
    'Score': [report_df.loc['accuracy']['precision'], report_df.loc['weighted avg']['f1-score']]
})

plt.figure(figsize=(6, 4))
sns.barplot(data=score_df, x='Metric', y='Score')
plt.ylim(0, 1)
plt.title('SMOTE 기반 Soft Voting 앙상블 성능 (XGB+CAT+LGBM)')
plt.tight_layout()
plt.show()

# 8. Print per-class metrics and the confusion matrix
print("\n[3-Class Classification Report - VotingClassifier (Soft)]")
print(report_df[['precision', 'recall', 'f1-score']].round(3))

print("\n[Confusion Matrix - VotingClassifier (Soft)]")
print("Labels:", le.classes_.tolist())
print(cm)