In [8]:
from IPython.display import display, HTML
# Notebook-wide CSS overrides: widen the page container, set the Consolas 12pt
# font for code inputs, enlarge/bold cell outputs and size rendered DataFrames.
_NOTEBOOK_CSS = """
<style>
div.container{width:86% !important;}
div.cell.code_cell.rendered{width:100%;}
div.CodeMirror {font-family:Consolas; font-size:12pt;}
div.output {font-size:15pt; font-weight:bold;}
div.input {font-family:Consolas; font-size:12pt;}
div.prompt {min-width:70px;}
div#toc-wrapper{padding-top:120px;}
div.text_cell_render ul li{font-size:12pt;padding:5px;}
table.dataframe{font-size:15px;}
</style>
"""
display(HTML(_NOTEBOOK_CSS))

In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Baseline experiment: predict the performance rating ('업무평가') with a default
# random forest, without any handling of the class imbalance.
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition_변환.csv')

x_col = ['나이','출장','부서','학력','전공','참여프로젝트','직급','경력','전년도교육출장횟수','현회사근속년수']
y_col = ['업무평가']

df_data = df[x_col + y_col]

from sklearn.model_selection import train_test_split
x = df_data[x_col]
y = df_data[y_col]
# A fixed random_state makes the split (and therefore the printed results)
# reproducible under Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.5, stratify=y, random_state=42
)
# Work on explicit copies so the column assignments below write to independent
# frames and do not trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)


le_buisness = LabelEncoder()  # business travel ('출장')
le_depart = LabelEncoder()    # department ('부서')
le_major = LabelEncoder()     # major / field of study ('전공')
le_manager = LabelEncoder()   # job level ('직급')

# Fit each encoder on the training split only, then reuse it for the test split.
for col, encoder in (
    ('출장', le_buisness),
    ('부서', le_depart),
    ('전공', le_major),
    ('직급', le_manager),
):
    X_train[col] = encoder.fit_transform(X_train[col])
    X_test[col] = encoder.transform(X_test[col])

# Seeded so that the fitted forest is the same on every run.
rf_model = RandomForestClassifier(random_state=42)
# Y_train is a one-column DataFrame; ravel() flattens it to the 1-D target fit() expects.
rf_model.fit(np.array(X_train), np.array(Y_train).ravel())

pred = rf_model.predict(np.array(X_test))
value = np.array(Y_test).ravel()

# Confusion matrix: rows are the actual labels, columns the predicted labels.
display(pd.crosstab(value,pred, rownames=['실제값'],colnames=['예측값']))

(735, 10) (735, 10) (735, 1) (735, 1)


예측값,보통,좋다
실제값,Unnamed: 1_level_1,Unnamed: 2_level_1
보통,617,5
좋다,113,0


In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTENC  # ← 혼합형용 SMOTE

# 1) Load the data
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition_변환.csv')

y_col = ['업무평가']
x_col = [
    '나이', '출장', '부서', '학력', '전공',
    '참여프로젝트', '직급', '경력', '전년도교육출장횟수', '현회사근속년수',
]

# Keep only the modelling columns and drop rows with missing values.
df_data = df[x_col + y_col].dropna().copy()

# Target is taken as a Series rather than a one-column frame (avoids warnings).
y = df_data[y_col[0]].copy()
X = df_data[x_col].copy()

# 2) Stratified train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

# 3) Label-encode the categorical columns (fit on train, transform test)
cat_cols = ['출장','부서','전공','직급']

X_train = X_train.copy()
X_test = X_test.copy()

le = {}
for c in cat_cols:
    le[c] = LabelEncoder()
    X_train[c] = le[c].fit_transform(X_train[c])
    X_test[c] = le[c].transform(X_test[c])

# 4) SMOTENC oversampling, applied to the training data only.
#    SMOTENC needs the positional indices of the categorical columns.
cat_idx = list(map(x_col.index, cat_cols))

#    k_neighbors is lowered to 3 to avoid errors when the minority class has
#    very few samples.
smote = SMOTENC(categorical_features=cat_idx, k_neighbors=3, random_state=42)
X_res, y_res = smote.fit_resample(X_train, Y_train)

# Class distribution before and after resampling.
for title, labels in (
    ("\n[클래스 분포] 원본 학습데이터:", Y_train),
    ("\n[클래스 분포] SMOTENC 이후:", pd.Series(y_res)),
):
    print(title)
    print(labels.value_counts().sort_index())

# 5) Train the model on the resampled data
rf_model = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)
rf_model.fit(X_res, y_res)

# 6) Predict on the untouched test split and evaluate
pred = rf_model.predict(X_test)
cm = pd.crosstab(Y_test, pred, rownames=['실제값'], colnames=['예측값'])
display(cm)

print("\n[Classification Report]", classification_report(Y_test, pred, digits=4), sep="\n")

(1029, 10) (441, 10) (1029,) (441,)

[클래스 분포] 원본 학습데이터:
업무평가
보통    871
좋다    158
Name: count, dtype: int64

[클래스 분포] SMOTENC 이후:
업무평가
보통    871
좋다    871
Name: count, dtype: int64


예측값,보통,좋다
실제값,Unnamed: 1_level_1,Unnamed: 2_level_1
보통,326,47
좋다,58,10



[Classification Report]
              precision    recall  f1-score   support

          보통     0.8490    0.8740    0.8613       373
          좋다     0.1754    0.1471    0.1600        68

    accuracy                         0.7619       441
   macro avg     0.5122    0.5105    0.5106       441
weighted avg     0.7451    0.7619    0.7532       441



In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score
from imblearn.over_sampling import SMOTENC  # 혼합형용 SMOTE

# 1) Load the data
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition_변환.csv')

POS_LABEL = '좋다'   # target label whose precision we want to raise
y_col = ['업무평가']
x_col = [
    '나이', '출장', '부서', '학력', '전공',
    '참여프로젝트', '직급', '경력', '전년도교육출장횟수', '현회사근속년수',
]

# Keep only the modelling columns and drop rows with missing values.
df_data = df[x_col + y_col].dropna().copy()

# Target is taken as a Series rather than a one-column frame (avoids warnings).
y = df_data[y_col[0]].copy()
X = df_data[x_col].copy()

# 2) Stratified train/test split (the test split is left untouched)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

# 3) Label-encode the categorical columns (fit on train, transform test)
cat_cols = ['출장','부서','전공','직급']

X_train = X_train.copy()
X_test = X_test.copy()

le = {}
for c in cat_cols:
    le[c] = LabelEncoder()
    X_train[c] = le[c].fit_transform(X_train[c])
    X_test[c] = le[c].transform(X_test[c])

# 3-1) Fit/validation split. The validation part keeps the original class
#      distribution because it is used for threshold tuning.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, Y_train, test_size=0.2, stratify=Y_train, random_state=42
)

# 4) SMOTENC oversampling, applied to the fit part only
cat_idx = list(map(x_col.index, cat_cols))
smote = SMOTENC(categorical_features=cat_idx, k_neighbors=3, random_state=42)
X_res, y_res = smote.fit_resample(X_fit, y_fit)

fit_counts = y_fit.value_counts().to_dict()
res_counts = pd.Series(y_res).value_counts().to_dict()
print("\n[클래스 분포] 원본 학습(Train) 파트:", fit_counts)
print("[클래스 분포] SMOTENC 이후:", res_counts)

# 5) Train the model on the resampled data
rf_model = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)
rf_model.fit(X_res, y_res)

# 6) Threshold (tau) tuning inputs: probability of POS_LABEL on the validation
#    part and the matching 0/1 ground truth.
if POS_LABEL not in rf_model.classes_:
    raise ValueError(f"'{POS_LABEL}' 라벨이 존재하지 않습니다. 사용 가능한 클래스: {rf_model.classes_}")

# Column of predict_proba that corresponds to POS_LABEL.
pos_idx = np.flatnonzero(rf_model.classes_ == POS_LABEL)[0]
val_proba = rf_model.predict_proba(X_val)[:, pos_idx]
y_val_bin = y_val.eq(POS_LABEL).astype(int)

def find_best_tau(proba, y_true_bin, grid=None):
    """Pick the decision threshold that maximises precision of the positive class.

    Args:
        proba: Array of positive-class scores; a sample is predicted positive
            when its score is ``>= tau``.
        y_true_bin: 0/1 ground-truth labels aligned with ``proba``.
        grid: Iterable of candidate thresholds. Defaults to 100 evenly spaced
            values in ``[0.50, 0.99]`` (deliberately aggressive, favouring precision).

    Returns:
        ``(best_tau, best_precision)``. A threshold with no positive predictions
        scores precision 0. Ties on precision are resolved in favour of the
        larger (more conservative) threshold. An empty grid yields ``(0.5, -1.0)``.
    """
    if grid is None:
        grid = np.linspace(0.50, 0.99, 100)

    def _precision_at(tau):
        # zero_division=0: no positive prediction at this tau counts as precision 0.
        return precision_score(y_true_bin, (proba >= tau).astype(int), zero_division=0)

    # Lexicographic max over (precision, tau): highest precision first,
    # then the highest threshold among equal precisions.
    scored = ((_precision_at(tau), tau) for tau in grid)
    best_prec, best_tau = max(scored, default=(-1.0, 0.5))
    return best_tau, best_prec

best_tau, val_prec = find_best_tau(val_proba, y_val_bin)
print(f"\n[임계값 튜닝 결과] τ={best_tau:.3f}, 검증셋 '좋다' precision={val_prec:.4f}")

# 7) Test-set prediction
proba_test = rf_model.predict_proba(X_test)
classes = rf_model.classes_
pos_idx = np.where(classes == POS_LABEL)[0][0]

# (a) Default prediction (for comparison): plain argmax over the class probabilities.
pred_base = classes[np.argmax(proba_test, axis=1)]

# (b) Thresholded prediction: POS_LABEL when its probability is >= best_tau,
#     otherwise the most probable class among the remaining ones.
#     Vectorised: the POS_LABEL column is dropped once instead of rebuilding the
#     same mask for every row.
other_classes = np.delete(classes, pos_idx)
other_proba = np.delete(proba_test, pos_idx, axis=1)
fallback = other_classes[np.argmax(other_proba, axis=1)]
pred_threshold = np.where(proba_test[:, pos_idx] >= best_tau, POS_LABEL, fallback)

# 8) Evaluation output
print("\n[기본모델] Classification Report (테스트)")
print(classification_report(Y_test, pred_base, digits=4))

print("\n[임계값 적용] Classification Report (테스트)")
print(classification_report(Y_test, pred_threshold, digits=4))

# Compare the precision of POS_LABEL alone between the two prediction rules.
from sklearn.metrics import precision_recall_fscore_support
p_base = precision_recall_fscore_support(Y_test, pred_base, labels=[POS_LABEL], zero_division=0)[0][0]
p_thr  = precision_recall_fscore_support(Y_test, pred_threshold, labels=[POS_LABEL], zero_division=0)[0][0]
print(f"\n'좋다' precision  비교 → 기본: {p_base:.4f}  |  임계값(τ={best_tau:.3f}) 적용: {p_thr:.4f}")

# Confusion matrix for the thresholded predictions (rows: actual, columns: predicted).
cm = pd.crosstab(Y_test, pred_threshold, rownames=['실제값'], colnames=['예측값'])
display(cm)


(1029, 10) (441, 10) (1029,) (441,)

[클래스 분포] 원본 학습(Train) 파트: {'보통': 697, '좋다': 126}
[클래스 분포] SMOTENC 이후: {'보통': 697, '좋다': 697}

[임계값 튜닝 결과] τ=0.510, 검증셋 '좋다' precision=0.1333

[기본모델] Classification Report (테스트)
              precision    recall  f1-score   support

          보통     0.8531    0.8874    0.8699       373
          좋다     0.2075    0.1618    0.1818        68

    accuracy                         0.7755       441
   macro avg     0.5303    0.5246    0.5259       441
weighted avg     0.7536    0.7755    0.7638       441


[임계값 적용] Classification Report (테스트)
              precision    recall  f1-score   support

          보통     0.8535    0.8901    0.8714       373
          좋다     0.2115    0.1618    0.1833        68

    accuracy                         0.7778       441
   macro avg     0.5325    0.5259    0.5274       441
weighted avg     0.7545    0.7778    0.7653       441


'좋다' precision  비교 → 기본: 0.2075  |  임계값(τ=0.510) 적용: 0.2115


예측값,보통,좋다
실제값,Unnamed: 1_level_1,Unnamed: 2_level_1
보통,332,41
좋다,57,11


# SMOTENC + 게이트 분류기 도입을 통한 precision 올리기

In [10]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, precision_recall_fscore_support
from imblearn.over_sampling import SMOTENC  # 혼합형용 SMOTE

# -----------------------------
# Shared settings
# -----------------------------
POS_LABEL = '좋다'  # target label whose precision we want to raise

# 1) Load the data
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition_변환.csv')

x_col = ['나이','출장','부서','학력','전공','참여프로젝트','직급','경력','전년도교육출장횟수','현회사근속년수']
y_col = ['업무평가']
cat_cols = ['출장','부서','전공','직급']
num_cols = [c for c in x_col if c not in cat_cols]

# Keep only the modelling columns and drop rows with missing values.
df_data = df[x_col + y_col].dropna().copy()
X = df_data[x_col].copy()
y = df_data[y_col[0]].copy()

# 2) Stratified train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# =====================================
# A. Base multi-class model (RF + SMOTENC)
# =====================================
# Label encoding (suitable for tree-based models); fit on train, transform test.
X_train_tree = X_train.copy()
X_test_tree  = X_test.copy()
le = {c: LabelEncoder() for c in cat_cols}
for c in cat_cols:
    X_train_tree[c] = le[c].fit_transform(X_train_tree[c])
    X_test_tree[c]  = le[c].transform(X_test_tree[c])

# SMOTENC oversampling (training data only); it needs the positional indices
# of the categorical columns.
cat_idx = [x_col.index(c) for c in cat_cols]
smote = SMOTENC(categorical_features=cat_idx, k_neighbors=3, random_state=42)
X_res, y_res = smote.fit_resample(X_train_tree, Y_train)

rf_model = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)
rf_model.fit(X_res, y_res)

rf_classes = rf_model.classes_
if POS_LABEL not in rf_classes:
    raise ValueError(f"'{POS_LABEL}' 라벨이 없습니다. 사용 가능: {rf_classes}")

# =====================================
# B. Gate classifier (One-Hot + Logistic, POS_LABEL vs. rest)
# =====================================
# One-hot encoding pipeline (for the linear model); numeric columns pass through.
# NOTE(review): the numeric columns are not scaled; the 'saga' solver may
# converge slowly on unscaled features - consider adding a scaler.
pre = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), cat_cols),
        ('num', 'passthrough', num_cols)
    ]
)
gate_clf = Pipeline([
    ('pre', pre),
    ('clf', LogisticRegression(
        penalty='l2', C=1.0, class_weight='balanced',
        solver='saga', max_iter=2000, n_jobs=-1,
        # 'saga' shuffles the data; seed it like every other stochastic step
        # in this cell so the gate (and the chosen tau) is reproducible.
        random_state=42
    ))
])

# Validation split for the gate (keeps the original class distribution; used
# for threshold tuning).
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, Y_train, test_size=0.2, stratify=Y_train, random_state=42
)
# Binary targets: 1 for POS_LABEL, 0 for everything else.
y_fit_bin = (y_fit == POS_LABEL).astype(int)
y_val_bin = (y_val == POS_LABEL).astype(int)

gate_clf.fit(X_fit, y_fit_bin)

# Choose tau on the validation set: prefer reaching the target precision;
# if that is impossible, maximise precision instead.
def pick_tau_for_precision(proba, y_true_bin, target_precision=0.9, grid=None):
    """Select a decision threshold aimed at a target precision.

    Args:
        proba: Array of positive-class scores; a sample is predicted positive
            when its score is ``>= tau``.
        y_true_bin: 0/1 ground-truth labels aligned with ``proba``.
        target_precision: Precision to reach. The first threshold (in grid
            order) whose precision is ``>= target_precision`` is returned.
        grid: Optional iterable of candidate thresholds. Defaults to 200 evenly
            spaced values in ``[0.60, 0.999]`` (a high range, i.e. conservative
            and favouring precision).

    Returns:
        The first threshold meeting the target; otherwise the threshold with the
        highest precision (ties resolved in favour of the larger threshold).
        Returns 0.5 when the grid is empty.
    """
    if grid is None:
        grid = np.linspace(0.60, 0.999, 200)
    best_tau, best_prec = 0.5, -1.0
    for tau in grid:
        pred_bin = (proba >= tau).astype(int)
        # zero_division=0: no positive prediction at this tau counts as precision 0.
        prec = precision_score(y_true_bin, pred_bin, zero_division=0)
        if prec >= target_precision:
            return tau
        # Track the best fallback seen so far.
        if (prec > best_prec) or (prec == best_prec and tau > best_tau):
            best_prec, best_tau = prec, tau
    return best_tau

# Tune the gate threshold on the validation part (column 1 = probability of class 1).
val_proba = gate_clf.predict_proba(X_val)[:,1]
tau = pick_tau_for_precision(val_proba, y_val_bin, target_precision=0.999)
print(f"[게이트] 선택된 τ = {tau:.3f}")

# =====================================
# C. Assemble the final prediction
# =====================================
# (1) Base RF prediction: plain argmax over the class probabilities.
rf_proba_test = rf_model.predict_proba(X_test_tree)
rf_pred_base  = rf_classes[np.argmax(rf_proba_test, axis=1)]

# (2) Gate score: probability that the sample is POS_LABEL.
gate_proba_test = gate_clf.predict_proba(X_test)[:,1]

# (3) Apply the gate: POS_LABEL when the gate score is >= tau; otherwise the
#     RF's most probable class among the remaining ones (even if the RF itself
#     preferred POS_LABEL, a closed gate forces its runner-up).
#     Vectorised: the POS_LABEL column is dropped once instead of rebuilding the
#     same mask for every row.
pos_idx = np.where(rf_classes == POS_LABEL)[0][0]
other_classes = np.delete(rf_classes, pos_idx)
other_proba = np.delete(rf_proba_test, pos_idx, axis=1)
fallback = other_classes[np.argmax(other_proba, axis=1)]
pred_final = np.where(gate_proba_test >= tau, POS_LABEL, fallback)

# =====================================
# D. Performance comparison
# =====================================
print("\n[기본 RF] Classification Report")
print(classification_report(Y_test, rf_pred_base, digits=4))

print("\n[게이트 적용] Classification Report")
print(classification_report(Y_test, pred_final, digits=4))

# Precision of POS_LABEL alone, and how many samples each rule labels as POS_LABEL.
p_base = precision_recall_fscore_support(Y_test, rf_pred_base, labels=[POS_LABEL], zero_division=0)[0][0]
p_gate = precision_recall_fscore_support(Y_test, pred_final, labels=[POS_LABEL], zero_division=0)[0][0]
cnt_base = np.sum(rf_pred_base == POS_LABEL)
cnt_gate = np.sum(pred_final == POS_LABEL)
print(f"\n'좋다' precision  → 기본: {p_base:.4f} | 게이트: {p_gate:.4f}")
print(f"'좋다' 예측 개수(커버리지) → 기본: {cnt_base} | 게이트: {cnt_gate}")



[게이트] 선택된 τ = 0.618

[기본 RF] Classification Report
              precision    recall  f1-score   support

          보통     0.8490    0.8740    0.8613       373
          좋다     0.1754    0.1471    0.1600        68

    accuracy                         0.7619       441
   macro avg     0.5122    0.5105    0.5106       441
weighted avg     0.7451    0.7619    0.7532       441


[게이트 적용] Classification Report
              precision    recall  f1-score   support

          보통     0.8495    0.9383    0.8917       373
          좋다     0.2069    0.0882    0.1237        68

    accuracy                         0.8073       441
   macro avg     0.5282    0.5133    0.5077       441
weighted avg     0.7504    0.8073    0.7733       441


'좋다' precision  → 기본: 0.1754 | 게이트: 0.2069
'좋다' 예측 개수(커버리지) → 기본: 57 | 게이트: 29
