In [1]:
from IPython.display import display, HTML
# Notebook-wide CSS overrides: page/cell widths, editor and output fonts,
# prompt width, TOC padding, markdown list items and DataFrame table font size.
_NOTEBOOK_CSS = """
<style>
div.container{width:86% !important;}
div.cell.code_cell.rendered{width:100%;}
div.CodeMirror {font-family:Consolas; font-size:12pt;}
div.output {font-size:15pt; font-weight:bold;}
div.input {font-family:Consolas; font-size:12pt;}
div.prompt {min-width:70px;}
div#toc-wrapper{padding-top:120px;}
div.text_cell_render ul li{font-size:12pt;padding:5px;}
table.dataframe{font-size:15px;}
</style>
"""

# Render the stylesheet as HTML so the front end applies it.
display(HTML(_NOTEBOOK_CSS))

In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Baseline experiment: random forest on the raw (imbalanced) training split.
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition_변환.csv')

x_col = ['나이','출장','부서','학력','전공','참여프로젝트','직급','경력','전년도교육출장횟수','현회사근속년수']
y_col = ['업무평가']

df_data = df[x_col + y_col]

x = df_data[x_col]
y = df_data[y_col]

# random_state pins the stratified split so the cell gives the same result on
# every run (Restart & Run All), matching the seed used later in the notebook.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.5, stratify=y, random_state=42
)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

# Work on explicit copies: the split results are selections taken from `x`, and
# assigning columns into such selections can raise SettingWithCopyWarning
# (depending on the pandas / scikit-learn versions in use).
X_train = X_train.copy()
X_test = X_test.copy()

le_buisness = LabelEncoder() # business travel
le_depart = LabelEncoder() # department
le_major = LabelEncoder() # major (field of study)
le_manager = LabelEncoder() # job rank

# Fit each encoder on the training split only ...
X_train['출장'] = le_buisness.fit_transform(X_train['출장'])
X_train['부서'] = le_depart.fit_transform(X_train['부서'])
X_train['전공'] = le_major.fit_transform(X_train['전공'])
X_train['직급'] = le_manager.fit_transform(X_train['직급'])

# ... and reuse the fitted mapping on the test split.
# NOTE: LabelEncoder.transform raises ValueError for a category that was not
# present in the training split.
X_test['출장'] = le_buisness.transform(X_test['출장'])
X_test['부서'] = le_depart.transform(X_test['부서'])
X_test['전공'] = le_major.transform(X_test['전공'])
X_test['직급'] = le_manager.transform(X_test['직급'])

# Seed the forest as well; bootstrap and feature sampling are random.
rf_model = RandomForestClassifier(random_state=42)
# NOTE: fit() returns the estimator itself, so `history` is just another
# reference to `rf_model` (kept under this name for any cell that uses it).
history = rf_model.fit(np.array(X_train), np.array(Y_train).ravel())

pred = rf_model.predict(np.array(X_test))
value = np.array(Y_test).ravel()

# Confusion matrix: rows are the true labels, columns are the predictions.
display(pd.crosstab(value,pred, rownames=['실제값'],colnames=['예측값']))

(735, 10) (735, 10) (735, 1) (735, 1)


예측값,보통,좋다
실제값,Unnamed: 1_level_1,Unnamed: 2_level_1
보통,618,4
좋다,111,2


In [5]:
# Display the target column; pandas abbreviates the long Series in the output.
df_data.loc[:, '업무평가']

0       보통
1       좋다
2       보통
3       보통
4       보통
        ..
1465    보통
1466    보통
1467    좋다
1468    보통
1469    보통
Name: 업무평가, Length: 1470, dtype: object

In [9]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 1) Load the data
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition_변환.csv')

x_col = [
    '나이', '출장', '부서', '학력', '전공',
    '참여프로젝트', '직급', '경력', '전년도교육출장횟수', '현회사근속년수',
]
y_col = ['업무평가']

# Keep only the modelling columns and drop every row with a missing value.
df_data = df[x_col + y_col].dropna().copy()

X = df_data[x_col].copy()
y = df_data[y_col[0]].copy()

# 2) Train/test split, stratified on the target
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    stratify=y,
    random_state=42,
)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

# 3) Label encoding: fit on the training split, then transform the test split
le_buisness = LabelEncoder()  # business travel
le_depart = LabelEncoder()    # department
le_major = LabelEncoder()     # major (field of study)
le_manager = LabelEncoder()   # job rank

# Detach both splits from the source frame before writing encoded columns.
X_train, X_test = X_train.copy(), X_test.copy()

_column_encoders = (
    ('출장', le_buisness),
    ('부서', le_depart),
    ('전공', le_major),
    ('직급', le_manager),
)

# Learn every mapping from the training data first ...
for _col, _enc in _column_encoders:
    X_train[_col] = _enc.fit_transform(X_train[_col])

# ... then apply the learned mappings to the test data.
for _col, _enc in _column_encoders:
    X_test[_col] = _enc.transform(X_test[_col])

# 4) Random undersampling (applied to the training data only)
def random_undersample_df(X_df: pd.DataFrame, y_s: pd.Series, random_state=42):
    """Randomly undersample every class down to the size of the rarest class.

    Each class keeps ``n_min`` randomly chosen rows, where ``n_min`` is the
    smallest class count in ``y_s`` (similar to imblearn's
    ``RandomUnderSampler(sampling_strategy='auto')``). The balanced rows are
    then shuffled.

    Parameters
    ----------
    X_df : pd.DataFrame
        Feature rows. Not modified.
    y_s : pd.Series
        Labels, matched to ``X_df`` by position (its index is ignored), so it
        must have the same length as ``X_df``.
    random_state : int, optional
        Seed used both for the per-class sampling and for the final shuffle.

    Returns
    -------
    (pd.DataFrame, pd.Series)
        The resampled features and labels. Both carry the index of the
        selected ``X_df`` rows; the labels keep the name of ``y_s``.
        If there are no labels to balance (empty input or all labels missing),
        an empty frame and an empty Series are returned.

    Raises
    ------
    ValueError
        If ``y_s`` and ``X_df`` differ in length (raised by pandas on the
        column assignment).
    """
    # Temporary label column: extend the name until it cannot clash with an
    # existing feature column, otherwise that feature would be overwritten
    # and then dropped from the result.
    label_col = '__y__'
    while label_col in X_df.columns:
        label_col += '_'

    tmp = X_df.copy()
    tmp[label_col] = y_s.values  # positional assignment, ignores y_s.index

    counts = tmp[label_col].value_counts()
    if counts.empty:
        # Nothing to balance; pd.concat([]) below would raise.
        out = tmp.iloc[0:0].copy()
    else:
        # Size every class to the smallest class count.
        n_min = counts.min()
        parts = [
            grp.sample(n=n_min, random_state=random_state)
            for _, grp in tmp.groupby(label_col)
        ]
        # Shuffle so the classes are not stored in contiguous runs.
        out = pd.concat(parts, axis=0).sample(frac=1.0, random_state=random_state)

    y_out = out.pop(label_col)
    # Restore the caller's label name instead of leaking the temp column name.
    y_out.name = y_s.name
    return out, y_out

# Balance the training split only; the test split keeps its original class ratio.
X_res, y_res = random_undersample_df(X_train, Y_train, random_state=42)

# Report the class counts before and after undersampling.
for _header, _labels in (
    ("\n[클래스 분포] 원본 학습데이터:", Y_train),
    ("\n[클래스 분포] 언더샘플링 후:", y_res),
):
    print(_header)
    print(_labels.value_counts().sort_index())

# 5) Train the model (fit() returns the fitted estimator itself)
rf_model = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42).fit(X_res, y_res)

# 6) Predict and evaluate
pred = rf_model.predict(X_test)

# Confusion matrix: rows are the true labels, columns are the predictions.
cm = pd.crosstab(index=Y_test, columns=pred, rownames=['실제값'], colnames=['예측값'])
display(cm)

print("\n[Classification Report]")
print(classification_report(y_true=Y_test, y_pred=pred, digits=4))


(735, 10) (735, 10) (735,) (735,)

[클래스 분포] 원본 학습데이터:
업무평가
보통    622
좋다    113
Name: count, dtype: int64

[클래스 분포] 언더샘플링 후:
__y__
보통    113
좋다    113
Name: count, dtype: int64


예측값,보통,좋다
실제값,Unnamed: 1_level_1,Unnamed: 2_level_1
보통,331,291
좋다,63,50



[Classification Report]
              precision    recall  f1-score   support

          보통     0.8401    0.5322    0.6516       622
          좋다     0.1466    0.4425    0.2203       113

    accuracy                         0.5184       735
   macro avg     0.4934    0.4873    0.4359       735
weighted avg     0.7335    0.5184    0.5853       735

