In [4]:
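# Column descriptions: satisfaction_level (employee satisfaction score), last_evaluation (employer's evaluation score), number_project (number of assigned projects),
# average_montly_hours (average hours worked per month), time_spend_company (years spent at the company), Work_accident (whether the employee had a work accident),
# promotion_last_5years (whether promoted in the last 5 years), Departments (department), salary (salary level: low, medium, high), left (whether the employee left)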
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import matplotlib.pyplot as plt

# Plot font. The family name must be spelled "Malgun Gothic" (with a space);
# a misspelled name is reported as not found and Matplotlib falls back to its
# default font.
plt.rcParams["font.family"] = 'Malgun Gothic'
# Keep the minus sign from rendering as a broken glyph with this font
plt.rcParams["axes.unicode_minus"] = False
# Show floats with two decimals instead of scientific notation
pd.options.display.float_format = "{:.2f}".format

# Load the data (the file is read with the CP949 code page)
df = pd.read_csv("dataset/HR_comma_sep.csv",encoding='CP949')
# Preview the first rows
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Departments,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [6]:
# Make sure the target column 'left' is stored as int64; convert it otherwise.
if df['left'].dtype != 'int64':
    # astype is a method and has to be *called*; subscripting it
    # (astype[int]) raises TypeError. Converting to 'int64' explicitly keeps
    # the result consistent with the check above on every platform.
    df['left'] = df['left'].astype('int64')
# Categorical column clean-up:
# strip the trailing space from the 'Departments ' column name.
# Reassigning instead of inplace=True keeps the step chain-friendly.
df = df.rename(columns={'Departments ': 'Departments'})

In [7]:
# One-hot encode the categorical columns; the first level of each is dropped.
df = pd.get_dummies(
    df,
    columns=['Departments', 'salary'],
    drop_first=True,
)

# Target (y) is 'left'; every remaining column is a feature (X).
y = df['left']
X = df.drop(columns='left')

# Hold out 20% of the rows for testing (80% train), with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [8]:
# Standardize features: statistics are learned from the training split only
# and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a random forest with 100 trees and a fixed seed.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Predict labels for the held-out split.
y_pred = model.predict(X_test_scaled)

In [9]:
# Evaluation on the held-out split: accuracy (two decimals), then the
# confusion matrix and the per-class precision/recall/F1 report.
accuracy = accuracy_score(y_test, y_pred)
print(format(accuracy, ".2f"))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

0.99
[[2286    8]
 [  27  679]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2294
           1       0.99      0.96      0.97       706

    accuracy                           0.99      3000
   macro avg       0.99      0.98      0.98      3000
weighted avg       0.99      0.99      0.99      3000



In [10]:
# Feature importance: pair every feature column with the importance the
# fitted forest assigned to it, then list them from most to least important.
feature_importances = pd.DataFrame(
    {"Feature": X.columns, "Importance": model.feature_importances_}
)
feature_importances = feature_importances.sort_values(
    by='Importance', ascending=False
)

print(feature_importances)

                    Feature  Importance
0        satisfaction_level        0.33
2            number_project        0.19
4        time_spend_company        0.18
3      average_montly_hours        0.14
1           last_evaluation        0.12
5             Work_accident        0.01
16               salary_low        0.01
15    Departments_technical        0.00
17            salary_medium        0.00
13        Departments_sales        0.00
14      Departments_support        0.00
7         Departments_RandD        0.00
8    Departments_accounting        0.00
6     promotion_last_5years        0.00
9            Departments_hr        0.00
10   Departments_management        0.00
11    Departments_marketing        0.00
12  Departments_product_mng        0.00
