# CatBoost

In [1]:
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os
# Root directory of the datasets, taken from the DATA_PATH environment variable.
# Later cells build file paths by plain string concatenation
# (data_path + 'AI_Cheats/...'), so the value must be a string that ends with a
# path separator. os.path.join(root, '') appends the separator only when it is
# missing; an unset variable falls back to '' (paths relative to the working
# directory) instead of None, which would fail with "NoneType + str".
data_path = os.path.join(os.environ.get('DATA_PATH', ''), '')

In [2]:
# Load the labelled training set and the unlabelled test set; the first CSV
# column is used as the DataFrame index in both cases.
train = pd.read_csv(
    data_path + 'AI_Cheats/employee_turnover/train.csv',
    index_col=0,
)
test = pd.read_csv(
    data_path + 'AI_Cheats/employee_turnover/test.csv',
    index_col=0,
)
# Optional check of the class balance of the target:
# print(train['Attrition'].value_counts())

# Binarize the Attrition target: 'Yes' becomes 1, every other value becomes 0.
train['Attrition'] = train['Attrition'].eq('Yes').astype('int64')

# Optional check for missing values in each column:
# print(train.isna().sum())

In [3]:
# Remove columns that are not useful as features: the employee number and
# StandardHours (noted by the original author as always equal to 80).
unused_columns = ['EmployeeNumber', 'StandardHours']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)


In [4]:
# Label-encode the features that are treated as categorical.
# `attr` is reused later to locate the categorical columns for CatBoost, and
# `lbe_list` keeps the fitted encoders in the same order as `attr`.
attr = ['Age', 'BusinessTravel', 'Department', 'Education', 'EducationField',
        'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']
lbe_list = []
for feature in attr:
    lbe = LabelEncoder()
    # Fit on the values of BOTH splits. An encoder fitted on train alone
    # raises ValueError in transform() as soon as the test set contains a
    # category that never occurs in train (easy to hit for 'Age'). When test
    # holds no unseen category the learned classes, and therefore the codes,
    # are the same as with a train-only fit.
    lbe.fit(pd.concat([train[feature], test[feature]]))
    train[feature] = lbe.transform(train[feature])
    test[feature] = lbe.transform(test[feature])
    lbe_list.append(lbe)
# print(train)

In [5]:
# Hold out 20% of the labelled rows as a validation set (fixed seed so the
# split is reproducible).
features = train.drop(columns='Attrition')
target = train['Attrition']
X_train, X_valid, y_train, y_valid = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Hyper-parameters of the CatBoost binary classifier, collected in one place
# so they are easy to tune.
catboost_params = {
    'iterations': 1000,          # number of boosting iterations
    'depth': 7,
    'learning_rate': 0.01,
    'loss_function': 'Logloss',  # objective optimised during training
    'eval_metric': 'AUC',        # metric reported on the evaluation set
    'logging_level': 'Verbose',
    'metric_period': 50,         # report the metric every 50 iterations
}
model = cb.CatBoostClassifier(**catboost_params)

In [7]:
# Positional indices of the categorical columns (those listed in `attr`)
# within the training feature matrix.
categorical_features_indices = [
    position
    for position, column in enumerate(X_train.columns)
    if column in attr
]
print(categorical_features_indices)

[0, 1, 3, 5, 6, 9, 13, 15, 19, 20]


In [8]:
# Train on the training split and report the evaluation metric on the held-out
# validation split; the columns at `categorical_features_indices` are handled
# by CatBoost as categorical features.
model.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_valid, y_valid),
)

0:	test: 0.6390374	best: 0.6390374 (0)	total: 55.1ms	remaining: 55s
50:	test: 0.7895886	best: 0.7895886 (50)	total: 254ms	remaining: 4.72s
100:	test: 0.8008294	best: 0.8008294 (100)	total: 397ms	remaining: 3.53s
150:	test: 0.8057405	best: 0.8057405 (150)	total: 497ms	remaining: 2.79s
200:	test: 0.8041035	best: 0.8057405 (150)	total: 607ms	remaining: 2.41s
250:	test: 0.8029030	best: 0.8057405 (150)	total: 746ms	remaining: 2.23s
300:	test: 0.8059587	best: 0.8059587 (300)	total: 859ms	remaining: 1.99s
350:	test: 0.8075958	best: 0.8075958 (350)	total: 967ms	remaining: 1.79s
400:	test: 0.8081414	best: 0.8081414 (400)	total: 1.09s	remaining: 1.63s
450:	test: 0.8068318	best: 0.8081414 (400)	total: 1.24s	remaining: 1.51s
500:	test: 0.8113063	best: 0.8113063 (500)	total: 1.35s	remaining: 1.35s
550:	test: 0.8118520	best: 0.8118520 (550)	total: 1.52s	remaining: 1.24s
600:	test: 0.8129434	best: 0.8129434 (600)	total: 1.64s	remaining: 1.09s
650:	test: 0.8150169	best: 0.8150169 (650)	total: 1.79s	re

<catboost.core.CatBoostClassifier at 0x3295e7400>

In [9]:
# Predict attrition for the test set and write the submission file.
# This cell stores the predictions in test['Attrition'], so on a re-run that
# column would otherwise be passed back to the model as an extra feature.
# Dropping it first (a no-op on the first run) makes the cell safe to re-run.
predict = model.predict(test.drop(columns=['Attrition'], errors='ignore'))
test['Attrition'] = predict
# The submission contains only the index and the predicted Attrition column.
test[['Attrition']].to_csv(data_path + 'AI_Cheats/employee_turnover/submit_cb.csv')